[llvm-branch-commits] [llvm] 8ff2643 - Revert "[CodeGen]Allow targets to use target specific COPY instructions for live range splitting"

Tobias Hieta via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Mon Aug 7 08:29:51 PDT 2023


Author: Vitaly Buka
Date: 2023-08-07T15:48:34+02:00
New Revision: 8ff26437cfd37a3611d3b6066e5aa2cf933887e0

URL: https://github.com/llvm/llvm-project/commit/8ff26437cfd37a3611d3b6066e5aa2cf933887e0
DIFF: https://github.com/llvm/llvm-project/commit/8ff26437cfd37a3611d3b6066e5aa2cf933887e0.diff

LOG: Revert "[CodeGen]Allow targets to use target specific COPY instructions for live range splitting"

And dependent commits.

Details in D150388.

This reverts commit 825b7f0ca5f2211ec3c93139f98d1e24048c225c.
This reverts commit 7a98f084c4d121244ef7286bc6503b6a181d446e.
This reverts commit b4a62b1fa546312d882fa12dfdcd015177d66826.
This reverts commit b7836d856206ec39509d42529f958c920368166b.

No conflicts in the code, few tests had conflicts in autogenerated CHECKs:
llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll

Reviewed By: alexfh

Differential Revision: https://reviews.llvm.org/D156381

(cherry picked from commit a496c8be6e638ae58bb45f13113dbe3a4b7b23fd)

Added: 
    

Modified: 
    llvm/include/llvm/CodeGen/TargetInstrInfo.h
    llvm/lib/CodeGen/CalcSpillWeights.cpp
    llvm/lib/CodeGen/InlineSpiller.cpp
    llvm/lib/CodeGen/LiveRangeEdit.cpp
    llvm/lib/CodeGen/LiveRangeShrink.cpp
    llvm/lib/CodeGen/RegAllocGreedy.cpp
    llvm/lib/CodeGen/SplitKit.cpp
    llvm/lib/CodeGen/SplitKit.h
    llvm/lib/CodeGen/TargetInstrInfo.cpp
    llvm/lib/Target/AMDGPU/AMDGPU.h
    llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
    llvm/lib/Target/AMDGPU/CMakeLists.txt
    llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
    llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
    llvm/lib/Target/AMDGPU/SIInstrInfo.h
    llvm/lib/Target/AMDGPU/SIInstructions.td
    llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
    llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
    llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
    llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
    llvm/lib/Target/AMDGPU/SIRegisterInfo.h
    llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
    llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
    llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
    llvm/test/CodeGen/AMDGPU/bf16.ll
    llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll
    llvm/test/CodeGen/AMDGPU/call-alias-register-usage0.ll
    llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll
    llvm/test/CodeGen/AMDGPU/call-alias-register-usage2.ll
    llvm/test/CodeGen/AMDGPU/call-alias-register-usage3.ll
    llvm/test/CodeGen/AMDGPU/call-argument-types.ll
    llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
    llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll
    llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
    llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll
    llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
    llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
    llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
    llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll
    llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll
    llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll
    llvm/test/CodeGen/AMDGPU/fold-reload-into-exec.mir
    llvm/test/CodeGen/AMDGPU/fold-reload-into-m0.mir
    llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll
    llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll
    llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
    llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll
    llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
    llvm/test/CodeGen/AMDGPU/indirect-call.ll
    llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
    llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll
    llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
    llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll
    llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll
    llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
    llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll
    llvm/test/CodeGen/AMDGPU/nested-calls.ll
    llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll
    llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll
    llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll
    llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll
    llvm/test/CodeGen/AMDGPU/sgpr-spill-dead-frame-in-dbg-value.mir
    llvm/test/CodeGen/AMDGPU/sgpr-spill-fi-skip-processing-stack-arg-dbg-value.mir
    llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll
    llvm/test/CodeGen/AMDGPU/sgpr-spill-partially-undef.mir
    llvm/test/CodeGen/AMDGPU/sgpr-spill-update-only-slot-indexes.ll
    llvm/test/CodeGen/AMDGPU/sgpr-spill-vmem-large-frame.mir
    llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll
    llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll
    llvm/test/CodeGen/AMDGPU/sibling-call.ll
    llvm/test/CodeGen/AMDGPU/snippet-copy-bundle-regression.mir
    llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll
    llvm/test/CodeGen/AMDGPU/spill-reg-tuple-super-reg-use.mir
    llvm/test/CodeGen/AMDGPU/spill-writelane-vgprs.ll
    llvm/test/CodeGen/AMDGPU/spill192.mir
    llvm/test/CodeGen/AMDGPU/spill224.mir
    llvm/test/CodeGen/AMDGPU/spill288.mir
    llvm/test/CodeGen/AMDGPU/spill320.mir
    llvm/test/CodeGen/AMDGPU/spill352.mir
    llvm/test/CodeGen/AMDGPU/spill384.mir
    llvm/test/CodeGen/AMDGPU/stack-realign.ll
    llvm/test/CodeGen/AMDGPU/swdev380865.ll
    llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
    llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll
    llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll
    llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll
    llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll
    llvm/test/CodeGen/AMDGPU/wave32.ll
    llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
    llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
    llvm/test/CodeGen/MIR/AMDGPU/stack-id-assert.mir
    llvm/test/CodeGen/Mips/madd-msub.ll
    llvm/test/CodeGen/X86/GlobalISel/add-ext.ll
    llvm/test/CodeGen/X86/dagcombine-cse.ll
    llvm/test/CodeGen/X86/fold-and-shift-x86_64.ll
    llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-lowhigh.ll

Removed: 
    llvm/lib/Target/AMDGPU/SILowerWWMCopies.cpp
    llvm/test/CodeGen/AMDGPU/extend-wwm-virt-reg-liveness.mir
    llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll
    llvm/test/CodeGen/AMDGPU/spill-sgpr-to-virtual-vgpr.mir
    llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll
    llvm/test/CodeGen/AMDGPU/whole-wave-register-copy.ll
    llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll


################################################################################
diff  --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
index 817d32ea0ef6f7..93dfcfc399247e 100644
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -1044,16 +1044,6 @@ class TargetInstrInfo : public MCInstrInfo {
     return isCopyInstrImpl(MI);
   }
 
-  bool isFullCopyInstr(const MachineInstr &MI) const {
-    auto DestSrc = isCopyInstr(MI);
-    if (!DestSrc)
-      return false;
-
-    const MachineOperand *DestRegOp = DestSrc->Destination;
-    const MachineOperand *SrcRegOp = DestSrc->Source;
-    return !DestRegOp->getSubReg() && !SrcRegOp->getSubReg();
-  }
-
   /// If the specific machine instruction is an instruction that adds an
   /// immediate value and a physical register, and stores the result in
   /// the given physical register \c Reg, return a pair of the source
@@ -1968,13 +1958,6 @@ class TargetInstrInfo : public MCInstrInfo {
     return false;
   }
 
-  /// Allows targets to use appropriate copy instruction while spilitting live
-  /// range of a register in register allocation.
-  virtual unsigned getLiveRangeSplitOpcode(Register Reg,
-                                           const MachineFunction &MF) const {
-    return TargetOpcode::COPY;
-  }
-
   /// During PHI eleimination lets target to make necessary checks and
   /// insert the copy to the PHI destination register in a target specific
   /// manner.

diff  --git a/llvm/lib/CodeGen/CalcSpillWeights.cpp b/llvm/lib/CodeGen/CalcSpillWeights.cpp
index 0377bc00206737..5a005ba7b414d6 100644
--- a/llvm/lib/CodeGen/CalcSpillWeights.cpp
+++ b/llvm/lib/CodeGen/CalcSpillWeights.cpp
@@ -97,7 +97,7 @@ bool VirtRegAuxInfo::isRematerializable(const LiveInterval &LI,
     // Trace copies introduced by live range splitting.  The inline
     // spiller can rematerialize through these copies, so the spill
     // weight must reflect this.
-    while (TII.isFullCopyInstr(*MI)) {
+    while (MI->isFullCopy()) {
       // The copy destination must match the interval register.
       if (MI->getOperand(0).getReg() != Reg)
         return false;
@@ -224,16 +224,7 @@ float VirtRegAuxInfo::weightCalcHelper(LiveInterval &LI, SlotIndex *Start,
       continue;
 
     NumInstr++;
-    bool identityCopy = false;
-    auto DestSrc = TII.isCopyInstr(*MI);
-    if (DestSrc) {
-      const MachineOperand *DestRegOp = DestSrc->Destination;
-      const MachineOperand *SrcRegOp = DestSrc->Source;
-      identityCopy = DestRegOp->getReg() == SrcRegOp->getReg() &&
-                     DestRegOp->getSubReg() == SrcRegOp->getSubReg();
-    }
-
-    if (identityCopy || MI->isImplicitDef())
+    if (MI->isIdentityCopy() || MI->isImplicitDef())
       continue;
     if (!Visited.insert(MI).second)
       continue;
@@ -267,7 +258,7 @@ float VirtRegAuxInfo::weightCalcHelper(LiveInterval &LI, SlotIndex *Start,
     }
 
     // Get allocation hints from copies.
-    if (!TII.isCopyInstr(*MI))
+    if (!MI->isCopy())
       continue;
     Register HintReg = copyHint(MI, LI.reg(), TRI, MRI);
     if (!HintReg)

diff  --git a/llvm/lib/CodeGen/InlineSpiller.cpp b/llvm/lib/CodeGen/InlineSpiller.cpp
index c62f3db9d32156..277c6be418c54a 100644
--- a/llvm/lib/CodeGen/InlineSpiller.cpp
+++ b/llvm/lib/CodeGen/InlineSpiller.cpp
@@ -256,11 +256,11 @@ Spiller *llvm::createInlineSpiller(MachineFunctionPass &Pass,
 // This minimizes register pressure and maximizes the store-to-load distance for
 // spill slots which can be important in tight loops.
 
-/// isFullCopyOf - If MI is a COPY to or from Reg, return the other register,
-/// otherwise return 0.
-static Register isCopyOf(const MachineInstr &MI, Register Reg,
-                         const TargetInstrInfo &TII) {
-  if (!TII.isCopyInstr(MI))
+/// If MI is a COPY to or from Reg, return the other register, otherwise return
+/// 0.
+static Register isCopyOf(const MachineInstr &MI, Register Reg) {
+  assert(!MI.isBundled());
+  if (!MI.isCopy())
     return Register();
 
   const MachineOperand &DstOp = MI.getOperand(0);
@@ -277,10 +277,9 @@ static Register isCopyOf(const MachineInstr &MI, Register Reg,
 }
 
 /// Check for a copy bundle as formed by SplitKit.
-static Register isCopyOfBundle(const MachineInstr &FirstMI, Register Reg,
-                               const TargetInstrInfo &TII) {
+static Register isCopyOfBundle(const MachineInstr &FirstMI, Register Reg) {
   if (!FirstMI.isBundled())
-    return isCopyOf(FirstMI, Reg, TII);
+    return isCopyOf(FirstMI, Reg);
 
   assert(!FirstMI.isBundledWithPred() && FirstMI.isBundledWithSucc() &&
          "expected to see first instruction in bundle");
@@ -289,12 +288,11 @@ static Register isCopyOfBundle(const MachineInstr &FirstMI, Register Reg,
   MachineBasicBlock::const_instr_iterator I = FirstMI.getIterator();
   while (I->isBundledWithSucc()) {
     const MachineInstr &MI = *I;
-    auto CopyInst = TII.isCopyInstr(MI);
-    if (!CopyInst)
+    if (!MI.isCopy())
       return Register();
 
-    const MachineOperand &DstOp = *CopyInst->Destination;
-    const MachineOperand &SrcOp = *CopyInst->Source;
+    const MachineOperand &DstOp = MI.getOperand(0);
+    const MachineOperand &SrcOp = MI.getOperand(1);
     if (DstOp.getReg() == Reg) {
       if (!SnipReg)
         SnipReg = SrcOp.getReg();
@@ -360,7 +358,7 @@ bool InlineSpiller::isSnippet(const LiveInterval &SnipLI) {
     MachineInstr &MI = *RI++;
 
     // Allow copies to/from Reg.
-    if (isCopyOfBundle(MI, Reg, TII))
+    if (isCopyOfBundle(MI, Reg))
       continue;
 
     // Allow stack slot loads.
@@ -398,7 +396,7 @@ void InlineSpiller::collectRegsToSpill() {
     return;
 
   for (MachineInstr &MI : llvm::make_early_inc_range(MRI.reg_bundles(Reg))) {
-    Register SnipReg = isCopyOfBundle(MI, Reg, TII);
+    Register SnipReg = isCopyOfBundle(MI, Reg);
     if (!isSibling(SnipReg))
       continue;
     LiveInterval &SnipLI = LIS.getInterval(SnipReg);
@@ -521,14 +519,14 @@ void InlineSpiller::eliminateRedundantSpills(LiveInterval &SLI, VNInfo *VNI) {
     // Find all spills and copies of VNI.
     for (MachineInstr &MI :
          llvm::make_early_inc_range(MRI.use_nodbg_bundles(Reg))) {
-      if (!MI.mayStore() && !TII.isCopyInstr(MI))
+      if (!MI.isCopy() && !MI.mayStore())
         continue;
       SlotIndex Idx = LIS.getInstructionIndex(MI);
       if (LI->getVNInfoAt(Idx) != VNI)
         continue;
 
       // Follow sibling copies down the dominator tree.
-      if (Register DstReg = isCopyOfBundle(MI, Reg, TII)) {
+      if (Register DstReg = isCopyOfBundle(MI, Reg)) {
         if (isSibling(DstReg)) {
           LiveInterval &DstLI = LIS.getInterval(DstReg);
           VNInfo *DstVNI = DstLI.getVNInfoAt(Idx.getRegSlot());
@@ -872,7 +870,7 @@ foldMemoryOperand(ArrayRef<std::pair<MachineInstr *, unsigned>> Ops,
   if (Ops.back().first != MI || MI->isBundled())
     return false;
 
-  bool WasCopy = TII.isCopyInstr(*MI).has_value();
+  bool WasCopy = MI->isCopy();
   Register ImpReg;
 
   // TII::foldMemoryOperand will do what we need here for statepoint
@@ -1157,7 +1155,7 @@ void InlineSpiller::spillAroundUses(Register Reg) {
         Idx = VNI->def;
 
     // Check for a sibling copy.
-    Register SibReg = isCopyOfBundle(MI, Reg, TII);
+    Register SibReg = isCopyOfBundle(MI, Reg);
     if (SibReg && isSibling(SibReg)) {
       // This may actually be a copy between snippets.
       if (isRegToSpill(SibReg)) {

diff  --git a/llvm/lib/CodeGen/LiveRangeEdit.cpp b/llvm/lib/CodeGen/LiveRangeEdit.cpp
index ff49e080090c2b..c3477cd8ce34ce 100644
--- a/llvm/lib/CodeGen/LiveRangeEdit.cpp
+++ b/llvm/lib/CodeGen/LiveRangeEdit.cpp
@@ -352,8 +352,7 @@ void LiveRangeEdit::eliminateDeadDef(MachineInstr *MI, ToShrinkSet &ToShrink) {
     // unlikely to change anything. We typically don't want to shrink the
     // PIC base register that has lots of uses everywhere.
     // Always shrink COPY uses that probably come from live range splitting.
-    if ((MI->readsVirtualRegister(Reg) &&
-         (MO.isDef() || TII.isCopyInstr(*MI))) ||
+    if ((MI->readsVirtualRegister(Reg) && (MI->isCopy() || MO.isDef())) ||
         (MO.readsReg() && (MRI.hasOneNonDBGUse(Reg) || useIsKill(LI, MO))))
       ToShrink.insert(&LI);
     else if (MO.readsReg())

diff  --git a/llvm/lib/CodeGen/LiveRangeShrink.cpp b/llvm/lib/CodeGen/LiveRangeShrink.cpp
index af7d6c4403b8fc..93f5314539cdb1 100644
--- a/llvm/lib/CodeGen/LiveRangeShrink.cpp
+++ b/llvm/lib/CodeGen/LiveRangeShrink.cpp
@@ -23,7 +23,6 @@
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/Debug.h"
@@ -110,7 +109,6 @@ bool LiveRangeShrink::runOnMachineFunction(MachineFunction &MF) {
     return false;
 
   MachineRegisterInfo &MRI = MF.getRegInfo();
-  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
 
   LLVM_DEBUG(dbgs() << "**** Analysing " << MF.getName() << '\n');
 
@@ -199,7 +197,7 @@ bool LiveRangeShrink::runOnMachineFunction(MachineFunction &MF) {
           // is because it needs more accurate model to handle register
           // pressure correctly.
           MachineInstr &DefInstr = *MRI.def_instr_begin(Reg);
-          if (!TII.isCopyInstr(DefInstr))
+          if (!DefInstr.isCopy())
             NumEligibleUse++;
           Insert = FindDominatedInstruction(DefInstr, Insert, IOM);
         } else {

diff  --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp
index 68f6ea3268a9ae..48187e575494fc 100644
--- a/llvm/lib/CodeGen/RegAllocGreedy.cpp
+++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp
@@ -1282,12 +1282,10 @@ static LaneBitmask getInstReadLaneMask(const MachineRegisterInfo &MRI,
 /// VirtReg.
 static bool readsLaneSubset(const MachineRegisterInfo &MRI,
                             const MachineInstr *MI, const LiveInterval &VirtReg,
-                            const TargetRegisterInfo *TRI, SlotIndex Use,
-                            const TargetInstrInfo *TII) {
+                            const TargetRegisterInfo *TRI, SlotIndex Use) {
   // Early check the common case.
-  auto DestSrc = TII->isCopyInstr(*MI);
-  if (DestSrc &&
-      DestSrc->Destination->getSubReg() == DestSrc->Source->getSubReg())
+  if (MI->isCopy() &&
+      MI->getOperand(0).getSubReg() == MI->getOperand(1).getSubReg())
     return false;
 
   // FIXME: We're only considering uses, but should be consider defs too?
@@ -1346,14 +1344,14 @@ unsigned RAGreedy::tryInstructionSplit(const LiveInterval &VirtReg,
   // the allocation.
   for (const SlotIndex Use : Uses) {
     if (const MachineInstr *MI = Indexes->getInstructionFromIndex(Use)) {
-      if (TII->isFullCopyInstr(*MI) ||
+      if (MI->isFullCopy() ||
           (SplitSubClass &&
            SuperRCNumAllocatableRegs ==
                getNumAllocatableRegsForConstraints(MI, VirtReg.reg(), SuperRC,
                                                    TII, TRI, RegClassInfo)) ||
           // TODO: Handle split for subranges with subclass constraints?
           (!SplitSubClass && VirtReg.hasSubRanges() &&
-           !readsLaneSubset(*MRI, MI, VirtReg, TRI, Use, TII))) {
+           !readsLaneSubset(*MRI, MI, VirtReg, TRI, Use))) {
         LLVM_DEBUG(dbgs() << "    skip:\t" << Use << '\t' << *MI);
         continue;
       }
@@ -2140,7 +2138,7 @@ void RAGreedy::initializeCSRCost() {
 /// \p Out is not cleared before being populated.
 void RAGreedy::collectHintInfo(Register Reg, HintsInfo &Out) {
   for (const MachineInstr &Instr : MRI->reg_nodbg_instructions(Reg)) {
-    if (!TII->isFullCopyInstr(Instr))
+    if (!Instr.isFullCopy())
       continue;
     // Look for the other end of the copy.
     Register OtherReg = Instr.getOperand(0).getReg();
@@ -2455,10 +2453,9 @@ RAGreedy::RAGreedyStats RAGreedy::computeStats(MachineBasicBlock &MBB) {
            MI.getOpcode() == TargetOpcode::STATEPOINT;
   };
   for (MachineInstr &MI : MBB) {
-    auto DestSrc = TII->isCopyInstr(MI);
-    if (DestSrc) {
-      const MachineOperand &Dest = *DestSrc->Destination;
-      const MachineOperand &Src = *DestSrc->Source;
+    if (MI.isCopy()) {
+      const MachineOperand &Dest = MI.getOperand(0);
+      const MachineOperand &Src = MI.getOperand(1);
       Register SrcReg = Src.getReg();
       Register DestReg = Dest.getReg();
       // Only count `COPY`s with a virtual register as source or destination.

diff  --git a/llvm/lib/CodeGen/SplitKit.cpp b/llvm/lib/CodeGen/SplitKit.cpp
index 83964eced5977e..eee54f09fbad01 100644
--- a/llvm/lib/CodeGen/SplitKit.cpp
+++ b/llvm/lib/CodeGen/SplitKit.cpp
@@ -514,10 +514,10 @@ void SplitEditor::forceRecompute(unsigned RegIdx, const VNInfo &ParentVNI) {
   VFP = ValueForcePair(nullptr, true);
 }
 
-SlotIndex SplitEditor::buildSingleSubRegCopy(
-    Register FromReg, Register ToReg, MachineBasicBlock &MBB,
-    MachineBasicBlock::iterator InsertBefore, unsigned SubIdx,
-    LiveInterval &DestLI, bool Late, SlotIndex Def, const MCInstrDesc &Desc) {
+SlotIndex SplitEditor::buildSingleSubRegCopy(Register FromReg, Register ToReg,
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore,
+    unsigned SubIdx, LiveInterval &DestLI, bool Late, SlotIndex Def) {
+  const MCInstrDesc &Desc = TII.get(TargetOpcode::COPY);
   bool FirstCopy = !Def.isValid();
   MachineInstr *CopyMI = BuildMI(MBB, InsertBefore, DebugLoc(), Desc)
       .addReg(ToReg, RegState::Define | getUndefRegState(FirstCopy)
@@ -536,8 +536,7 @@ SlotIndex SplitEditor::buildSingleSubRegCopy(
 SlotIndex SplitEditor::buildCopy(Register FromReg, Register ToReg,
     LaneBitmask LaneMask, MachineBasicBlock &MBB,
     MachineBasicBlock::iterator InsertBefore, bool Late, unsigned RegIdx) {
-  const MCInstrDesc &Desc =
-      TII.get(TII.getLiveRangeSplitOpcode(FromReg, *MBB.getParent()));
+  const MCInstrDesc &Desc = TII.get(TargetOpcode::COPY);
   SlotIndexes &Indexes = *LIS.getSlotIndexes();
   if (LaneMask.all() || LaneMask == MRI.getMaxLaneMaskForVReg(FromReg)) {
     // The full vreg is copied.
@@ -565,7 +564,7 @@ SlotIndex SplitEditor::buildCopy(Register FromReg, Register ToReg,
   SlotIndex Def;
   for (unsigned BestIdx : SubIndexes) {
     Def = buildSingleSubRegCopy(FromReg, ToReg, MBB, InsertBefore, BestIdx,
-                                DestLI, Late, Def, Desc);
+                                DestLI, Late, Def);
   }
 
   BumpPtrAllocator &Allocator = LIS.getVNInfoAllocator();
@@ -1585,9 +1584,7 @@ bool SplitAnalysis::shouldSplitSingleBlock(const BlockInfo &BI,
   if (BI.LiveIn && BI.LiveOut)
     return true;
   // No point in isolating a copy. It has no register class constraints.
-  MachineInstr *MI = LIS.getInstructionFromIndex(BI.FirstInstr);
-  bool copyLike = TII.isCopyInstr(*MI) || MI->isSubregToReg();
-  if (copyLike)
+  if (LIS.getInstructionFromIndex(BI.FirstInstr)->isCopyLike())
     return false;
   // Finally, don't isolate an end point that was created by earlier splits.
   return isOriginalEndpoint(BI.FirstInstr);

diff  --git a/llvm/lib/CodeGen/SplitKit.h b/llvm/lib/CodeGen/SplitKit.h
index 1174e392e4e442..f764ffd4750ccb 100644
--- a/llvm/lib/CodeGen/SplitKit.h
+++ b/llvm/lib/CodeGen/SplitKit.h
@@ -428,11 +428,8 @@ class LLVM_LIBRARY_VISIBILITY SplitEditor {
       bool Late, unsigned RegIdx);
 
   SlotIndex buildSingleSubRegCopy(Register FromReg, Register ToReg,
-                                  MachineBasicBlock &MB,
-                                  MachineBasicBlock::iterator InsertBefore,
-                                  unsigned SubIdx, LiveInterval &DestLI,
-                                  bool Late, SlotIndex Def,
-                                  const MCInstrDesc &Desc);
+      MachineBasicBlock &MB, MachineBasicBlock::iterator InsertBefore,
+      unsigned SubIdx, LiveInterval &DestLI, bool Late, SlotIndex Def);
 
 public:
   /// Create a new SplitEditor for editing the LiveInterval analyzed by SA.

diff  --git a/llvm/lib/CodeGen/TargetInstrInfo.cpp b/llvm/lib/CodeGen/TargetInstrInfo.cpp
index 09dcddc17b0643..b29404b425190a 100644
--- a/llvm/lib/CodeGen/TargetInstrInfo.cpp
+++ b/llvm/lib/CodeGen/TargetInstrInfo.cpp
@@ -440,9 +440,8 @@ MachineInstr &TargetInstrInfo::duplicate(MachineBasicBlock &MBB,
 // If the COPY instruction in MI can be folded to a stack operation, return
 // the register class to use.
 static const TargetRegisterClass *canFoldCopy(const MachineInstr &MI,
-                                              const TargetInstrInfo &TII,
                                               unsigned FoldIdx) {
-  assert(TII.isCopyInstr(MI) && "MI must be a COPY instruction");
+  assert(MI.isCopy() && "MI must be a COPY instruction");
   if (MI.getNumOperands() != 2)
     return nullptr;
   assert(FoldIdx<2 && "FoldIdx refers no nonexistent operand");
@@ -631,10 +630,10 @@ MachineInstr *TargetInstrInfo::foldMemoryOperand(MachineInstr &MI,
   }
 
   // Straight COPY may fold as load/store.
-  if (!isCopyInstr(MI) || Ops.size() != 1)
+  if (!MI.isCopy() || Ops.size() != 1)
     return nullptr;
 
-  const TargetRegisterClass *RC = canFoldCopy(MI, *this, Ops[0]);
+  const TargetRegisterClass *RC = canFoldCopy(MI, Ops[0]);
   if (!RC)
     return nullptr;
 

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index b82db82de84ef2..c25194c02f72bf 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -41,7 +41,6 @@ FunctionPass *createSIFixControlFlowLiveIntervalsPass();
 FunctionPass *createSIOptimizeExecMaskingPreRAPass();
 FunctionPass *createSIOptimizeVGPRLiveRangePass();
 FunctionPass *createSIFixSGPRCopiesPass();
-FunctionPass *createLowerWWMCopiesPass();
 FunctionPass *createSIMemoryLegalizerPass();
 FunctionPass *createSIInsertWaitcntsPass();
 FunctionPass *createSIPreAllocateWWMRegsPass();
@@ -145,9 +144,6 @@ extern char &SIFixSGPRCopiesID;
 void initializeSIFixVGPRCopiesPass(PassRegistry &);
 extern char &SIFixVGPRCopiesID;
 
-void initializeSILowerWWMCopiesPass(PassRegistry &);
-extern char &SILowerWWMCopiesID;
-
 void initializeSILowerI1CopiesPass(PassRegistry &);
 extern char &SILowerI1CopiesID;
 

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index f90c8e4bdddd45..87ef2333e2ea92 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -364,7 +364,6 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   initializeAMDGPUDAGToDAGISelPass(*PR);
   initializeGCNDPPCombinePass(*PR);
   initializeSILowerI1CopiesPass(*PR);
-  initializeSILowerWWMCopiesPass(*PR);
   initializeSILowerSGPRSpillsPass(*PR);
   initializeSIFixSGPRCopiesPass(*PR);
   initializeSIFixVGPRCopiesPass(*PR);
@@ -1297,7 +1296,6 @@ void GCNPassConfig::addOptimizedRegAlloc() {
 }
 
 bool GCNPassConfig::addPreRewrite() {
-  addPass(&SILowerWWMCopiesID);
   if (EnableRegReassign)
     addPass(&GCNNSAReassignID);
   return true;
@@ -1352,8 +1350,6 @@ bool GCNPassConfig::addRegAssignAndRewriteFast() {
   addPass(&SILowerSGPRSpillsID);
 
   addPass(createVGPRAllocPass(false));
-
-  addPass(&SILowerWWMCopiesID);
   return true;
 }
 

diff  --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 97f1fb988d6946..3eab7887897caf 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -145,7 +145,6 @@ add_llvm_target(AMDGPUCodeGen
   SILoadStoreOptimizer.cpp
   SILowerControlFlow.cpp
   SILowerI1Copies.cpp
-  SILowerWWMCopies.cpp
   SILowerSGPRSpills.cpp
   SIMachineFunctionInfo.cpp
   SIMachineScheduler.cpp

diff  --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index 865caae240f347..903e726c667d59 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -99,10 +99,10 @@ static void getVGPRSpillLaneOrTempRegister(
           SGPR, PrologEpilogSGPRSaveRestoreInfo(
                     SGPRSaveKind::SPILL_TO_VGPR_LANE, FI));
 
-      LLVM_DEBUG(auto Spill = MFI->getSGPRSpillToPhysicalVGPRLanes(FI).front();
-                 dbgs() << printReg(SGPR, TRI) << " requires fallback spill to "
-                        << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane
-                        << '\n';);
+      LLVM_DEBUG(
+          auto Spill = MFI->getPrologEpilogSGPRSpillToVGPRLanes(FI).front();
+          dbgs() << printReg(SGPR, TRI) << " requires fallback spill to "
+                 << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane << '\n';);
     } else {
       // Remove dead <FI> index
       MF.getFrameInfo().RemoveStackObject(FI);
@@ -264,7 +264,7 @@ class PrologEpilogSGPRSpillBuilder {
 
     assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
     ArrayRef<SIRegisterInfo::SpilledReg> Spill =
-        FuncInfo->getSGPRSpillToPhysicalVGPRLanes(FI);
+        FuncInfo->getPrologEpilogSGPRSpillToVGPRLanes(FI);
     assert(Spill.size() == NumSubRegs);
 
     for (unsigned I = 0; I < NumSubRegs; ++I) {
@@ -309,7 +309,7 @@ class PrologEpilogSGPRSpillBuilder {
   void restoreFromVGPRLane(const int FI) {
     assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
     ArrayRef<SIRegisterInfo::SpilledReg> Spill =
-        FuncInfo->getSGPRSpillToPhysicalVGPRLanes(FI);
+        FuncInfo->getPrologEpilogSGPRSpillToVGPRLanes(FI);
     assert(Spill.size() == NumSubRegs);
 
     for (unsigned I = 0; I < NumSubRegs; ++I) {
@@ -1353,8 +1353,8 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized(
           if (FuncInfo->allocateVGPRSpillToAGPR(MF, FI,
                                                 TRI->isAGPR(MRI, VReg))) {
             assert(RS != nullptr);
-            RS->enterBasicBlockEnd(MBB);
-            RS->backward(MI);
+            // FIXME: change to enterBasicBlockEnd()
+            RS->enterBasicBlock(MBB);
             TRI->eliminateFrameIndex(MI, 0, FIOp, RS);
             SpillFIs.set(FI);
             continue;

diff  --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 278cf2b69ee37d..0f954732a5eef7 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2414,14 +2414,6 @@ SIInstrInfo::expandMovDPP64(MachineInstr &MI) const {
   return std::pair(Split[0], Split[1]);
 }
 
-std::optional<DestSourcePair>
-SIInstrInfo::isCopyInstrImpl(const MachineInstr &MI) const {
-  if (MI.getOpcode() == AMDGPU::WWM_COPY)
-    return DestSourcePair{MI.getOperand(0), MI.getOperand(1)};
-
-  return std::nullopt;
-}
-
 bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI,
                                       MachineOperand &Src0,
                                       unsigned Src0OpName,
@@ -3088,7 +3080,6 @@ bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) {
   case AMDGPU::S_MOV_B32:
   case AMDGPU::S_MOV_B64:
   case AMDGPU::COPY:
-  case AMDGPU::WWM_COPY:
   case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
   case AMDGPU::V_ACCVGPR_READ_B32_e64:
   case AMDGPU::V_ACCVGPR_MOV_B32:
@@ -4978,8 +4969,7 @@ void SIInstrInfo::insertScratchExecCopy(MachineFunction &MF,
                                         MachineBasicBlock &MBB,
                                         MachineBasicBlock::iterator MBBI,
                                         const DebugLoc &DL, Register Reg,
-                                        bool IsSCCLive,
-                                        SlotIndexes *Indexes) const {
+                                        bool IsSCCLive) const {
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   const SIInstrInfo *TII = ST.getInstrInfo();
   bool IsWave32 = ST.isWave32();
@@ -4989,34 +4979,23 @@ void SIInstrInfo::insertScratchExecCopy(MachineFunction &MF,
     // the single instruction S_OR_SAVEEXEC that clobbers SCC.
     unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
     MCRegister Exec = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
-    auto StoreExecMI = BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Reg)
-                           .addReg(Exec, RegState::Kill);
-    auto FlipExecMI = BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Exec).addImm(-1);
-    if (Indexes) {
-      Indexes->insertMachineInstrInMaps(*StoreExecMI);
-      Indexes->insertMachineInstrInMaps(*FlipExecMI);
-    }
+    BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Reg).addReg(Exec, RegState::Kill);
+    BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Exec).addImm(-1);
   } else {
     const unsigned OrSaveExec =
         IsWave32 ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
     auto SaveExec =
         BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec), Reg).addImm(-1);
     SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead.
-    if (Indexes)
-      Indexes->insertMachineInstrInMaps(*SaveExec);
   }
 }
 
 void SIInstrInfo::restoreExec(MachineFunction &MF, MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator MBBI,
-                              const DebugLoc &DL, Register Reg,
-                              SlotIndexes *Indexes) const {
+                              const DebugLoc &DL, Register Reg) const {
   unsigned ExecMov = isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
   MCRegister Exec = isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
-  auto ExecRestoreMI =
-      BuildMI(MBB, MBBI, DL, get(ExecMov), Exec).addReg(Reg, RegState::Kill);
-  if (Indexes)
-    Indexes->insertMachineInstrInMaps(*ExecRestoreMI);
+  BuildMI(MBB, MBBI, DL, get(ExecMov), Exec).addReg(Reg, RegState::Kill);
 }
 
 static const TargetRegisterClass *
@@ -8001,16 +7980,6 @@ SIInstrInfo::getSerializableMachineMemOperandTargetFlags() const {
   return ArrayRef(TargetFlags);
 }
 
-unsigned SIInstrInfo::getLiveRangeSplitOpcode(Register SrcReg,
-                                              const MachineFunction &MF) const {
-  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
-  assert(SrcReg.isVirtual());
-  if (MFI->checkFlag(SrcReg, AMDGPU::VirtRegFlag::WWM_REG))
-    return AMDGPU::WWM_COPY;
-
-  return AMDGPU::COPY;
-}
-
 bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI) const {
   return !MI.isTerminator() && MI.getOpcode() != AMDGPU::COPY &&
          MI.modifiesRegister(AMDGPU::EXEC, &RI);
@@ -8562,7 +8531,7 @@ MachineInstr *SIInstrInfo::foldMemoryOperandImpl(
   // A similar issue also exists with spilling and reloading $exec registers.
   //
   // To prevent that, constrain the %0 register class here.
-  if (isFullCopyInstr(MI)) {
+  if (MI.isFullCopy()) {
     Register DstReg = MI.getOperand(0).getReg();
     Register SrcReg = MI.getOperand(1).getReg();
     if ((DstReg.isVirtual() || SrcReg.isVirtual()) &&
@@ -8659,7 +8628,7 @@ SIInstrInfo::getInstructionUniformity(const MachineInstr &MI) const {
   if (opcode == AMDGPU::V_READLANE_B32 || opcode == AMDGPU::V_READFIRSTLANE_B32)
     return InstructionUniformity::AlwaysUniform;
 
-  if (isCopyInstr(MI)) {
+  if (MI.isCopy()) {
     const MachineOperand &srcOp = MI.getOperand(1);
     if (srcOp.isReg() && srcOp.getReg().isPhysical()) {
       const TargetRegisterClass *regClass =

diff  --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index b25aae7b2fb043..66f93e5640d634 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -170,12 +170,6 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
   Register findUsedSGPR(const MachineInstr &MI, int OpIndices[3]) const;
 
 protected:
-  /// If the specific machine instruction is a instruction that moves/copies
-  /// value from one register to another register return destination and source
-  /// registers as machine operands.
-  std::optional<DestSourcePair>
-  isCopyInstrImpl(const MachineInstr &MI) const override;
-
   bool swapSourceModifiers(MachineInstr &MI,
                            MachineOperand &Src0, unsigned Src0OpName,
                            MachineOperand &Src1, unsigned Src1OpName) const;
@@ -833,7 +827,7 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
   }
 
   bool isVGPRCopy(const MachineInstr &MI) const {
-    assert(isCopyInstr(MI));
+    assert(MI.isCopy());
     Register Dest = MI.getOperand(0).getReg();
     const MachineFunction &MF = *MI.getParent()->getParent();
     const MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -903,7 +897,7 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
     if (OpIdx >= MI.getDesc().NumOperands)
       return false;
 
-    if (isCopyInstr(MI)) {
+    if (MI.isCopy()) {
       unsigned Size = getOpSize(MI, OpIdx);
       assert(Size == 8 || Size == 4);
 
@@ -952,12 +946,12 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
 
   void insertScratchExecCopy(MachineFunction &MF, MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator MBBI,
-                             const DebugLoc &DL, Register Reg, bool IsSCCLive,
-                             SlotIndexes *Indexes = nullptr) const;
+                             const DebugLoc &DL, Register Reg,
+                             bool IsSCCLive) const;
 
   void restoreExec(MachineFunction &MF, MachineBasicBlock &MBB,
                    MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
-                   Register Reg, SlotIndexes *Indexes = nullptr) const;
+                   Register Reg) const;
 
   /// Return the correct register class for \p OpNo.  For target-specific
   /// instructions, this will return the register class that has been defined
@@ -1149,9 +1143,6 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
   CreateTargetMIHazardRecognizer(const InstrItineraryData *II,
                                  const ScheduleDAGMI *DAG) const override;
 
-  unsigned getLiveRangeSplitOpcode(Register Reg,
-                                   const MachineFunction &MF) const override;
-
   bool isBasicBlockPrologue(const MachineInstr &MI) const override;
 
   MachineInstr *createPHIDestinationCopy(MachineBasicBlock &MBB,

diff  --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 7fe76b4c13ca40..2edebccef7d8d1 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -172,13 +172,6 @@ def STRICT_WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;
 
 } // End let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC]
 
-def WWM_COPY : SPseudoInstSI <
-  (outs unknown:$dst), (ins unknown:$src)> {
-  let hasSideEffects = 0;
-  let isAsCheapAsAMove = 1;
-  let isConvergent = 1;
-}
-
 def ENTER_STRICT_WWM : SPseudoInstSI <(outs SReg_1:$sdst), (ins i64imm:$src0)> {
   let Uses = [EXEC];
   let Defs = [EXEC, SCC];

diff  --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
index 47d28d5d0eab59..d21107c02ef77c 100644
--- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
@@ -50,9 +50,7 @@ class SILowerSGPRSpills : public MachineFunctionPass {
   SILowerSGPRSpills() : MachineFunctionPass(ID) {}
 
   void calculateSaveRestoreBlocks(MachineFunction &MF);
-  bool spillCalleeSavedRegs(MachineFunction &MF,
-                            SmallVectorImpl<int> &CalleeSavedFIs);
-  void extendWWMVirtRegLiveness(MachineFunction &MF, LiveIntervals *LIS);
+  bool spillCalleeSavedRegs(MachineFunction &MF);
 
   bool runOnMachineFunction(MachineFunction &MF) override;
 
@@ -60,13 +58,6 @@ class SILowerSGPRSpills : public MachineFunctionPass {
     AU.setPreservesAll();
     MachineFunctionPass::getAnalysisUsage(AU);
   }
-
-  MachineFunctionProperties getClearedProperties() const override {
-    // SILowerSGPRSpills introduces new Virtual VGPRs for spilling SGPRs.
-    return MachineFunctionProperties()
-        .set(MachineFunctionProperties::Property::IsSSA)
-        .set(MachineFunctionProperties::Property::NoVRegs);
-  }
 };
 
 } // end anonymous namespace
@@ -206,8 +197,7 @@ static void updateLiveness(MachineFunction &MF, ArrayRef<CalleeSavedInfo> CSI) {
   EntryBB.sortUniqueLiveIns();
 }
 
-bool SILowerSGPRSpills::spillCalleeSavedRegs(
-    MachineFunction &MF, SmallVectorImpl<int> &CalleeSavedFIs) {
+bool SILowerSGPRSpills::spillCalleeSavedRegs(MachineFunction &MF) {
   MachineRegisterInfo &MRI = MF.getRegInfo();
   const Function &F = MF.getFunction();
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
@@ -238,7 +228,6 @@ bool SILowerSGPRSpills::spillCalleeSavedRegs(
                                            TRI->getSpillAlign(*RC), true);
 
         CSI.push_back(CalleeSavedInfo(Reg, JunkFI));
-        CalleeSavedFIs.push_back(JunkFI);
       }
     }
 
@@ -259,50 +248,6 @@ bool SILowerSGPRSpills::spillCalleeSavedRegs(
   return false;
 }
 
-void SILowerSGPRSpills::extendWWMVirtRegLiveness(MachineFunction &MF,
-                                                 LiveIntervals *LIS) {
-  // TODO: This is a workaround to avoid the unmodelled liveness computed with
-  // whole-wave virtual registers when allocated together with the regular VGPR
-  // virtual registers. Presently, the liveness computed during the regalloc is
-  // only uniform (or single lane aware) and it doesn't take account of the
-  // divergent control flow that exists for our GPUs. Since the WWM registers
-  // can modify inactive lanes, the wave-aware liveness should be computed for
-  // the virtual registers to accurately plot their interferences. Without
-  // having the divergent CFG for the function, it is 
diff icult to implement the
-  // wave-aware liveness info. Until then, we conservatively extend the liveness
-  // of the wwm registers into the entire function so that they won't be reused
-  // without first spilling/splitting their liveranges.
-  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
-
-  // Insert the IMPLICIT_DEF for the wwm-registers in the entry blocks.
-  for (auto Reg : MFI->getSGPRSpillVGPRs()) {
-    for (MachineBasicBlock *SaveBlock : SaveBlocks) {
-      MachineBasicBlock::iterator InsertBefore = SaveBlock->begin();
-      auto MIB = BuildMI(*SaveBlock, *InsertBefore, InsertBefore->getDebugLoc(),
-                         TII->get(AMDGPU::IMPLICIT_DEF), Reg);
-      MFI->setFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG);
-      if (LIS) {
-        LIS->InsertMachineInstrInMaps(*MIB);
-      }
-    }
-  }
-
-  // Insert the KILL in the return blocks to extend their liveness untill the
-  // end of function. Insert a separate KILL for each VGPR.
-  for (MachineBasicBlock *RestoreBlock : RestoreBlocks) {
-    MachineBasicBlock::iterator InsertBefore =
-        RestoreBlock->getFirstTerminator();
-    for (auto Reg : MFI->getSGPRSpillVGPRs()) {
-      auto MIB =
-          BuildMI(*RestoreBlock, *InsertBefore, InsertBefore->getDebugLoc(),
-                  TII->get(TargetOpcode::KILL));
-      MIB.addReg(Reg);
-      if (LIS)
-        LIS->InsertMachineInstrInMaps(*MIB);
-    }
-  }
-}
-
 bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   TII = ST.getInstrInfo();
@@ -316,8 +261,7 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
   // First, expose any CSR SGPR spills. This is mostly the same as what PEI
   // does, but somewhat simpler.
   calculateSaveRestoreBlocks(MF);
-  SmallVector<int> CalleeSavedFIs;
-  bool HasCSRs = spillCalleeSavedRegs(MF, CalleeSavedFIs);
+  bool HasCSRs = spillCalleeSavedRegs(MF);
 
   MachineFrameInfo &MFI = MF.getFrameInfo();
   MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -331,7 +275,6 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
 
   bool MadeChange = false;
   bool NewReservedRegs = false;
-  bool SpilledToVirtVGPRLanes = false;
 
   // TODO: CSR VGPRs will never be spilled to AGPRs. These can probably be
   // handled as SpilledToReg in regular PrologEpilogInserter.
@@ -354,53 +297,23 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
 
         int FI = TII->getNamedOperand(MI, AMDGPU::OpName::addr)->getIndex();
         assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
-
-        bool IsCalleeSaveSGPRSpill =
-            std::find(CalleeSavedFIs.begin(), CalleeSavedFIs.end(), FI) !=
-            CalleeSavedFIs.end();
-        if (IsCalleeSaveSGPRSpill) {
-          // Spill callee-saved SGPRs into physical VGPR lanes.
-
-          // TODO: This is to ensure the CFIs are static for efficient frame
-          // unwinding in the debugger. Spilling them into virtual VGPR lanes
-          // involve regalloc to allocate the physical VGPRs and that might
-          // cause intermediate spill/split of such liveranges for successful
-          // allocation. This would result in broken CFI encoding unless the
-          // regalloc aware CFI generation to insert new CFIs along with the
-          // intermediate spills is implemented. There is no such support
-          // currently exist in the LLVM compiler.
-          if (FuncInfo->allocateSGPRSpillToVGPRLane(MF, FI, true)) {
-            NewReservedRegs = true;
-            bool Spilled = TRI->eliminateSGPRToVGPRSpillFrameIndex(
-                MI, FI, nullptr, Indexes, LIS, true);
-            if (!Spilled)
-              llvm_unreachable(
-                  "failed to spill SGPR to physical VGPR lane when allocated");
-          }
-        } else {
-          if (FuncInfo->allocateSGPRSpillToVGPRLane(MF, FI)) {
-            bool Spilled = TRI->eliminateSGPRToVGPRSpillFrameIndex(
-                MI, FI, nullptr, Indexes, LIS);
-            if (!Spilled)
-              llvm_unreachable(
-                  "failed to spill SGPR to virtual VGPR lane when allocated");
-            SpillFIs.set(FI);
-            SpilledToVirtVGPRLanes = true;
-          }
+        if (FuncInfo->allocateSGPRSpillToVGPRLane(MF, FI)) {
+          NewReservedRegs = true;
+          bool Spilled = TRI->eliminateSGPRToVGPRSpillFrameIndex(
+              MI, FI, nullptr, Indexes, LIS);
+          (void)Spilled;
+          assert(Spilled && "failed to spill SGPR to VGPR when allocated");
+          SpillFIs.set(FI);
         }
       }
     }
 
-    if (SpilledToVirtVGPRLanes) {
-      extendWWMVirtRegLiveness(MF, LIS);
-      if (LIS) {
-        // Compute the LiveInterval for the newly created virtual registers.
-        for (auto Reg : FuncInfo->getSGPRSpillVGPRs())
-          LIS->createAndComputeVirtRegInterval(Reg);
-      }
-    }
-
+    // FIXME: Adding to live-ins redundant with reserving registers.
     for (MachineBasicBlock &MBB : MF) {
+      for (auto Reg : FuncInfo->getSGPRSpillVGPRs())
+        MBB.addLiveIn(Reg);
+      MBB.sortUniqueLiveIns();
+
       // FIXME: The dead frame indices are replaced with a null register from
       // the debug value instructions. We should instead, update it with the
       // correct register value. But not sure the register value alone is
@@ -421,10 +334,6 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
     // lane".
     FuncInfo->removeDeadFrameIndices(MFI, /*ResetSGPRSpillStackIDs*/ false);
 
-    MadeChange = true;
-  }
-
-  if (SpilledToVirtVGPRLanes) {
     const TargetRegisterClass *RC = TRI->getWaveMaskRegClass();
     // Shift back the reserved SGPR for EXEC copy into the lowest range.
     // This SGPR is reserved to handle the whole-wave spill/copy operations
@@ -433,21 +342,20 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
     if (UnusedLowSGPR && TRI->getHWRegIndex(UnusedLowSGPR) <
                              TRI->getHWRegIndex(FuncInfo->getSGPRForEXECCopy()))
       FuncInfo->setSGPRForEXECCopy(UnusedLowSGPR);
+
+    MadeChange = true;
   } else {
-    // No SGPR spills to virtual VGPR lanes and hence there won't be any WWM
-    // spills/copies. Reset the SGPR reserved for EXEC copy.
+    // No SGPR spills and hence there won't be any WWM spills/copies. Reset the
+    // SGPR reserved for EXEC copy.
     FuncInfo->setSGPRForEXECCopy(AMDGPU::NoRegister);
   }
 
   SaveBlocks.clear();
   RestoreBlocks.clear();
 
-  // Updated the reserved registers with any physical VGPRs added for SGPR
-  // spills.
-  if (NewReservedRegs) {
-    for (Register Reg : FuncInfo->getWWMReservedRegs())
-      MRI.reserveReg(Reg, TRI);
-  }
+  // Updated the reserved registers with any VGPRs added for SGPR spills.
+  if (NewReservedRegs)
+    MRI.freezeReservedRegs(MF);
 
   return MadeChange;
 }

diff  --git a/llvm/lib/Target/AMDGPU/SILowerWWMCopies.cpp b/llvm/lib/Target/AMDGPU/SILowerWWMCopies.cpp
deleted file mode 100644
index 9c3cd1bbd6b0fb..00000000000000
--- a/llvm/lib/Target/AMDGPU/SILowerWWMCopies.cpp
+++ /dev/null
@@ -1,141 +0,0 @@
-//===-- SILowerWWMCopies.cpp - Lower Copies after regalloc ---===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-/// Lowering the WWM_COPY instructions for various register classes.
-/// AMDGPU target generates WWM_COPY instruction to 
diff erentiate WWM
-/// copy from COPY. This pass generates the necessary exec mask manipulation
-/// instructions to replicate 'Whole Wave Mode' and lowers WWM_COPY back to
-/// COPY.
-//
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPU.h"
-#include "GCNSubtarget.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "SIMachineFunctionInfo.h"
-#include "llvm/CodeGen/LiveIntervals.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/VirtRegMap.h"
-#include "llvm/InitializePasses.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "si-lower-wwm-copies"
-
-namespace {
-
-class SILowerWWMCopies : public MachineFunctionPass {
-public:
-  static char ID;
-
-  SILowerWWMCopies() : MachineFunctionPass(ID) {
-    initializeSILowerWWMCopiesPass(*PassRegistry::getPassRegistry());
-  }
-
-  bool runOnMachineFunction(MachineFunction &MF) override;
-
-  StringRef getPassName() const override { return "SI Lower WWM Copies"; }
-
-  void getAnalysisUsage(AnalysisUsage &AU) const override {
-    AU.setPreservesAll();
-    MachineFunctionPass::getAnalysisUsage(AU);
-  }
-
-private:
-  bool isSCCLiveAtMI(const MachineInstr &MI);
-  void addToWWMSpills(MachineFunction &MF, Register Reg);
-
-  LiveIntervals *LIS;
-  SlotIndexes *Indexes;
-  VirtRegMap *VRM;
-  const SIRegisterInfo *TRI;
-  const MachineRegisterInfo *MRI;
-  SIMachineFunctionInfo *MFI;
-};
-
-} // End anonymous namespace.
-
-INITIALIZE_PASS_BEGIN(SILowerWWMCopies, DEBUG_TYPE, "SI Lower WWM Copies",
-                      false, false)
-INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
-INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
-INITIALIZE_PASS_END(SILowerWWMCopies, DEBUG_TYPE, "SI Lower WWM Copies", false,
-                    false)
-
-char SILowerWWMCopies::ID = 0;
-
-char &llvm::SILowerWWMCopiesID = SILowerWWMCopies::ID;
-
-bool SILowerWWMCopies::isSCCLiveAtMI(const MachineInstr &MI) {
-  // We can't determine the liveness info if LIS isn't available. Early return
-  // in that case and always assume SCC is live.
-  if (!LIS)
-    return true;
-
-  LiveRange &LR =
-      LIS->getRegUnit(*MCRegUnitIterator(MCRegister::from(AMDGPU::SCC), TRI));
-  SlotIndex Idx = LIS->getInstructionIndex(MI);
-  return LR.liveAt(Idx);
-}
-
-// If \p Reg is assigned with a physical VGPR, add the latter into wwm-spills
-// for preserving its entire lanes at function prolog/epilog.
-void SILowerWWMCopies::addToWWMSpills(MachineFunction &MF, Register Reg) {
-  if (Reg.isPhysical())
-    return;
-
-  Register PhysReg = VRM->getPhys(Reg);
-  assert(PhysReg != VirtRegMap::NO_PHYS_REG &&
-         "should have allocated a physical register");
-
-  MFI->allocateWWMSpill(MF, PhysReg);
-}
-
-bool SILowerWWMCopies::runOnMachineFunction(MachineFunction &MF) {
-  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
-  const SIInstrInfo *TII = ST.getInstrInfo();
-
-  MFI = MF.getInfo<SIMachineFunctionInfo>();
-  LIS = getAnalysisIfAvailable<LiveIntervals>();
-  Indexes = getAnalysisIfAvailable<SlotIndexes>();
-  VRM = getAnalysisIfAvailable<VirtRegMap>();
-  TRI = ST.getRegisterInfo();
-  MRI = &MF.getRegInfo();
-
-  if (!MFI->hasVRegFlags())
-    return false;
-
-  bool Changed = false;
-  for (MachineBasicBlock &MBB : MF) {
-    for (MachineInstr &MI : MBB) {
-      if (MI.getOpcode() != AMDGPU::WWM_COPY)
-        continue;
-
-      // TODO: Club adjacent WWM ops between same exec save/restore
-      assert(TII->isVGPRCopy(MI));
-
-      // For WWM vector copies, manipulate the exec mask around the copy
-      // instruction.
-      const DebugLoc &DL = MI.getDebugLoc();
-      MachineBasicBlock::iterator InsertPt = MI.getIterator();
-      Register RegForExecCopy = MFI->getSGPRForEXECCopy();
-      TII->insertScratchExecCopy(MF, MBB, InsertPt, DL, RegForExecCopy,
-                                 isSCCLiveAtMI(MI), Indexes);
-      TII->restoreExec(MF, MBB, ++InsertPt, DL, RegForExecCopy, Indexes);
-      addToWWMSpills(MF, MI.getOperand(0).getReg());
-      LLVM_DEBUG(dbgs() << "WWM copy manipulation for " << MI);
-
-      // Lower WWM_COPY back to COPY
-      MI.setDesc(TII->get(AMDGPU::COPY));
-      Changed |= true;
-    }
-  }
-
-  return Changed;
-}

diff  --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 219464eac9ec20..c9376d0ea65319 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -314,23 +314,37 @@ bool SIMachineFunctionInfo::isCalleeSavedReg(const MCPhysReg *CSRegs,
   return false;
 }
 
-bool SIMachineFunctionInfo::allocateVirtualVGPRForSGPRSpills(
-    MachineFunction &MF, int FI, unsigned LaneIndex) {
+bool SIMachineFunctionInfo::allocateVGPRForSGPRSpills(MachineFunction &MF,
+                                                      int FI,
+                                                      unsigned LaneIndex) {
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  const SIRegisterInfo *TRI = ST.getRegisterInfo();
   MachineRegisterInfo &MRI = MF.getRegInfo();
   Register LaneVGPR;
   if (!LaneIndex) {
-    LaneVGPR = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+    LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF);
+    if (LaneVGPR == AMDGPU::NoRegister) {
+      // We have no VGPRs left for spilling SGPRs. Reset because we will not
+      // partially spill the SGPR to VGPRs.
+      SGPRSpillToVGPRLanes.erase(FI);
+      return false;
+    }
+
     SpillVGPRs.push_back(LaneVGPR);
+    // Add this register as live-in to all blocks to avoid machine verifier
+    // complaining about use of an undefined physical register.
+    for (MachineBasicBlock &BB : MF)
+      BB.addLiveIn(LaneVGPR);
   } else {
     LaneVGPR = SpillVGPRs.back();
   }
 
-  SGPRSpillsToVirtualVGPRLanes[FI].push_back(
+  SGPRSpillToVGPRLanes[FI].push_back(
       SIRegisterInfo::SpilledReg(LaneVGPR, LaneIndex));
   return true;
 }
 
-bool SIMachineFunctionInfo::allocatePhysicalVGPRForSGPRSpills(
+bool SIMachineFunctionInfo::allocateVGPRForPrologEpilogSGPRSpills(
     MachineFunction &MF, int FI, unsigned LaneIndex) {
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   const SIRegisterInfo *TRI = ST.getRegisterInfo();
@@ -341,21 +355,16 @@ bool SIMachineFunctionInfo::allocatePhysicalVGPRForSGPRSpills(
     if (LaneVGPR == AMDGPU::NoRegister) {
       // We have no VGPRs left for spilling SGPRs. Reset because we will not
       // partially spill the SGPR to VGPRs.
-      SGPRSpillsToPhysicalVGPRLanes.erase(FI);
+      PrologEpilogSGPRSpillToVGPRLanes.erase(FI);
       return false;
     }
 
     allocateWWMSpill(MF, LaneVGPR);
-    reserveWWMRegister(LaneVGPR);
-    for (MachineBasicBlock &MBB : MF) {
-      MBB.addLiveIn(LaneVGPR);
-      MBB.sortUniqueLiveIns();
-    }
   } else {
-    LaneVGPR = WWMReservedRegs.back();
+    LaneVGPR = WWMSpills.back().first;
   }
 
-  SGPRSpillsToPhysicalVGPRLanes[FI].push_back(
+  PrologEpilogSGPRSpillToVGPRLanes[FI].push_back(
       SIRegisterInfo::SpilledReg(LaneVGPR, LaneIndex));
   return true;
 }
@@ -364,8 +373,8 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPRLane(MachineFunction &MF,
                                                         int FI,
                                                         bool IsPrologEpilog) {
   std::vector<SIRegisterInfo::SpilledReg> &SpillLanes =
-      IsPrologEpilog ? SGPRSpillsToPhysicalVGPRLanes[FI]
-                     : SGPRSpillsToVirtualVGPRLanes[FI];
+      IsPrologEpilog ? PrologEpilogSGPRSpillToVGPRLanes[FI]
+                     : SGPRSpillToVGPRLanes[FI];
 
   // This has already been allocated.
   if (!SpillLanes.empty())
@@ -386,14 +395,15 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPRLane(MachineFunction &MF,
          "not spilling SGPRs to VGPRs");
 
   unsigned &NumSpillLanes =
-      IsPrologEpilog ? NumPhysicalVGPRSpillLanes : NumVirtualVGPRSpillLanes;
+      IsPrologEpilog ? NumVGPRPrologEpilogSpillLanes : NumVGPRSpillLanes;
 
   for (unsigned I = 0; I < NumLanes; ++I, ++NumSpillLanes) {
     unsigned LaneIndex = (NumSpillLanes % WaveSize);
 
-    bool Allocated = IsPrologEpilog
-                         ? allocatePhysicalVGPRForSGPRSpills(MF, FI, LaneIndex)
-                         : allocateVirtualVGPRForSGPRSpills(MF, FI, LaneIndex);
+    bool Allocated =
+        IsPrologEpilog
+            ? allocateVGPRForPrologEpilogSGPRSpills(MF, FI, LaneIndex)
+            : allocateVGPRForSGPRSpills(MF, FI, LaneIndex);
     if (!Allocated) {
       NumSpillLanes -= I;
       return false;
@@ -474,25 +484,16 @@ bool SIMachineFunctionInfo::allocateVGPRSpillToAGPR(MachineFunction &MF,
 
 bool SIMachineFunctionInfo::removeDeadFrameIndices(
     MachineFrameInfo &MFI, bool ResetSGPRSpillStackIDs) {
-  // Remove dead frame indices from function frame, however keep FP & BP since
-  // spills for them haven't been inserted yet. And also make sure to remove the
-  // frame indices from `SGPRSpillsToVirtualVGPRLanes` data structure,
-  // otherwise, it could result in an unexpected side effect and bug, in case of
-  // any re-mapping of freed frame indices by later pass(es) like "stack slot
+  // Remove dead frame indices from function frame. And also make sure to remove
+  // the frame indices from `SGPRSpillToVGPRLanes` data structure, otherwise, it
+  // could result in an unexpected side effect and bug, in case of any
+  // re-mapping of freed frame indices by later pass(es) like "stack slot
   // coloring".
-  for (auto &R : make_early_inc_range(SGPRSpillsToVirtualVGPRLanes)) {
+  for (auto &R : make_early_inc_range(SGPRSpillToVGPRLanes)) {
     MFI.RemoveStackObject(R.first);
-    SGPRSpillsToVirtualVGPRLanes.erase(R.first);
+    SGPRSpillToVGPRLanes.erase(R.first);
   }
 
-  // Remove the dead frame indices of CSR SGPRs which are spilled to physical
-  // VGPR lanes during SILowerSGPRSpills pass.
-  if (!ResetSGPRSpillStackIDs) {
-    for (auto &R : make_early_inc_range(SGPRSpillsToPhysicalVGPRLanes)) {
-      MFI.RemoveStackObject(R.first);
-      SGPRSpillsToPhysicalVGPRLanes.erase(R.first);
-    }
-  }
   bool HaveSGPRToMemory = false;
 
   if (ResetSGPRSpillStackIDs) {

diff  --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 37572d30dff6d0..3b4747adf12544 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -496,16 +496,15 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
   };
 
 private:
-  // To track virtual VGPR + lane index for each subregister of the SGPR spilled
-  // to frameindex key during SILowerSGPRSpills pass.
+  // To track VGPR + lane index for each subregister of the SGPR spilled to
+  // frameindex key during SILowerSGPRSpills pass.
+  DenseMap<int, std::vector<SIRegisterInfo::SpilledReg>> SGPRSpillToVGPRLanes;
+  // To track VGPR + lane index for spilling special SGPRs like Frame Pointer
+  // identified during PrologEpilogInserter.
   DenseMap<int, std::vector<SIRegisterInfo::SpilledReg>>
-      SGPRSpillsToVirtualVGPRLanes;
-  // To track physical VGPR + lane index for CSR SGPR spills and special SGPRs
-  // like Frame Pointer identified during PrologEpilogInserter.
-  DenseMap<int, std::vector<SIRegisterInfo::SpilledReg>>
-      SGPRSpillsToPhysicalVGPRLanes;
-  unsigned NumVirtualVGPRSpillLanes = 0;
-  unsigned NumPhysicalVGPRSpillLanes = 0;
+      PrologEpilogSGPRSpillToVGPRLanes;
+  unsigned NumVGPRSpillLanes = 0;
+  unsigned NumVGPRPrologEpilogSpillLanes = 0;
   SmallVector<Register, 2> SpillVGPRs;
   using WWMSpillsMap = MapVector<Register, int>;
   // To track the registers used in instructions that can potentially modify the
@@ -549,10 +548,10 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
 private:
   Register VGPRForAGPRCopy;
 
-  bool allocateVirtualVGPRForSGPRSpills(MachineFunction &MF, int FI,
-                                        unsigned LaneIndex);
-  bool allocatePhysicalVGPRForSGPRSpills(MachineFunction &MF, int FI,
-                                         unsigned LaneIndex);
+  bool allocateVGPRForSGPRSpills(MachineFunction &MF, int FI,
+                                 unsigned LaneIndex);
+  bool allocateVGPRForPrologEpilogSGPRSpills(MachineFunction &MF, int FI,
+                                             unsigned LaneIndex);
 
 public:
   Register getVGPRForAGPRCopy() const {
@@ -584,9 +583,9 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
   SIModeRegisterDefaults getMode() const { return Mode; }
 
   ArrayRef<SIRegisterInfo::SpilledReg>
-  getSGPRSpillToVirtualVGPRLanes(int FrameIndex) const {
-    auto I = SGPRSpillsToVirtualVGPRLanes.find(FrameIndex);
-    return (I == SGPRSpillsToVirtualVGPRLanes.end())
+  getSGPRSpillToVGPRLanes(int FrameIndex) const {
+    auto I = SGPRSpillToVGPRLanes.find(FrameIndex);
+    return (I == SGPRSpillToVGPRLanes.end())
                ? ArrayRef<SIRegisterInfo::SpilledReg>()
                : ArrayRef(I->second);
   }
@@ -648,9 +647,9 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
   }
 
   ArrayRef<SIRegisterInfo::SpilledReg>
-  getSGPRSpillToPhysicalVGPRLanes(int FrameIndex) const {
-    auto I = SGPRSpillsToPhysicalVGPRLanes.find(FrameIndex);
-    return (I == SGPRSpillsToPhysicalVGPRLanes.end())
+  getPrologEpilogSGPRSpillToVGPRLanes(int FrameIndex) const {
+    auto I = PrologEpilogSGPRSpillToVGPRLanes.find(FrameIndex);
+    return (I == PrologEpilogSGPRSpillToVGPRLanes.end())
                ? ArrayRef<SIRegisterInfo::SpilledReg>()
                : ArrayRef(I->second);
   }
@@ -668,8 +667,6 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
     return VRegFlags.inBounds(Reg) && VRegFlags[Reg] & Flag;
   }
 
-  bool hasVRegFlags() { return VRegFlags.size(); }
-
   void allocateWWMSpill(MachineFunction &MF, Register VGPR, uint64_t Size = 4,
                         Align Alignment = Align(4));
 

diff  --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 1d50dff4a7d931..c2a2721662418d 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -712,6 +712,9 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
   for (MCPhysReg Reg : MFI->getVGPRSpillAGPRs())
     reserveRegisterTuples(Reserved, Reg);
 
+  for (auto Reg : MFI->getSGPRSpillVGPRs())
+    reserveRegisterTuples(Reserved, Reg);
+
   return Reserved;
 }
 
@@ -1733,13 +1736,10 @@ void SIRegisterInfo::buildVGPRSpillLoadStore(SGPRSpillBuilder &SB, int Index,
 
 bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, int Index,
                                RegScavenger *RS, SlotIndexes *Indexes,
-                               LiveIntervals *LIS, bool OnlyToVGPR,
-                               bool SpillToPhysVGPRLane) const {
+                               LiveIntervals *LIS, bool OnlyToVGPR) const {
   SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS);
 
-  ArrayRef<SpilledReg> VGPRSpills =
-      SpillToPhysVGPRLane ? SB.MFI.getSGPRSpillToPhysicalVGPRLanes(Index)
-                          : SB.MFI.getSGPRSpillToVirtualVGPRLanes(Index);
+  ArrayRef<SpilledReg> VGPRSpills = SB.MFI.getSGPRSpillToVGPRLanes(Index);
   bool SpillToVGPR = !VGPRSpills.empty();
   if (OnlyToVGPR && !SpillToVGPR)
     return false;
@@ -1856,13 +1856,10 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, int Index,
 
 bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, int Index,
                                  RegScavenger *RS, SlotIndexes *Indexes,
-                                 LiveIntervals *LIS, bool OnlyToVGPR,
-                                 bool SpillToPhysVGPRLane) const {
+                                 LiveIntervals *LIS, bool OnlyToVGPR) const {
   SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS);
 
-  ArrayRef<SpilledReg> VGPRSpills =
-      SpillToPhysVGPRLane ? SB.MFI.getSGPRSpillToPhysicalVGPRLanes(Index)
-                          : SB.MFI.getSGPRSpillToVirtualVGPRLanes(Index);
+  ArrayRef<SpilledReg> VGPRSpills = SB.MFI.getSGPRSpillToVGPRLanes(Index);
   bool SpillToVGPR = !VGPRSpills.empty();
   if (OnlyToVGPR && !SpillToVGPR)
     return false;
@@ -2008,7 +2005,7 @@ bool SIRegisterInfo::spillEmergencySGPR(MachineBasicBlock::iterator MI,
 /// handled.
 bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex(
     MachineBasicBlock::iterator MI, int FI, RegScavenger *RS,
-    SlotIndexes *Indexes, LiveIntervals *LIS, bool SpillToPhysVGPRLane) const {
+    SlotIndexes *Indexes, LiveIntervals *LIS) const {
   switch (MI->getOpcode()) {
   case AMDGPU::SI_SPILL_S1024_SAVE:
   case AMDGPU::SI_SPILL_S512_SAVE:
@@ -2024,7 +2021,7 @@ bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex(
   case AMDGPU::SI_SPILL_S96_SAVE:
   case AMDGPU::SI_SPILL_S64_SAVE:
   case AMDGPU::SI_SPILL_S32_SAVE:
-    return spillSGPR(MI, FI, RS, Indexes, LIS, true, SpillToPhysVGPRLane);
+    return spillSGPR(MI, FI, RS, Indexes, LIS, true);
   case AMDGPU::SI_SPILL_S1024_RESTORE:
   case AMDGPU::SI_SPILL_S512_RESTORE:
   case AMDGPU::SI_SPILL_S384_RESTORE:
@@ -2039,7 +2036,7 @@ bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex(
   case AMDGPU::SI_SPILL_S96_RESTORE:
   case AMDGPU::SI_SPILL_S64_RESTORE:
   case AMDGPU::SI_SPILL_S32_RESTORE:
-    return restoreSGPR(MI, FI, RS, Indexes, LIS, true, SpillToPhysVGPRLane);
+    return restoreSGPR(MI, FI, RS, Indexes, LIS, true);
   default:
     llvm_unreachable("not an SGPR spill instruction");
   }

diff  --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index 17fce43891c52b..2120b47c581e05 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -142,17 +142,14 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo {
   void buildVGPRSpillLoadStore(SGPRSpillBuilder &SB, int Index, int Offset,
                                bool IsLoad, bool IsKill = true) const;
 
-  /// If \p OnlyToVGPR is true, this will only succeed if this manages to find a
-  /// free VGPR lane to spill.
+  /// If \p OnlyToVGPR is true, this will only succeed if this
   bool spillSGPR(MachineBasicBlock::iterator MI, int FI, RegScavenger *RS,
                  SlotIndexes *Indexes = nullptr, LiveIntervals *LIS = nullptr,
-                 bool OnlyToVGPR = false,
-                 bool SpillToPhysVGPRLane = false) const;
+                 bool OnlyToVGPR = false) const;
 
   bool restoreSGPR(MachineBasicBlock::iterator MI, int FI, RegScavenger *RS,
                    SlotIndexes *Indexes = nullptr, LiveIntervals *LIS = nullptr,
-                   bool OnlyToVGPR = false,
-                   bool SpillToPhysVGPRLane = false) const;
+                   bool OnlyToVGPR = false) const;
 
   bool spillEmergencySGPR(MachineBasicBlock::iterator MI,
                           MachineBasicBlock &RestoreMBB, Register SGPR,
@@ -166,10 +163,10 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo {
                            unsigned FIOperandNum,
                            RegScavenger *RS) const override;
 
-  bool eliminateSGPRToVGPRSpillFrameIndex(
-      MachineBasicBlock::iterator MI, int FI, RegScavenger *RS,
-      SlotIndexes *Indexes = nullptr, LiveIntervals *LIS = nullptr,
-      bool SpillToPhysVGPRLane = false) const;
+  bool eliminateSGPRToVGPRSpillFrameIndex(MachineBasicBlock::iterator MI,
+                                          int FI, RegScavenger *RS,
+                                          SlotIndexes *Indexes = nullptr,
+                                          LiveIntervals *LIS = nullptr) const;
 
   StringRef getRegAsmName(MCRegister Reg) const override;
 

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll
index 604caf572b0fe8..53ac78d2b6caa2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll
@@ -11,12 +11,13 @@ define ptr addrspace(1) @call_assert_align() {
 ; CHECK-NEXT:    s_mov_b32 s33, s32
 ; CHECK-NEXT:    s_or_saveexec_b64 s[18:19], -1
 ; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; CHECK-NEXT:    s_mov_b64 exec, s[18:19]
-; CHECK-NEXT:    v_writelane_b32 v40, s16, 2
 ; CHECK-NEXT:    s_addk_i32 s32, 0x400
 ; CHECK-NEXT:    v_writelane_b32 v40, s30, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0
+; CHECK-NEXT:    v_writelane_b32 v41, s16, 0
 ; CHECK-NEXT:    v_writelane_b32 v40, s31, 1
 ; CHECK-NEXT:    s_getpc_b64 s[16:17]
 ; CHECK-NEXT:    s_add_u32 s16, s16, ext at rel32@lo+4
@@ -27,9 +28,10 @@ define ptr addrspace(1) @call_assert_align() {
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    v_readlane_b32 s31, v40, 1
 ; CHECK-NEXT:    v_readlane_b32 s30, v40, 0
-; CHECK-NEXT:    v_readlane_b32 s4, v40, 2
+; CHECK-NEXT:    v_readlane_b32 s4, v41, 0
 ; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; CHECK-NEXT:    s_mov_b64 exec, s[6:7]
 ; CHECK-NEXT:    s_addk_i32 s32, 0xfc00
 ; CHECK-NEXT:    s_mov_b32 s33, s4

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
index e597ce6f114a6b..48c1ef46c19761 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
@@ -228,17 +228,18 @@ define void @func_caller_stack() {
 ; MUBUF-NEXT:    s_mov_b32 s33, s32
 ; MUBUF-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; MUBUF-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; MUBUF-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; MUBUF-NEXT:    s_mov_b64 exec, s[6:7]
 ; MUBUF-NEXT:    s_addk_i32 s32, 0x400
 ; MUBUF-NEXT:    v_mov_b32_e32 v0, 9
 ; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4
 ; MUBUF-NEXT:    v_mov_b32_e32 v0, 10
-; MUBUF-NEXT:    v_writelane_b32 v40, s4, 2
 ; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:8
 ; MUBUF-NEXT:    v_mov_b32_e32 v0, 11
 ; MUBUF-NEXT:    v_writelane_b32 v40, s30, 0
 ; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:12
 ; MUBUF-NEXT:    v_mov_b32_e32 v0, 12
+; MUBUF-NEXT:    v_writelane_b32 v41, s4, 0
 ; MUBUF-NEXT:    v_writelane_b32 v40, s31, 1
 ; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:16
 ; MUBUF-NEXT:    s_getpc_b64 s[4:5]
@@ -247,9 +248,10 @@ define void @func_caller_stack() {
 ; MUBUF-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; MUBUF-NEXT:    v_readlane_b32 s31, v40, 1
 ; MUBUF-NEXT:    v_readlane_b32 s30, v40, 0
-; MUBUF-NEXT:    v_readlane_b32 s4, v40, 2
+; MUBUF-NEXT:    v_readlane_b32 s4, v41, 0
 ; MUBUF-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; MUBUF-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; MUBUF-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; MUBUF-NEXT:    s_mov_b64 exec, s[6:7]
 ; MUBUF-NEXT:    s_addk_i32 s32, 0xfc00
 ; MUBUF-NEXT:    s_mov_b32 s33, s4
@@ -263,9 +265,10 @@ define void @func_caller_stack() {
 ; FLATSCR-NEXT:    s_mov_b32 s33, s32
 ; FLATSCR-NEXT:    s_or_saveexec_b64 s[2:3], -1
 ; FLATSCR-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; FLATSCR-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; FLATSCR-NEXT:    s_mov_b64 exec, s[2:3]
 ; FLATSCR-NEXT:    s_add_i32 s32, s32, 16
-; FLATSCR-NEXT:    v_writelane_b32 v40, s0, 2
+; FLATSCR-NEXT:    v_writelane_b32 v41, s0, 0
 ; FLATSCR-NEXT:    s_add_u32 s0, s32, 4
 ; FLATSCR-NEXT:    v_mov_b32_e32 v0, 9
 ; FLATSCR-NEXT:    scratch_store_dword off, v0, s0
@@ -286,9 +289,10 @@ define void @func_caller_stack() {
 ; FLATSCR-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; FLATSCR-NEXT:    v_readlane_b32 s31, v40, 1
 ; FLATSCR-NEXT:    v_readlane_b32 s30, v40, 0
-; FLATSCR-NEXT:    v_readlane_b32 s0, v40, 2
+; FLATSCR-NEXT:    v_readlane_b32 s0, v41, 0
 ; FLATSCR-NEXT:    s_or_saveexec_b64 s[2:3], -1
 ; FLATSCR-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; FLATSCR-NEXT:    scratch_load_dword v41, off, s33 offset:4 ; 4-byte Folded Reload
 ; FLATSCR-NEXT:    s_mov_b64 exec, s[2:3]
 ; FLATSCR-NEXT:    s_add_i32 s32, s32, -16
 ; FLATSCR-NEXT:    s_mov_b32 s33, s0
@@ -306,12 +310,13 @@ define void @func_caller_byval(ptr addrspace(5) %argptr) {
 ; MUBUF-NEXT:    s_mov_b32 s33, s32
 ; MUBUF-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; MUBUF-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; MUBUF-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; MUBUF-NEXT:    s_mov_b64 exec, s[6:7]
 ; MUBUF-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 offen
 ; MUBUF-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 offen offset:4
 ; MUBUF-NEXT:    s_addk_i32 s32, 0x400
-; MUBUF-NEXT:    v_writelane_b32 v40, s4, 2
 ; MUBUF-NEXT:    v_writelane_b32 v40, s30, 0
+; MUBUF-NEXT:    v_writelane_b32 v41, s4, 0
 ; MUBUF-NEXT:    v_writelane_b32 v40, s31, 1
 ; MUBUF-NEXT:    s_getpc_b64 s[4:5]
 ; MUBUF-NEXT:    s_add_u32 s4, s4, external_void_func_byval at rel32@lo+4
@@ -372,9 +377,10 @@ define void @func_caller_byval(ptr addrspace(5) %argptr) {
 ; MUBUF-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; MUBUF-NEXT:    v_readlane_b32 s31, v40, 1
 ; MUBUF-NEXT:    v_readlane_b32 s30, v40, 0
-; MUBUF-NEXT:    v_readlane_b32 s4, v40, 2
+; MUBUF-NEXT:    v_readlane_b32 s4, v41, 0
 ; MUBUF-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; MUBUF-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; MUBUF-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; MUBUF-NEXT:    s_mov_b64 exec, s[6:7]
 ; MUBUF-NEXT:    s_addk_i32 s32, 0xfc00
 ; MUBUF-NEXT:    s_mov_b32 s33, s4
@@ -388,11 +394,12 @@ define void @func_caller_byval(ptr addrspace(5) %argptr) {
 ; FLATSCR-NEXT:    s_mov_b32 s33, s32
 ; FLATSCR-NEXT:    s_or_saveexec_b64 s[2:3], -1
 ; FLATSCR-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; FLATSCR-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; FLATSCR-NEXT:    s_mov_b64 exec, s[2:3]
 ; FLATSCR-NEXT:    scratch_load_dwordx2 v[1:2], v0, off
 ; FLATSCR-NEXT:    s_add_i32 s32, s32, 16
 ; FLATSCR-NEXT:    v_add_u32_e32 v3, 8, v0
-; FLATSCR-NEXT:    v_writelane_b32 v40, s0, 2
+; FLATSCR-NEXT:    v_writelane_b32 v41, s0, 0
 ; FLATSCR-NEXT:    s_add_u32 s0, s32, 8
 ; FLATSCR-NEXT:    v_writelane_b32 v40, s30, 0
 ; FLATSCR-NEXT:    s_add_u32 s2, s32, 56
@@ -437,9 +444,10 @@ define void @func_caller_byval(ptr addrspace(5) %argptr) {
 ; FLATSCR-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; FLATSCR-NEXT:    v_readlane_b32 s31, v40, 1
 ; FLATSCR-NEXT:    v_readlane_b32 s30, v40, 0
-; FLATSCR-NEXT:    v_readlane_b32 s0, v40, 2
+; FLATSCR-NEXT:    v_readlane_b32 s0, v41, 0
 ; FLATSCR-NEXT:    s_or_saveexec_b64 s[2:3], -1
 ; FLATSCR-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; FLATSCR-NEXT:    scratch_load_dword v41, off, s33 offset:4 ; 4-byte Folded Reload
 ; FLATSCR-NEXT:    s_mov_b64 exec, s[2:3]
 ; FLATSCR-NEXT:    s_add_i32 s32, s32, -16
 ; FLATSCR-NEXT:    s_mov_b32 s33, s0

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll
index 1d1ff4251c53a8..da72b8f6132651 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll
@@ -12,39 +12,32 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) {
 ; CHECK:       ; %bb.0: ; %bb
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_xor_saveexec_b32 s4, -1
-; CHECK-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; CHECK-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; CHECK-NEXT:    s_mov_b32 exec_lo, s4
-; CHECK-NEXT:    ; implicit-def: $vgpr8
-; CHECK-NEXT:    v_mov_b32_e32 v8, v0
-; CHECK-NEXT:    s_or_saveexec_b32 s21, -1
-; CHECK-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
-; CHECK-NEXT:    s_mov_b32 exec_lo, s21
-; CHECK-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
 ; CHECK-NEXT:    v_mov_b32_e32 v15, v1
-; CHECK-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; CHECK-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
 ; CHECK-NEXT:    v_mov_b32_e32 v14, v2
-; CHECK-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
 ; CHECK-NEXT:    v_mov_b32_e32 v13, v3
-; CHECK-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
 ; CHECK-NEXT:    v_mov_b32_e32 v12, v4
-; CHECK-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
 ; CHECK-NEXT:    v_mov_b32_e32 v11, v5
-; CHECK-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
 ; CHECK-NEXT:    v_mov_b32_e32 v10, v6
-; CHECK-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
 ; CHECK-NEXT:    v_mov_b32_e32 v9, v7
-; CHECK-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; CHECK-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 killed $exec
-; CHECK-NEXT:    v_mov_b32_e32 v2, v15
-; CHECK-NEXT:    v_mov_b32_e32 v3, v14
-; CHECK-NEXT:    v_mov_b32_e32 v4, v13
-; CHECK-NEXT:    v_mov_b32_e32 v5, v12
-; CHECK-NEXT:    v_mov_b32_e32 v6, v11
-; CHECK-NEXT:    v_mov_b32_e32 v7, v10
-; CHECK-NEXT:    v_mov_b32_e32 v8, v9
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; CHECK-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 killed $exec
+; CHECK-NEXT:    v_mov_b32_e32 v1, v15
+; CHECK-NEXT:    v_mov_b32_e32 v2, v14
+; CHECK-NEXT:    v_mov_b32_e32 v3, v13
+; CHECK-NEXT:    v_mov_b32_e32 v4, v12
+; CHECK-NEXT:    v_mov_b32_e32 v5, v11
+; CHECK-NEXT:    v_mov_b32_e32 v6, v10
+; CHECK-NEXT:    v_mov_b32_e32 v7, v9
+; CHECK-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; CHECK-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
 ; CHECK-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; CHECK-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
@@ -52,40 +45,34 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) {
 ; CHECK-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
 ; CHECK-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
 ; CHECK-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; CHECK-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
 ; CHECK-NEXT:    s_mov_b32 s8, 0
 ; CHECK-NEXT:    s_mov_b32 s4, s8
 ; CHECK-NEXT:    s_mov_b32 s5, s8
 ; CHECK-NEXT:    s_mov_b32 s6, s8
 ; CHECK-NEXT:    s_mov_b32 s7, s8
-; CHECK-NEXT:    v_writelane_b32 v0, s4, 0
-; CHECK-NEXT:    v_writelane_b32 v0, s5, 1
-; CHECK-NEXT:    v_writelane_b32 v0, s6, 2
-; CHECK-NEXT:    v_writelane_b32 v0, s7, 3
+; CHECK-NEXT:    v_writelane_b32 v8, s4, 0
+; CHECK-NEXT:    v_writelane_b32 v8, s5, 1
+; CHECK-NEXT:    v_writelane_b32 v8, s6, 2
+; CHECK-NEXT:    v_writelane_b32 v8, s7, 3
 ; CHECK-NEXT:    s_mov_b32 s6, 0
 ; CHECK-NEXT:    s_mov_b32 s4, s6
 ; CHECK-NEXT:    s_mov_b32 s5, s6
-; CHECK-NEXT:    v_mov_b32_e32 v1, s4
-; CHECK-NEXT:    v_mov_b32_e32 v2, s5
+; CHECK-NEXT:    v_mov_b32_e32 v0, s4
+; CHECK-NEXT:    v_mov_b32_e32 v1, s5
+; CHECK-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; CHECK-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; CHECK-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; CHECK-NEXT:    s_mov_b32 s4, exec_lo
-; CHECK-NEXT:    v_writelane_b32 v0, s4, 4
-; CHECK-NEXT:    s_or_saveexec_b32 s21, -1
-; CHECK-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
-; CHECK-NEXT:    s_mov_b32 exec_lo, s21
+; CHECK-NEXT:    v_writelane_b32 v8, s4, 4
 ; CHECK-NEXT:  .LBB0_1: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    s_or_saveexec_b32 s21, -1
-; CHECK-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
-; CHECK-NEXT:    s_mov_b32 exec_lo, s21
-; CHECK-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; CHECK-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; CHECK-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; CHECK-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; CHECK-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; CHECK-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; CHECK-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; CHECK-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
 ; CHECK-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
 ; CHECK-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
 ; CHECK-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
@@ -93,16 +80,15 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) {
 ; CHECK-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
 ; CHECK-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
 ; CHECK-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; CHECK-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_readfirstlane_b32 s12, v8
-; CHECK-NEXT:    v_readfirstlane_b32 s10, v7
-; CHECK-NEXT:    v_readfirstlane_b32 s9, v6
-; CHECK-NEXT:    v_readfirstlane_b32 s8, v5
-; CHECK-NEXT:    v_readfirstlane_b32 s7, v4
-; CHECK-NEXT:    v_readfirstlane_b32 s6, v3
-; CHECK-NEXT:    v_readfirstlane_b32 s5, v2
-; CHECK-NEXT:    v_readfirstlane_b32 s4, v1
+; CHECK-NEXT:    v_readfirstlane_b32 s12, v7
+; CHECK-NEXT:    v_readfirstlane_b32 s10, v6
+; CHECK-NEXT:    v_readfirstlane_b32 s9, v5
+; CHECK-NEXT:    v_readfirstlane_b32 s8, v4
+; CHECK-NEXT:    v_readfirstlane_b32 s7, v3
+; CHECK-NEXT:    v_readfirstlane_b32 s6, v2
+; CHECK-NEXT:    v_readfirstlane_b32 s5, v1
+; CHECK-NEXT:    v_readfirstlane_b32 s4, v0
 ; CHECK-NEXT:    ; kill: def $sgpr12 killed $sgpr12 def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
 ; CHECK-NEXT:    s_mov_b32 s13, s10
 ; CHECK-NEXT:    s_mov_b32 s14, s9
@@ -111,84 +97,68 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) {
 ; CHECK-NEXT:    s_mov_b32 s17, s6
 ; CHECK-NEXT:    s_mov_b32 s18, s5
 ; CHECK-NEXT:    s_mov_b32 s19, s4
-; CHECK-NEXT:    v_writelane_b32 v0, s12, 5
-; CHECK-NEXT:    v_writelane_b32 v0, s13, 6
-; CHECK-NEXT:    v_writelane_b32 v0, s14, 7
-; CHECK-NEXT:    v_writelane_b32 v0, s15, 8
-; CHECK-NEXT:    v_writelane_b32 v0, s16, 9
-; CHECK-NEXT:    v_writelane_b32 v0, s17, 10
-; CHECK-NEXT:    v_writelane_b32 v0, s18, 11
-; CHECK-NEXT:    v_writelane_b32 v0, s19, 12
-; CHECK-NEXT:    v_mov_b32_e32 v7, v9
-; CHECK-NEXT:    v_mov_b32_e32 v8, v10
-; CHECK-NEXT:    v_mov_b32_e32 v5, v11
-; CHECK-NEXT:    v_mov_b32_e32 v6, v12
-; CHECK-NEXT:    v_mov_b32_e32 v3, v13
-; CHECK-NEXT:    v_mov_b32_e32 v4, v14
-; CHECK-NEXT:    v_mov_b32_e32 v1, v15
-; CHECK-NEXT:    v_mov_b32_e32 v2, v16
+; CHECK-NEXT:    v_writelane_b32 v8, s12, 5
+; CHECK-NEXT:    v_writelane_b32 v8, s13, 6
+; CHECK-NEXT:    v_writelane_b32 v8, s14, 7
+; CHECK-NEXT:    v_writelane_b32 v8, s15, 8
+; CHECK-NEXT:    v_writelane_b32 v8, s16, 9
+; CHECK-NEXT:    v_writelane_b32 v8, s17, 10
+; CHECK-NEXT:    v_writelane_b32 v8, s18, 11
+; CHECK-NEXT:    v_writelane_b32 v8, s19, 12
+; CHECK-NEXT:    v_mov_b32_e32 v6, v9
+; CHECK-NEXT:    v_mov_b32_e32 v7, v10
+; CHECK-NEXT:    v_mov_b32_e32 v4, v11
+; CHECK-NEXT:    v_mov_b32_e32 v5, v12
+; CHECK-NEXT:    v_mov_b32_e32 v2, v13
+; CHECK-NEXT:    v_mov_b32_e32 v3, v14
+; CHECK-NEXT:    v_mov_b32_e32 v0, v15
+; CHECK-NEXT:    v_mov_b32_e32 v1, v16
 ; CHECK-NEXT:    s_mov_b64 s[4:5], s[12:13]
 ; CHECK-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; CHECK-NEXT:    s_mov_b64 s[8:9], s[16:17]
 ; CHECK-NEXT:    s_mov_b64 s[6:7], s[18:19]
-; CHECK-NEXT:    v_cmp_eq_u64_e64 s4, s[4:5], v[7:8]
-; CHECK-NEXT:    v_cmp_eq_u64_e64 s5, s[10:11], v[5:6]
+; CHECK-NEXT:    v_cmp_eq_u64_e64 s4, s[4:5], v[6:7]
+; CHECK-NEXT:    v_cmp_eq_u64_e64 s5, s[10:11], v[4:5]
 ; CHECK-NEXT:    s_and_b32 s4, s4, s5
-; CHECK-NEXT:    v_cmp_eq_u64_e64 s5, s[8:9], v[3:4]
+; CHECK-NEXT:    v_cmp_eq_u64_e64 s5, s[8:9], v[2:3]
 ; CHECK-NEXT:    s_and_b32 s4, s4, s5
-; CHECK-NEXT:    v_cmp_eq_u64_e64 s5, s[6:7], v[1:2]
+; CHECK-NEXT:    v_cmp_eq_u64_e64 s5, s[6:7], v[0:1]
 ; CHECK-NEXT:    s_and_b32 s4, s4, s5
 ; CHECK-NEXT:    s_and_saveexec_b32 s4, s4
-; CHECK-NEXT:    v_writelane_b32 v0, s4, 13
-; CHECK-NEXT:    s_or_saveexec_b32 s21, -1
-; CHECK-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
-; CHECK-NEXT:    s_mov_b32 exec_lo, s21
+; CHECK-NEXT:    v_writelane_b32 v8, s4, 13
 ; CHECK-NEXT:  ; %bb.2: ; in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; CHECK-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; CHECK-NEXT:    s_or_saveexec_b32 s21, -1
-; CHECK-NEXT:    buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
-; CHECK-NEXT:    s_mov_b32 exec_lo, s21
+; CHECK-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; CHECK-NEXT:    v_readlane_b32 s4, v8, 13
+; CHECK-NEXT:    v_readlane_b32 s8, v8, 5
+; CHECK-NEXT:    v_readlane_b32 s9, v8, 6
+; CHECK-NEXT:    v_readlane_b32 s10, v8, 7
+; CHECK-NEXT:    v_readlane_b32 s11, v8, 8
+; CHECK-NEXT:    v_readlane_b32 s12, v8, 9
+; CHECK-NEXT:    v_readlane_b32 s13, v8, 10
+; CHECK-NEXT:    v_readlane_b32 s14, v8, 11
+; CHECK-NEXT:    v_readlane_b32 s15, v8, 12
+; CHECK-NEXT:    v_readlane_b32 s16, v8, 0
+; CHECK-NEXT:    v_readlane_b32 s17, v8, 1
+; CHECK-NEXT:    v_readlane_b32 s18, v8, 2
+; CHECK-NEXT:    v_readlane_b32 s19, v8, 3
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_readlane_b32 s4, v2, 13
-; CHECK-NEXT:    v_readlane_b32 s8, v2, 5
-; CHECK-NEXT:    v_readlane_b32 s9, v2, 6
-; CHECK-NEXT:    v_readlane_b32 s10, v2, 7
-; CHECK-NEXT:    v_readlane_b32 s11, v2, 8
-; CHECK-NEXT:    v_readlane_b32 s12, v2, 9
-; CHECK-NEXT:    v_readlane_b32 s13, v2, 10
-; CHECK-NEXT:    v_readlane_b32 s14, v2, 11
-; CHECK-NEXT:    v_readlane_b32 s15, v2, 12
-; CHECK-NEXT:    v_readlane_b32 s16, v2, 0
-; CHECK-NEXT:    v_readlane_b32 s17, v2, 1
-; CHECK-NEXT:    v_readlane_b32 s18, v2, 2
-; CHECK-NEXT:    v_readlane_b32 s19, v2, 3
 ; CHECK-NEXT:    image_sample v0, v[0:1], s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_2D
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
 ; CHECK-NEXT:    s_xor_b32 exec_lo, exec_lo, s4
 ; CHECK-NEXT:    s_cbranch_execnz .LBB0_1
 ; CHECK-NEXT:  ; %bb.3:
-; CHECK-NEXT:    s_or_saveexec_b32 s21, -1
-; CHECK-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
-; CHECK-NEXT:    s_mov_b32 exec_lo, s21
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_readlane_b32 s4, v0, 4
+; CHECK-NEXT:    v_readlane_b32 s4, v8, 4
 ; CHECK-NEXT:    s_mov_b32 exec_lo, s4
 ; CHECK-NEXT:  ; %bb.4:
-; CHECK-NEXT:    s_or_saveexec_b32 s21, -1
-; CHECK-NEXT:    buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
-; CHECK-NEXT:    s_mov_b32 exec_lo, s21
-; CHECK-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
 ; CHECK-NEXT:    ; implicit-def: $sgpr4
 ; CHECK-NEXT:    v_mov_b32_e32 v1, s4
 ; CHECK-NEXT:    v_mov_b32_e32 v2, s4
 ; CHECK-NEXT:    v_mov_b32_e32 v3, s4
-; CHECK-NEXT:    ; kill: killed $vgpr4
 ; CHECK-NEXT:    s_xor_saveexec_b32 s4, -1
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; CHECK-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
 ; CHECK-NEXT:    s_mov_b32 exec_lo, s4
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
index da9601a8998c2b..2214d690952850 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
@@ -236,21 +236,23 @@ define void @sink_null_insert_pt(ptr addrspace(4) %arg0) {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[18:19], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[18:19]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    global_load_dword v0, v[0:1], off glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_writelane_b32 v40, s16, 2
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v41, s16, 0
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], 0
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s4, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s4

diff  --git a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
index c7bc584d42d5ad..3e16742e982b0b 100644
--- a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
+++ b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
@@ -21,10 +21,11 @@ define void @parent_func_missing_inputs() #0 {
 ; FIXEDABI-NEXT:    s_mov_b32 s33, s32
 ; FIXEDABI-NEXT:    s_or_saveexec_b64 s[18:19], -1
 ; FIXEDABI-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; FIXEDABI-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; FIXEDABI-NEXT:    s_mov_b64 exec, s[18:19]
-; FIXEDABI-NEXT:    v_writelane_b32 v40, s16, 2
 ; FIXEDABI-NEXT:    s_addk_i32 s32, 0x400
 ; FIXEDABI-NEXT:    v_writelane_b32 v40, s30, 0
+; FIXEDABI-NEXT:    v_writelane_b32 v41, s16, 0
 ; FIXEDABI-NEXT:    v_writelane_b32 v40, s31, 1
 ; FIXEDABI-NEXT:    s_getpc_b64 s[16:17]
 ; FIXEDABI-NEXT:    s_add_u32 s16, s16, requires_all_inputs at rel32@lo+4
@@ -32,9 +33,10 @@ define void @parent_func_missing_inputs() #0 {
 ; FIXEDABI-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; FIXEDABI-NEXT:    v_readlane_b32 s31, v40, 1
 ; FIXEDABI-NEXT:    v_readlane_b32 s30, v40, 0
-; FIXEDABI-NEXT:    v_readlane_b32 s4, v40, 2
+; FIXEDABI-NEXT:    v_readlane_b32 s4, v41, 0
 ; FIXEDABI-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; FIXEDABI-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; FIXEDABI-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; FIXEDABI-NEXT:    s_mov_b64 exec, s[6:7]
 ; FIXEDABI-NEXT:    s_addk_i32 s32, 0xfc00
 ; FIXEDABI-NEXT:    s_mov_b32 s33, s4

diff  --git a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
index 0061c0a3118ad9..824adbecfc3ae8 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
@@ -178,19 +178,19 @@ attributes #0 = { nounwind }
 ; GFX8-NEXT:        .sgpr_count:     0x28{{$}}
 ; GFX9-NEXT:        .sgpr_count:     0x2c{{$}}
 ; GCN-NEXT:        .stack_frame_size_in_bytes: 0x10{{$}}
-; GCN-NEXT:        .vgpr_count:    0x2b{{$}}
+; GCN-NEXT:        .vgpr_count:    0x2c{{$}}
 ; GCN-NEXT:      no_stack_extern_call_many_args:
 ; GCN-NEXT:        .lds_size:       0{{$}}
 ; GFX8-NEXT:        .sgpr_count:     0x28{{$}}
 ; GFX9-NEXT:        .sgpr_count:     0x2c{{$}}
 ; GCN-NEXT:        .stack_frame_size_in_bytes: 0x90{{$}}
-; GCN-NEXT:        .vgpr_count:     0x2b{{$}}
+; GCN-NEXT:        .vgpr_count:     0x2c{{$}}
 ; GCN-NEXT:      no_stack_indirect_call:
 ; GCN-NEXT:        .lds_size:       0{{$}}
 ; GFX8-NEXT:        .sgpr_count:     0x28{{$}}
 ; GFX9-NEXT:        .sgpr_count:     0x2c{{$}}
 ; GCN-NEXT:        .stack_frame_size_in_bytes: 0x10{{$}}
-; GCN-NEXT:        .vgpr_count:     0x2b{{$}}
+; GCN-NEXT:        .vgpr_count:     0x2c{{$}}
 ; GCN-NEXT:      simple_lds:
 ; GCN-NEXT:        .lds_size:       0x100{{$}}
 ; GCN-NEXT:        .sgpr_count:     0x20{{$}}
@@ -200,7 +200,7 @@ attributes #0 = { nounwind }
 ; GCN-NEXT:        .lds_size:       0x100{{$}}
 ; GCN-NEXT:        .sgpr_count:     0x28{{$}}
 ; GCN-NEXT:        .stack_frame_size_in_bytes: 0x10{{$}}
-; GCN-NEXT:        .vgpr_count:     0x29{{$}}
+; GCN-NEXT:        .vgpr_count:     0x2a{{$}}
 ; GCN-NEXT:      simple_stack:
 ; GCN-NEXT:        .lds_size:       0{{$}}
 ; GCN-NEXT:        .sgpr_count:     0x21{{$}}
@@ -216,16 +216,16 @@ attributes #0 = { nounwind }
 ; GFX8-NEXT:        .sgpr_count:     0x28{{$}}
 ; GFX9-NEXT:        .sgpr_count:     0x2c{{$}}
 ; GCN-NEXT:        .stack_frame_size_in_bytes: 0x20{{$}}
-; GCN-NEXT:        .vgpr_count:     0x2b{{$}}
+; GCN-NEXT:        .vgpr_count:     0x2c{{$}}
 ; GCN-NEXT:      simple_stack_indirect_call:
 ; GCN-NEXT:        .lds_size:       0{{$}}
 ; GFX8-NEXT:        .sgpr_count:     0x28{{$}}
 ; GFX9-NEXT:        .sgpr_count:     0x2c{{$}}
-; GCN-NEXT:        .stack_frame_size_in_bytes: 0x20{{$}}
-; GCN-NEXT:        .vgpr_count:     0x2b{{$}}
+; GCN-NEXT:        .stack_frame_size_in_bytes: 0x30{{$}}
+; GCN-NEXT:        .vgpr_count:     0x2c{{$}}
 ; GCN-NEXT:      simple_stack_recurse:
 ; GCN-NEXT:        .lds_size:       0{{$}}
 ; GCN-NEXT:        .sgpr_count:     0x28{{$}}
 ; GCN-NEXT:        .stack_frame_size_in_bytes: 0x20{{$}}
-; GCN-NEXT:        .vgpr_count:     0x2a{{$}}
+; GCN-NEXT:        .vgpr_count:     0x2b{{$}}
 ; GCN-NEXT: ...

diff  --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 1cf3b6b4232335..700325859151ed 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -1371,7 +1371,7 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) {
 ; GCN-LABEL: test_call:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s8, s33
+; GCN-NEXT:    s_mov_b32 s10, s33
 ; GCN-NEXT:    s_mov_b32 s33, s32
 ; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GCN-NEXT:    buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
@@ -1395,14 +1395,14 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) {
 ; GCN-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    s_addk_i32 s32, 0xfc00
-; GCN-NEXT:    s_mov_b32 s33, s8
+; GCN-NEXT:    s_mov_b32 s33, s10
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: test_call:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b32 s8, s33
+; GFX7-NEXT:    s_mov_b32 s10, s33
 ; GFX7-NEXT:    s_mov_b32 s33, s32
 ; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX7-NEXT:    buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
@@ -1425,14 +1425,14 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) {
 ; GFX7-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX7-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX7-NEXT:    s_mov_b32 s33, s8
+; GFX7-NEXT:    s_mov_b32 s33, s10
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: test_call:
 ; GFX8:       ; %bb.0: ; %entry
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_mov_b32 s6, s33
+; GFX8-NEXT:    s_mov_b32 s8, s33
 ; GFX8-NEXT:    s_mov_b32 s33, s32
 ; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX8-NEXT:    buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
@@ -1455,14 +1455,14 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) {
 ; GFX8-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX8-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX8-NEXT:    s_mov_b32 s33, s6
+; GFX8-NEXT:    s_mov_b32 s33, s8
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: test_call:
 ; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s6, s33
+; GFX9-NEXT:    s_mov_b32 s8, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
@@ -1484,14 +1484,14 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) {
 ; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX9-NEXT:    s_mov_b32 s33, s6
+; GFX9-NEXT:    s_mov_b32 s33, s8
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call:
 ; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    s_mov_b32 s6, s33
+; GFX10-NEXT:    s_mov_b32 s7, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
 ; GFX10-NEXT:    buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
@@ -1515,7 +1515,7 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) {
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s4
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
-; GFX10-NEXT:    s_mov_b32 s33, s6
+; GFX10-NEXT:    s_mov_b32 s33, s7
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -1528,7 +1528,7 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) {
 ; GCN-LABEL: test_call_v2bf16:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s8, s33
+; GCN-NEXT:    s_mov_b32 s10, s33
 ; GCN-NEXT:    s_mov_b32 s33, s32
 ; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GCN-NEXT:    buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
@@ -1556,14 +1556,14 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) {
 ; GCN-NEXT:    buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    s_addk_i32 s32, 0xfc00
-; GCN-NEXT:    s_mov_b32 s33, s8
+; GCN-NEXT:    s_mov_b32 s33, s10
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: test_call_v2bf16:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b32 s8, s33
+; GFX7-NEXT:    s_mov_b32 s10, s33
 ; GFX7-NEXT:    s_mov_b32 s33, s32
 ; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX7-NEXT:    buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
@@ -1590,14 +1590,14 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX7-NEXT:    buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX7-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX7-NEXT:    s_mov_b32 s33, s8
+; GFX7-NEXT:    s_mov_b32 s33, s10
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: test_call_v2bf16:
 ; GFX8:       ; %bb.0: ; %entry
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_mov_b32 s6, s33
+; GFX8-NEXT:    s_mov_b32 s8, s33
 ; GFX8-NEXT:    s_mov_b32 s33, s32
 ; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX8-NEXT:    buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
@@ -1619,14 +1619,14 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX8-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX8-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX8-NEXT:    s_mov_b32 s33, s6
+; GFX8-NEXT:    s_mov_b32 s33, s8
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: test_call_v2bf16:
 ; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s6, s33
+; GFX9-NEXT:    s_mov_b32 s8, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
@@ -1648,14 +1648,14 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX9-NEXT:    s_mov_b32 s33, s6
+; GFX9-NEXT:    s_mov_b32 s33, s8
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_v2bf16:
 ; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    s_mov_b32 s6, s33
+; GFX10-NEXT:    s_mov_b32 s7, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
 ; GFX10-NEXT:    buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
@@ -1679,7 +1679,7 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s4
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
-; GFX10-NEXT:    s_mov_b32 s33, s6
+; GFX10-NEXT:    s_mov_b32 s33, s7
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -1692,7 +1692,7 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
 ; GCN-LABEL: test_call_v3bf16:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s8, s33
+; GCN-NEXT:    s_mov_b32 s10, s33
 ; GCN-NEXT:    s_mov_b32 s33, s32
 ; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GCN-NEXT:    buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill
@@ -1721,14 +1721,14 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
 ; GCN-NEXT:    buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    s_addk_i32 s32, 0xfc00
-; GCN-NEXT:    s_mov_b32 s33, s8
+; GCN-NEXT:    s_mov_b32 s33, s10
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: test_call_v3bf16:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b32 s8, s33
+; GFX7-NEXT:    s_mov_b32 s10, s33
 ; GFX7-NEXT:    s_mov_b32 s33, s32
 ; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX7-NEXT:    buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill
@@ -1756,14 +1756,14 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX7-NEXT:    buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX7-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX7-NEXT:    s_mov_b32 s33, s8
+; GFX7-NEXT:    s_mov_b32 s33, s10
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: test_call_v3bf16:
 ; GFX8:       ; %bb.0: ; %entry
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_mov_b32 s6, s33
+; GFX8-NEXT:    s_mov_b32 s8, s33
 ; GFX8-NEXT:    s_mov_b32 s33, s32
 ; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX8-NEXT:    buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
@@ -1789,14 +1789,14 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX8-NEXT:    buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX8-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX8-NEXT:    s_mov_b32 s33, s6
+; GFX8-NEXT:    s_mov_b32 s33, s8
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: test_call_v3bf16:
 ; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s6, s33
+; GFX9-NEXT:    s_mov_b32 s8, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
@@ -1824,14 +1824,14 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX9-NEXT:    buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX9-NEXT:    s_mov_b32 s33, s6
+; GFX9-NEXT:    s_mov_b32 s33, s8
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_v3bf16:
 ; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    s_mov_b32 s6, s33
+; GFX10-NEXT:    s_mov_b32 s7, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
 ; GFX10-NEXT:    buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
@@ -1860,7 +1860,7 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s4
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
-; GFX10-NEXT:    s_mov_b32 s33, s6
+; GFX10-NEXT:    s_mov_b32 s33, s7
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -1873,7 +1873,7 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) {
 ; GCN-LABEL: test_call_v4bf16:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s8, s33
+; GCN-NEXT:    s_mov_b32 s10, s33
 ; GCN-NEXT:    s_mov_b32 s33, s32
 ; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GCN-NEXT:    buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill
@@ -1909,14 +1909,14 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) {
 ; GCN-NEXT:    buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    s_addk_i32 s32, 0xfc00
-; GCN-NEXT:    s_mov_b32 s33, s8
+; GCN-NEXT:    s_mov_b32 s33, s10
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: test_call_v4bf16:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b32 s8, s33
+; GFX7-NEXT:    s_mov_b32 s10, s33
 ; GFX7-NEXT:    s_mov_b32 s33, s32
 ; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX7-NEXT:    buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill
@@ -1951,14 +1951,14 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX7-NEXT:    buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX7-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX7-NEXT:    s_mov_b32 s33, s8
+; GFX7-NEXT:    s_mov_b32 s33, s10
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: test_call_v4bf16:
 ; GFX8:       ; %bb.0: ; %entry
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_mov_b32 s6, s33
+; GFX8-NEXT:    s_mov_b32 s8, s33
 ; GFX8-NEXT:    s_mov_b32 s33, s32
 ; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX8-NEXT:    buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
@@ -1991,14 +1991,14 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX8-NEXT:    buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX8-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX8-NEXT:    s_mov_b32 s33, s6
+; GFX8-NEXT:    s_mov_b32 s33, s8
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: test_call_v4bf16:
 ; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s6, s33
+; GFX9-NEXT:    s_mov_b32 s8, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
@@ -2026,14 +2026,14 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX9-NEXT:    buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX9-NEXT:    s_mov_b32 s33, s6
+; GFX9-NEXT:    s_mov_b32 s33, s8
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_v4bf16:
 ; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    s_mov_b32 s6, s33
+; GFX10-NEXT:    s_mov_b32 s7, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
 ; GFX10-NEXT:    buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
@@ -2063,7 +2063,7 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s4
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
-; GFX10-NEXT:    s_mov_b32 s33, s6
+; GFX10-NEXT:    s_mov_b32 s33, s7
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -2076,7 +2076,7 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
 ; GCN-LABEL: test_call_v8bf16:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s8, s33
+; GCN-NEXT:    s_mov_b32 s10, s33
 ; GCN-NEXT:    s_mov_b32 s33, s32
 ; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GCN-NEXT:    buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill
@@ -2128,14 +2128,14 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
 ; GCN-NEXT:    buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    s_addk_i32 s32, 0xfc00
-; GCN-NEXT:    s_mov_b32 s33, s8
+; GCN-NEXT:    s_mov_b32 s33, s10
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: test_call_v8bf16:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b32 s8, s33
+; GFX7-NEXT:    s_mov_b32 s10, s33
 ; GFX7-NEXT:    s_mov_b32 s33, s32
 ; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX7-NEXT:    buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill
@@ -2186,14 +2186,14 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX7-NEXT:    buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX7-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX7-NEXT:    s_mov_b32 s33, s8
+; GFX7-NEXT:    s_mov_b32 s33, s10
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: test_call_v8bf16:
 ; GFX8:       ; %bb.0: ; %entry
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_mov_b32 s6, s33
+; GFX8-NEXT:    s_mov_b32 s8, s33
 ; GFX8-NEXT:    s_mov_b32 s33, s32
 ; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX8-NEXT:    buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill
@@ -2240,14 +2240,14 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX8-NEXT:    buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX8-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX8-NEXT:    s_mov_b32 s33, s6
+; GFX8-NEXT:    s_mov_b32 s33, s8
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: test_call_v8bf16:
 ; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s6, s33
+; GFX9-NEXT:    s_mov_b32 s8, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill
@@ -2283,14 +2283,14 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX9-NEXT:    buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX9-NEXT:    s_mov_b32 s33, s6
+; GFX9-NEXT:    s_mov_b32 s33, s8
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_v8bf16:
 ; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    s_mov_b32 s6, s33
+; GFX10-NEXT:    s_mov_b32 s7, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
 ; GFX10-NEXT:    buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill
@@ -2328,7 +2328,7 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s4
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
-; GFX10-NEXT:    s_mov_b32 s33, s6
+; GFX10-NEXT:    s_mov_b32 s33, s7
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -2341,7 +2341,7 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
 ; GCN-LABEL: test_call_v16bf16:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s8, s33
+; GCN-NEXT:    s_mov_b32 s10, s33
 ; GCN-NEXT:    s_mov_b32 s33, s32
 ; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GCN-NEXT:    buffer_store_dword v17, off, s[0:3], s33 ; 4-byte Folded Spill
@@ -2425,14 +2425,14 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
 ; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    s_addk_i32 s32, 0xfc00
-; GCN-NEXT:    s_mov_b32 s33, s8
+; GCN-NEXT:    s_mov_b32 s33, s10
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: test_call_v16bf16:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b32 s8, s33
+; GFX7-NEXT:    s_mov_b32 s10, s33
 ; GFX7-NEXT:    s_mov_b32 s33, s32
 ; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX7-NEXT:    buffer_store_dword v17, off, s[0:3], s33 ; 4-byte Folded Spill
@@ -2515,14 +2515,14 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX7-NEXT:    buffer_load_dword v17, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX7-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX7-NEXT:    s_mov_b32 s33, s8
+; GFX7-NEXT:    s_mov_b32 s33, s10
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: test_call_v16bf16:
 ; GFX8:       ; %bb.0: ; %entry
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_mov_b32 s6, s33
+; GFX8-NEXT:    s_mov_b32 s8, s33
 ; GFX8-NEXT:    s_mov_b32 s33, s32
 ; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX8-NEXT:    buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill
@@ -2597,14 +2597,14 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX8-NEXT:    buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX8-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX8-NEXT:    s_mov_b32 s33, s6
+; GFX8-NEXT:    s_mov_b32 s33, s8
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: test_call_v16bf16:
 ; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s6, s33
+; GFX9-NEXT:    s_mov_b32 s8, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill
@@ -2656,14 +2656,14 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX9-NEXT:    buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX9-NEXT:    s_mov_b32 s33, s6
+; GFX9-NEXT:    s_mov_b32 s33, s8
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_v16bf16:
 ; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    s_mov_b32 s6, s33
+; GFX10-NEXT:    s_mov_b32 s7, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
 ; GFX10-NEXT:    buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill
@@ -2717,7 +2717,7 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s4
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
-; GFX10-NEXT:    s_mov_b32 s33, s6
+; GFX10-NEXT:    s_mov_b32 s33, s7
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 entry:

diff  --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll
index 6ff2dbabc8ecd5..4d55d4974be74c 100644
--- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll
@@ -8,12 +8,12 @@
 @alias = hidden alias void (), ptr @aliasee_default
 
 ; ALL-LABEL: {{^}}kernel:
-; GFX908: .amdhsa_next_free_vgpr 32
-; GFX908-NEXT: .amdhsa_next_free_sgpr 36
+; GFX908: .amdhsa_next_free_vgpr 41
+; GFX908-NEXT: .amdhsa_next_free_sgpr 33
 
-; GFX90A: .amdhsa_next_free_vgpr 65
-; GFX90A-NEXT: .amdhsa_next_free_sgpr 36
-; GFX90A-NEXT: .amdhsa_accum_offset 32
+; GFX90A: .amdhsa_next_free_vgpr 71
+; GFX90A-NEXT: .amdhsa_next_free_sgpr 33
+; GFX90A-NEXT: .amdhsa_accum_offset 44
 define amdgpu_kernel void @kernel() #0 {
 bb:
   call void @alias() #2

diff  --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage0.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage0.ll
index 797376535cde06..f3974844e0be9a 100644
--- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage0.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage0.ll
@@ -8,7 +8,7 @@
 
 ; CHECK-LABEL: {{^}}kernel0:
 ; CHECK: .amdhsa_next_free_vgpr 53
-; CHECK-NEXT: .amdhsa_next_free_sgpr 36
+; CHECK-NEXT: .amdhsa_next_free_sgpr 33
 define amdgpu_kernel void @kernel0() #0 {
 bb:
   call void @alias0() #2

diff  --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll
index 79bb2fb64117c9..5672cbb3f94f56 100644
--- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll
@@ -9,8 +9,8 @@
 ; The parent kernel has a higher VGPR usage than the possible callees.
 
 ; CHECK-LABEL: {{^}}kernel1:
-; CHECK: .amdhsa_next_free_vgpr 41
-; CHECK-NEXT: .amdhsa_next_free_sgpr 36
+; CHECK: .amdhsa_next_free_vgpr 42
+; CHECK-NEXT: .amdhsa_next_free_sgpr 33
 define amdgpu_kernel void @kernel1() #0 {
 bb:
   call void asm sideeffect "; clobber v40 ", "~{v40}"()

diff  --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage2.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage2.ll
index 5745dd9fdb67c3..6800cc42eb59cd 100644
--- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage2.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage2.ll
@@ -8,7 +8,7 @@
 
 ; CHECK-LABEL: {{^}}kernel2:
 ; CHECK: .amdhsa_next_free_vgpr 53
-; CHECK-NEXT: .amdhsa_next_free_sgpr 36
+; CHECK-NEXT: .amdhsa_next_free_sgpr 33
 define amdgpu_kernel void @kernel2() #0 {
 bb:
   call void @alias2() #2

diff  --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage3.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage3.ll
index b922297c493fab..deed0052a9a35f 100644
--- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage3.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage3.ll
@@ -8,7 +8,7 @@
 
 ; CHECK-LABEL: {{^}}kernel3:
 ; CHECK: .amdhsa_next_free_vgpr 253
-; CHECK-NEXT: .amdhsa_next_free_sgpr 36
+; CHECK-NEXT: .amdhsa_next_free_sgpr 33
 define amdgpu_kernel void @kernel3() #0 {
 bb:
   call void @alias3() #2

diff  --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
index 43d3c58aeb1d07..529a6dce1432ad 100644
--- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
@@ -5823,6 +5823,7 @@ define void @stack_12xv3i32() #0 {
 ; VI-NEXT:    s_mov_b32 s33, s32
 ; VI-NEXT:    s_or_saveexec_b64 s[8:9], -1
 ; VI-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[8:9]
 ; VI-NEXT:    s_addk_i32 s32, 0x400
 ; VI-NEXT:    v_mov_b32_e32 v0, 11
@@ -5832,7 +5833,6 @@ define void @stack_12xv3i32() #0 {
 ; VI-NEXT:    v_mov_b32_e32 v0, 13
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:8
 ; VI-NEXT:    v_mov_b32_e32 v0, 14
-; VI-NEXT:    v_writelane_b32 v40, s4, 2
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:12
 ; VI-NEXT:    v_mov_b32_e32 v0, 15
 ; VI-NEXT:    v_writelane_b32 v40, s30, 0
@@ -5868,6 +5868,7 @@ define void @stack_12xv3i32() #0 {
 ; VI-NEXT:    v_mov_b32_e32 v28, 9
 ; VI-NEXT:    v_mov_b32_e32 v29, 9
 ; VI-NEXT:    v_mov_b32_e32 v30, 10
+; VI-NEXT:    v_writelane_b32 v41, s4, 0
 ; VI-NEXT:    v_writelane_b32 v40, s31, 1
 ; VI-NEXT:    s_getpc_b64 s[4:5]
 ; VI-NEXT:    s_add_u32 s4, s4, external_void_func_12xv3i32 at rel32@lo+4
@@ -5875,9 +5876,10 @@ define void @stack_12xv3i32() #0 {
 ; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; VI-NEXT:    v_readlane_b32 s31, v40, 1
 ; VI-NEXT:    v_readlane_b32 s30, v40, 0
-; VI-NEXT:    v_readlane_b32 s4, v40, 2
+; VI-NEXT:    v_readlane_b32 s4, v41, 0
 ; VI-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; VI-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[6:7]
 ; VI-NEXT:    s_addk_i32 s32, 0xfc00
 ; VI-NEXT:    s_mov_b32 s33, s4
@@ -5891,6 +5893,7 @@ define void @stack_12xv3i32() #0 {
 ; CI-NEXT:    s_mov_b32 s33, s32
 ; CI-NEXT:    s_or_saveexec_b64 s[8:9], -1
 ; CI-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; CI-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; CI-NEXT:    s_mov_b64 exec, s[8:9]
 ; CI-NEXT:    s_addk_i32 s32, 0x400
 ; CI-NEXT:    v_mov_b32_e32 v0, 11
@@ -5900,7 +5903,6 @@ define void @stack_12xv3i32() #0 {
 ; CI-NEXT:    v_mov_b32_e32 v0, 13
 ; CI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:8
 ; CI-NEXT:    v_mov_b32_e32 v0, 14
-; CI-NEXT:    v_writelane_b32 v40, s4, 2
 ; CI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:12
 ; CI-NEXT:    v_mov_b32_e32 v0, 15
 ; CI-NEXT:    v_writelane_b32 v40, s30, 0
@@ -5936,6 +5938,7 @@ define void @stack_12xv3i32() #0 {
 ; CI-NEXT:    v_mov_b32_e32 v28, 9
 ; CI-NEXT:    v_mov_b32_e32 v29, 9
 ; CI-NEXT:    v_mov_b32_e32 v30, 10
+; CI-NEXT:    v_writelane_b32 v41, s4, 0
 ; CI-NEXT:    v_writelane_b32 v40, s31, 1
 ; CI-NEXT:    s_getpc_b64 s[4:5]
 ; CI-NEXT:    s_add_u32 s4, s4, external_void_func_12xv3i32 at rel32@lo+4
@@ -5943,9 +5946,10 @@ define void @stack_12xv3i32() #0 {
 ; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; CI-NEXT:    v_readlane_b32 s31, v40, 1
 ; CI-NEXT:    v_readlane_b32 s30, v40, 0
-; CI-NEXT:    v_readlane_b32 s4, v40, 2
+; CI-NEXT:    v_readlane_b32 s4, v41, 0
 ; CI-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; CI-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; CI-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; CI-NEXT:    s_mov_b64 exec, s[6:7]
 ; CI-NEXT:    s_addk_i32 s32, 0xfc00
 ; CI-NEXT:    s_mov_b32 s33, s4
@@ -5959,6 +5963,7 @@ define void @stack_12xv3i32() #0 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[8:9], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[8:9]
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 11
@@ -5968,7 +5973,6 @@ define void @stack_12xv3i32() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 13
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:8
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 14
-; GFX9-NEXT:    v_writelane_b32 v40, s4, 2
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:12
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 15
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
@@ -6004,6 +6008,7 @@ define void @stack_12xv3i32() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v28, 9
 ; GFX9-NEXT:    v_mov_b32_e32 v29, 9
 ; GFX9-NEXT:    v_mov_b32_e32 v30, 10
+; GFX9-NEXT:    v_writelane_b32 v41, s4, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_12xv3i32 at rel32@lo+4
@@ -6011,9 +6016,10 @@ define void @stack_12xv3i32() #0 {
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s4, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s4
@@ -6026,15 +6032,17 @@ define void @stack_12xv3i32() #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 11 :: v_dual_mov_b32 v1, 12
 ; GFX11-NEXT:    v_dual_mov_b32 v2, 13 :: v_dual_mov_b32 v3, 14
 ; GFX11-NEXT:    v_mov_b32_e32 v4, 15
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX11-NEXT:    s_add_i32 s0, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32
 ; GFX11-NEXT:    scratch_store_b32 off, v4, s0
 ; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, 0
@@ -6061,9 +6069,11 @@ define void @stack_12xv3i32() #0 {
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -6077,6 +6087,7 @@ define void @stack_12xv3i32() #0 {
 ; HSA-NEXT:    s_mov_b32 s33, s32
 ; HSA-NEXT:    s_or_saveexec_b64 s[8:9], -1
 ; HSA-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; HSA-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; HSA-NEXT:    s_mov_b64 exec, s[8:9]
 ; HSA-NEXT:    s_addk_i32 s32, 0x400
 ; HSA-NEXT:    v_mov_b32_e32 v0, 11
@@ -6086,7 +6097,6 @@ define void @stack_12xv3i32() #0 {
 ; HSA-NEXT:    v_mov_b32_e32 v0, 13
 ; HSA-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:8
 ; HSA-NEXT:    v_mov_b32_e32 v0, 14
-; HSA-NEXT:    v_writelane_b32 v40, s4, 2
 ; HSA-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:12
 ; HSA-NEXT:    v_mov_b32_e32 v0, 15
 ; HSA-NEXT:    v_writelane_b32 v40, s30, 0
@@ -6122,6 +6132,7 @@ define void @stack_12xv3i32() #0 {
 ; HSA-NEXT:    v_mov_b32_e32 v28, 9
 ; HSA-NEXT:    v_mov_b32_e32 v29, 9
 ; HSA-NEXT:    v_mov_b32_e32 v30, 10
+; HSA-NEXT:    v_writelane_b32 v41, s4, 0
 ; HSA-NEXT:    v_writelane_b32 v40, s31, 1
 ; HSA-NEXT:    s_getpc_b64 s[4:5]
 ; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_12xv3i32 at rel32@lo+4
@@ -6129,9 +6140,10 @@ define void @stack_12xv3i32() #0 {
 ; HSA-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; HSA-NEXT:    v_readlane_b32 s31, v40, 1
 ; HSA-NEXT:    v_readlane_b32 s30, v40, 0
-; HSA-NEXT:    v_readlane_b32 s4, v40, 2
+; HSA-NEXT:    v_readlane_b32 s4, v41, 0
 ; HSA-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; HSA-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; HSA-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; HSA-NEXT:    s_mov_b64 exec, s[6:7]
 ; HSA-NEXT:    s_addk_i32 s32, 0xfc00
 ; HSA-NEXT:    s_mov_b32 s33, s4
@@ -6162,6 +6174,7 @@ define void @stack_12xv3f32() #0 {
 ; VI-NEXT:    s_mov_b32 s33, s32
 ; VI-NEXT:    s_or_saveexec_b64 s[8:9], -1
 ; VI-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[8:9]
 ; VI-NEXT:    s_addk_i32 s32, 0x400
 ; VI-NEXT:    v_mov_b32_e32 v0, 0x41300000
@@ -6171,7 +6184,6 @@ define void @stack_12xv3f32() #0 {
 ; VI-NEXT:    v_mov_b32_e32 v0, 0x41500000
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:8
 ; VI-NEXT:    v_mov_b32_e32 v0, 0x41600000
-; VI-NEXT:    v_writelane_b32 v40, s4, 2
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:12
 ; VI-NEXT:    v_mov_b32_e32 v0, 0x41700000
 ; VI-NEXT:    v_writelane_b32 v40, s30, 0
@@ -6207,6 +6219,7 @@ define void @stack_12xv3f32() #0 {
 ; VI-NEXT:    v_mov_b32_e32 v28, 0x41100000
 ; VI-NEXT:    v_mov_b32_e32 v29, 0x41100000
 ; VI-NEXT:    v_mov_b32_e32 v30, 0x41200000
+; VI-NEXT:    v_writelane_b32 v41, s4, 0
 ; VI-NEXT:    v_writelane_b32 v40, s31, 1
 ; VI-NEXT:    s_getpc_b64 s[4:5]
 ; VI-NEXT:    s_add_u32 s4, s4, external_void_func_12xv3f32 at rel32@lo+4
@@ -6214,9 +6227,10 @@ define void @stack_12xv3f32() #0 {
 ; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; VI-NEXT:    v_readlane_b32 s31, v40, 1
 ; VI-NEXT:    v_readlane_b32 s30, v40, 0
-; VI-NEXT:    v_readlane_b32 s4, v40, 2
+; VI-NEXT:    v_readlane_b32 s4, v41, 0
 ; VI-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; VI-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[6:7]
 ; VI-NEXT:    s_addk_i32 s32, 0xfc00
 ; VI-NEXT:    s_mov_b32 s33, s4
@@ -6230,6 +6244,7 @@ define void @stack_12xv3f32() #0 {
 ; CI-NEXT:    s_mov_b32 s33, s32
 ; CI-NEXT:    s_or_saveexec_b64 s[8:9], -1
 ; CI-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; CI-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; CI-NEXT:    s_mov_b64 exec, s[8:9]
 ; CI-NEXT:    s_addk_i32 s32, 0x400
 ; CI-NEXT:    v_mov_b32_e32 v0, 0x41300000
@@ -6239,7 +6254,6 @@ define void @stack_12xv3f32() #0 {
 ; CI-NEXT:    v_mov_b32_e32 v0, 0x41500000
 ; CI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:8
 ; CI-NEXT:    v_mov_b32_e32 v0, 0x41600000
-; CI-NEXT:    v_writelane_b32 v40, s4, 2
 ; CI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:12
 ; CI-NEXT:    v_mov_b32_e32 v0, 0x41700000
 ; CI-NEXT:    v_writelane_b32 v40, s30, 0
@@ -6275,6 +6289,7 @@ define void @stack_12xv3f32() #0 {
 ; CI-NEXT:    v_mov_b32_e32 v28, 0x41100000
 ; CI-NEXT:    v_mov_b32_e32 v29, 0x41100000
 ; CI-NEXT:    v_mov_b32_e32 v30, 0x41200000
+; CI-NEXT:    v_writelane_b32 v41, s4, 0
 ; CI-NEXT:    v_writelane_b32 v40, s31, 1
 ; CI-NEXT:    s_getpc_b64 s[4:5]
 ; CI-NEXT:    s_add_u32 s4, s4, external_void_func_12xv3f32 at rel32@lo+4
@@ -6282,9 +6297,10 @@ define void @stack_12xv3f32() #0 {
 ; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; CI-NEXT:    v_readlane_b32 s31, v40, 1
 ; CI-NEXT:    v_readlane_b32 s30, v40, 0
-; CI-NEXT:    v_readlane_b32 s4, v40, 2
+; CI-NEXT:    v_readlane_b32 s4, v41, 0
 ; CI-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; CI-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; CI-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; CI-NEXT:    s_mov_b64 exec, s[6:7]
 ; CI-NEXT:    s_addk_i32 s32, 0xfc00
 ; CI-NEXT:    s_mov_b32 s33, s4
@@ -6298,6 +6314,7 @@ define void @stack_12xv3f32() #0 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[8:9], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[8:9]
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x41300000
@@ -6307,7 +6324,6 @@ define void @stack_12xv3f32() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x41500000
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:8
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x41600000
-; GFX9-NEXT:    v_writelane_b32 v40, s4, 2
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:12
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x41700000
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
@@ -6343,6 +6359,7 @@ define void @stack_12xv3f32() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v28, 0x41100000
 ; GFX9-NEXT:    v_mov_b32_e32 v29, 0x41100000
 ; GFX9-NEXT:    v_mov_b32_e32 v30, 0x41200000
+; GFX9-NEXT:    v_writelane_b32 v41, s4, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_12xv3f32 at rel32@lo+4
@@ -6350,9 +6367,10 @@ define void @stack_12xv3f32() #0 {
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s4, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s4
@@ -6365,17 +6383,19 @@ define void @stack_12xv3f32() #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0x41300000
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0x41400000
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0x41500000
 ; GFX11-NEXT:    v_mov_b32_e32 v3, 0x41600000
 ; GFX11-NEXT:    v_dual_mov_b32 v4, 0x41700000 :: v_dual_mov_b32 v5, 1.0
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX11-NEXT:    s_add_i32 s0, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32
 ; GFX11-NEXT:    scratch_store_b32 off, v4, s0
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
@@ -6404,9 +6424,11 @@ define void @stack_12xv3f32() #0 {
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -6420,6 +6442,7 @@ define void @stack_12xv3f32() #0 {
 ; HSA-NEXT:    s_mov_b32 s33, s32
 ; HSA-NEXT:    s_or_saveexec_b64 s[8:9], -1
 ; HSA-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; HSA-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; HSA-NEXT:    s_mov_b64 exec, s[8:9]
 ; HSA-NEXT:    s_addk_i32 s32, 0x400
 ; HSA-NEXT:    v_mov_b32_e32 v0, 0x41300000
@@ -6429,7 +6452,6 @@ define void @stack_12xv3f32() #0 {
 ; HSA-NEXT:    v_mov_b32_e32 v0, 0x41500000
 ; HSA-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:8
 ; HSA-NEXT:    v_mov_b32_e32 v0, 0x41600000
-; HSA-NEXT:    v_writelane_b32 v40, s4, 2
 ; HSA-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:12
 ; HSA-NEXT:    v_mov_b32_e32 v0, 0x41700000
 ; HSA-NEXT:    v_writelane_b32 v40, s30, 0
@@ -6465,6 +6487,7 @@ define void @stack_12xv3f32() #0 {
 ; HSA-NEXT:    v_mov_b32_e32 v28, 0x41100000
 ; HSA-NEXT:    v_mov_b32_e32 v29, 0x41100000
 ; HSA-NEXT:    v_mov_b32_e32 v30, 0x41200000
+; HSA-NEXT:    v_writelane_b32 v41, s4, 0
 ; HSA-NEXT:    v_writelane_b32 v40, s31, 1
 ; HSA-NEXT:    s_getpc_b64 s[4:5]
 ; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_12xv3f32 at rel32@lo+4
@@ -6472,9 +6495,10 @@ define void @stack_12xv3f32() #0 {
 ; HSA-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; HSA-NEXT:    v_readlane_b32 s31, v40, 1
 ; HSA-NEXT:    v_readlane_b32 s30, v40, 0
-; HSA-NEXT:    v_readlane_b32 s4, v40, 2
+; HSA-NEXT:    v_readlane_b32 s4, v41, 0
 ; HSA-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; HSA-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; HSA-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; HSA-NEXT:    s_mov_b64 exec, s[6:7]
 ; HSA-NEXT:    s_addk_i32 s32, 0xfc00
 ; HSA-NEXT:    s_mov_b32 s33, s4
@@ -6505,6 +6529,7 @@ define void @stack_8xv5i32() #0 {
 ; VI-NEXT:    s_mov_b32 s33, s32
 ; VI-NEXT:    s_or_saveexec_b64 s[8:9], -1
 ; VI-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[8:9]
 ; VI-NEXT:    s_addk_i32 s32, 0x400
 ; VI-NEXT:    v_mov_b32_e32 v0, 7
@@ -6522,7 +6547,6 @@ define void @stack_8xv5i32() #0 {
 ; VI-NEXT:    v_mov_b32_e32 v0, 13
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:24
 ; VI-NEXT:    v_mov_b32_e32 v0, 14
-; VI-NEXT:    v_writelane_b32 v40, s4, 2
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:28
 ; VI-NEXT:    v_mov_b32_e32 v0, 15
 ; VI-NEXT:    v_writelane_b32 v40, s30, 0
@@ -6558,6 +6582,7 @@ define void @stack_8xv5i32() #0 {
 ; VI-NEXT:    v_mov_b32_e32 v28, 5
 ; VI-NEXT:    v_mov_b32_e32 v29, 5
 ; VI-NEXT:    v_mov_b32_e32 v30, 6
+; VI-NEXT:    v_writelane_b32 v41, s4, 0
 ; VI-NEXT:    v_writelane_b32 v40, s31, 1
 ; VI-NEXT:    s_getpc_b64 s[4:5]
 ; VI-NEXT:    s_add_u32 s4, s4, external_void_func_8xv5i32 at rel32@lo+4
@@ -6565,9 +6590,10 @@ define void @stack_8xv5i32() #0 {
 ; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; VI-NEXT:    v_readlane_b32 s31, v40, 1
 ; VI-NEXT:    v_readlane_b32 s30, v40, 0
-; VI-NEXT:    v_readlane_b32 s4, v40, 2
+; VI-NEXT:    v_readlane_b32 s4, v41, 0
 ; VI-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; VI-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[6:7]
 ; VI-NEXT:    s_addk_i32 s32, 0xfc00
 ; VI-NEXT:    s_mov_b32 s33, s4
@@ -6581,6 +6607,7 @@ define void @stack_8xv5i32() #0 {
 ; CI-NEXT:    s_mov_b32 s33, s32
 ; CI-NEXT:    s_or_saveexec_b64 s[8:9], -1
 ; CI-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; CI-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; CI-NEXT:    s_mov_b64 exec, s[8:9]
 ; CI-NEXT:    s_addk_i32 s32, 0x400
 ; CI-NEXT:    v_mov_b32_e32 v0, 7
@@ -6598,7 +6625,6 @@ define void @stack_8xv5i32() #0 {
 ; CI-NEXT:    v_mov_b32_e32 v0, 13
 ; CI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:24
 ; CI-NEXT:    v_mov_b32_e32 v0, 14
-; CI-NEXT:    v_writelane_b32 v40, s4, 2
 ; CI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:28
 ; CI-NEXT:    v_mov_b32_e32 v0, 15
 ; CI-NEXT:    v_writelane_b32 v40, s30, 0
@@ -6634,6 +6660,7 @@ define void @stack_8xv5i32() #0 {
 ; CI-NEXT:    v_mov_b32_e32 v28, 5
 ; CI-NEXT:    v_mov_b32_e32 v29, 5
 ; CI-NEXT:    v_mov_b32_e32 v30, 6
+; CI-NEXT:    v_writelane_b32 v41, s4, 0
 ; CI-NEXT:    v_writelane_b32 v40, s31, 1
 ; CI-NEXT:    s_getpc_b64 s[4:5]
 ; CI-NEXT:    s_add_u32 s4, s4, external_void_func_8xv5i32 at rel32@lo+4
@@ -6641,9 +6668,10 @@ define void @stack_8xv5i32() #0 {
 ; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; CI-NEXT:    v_readlane_b32 s31, v40, 1
 ; CI-NEXT:    v_readlane_b32 s30, v40, 0
-; CI-NEXT:    v_readlane_b32 s4, v40, 2
+; CI-NEXT:    v_readlane_b32 s4, v41, 0
 ; CI-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; CI-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; CI-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; CI-NEXT:    s_mov_b64 exec, s[6:7]
 ; CI-NEXT:    s_addk_i32 s32, 0xfc00
 ; CI-NEXT:    s_mov_b32 s33, s4
@@ -6657,6 +6685,7 @@ define void @stack_8xv5i32() #0 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[8:9], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[8:9]
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 7
@@ -6674,7 +6703,6 @@ define void @stack_8xv5i32() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 13
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 14
-; GFX9-NEXT:    v_writelane_b32 v40, s4, 2
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:28
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 15
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
@@ -6710,6 +6738,7 @@ define void @stack_8xv5i32() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v28, 5
 ; GFX9-NEXT:    v_mov_b32_e32 v29, 5
 ; GFX9-NEXT:    v_mov_b32_e32 v30, 6
+; GFX9-NEXT:    v_writelane_b32 v41, s4, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_8xv5i32 at rel32@lo+4
@@ -6717,9 +6746,10 @@ define void @stack_8xv5i32() #0 {
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s4, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s4
@@ -6732,15 +6762,17 @@ define void @stack_8xv5i32() #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 7 :: v_dual_mov_b32 v1, 8
 ; GFX11-NEXT:    v_dual_mov_b32 v2, 9 :: v_dual_mov_b32 v3, 10
 ; GFX11-NEXT:    v_dual_mov_b32 v8, 15 :: v_dual_mov_b32 v5, 12
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
 ; GFX11-NEXT:    v_dual_mov_b32 v4, 11 :: v_dual_mov_b32 v7, 14
 ; GFX11-NEXT:    v_mov_b32_e32 v6, 13
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX11-NEXT:    s_add_i32 s0, s32, 32
 ; GFX11-NEXT:    s_add_i32 s1, s32, 16
 ; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
@@ -6772,9 +6804,11 @@ define void @stack_8xv5i32() #0 {
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -6788,6 +6822,7 @@ define void @stack_8xv5i32() #0 {
 ; HSA-NEXT:    s_mov_b32 s33, s32
 ; HSA-NEXT:    s_or_saveexec_b64 s[8:9], -1
 ; HSA-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; HSA-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; HSA-NEXT:    s_mov_b64 exec, s[8:9]
 ; HSA-NEXT:    s_addk_i32 s32, 0x400
 ; HSA-NEXT:    v_mov_b32_e32 v0, 7
@@ -6805,7 +6840,6 @@ define void @stack_8xv5i32() #0 {
 ; HSA-NEXT:    v_mov_b32_e32 v0, 13
 ; HSA-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:24
 ; HSA-NEXT:    v_mov_b32_e32 v0, 14
-; HSA-NEXT:    v_writelane_b32 v40, s4, 2
 ; HSA-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:28
 ; HSA-NEXT:    v_mov_b32_e32 v0, 15
 ; HSA-NEXT:    v_writelane_b32 v40, s30, 0
@@ -6841,6 +6875,7 @@ define void @stack_8xv5i32() #0 {
 ; HSA-NEXT:    v_mov_b32_e32 v28, 5
 ; HSA-NEXT:    v_mov_b32_e32 v29, 5
 ; HSA-NEXT:    v_mov_b32_e32 v30, 6
+; HSA-NEXT:    v_writelane_b32 v41, s4, 0
 ; HSA-NEXT:    v_writelane_b32 v40, s31, 1
 ; HSA-NEXT:    s_getpc_b64 s[4:5]
 ; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_8xv5i32 at rel32@lo+4
@@ -6848,9 +6883,10 @@ define void @stack_8xv5i32() #0 {
 ; HSA-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; HSA-NEXT:    v_readlane_b32 s31, v40, 1
 ; HSA-NEXT:    v_readlane_b32 s30, v40, 0
-; HSA-NEXT:    v_readlane_b32 s4, v40, 2
+; HSA-NEXT:    v_readlane_b32 s4, v41, 0
 ; HSA-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; HSA-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; HSA-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; HSA-NEXT:    s_mov_b64 exec, s[6:7]
 ; HSA-NEXT:    s_addk_i32 s32, 0xfc00
 ; HSA-NEXT:    s_mov_b32 s33, s4
@@ -6877,6 +6913,7 @@ define void @stack_8xv5f32() #0 {
 ; VI-NEXT:    s_mov_b32 s33, s32
 ; VI-NEXT:    s_or_saveexec_b64 s[8:9], -1
 ; VI-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[8:9]
 ; VI-NEXT:    s_addk_i32 s32, 0x400
 ; VI-NEXT:    v_mov_b32_e32 v0, 0x40e00000
@@ -6894,7 +6931,6 @@ define void @stack_8xv5f32() #0 {
 ; VI-NEXT:    v_mov_b32_e32 v0, 0x41500000
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:24
 ; VI-NEXT:    v_mov_b32_e32 v0, 0x41600000
-; VI-NEXT:    v_writelane_b32 v40, s4, 2
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:28
 ; VI-NEXT:    v_mov_b32_e32 v0, 0x41700000
 ; VI-NEXT:    v_writelane_b32 v40, s30, 0
@@ -6930,6 +6966,7 @@ define void @stack_8xv5f32() #0 {
 ; VI-NEXT:    v_mov_b32_e32 v28, 0x40a00000
 ; VI-NEXT:    v_mov_b32_e32 v29, 0x40a00000
 ; VI-NEXT:    v_mov_b32_e32 v30, 0x40c00000
+; VI-NEXT:    v_writelane_b32 v41, s4, 0
 ; VI-NEXT:    v_writelane_b32 v40, s31, 1
 ; VI-NEXT:    s_getpc_b64 s[4:5]
 ; VI-NEXT:    s_add_u32 s4, s4, external_void_func_8xv5f32 at rel32@lo+4
@@ -6937,9 +6974,10 @@ define void @stack_8xv5f32() #0 {
 ; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; VI-NEXT:    v_readlane_b32 s31, v40, 1
 ; VI-NEXT:    v_readlane_b32 s30, v40, 0
-; VI-NEXT:    v_readlane_b32 s4, v40, 2
+; VI-NEXT:    v_readlane_b32 s4, v41, 0
 ; VI-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; VI-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[6:7]
 ; VI-NEXT:    s_addk_i32 s32, 0xfc00
 ; VI-NEXT:    s_mov_b32 s33, s4
@@ -6953,6 +6991,7 @@ define void @stack_8xv5f32() #0 {
 ; CI-NEXT:    s_mov_b32 s33, s32
 ; CI-NEXT:    s_or_saveexec_b64 s[8:9], -1
 ; CI-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; CI-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; CI-NEXT:    s_mov_b64 exec, s[8:9]
 ; CI-NEXT:    s_addk_i32 s32, 0x400
 ; CI-NEXT:    v_mov_b32_e32 v0, 0x40e00000
@@ -6970,7 +7009,6 @@ define void @stack_8xv5f32() #0 {
 ; CI-NEXT:    v_mov_b32_e32 v0, 0x41500000
 ; CI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:24
 ; CI-NEXT:    v_mov_b32_e32 v0, 0x41600000
-; CI-NEXT:    v_writelane_b32 v40, s4, 2
 ; CI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:28
 ; CI-NEXT:    v_mov_b32_e32 v0, 0x41700000
 ; CI-NEXT:    v_writelane_b32 v40, s30, 0
@@ -7006,6 +7044,7 @@ define void @stack_8xv5f32() #0 {
 ; CI-NEXT:    v_mov_b32_e32 v28, 0x40a00000
 ; CI-NEXT:    v_mov_b32_e32 v29, 0x40a00000
 ; CI-NEXT:    v_mov_b32_e32 v30, 0x40c00000
+; CI-NEXT:    v_writelane_b32 v41, s4, 0
 ; CI-NEXT:    v_writelane_b32 v40, s31, 1
 ; CI-NEXT:    s_getpc_b64 s[4:5]
 ; CI-NEXT:    s_add_u32 s4, s4, external_void_func_8xv5f32 at rel32@lo+4
@@ -7013,9 +7052,10 @@ define void @stack_8xv5f32() #0 {
 ; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; CI-NEXT:    v_readlane_b32 s31, v40, 1
 ; CI-NEXT:    v_readlane_b32 s30, v40, 0
-; CI-NEXT:    v_readlane_b32 s4, v40, 2
+; CI-NEXT:    v_readlane_b32 s4, v41, 0
 ; CI-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; CI-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; CI-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; CI-NEXT:    s_mov_b64 exec, s[6:7]
 ; CI-NEXT:    s_addk_i32 s32, 0xfc00
 ; CI-NEXT:    s_mov_b32 s33, s4
@@ -7029,6 +7069,7 @@ define void @stack_8xv5f32() #0 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[8:9], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[8:9]
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x40e00000
@@ -7046,7 +7087,6 @@ define void @stack_8xv5f32() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x41500000
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x41600000
-; GFX9-NEXT:    v_writelane_b32 v40, s4, 2
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:28
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x41700000
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
@@ -7082,6 +7122,7 @@ define void @stack_8xv5f32() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v28, 0x40a00000
 ; GFX9-NEXT:    v_mov_b32_e32 v29, 0x40a00000
 ; GFX9-NEXT:    v_mov_b32_e32 v30, 0x40c00000
+; GFX9-NEXT:    v_writelane_b32 v41, s4, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_8xv5f32 at rel32@lo+4
@@ -7089,9 +7130,10 @@ define void @stack_8xv5f32() #0 {
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s4, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s4
@@ -7104,9 +7146,10 @@ define void @stack_8xv5f32() #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0x40e00000
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0x41000000
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0x41100000
@@ -7117,6 +7160,7 @@ define void @stack_8xv5f32() #0 {
 ; GFX11-NEXT:    v_mov_b32_e32 v5, 0x41400000
 ; GFX11-NEXT:    v_dual_mov_b32 v6, 0x41500000 :: v_dual_mov_b32 v9, 1.0
 ; GFX11-NEXT:    v_mov_b32_e32 v7, 0x41600000
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX11-NEXT:    s_add_i32 s0, s32, 32
 ; GFX11-NEXT:    s_add_i32 s1, s32, 16
 ; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
@@ -7147,9 +7191,11 @@ define void @stack_8xv5f32() #0 {
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -7163,6 +7209,7 @@ define void @stack_8xv5f32() #0 {
 ; HSA-NEXT:    s_mov_b32 s33, s32
 ; HSA-NEXT:    s_or_saveexec_b64 s[8:9], -1
 ; HSA-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; HSA-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; HSA-NEXT:    s_mov_b64 exec, s[8:9]
 ; HSA-NEXT:    s_addk_i32 s32, 0x400
 ; HSA-NEXT:    v_mov_b32_e32 v0, 0x40e00000
@@ -7180,7 +7227,6 @@ define void @stack_8xv5f32() #0 {
 ; HSA-NEXT:    v_mov_b32_e32 v0, 0x41500000
 ; HSA-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:24
 ; HSA-NEXT:    v_mov_b32_e32 v0, 0x41600000
-; HSA-NEXT:    v_writelane_b32 v40, s4, 2
 ; HSA-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:28
 ; HSA-NEXT:    v_mov_b32_e32 v0, 0x41700000
 ; HSA-NEXT:    v_writelane_b32 v40, s30, 0
@@ -7216,6 +7262,7 @@ define void @stack_8xv5f32() #0 {
 ; HSA-NEXT:    v_mov_b32_e32 v28, 0x40a00000
 ; HSA-NEXT:    v_mov_b32_e32 v29, 0x40a00000
 ; HSA-NEXT:    v_mov_b32_e32 v30, 0x40c00000
+; HSA-NEXT:    v_writelane_b32 v41, s4, 0
 ; HSA-NEXT:    v_writelane_b32 v40, s31, 1
 ; HSA-NEXT:    s_getpc_b64 s[4:5]
 ; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_8xv5f32 at rel32@lo+4
@@ -7223,9 +7270,10 @@ define void @stack_8xv5f32() #0 {
 ; HSA-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; HSA-NEXT:    v_readlane_b32 s31, v40, 1
 ; HSA-NEXT:    v_readlane_b32 s30, v40, 0
-; HSA-NEXT:    v_readlane_b32 s4, v40, 2
+; HSA-NEXT:    v_readlane_b32 s4, v41, 0
 ; HSA-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; HSA-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; HSA-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; HSA-NEXT:    s_mov_b64 exec, s[6:7]
 ; HSA-NEXT:    s_addk_i32 s32, 0xfc00
 ; HSA-NEXT:    s_mov_b32 s33, s4

diff  --git a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
index 58c55e01ec1f91..510279b51fd30b 100644
--- a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
@@ -15,17 +15,17 @@ define void @use_vcc() #1 {
 
 ; GCN-LABEL: {{^}}indirect_use_vcc:
 ; GCN: s_mov_b32 s4, s33
-; GCN: v_writelane_b32 v40, s4, 2
+; GCN: v_writelane_b32 v41, s4, 0
 ; GCN: v_writelane_b32 v40, s30, 0
 ; GCN: v_writelane_b32 v40, s31, 1
 ; GCN: s_swappc_b64
 ; GCN: v_readlane_b32 s31, v40, 1
 ; GCN: v_readlane_b32 s30, v40, 0
-; GCN: v_readlane_b32 s4, v40, 2
+; GCN: v_readlane_b32 s4, v41, 0
 ; GCN: s_mov_b32 s33, s4
 ; GCN: s_setpc_b64 s[30:31]
 ; GCN: ; NumSgprs: 36
-; GCN: ; NumVgprs: 41
+; GCN: ; NumVgprs: 42
 define void @indirect_use_vcc() #1 {
   call void @use_vcc()
   ret void
@@ -36,7 +36,7 @@ define void @indirect_use_vcc() #1 {
 ; CI: ; NumSgprs: 38
 ; VI-NOBUG: ; NumSgprs: 40
 ; VI-BUG: ; NumSgprs: 96
-; GCN: ; NumVgprs: 41
+; GCN: ; NumVgprs: 42
 define amdgpu_kernel void @indirect_2level_use_vcc_kernel(ptr addrspace(1) %out) #0 {
   call void @indirect_use_vcc()
   ret void
@@ -54,7 +54,7 @@ define void @use_flat_scratch() #1 {
 ; GCN-LABEL: {{^}}indirect_use_flat_scratch:
 ; CI: ; NumSgprs: 38
 ; VI: ; NumSgprs: 40
-; GCN: ; NumVgprs: 41
+; GCN: ; NumVgprs: 42
 define void @indirect_use_flat_scratch() #1 {
   call void @use_flat_scratch()
   ret void
@@ -65,7 +65,7 @@ define void @indirect_use_flat_scratch() #1 {
 ; CI: ; NumSgprs: 38
 ; VI-NOBUG: ; NumSgprs: 40
 ; VI-BUG: ; NumSgprs: 96
-; GCN: ; NumVgprs: 41
+; GCN: ; NumVgprs: 42
 define amdgpu_kernel void @indirect_2level_use_flat_scratch_kernel(ptr addrspace(1) %out) #0 {
   call void @indirect_use_flat_scratch()
   ret void
@@ -80,7 +80,7 @@ define void @use_10_vgpr() #1 {
 }
 
 ; GCN-LABEL: {{^}}indirect_use_10_vgpr:
-; GCN: ; NumVgprs: 41
+; GCN: ; NumVgprs: 42
 define void @indirect_use_10_vgpr() #0 {
   call void @use_10_vgpr()
   ret void
@@ -88,7 +88,7 @@ define void @indirect_use_10_vgpr() #0 {
 
 ; GCN-LABEL: {{^}}indirect_2_level_use_10_vgpr:
 ; GCN: is_dynamic_callstack = 0
-; GCN: ; NumVgprs: 41
+; GCN: ; NumVgprs: 42
 define amdgpu_kernel void @indirect_2_level_use_10_vgpr() #0 {
   call void @indirect_use_10_vgpr()
   ret void

diff  --git a/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll b/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll
index c7f9ff85806fc6..9b638525e76a6e 100644
--- a/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll
@@ -24,11 +24,13 @@ define amdgpu_kernel void @test_kernel_call_external_void_func_void_clobber_s30_
 ; GCN-LABEL: {{^}}test_func_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void:
 ; GCN: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33
 ; MUBUF:   buffer_store_dword
+; MUBUF:   buffer_store_dword
+; FLATSCR: scratch_store_dword
 ; FLATSCR: scratch_store_dword
-; GCN: v_writelane_b32 v40, [[FP_SCRATCH_COPY]], 4
 ; GCN: v_writelane_b32 v40, s30, 0
 ; GCN: v_writelane_b32 v40, s31, 1
 ; GCN: v_writelane_b32 v40, s34, 2
+; GCN: v_writelane_b32 v41, [[FP_SCRATCH_COPY]], 0
 ; GCN: v_writelane_b32 v40, s35, 3
 
 ; GCN: s_swappc_b64
@@ -42,8 +44,10 @@ define amdgpu_kernel void @test_kernel_call_external_void_func_void_clobber_s30_
 ; FLATSCR-DAG: v_readlane_b32 s31, v40, 1
 ; FLATSCR-DAG: v_readlane_b32 s30, v40, 0
 
-; GCN: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], v40, 4
+; GCN: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], v41, 0
 ; MUBUF:   buffer_load_dword
+; MUBUF:   buffer_load_dword
+; FLATSCR: scratch_load_dword
 ; FLATSCR: scratch_load_dword
 ; GCN: s_mov_b32 s33, [[FP_SCRATCH_COPY]]
 ; GCN: s_setpc_b64 s[30:31]
@@ -58,17 +62,21 @@ define void @test_func_call_external_void_func_void_clobber_s30_s31_call_externa
 ; GCN: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33
 ; GCN: s_mov_b32 s33, s32
 ; MUBUF:   buffer_store_dword v40
+; MUBUF:   buffer_store_dword v41
 ; FLATSCR: scratch_store_dword off, v40
-; GCN: v_writelane_b32 v40, [[FP_SCRATCH_COPY]], 4
+; FLATSCR: scratch_store_dword off, v41
 ; MUBUF:   s_addk_i32 s32, 0x400
 ; FLATSCR: s_add_i32 s32, s32, 16
+; GCN: v_writelane_b32 v41, [[FP_SCRATCH_COPY]], 0
 
 ; GCN: s_swappc_b64
 ; GCN-NEXT: s_swappc_b64
 
-; GCN: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], v40, 4
+; GCN: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], v41, 0
 ; MUBUF:   buffer_load_dword v40
+; MUBUF:   buffer_load_dword v41
 ; FLATSCR: scratch_load_dword v40
+; FLATSCR: scratch_load_dword v41
 ; GCN: s_mov_b32 s33, [[FP_SCRATCH_COPY]]
 define void @test_func_call_external_void_funcx2() #0 {
   call void @external_void_func_void()

diff  --git a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
index 0cff703a429ddf..5d61ea1a70fbf4 100644
--- a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
@@ -89,13 +89,15 @@ define void @callee_with_stack_no_fp_elim_non_leaf() #2 {
 ; GCN-NEXT: s_mov_b32 s33, s32
 ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}}
 ; MUBUF-NEXT:   buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; MUBUF-NEXT:   buffer_store_dword [[CSR_VGPR_1:v[0-9]+]], off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s33 offset:4 ; 4-byte Folded Spill
+; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR_1:v[0-9]+]], s33 offset:8 ; 4-byte Folded Spill
 ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
-; GCN: v_writelane_b32 [[CSR_VGPR]], [[FP_SCRATCH_COPY]], 2
 ; MUBUF-DAG:   s_addk_i32 s32, 0x400{{$}}
 ; FLATSCR-DAG: s_add_i32 s32, s32, 16{{$}}
 ; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s30,
 ; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
+; GCN: v_writelane_b32 [[CSR_VGPR_1]], [[FP_SCRATCH_COPY]], 0
 ; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s31,
 
 ; MUBUF-DAG:   buffer_store_dword [[ZERO]], off, s[0:3], s33{{$}}
@@ -106,10 +108,12 @@ define void @callee_with_stack_no_fp_elim_non_leaf() #2 {
 ; GCN-DAG: v_readlane_b32 s30, [[CSR_VGPR]]
 ; GCN-DAG: v_readlane_b32 s31, [[CSR_VGPR]]
 
-; GCN-NEXT: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], [[CSR_VGPR]], 2
+; GCN-NEXT: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], [[CSR_VGPR_1]], 0
 ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
 ; MUBUF-NEXT:   buffer_load_dword [[CSR_VGPR]], off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; MUBUF-NEXT:   buffer_load_dword [[CSR_VGPR_1]], off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
 ; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s33 offset:4 ; 4-byte Folded Reload
+; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR_1]], off, s33 offset:8 ; 4-byte Folded Reload
 ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
 ; MUBUF:    s_addk_i32 s32, 0xfc00{{$}}
 ; FLATSCR:  s_add_i32 s32, s32, -16{{$}}
@@ -136,11 +140,13 @@ define void @callee_with_stack_and_call() #0 {
 ; GCN-NEXT: s_mov_b32 s33, s32
 ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}}
 ; MUBUF-NEXT:   buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s33 ; 4-byte Folded Spill
+; MUBUF-NEXT:   buffer_store_dword [[CSR_VGPR_1:v[0-9]+]], off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s33 ; 4-byte Folded Spill
+; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR_1:v[0-9]+]], s33 offset:4 ; 4-byte Folded Spill
 ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
 ; MUBUF-DAG:   s_addk_i32 s32, 0x400
 ; FLATSCR-DAG: s_add_i32 s32, s32, 16
-; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], [[FP_SCRATCH_COPY]], [[FP_SPILL_LANE:[0-9]+]]
+; GCN-DAG: v_writelane_b32 [[CSR_VGPR_1]], [[FP_SCRATCH_COPY]], [[FP_SPILL_LANE:[0-9]+]]
 
 ; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s30, 0
 ; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s31, 1
@@ -149,10 +155,12 @@ define void @callee_with_stack_and_call() #0 {
 ; GCN-DAG: v_readlane_b32 s30, [[CSR_VGPR]], 0
 ; GCN-DAG: v_readlane_b32 s31, [[CSR_VGPR]], 1
 
-; GCN-NEXT: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], [[CSR_VGPR]], [[FP_SPILL_LANE]]
+; GCN-NEXT: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], [[CSR_VGPR_1]], [[FP_SPILL_LANE]]
 ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
 ; MUBUF-NEXT:   buffer_load_dword [[CSR_VGPR]], off, s[0:3], s33 ; 4-byte Folded Reload
+; MUBUF-NEXT:   buffer_load_dword [[CSR_VGPR_1]], off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s33 ; 4-byte Folded Reload
+; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR_1]], off, s33 offset:4 ; 4-byte Folded Reload
 ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
 ; MUBUF:   s_addk_i32 s32, 0xfc00
 ; FLATSCR: s_add_i32 s32, s32, -16
@@ -623,14 +631,14 @@ define void @callee_need_to_spill_fp_to_memory_full_reserved_vgpr() #3 {
 ; Make sure that the FP save happens after restoring exec from the same
 ; register.
 ; GCN-LABEL: {{^}}callee_need_to_spill_fp_to_reg:
-; FLATSCR: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33
+; FLATSCR: s_mov_b32 s2, s33
 ; FLATSCR: s_mov_b32 s33, s32
 ; GCN-NOT: v_writelane_b32 v40, s33
-; FLATSCR: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}}
-; FLATSCR: s_mov_b64 exec, [[COPY_EXEC0]]
-; FLATSCR: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
+; FLATSCR: s_or_saveexec_b64 s[4:5], -1
+; FLATSCR: s_mov_b64 exec, s[4:5]
+; FLATSCR: s_or_saveexec_b64 s[4:5], -1
 ; GCN-NOT: v_readlane_b32 s33, v40
-; FLATSCR: s_mov_b32 s33, [[FP_SCRATCH_COPY]]
+; FLATSCR: s_mov_b32 s33, s2
 ; GCN:     s_setpc_b64
 define void @callee_need_to_spill_fp_to_reg() #1 {
   call void asm sideeffect "; clobber nonpreserved SGPRs and 64 CSRs",

diff  --git a/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll b/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll
index c9f9078281da25..c9cd55eb33ea8d 100644
--- a/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll
+++ b/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll
@@ -30,13 +30,6 @@ define amdgpu_kernel void @test_loop(ptr addrspace(3) %ptr, i32 %n) nounwind {
 ;
 ; GCN_DBG-LABEL: test_loop:
 ; GCN_DBG:       ; %bb.0: ; %entry
-; GCN_DBG-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GCN_DBG-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
-; GCN_DBG-NEXT:    s_mov_b32 s14, -1
-; GCN_DBG-NEXT:    s_mov_b32 s15, 0xe8f000
-; GCN_DBG-NEXT:    s_add_u32 s12, s12, s11
-; GCN_DBG-NEXT:    s_addc_u32 s13, s13, 0
-; GCN_DBG-NEXT:    ; implicit-def: $vgpr0
 ; GCN_DBG-NEXT:    s_load_dword s0, s[4:5], 0x9
 ; GCN_DBG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN_DBG-NEXT:    v_writelane_b32 v0, s0, 0
@@ -46,25 +39,11 @@ define amdgpu_kernel void @test_loop(ptr addrspace(3) %ptr, i32 %n) nounwind {
 ; GCN_DBG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN_DBG-NEXT:    s_cmp_lg_u32 s1, s2
 ; GCN_DBG-NEXT:    v_writelane_b32 v0, s0, 1
-; GCN_DBG-NEXT:    s_mov_b64 s[6:7], exec
-; GCN_DBG-NEXT:    s_mov_b64 exec, -1
-; GCN_DBG-NEXT:    buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
-; GCN_DBG-NEXT:    s_mov_b64 exec, s[6:7]
 ; GCN_DBG-NEXT:    s_cbranch_scc1 .LBB0_2
 ; GCN_DBG-NEXT:  ; %bb.1: ; %for.exit
-; GCN_DBG-NEXT:    s_or_saveexec_b64 s[6:7], -1
-; GCN_DBG-NEXT:    s_waitcnt expcnt(0)
-; GCN_DBG-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
-; GCN_DBG-NEXT:    s_mov_b64 exec, s[6:7]
-; GCN_DBG-NEXT:    ; kill: killed $vgpr0
 ; GCN_DBG-NEXT:    s_endpgm
 ; GCN_DBG-NEXT:  .LBB0_2: ; %for.body
 ; GCN_DBG-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN_DBG-NEXT:    s_or_saveexec_b64 s[6:7], -1
-; GCN_DBG-NEXT:    s_waitcnt expcnt(0)
-; GCN_DBG-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
-; GCN_DBG-NEXT:    s_mov_b64 exec, s[6:7]
-; GCN_DBG-NEXT:    s_waitcnt vmcnt(0)
 ; GCN_DBG-NEXT:    v_readlane_b32 s0, v0, 1
 ; GCN_DBG-NEXT:    v_readlane_b32 s2, v0, 0
 ; GCN_DBG-NEXT:    s_mov_b32 s1, 2
@@ -86,16 +65,8 @@ define amdgpu_kernel void @test_loop(ptr addrspace(3) %ptr, i32 %n) nounwind {
 ; GCN_DBG-NEXT:    s_mov_b64 s[2:3], -1
 ; GCN_DBG-NEXT:    s_and_b64 vcc, exec, s[2:3]
 ; GCN_DBG-NEXT:    v_writelane_b32 v0, s0, 1
-; GCN_DBG-NEXT:    s_or_saveexec_b64 s[6:7], -1
-; GCN_DBG-NEXT:    buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
-; GCN_DBG-NEXT:    s_mov_b64 exec, s[6:7]
 ; GCN_DBG-NEXT:    s_cbranch_vccnz .LBB0_2
 ; GCN_DBG-NEXT:  ; %bb.3: ; %DummyReturnBlock
-; GCN_DBG-NEXT:    s_or_saveexec_b64 s[6:7], -1
-; GCN_DBG-NEXT:    s_waitcnt expcnt(0)
-; GCN_DBG-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
-; GCN_DBG-NEXT:    s_mov_b64 exec, s[6:7]
-; GCN_DBG-NEXT:    ; kill: killed $vgpr0
 ; GCN_DBG-NEXT:    s_endpgm
 entry:
   %cmp = icmp eq i32 %n, -1
@@ -138,36 +109,16 @@ define amdgpu_kernel void @loop_const_true(ptr addrspace(3) %ptr, i32 %n) nounwi
 ;
 ; GCN_DBG-LABEL: loop_const_true:
 ; GCN_DBG:       ; %bb.0: ; %entry
-; GCN_DBG-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GCN_DBG-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
-; GCN_DBG-NEXT:    s_mov_b32 s14, -1
-; GCN_DBG-NEXT:    s_mov_b32 s15, 0xe8f000
-; GCN_DBG-NEXT:    s_add_u32 s12, s12, s11
-; GCN_DBG-NEXT:    s_addc_u32 s13, s13, 0
-; GCN_DBG-NEXT:    ; implicit-def: $vgpr0
 ; GCN_DBG-NEXT:    s_load_dword s0, s[4:5], 0x9
 ; GCN_DBG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN_DBG-NEXT:    v_writelane_b32 v0, s0, 0
 ; GCN_DBG-NEXT:    s_mov_b32 s0, 0
 ; GCN_DBG-NEXT:    v_writelane_b32 v0, s0, 1
-; GCN_DBG-NEXT:    s_or_saveexec_b64 s[6:7], -1
-; GCN_DBG-NEXT:    buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
-; GCN_DBG-NEXT:    s_mov_b64 exec, s[6:7]
 ; GCN_DBG-NEXT:    s_branch .LBB1_2
 ; GCN_DBG-NEXT:  .LBB1_1: ; %for.exit
-; GCN_DBG-NEXT:    s_or_saveexec_b64 s[6:7], -1
-; GCN_DBG-NEXT:    s_waitcnt expcnt(0)
-; GCN_DBG-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
-; GCN_DBG-NEXT:    s_mov_b64 exec, s[6:7]
-; GCN_DBG-NEXT:    ; kill: killed $vgpr0
 ; GCN_DBG-NEXT:    s_endpgm
 ; GCN_DBG-NEXT:  .LBB1_2: ; %for.body
 ; GCN_DBG-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN_DBG-NEXT:    s_or_saveexec_b64 s[6:7], -1
-; GCN_DBG-NEXT:    s_waitcnt expcnt(0)
-; GCN_DBG-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
-; GCN_DBG-NEXT:    s_mov_b64 exec, s[6:7]
-; GCN_DBG-NEXT:    s_waitcnt vmcnt(0)
 ; GCN_DBG-NEXT:    v_readlane_b32 s0, v0, 1
 ; GCN_DBG-NEXT:    v_readlane_b32 s2, v0, 0
 ; GCN_DBG-NEXT:    s_mov_b32 s1, 2
@@ -189,9 +140,6 @@ define amdgpu_kernel void @loop_const_true(ptr addrspace(3) %ptr, i32 %n) nounwi
 ; GCN_DBG-NEXT:    s_mov_b64 s[2:3], 0
 ; GCN_DBG-NEXT:    s_and_b64 vcc, exec, s[2:3]
 ; GCN_DBG-NEXT:    v_writelane_b32 v0, s0, 1
-; GCN_DBG-NEXT:    s_or_saveexec_b64 s[6:7], -1
-; GCN_DBG-NEXT:    buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
-; GCN_DBG-NEXT:    s_mov_b64 exec, s[6:7]
 ; GCN_DBG-NEXT:    s_cbranch_vccnz .LBB1_1
 ; GCN_DBG-NEXT:    s_branch .LBB1_2
 entry:
@@ -226,36 +174,16 @@ define amdgpu_kernel void @loop_const_false(ptr addrspace(3) %ptr, i32 %n) nounw
 ;
 ; GCN_DBG-LABEL: loop_const_false:
 ; GCN_DBG:       ; %bb.0: ; %entry
-; GCN_DBG-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GCN_DBG-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
-; GCN_DBG-NEXT:    s_mov_b32 s14, -1
-; GCN_DBG-NEXT:    s_mov_b32 s15, 0xe8f000
-; GCN_DBG-NEXT:    s_add_u32 s12, s12, s11
-; GCN_DBG-NEXT:    s_addc_u32 s13, s13, 0
-; GCN_DBG-NEXT:    ; implicit-def: $vgpr0
 ; GCN_DBG-NEXT:    s_load_dword s0, s[4:5], 0x9
 ; GCN_DBG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN_DBG-NEXT:    v_writelane_b32 v0, s0, 0
 ; GCN_DBG-NEXT:    s_mov_b32 s0, 0
 ; GCN_DBG-NEXT:    v_writelane_b32 v0, s0, 1
-; GCN_DBG-NEXT:    s_or_saveexec_b64 s[6:7], -1
-; GCN_DBG-NEXT:    buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
-; GCN_DBG-NEXT:    s_mov_b64 exec, s[6:7]
 ; GCN_DBG-NEXT:    s_branch .LBB2_2
 ; GCN_DBG-NEXT:  .LBB2_1: ; %for.exit
-; GCN_DBG-NEXT:    s_or_saveexec_b64 s[6:7], -1
-; GCN_DBG-NEXT:    s_waitcnt expcnt(0)
-; GCN_DBG-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
-; GCN_DBG-NEXT:    s_mov_b64 exec, s[6:7]
-; GCN_DBG-NEXT:    ; kill: killed $vgpr0
 ; GCN_DBG-NEXT:    s_endpgm
 ; GCN_DBG-NEXT:  .LBB2_2: ; %for.body
 ; GCN_DBG-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN_DBG-NEXT:    s_or_saveexec_b64 s[6:7], -1
-; GCN_DBG-NEXT:    s_waitcnt expcnt(0)
-; GCN_DBG-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
-; GCN_DBG-NEXT:    s_mov_b64 exec, s[6:7]
-; GCN_DBG-NEXT:    s_waitcnt vmcnt(0)
 ; GCN_DBG-NEXT:    v_readlane_b32 s0, v0, 1
 ; GCN_DBG-NEXT:    v_readlane_b32 s2, v0, 0
 ; GCN_DBG-NEXT:    s_mov_b32 s1, 2
@@ -277,9 +205,6 @@ define amdgpu_kernel void @loop_const_false(ptr addrspace(3) %ptr, i32 %n) nounw
 ; GCN_DBG-NEXT:    s_mov_b64 s[2:3], -1
 ; GCN_DBG-NEXT:    s_and_b64 vcc, exec, s[2:3]
 ; GCN_DBG-NEXT:    v_writelane_b32 v0, s0, 1
-; GCN_DBG-NEXT:    s_or_saveexec_b64 s[6:7], -1
-; GCN_DBG-NEXT:    buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
-; GCN_DBG-NEXT:    s_mov_b64 exec, s[6:7]
 ; GCN_DBG-NEXT:    s_cbranch_vccnz .LBB2_1
 ; GCN_DBG-NEXT:    s_branch .LBB2_2
 entry:
@@ -315,36 +240,16 @@ define amdgpu_kernel void @loop_const_undef(ptr addrspace(3) %ptr, i32 %n) nounw
 ;
 ; GCN_DBG-LABEL: loop_const_undef:
 ; GCN_DBG:       ; %bb.0: ; %entry
-; GCN_DBG-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GCN_DBG-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
-; GCN_DBG-NEXT:    s_mov_b32 s14, -1
-; GCN_DBG-NEXT:    s_mov_b32 s15, 0xe8f000
-; GCN_DBG-NEXT:    s_add_u32 s12, s12, s11
-; GCN_DBG-NEXT:    s_addc_u32 s13, s13, 0
-; GCN_DBG-NEXT:    ; implicit-def: $vgpr0
 ; GCN_DBG-NEXT:    s_load_dword s0, s[4:5], 0x9
 ; GCN_DBG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN_DBG-NEXT:    v_writelane_b32 v0, s0, 0
 ; GCN_DBG-NEXT:    s_mov_b32 s0, 0
 ; GCN_DBG-NEXT:    v_writelane_b32 v0, s0, 1
-; GCN_DBG-NEXT:    s_or_saveexec_b64 s[6:7], -1
-; GCN_DBG-NEXT:    buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
-; GCN_DBG-NEXT:    s_mov_b64 exec, s[6:7]
 ; GCN_DBG-NEXT:    s_branch .LBB3_2
 ; GCN_DBG-NEXT:  .LBB3_1: ; %for.exit
-; GCN_DBG-NEXT:    s_or_saveexec_b64 s[6:7], -1
-; GCN_DBG-NEXT:    s_waitcnt expcnt(0)
-; GCN_DBG-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
-; GCN_DBG-NEXT:    s_mov_b64 exec, s[6:7]
-; GCN_DBG-NEXT:    ; kill: killed $vgpr0
 ; GCN_DBG-NEXT:    s_endpgm
 ; GCN_DBG-NEXT:  .LBB3_2: ; %for.body
 ; GCN_DBG-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN_DBG-NEXT:    s_or_saveexec_b64 s[6:7], -1
-; GCN_DBG-NEXT:    s_waitcnt expcnt(0)
-; GCN_DBG-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
-; GCN_DBG-NEXT:    s_mov_b64 exec, s[6:7]
-; GCN_DBG-NEXT:    s_waitcnt vmcnt(0)
 ; GCN_DBG-NEXT:    v_readlane_b32 s0, v0, 1
 ; GCN_DBG-NEXT:    v_readlane_b32 s2, v0, 0
 ; GCN_DBG-NEXT:    s_mov_b32 s1, 2
@@ -364,9 +269,6 @@ define amdgpu_kernel void @loop_const_undef(ptr addrspace(3) %ptr, i32 %n) nounw
 ; GCN_DBG-NEXT:    s_mov_b32 s1, 1
 ; GCN_DBG-NEXT:    s_add_i32 s0, s0, s1
 ; GCN_DBG-NEXT:    v_writelane_b32 v0, s0, 1
-; GCN_DBG-NEXT:    s_or_saveexec_b64 s[6:7], -1
-; GCN_DBG-NEXT:    buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
-; GCN_DBG-NEXT:    s_mov_b64 exec, s[6:7]
 ; GCN_DBG-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GCN_DBG-NEXT:    s_branch .LBB3_2
 entry:
@@ -416,13 +318,6 @@ define amdgpu_kernel void @loop_arg_0(ptr addrspace(3) %ptr, i32 %n) nounwind {
 ;
 ; GCN_DBG-LABEL: loop_arg_0:
 ; GCN_DBG:       ; %bb.0: ; %entry
-; GCN_DBG-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GCN_DBG-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
-; GCN_DBG-NEXT:    s_mov_b32 s14, -1
-; GCN_DBG-NEXT:    s_mov_b32 s15, 0xe8f000
-; GCN_DBG-NEXT:    s_add_u32 s12, s12, s11
-; GCN_DBG-NEXT:    s_addc_u32 s13, s13, 0
-; GCN_DBG-NEXT:    ; implicit-def: $vgpr0
 ; GCN_DBG-NEXT:    s_load_dword s0, s[4:5], 0x9
 ; GCN_DBG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN_DBG-NEXT:    v_writelane_b32 v0, s0, 0
@@ -440,24 +335,11 @@ define amdgpu_kernel void @loop_arg_0(ptr addrspace(3) %ptr, i32 %n) nounwind {
 ; GCN_DBG-NEXT:    v_writelane_b32 v0, s1, 2
 ; GCN_DBG-NEXT:    s_mov_b32 s0, 0
 ; GCN_DBG-NEXT:    v_writelane_b32 v0, s0, 3
-; GCN_DBG-NEXT:    s_or_saveexec_b64 s[6:7], -1
-; GCN_DBG-NEXT:    buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
-; GCN_DBG-NEXT:    s_mov_b64 exec, s[6:7]
 ; GCN_DBG-NEXT:    s_branch .LBB4_2
 ; GCN_DBG-NEXT:  .LBB4_1: ; %for.exit
-; GCN_DBG-NEXT:    s_or_saveexec_b64 s[6:7], -1
-; GCN_DBG-NEXT:    s_waitcnt expcnt(0)
-; GCN_DBG-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
-; GCN_DBG-NEXT:    s_mov_b64 exec, s[6:7]
-; GCN_DBG-NEXT:    ; kill: killed $vgpr0
 ; GCN_DBG-NEXT:    s_endpgm
 ; GCN_DBG-NEXT:  .LBB4_2: ; %for.body
 ; GCN_DBG-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN_DBG-NEXT:    s_or_saveexec_b64 s[6:7], -1
-; GCN_DBG-NEXT:    s_waitcnt expcnt(0)
-; GCN_DBG-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
-; GCN_DBG-NEXT:    s_mov_b64 exec, s[6:7]
-; GCN_DBG-NEXT:    s_waitcnt vmcnt(0)
 ; GCN_DBG-NEXT:    v_readlane_b32 s0, v0, 3
 ; GCN_DBG-NEXT:    v_readlane_b32 s2, v0, 1
 ; GCN_DBG-NEXT:    v_readlane_b32 s3, v0, 2
@@ -480,9 +362,6 @@ define amdgpu_kernel void @loop_arg_0(ptr addrspace(3) %ptr, i32 %n) nounwind {
 ; GCN_DBG-NEXT:    s_add_i32 s0, s0, s1
 ; GCN_DBG-NEXT:    s_and_b64 vcc, exec, s[2:3]
 ; GCN_DBG-NEXT:    v_writelane_b32 v0, s0, 3
-; GCN_DBG-NEXT:    s_or_saveexec_b64 s[6:7], -1
-; GCN_DBG-NEXT:    buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
-; GCN_DBG-NEXT:    s_mov_b64 exec, s[6:7]
 ; GCN_DBG-NEXT:    s_cbranch_vccnz .LBB4_1
 ; GCN_DBG-NEXT:    s_branch .LBB4_2
 entry:

diff  --git a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
index 74d62ffc580296..189b0c3af3a31a 100644
--- a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
+++ b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
@@ -48,37 +48,24 @@ define amdgpu_kernel void @simple_nested_if(ptr addrspace(1) nocapture %arg) {
 ; GCN-O0-NEXT:    s_mov_b32 s15, 0xe8f000
 ; GCN-O0-NEXT:    s_add_u32 s12, s12, s11
 ; GCN-O0-NEXT:    s_addc_u32 s13, s13, 0
-; GCN-O0-NEXT:    ; implicit-def: $vgpr1
-; GCN-O0-NEXT:    v_mov_b32_e32 v1, v0
-; GCN-O0-NEXT:    s_or_saveexec_b64 s[8:9], -1
-; GCN-O0-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    s_mov_b64 exec, s[8:9]
 ; GCN-O0-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; GCN-O0-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN-O0-NEXT:    v_writelane_b32 v0, s0, 0
-; GCN-O0-NEXT:    v_writelane_b32 v0, s1, 1
-; GCN-O0-NEXT:    v_mov_b32_e32 v2, v1
-; GCN-O0-NEXT:    buffer_store_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
+; GCN-O0-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-O0-NEXT:    v_writelane_b32 v1, s0, 0
+; GCN-O0-NEXT:    v_writelane_b32 v1, s1, 1
+; GCN-O0-NEXT:    v_mov_b32_e32 v2, v0
+; GCN-O0-NEXT:    buffer_store_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
 ; GCN-O0-NEXT:    s_mov_b32 s0, 1
-; GCN-O0-NEXT:    v_cmp_gt_u32_e64 s[2:3], v1, s0
+; GCN-O0-NEXT:    v_cmp_gt_u32_e64 s[2:3], v0, s0
 ; GCN-O0-NEXT:    s_mov_b64 s[0:1], exec
-; GCN-O0-NEXT:    v_writelane_b32 v0, s0, 2
-; GCN-O0-NEXT:    v_writelane_b32 v0, s1, 3
-; GCN-O0-NEXT:    s_or_saveexec_b64 s[8:9], -1
-; GCN-O0-NEXT:    buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
-; GCN-O0-NEXT:    s_mov_b64 exec, s[8:9]
+; GCN-O0-NEXT:    v_writelane_b32 v1, s0, 2
+; GCN-O0-NEXT:    v_writelane_b32 v1, s1, 3
 ; GCN-O0-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
 ; GCN-O0-NEXT:    s_mov_b64 exec, s[0:1]
 ; GCN-O0-NEXT:    s_cbranch_execz .LBB0_4
 ; GCN-O0-NEXT:  ; %bb.1: ; %bb.outer.then
-; GCN-O0-NEXT:    s_or_saveexec_b64 s[8:9], -1
-; GCN-O0-NEXT:    s_waitcnt expcnt(0)
 ; GCN-O0-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    s_mov_b64 exec, s[8:9]
-; GCN-O0-NEXT:    buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    s_waitcnt vmcnt(1)
-; GCN-O0-NEXT:    v_readlane_b32 s4, v0, 0
-; GCN-O0-NEXT:    v_readlane_b32 s5, v0, 1
+; GCN-O0-NEXT:    v_readlane_b32 s4, v1, 0
+; GCN-O0-NEXT:    v_readlane_b32 s5, v1, 1
 ; GCN-O0-NEXT:    s_mov_b32 s2, 0xf000
 ; GCN-O0-NEXT:    s_mov_b32 s0, 0
 ; GCN-O0-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
@@ -86,69 +73,54 @@ define amdgpu_kernel void @simple_nested_if(ptr addrspace(1) nocapture %arg) {
 ; GCN-O0-NEXT:    ; kill: def $sgpr4_sgpr5 killed $sgpr4_sgpr5 def $sgpr4_sgpr5_sgpr6_sgpr7
 ; GCN-O0-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
-; GCN-O0-NEXT:    v_ashrrev_i32_e64 v4, 31, v1
-; GCN-O0-NEXT:    v_mov_b32_e32 v2, v1
+; GCN-O0-NEXT:    v_ashrrev_i32_e64 v4, 31, v0
+; GCN-O0-NEXT:    s_waitcnt expcnt(0)
+; GCN-O0-NEXT:    v_mov_b32_e32 v2, v0
 ; GCN-O0-NEXT:    v_mov_b32_e32 v3, v4
 ; GCN-O0-NEXT:    s_mov_b32 s0, 2
 ; GCN-O0-NEXT:    v_lshl_b64 v[3:4], v[2:3], s0
 ; GCN-O0-NEXT:    v_mov_b32_e32 v2, 0
 ; GCN-O0-NEXT:    buffer_store_dword v2, v[3:4], s[4:7], 0 addr64
-; GCN-O0-NEXT:    v_cmp_ne_u32_e64 s[2:3], v1, s0
+; GCN-O0-NEXT:    v_cmp_ne_u32_e64 s[2:3], v0, s0
 ; GCN-O0-NEXT:    s_mov_b64 s[0:1], exec
-; GCN-O0-NEXT:    v_writelane_b32 v0, s0, 4
-; GCN-O0-NEXT:    v_writelane_b32 v0, s1, 5
-; GCN-O0-NEXT:    s_or_saveexec_b64 s[8:9], -1
-; GCN-O0-NEXT:    buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
-; GCN-O0-NEXT:    s_mov_b64 exec, s[8:9]
+; GCN-O0-NEXT:    v_writelane_b32 v1, s0, 4
+; GCN-O0-NEXT:    v_writelane_b32 v1, s1, 5
 ; GCN-O0-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
 ; GCN-O0-NEXT:    s_mov_b64 exec, s[0:1]
 ; GCN-O0-NEXT:    s_cbranch_execz .LBB0_3
 ; GCN-O0-NEXT:  ; %bb.2: ; %bb.inner.then
-; GCN-O0-NEXT:    buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    s_or_saveexec_b64 s[8:9], -1
 ; GCN-O0-NEXT:    s_waitcnt expcnt(0)
-; GCN-O0-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    s_mov_b64 exec, s[8:9]
-; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
-; GCN-O0-NEXT:    v_readlane_b32 s0, v0, 0
-; GCN-O0-NEXT:    v_readlane_b32 s1, v0, 1
+; GCN-O0-NEXT:    buffer_load_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-O0-NEXT:    v_readlane_b32 s0, v1, 0
+; GCN-O0-NEXT:    v_readlane_b32 s1, v1, 1
 ; GCN-O0-NEXT:    v_mov_b32_e32 v0, 1
-; GCN-O0-NEXT:    v_add_i32_e64 v1, s[2:3], v1, v0
-; GCN-O0-NEXT:    v_ashrrev_i32_e64 v3, 31, v1
-; GCN-O0-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GCN-O0-NEXT:    v_mov_b32_e32 v2, v3
+; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
+; GCN-O0-NEXT:    v_add_i32_e64 v2, s[2:3], v2, v0
+; GCN-O0-NEXT:    v_ashrrev_i32_e64 v4, 31, v2
+; GCN-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GCN-O0-NEXT:    v_mov_b32_e32 v3, v4
 ; GCN-O0-NEXT:    s_mov_b32 s2, 2
-; GCN-O0-NEXT:    v_lshl_b64 v[1:2], v[1:2], s2
+; GCN-O0-NEXT:    v_lshl_b64 v[2:3], v[2:3], s2
 ; GCN-O0-NEXT:    s_mov_b32 s2, 0xf000
 ; GCN-O0-NEXT:    s_mov_b32 s4, 0
 ; GCN-O0-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
 ; GCN-O0-NEXT:    s_mov_b32 s5, s2
 ; GCN-O0-NEXT:    ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3
 ; GCN-O0-NEXT:    s_mov_b64 s[2:3], s[4:5]
-; GCN-O0-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; GCN-O0-NEXT:    buffer_store_dword v0, v[2:3], s[0:3], 0 addr64
 ; GCN-O0-NEXT:  .LBB0_3: ; %Flow
-; GCN-O0-NEXT:    s_or_saveexec_b64 s[8:9], -1
-; GCN-O0-NEXT:    s_waitcnt expcnt(0)
-; GCN-O0-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    s_mov_b64 exec, s[8:9]
-; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
-; GCN-O0-NEXT:    v_readlane_b32 s0, v0, 4
-; GCN-O0-NEXT:    v_readlane_b32 s1, v0, 5
+; GCN-O0-NEXT:    v_readlane_b32 s0, v1, 4
+; GCN-O0-NEXT:    v_readlane_b32 s1, v1, 5
 ; GCN-O0-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GCN-O0-NEXT:  .LBB0_4: ; %bb.outer.end
-; GCN-O0-NEXT:    s_or_saveexec_b64 s[8:9], -1
-; GCN-O0-NEXT:    s_waitcnt expcnt(0)
-; GCN-O0-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    s_mov_b64 exec, s[8:9]
-; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
-; GCN-O0-NEXT:    v_readlane_b32 s0, v0, 2
-; GCN-O0-NEXT:    v_readlane_b32 s1, v0, 3
+; GCN-O0-NEXT:    v_readlane_b32 s0, v1, 2
+; GCN-O0-NEXT:    v_readlane_b32 s1, v1, 3
 ; GCN-O0-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GCN-O0-NEXT:    s_waitcnt expcnt(0)
 ; GCN-O0-NEXT:    v_mov_b32_e32 v2, 3
-; GCN-O0-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-O0-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-O0-NEXT:    s_mov_b32 m0, -1
-; GCN-O0-NEXT:    ds_write_b32 v1, v2
-; GCN-O0-NEXT:    ; kill: killed $vgpr0
+; GCN-O0-NEXT:    ds_write_b32 v0, v2
 ; GCN-O0-NEXT:    s_endpgm
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -221,37 +193,24 @@ define amdgpu_kernel void @uncollapsable_nested_if(ptr addrspace(1) nocapture %a
 ; GCN-O0-NEXT:    s_mov_b32 s15, 0xe8f000
 ; GCN-O0-NEXT:    s_add_u32 s12, s12, s11
 ; GCN-O0-NEXT:    s_addc_u32 s13, s13, 0
-; GCN-O0-NEXT:    ; implicit-def: $vgpr1
-; GCN-O0-NEXT:    v_mov_b32_e32 v1, v0
-; GCN-O0-NEXT:    s_or_saveexec_b64 s[8:9], -1
-; GCN-O0-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    s_mov_b64 exec, s[8:9]
 ; GCN-O0-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; GCN-O0-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN-O0-NEXT:    v_writelane_b32 v0, s0, 0
-; GCN-O0-NEXT:    v_writelane_b32 v0, s1, 1
-; GCN-O0-NEXT:    v_mov_b32_e32 v2, v1
-; GCN-O0-NEXT:    buffer_store_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
+; GCN-O0-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-O0-NEXT:    v_writelane_b32 v1, s0, 0
+; GCN-O0-NEXT:    v_writelane_b32 v1, s1, 1
+; GCN-O0-NEXT:    v_mov_b32_e32 v2, v0
+; GCN-O0-NEXT:    buffer_store_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
 ; GCN-O0-NEXT:    s_mov_b32 s0, 1
-; GCN-O0-NEXT:    v_cmp_gt_u32_e64 s[2:3], v1, s0
+; GCN-O0-NEXT:    v_cmp_gt_u32_e64 s[2:3], v0, s0
 ; GCN-O0-NEXT:    s_mov_b64 s[0:1], exec
-; GCN-O0-NEXT:    v_writelane_b32 v0, s0, 2
-; GCN-O0-NEXT:    v_writelane_b32 v0, s1, 3
-; GCN-O0-NEXT:    s_or_saveexec_b64 s[8:9], -1
-; GCN-O0-NEXT:    buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
-; GCN-O0-NEXT:    s_mov_b64 exec, s[8:9]
+; GCN-O0-NEXT:    v_writelane_b32 v1, s0, 2
+; GCN-O0-NEXT:    v_writelane_b32 v1, s1, 3
 ; GCN-O0-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
 ; GCN-O0-NEXT:    s_mov_b64 exec, s[0:1]
 ; GCN-O0-NEXT:    s_cbranch_execz .LBB1_3
 ; GCN-O0-NEXT:  ; %bb.1: ; %bb.outer.then
-; GCN-O0-NEXT:    s_or_saveexec_b64 s[8:9], -1
-; GCN-O0-NEXT:    s_waitcnt expcnt(0)
 ; GCN-O0-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    s_mov_b64 exec, s[8:9]
-; GCN-O0-NEXT:    buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    s_waitcnt vmcnt(1)
-; GCN-O0-NEXT:    v_readlane_b32 s4, v0, 0
-; GCN-O0-NEXT:    v_readlane_b32 s5, v0, 1
+; GCN-O0-NEXT:    v_readlane_b32 s4, v1, 0
+; GCN-O0-NEXT:    v_readlane_b32 s5, v1, 1
 ; GCN-O0-NEXT:    s_mov_b32 s2, 0xf000
 ; GCN-O0-NEXT:    s_mov_b32 s0, 0
 ; GCN-O0-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
@@ -259,92 +218,76 @@ define amdgpu_kernel void @uncollapsable_nested_if(ptr addrspace(1) nocapture %a
 ; GCN-O0-NEXT:    ; kill: def $sgpr4_sgpr5 killed $sgpr4_sgpr5 def $sgpr4_sgpr5_sgpr6_sgpr7
 ; GCN-O0-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
-; GCN-O0-NEXT:    v_ashrrev_i32_e64 v4, 31, v1
-; GCN-O0-NEXT:    v_mov_b32_e32 v2, v1
+; GCN-O0-NEXT:    v_ashrrev_i32_e64 v4, 31, v0
+; GCN-O0-NEXT:    s_waitcnt expcnt(0)
+; GCN-O0-NEXT:    v_mov_b32_e32 v2, v0
 ; GCN-O0-NEXT:    v_mov_b32_e32 v3, v4
 ; GCN-O0-NEXT:    s_mov_b32 s0, 2
 ; GCN-O0-NEXT:    v_lshl_b64 v[3:4], v[2:3], s0
 ; GCN-O0-NEXT:    v_mov_b32_e32 v2, 0
 ; GCN-O0-NEXT:    buffer_store_dword v2, v[3:4], s[4:7], 0 addr64
-; GCN-O0-NEXT:    v_cmp_ne_u32_e64 s[2:3], v1, s0
+; GCN-O0-NEXT:    v_cmp_ne_u32_e64 s[2:3], v0, s0
 ; GCN-O0-NEXT:    s_mov_b64 s[0:1], exec
-; GCN-O0-NEXT:    v_writelane_b32 v0, s0, 4
-; GCN-O0-NEXT:    v_writelane_b32 v0, s1, 5
-; GCN-O0-NEXT:    s_or_saveexec_b64 s[8:9], -1
-; GCN-O0-NEXT:    buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
-; GCN-O0-NEXT:    s_mov_b64 exec, s[8:9]
+; GCN-O0-NEXT:    v_writelane_b32 v1, s0, 4
+; GCN-O0-NEXT:    v_writelane_b32 v1, s1, 5
 ; GCN-O0-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
 ; GCN-O0-NEXT:    s_mov_b64 exec, s[0:1]
 ; GCN-O0-NEXT:    s_cbranch_execz .LBB1_4
 ; GCN-O0-NEXT:  ; %bb.2: ; %bb.inner.then
-; GCN-O0-NEXT:    buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    s_or_saveexec_b64 s[8:9], -1
 ; GCN-O0-NEXT:    s_waitcnt expcnt(0)
-; GCN-O0-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    s_mov_b64 exec, s[8:9]
-; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
-; GCN-O0-NEXT:    v_readlane_b32 s0, v0, 0
-; GCN-O0-NEXT:    v_readlane_b32 s1, v0, 1
+; GCN-O0-NEXT:    buffer_load_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-O0-NEXT:    v_readlane_b32 s0, v1, 0
+; GCN-O0-NEXT:    v_readlane_b32 s1, v1, 1
 ; GCN-O0-NEXT:    v_mov_b32_e32 v0, 1
-; GCN-O0-NEXT:    v_add_i32_e64 v1, s[2:3], v1, v0
-; GCN-O0-NEXT:    v_ashrrev_i32_e64 v3, 31, v1
-; GCN-O0-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GCN-O0-NEXT:    v_mov_b32_e32 v2, v3
+; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
+; GCN-O0-NEXT:    v_add_i32_e64 v2, s[2:3], v2, v0
+; GCN-O0-NEXT:    v_ashrrev_i32_e64 v4, 31, v2
+; GCN-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GCN-O0-NEXT:    v_mov_b32_e32 v3, v4
 ; GCN-O0-NEXT:    s_mov_b32 s2, 2
-; GCN-O0-NEXT:    v_lshl_b64 v[1:2], v[1:2], s2
+; GCN-O0-NEXT:    v_lshl_b64 v[2:3], v[2:3], s2
 ; GCN-O0-NEXT:    s_mov_b32 s2, 0xf000
 ; GCN-O0-NEXT:    s_mov_b32 s4, 0
 ; GCN-O0-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
 ; GCN-O0-NEXT:    s_mov_b32 s5, s2
 ; GCN-O0-NEXT:    ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3
 ; GCN-O0-NEXT:    s_mov_b64 s[2:3], s[4:5]
-; GCN-O0-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; GCN-O0-NEXT:    buffer_store_dword v0, v[2:3], s[0:3], 0 addr64
 ; GCN-O0-NEXT:    s_branch .LBB1_4
 ; GCN-O0-NEXT:  .LBB1_3: ; %Flow
-; GCN-O0-NEXT:    s_or_saveexec_b64 s[8:9], -1
-; GCN-O0-NEXT:    s_waitcnt expcnt(0)
-; GCN-O0-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    s_mov_b64 exec, s[8:9]
-; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
-; GCN-O0-NEXT:    v_readlane_b32 s0, v0, 2
-; GCN-O0-NEXT:    v_readlane_b32 s1, v0, 3
+; GCN-O0-NEXT:    v_readlane_b32 s0, v1, 2
+; GCN-O0-NEXT:    v_readlane_b32 s1, v1, 3
 ; GCN-O0-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GCN-O0-NEXT:    s_branch .LBB1_5
 ; GCN-O0-NEXT:  .LBB1_4: ; %bb.inner.end
-; GCN-O0-NEXT:    buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    s_or_saveexec_b64 s[8:9], -1
 ; GCN-O0-NEXT:    s_waitcnt expcnt(0)
-; GCN-O0-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    s_mov_b64 exec, s[8:9]
-; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
-; GCN-O0-NEXT:    v_readlane_b32 s2, v0, 4
-; GCN-O0-NEXT:    v_readlane_b32 s3, v0, 5
+; GCN-O0-NEXT:    buffer_load_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-O0-NEXT:    v_readlane_b32 s2, v1, 4
+; GCN-O0-NEXT:    v_readlane_b32 s3, v1, 5
 ; GCN-O0-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GCN-O0-NEXT:    v_readlane_b32 s0, v0, 0
-; GCN-O0-NEXT:    v_readlane_b32 s1, v0, 1
+; GCN-O0-NEXT:    v_readlane_b32 s0, v1, 0
+; GCN-O0-NEXT:    v_readlane_b32 s1, v1, 1
 ; GCN-O0-NEXT:    v_mov_b32_e32 v0, 2
-; GCN-O0-NEXT:    v_add_i32_e64 v1, s[2:3], v1, v0
-; GCN-O0-NEXT:    v_ashrrev_i32_e64 v3, 31, v1
-; GCN-O0-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GCN-O0-NEXT:    v_mov_b32_e32 v2, v3
-; GCN-O0-NEXT:    v_lshl_b64 v[1:2], v[1:2], v0
+; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
+; GCN-O0-NEXT:    v_add_i32_e64 v2, s[2:3], v2, v0
+; GCN-O0-NEXT:    v_ashrrev_i32_e64 v4, 31, v2
+; GCN-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GCN-O0-NEXT:    v_mov_b32_e32 v3, v4
+; GCN-O0-NEXT:    v_lshl_b64 v[2:3], v[2:3], v0
 ; GCN-O0-NEXT:    s_mov_b32 s2, 0xf000
 ; GCN-O0-NEXT:    s_mov_b32 s4, 0
 ; GCN-O0-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
 ; GCN-O0-NEXT:    s_mov_b32 s5, s2
 ; GCN-O0-NEXT:    ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3
 ; GCN-O0-NEXT:    s_mov_b64 s[2:3], s[4:5]
-; GCN-O0-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; GCN-O0-NEXT:    buffer_store_dword v0, v[2:3], s[0:3], 0 addr64
 ; GCN-O0-NEXT:    s_branch .LBB1_3
 ; GCN-O0-NEXT:  .LBB1_5: ; %bb.outer.end
-; GCN-O0-NEXT:    s_or_saveexec_b64 s[8:9], -1
-; GCN-O0-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    s_mov_b64 exec, s[8:9]
+; GCN-O0-NEXT:    s_waitcnt expcnt(0)
 ; GCN-O0-NEXT:    v_mov_b32_e32 v2, 3
-; GCN-O0-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-O0-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-O0-NEXT:    s_mov_b32 m0, -1
-; GCN-O0-NEXT:    ds_write_b32 v1, v2
-; GCN-O0-NEXT:    ; kill: killed $vgpr0
+; GCN-O0-NEXT:    ds_write_b32 v0, v2
 ; GCN-O0-NEXT:    s_endpgm
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -430,19 +373,13 @@ define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) {
 ; GCN-O0-NEXT:    s_mov_b32 s15, 0xe8f000
 ; GCN-O0-NEXT:    s_add_u32 s12, s12, s11
 ; GCN-O0-NEXT:    s_addc_u32 s13, s13, 0
-; GCN-O0-NEXT:    ; implicit-def: $vgpr1
-; GCN-O0-NEXT:    v_mov_b32_e32 v1, v0
-; GCN-O0-NEXT:    s_or_saveexec_b64 s[6:7], -1
-; GCN-O0-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    s_mov_b64 exec, s[6:7]
 ; GCN-O0-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
 ; GCN-O0-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-O0-NEXT:    s_mov_b64 s[2:3], s[0:1]
-; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
-; GCN-O0-NEXT:    v_writelane_b32 v0, s2, 0
-; GCN-O0-NEXT:    v_writelane_b32 v0, s3, 1
-; GCN-O0-NEXT:    v_mov_b32_e32 v2, v1
-; GCN-O0-NEXT:    buffer_store_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
+; GCN-O0-NEXT:    v_writelane_b32 v1, s2, 0
+; GCN-O0-NEXT:    v_writelane_b32 v1, s3, 1
+; GCN-O0-NEXT:    v_mov_b32_e32 v2, v0
+; GCN-O0-NEXT:    buffer_store_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
 ; GCN-O0-NEXT:    s_mov_b32 s2, 0xf000
 ; GCN-O0-NEXT:    s_mov_b32 s4, 0
 ; GCN-O0-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
@@ -453,130 +390,96 @@ define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) {
 ; GCN-O0-NEXT:    ; implicit-def: $sgpr4
 ; GCN-O0-NEXT:    v_mov_b32_e32 v4, 0
 ; GCN-O0-NEXT:    s_waitcnt expcnt(0)
-; GCN-O0-NEXT:    v_mov_b32_e32 v2, v1
+; GCN-O0-NEXT:    v_mov_b32_e32 v2, v0
 ; GCN-O0-NEXT:    v_mov_b32_e32 v3, v4
 ; GCN-O0-NEXT:    s_mov_b32 s4, 2
 ; GCN-O0-NEXT:    v_lshl_b64 v[3:4], v[2:3], s4
 ; GCN-O0-NEXT:    v_mov_b32_e32 v2, 0
 ; GCN-O0-NEXT:    buffer_store_dword v2, v[3:4], s[0:3], 0 addr64
 ; GCN-O0-NEXT:    s_mov_b32 s0, 1
-; GCN-O0-NEXT:    v_cmp_gt_u32_e64 s[2:3], v1, s0
+; GCN-O0-NEXT:    v_cmp_gt_u32_e64 s[2:3], v0, s0
 ; GCN-O0-NEXT:    s_mov_b64 s[0:1], exec
-; GCN-O0-NEXT:    v_writelane_b32 v0, s0, 2
-; GCN-O0-NEXT:    v_writelane_b32 v0, s1, 3
-; GCN-O0-NEXT:    s_or_saveexec_b64 s[6:7], -1
-; GCN-O0-NEXT:    buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
-; GCN-O0-NEXT:    s_mov_b64 exec, s[6:7]
+; GCN-O0-NEXT:    v_writelane_b32 v1, s0, 2
+; GCN-O0-NEXT:    v_writelane_b32 v1, s1, 3
 ; GCN-O0-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
 ; GCN-O0-NEXT:    s_mov_b64 exec, s[0:1]
 ; GCN-O0-NEXT:    s_cbranch_execz .LBB2_6
 ; GCN-O0-NEXT:  ; %bb.1: ; %bb.outer.then
-; GCN-O0-NEXT:    s_or_saveexec_b64 s[6:7], -1
-; GCN-O0-NEXT:    s_waitcnt expcnt(0)
 ; GCN-O0-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    s_mov_b64 exec, s[6:7]
-; GCN-O0-NEXT:    buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
 ; GCN-O0-NEXT:    s_mov_b32 s0, 2
 ; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
-; GCN-O0-NEXT:    v_cmp_ne_u32_e64 s[0:1], v1, s0
+; GCN-O0-NEXT:    v_cmp_ne_u32_e64 s[0:1], v0, s0
 ; GCN-O0-NEXT:    s_mov_b64 s[2:3], exec
 ; GCN-O0-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
 ; GCN-O0-NEXT:    s_xor_b64 s[2:3], s[0:1], s[2:3]
-; GCN-O0-NEXT:    v_writelane_b32 v0, s2, 4
-; GCN-O0-NEXT:    v_writelane_b32 v0, s3, 5
-; GCN-O0-NEXT:    s_or_saveexec_b64 s[6:7], -1
-; GCN-O0-NEXT:    buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
-; GCN-O0-NEXT:    s_mov_b64 exec, s[6:7]
+; GCN-O0-NEXT:    v_writelane_b32 v1, s2, 4
+; GCN-O0-NEXT:    v_writelane_b32 v1, s3, 5
 ; GCN-O0-NEXT:    s_mov_b64 exec, s[0:1]
 ; GCN-O0-NEXT:    s_cbranch_execz .LBB2_2
 ; GCN-O0-NEXT:    s_branch .LBB2_4
 ; GCN-O0-NEXT:  .LBB2_2: ; %Flow
-; GCN-O0-NEXT:    s_or_saveexec_b64 s[6:7], -1
-; GCN-O0-NEXT:    s_waitcnt expcnt(0)
-; GCN-O0-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    s_mov_b64 exec, s[6:7]
-; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
-; GCN-O0-NEXT:    v_readlane_b32 s0, v0, 4
-; GCN-O0-NEXT:    v_readlane_b32 s1, v0, 5
+; GCN-O0-NEXT:    v_readlane_b32 s0, v1, 4
+; GCN-O0-NEXT:    v_readlane_b32 s1, v1, 5
 ; GCN-O0-NEXT:    s_or_saveexec_b64 s[0:1], s[0:1]
 ; GCN-O0-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
-; GCN-O0-NEXT:    v_writelane_b32 v0, s0, 6
-; GCN-O0-NEXT:    v_writelane_b32 v0, s1, 7
-; GCN-O0-NEXT:    s_or_saveexec_b64 s[6:7], -1
-; GCN-O0-NEXT:    buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
-; GCN-O0-NEXT:    s_mov_b64 exec, s[6:7]
+; GCN-O0-NEXT:    v_writelane_b32 v1, s0, 6
+; GCN-O0-NEXT:    v_writelane_b32 v1, s1, 7
 ; GCN-O0-NEXT:    s_xor_b64 exec, exec, s[0:1]
 ; GCN-O0-NEXT:    s_cbranch_execz .LBB2_5
 ; GCN-O0-NEXT:  ; %bb.3: ; %bb.then
-; GCN-O0-NEXT:    buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GCN-O0-NEXT:    s_waitcnt expcnt(0)
-; GCN-O0-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    s_mov_b64 exec, s[6:7]
-; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
-; GCN-O0-NEXT:    v_readlane_b32 s0, v0, 0
-; GCN-O0-NEXT:    v_readlane_b32 s1, v0, 1
+; GCN-O0-NEXT:    buffer_load_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-O0-NEXT:    v_readlane_b32 s0, v1, 0
+; GCN-O0-NEXT:    v_readlane_b32 s1, v1, 1
 ; GCN-O0-NEXT:    v_mov_b32_e32 v0, 1
-; GCN-O0-NEXT:    v_add_i32_e64 v1, s[2:3], v1, v0
-; GCN-O0-NEXT:    v_ashrrev_i32_e64 v3, 31, v1
-; GCN-O0-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GCN-O0-NEXT:    v_mov_b32_e32 v2, v3
+; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
+; GCN-O0-NEXT:    v_add_i32_e64 v2, s[2:3], v2, v0
+; GCN-O0-NEXT:    v_ashrrev_i32_e64 v4, 31, v2
+; GCN-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GCN-O0-NEXT:    v_mov_b32_e32 v3, v4
 ; GCN-O0-NEXT:    s_mov_b32 s2, 2
-; GCN-O0-NEXT:    v_lshl_b64 v[1:2], v[1:2], s2
+; GCN-O0-NEXT:    v_lshl_b64 v[2:3], v[2:3], s2
 ; GCN-O0-NEXT:    s_mov_b32 s2, 0xf000
 ; GCN-O0-NEXT:    s_mov_b32 s4, 0
 ; GCN-O0-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
 ; GCN-O0-NEXT:    s_mov_b32 s5, s2
 ; GCN-O0-NEXT:    ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3
 ; GCN-O0-NEXT:    s_mov_b64 s[2:3], s[4:5]
-; GCN-O0-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; GCN-O0-NEXT:    buffer_store_dword v0, v[2:3], s[0:3], 0 addr64
 ; GCN-O0-NEXT:    s_branch .LBB2_5
 ; GCN-O0-NEXT:  .LBB2_4: ; %bb.else
-; GCN-O0-NEXT:    buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GCN-O0-NEXT:    s_waitcnt expcnt(0)
-; GCN-O0-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    s_mov_b64 exec, s[6:7]
-; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
-; GCN-O0-NEXT:    v_readlane_b32 s0, v0, 0
-; GCN-O0-NEXT:    v_readlane_b32 s1, v0, 1
+; GCN-O0-NEXT:    buffer_load_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-O0-NEXT:    v_readlane_b32 s0, v1, 0
+; GCN-O0-NEXT:    v_readlane_b32 s1, v1, 1
 ; GCN-O0-NEXT:    v_mov_b32_e32 v0, 2
-; GCN-O0-NEXT:    v_add_i32_e64 v1, s[2:3], v1, v0
-; GCN-O0-NEXT:    v_ashrrev_i32_e64 v3, 31, v1
-; GCN-O0-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GCN-O0-NEXT:    v_mov_b32_e32 v2, v3
-; GCN-O0-NEXT:    v_lshl_b64 v[1:2], v[1:2], v0
+; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
+; GCN-O0-NEXT:    v_add_i32_e64 v2, s[2:3], v2, v0
+; GCN-O0-NEXT:    v_ashrrev_i32_e64 v4, 31, v2
+; GCN-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GCN-O0-NEXT:    v_mov_b32_e32 v3, v4
+; GCN-O0-NEXT:    v_lshl_b64 v[2:3], v[2:3], v0
 ; GCN-O0-NEXT:    s_mov_b32 s2, 0xf000
 ; GCN-O0-NEXT:    s_mov_b32 s4, 0
 ; GCN-O0-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
 ; GCN-O0-NEXT:    s_mov_b32 s5, s2
 ; GCN-O0-NEXT:    ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3
 ; GCN-O0-NEXT:    s_mov_b64 s[2:3], s[4:5]
-; GCN-O0-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; GCN-O0-NEXT:    buffer_store_dword v0, v[2:3], s[0:3], 0 addr64
 ; GCN-O0-NEXT:    s_branch .LBB2_2
 ; GCN-O0-NEXT:  .LBB2_5: ; %Flow1
-; GCN-O0-NEXT:    s_or_saveexec_b64 s[6:7], -1
-; GCN-O0-NEXT:    s_waitcnt expcnt(0)
-; GCN-O0-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    s_mov_b64 exec, s[6:7]
-; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
-; GCN-O0-NEXT:    v_readlane_b32 s0, v0, 6
-; GCN-O0-NEXT:    v_readlane_b32 s1, v0, 7
+; GCN-O0-NEXT:    v_readlane_b32 s0, v1, 6
+; GCN-O0-NEXT:    v_readlane_b32 s1, v1, 7
 ; GCN-O0-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GCN-O0-NEXT:  .LBB2_6: ; %bb.outer.end
-; GCN-O0-NEXT:    s_or_saveexec_b64 s[6:7], -1
-; GCN-O0-NEXT:    s_waitcnt expcnt(0)
-; GCN-O0-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    s_mov_b64 exec, s[6:7]
-; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
-; GCN-O0-NEXT:    v_readlane_b32 s0, v0, 2
-; GCN-O0-NEXT:    v_readlane_b32 s1, v0, 3
+; GCN-O0-NEXT:    v_readlane_b32 s0, v1, 2
+; GCN-O0-NEXT:    v_readlane_b32 s1, v1, 3
 ; GCN-O0-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GCN-O0-NEXT:    s_waitcnt expcnt(0)
 ; GCN-O0-NEXT:    v_mov_b32_e32 v2, 3
-; GCN-O0-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-O0-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-O0-NEXT:    s_mov_b32 m0, -1
-; GCN-O0-NEXT:    ds_write_b32 v1, v2
-; GCN-O0-NEXT:    ; kill: killed $vgpr0
+; GCN-O0-NEXT:    ds_write_b32 v0, v2
 ; GCN-O0-NEXT:    s_endpgm
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -676,19 +579,14 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) {
 ; GCN-O0-NEXT:    s_mov_b32 s15, 0xe8f000
 ; GCN-O0-NEXT:    s_add_u32 s12, s12, s11
 ; GCN-O0-NEXT:    s_addc_u32 s13, s13, 0
-; GCN-O0-NEXT:    ; implicit-def: $vgpr1
-; GCN-O0-NEXT:    v_mov_b32_e32 v1, v0
-; GCN-O0-NEXT:    s_or_saveexec_b64 s[8:9], -1
-; GCN-O0-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    s_mov_b64 exec, s[8:9]
 ; GCN-O0-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
-; GCN-O0-NEXT:    v_mov_b32_e32 v2, v1
-; GCN-O0-NEXT:    buffer_store_dword v2, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill
+; GCN-O0-NEXT:    v_mov_b32_e32 v2, v0
+; GCN-O0-NEXT:    buffer_store_dword v2, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
 ; GCN-O0-NEXT:    s_mov_b32 s0, 0
 ; GCN-O0-NEXT:    ; implicit-def: $sgpr0
 ; GCN-O0-NEXT:    v_mov_b32_e32 v4, 0
 ; GCN-O0-NEXT:    s_waitcnt expcnt(0)
-; GCN-O0-NEXT:    v_mov_b32_e32 v2, v1
+; GCN-O0-NEXT:    v_mov_b32_e32 v2, v0
 ; GCN-O0-NEXT:    v_mov_b32_e32 v3, v4
 ; GCN-O0-NEXT:    s_mov_b32 s0, 2
 ; GCN-O0-NEXT:    s_mov_b32 s1, s0
@@ -703,9 +601,9 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) {
 ; GCN-O0-NEXT:    v_addc_u32_e64 v2, s[2:3], v2, v6, s[2:3]
 ; GCN-O0-NEXT:    ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
 ; GCN-O0-NEXT:    v_mov_b32_e32 v6, v2
-; GCN-O0-NEXT:    buffer_store_dword v5, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
+; GCN-O0-NEXT:    buffer_store_dword v5, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
 ; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
-; GCN-O0-NEXT:    buffer_store_dword v6, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
+; GCN-O0-NEXT:    buffer_store_dword v6, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
 ; GCN-O0-NEXT:    s_mov_b32 s1, 0xf000
 ; GCN-O0-NEXT:    s_mov_b32 s2, 0
 ; GCN-O0-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
@@ -714,43 +612,29 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) {
 ; GCN-O0-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GCN-O0-NEXT:    v_mov_b32_e32 v2, 0
 ; GCN-O0-NEXT:    buffer_store_dword v2, v[3:4], s[4:7], 0 addr64
-; GCN-O0-NEXT:    v_cmp_lt_u32_e64 s[0:1], v1, s0
+; GCN-O0-NEXT:    v_cmp_lt_u32_e64 s[0:1], v0, s0
 ; GCN-O0-NEXT:    s_mov_b64 s[2:3], exec
 ; GCN-O0-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
 ; GCN-O0-NEXT:    s_xor_b64 s[2:3], s[0:1], s[2:3]
-; GCN-O0-NEXT:    v_writelane_b32 v0, s2, 0
-; GCN-O0-NEXT:    v_writelane_b32 v0, s3, 1
-; GCN-O0-NEXT:    s_or_saveexec_b64 s[8:9], -1
-; GCN-O0-NEXT:    buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
-; GCN-O0-NEXT:    s_mov_b64 exec, s[8:9]
+; GCN-O0-NEXT:    v_writelane_b32 v1, s2, 0
+; GCN-O0-NEXT:    v_writelane_b32 v1, s3, 1
 ; GCN-O0-NEXT:    s_mov_b64 exec, s[0:1]
 ; GCN-O0-NEXT:    s_cbranch_execz .LBB3_1
 ; GCN-O0-NEXT:    s_branch .LBB3_4
 ; GCN-O0-NEXT:  .LBB3_1: ; %Flow2
-; GCN-O0-NEXT:    s_or_saveexec_b64 s[8:9], -1
-; GCN-O0-NEXT:    s_waitcnt expcnt(0)
-; GCN-O0-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    s_mov_b64 exec, s[8:9]
-; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
-; GCN-O0-NEXT:    v_readlane_b32 s0, v0, 0
-; GCN-O0-NEXT:    v_readlane_b32 s1, v0, 1
+; GCN-O0-NEXT:    v_readlane_b32 s0, v1, 0
+; GCN-O0-NEXT:    v_readlane_b32 s1, v1, 1
 ; GCN-O0-NEXT:    s_or_saveexec_b64 s[0:1], s[0:1]
 ; GCN-O0-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
-; GCN-O0-NEXT:    v_writelane_b32 v0, s0, 2
-; GCN-O0-NEXT:    v_writelane_b32 v0, s1, 3
-; GCN-O0-NEXT:    s_or_saveexec_b64 s[8:9], -1
-; GCN-O0-NEXT:    buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
-; GCN-O0-NEXT:    s_mov_b64 exec, s[8:9]
+; GCN-O0-NEXT:    v_writelane_b32 v1, s0, 2
+; GCN-O0-NEXT:    v_writelane_b32 v1, s1, 3
 ; GCN-O0-NEXT:    s_xor_b64 exec, exec, s[0:1]
 ; GCN-O0-NEXT:    s_cbranch_execz .LBB3_8
 ; GCN-O0-NEXT:  ; %bb.2: ; %bb.outer.then
-; GCN-O0-NEXT:    s_or_saveexec_b64 s[8:9], -1
 ; GCN-O0-NEXT:    s_waitcnt expcnt(0)
-; GCN-O0-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    s_mov_b64 exec, s[8:9]
-; GCN-O0-NEXT:    buffer_load_dword v1, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    buffer_load_dword v3, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    buffer_load_dword v4, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
+; GCN-O0-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
+; GCN-O0-NEXT:    buffer_load_dword v3, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-O0-NEXT:    buffer_load_dword v4, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
 ; GCN-O0-NEXT:    s_mov_b32 s0, 0xf000
 ; GCN-O0-NEXT:    s_mov_b32 s2, 0
 ; GCN-O0-NEXT:    s_mov_b32 s4, s2
@@ -763,20 +647,17 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) {
 ; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-O0-NEXT:    buffer_store_dword v2, v[3:4], s[0:3], 0 addr64 offset:4
 ; GCN-O0-NEXT:    s_mov_b32 s0, 2
-; GCN-O0-NEXT:    v_cmp_eq_u32_e64 s[2:3], v1, s0
+; GCN-O0-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s0
 ; GCN-O0-NEXT:    s_mov_b64 s[0:1], exec
-; GCN-O0-NEXT:    v_writelane_b32 v0, s0, 4
-; GCN-O0-NEXT:    v_writelane_b32 v0, s1, 5
-; GCN-O0-NEXT:    s_or_saveexec_b64 s[8:9], -1
-; GCN-O0-NEXT:    buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
-; GCN-O0-NEXT:    s_mov_b64 exec, s[8:9]
+; GCN-O0-NEXT:    v_writelane_b32 v1, s0, 4
+; GCN-O0-NEXT:    v_writelane_b32 v1, s1, 5
 ; GCN-O0-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
 ; GCN-O0-NEXT:    s_mov_b64 exec, s[0:1]
 ; GCN-O0-NEXT:    s_cbranch_execz .LBB3_7
 ; GCN-O0-NEXT:  ; %bb.3: ; %bb.inner.then
-; GCN-O0-NEXT:    s_waitcnt expcnt(1)
-; GCN-O0-NEXT:    buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    buffer_load_dword v2, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
+; GCN-O0-NEXT:    s_waitcnt expcnt(0)
+; GCN-O0-NEXT:    buffer_load_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-O0-NEXT:    buffer_load_dword v3, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
 ; GCN-O0-NEXT:    s_mov_b32 s0, 0xf000
 ; GCN-O0-NEXT:    s_mov_b32 s2, 0
 ; GCN-O0-NEXT:    s_mov_b32 s4, s2
@@ -785,19 +666,14 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) {
 ; GCN-O0-NEXT:    s_mov_b32 s1, s2
 ; GCN-O0-NEXT:    ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3
 ; GCN-O0-NEXT:    s_mov_b64 s[2:3], s[4:5]
-; GCN-O0-NEXT:    s_waitcnt expcnt(0)
 ; GCN-O0-NEXT:    v_mov_b32_e32 v0, 2
 ; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
-; GCN-O0-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 offset:8
+; GCN-O0-NEXT:    buffer_store_dword v0, v[2:3], s[0:3], 0 addr64 offset:8
 ; GCN-O0-NEXT:    s_branch .LBB3_7
 ; GCN-O0-NEXT:  .LBB3_4: ; %bb.outer.else
-; GCN-O0-NEXT:    s_or_saveexec_b64 s[8:9], -1
-; GCN-O0-NEXT:    s_waitcnt expcnt(0)
-; GCN-O0-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    s_mov_b64 exec, s[8:9]
-; GCN-O0-NEXT:    buffer_load_dword v1, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    buffer_load_dword v3, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    buffer_load_dword v4, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
+; GCN-O0-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
+; GCN-O0-NEXT:    buffer_load_dword v3, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-O0-NEXT:    buffer_load_dword v4, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
 ; GCN-O0-NEXT:    s_mov_b32 s1, 0xf000
 ; GCN-O0-NEXT:    s_mov_b32 s0, 0
 ; GCN-O0-NEXT:    s_mov_b32 s2, s0
@@ -806,23 +682,21 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) {
 ; GCN-O0-NEXT:    s_mov_b32 s5, s0
 ; GCN-O0-NEXT:    ; kill: def $sgpr4_sgpr5 killed $sgpr4_sgpr5 def $sgpr4_sgpr5_sgpr6_sgpr7
 ; GCN-O0-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GCN-O0-NEXT:    s_waitcnt expcnt(0)
 ; GCN-O0-NEXT:    v_mov_b32_e32 v2, 3
 ; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-O0-NEXT:    buffer_store_dword v2, v[3:4], s[4:7], 0 addr64 offset:12
-; GCN-O0-NEXT:    v_cmp_eq_u32_e64 s[2:3], v1, s0
+; GCN-O0-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s0
 ; GCN-O0-NEXT:    s_mov_b64 s[0:1], exec
-; GCN-O0-NEXT:    v_writelane_b32 v0, s0, 6
-; GCN-O0-NEXT:    v_writelane_b32 v0, s1, 7
-; GCN-O0-NEXT:    s_or_saveexec_b64 s[8:9], -1
-; GCN-O0-NEXT:    buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
-; GCN-O0-NEXT:    s_mov_b64 exec, s[8:9]
+; GCN-O0-NEXT:    v_writelane_b32 v1, s0, 6
+; GCN-O0-NEXT:    v_writelane_b32 v1, s1, 7
 ; GCN-O0-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
 ; GCN-O0-NEXT:    s_mov_b64 exec, s[0:1]
 ; GCN-O0-NEXT:    s_cbranch_execz .LBB3_6
 ; GCN-O0-NEXT:  ; %bb.5: ; %bb.inner.then2
-; GCN-O0-NEXT:    s_waitcnt expcnt(1)
-; GCN-O0-NEXT:    buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    buffer_load_dword v2, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
+; GCN-O0-NEXT:    s_waitcnt expcnt(0)
+; GCN-O0-NEXT:    buffer_load_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-O0-NEXT:    buffer_load_dword v3, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
 ; GCN-O0-NEXT:    s_mov_b32 s0, 0xf000
 ; GCN-O0-NEXT:    s_mov_b32 s2, 0
 ; GCN-O0-NEXT:    s_mov_b32 s4, s2
@@ -831,43 +705,27 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) {
 ; GCN-O0-NEXT:    s_mov_b32 s1, s2
 ; GCN-O0-NEXT:    ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3
 ; GCN-O0-NEXT:    s_mov_b64 s[2:3], s[4:5]
-; GCN-O0-NEXT:    s_waitcnt expcnt(0)
 ; GCN-O0-NEXT:    v_mov_b32_e32 v0, 4
 ; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
-; GCN-O0-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 offset:16
+; GCN-O0-NEXT:    buffer_store_dword v0, v[2:3], s[0:3], 0 addr64 offset:16
 ; GCN-O0-NEXT:  .LBB3_6: ; %Flow
-; GCN-O0-NEXT:    s_or_saveexec_b64 s[8:9], -1
-; GCN-O0-NEXT:    s_waitcnt expcnt(0)
-; GCN-O0-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    s_mov_b64 exec, s[8:9]
-; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
-; GCN-O0-NEXT:    v_readlane_b32 s0, v0, 6
-; GCN-O0-NEXT:    v_readlane_b32 s1, v0, 7
+; GCN-O0-NEXT:    v_readlane_b32 s0, v1, 6
+; GCN-O0-NEXT:    v_readlane_b32 s1, v1, 7
 ; GCN-O0-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GCN-O0-NEXT:    s_branch .LBB3_1
 ; GCN-O0-NEXT:  .LBB3_7: ; %Flow1
-; GCN-O0-NEXT:    s_or_saveexec_b64 s[8:9], -1
-; GCN-O0-NEXT:    s_waitcnt expcnt(0)
-; GCN-O0-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    s_mov_b64 exec, s[8:9]
-; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
-; GCN-O0-NEXT:    v_readlane_b32 s0, v0, 4
-; GCN-O0-NEXT:    v_readlane_b32 s1, v0, 5
+; GCN-O0-NEXT:    v_readlane_b32 s0, v1, 4
+; GCN-O0-NEXT:    v_readlane_b32 s1, v1, 5
 ; GCN-O0-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GCN-O0-NEXT:  .LBB3_8: ; %bb.outer.end
-; GCN-O0-NEXT:    s_or_saveexec_b64 s[8:9], -1
-; GCN-O0-NEXT:    s_waitcnt expcnt(0)
-; GCN-O0-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    s_mov_b64 exec, s[8:9]
-; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
-; GCN-O0-NEXT:    v_readlane_b32 s0, v0, 2
-; GCN-O0-NEXT:    v_readlane_b32 s1, v0, 3
+; GCN-O0-NEXT:    v_readlane_b32 s0, v1, 2
+; GCN-O0-NEXT:    v_readlane_b32 s1, v1, 3
 ; GCN-O0-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GCN-O0-NEXT:    s_waitcnt expcnt(0)
 ; GCN-O0-NEXT:    v_mov_b32_e32 v2, 3
-; GCN-O0-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-O0-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-O0-NEXT:    s_mov_b32 m0, -1
-; GCN-O0-NEXT:    ds_write_b32 v1, v2
-; GCN-O0-NEXT:    ; kill: killed $vgpr0
+; GCN-O0-NEXT:    ds_write_b32 v0, v2
 ; GCN-O0-NEXT:    s_endpgm
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -931,35 +789,23 @@ define amdgpu_kernel void @s_endpgm_unsafe_barrier(ptr addrspace(1) nocapture %a
 ; GCN-O0-NEXT:    s_mov_b32 s15, 0xe8f000
 ; GCN-O0-NEXT:    s_add_u32 s12, s12, s11
 ; GCN-O0-NEXT:    s_addc_u32 s13, s13, 0
-; GCN-O0-NEXT:    ; implicit-def: $vgpr1
-; GCN-O0-NEXT:    v_mov_b32_e32 v1, v0
-; GCN-O0-NEXT:    s_or_saveexec_b64 s[6:7], -1
-; GCN-O0-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    s_mov_b64 exec, s[6:7]
 ; GCN-O0-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; GCN-O0-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN-O0-NEXT:    v_writelane_b32 v0, s0, 0
-; GCN-O0-NEXT:    v_writelane_b32 v0, s1, 1
-; GCN-O0-NEXT:    v_mov_b32_e32 v2, v1
-; GCN-O0-NEXT:    buffer_store_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
+; GCN-O0-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-O0-NEXT:    v_writelane_b32 v1, s0, 0
+; GCN-O0-NEXT:    v_writelane_b32 v1, s1, 1
+; GCN-O0-NEXT:    v_mov_b32_e32 v2, v0
+; GCN-O0-NEXT:    buffer_store_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
 ; GCN-O0-NEXT:    s_mov_b32 s0, 1
-; GCN-O0-NEXT:    v_cmp_gt_u32_e64 s[2:3], v1, s0
+; GCN-O0-NEXT:    v_cmp_gt_u32_e64 s[2:3], v0, s0
 ; GCN-O0-NEXT:    s_mov_b64 s[0:1], exec
-; GCN-O0-NEXT:    v_writelane_b32 v0, s0, 2
-; GCN-O0-NEXT:    v_writelane_b32 v0, s1, 3
-; GCN-O0-NEXT:    s_or_saveexec_b64 s[6:7], -1
-; GCN-O0-NEXT:    buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
-; GCN-O0-NEXT:    s_mov_b64 exec, s[6:7]
+; GCN-O0-NEXT:    v_writelane_b32 v1, s0, 2
+; GCN-O0-NEXT:    v_writelane_b32 v1, s1, 3
 ; GCN-O0-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
 ; GCN-O0-NEXT:    s_mov_b64 exec, s[0:1]
 ; GCN-O0-NEXT:    s_cbranch_execz .LBB4_2
 ; GCN-O0-NEXT:  ; %bb.1: ; %bb.then
 ; GCN-O0-NEXT:    s_waitcnt expcnt(0)
-; GCN-O0-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    s_or_saveexec_b64 s[6:7], -1
-; GCN-O0-NEXT:    buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    s_mov_b64 exec, s[6:7]
-; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
+; GCN-O0-NEXT:    buffer_load_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
 ; GCN-O0-NEXT:    v_readlane_b32 s0, v1, 0
 ; GCN-O0-NEXT:    v_readlane_b32 s1, v1, 1
 ; GCN-O0-NEXT:    s_mov_b32 s2, 0xf000
@@ -968,24 +814,20 @@ define amdgpu_kernel void @s_endpgm_unsafe_barrier(ptr addrspace(1) nocapture %a
 ; GCN-O0-NEXT:    s_mov_b32 s5, s2
 ; GCN-O0-NEXT:    ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3
 ; GCN-O0-NEXT:    s_mov_b64 s[2:3], s[4:5]
-; GCN-O0-NEXT:    v_ashrrev_i32_e64 v2, 31, v0
-; GCN-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
-; GCN-O0-NEXT:    v_mov_b32_e32 v1, v2
+; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
+; GCN-O0-NEXT:    v_ashrrev_i32_e64 v0, 31, v2
+; GCN-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GCN-O0-NEXT:    v_mov_b32_e32 v3, v0
 ; GCN-O0-NEXT:    s_mov_b32 s4, 2
-; GCN-O0-NEXT:    v_lshl_b64 v[1:2], v[0:1], s4
+; GCN-O0-NEXT:    v_lshl_b64 v[2:3], v[2:3], s4
 ; GCN-O0-NEXT:    v_mov_b32_e32 v0, 0
-; GCN-O0-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; GCN-O0-NEXT:    buffer_store_dword v0, v[2:3], s[0:3], 0 addr64
 ; GCN-O0-NEXT:  .LBB4_2: ; %bb.end
-; GCN-O0-NEXT:    s_or_saveexec_b64 s[6:7], -1
-; GCN-O0-NEXT:    s_waitcnt expcnt(0)
-; GCN-O0-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    s_mov_b64 exec, s[6:7]
-; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
-; GCN-O0-NEXT:    v_readlane_b32 s0, v0, 2
-; GCN-O0-NEXT:    v_readlane_b32 s1, v0, 3
+; GCN-O0-NEXT:    v_readlane_b32 s0, v1, 2
+; GCN-O0-NEXT:    v_readlane_b32 s1, v1, 3
 ; GCN-O0-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GCN-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GCN-O0-NEXT:    s_barrier
-; GCN-O0-NEXT:    ; kill: killed $vgpr0
 ; GCN-O0-NEXT:    s_endpgm
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -1077,127 +919,96 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
 ; GCN-O0:       ; %bb.0: ; %bb
 ; GCN-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-O0-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GCN-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; GCN-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GCN-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
 ; GCN-O0-NEXT:    s_mov_b64 exec, s[4:5]
-; GCN-O0-NEXT:    ; implicit-def: $vgpr1
-; GCN-O0-NEXT:    v_mov_b32_e32 v1, v0
-; GCN-O0-NEXT:    s_or_saveexec_b64 s[14:15], -1
-; GCN-O0-NEXT:    s_waitcnt expcnt(1)
-; GCN-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    s_mov_b64 exec, s[14:15]
-; GCN-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GCN-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GCN-O0-NEXT:    s_mov_b64 s[4:5], 0
 ; GCN-O0-NEXT:    s_mov_b64 s[6:7], s[4:5]
-; GCN-O0-NEXT:    s_waitcnt vmcnt(1)
-; GCN-O0-NEXT:    v_writelane_b32 v0, s6, 0
-; GCN-O0-NEXT:    v_writelane_b32 v0, s7, 1
-; GCN-O0-NEXT:    v_writelane_b32 v0, s4, 2
-; GCN-O0-NEXT:    v_writelane_b32 v0, s5, 3
-; GCN-O0-NEXT:    s_or_saveexec_b64 s[14:15], -1
-; GCN-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
-; GCN-O0-NEXT:    s_mov_b64 exec, s[14:15]
+; GCN-O0-NEXT:    s_waitcnt expcnt(1)
+; GCN-O0-NEXT:    v_writelane_b32 v1, s6, 0
+; GCN-O0-NEXT:    v_writelane_b32 v1, s7, 1
+; GCN-O0-NEXT:    v_writelane_b32 v1, s4, 2
+; GCN-O0-NEXT:    v_writelane_b32 v1, s5, 3
 ; GCN-O0-NEXT:  .LBB5_1: ; %bb1
 ; GCN-O0-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-O0-NEXT:    s_or_saveexec_b64 s[14:15], -1
 ; GCN-O0-NEXT:    s_waitcnt expcnt(0)
 ; GCN-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    s_mov_b64 exec, s[14:15]
-; GCN-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    s_waitcnt vmcnt(1)
-; GCN-O0-NEXT:    v_readlane_b32 s8, v0, 2
-; GCN-O0-NEXT:    v_readlane_b32 s9, v0, 3
-; GCN-O0-NEXT:    v_readlane_b32 s6, v0, 0
-; GCN-O0-NEXT:    v_readlane_b32 s7, v0, 1
-; GCN-O0-NEXT:    v_writelane_b32 v0, s6, 4
-; GCN-O0-NEXT:    v_writelane_b32 v0, s7, 5
+; GCN-O0-NEXT:    v_readlane_b32 s8, v1, 2
+; GCN-O0-NEXT:    v_readlane_b32 s9, v1, 3
+; GCN-O0-NEXT:    v_readlane_b32 s6, v1, 0
+; GCN-O0-NEXT:    v_readlane_b32 s7, v1, 1
+; GCN-O0-NEXT:    v_writelane_b32 v1, s6, 4
+; GCN-O0-NEXT:    v_writelane_b32 v1, s7, 5
 ; GCN-O0-NEXT:    s_mov_b32 s4, 0x207
 ; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
-; GCN-O0-NEXT:    v_cmp_lt_i32_e64 s[4:5], v1, s4
+; GCN-O0-NEXT:    v_cmp_lt_i32_e64 s[4:5], v0, s4
 ; GCN-O0-NEXT:    s_or_b64 s[4:5], s[4:5], s[8:9]
-; GCN-O0-NEXT:    v_writelane_b32 v0, s4, 6
-; GCN-O0-NEXT:    v_writelane_b32 v0, s5, 7
-; GCN-O0-NEXT:    v_writelane_b32 v0, s6, 0
-; GCN-O0-NEXT:    v_writelane_b32 v0, s7, 1
+; GCN-O0-NEXT:    v_writelane_b32 v1, s4, 6
+; GCN-O0-NEXT:    v_writelane_b32 v1, s5, 7
+; GCN-O0-NEXT:    v_writelane_b32 v1, s6, 0
+; GCN-O0-NEXT:    v_writelane_b32 v1, s7, 1
 ; GCN-O0-NEXT:    s_mov_b64 s[6:7], s[4:5]
-; GCN-O0-NEXT:    v_writelane_b32 v0, s6, 2
-; GCN-O0-NEXT:    v_writelane_b32 v0, s7, 3
-; GCN-O0-NEXT:    s_or_saveexec_b64 s[14:15], -1
-; GCN-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
-; GCN-O0-NEXT:    s_mov_b64 exec, s[14:15]
+; GCN-O0-NEXT:    v_writelane_b32 v1, s6, 2
+; GCN-O0-NEXT:    v_writelane_b32 v1, s7, 3
 ; GCN-O0-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN-O0-NEXT:    s_cbranch_execnz .LBB5_1
 ; GCN-O0-NEXT:  ; %bb.2: ; %bb2
 ; GCN-O0-NEXT:    ; in Loop: Header=BB5_1 Depth=1
-; GCN-O0-NEXT:    s_or_saveexec_b64 s[14:15], -1
-; GCN-O0-NEXT:    s_waitcnt expcnt(0)
 ; GCN-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    s_mov_b64 exec, s[14:15]
-; GCN-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    s_waitcnt vmcnt(1)
-; GCN-O0-NEXT:    v_readlane_b32 s4, v0, 6
-; GCN-O0-NEXT:    v_readlane_b32 s5, v0, 7
+; GCN-O0-NEXT:    v_readlane_b32 s4, v1, 6
+; GCN-O0-NEXT:    v_readlane_b32 s5, v1, 7
 ; GCN-O0-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN-O0-NEXT:    s_mov_b32 s6, 0
 ; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
-; GCN-O0-NEXT:    v_cmp_ne_u32_e64 s[4:5], v1, s6
-; GCN-O0-NEXT:    v_cmp_eq_u32_e64 s[6:7], v1, s6
-; GCN-O0-NEXT:    v_writelane_b32 v0, s4, 8
-; GCN-O0-NEXT:    v_writelane_b32 v0, s5, 9
+; GCN-O0-NEXT:    v_cmp_ne_u32_e64 s[4:5], v0, s6
+; GCN-O0-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GCN-O0-NEXT:    v_writelane_b32 v1, s4, 8
+; GCN-O0-NEXT:    v_writelane_b32 v1, s5, 9
 ; GCN-O0-NEXT:    s_mov_b32 s4, 0
 ; GCN-O0-NEXT:    s_mov_b32 s8, s4
 ; GCN-O0-NEXT:    s_mov_b32 s9, s4
 ; GCN-O0-NEXT:    s_mov_b32 s10, s4
 ; GCN-O0-NEXT:    s_mov_b32 s11, s4
-; GCN-O0-NEXT:    v_mov_b32_e32 v1, s8
-; GCN-O0-NEXT:    v_mov_b32_e32 v2, s9
-; GCN-O0-NEXT:    v_mov_b32_e32 v3, s10
-; GCN-O0-NEXT:    v_mov_b32_e32 v4, s11
-; GCN-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GCN-O0-NEXT:    v_mov_b32_e32 v2, s8
+; GCN-O0-NEXT:    v_mov_b32_e32 v3, s9
+; GCN-O0-NEXT:    v_mov_b32_e32 v4, s10
+; GCN-O0-NEXT:    v_mov_b32_e32 v5, s11
+; GCN-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
-; GCN-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GCN-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GCN-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GCN-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GCN-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GCN-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; GCN-O0-NEXT:    s_mov_b64 s[4:5], exec
-; GCN-O0-NEXT:    v_writelane_b32 v0, s4, 10
-; GCN-O0-NEXT:    v_writelane_b32 v0, s5, 11
-; GCN-O0-NEXT:    s_or_saveexec_b64 s[14:15], -1
-; GCN-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
-; GCN-O0-NEXT:    s_mov_b64 exec, s[14:15]
+; GCN-O0-NEXT:    v_writelane_b32 v1, s4, 10
+; GCN-O0-NEXT:    v_writelane_b32 v1, s5, 11
 ; GCN-O0-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
 ; GCN-O0-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-O0-NEXT:    s_cbranch_execz .LBB5_5
 ; GCN-O0-NEXT:  ; %bb.3: ; %bb4
 ; GCN-O0-NEXT:    ; in Loop: Header=BB5_1 Depth=1
-; GCN-O0-NEXT:    s_or_saveexec_b64 s[14:15], -1
-; GCN-O0-NEXT:    s_waitcnt expcnt(0)
-; GCN-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    s_mov_b64 exec, s[14:15]
 ; GCN-O0-NEXT:    ; implicit-def: $sgpr4
-; GCN-O0-NEXT:    v_mov_b32_e32 v1, s4
-; GCN-O0-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen
+; GCN-O0-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-O0-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
 ; GCN-O0-NEXT:    s_mov_b32 s4, 0
 ; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
-; GCN-O0-NEXT:    v_cmp_lt_f32_e64 s[6:7], v1, s4
+; GCN-O0-NEXT:    v_cmp_lt_f32_e64 s[6:7], v0, s4
 ; GCN-O0-NEXT:    s_mov_b32 s8, s4
 ; GCN-O0-NEXT:    s_mov_b32 s9, s4
 ; GCN-O0-NEXT:    s_mov_b32 s10, s4
 ; GCN-O0-NEXT:    s_mov_b32 s11, s4
-; GCN-O0-NEXT:    v_mov_b32_e32 v1, s8
-; GCN-O0-NEXT:    v_mov_b32_e32 v2, s9
-; GCN-O0-NEXT:    v_mov_b32_e32 v3, s10
-; GCN-O0-NEXT:    v_mov_b32_e32 v4, s11
-; GCN-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GCN-O0-NEXT:    s_waitcnt expcnt(0)
+; GCN-O0-NEXT:    v_mov_b32_e32 v2, s8
+; GCN-O0-NEXT:    v_mov_b32_e32 v3, s9
+; GCN-O0-NEXT:    v_mov_b32_e32 v4, s10
+; GCN-O0-NEXT:    v_mov_b32_e32 v5, s11
+; GCN-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
-; GCN-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GCN-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GCN-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GCN-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GCN-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GCN-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
 ; GCN-O0-NEXT:    s_mov_b64 s[4:5], exec
-; GCN-O0-NEXT:    v_writelane_b32 v0, s4, 12
-; GCN-O0-NEXT:    v_writelane_b32 v0, s5, 13
-; GCN-O0-NEXT:    s_or_saveexec_b64 s[14:15], -1
-; GCN-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
-; GCN-O0-NEXT:    s_mov_b64 exec, s[14:15]
+; GCN-O0-NEXT:    v_writelane_b32 v1, s4, 12
+; GCN-O0-NEXT:    v_writelane_b32 v1, s5, 13
 ; GCN-O0-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
 ; GCN-O0-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-O0-NEXT:    s_cbranch_execz .LBB5_6
@@ -1215,173 +1026,135 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
 ; GCN-O0-NEXT:    s_mov_b32 s6, s9
 ; GCN-O0-NEXT:    s_mov_b32 s7, s8
 ; GCN-O0-NEXT:    s_waitcnt expcnt(0)
-; GCN-O0-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-O0-NEXT:    v_mov_b32_e32 v1, s5
-; GCN-O0-NEXT:    v_mov_b32_e32 v2, s6
-; GCN-O0-NEXT:    v_mov_b32_e32 v3, s7
-; GCN-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GCN-O0-NEXT:    v_mov_b32_e32 v2, s4
+; GCN-O0-NEXT:    v_mov_b32_e32 v3, s5
+; GCN-O0-NEXT:    v_mov_b32_e32 v4, s6
+; GCN-O0-NEXT:    v_mov_b32_e32 v5, s7
+; GCN-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
-; GCN-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GCN-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GCN-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GCN-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GCN-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GCN-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
 ; GCN-O0-NEXT:    s_branch .LBB5_6
 ; GCN-O0-NEXT:  .LBB5_5: ; %Flow2
 ; GCN-O0-NEXT:    ; in Loop: Header=BB5_1 Depth=1
 ; GCN-O0-NEXT:    s_waitcnt expcnt(0)
-; GCN-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    s_or_saveexec_b64 s[14:15], -1
-; GCN-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    s_mov_b64 exec, s[14:15]
-; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
-; GCN-O0-NEXT:    v_readlane_b32 s4, v4, 10
-; GCN-O0-NEXT:    v_readlane_b32 s5, v4, 11
+; GCN-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GCN-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GCN-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GCN-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GCN-O0-NEXT:    v_readlane_b32 s4, v1, 10
+; GCN-O0-NEXT:    v_readlane_b32 s5, v1, 11
 ; GCN-O0-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GCN-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
 ; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
-; GCN-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; GCN-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; GCN-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GCN-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
+; GCN-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GCN-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GCN-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
 ; GCN-O0-NEXT:    s_branch .LBB5_7
 ; GCN-O0-NEXT:  .LBB5_6: ; %Flow
 ; GCN-O0-NEXT:    ; in Loop: Header=BB5_1 Depth=1
 ; GCN-O0-NEXT:    s_waitcnt expcnt(0)
-; GCN-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    s_or_saveexec_b64 s[14:15], -1
-; GCN-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    s_mov_b64 exec, s[14:15]
-; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
-; GCN-O0-NEXT:    v_readlane_b32 s4, v4, 12
-; GCN-O0-NEXT:    v_readlane_b32 s5, v4, 13
+; GCN-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GCN-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GCN-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GCN-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GCN-O0-NEXT:    v_readlane_b32 s4, v1, 12
+; GCN-O0-NEXT:    v_readlane_b32 s5, v1, 13
 ; GCN-O0-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GCN-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
-; GCN-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GCN-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GCN-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GCN-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
+; GCN-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GCN-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GCN-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; GCN-O0-NEXT:    s_branch .LBB5_5
 ; GCN-O0-NEXT:  .LBB5_7: ; %bb10
 ; GCN-O0-NEXT:    ; in Loop: Header=BB5_1 Depth=1
-; GCN-O0-NEXT:    s_or_saveexec_b64 s[14:15], -1
-; GCN-O0-NEXT:    s_waitcnt expcnt(3)
-; GCN-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    s_mov_b64 exec, s[14:15]
-; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
-; GCN-O0-NEXT:    v_readlane_b32 s6, v0, 8
-; GCN-O0-NEXT:    v_readlane_b32 s7, v0, 9
+; GCN-O0-NEXT:    v_readlane_b32 s6, v1, 8
+; GCN-O0-NEXT:    v_readlane_b32 s7, v1, 9
 ; GCN-O0-NEXT:    s_mov_b64 s[4:5], -1
-; GCN-O0-NEXT:    v_writelane_b32 v0, s4, 14
-; GCN-O0-NEXT:    v_writelane_b32 v0, s5, 15
+; GCN-O0-NEXT:    v_writelane_b32 v1, s4, 14
+; GCN-O0-NEXT:    v_writelane_b32 v1, s5, 15
 ; GCN-O0-NEXT:    s_mov_b64 s[4:5], exec
-; GCN-O0-NEXT:    v_writelane_b32 v0, s4, 16
-; GCN-O0-NEXT:    v_writelane_b32 v0, s5, 17
-; GCN-O0-NEXT:    s_or_saveexec_b64 s[14:15], -1
-; GCN-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
-; GCN-O0-NEXT:    s_mov_b64 exec, s[14:15]
+; GCN-O0-NEXT:    v_writelane_b32 v1, s4, 16
+; GCN-O0-NEXT:    v_writelane_b32 v1, s5, 17
 ; GCN-O0-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
 ; GCN-O0-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-O0-NEXT:    s_cbranch_execz .LBB5_9
 ; GCN-O0-NEXT:  ; %bb.8: ; %Flow1
 ; GCN-O0-NEXT:    ; in Loop: Header=BB5_1 Depth=1
-; GCN-O0-NEXT:    s_or_saveexec_b64 s[14:15], -1
-; GCN-O0-NEXT:    s_waitcnt expcnt(0)
-; GCN-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    s_mov_b64 exec, s[14:15]
 ; GCN-O0-NEXT:    s_mov_b64 s[4:5], 0
 ; GCN-O0-NEXT:    s_xor_b64 s[4:5], exec, -1
-; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
-; GCN-O0-NEXT:    v_writelane_b32 v0, s4, 14
-; GCN-O0-NEXT:    v_writelane_b32 v0, s5, 15
-; GCN-O0-NEXT:    s_or_saveexec_b64 s[14:15], -1
-; GCN-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
-; GCN-O0-NEXT:    s_mov_b64 exec, s[14:15]
+; GCN-O0-NEXT:    v_writelane_b32 v1, s4, 14
+; GCN-O0-NEXT:    v_writelane_b32 v1, s5, 15
 ; GCN-O0-NEXT:  .LBB5_9: ; %Flow3
 ; GCN-O0-NEXT:    ; in Loop: Header=BB5_1 Depth=1
 ; GCN-O0-NEXT:    s_waitcnt expcnt(0)
-; GCN-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    s_or_saveexec_b64 s[14:15], -1
-; GCN-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    s_mov_b64 exec, s[14:15]
-; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
-; GCN-O0-NEXT:    v_readlane_b32 s8, v4, 16
-; GCN-O0-NEXT:    v_readlane_b32 s9, v4, 17
+; GCN-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GCN-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GCN-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GCN-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GCN-O0-NEXT:    v_readlane_b32 s8, v1, 16
+; GCN-O0-NEXT:    v_readlane_b32 s9, v1, 17
 ; GCN-O0-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GCN-O0-NEXT:    v_readlane_b32 s6, v4, 4
-; GCN-O0-NEXT:    v_readlane_b32 s7, v4, 5
-; GCN-O0-NEXT:    v_readlane_b32 s4, v4, 14
-; GCN-O0-NEXT:    v_readlane_b32 s5, v4, 15
+; GCN-O0-NEXT:    v_readlane_b32 s6, v1, 4
+; GCN-O0-NEXT:    v_readlane_b32 s7, v1, 5
+; GCN-O0-NEXT:    v_readlane_b32 s4, v1, 14
+; GCN-O0-NEXT:    v_readlane_b32 s5, v1, 15
 ; GCN-O0-NEXT:    s_and_b64 s[4:5], exec, s[4:5]
 ; GCN-O0-NEXT:    s_or_b64 s[4:5], s[4:5], s[6:7]
 ; GCN-O0-NEXT:    s_mov_b64 s[6:7], 0
 ; GCN-O0-NEXT:    s_mov_b64 s[8:9], s[4:5]
-; GCN-O0-NEXT:    v_writelane_b32 v4, s8, 0
-; GCN-O0-NEXT:    v_writelane_b32 v4, s9, 1
-; GCN-O0-NEXT:    v_writelane_b32 v4, s6, 2
-; GCN-O0-NEXT:    v_writelane_b32 v4, s7, 3
+; GCN-O0-NEXT:    v_writelane_b32 v1, s8, 0
+; GCN-O0-NEXT:    v_writelane_b32 v1, s9, 1
+; GCN-O0-NEXT:    v_writelane_b32 v1, s6, 2
+; GCN-O0-NEXT:    v_writelane_b32 v1, s7, 3
 ; GCN-O0-NEXT:    s_mov_b64 s[6:7], s[4:5]
-; GCN-O0-NEXT:    v_writelane_b32 v4, s6, 18
-; GCN-O0-NEXT:    v_writelane_b32 v4, s7, 19
-; GCN-O0-NEXT:    s_or_saveexec_b64 s[14:15], -1
-; GCN-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
-; GCN-O0-NEXT:    s_mov_b64 exec, s[14:15]
-; GCN-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GCN-O0-NEXT:    v_writelane_b32 v1, s6, 18
+; GCN-O0-NEXT:    v_writelane_b32 v1, s7, 19
 ; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
-; GCN-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; GCN-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; GCN-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GCN-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
+; GCN-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GCN-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GCN-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
 ; GCN-O0-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN-O0-NEXT:    s_cbranch_execnz .LBB5_1
 ; GCN-O0-NEXT:  ; %bb.10: ; %bb12
-; GCN-O0-NEXT:    s_or_saveexec_b64 s[14:15], -1
-; GCN-O0-NEXT:    s_waitcnt expcnt(3)
-; GCN-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    s_mov_b64 exec, s[14:15]
-; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
-; GCN-O0-NEXT:    v_readlane_b32 s4, v0, 18
-; GCN-O0-NEXT:    v_readlane_b32 s5, v0, 19
+; GCN-O0-NEXT:    v_readlane_b32 s4, v1, 18
+; GCN-O0-NEXT:    v_readlane_b32 s5, v1, 19
 ; GCN-O0-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN-O0-NEXT:  ; %bb.11: ; %bb12
-; GCN-O0-NEXT:    s_or_saveexec_b64 s[14:15], -1
-; GCN-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    s_mov_b64 exec, s[14:15]
 ; GCN-O0-NEXT:    s_waitcnt expcnt(0)
-; GCN-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GCN-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GCN-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GCN-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GCN-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
 ; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
-; GCN-O0-NEXT:    v_mov_b32_e32 v5, v4
+; GCN-O0-NEXT:    v_mov_b32_e32 v0, v5
 ; GCN-O0-NEXT:    ; implicit-def: $sgpr4
 ; GCN-O0-NEXT:    v_mov_b32_e32 v6, s4
-; GCN-O0-NEXT:    buffer_store_dword v5, v6, s[0:3], 0 offen
+; GCN-O0-NEXT:    buffer_store_dword v0, v6, s[0:3], 0 offen
 ; GCN-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
-; GCN-O0-NEXT:    v_mov_b32_e32 v5, v3
+; GCN-O0-NEXT:    v_mov_b32_e32 v0, v4
 ; GCN-O0-NEXT:    ; implicit-def: $sgpr4
 ; GCN-O0-NEXT:    v_mov_b32_e32 v6, s4
-; GCN-O0-NEXT:    buffer_store_dword v5, v6, s[0:3], 0 offen
+; GCN-O0-NEXT:    buffer_store_dword v0, v6, s[0:3], 0 offen
 ; GCN-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
-; GCN-O0-NEXT:    v_mov_b32_e32 v5, v2
+; GCN-O0-NEXT:    v_mov_b32_e32 v0, v3
 ; GCN-O0-NEXT:    ; implicit-def: $sgpr4
 ; GCN-O0-NEXT:    v_mov_b32_e32 v6, s4
-; GCN-O0-NEXT:    buffer_store_dword v5, v6, s[0:3], 0 offen
-; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
-; GCN-O0-NEXT:    ; kill: def $vgpr1 killed $vgpr1 killed $vgpr1_vgpr2_vgpr3_vgpr4 killed $exec
+; GCN-O0-NEXT:    buffer_store_dword v0, v6, s[0:3], 0 offen
+; GCN-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-O0-NEXT:    v_mov_b32_e32 v0, v2
 ; GCN-O0-NEXT:    ; implicit-def: $sgpr4
 ; GCN-O0-NEXT:    v_mov_b32_e32 v2, s4
-; GCN-O0-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GCN-O0-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
 ; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
-; GCN-O0-NEXT:    ; kill: killed $vgpr0
 ; GCN-O0-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GCN-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; GCN-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
 ; GCN-O0-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GCN-O0-NEXT:    s_setpc_b64 s[30:31]

diff  --git a/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll b/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
index d261c8849701dd..f3b746f23af8ad 100644
--- a/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
@@ -10,7 +10,7 @@
 
 
 ; GCN-LABEL: {{^}}divergent_if_endif:
-; VGPR: workitem_private_segment_byte_size = 16{{$}}
+; VGPR: workitem_private_segment_byte_size = 12{{$}}
 
 
 ; GCN: {{^}}; %bb.0:
@@ -19,7 +19,7 @@
 
 ; Spill load
 ; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], 0 offset:[[LOAD0_OFFSET:[0-9]+]] ; 4-byte Folded Spill
-; GCN: v_cmp_eq_u32_e64 [[CMP0:s\[[0-9]+:[0-9]\]]], v{{[0-9]+}}, s{{[0-9]+}}
+; GCN: v_cmp_eq_u32_e64 [[CMP0:s\[[0-9]+:[0-9]\]]], v0, s{{[0-9]+}}
 
 ; Spill saved exec
 ; GCN: s_mov_b64 s[[[SAVEEXEC_LO:[0-9]+]]:[[SAVEEXEC_HI:[0-9]+]]], exec
@@ -82,13 +82,13 @@ endif:
 }
 
 ; GCN-LABEL: {{^}}divergent_loop:
-; VGPR: workitem_private_segment_byte_size = 20{{$}}
+; VGPR: workitem_private_segment_byte_size = 16{{$}}
 
 ; GCN: {{^}}; %bb.0:
 ; GCN-DAG: s_mov_b32 m0, -1
 ; GCN-DAG: v_mov_b32_e32 [[PTR0:v[0-9]+]], 0{{$}}
 ; GCN: ds_read_b32 [[LOAD0:v[0-9]+]], [[PTR0]]
-; GCN: v_cmp_eq_u32_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, s{{[0-9]+}}
+; GCN: v_cmp_eq_u32_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], v0, s{{[0-9]+}}
 
 ; Spill load
 ; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], 0 offset:[[LOAD0_OFFSET:[0-9]+]] ; 4-byte Folded Spill
@@ -166,7 +166,7 @@ end:
 ; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], 0 offset:[[LOAD0_OFFSET:[0-9]+]] ; 4-byte Folded Spill
 
 ; GCN: s_mov_b32 [[ZERO:s[0-9]+]], 0
-; GCN: v_cmp_ne_u32_e64 [[CMP0:s\[[0-9]+:[0-9]\]]], v{{[0-9]+}}, [[ZERO]]
+; GCN: v_cmp_ne_u32_e64 [[CMP0:s\[[0-9]+:[0-9]\]]], v0, [[ZERO]]
 
 ; GCN: s_mov_b64 s[[[SAVEEXEC_LO:[0-9]+]]:[[SAVEEXEC_HI:[0-9]+]]], exec
 ; GCN: s_and_b64 s[[[ANDEXEC_LO:[0-9]+]]:[[ANDEXEC_HI:[0-9]+]]], s[[[SAVEEXEC_LO:[0-9]+]]:[[SAVEEXEC_HI:[0-9]+]]], [[CMP0]]
@@ -175,7 +175,6 @@ end:
 ; Spill saved exec
 ; VGPR: v_writelane_b32 [[SPILL_VGPR:v[0-9]+]], s[[SAVEEXEC_LO]], [[SAVEEXEC_LO_LANE:[0-9]+]]
 ; VGPR: v_writelane_b32 [[SPILL_VGPR]], s[[SAVEEXEC_HI]], [[SAVEEXEC_HI_LANE:[0-9]+]]
-; VGPR: buffer_store_dword [[SPILL_VGPR]], off, s[0:3], 0 offset:[[VREG_SAVE_RESTORE_OFFSET:[0-9]+]] ; 4-byte Folded Spill
 
 ; VMEM: v_writelane_b32 v[[V_SAVEEXEC:[0-9]+]], s[[SAVEEXEC_LO]], 0
 ; VMEM: v_writelane_b32 v[[V_SAVEEXEC]], s[[SAVEEXEC_HI]], 1
@@ -188,7 +187,6 @@ end:
 ; GCN-NEXT: s_branch [[ELSE:.LBB[0-9]+_[0-9]+]]
 
 ; GCN: [[FLOW]]: ; %Flow
-; VGPR: buffer_load_dword [[SPILL_VGPR:v[0-9]+]], off, s[0:3], 0 offset:[[VREG_SAVE_RESTORE_OFFSET]] ; 4-byte Folded Reload
 ; GCN: buffer_load_dword [[FLOW_VAL:v[0-9]+]], off, s[0:3], 0 offset:[[FLOW_VAL_OFFSET:[0-9]+]] ; 4-byte Folded Reload
 ; VGPR: v_readlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_LO_LANE]]
 ; VGPR: v_readlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_HI_LANE]]

diff  --git a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
index 019b5ab593fe36..a83e98ba7773b3 100644
--- a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
+++ b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
@@ -31,10 +31,11 @@ define float @call_split_type_used_outside_block_v2f32() #0 {
 ; GCN-NEXT:    s_mov_b32 s33, s32
 ; GCN-NEXT:    s_or_saveexec_b64 s[18:19], -1
 ; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 exec, s[18:19]
-; GCN-NEXT:    v_writelane_b32 v40, s16, 2
 ; GCN-NEXT:    s_addk_i32 s32, 0x400
 ; GCN-NEXT:    v_writelane_b32 v40, s30, 0
+; GCN-NEXT:    v_writelane_b32 v41, s16, 0
 ; GCN-NEXT:    v_writelane_b32 v40, s31, 1
 ; GCN-NEXT:    s_getpc_b64 s[16:17]
 ; GCN-NEXT:    s_add_u32 s16, s16, func_v2f32 at rel32@lo+4
@@ -42,9 +43,10 @@ define float @call_split_type_used_outside_block_v2f32() #0 {
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v40, 0
-; GCN-NEXT:    v_readlane_b32 s4, v40, 2
+; GCN-NEXT:    v_readlane_b32 s4, v41, 0
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_mov_b64 exec, s[6:7]
 ; GCN-NEXT:    s_addk_i32 s32, 0xfc00
 ; GCN-NEXT:    s_mov_b32 s33, s4
@@ -67,10 +69,11 @@ define float @call_split_type_used_outside_block_v3f32() #0 {
 ; GCN-NEXT:    s_mov_b32 s33, s32
 ; GCN-NEXT:    s_or_saveexec_b64 s[18:19], -1
 ; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 exec, s[18:19]
-; GCN-NEXT:    v_writelane_b32 v40, s16, 2
 ; GCN-NEXT:    s_addk_i32 s32, 0x400
 ; GCN-NEXT:    v_writelane_b32 v40, s30, 0
+; GCN-NEXT:    v_writelane_b32 v41, s16, 0
 ; GCN-NEXT:    v_writelane_b32 v40, s31, 1
 ; GCN-NEXT:    s_getpc_b64 s[16:17]
 ; GCN-NEXT:    s_add_u32 s16, s16, func_v3f32 at rel32@lo+4
@@ -78,9 +81,10 @@ define float @call_split_type_used_outside_block_v3f32() #0 {
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v40, 0
-; GCN-NEXT:    v_readlane_b32 s4, v40, 2
+; GCN-NEXT:    v_readlane_b32 s4, v41, 0
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_mov_b64 exec, s[6:7]
 ; GCN-NEXT:    s_addk_i32 s32, 0xfc00
 ; GCN-NEXT:    s_mov_b32 s33, s4
@@ -103,10 +107,11 @@ define half @call_split_type_used_outside_block_v4f16() #0 {
 ; GCN-NEXT:    s_mov_b32 s33, s32
 ; GCN-NEXT:    s_or_saveexec_b64 s[18:19], -1
 ; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 exec, s[18:19]
-; GCN-NEXT:    v_writelane_b32 v40, s16, 2
 ; GCN-NEXT:    s_addk_i32 s32, 0x400
 ; GCN-NEXT:    v_writelane_b32 v40, s30, 0
+; GCN-NEXT:    v_writelane_b32 v41, s16, 0
 ; GCN-NEXT:    v_writelane_b32 v40, s31, 1
 ; GCN-NEXT:    s_getpc_b64 s[16:17]
 ; GCN-NEXT:    s_add_u32 s16, s16, func_v4f16 at rel32@lo+4
@@ -114,9 +119,10 @@ define half @call_split_type_used_outside_block_v4f16() #0 {
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v40, 0
-; GCN-NEXT:    v_readlane_b32 s4, v40, 2
+; GCN-NEXT:    v_readlane_b32 s4, v41, 0
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_mov_b64 exec, s[6:7]
 ; GCN-NEXT:    s_addk_i32 s32, 0xfc00
 ; GCN-NEXT:    s_mov_b32 s33, s4
@@ -139,10 +145,11 @@ define { i32, half } @call_split_type_used_outside_block_struct() #0 {
 ; GCN-NEXT:    s_mov_b32 s33, s32
 ; GCN-NEXT:    s_or_saveexec_b64 s[18:19], -1
 ; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 exec, s[18:19]
-; GCN-NEXT:    v_writelane_b32 v40, s16, 2
 ; GCN-NEXT:    s_addk_i32 s32, 0x400
 ; GCN-NEXT:    v_writelane_b32 v40, s30, 0
+; GCN-NEXT:    v_writelane_b32 v41, s16, 0
 ; GCN-NEXT:    v_writelane_b32 v40, s31, 1
 ; GCN-NEXT:    s_getpc_b64 s[16:17]
 ; GCN-NEXT:    s_add_u32 s16, s16, func_struct at rel32@lo+4
@@ -151,9 +158,10 @@ define { i32, half } @call_split_type_used_outside_block_struct() #0 {
 ; GCN-NEXT:    v_mov_b32_e32 v1, v4
 ; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v40, 0
-; GCN-NEXT:    v_readlane_b32 s4, v40, 2
+; GCN-NEXT:    v_readlane_b32 s4, v41, 0
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_mov_b64 exec, s[6:7]
 ; GCN-NEXT:    s_addk_i32 s32, 0xfc00
 ; GCN-NEXT:    s_mov_b32 s33, s4

diff  --git a/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll b/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll
index f4d985833eb9ad..c53b48e3b97815 100644
--- a/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll
+++ b/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll
@@ -20,8 +20,8 @@ define weak_odr void @test(i32 %0) !dbg !34 {
 ; CHECK-NEXT:    s_mov_b32 s33, s32
 ; CHECK-NEXT:    s_or_saveexec_b64 s[18:19], -1
 ; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; CHECK-NEXT:    s_mov_b64 exec, s[18:19]
-; CHECK-NEXT:    v_writelane_b32 v40, s16, 16
 ; CHECK-NEXT:    v_writelane_b32 v40, s30, 0
 ; CHECK-NEXT:    v_writelane_b32 v40, s31, 1
 ; CHECK-NEXT:    v_writelane_b32 v40, s34, 2
@@ -48,6 +48,7 @@ define weak_odr void @test(i32 %0) !dbg !34 {
 ; CHECK-NEXT:    v_writelane_b32 v40, s47, 15
 ; CHECK-NEXT:    s_load_dwordx2 s[46:47], s[4:5], 0x0
 ; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; CHECK-NEXT:    v_writelane_b32 v42, s16, 0
 ; CHECK-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
 ; CHECK-NEXT:    v_mov_b32_e32 v41, v31
 ; CHECK-NEXT:    s_mov_b32 s42, s15
@@ -91,9 +92,10 @@ define weak_odr void @test(i32 %0) !dbg !34 {
 ; CHECK-NEXT:    v_readlane_b32 s34, v40, 2
 ; CHECK-NEXT:    v_readlane_b32 s31, v40, 1
 ; CHECK-NEXT:    v_readlane_b32 s30, v40, 0
-; CHECK-NEXT:    v_readlane_b32 s4, v40, 16
+; CHECK-NEXT:    v_readlane_b32 s4, v42, 0
 ; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
 ; CHECK-NEXT:    s_mov_b64 exec, s[6:7]
 ; CHECK-NEXT:    s_addk_i32 s32, 0xfc00
 ; CHECK-NEXT:    s_mov_b32 s33, s4

diff  --git a/llvm/test/CodeGen/AMDGPU/extend-wwm-virt-reg-liveness.mir b/llvm/test/CodeGen/AMDGPU/extend-wwm-virt-reg-liveness.mir
deleted file mode 100644
index f802e1b1e18af5..00000000000000
--- a/llvm/test/CodeGen/AMDGPU/extend-wwm-virt-reg-liveness.mir
+++ /dev/null
@@ -1,279 +0,0 @@
-# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -start-before=si-lower-sgpr-spills -stop-after=virtregrewriter,1 -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s
-
-# Tests to check the conservative lieness extension for the wwm registers during SGPR spill lowering.
-
-# Even though the VGPR can be shared for the wwm-operand (writelane/readlane get inserted for the SGPR spills)
-# and the regular operand (%0), they get 
diff erent registers as we conservatively extend the liveness of the
-# wwm-operands.
----
-name:            test_single_block
-tracksRegLiveness: true
-frameInfo:
-  maxAlignment:    4
-stack:
-  - { id: 0, type: spill-slot, size: 4, alignment: 4, stack-id: sgpr-spill }
-machineFunctionInfo:
-  isEntryFunction: false
-  scratchRSrcReg:  '$sgpr0_sgpr1_sgpr2_sgpr3'
-  stackPtrOffsetReg: '$sgpr32'
-  frameOffsetReg: '$sgpr33'
-  hasSpilledSGPRs: true
-body:             |
-  bb.0:
-    liveins: $sgpr4, $vgpr2_vgpr3
-    ; GCN-LABEL: name: test_single_block
-    ; GCN: liveins: $sgpr4, $vgpr2_vgpr3
-    ; GCN-NEXT: {{  $}}
-    ; GCN-NEXT: renamable $vgpr0 = IMPLICIT_DEF
-    ; GCN-NEXT: renamable $vgpr0 = V_WRITELANE_B32 $sgpr4, 0, killed $vgpr0
-    ; GCN-NEXT: S_NOP 0
-    ; GCN-NEXT: $sgpr4 = V_READLANE_B32 $vgpr0, 0
-    ; GCN-NEXT: renamable $vgpr1 = V_MOV_B32_e32 20, implicit $exec
-    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr2_vgpr3, killed renamable $vgpr1, 0, 0, implicit $exec
-    ; GCN-NEXT: KILL killed renamable $vgpr0
-    ; GCN-NEXT: SI_RETURN
-    SI_SPILL_S32_SAVE killed $sgpr4, %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32
-    S_NOP 0
-    renamable $sgpr4 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32
-    %0:vgpr_32 = V_MOV_B32_e32 20, implicit $exec
-    GLOBAL_STORE_DWORD $vgpr2_vgpr3, %0:vgpr_32, 0, 0, implicit $exec
-    SI_RETURN
-...
-
-# Due to the presence of wwm-operand in the divergent flow, the regular variable (%0) shouldn't get the same register
-# allocated for the wwm-operand in writelane/readlane when the SGPR spill is lowered.
-
----
-name:            test_if_else
-tracksRegLiveness: true
-frameInfo:
-  maxAlignment:    4
-stack:
-  - { id: 0, type: spill-slot, size: 4, alignment: 4, stack-id: sgpr-spill }
-machineFunctionInfo:
-  isEntryFunction: false
-  scratchRSrcReg:  '$sgpr0_sgpr1_sgpr2_sgpr3'
-  stackPtrOffsetReg: '$sgpr32'
-  frameOffsetReg: '$sgpr33'
-  hasSpilledSGPRs: true
-body:             |
-  ; GCN-LABEL: name: test_if_else
-  ; GCN: bb.0:
-  ; GCN-NEXT:   successors: %bb.1(0x80000000)
-  ; GCN-NEXT:   liveins: $sgpr6, $sgpr10_sgpr11
-  ; GCN-NEXT: {{  $}}
-  ; GCN-NEXT:   renamable $vgpr0 = IMPLICIT_DEF
-  ; GCN-NEXT:   S_BRANCH %bb.1
-  ; GCN-NEXT: {{  $}}
-  ; GCN-NEXT: bb.1:
-  ; GCN-NEXT:   successors: %bb.3(0x40000000), %bb.2(0x40000000)
-  ; GCN-NEXT:   liveins: $sgpr6, $vgpr0, $sgpr10_sgpr11
-  ; GCN-NEXT: {{  $}}
-  ; GCN-NEXT:   renamable $vgpr1 = V_MOV_B32_e32 10, implicit $exec
-  ; GCN-NEXT:   S_CBRANCH_EXECZ %bb.3, implicit $exec
-  ; GCN-NEXT: {{  $}}
-  ; GCN-NEXT: bb.2:
-  ; GCN-NEXT:   successors: %bb.3(0x80000000)
-  ; GCN-NEXT:   liveins: $sgpr6, $vgpr0, $sgpr10_sgpr11
-  ; GCN-NEXT: {{  $}}
-  ; GCN-NEXT:   renamable $vgpr0 = V_WRITELANE_B32 $sgpr6, 0, killed $vgpr0
-  ; GCN-NEXT:   S_NOP 0
-  ; GCN-NEXT:   $sgpr6 = V_READLANE_B32 $vgpr0, 0
-  ; GCN-NEXT:   renamable $vgpr1 = V_MOV_B32_e32 20, implicit $exec
-  ; GCN-NEXT:   S_BRANCH %bb.3
-  ; GCN-NEXT: {{  $}}
-  ; GCN-NEXT: bb.3:
-  ; GCN-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr10_sgpr11
-  ; GCN-NEXT: {{  $}}
-  ; GCN-NEXT:   $sgpr5 = V_READFIRSTLANE_B32 killed $vgpr1, implicit $exec
-  ; GCN-NEXT:   S_STORE_DWORD_IMM $sgpr5, $sgpr10_sgpr11, 0, 0
-  ; GCN-NEXT:   KILL killed renamable $vgpr0
-  ; GCN-NEXT:   SI_RETURN
-  bb.0:
-    liveins: $sgpr6, $sgpr10_sgpr11
-    S_BRANCH %bb.1
-  bb.1:
-    liveins: $sgpr6, $sgpr10_sgpr11
-    %0:vgpr_32 = V_MOV_B32_e32 10, implicit $exec
-    S_CBRANCH_EXECZ %bb.3, implicit $exec
-  bb.2:
-    liveins: $sgpr6, $sgpr10_sgpr11
-    SI_SPILL_S32_SAVE killed $sgpr6, %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32
-    S_NOP 0
-    renamable $sgpr6 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32
-    %0:vgpr_32 = V_MOV_B32_e32 20, implicit $exec
-    S_BRANCH %bb.3
-  bb.3:
-    liveins: $sgpr10_sgpr11
-    $sgpr5 = V_READFIRSTLANE_B32 %0:vgpr_32, implicit $exec
-    S_STORE_DWORD_IMM $sgpr5, $sgpr10_sgpr11, 0, 0
-    SI_RETURN
-...
-
-# The wwm-register usage outside the loop should have the interference marked with
-# all the regular virtual registers used in the test. The divergent loop index value (%1)
-# can actually share the same VGPR as the wwm-operand. But since we extend the liveness of
-# the wwm operand, an interference will always exist between them.
-
----
-name:            test_loop
-tracksRegLiveness: true
-frameInfo:
-  maxAlignment:    4
-stack:
-  - { id: 0, type: spill-slot, size: 4, alignment: 4, stack-id: sgpr-spill }
-machineFunctionInfo:
-  isEntryFunction: false
-  scratchRSrcReg:  '$sgpr0_sgpr1_sgpr2_sgpr3'
-  stackPtrOffsetReg: '$sgpr32'
-  frameOffsetReg: '$sgpr33'
-  hasSpilledSGPRs: true
-body:             |
-  ; GCN-LABEL: name: test_loop
-  ; GCN: bb.0:
-  ; GCN-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
-  ; GCN-NEXT:   liveins: $sgpr4, $sgpr10_sgpr11
-  ; GCN-NEXT: {{  $}}
-  ; GCN-NEXT:   renamable $vgpr0 = IMPLICIT_DEF
-  ; GCN-NEXT:   renamable $vgpr1 = V_MOV_B32_e32 10, implicit $exec
-  ; GCN-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
-  ; GCN-NEXT: {{  $}}
-  ; GCN-NEXT: bb.1:
-  ; GCN-NEXT:   successors: %bb.2(0x80000000)
-  ; GCN-NEXT:   liveins: $sgpr4, $vgpr0, $sgpr10_sgpr11
-  ; GCN-NEXT: {{  $}}
-  ; GCN-NEXT:   renamable $vgpr0 = V_WRITELANE_B32 $sgpr4, 0, killed $vgpr0
-  ; GCN-NEXT:   S_NOP 0
-  ; GCN-NEXT:   $sgpr4 = V_READLANE_B32 $vgpr0, 0
-  ; GCN-NEXT:   renamable $vgpr1 = V_MOV_B32_e32 20, implicit $exec
-  ; GCN-NEXT:   S_BRANCH %bb.2
-  ; GCN-NEXT: {{  $}}
-  ; GCN-NEXT: bb.2:
-  ; GCN-NEXT:   successors: %bb.3(0x80000000)
-  ; GCN-NEXT:   liveins: $sgpr4, $vgpr0, $vgpr1, $sgpr10_sgpr11
-  ; GCN-NEXT: {{  $}}
-  ; GCN-NEXT:   S_STORE_DWORD_IMM $sgpr4, $sgpr10_sgpr11, 0, 0
-  ; GCN-NEXT:   $sgpr5 = V_READFIRSTLANE_B32 killed $vgpr1, implicit $exec
-  ; GCN-NEXT:   S_STORE_DWORD_IMM $sgpr5, $sgpr10_sgpr11, 0, 4
-  ; GCN-NEXT:   renamable $vgpr1 = V_MOV_B32_e32 5, implicit $exec
-  ; GCN-NEXT:   S_CBRANCH_EXECZ %bb.3, implicit $exec
-  ; GCN-NEXT:   S_BRANCH %bb.3
-  ; GCN-NEXT: {{  $}}
-  ; GCN-NEXT: bb.3:
-  ; GCN-NEXT:   successors: %bb.5(0x40000000), %bb.4(0x40000000)
-  ; GCN-NEXT:   liveins: $vgpr0, $vgpr1
-  ; GCN-NEXT: {{  $}}
-  ; GCN-NEXT:   $vcc = V_CMP_EQ_U32_e64 0, $vgpr1, implicit $exec
-  ; GCN-NEXT:   $sgpr6_sgpr7 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
-  ; GCN-NEXT:   S_CBRANCH_SCC1 %bb.5, implicit $scc
-  ; GCN-NEXT: {{  $}}
-  ; GCN-NEXT: bb.4:
-  ; GCN-NEXT:   successors: %bb.3(0x80000000)
-  ; GCN-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr6_sgpr7
-  ; GCN-NEXT: {{  $}}
-  ; GCN-NEXT:   renamable $vgpr1 = V_SUB_U32_e32 1, killed $vgpr1, implicit $exec
-  ; GCN-NEXT:   renamable $vgpr1 = V_MOV_B32_e32 killed $vgpr1, implicit $exec
-  ; GCN-NEXT:   S_BRANCH %bb.3
-  ; GCN-NEXT: {{  $}}
-  ; GCN-NEXT: bb.5:
-  ; GCN-NEXT:   liveins: $vgpr0, $sgpr6_sgpr7
-  ; GCN-NEXT: {{  $}}
-  ; GCN-NEXT:   $exec = S_OR_B64 $exec, $sgpr6_sgpr7, implicit-def $scc
-  ; GCN-NEXT:   KILL killed renamable $vgpr0
-  ; GCN-NEXT:   SI_RETURN
-  bb.0:
-    liveins: $sgpr4, $sgpr10_sgpr11
-    %0:vgpr_32 = V_MOV_B32_e32 10, implicit $exec
-    S_CBRANCH_EXECZ %bb.2, implicit $exec
-  bb.1:
-    liveins: $sgpr4, $sgpr10_sgpr11
-    SI_SPILL_S32_SAVE killed $sgpr4, %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32
-    S_NOP 0
-    renamable $sgpr4 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32
-    %0:vgpr_32 = V_MOV_B32_e32 20, implicit $exec
-    S_BRANCH %bb.2
-  bb.2:
-    liveins: $sgpr4, $sgpr10_sgpr11
-    S_STORE_DWORD_IMM $sgpr4, $sgpr10_sgpr11, 0, 0
-    $sgpr5 = V_READFIRSTLANE_B32 %0:vgpr_32, implicit $exec
-    S_STORE_DWORD_IMM $sgpr5, $sgpr10_sgpr11, 0, 4
-    %1:vgpr_32 = V_MOV_B32_e32 5, implicit $exec
-    S_CBRANCH_EXECZ %bb.3, implicit $exec
-    S_BRANCH %bb.3
-  bb.3:
-    $vcc = V_CMP_EQ_U32_e64 0, %1:vgpr_32, implicit $exec
-    $sgpr6_sgpr7 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
-    S_CBRANCH_SCC1 %bb.5, implicit $scc
-  bb.4:
-    liveins: $sgpr6_sgpr7
-    %2:vgpr_32 = V_SUB_U32_e32 1, %1:vgpr_32, implicit $exec
-    %1:vgpr_32 = V_MOV_B32_e32 %2:vgpr_32, implicit $exec
-    S_BRANCH %bb.3
-  bb.5:
-    liveins: $sgpr6_sgpr7
-    $exec = S_OR_B64 $exec, $sgpr6_sgpr7, implicit-def $scc
-    SI_RETURN
-...
-
-# There must be one KILL instruction for the wwm-operand in every return block.
-# Due to that, the wwm-register allocated should be 
diff erent from the ones
-# allocated for the regular virtual registers.
-
----
-name:            test_multiple_return_blocks
-tracksRegLiveness: true
-frameInfo:
-  maxAlignment:    4
-stack:
-  - { id: 0, type: spill-slot, size: 4, alignment: 4, stack-id: sgpr-spill }
-machineFunctionInfo:
-  isEntryFunction: false
-  scratchRSrcReg:  '$sgpr0_sgpr1_sgpr2_sgpr3'
-  stackPtrOffsetReg: '$sgpr32'
-  frameOffsetReg: '$sgpr33'
-  hasSpilledSGPRs: true
-body:             |
-  ; GCN-LABEL: name: test_multiple_return_blocks
-  ; GCN: bb.0:
-  ; GCN-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
-  ; GCN-NEXT:   liveins: $sgpr4, $vgpr2_vgpr3
-  ; GCN-NEXT: {{  $}}
-  ; GCN-NEXT:   renamable $vgpr0 = IMPLICIT_DEF
-  ; GCN-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
-  ; GCN-NEXT: {{  $}}
-  ; GCN-NEXT: bb.1:
-  ; GCN-NEXT:   liveins: $sgpr4, $vgpr0, $vgpr2_vgpr3
-  ; GCN-NEXT: {{  $}}
-  ; GCN-NEXT:   renamable $vgpr0 = V_WRITELANE_B32 $sgpr4, 0, killed $vgpr0
-  ; GCN-NEXT:   S_NOP 0
-  ; GCN-NEXT:   $sgpr4 = V_READLANE_B32 $vgpr0, 0
-  ; GCN-NEXT:   renamable $vgpr1 = V_MOV_B32_e32 10, implicit $exec
-  ; GCN-NEXT:   GLOBAL_STORE_DWORD $vgpr2_vgpr3, killed renamable $vgpr1, 0, 0, implicit $exec
-  ; GCN-NEXT:   KILL killed renamable $vgpr0
-  ; GCN-NEXT:   SI_RETURN
-  ; GCN-NEXT: {{  $}}
-  ; GCN-NEXT: bb.2:
-  ; GCN-NEXT:   liveins: $vgpr0, $vgpr2_vgpr3
-  ; GCN-NEXT: {{  $}}
-  ; GCN-NEXT:   renamable $vgpr1 = V_MOV_B32_e32 20, implicit $exec
-  ; GCN-NEXT:   GLOBAL_STORE_DWORD $vgpr2_vgpr3, killed renamable $vgpr1, 0, 0, implicit $exec
-  ; GCN-NEXT:   KILL killed renamable $vgpr0
-  ; GCN-NEXT:   SI_RETURN
-  bb.0:
-    liveins: $sgpr4, $vgpr2_vgpr3
-    S_CBRANCH_EXECZ %bb.2, implicit $exec
-  bb.1:
-    liveins: $sgpr4, $vgpr2_vgpr3
-    SI_SPILL_S32_SAVE killed $sgpr4, %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32
-    S_NOP 0
-    renamable $sgpr4 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32
-    %0:vgpr_32 = V_MOV_B32_e32 10, implicit $exec
-    GLOBAL_STORE_DWORD $vgpr2_vgpr3, %0:vgpr_32, 0, 0, implicit $exec
-    SI_RETURN
-  bb.2:
-    liveins: $vgpr2_vgpr3
-    %1:vgpr_32 = V_MOV_B32_e32 20, implicit $exec
-    GLOBAL_STORE_DWORD $vgpr2_vgpr3, %1:vgpr_32, 0, 0, implicit $exec
-    SI_RETURN
-...

diff  --git a/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll b/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll
index f440b1f9ee6f7b..4da6f0e446689f 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll
+++ b/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll
@@ -16,9 +16,10 @@ define void @test_stack_realign(<8 x i32> %val, i32 %idx) #0 {
 ; GCN-NEXT:    s_mov_b32 s33, s32
 ; GCN-NEXT:    s_or_saveexec_b64 s[18:19], -1
 ; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 exec, s[18:19]
-; GCN-NEXT:    s_addk_i32 s32, 0x400
-; GCN-NEXT:    v_writelane_b32 v42, s16, 2
+; GCN-NEXT:    s_addk_i32 s32, 0x800
+; GCN-NEXT:    v_writelane_b32 v43, s16, 0
 ; GCN-NEXT:    s_getpc_b64 s[16:17]
 ; GCN-NEXT:    s_add_u32 s16, s16, extern_func at gotpcrel32@lo+4
 ; GCN-NEXT:    s_addc_u32 s17, s17, extern_func at gotpcrel32@hi+12
@@ -38,11 +39,12 @@ define void @test_stack_realign(<8 x i32> %val, i32 %idx) #0 {
 ; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GCN-NEXT:    v_readlane_b32 s31, v42, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v42, 0
-; GCN-NEXT:    v_readlane_b32 s4, v42, 2
+; GCN-NEXT:    v_readlane_b32 s4, v43, 0
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GCN-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_mov_b64 exec, s[6:7]
-; GCN-NEXT:    s_addk_i32 s32, 0xfc00
+; GCN-NEXT:    s_addk_i32 s32, 0xf800
 ; GCN-NEXT:    s_mov_b32 s33, s4
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]

diff  --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll
index 5694a9dc1ffd45..41fea1a6c30a41 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll
@@ -117,15 +117,21 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, i32 %in) {
 ; FLAT_SCR_OPT-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
 ; FLAT_SCR_OPT-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
 ; FLAT_SCR_OPT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; FLAT_SCR_OPT-NEXT:    ; implicit-def: $vgpr0
+; FLAT_SCR_OPT-NEXT:    s_mov_b32 s4, exec_lo
+; FLAT_SCR_OPT-NEXT:    s_mov_b32 exec_lo, 3
+; FLAT_SCR_OPT-NEXT:    s_mov_b32 s5, 0
+; FLAT_SCR_OPT-NEXT:    scratch_store_dword off, v0, s5
 ; FLAT_SCR_OPT-NEXT:    s_waitcnt lgkmcnt(0)
 ; FLAT_SCR_OPT-NEXT:    v_writelane_b32 v0, s2, 0
+; FLAT_SCR_OPT-NEXT:    s_mov_b32 s5, 4
 ; FLAT_SCR_OPT-NEXT:    v_writelane_b32 v0, s3, 1
-; FLAT_SCR_OPT-NEXT:    s_or_saveexec_b32 s105, -1
-; FLAT_SCR_OPT-NEXT:    s_mov_b32 s2, 4
-; FLAT_SCR_OPT-NEXT:    scratch_store_dword off, v0, s2 ; 4-byte Folded Spill
+; FLAT_SCR_OPT-NEXT:    scratch_store_dword off, v0, s5 ; 4-byte Folded Spill
 ; FLAT_SCR_OPT-NEXT:    s_waitcnt_depctr 0xffe3
-; FLAT_SCR_OPT-NEXT:    s_mov_b32 exec_lo, s105
+; FLAT_SCR_OPT-NEXT:    s_mov_b32 s5, 0
+; FLAT_SCR_OPT-NEXT:    scratch_load_dword v0, off, s5
+; FLAT_SCR_OPT-NEXT:    s_waitcnt vmcnt(0)
+; FLAT_SCR_OPT-NEXT:    s_waitcnt_depctr 0xffe3
+; FLAT_SCR_OPT-NEXT:    s_mov_b32 exec_lo, s4
 ; FLAT_SCR_OPT-NEXT:    s_load_dword vcc_lo, s[0:1], 0x8
 ; FLAT_SCR_OPT-NEXT:    ; kill: killed $sgpr0_sgpr1
 ; FLAT_SCR_OPT-NEXT:    ;;#ASMSTART
@@ -222,31 +228,44 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, i32 %in) {
 ; FLAT_SCR_OPT-NEXT:    ;;#ASMEND
 ; FLAT_SCR_OPT-NEXT:    ;;#ASMSTART
 ; FLAT_SCR_OPT-NEXT:    ;;#ASMEND
-; FLAT_SCR_OPT-NEXT:    s_or_saveexec_b32 s105, -1
-; FLAT_SCR_OPT-NEXT:    s_mov_b32 s0, 4
-; FLAT_SCR_OPT-NEXT:    scratch_load_dword v1, off, s0 ; 4-byte Folded Reload
+; FLAT_SCR_OPT-NEXT:    s_mov_b32 s2, exec_lo
+; FLAT_SCR_OPT-NEXT:    s_mov_b32 exec_lo, 3
+; FLAT_SCR_OPT-NEXT:    s_mov_b32 s3, 0
+; FLAT_SCR_OPT-NEXT:    scratch_store_dword off, v1, s3
+; FLAT_SCR_OPT-NEXT:    s_waitcnt_depctr 0xffe3
+; FLAT_SCR_OPT-NEXT:    s_mov_b32 s3, 4
+; FLAT_SCR_OPT-NEXT:    scratch_load_dword v1, off, s3 ; 4-byte Folded Reload
 ; FLAT_SCR_OPT-NEXT:    s_waitcnt_depctr 0xffe3
-; FLAT_SCR_OPT-NEXT:    s_mov_b32 exec_lo, s105
+; FLAT_SCR_OPT-NEXT:    s_mov_b32 s3, 0
 ; FLAT_SCR_OPT-NEXT:    s_waitcnt vmcnt(0)
 ; FLAT_SCR_OPT-NEXT:    v_readlane_b32 s0, v1, 0
 ; FLAT_SCR_OPT-NEXT:    v_readlane_b32 s1, v1, 1
-; FLAT_SCR_OPT-NEXT:    v_mov_b32_e32 v2, 0
-; FLAT_SCR_OPT-NEXT:    ; kill: killed $vgpr1
-; FLAT_SCR_OPT-NEXT:    global_store_dword v2, v0, s[0:1]
+; FLAT_SCR_OPT-NEXT:    scratch_load_dword v1, off, s3
+; FLAT_SCR_OPT-NEXT:    s_waitcnt vmcnt(0)
+; FLAT_SCR_OPT-NEXT:    s_waitcnt_depctr 0xffe3
+; FLAT_SCR_OPT-NEXT:    s_mov_b32 exec_lo, s2
+; FLAT_SCR_OPT-NEXT:    v_mov_b32_e32 v1, 0
+; FLAT_SCR_OPT-NEXT:    global_store_dword v1, v0, s[0:1]
 ; FLAT_SCR_OPT-NEXT:    s_endpgm
 ;
 ; FLAT_SCR_ARCH-LABEL: test:
 ; FLAT_SCR_ARCH:       ; %bb.0:
 ; FLAT_SCR_ARCH-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; FLAT_SCR_ARCH-NEXT:    ; implicit-def: $vgpr0
+; FLAT_SCR_ARCH-NEXT:    s_mov_b32 s4, exec_lo
+; FLAT_SCR_ARCH-NEXT:    s_mov_b32 exec_lo, 3
+; FLAT_SCR_ARCH-NEXT:    s_mov_b32 s5, 0
+; FLAT_SCR_ARCH-NEXT:    scratch_store_dword off, v0, s5
 ; FLAT_SCR_ARCH-NEXT:    s_waitcnt lgkmcnt(0)
 ; FLAT_SCR_ARCH-NEXT:    v_writelane_b32 v0, s2, 0
+; FLAT_SCR_ARCH-NEXT:    s_mov_b32 s5, 4
 ; FLAT_SCR_ARCH-NEXT:    v_writelane_b32 v0, s3, 1
-; FLAT_SCR_ARCH-NEXT:    s_or_saveexec_b32 s105, -1
-; FLAT_SCR_ARCH-NEXT:    s_mov_b32 s2, 4
-; FLAT_SCR_ARCH-NEXT:    scratch_store_dword off, v0, s2 ; 4-byte Folded Spill
+; FLAT_SCR_ARCH-NEXT:    scratch_store_dword off, v0, s5 ; 4-byte Folded Spill
 ; FLAT_SCR_ARCH-NEXT:    s_waitcnt_depctr 0xffe3
-; FLAT_SCR_ARCH-NEXT:    s_mov_b32 exec_lo, s105
+; FLAT_SCR_ARCH-NEXT:    s_mov_b32 s5, 0
+; FLAT_SCR_ARCH-NEXT:    scratch_load_dword v0, off, s5
+; FLAT_SCR_ARCH-NEXT:    s_waitcnt vmcnt(0)
+; FLAT_SCR_ARCH-NEXT:    s_waitcnt_depctr 0xffe3
+; FLAT_SCR_ARCH-NEXT:    s_mov_b32 exec_lo, s4
 ; FLAT_SCR_ARCH-NEXT:    s_load_dword vcc_lo, s[0:1], 0x8
 ; FLAT_SCR_ARCH-NEXT:    ; kill: killed $sgpr0_sgpr1
 ; FLAT_SCR_ARCH-NEXT:    ;;#ASMSTART
@@ -343,17 +362,24 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, i32 %in) {
 ; FLAT_SCR_ARCH-NEXT:    ;;#ASMEND
 ; FLAT_SCR_ARCH-NEXT:    ;;#ASMSTART
 ; FLAT_SCR_ARCH-NEXT:    ;;#ASMEND
-; FLAT_SCR_ARCH-NEXT:    s_or_saveexec_b32 s105, -1
-; FLAT_SCR_ARCH-NEXT:    s_mov_b32 s0, 4
-; FLAT_SCR_ARCH-NEXT:    scratch_load_dword v1, off, s0 ; 4-byte Folded Reload
+; FLAT_SCR_ARCH-NEXT:    s_mov_b32 s2, exec_lo
+; FLAT_SCR_ARCH-NEXT:    s_mov_b32 exec_lo, 3
+; FLAT_SCR_ARCH-NEXT:    s_mov_b32 s3, 0
+; FLAT_SCR_ARCH-NEXT:    scratch_store_dword off, v1, s3
+; FLAT_SCR_ARCH-NEXT:    s_waitcnt_depctr 0xffe3
+; FLAT_SCR_ARCH-NEXT:    s_mov_b32 s3, 4
+; FLAT_SCR_ARCH-NEXT:    scratch_load_dword v1, off, s3 ; 4-byte Folded Reload
 ; FLAT_SCR_ARCH-NEXT:    s_waitcnt_depctr 0xffe3
-; FLAT_SCR_ARCH-NEXT:    s_mov_b32 exec_lo, s105
+; FLAT_SCR_ARCH-NEXT:    s_mov_b32 s3, 0
 ; FLAT_SCR_ARCH-NEXT:    s_waitcnt vmcnt(0)
 ; FLAT_SCR_ARCH-NEXT:    v_readlane_b32 s0, v1, 0
 ; FLAT_SCR_ARCH-NEXT:    v_readlane_b32 s1, v1, 1
-; FLAT_SCR_ARCH-NEXT:    v_mov_b32_e32 v2, 0
-; FLAT_SCR_ARCH-NEXT:    ; kill: killed $vgpr1
-; FLAT_SCR_ARCH-NEXT:    global_store_dword v2, v0, s[0:1]
+; FLAT_SCR_ARCH-NEXT:    scratch_load_dword v1, off, s3
+; FLAT_SCR_ARCH-NEXT:    s_waitcnt vmcnt(0)
+; FLAT_SCR_ARCH-NEXT:    s_waitcnt_depctr 0xffe3
+; FLAT_SCR_ARCH-NEXT:    s_mov_b32 exec_lo, s2
+; FLAT_SCR_ARCH-NEXT:    v_mov_b32_e32 v1, 0
+; FLAT_SCR_ARCH-NEXT:    global_store_dword v1, v0, s[0:1]
 ; FLAT_SCR_ARCH-NEXT:    s_endpgm
   call void asm sideeffect "", "~{s[0:7]}" ()
   call void asm sideeffect "", "~{s[8:15]}" ()

diff  --git a/llvm/test/CodeGen/AMDGPU/fold-reload-into-exec.mir b/llvm/test/CodeGen/AMDGPU/fold-reload-into-exec.mir
index ba619a659f1b07..aca1351de3ef9f 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-reload-into-exec.mir
+++ b/llvm/test/CodeGen/AMDGPU/fold-reload-into-exec.mir
@@ -12,13 +12,14 @@ machineFunctionInfo:
 body:             |
   bb.0:
     ; CHECK-LABEL: name: merge_sgpr_spill_into_copy_from_exec_lo
-    ; CHECK: renamable $vgpr0 = IMPLICIT_DEF
+    ; CHECK: liveins: $vgpr0
+    ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: S_NOP 0, implicit-def $exec_lo
     ; CHECK-NEXT: $sgpr0 = S_MOV_B32 $exec_lo
-    ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, killed $vgpr0
+    ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0
     ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0
     ; CHECK-NEXT: S_NOP 0, implicit-def dead renamable $sgpr1, implicit-def dead renamable $sgpr0, implicit killed renamable $sgpr0
-    ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 killed $vgpr0, 0
+    ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0
     ; CHECK-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0
     ; CHECK-NEXT: S_SENDMSG 0, implicit $m0, implicit $exec
     S_NOP 0, implicit-def $exec_lo
@@ -37,13 +38,14 @@ machineFunctionInfo:
 body:             |
   bb.0:
     ; CHECK-LABEL: name: merge_sgpr_spill_into_copy_from_exec_hi
-    ; CHECK: renamable $vgpr0 = IMPLICIT_DEF
+    ; CHECK: liveins: $vgpr0
+    ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: S_NOP 0, implicit-def $exec_hi
     ; CHECK-NEXT: $sgpr0 = S_MOV_B32 $exec_hi
-    ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, killed $vgpr0
+    ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0
     ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0
     ; CHECK-NEXT: S_NOP 0, implicit-def dead renamable $sgpr1, implicit-def dead renamable $sgpr0, implicit killed renamable $sgpr0
-    ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 killed $vgpr0, 0
+    ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0
     ; CHECK-NEXT: $exec_hi = S_MOV_B32 killed $sgpr0
     ; CHECK-NEXT: S_SENDMSG 0, implicit $m0, implicit $exec
     S_NOP 0, implicit-def $exec_hi
@@ -62,16 +64,17 @@ machineFunctionInfo:
 body:             |
   bb.0:
     ; CHECK-LABEL: name: merge_sgpr_spill_into_copy_from_exec
-    ; CHECK: renamable $vgpr0 = IMPLICIT_DEF
+    ; CHECK: liveins: $vgpr0
+    ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: S_NOP 0, implicit-def $exec
     ; CHECK-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
-    ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, killed $vgpr0, implicit-def $sgpr0_sgpr1, implicit $sgpr0_sgpr1
-    ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr1, 1, killed $vgpr0, implicit $sgpr0_sgpr1
+    ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0, implicit-def $sgpr0_sgpr1, implicit $sgpr0_sgpr1
+    ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr1, 1, $vgpr0, implicit $sgpr0_sgpr1
     ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr0_sgpr1
     ; CHECK-NEXT: $sgpr1 = V_READLANE_B32 $vgpr0, 1
     ; CHECK-NEXT: S_NOP 0, implicit-def dead renamable $sgpr2_sgpr3, implicit-def dead renamable $sgpr0_sgpr1, implicit killed renamable $sgpr0_sgpr1
     ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr0_sgpr1
-    ; CHECK-NEXT: $sgpr1 = V_READLANE_B32 killed $vgpr0, 1
+    ; CHECK-NEXT: $sgpr1 = V_READLANE_B32 $vgpr0, 1
     ; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1
     ; CHECK-NEXT: S_SENDMSG 0, implicit $m0, implicit $exec
     S_NOP 0, implicit-def $exec
@@ -93,12 +96,13 @@ machineFunctionInfo:
 body:             |
   bb.0:
     ; CHECK-LABEL: name: reload_sgpr_spill_into_copy_to_exec_lo
-    ; CHECK: renamable $vgpr0 = IMPLICIT_DEF
+    ; CHECK: liveins: $vgpr0
+    ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: S_NOP 0, implicit-def renamable $sgpr0, implicit-def dead renamable $sgpr1, implicit-def $exec_lo
-    ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, killed $vgpr0
+    ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0
     ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0
     ; CHECK-NEXT: S_NOP 0, implicit killed renamable $sgpr0, implicit-def dead renamable $sgpr1, implicit-def dead renamable $sgpr0
-    ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 killed $vgpr0, 0
+    ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0
     ; CHECK-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0
     ; CHECK-NEXT: S_SENDMSG 0, implicit $m0, implicit $exec
     S_NOP 0, implicit-def %0:sreg_32, implicit-def %1:sreg_32, implicit-def $exec_lo
@@ -116,12 +120,13 @@ machineFunctionInfo:
 body:             |
   bb.0:
     ; CHECK-LABEL: name: reload_sgpr_spill_into_copy_to_exec_hi
-    ; CHECK: renamable $vgpr0 = IMPLICIT_DEF
+    ; CHECK: liveins: $vgpr0
+    ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: S_NOP 0, implicit-def renamable $sgpr0, implicit-def dead renamable $sgpr1, implicit-def $exec_hi
-    ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, killed $vgpr0
+    ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0
     ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0
     ; CHECK-NEXT: S_NOP 0, implicit killed renamable $sgpr0, implicit-def dead renamable $sgpr1, implicit-def dead renamable $sgpr0
-    ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 killed $vgpr0, 0
+    ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0
     ; CHECK-NEXT: $exec_hi = S_MOV_B32 killed $sgpr0
     ; CHECK-NEXT: S_SENDMSG 0, implicit $m0, implicit $exec
     S_NOP 0, implicit-def %0:sreg_32, implicit-def %1:sreg_32, implicit-def $exec_hi
@@ -139,15 +144,16 @@ machineFunctionInfo:
 body:             |
   bb.0:
     ; CHECK-LABEL: name: reload_sgpr_spill_into_copy_to_exec
-    ; CHECK: renamable $vgpr0 = IMPLICIT_DEF
+    ; CHECK: liveins: $vgpr0
+    ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: S_NOP 0, implicit-def renamable $sgpr0_sgpr1, implicit-def dead renamable $sgpr2_sgpr3, implicit-def $exec
-    ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, killed $vgpr0, implicit-def $sgpr0_sgpr1, implicit $sgpr0_sgpr1
-    ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr1, 1, killed $vgpr0, implicit $sgpr0_sgpr1
+    ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0, implicit-def $sgpr0_sgpr1, implicit $sgpr0_sgpr1
+    ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr1, 1, $vgpr0, implicit $sgpr0_sgpr1
     ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr0_sgpr1
     ; CHECK-NEXT: $sgpr1 = V_READLANE_B32 $vgpr0, 1
     ; CHECK-NEXT: S_NOP 0, implicit killed renamable $sgpr0_sgpr1, implicit-def dead renamable $sgpr2_sgpr3, implicit-def dead renamable $sgpr0_sgpr1
     ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr0_sgpr1
-    ; CHECK-NEXT: $sgpr1 = V_READLANE_B32 killed $vgpr0, 1
+    ; CHECK-NEXT: $sgpr1 = V_READLANE_B32 $vgpr0, 1
     ; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1
     ; CHECK-NEXT: S_SENDMSG 0, implicit $m0, implicit $exec
     S_NOP 0, implicit-def %0:sreg_64, implicit-def %1:sreg_64, implicit-def $exec

diff  --git a/llvm/test/CodeGen/AMDGPU/fold-reload-into-m0.mir b/llvm/test/CodeGen/AMDGPU/fold-reload-into-m0.mir
index 1c7896fcb4f141..e8688d8f55d086 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-reload-into-m0.mir
+++ b/llvm/test/CodeGen/AMDGPU/fold-reload-into-m0.mir
@@ -13,13 +13,14 @@ body:             |
   bb.0:
 
     ; CHECK-LABEL: name: merge_sgpr_spill_into_copy_from_m0
-    ; CHECK: renamable $vgpr0 = IMPLICIT_DEF
+    ; CHECK: liveins: $vgpr0
+    ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: S_NOP 0, implicit-def $m0
     ; CHECK-NEXT: $sgpr0 = S_MOV_B32 $m0
-    ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, killed $vgpr0
+    ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0
     ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0
     ; CHECK-NEXT: S_NOP 0, implicit-def dead renamable $sgpr1, implicit-def dead renamable $sgpr0, implicit killed renamable $sgpr0
-    ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 killed $vgpr0, 0
+    ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0
     ; CHECK-NEXT: $m0 = S_MOV_B32 killed $sgpr0
     ; CHECK-NEXT: S_NOP 0
     ; CHECK-NEXT: S_SENDMSG 0, implicit $m0, implicit $exec
@@ -43,12 +44,13 @@ body:             |
   bb.0:
 
     ; CHECK-LABEL: name: reload_sgpr_spill_into_copy_to_m0
-    ; CHECK: renamable $vgpr0 = IMPLICIT_DEF
+    ; CHECK: liveins: $vgpr0
+    ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: S_NOP 0, implicit-def renamable $sgpr0, implicit-def dead renamable $sgpr1, implicit-def $m0
-    ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, killed $vgpr0
+    ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0
     ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0
     ; CHECK-NEXT: S_NOP 0, implicit killed renamable $sgpr0, implicit-def dead renamable $sgpr1, implicit-def dead renamable $sgpr0
-    ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 killed $vgpr0, 0
+    ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0
     ; CHECK-NEXT: $m0 = S_MOV_B32 killed $sgpr0
     ; CHECK-NEXT: S_NOP 0
     ; CHECK-NEXT: S_SENDMSG 0, implicit $m0, implicit $exec

diff  --git a/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll
index 29daff566922c6..cd69ffed1cec31 100644
--- a/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll
+++ b/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll
@@ -14,11 +14,12 @@ define void @callee_with_stack_and_call() #0 {
 ; SPILL-TO-VGPR-NEXT:    s_mov_b32 s33, s32
 ; SPILL-TO-VGPR-NEXT:    s_or_saveexec_b64 s[8:9], -1
 ; SPILL-TO-VGPR-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; SPILL-TO-VGPR-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; SPILL-TO-VGPR-NEXT:    s_mov_b64 exec, s[8:9]
-; SPILL-TO-VGPR-NEXT:    v_writelane_b32 v40, s4, 2
 ; SPILL-TO-VGPR-NEXT:    s_addk_i32 s32, 0x400
 ; SPILL-TO-VGPR-NEXT:    v_writelane_b32 v40, s30, 0
 ; SPILL-TO-VGPR-NEXT:    v_mov_b32_e32 v0, 0
+; SPILL-TO-VGPR-NEXT:    v_writelane_b32 v41, s4, 0
 ; SPILL-TO-VGPR-NEXT:    v_writelane_b32 v40, s31, 1
 ; SPILL-TO-VGPR-NEXT:    buffer_store_dword v0, off, s[0:3], s33
 ; SPILL-TO-VGPR-NEXT:    s_waitcnt vmcnt(0)
@@ -28,9 +29,10 @@ define void @callee_with_stack_and_call() #0 {
 ; SPILL-TO-VGPR-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; SPILL-TO-VGPR-NEXT:    v_readlane_b32 s31, v40, 1
 ; SPILL-TO-VGPR-NEXT:    v_readlane_b32 s30, v40, 0
-; SPILL-TO-VGPR-NEXT:    v_readlane_b32 s4, v40, 2
+; SPILL-TO-VGPR-NEXT:    v_readlane_b32 s4, v41, 0
 ; SPILL-TO-VGPR-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; SPILL-TO-VGPR-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; SPILL-TO-VGPR-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
 ; SPILL-TO-VGPR-NEXT:    s_mov_b64 exec, s[6:7]
 ; SPILL-TO-VGPR-NEXT:    s_addk_i32 s32, 0xfc00
 ; SPILL-TO-VGPR-NEXT:    s_mov_b32 s33, s4

diff  --git a/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll b/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll
index 7c9d01db9c2c09..d49af23f9865ac 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll
@@ -8,7 +8,7 @@ define amdgpu_gfx void @gfx_func() {
 ; SDAG-LABEL: gfx_func:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s36, s33
+; SDAG-NEXT:    s_mov_b32 s38, s33
 ; SDAG-NEXT:    s_mov_b32 s33, s32
 ; SDAG-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; SDAG-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
@@ -81,14 +81,14 @@ define amdgpu_gfx void @gfx_func() {
 ; SDAG-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; SDAG-NEXT:    s_mov_b64 exec, s[34:35]
 ; SDAG-NEXT:    s_addk_i32 s32, 0xfc00
-; SDAG-NEXT:    s_mov_b32 s33, s36
+; SDAG-NEXT:    s_mov_b32 s33, s38
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: gfx_func:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s36, s33
+; GISEL-NEXT:    s_mov_b32 s38, s33
 ; GISEL-NEXT:    s_mov_b32 s33, s32
 ; GISEL-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GISEL-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
@@ -161,7 +161,7 @@ define amdgpu_gfx void @gfx_func() {
 ; GISEL-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GISEL-NEXT:    s_mov_b64 exec, s[34:35]
 ; GISEL-NEXT:    s_addk_i32 s32, 0xfc00
-; GISEL-NEXT:    s_mov_b32 s33, s36
+; GISEL-NEXT:    s_mov_b32 s33, s38
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   call void @extern_c_func()

diff  --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
index 439670d65b16d9..02b0cc04d638b3 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
@@ -101,11 +101,12 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm() #0 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 1
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[34:35]
 ; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_i1 at rel32@lo+4
@@ -114,9 +115,10 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm() #0 {
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -130,23 +132,26 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm() #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX10-NEXT:    s_getpc_b64 s[34:35]
 ; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_i1 at rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_i1 at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-NEXT:    buffer_store_byte v0, off, s[0:3], s32
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    buffer_store_byte v0, off, s[0:3], s32
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -160,24 +165,27 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm() #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 1
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_i1 at rel32@lo+4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_i1 at rel32@hi+12
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX11-NEXT:    scratch_store_b8 off, v0, s32
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-NEXT:    scratch_store_b8 off, v0, s32
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -191,23 +199,26 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_i1 at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_i1 at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-SCRATCH-NEXT:    scratch_store_byte off, v0, s32
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT:    scratch_store_byte off, v0, s32
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -226,12 +237,13 @@ define amdgpu_gfx void @test_call_external_void_func_i1_signext(i32) #0 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[34:35]
 ; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_i1_signext at rel32@lo+4
@@ -241,9 +253,10 @@ define amdgpu_gfx void @test_call_external_void_func_i1_signext(i32) #0 {
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -257,25 +270,28 @@ define amdgpu_gfx void @test_call_external_void_func_i1_signext(i32) #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX10-NEXT:    s_getpc_b64 s[34:35]
 ; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_i1_signext at rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_i1_signext at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    buffer_store_byte v0, off, s[0:3], s32
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -289,25 +305,29 @@ define amdgpu_gfx void @test_call_external_void_func_i1_signext(i32) #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    global_load_u8 v0, v[0:1], off glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_i1_signext at rel32@lo+4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_i1_signext at rel32@hi+12
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX11-NEXT:    scratch_store_b8 off, v0, s32
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -321,25 +341,28 @@ define amdgpu_gfx void @test_call_external_void_func_i1_signext(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    global_load_ubyte v0, v[0:1], off glc dlc
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_i1_signext at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_i1_signext at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX10-SCRATCH-NEXT:    scratch_store_byte off, v0, s32
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -359,12 +382,13 @@ define amdgpu_gfx void @test_call_external_void_func_i1_zeroext(i32) #0 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[34:35]
 ; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_i1_zeroext at rel32@lo+4
@@ -374,9 +398,10 @@ define amdgpu_gfx void @test_call_external_void_func_i1_zeroext(i32) #0 {
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -390,25 +415,28 @@ define amdgpu_gfx void @test_call_external_void_func_i1_zeroext(i32) #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX10-NEXT:    s_getpc_b64 s[34:35]
 ; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_i1_zeroext at rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_i1_zeroext at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    buffer_store_byte v0, off, s[0:3], s32
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -422,25 +450,29 @@ define amdgpu_gfx void @test_call_external_void_func_i1_zeroext(i32) #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    global_load_u8 v0, v[0:1], off glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_i1_zeroext at rel32@lo+4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_i1_zeroext at rel32@hi+12
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX11-NEXT:    scratch_store_b8 off, v0, s32
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -454,25 +486,28 @@ define amdgpu_gfx void @test_call_external_void_func_i1_zeroext(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    global_load_ubyte v0, v[0:1], off glc dlc
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_i1_zeroext at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_i1_zeroext at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX10-SCRATCH-NEXT:    scratch_store_byte off, v0, s32
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -492,11 +527,12 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm(i32) #0 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[34:35]
 ; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_i8 at rel32@lo+4
@@ -504,9 +540,10 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm(i32) #0 {
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -520,22 +557,25 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm(i32) #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0x7b
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX10-NEXT:    s_getpc_b64 s[34:35]
 ; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_i8 at rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_i8 at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -549,23 +589,27 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm(i32) #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0x7b
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_i8 at rel32@lo+4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_i8 at rel32@hi+12
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -579,22 +623,25 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 0x7b
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_i8 at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_i8 at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -613,12 +660,13 @@ define amdgpu_gfx void @test_call_external_void_func_i8_signext(i32) #0 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    global_load_sbyte v0, v[0:1], off glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[34:35]
 ; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_i8_signext at rel32@lo+4
@@ -626,9 +674,10 @@ define amdgpu_gfx void @test_call_external_void_func_i8_signext(i32) #0 {
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -642,23 +691,26 @@ define amdgpu_gfx void @test_call_external_void_func_i8_signext(i32) #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    global_load_sbyte v0, v[0:1], off glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX10-NEXT:    s_getpc_b64 s[34:35]
 ; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_i8_signext at rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_i8_signext at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -672,24 +724,28 @@ define amdgpu_gfx void @test_call_external_void_func_i8_signext(i32) #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    global_load_i8 v0, v[0:1], off glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_i8_signext at rel32@lo+4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_i8_signext at rel32@hi+12
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -703,23 +759,26 @@ define amdgpu_gfx void @test_call_external_void_func_i8_signext(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    global_load_sbyte v0, v[0:1], off glc dlc
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_i8_signext at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_i8_signext at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -739,12 +798,13 @@ define amdgpu_gfx void @test_call_external_void_func_i8_zeroext(i32) #0 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[34:35]
 ; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_i8_zeroext at rel32@lo+4
@@ -752,9 +812,10 @@ define amdgpu_gfx void @test_call_external_void_func_i8_zeroext(i32) #0 {
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -768,23 +829,26 @@ define amdgpu_gfx void @test_call_external_void_func_i8_zeroext(i32) #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX10-NEXT:    s_getpc_b64 s[34:35]
 ; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_i8_zeroext at rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_i8_zeroext at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -798,24 +862,28 @@ define amdgpu_gfx void @test_call_external_void_func_i8_zeroext(i32) #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    global_load_u8 v0, v[0:1], off glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_i8_zeroext at rel32@lo+4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_i8_zeroext at rel32@hi+12
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -829,23 +897,26 @@ define amdgpu_gfx void @test_call_external_void_func_i8_zeroext(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    global_load_ubyte v0, v[0:1], off glc dlc
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_i8_zeroext at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_i8_zeroext at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -865,11 +936,12 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm() #0 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[34:35]
 ; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_i16 at rel32@lo+4
@@ -877,9 +949,10 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm() #0 {
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -893,22 +966,25 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm() #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0x7b
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX10-NEXT:    s_getpc_b64 s[34:35]
 ; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_i16 at rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_i16 at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -922,23 +998,27 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm() #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0x7b
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_i16 at rel32@lo+4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_i16 at rel32@hi+12
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -952,22 +1032,25 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 0x7b
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_i16 at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_i16 at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -986,12 +1069,13 @@ define amdgpu_gfx void @test_call_external_void_func_i16_signext(i32) #0 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    global_load_ushort v0, v[0:1], off glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[34:35]
 ; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_i16_signext at rel32@lo+4
@@ -999,9 +1083,10 @@ define amdgpu_gfx void @test_call_external_void_func_i16_signext(i32) #0 {
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -1015,23 +1100,26 @@ define amdgpu_gfx void @test_call_external_void_func_i16_signext(i32) #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    global_load_ushort v0, v[0:1], off glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX10-NEXT:    s_getpc_b64 s[34:35]
 ; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_i16_signext at rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_i16_signext at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -1045,24 +1133,28 @@ define amdgpu_gfx void @test_call_external_void_func_i16_signext(i32) #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    global_load_u16 v0, v[0:1], off glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_i16_signext at rel32@lo+4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_i16_signext at rel32@hi+12
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -1076,23 +1168,26 @@ define amdgpu_gfx void @test_call_external_void_func_i16_signext(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    global_load_ushort v0, v[0:1], off glc dlc
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_i16_signext at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_i16_signext at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -1112,12 +1207,13 @@ define amdgpu_gfx void @test_call_external_void_func_i16_zeroext(i32) #0 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    global_load_ushort v0, v[0:1], off glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[34:35]
 ; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_i16_zeroext at rel32@lo+4
@@ -1125,9 +1221,10 @@ define amdgpu_gfx void @test_call_external_void_func_i16_zeroext(i32) #0 {
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -1141,23 +1238,26 @@ define amdgpu_gfx void @test_call_external_void_func_i16_zeroext(i32) #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    global_load_ushort v0, v[0:1], off glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX10-NEXT:    s_getpc_b64 s[34:35]
 ; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_i16_zeroext at rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_i16_zeroext at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -1171,24 +1271,28 @@ define amdgpu_gfx void @test_call_external_void_func_i16_zeroext(i32) #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    global_load_u16 v0, v[0:1], off glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_i16_zeroext at rel32@lo+4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_i16_zeroext at rel32@hi+12
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -1202,23 +1306,26 @@ define amdgpu_gfx void @test_call_external_void_func_i16_zeroext(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    global_load_ushort v0, v[0:1], off glc dlc
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_i16_zeroext at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_i16_zeroext at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -1238,11 +1345,12 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm(i32) #0 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 42
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[34:35]
 ; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_i32 at rel32@lo+4
@@ -1250,9 +1358,10 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm(i32) #0 {
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -1266,22 +1375,25 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm(i32) #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX10-NEXT:    s_getpc_b64 s[34:35]
 ; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_i32 at rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_i32 at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -1295,23 +1407,27 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm(i32) #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_i32 at rel32@lo+4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_i32 at rel32@hi+12
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -1325,22 +1441,25 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_i32 at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_i32 at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -1359,12 +1478,13 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm() #0 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x7b
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[34:35]
 ; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_i64 at rel32@lo+4
@@ -1372,9 +1492,10 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm() #0 {
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -1388,23 +1509,26 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm() #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0x7b
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_getpc_b64 s[34:35]
 ; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_i64 at rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_i64 at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -1418,23 +1542,27 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm() #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0x7b :: v_dual_mov_b32 v1, 0
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_i64 at rel32@lo+4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_i64 at rel32@hi+12
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -1448,24 +1576,27 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 0x7b
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v41, s0, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_i64 at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_i64 at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
-; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
+; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
@@ -1483,13 +1614,14 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64() #0 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[34:35]
 ; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v2i64 at rel32@lo+4
@@ -1497,9 +1629,10 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64() #0 {
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -1513,24 +1646,27 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64() #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX10-NEXT:    s_getpc_b64 s[34:35]
 ; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v2i64 at rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v2i64 at rel32@hi+12
 ; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -1544,25 +1680,29 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64() #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_v2i64 at rel32@lo+4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_v2i64 at rel32@hi+12
 ; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -1576,24 +1716,27 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_v2i64 at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v2i64 at rel32@hi+12
 ; GFX10-SCRATCH-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -1613,14 +1756,15 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm() #0 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 1
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 2
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 3
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 4
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[34:35]
 ; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v2i64 at rel32@lo+4
@@ -1628,9 +1772,10 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm() #0 {
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -1644,25 +1789,28 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm() #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 1
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 2
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 3
 ; GFX10-NEXT:    v_mov_b32_e32 v3, 4
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_getpc_b64 s[34:35]
 ; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v2i64 at rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v2i64 at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -1676,24 +1824,28 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm() #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2
 ; GFX11-NEXT:    v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_v2i64 at rel32@lo+4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_v2i64 at rel32@hi+12
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -1707,25 +1859,28 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 1
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, 2
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v2, 3
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v3, 4
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v41, s0, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_v2i64 at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v2i64 at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -1744,15 +1899,16 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64() #0 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 1
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 2
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[34:35]
 ; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v3i64 at rel32@lo+4
@@ -1760,9 +1916,10 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64() #0 {
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -1776,26 +1933,29 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64() #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 1
 ; GFX10-NEXT:    v_mov_b32_e32 v5, 2
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_getpc_b64 s[34:35]
 ; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v3i64 at rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v3i64 at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -1809,25 +1969,29 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64() #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v5, 2
 ; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v4, 1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
+; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_v3i64 at rel32@lo+4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_v3i64 at rel32@hi+12
-; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -1841,26 +2005,29 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v4, 1
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v5, 2
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v41, s0, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_v3i64 at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v3i64 at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -1882,17 +2049,18 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64() #0 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 1
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 2
 ; GFX9-NEXT:    v_mov_b32_e32 v6, 3
 ; GFX9-NEXT:    v_mov_b32_e32 v7, 4
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[34:35]
 ; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v4i64 at rel32@lo+4
@@ -1900,9 +2068,10 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64() #0 {
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -1916,28 +2085,31 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64() #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 1
 ; GFX10-NEXT:    v_mov_b32_e32 v5, 2
 ; GFX10-NEXT:    v_mov_b32_e32 v6, 3
 ; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v7, 4
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_getpc_b64 s[34:35]
 ; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v4i64 at rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v4i64 at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -1951,26 +2123,30 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64() #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v5, 2
 ; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v4, 1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    v_dual_mov_b32 v6, 3 :: v_dual_mov_b32 v7, 4
 ; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_v4i64 at rel32@lo+4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_v4i64 at rel32@hi+12
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -1984,28 +2160,31 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v4, 1
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v5, 2
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v6, 3
 ; GFX10-SCRATCH-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v7, 4
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v41, s0, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_v4i64 at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v4i64 at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -2026,11 +2205,12 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm() #0 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x4400
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[34:35]
 ; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_f16 at rel32@lo+4
@@ -2038,9 +2218,10 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm() #0 {
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -2054,22 +2235,25 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm() #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0x4400
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX10-NEXT:    s_getpc_b64 s[34:35]
 ; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_f16 at rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_f16 at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -2083,23 +2267,27 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm() #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0x4400
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_f16 at rel32@lo+4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_f16 at rel32@hi+12
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -2113,22 +2301,25 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 0x4400
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_f16 at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_f16 at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -2147,11 +2338,12 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm() #0 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 4.0
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[34:35]
 ; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_f32 at rel32@lo+4
@@ -2159,9 +2351,10 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm() #0 {
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -2175,22 +2368,25 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm() #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 4.0
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX10-NEXT:    s_getpc_b64 s[34:35]
 ; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_f32 at rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_f32 at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -2204,23 +2400,27 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm() #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 4.0
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_f32 at rel32@lo+4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_f32 at rel32@hi+12
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -2234,22 +2434,25 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 4.0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_f32 at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_f32 at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -2268,12 +2471,13 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm() #0 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 1.0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 2.0
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[34:35]
 ; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v2f32 at rel32@lo+4
@@ -2281,9 +2485,10 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm() #0 {
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -2297,23 +2502,26 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm() #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 1.0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 2.0
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_getpc_b64 s[34:35]
 ; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v2f32 at rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v2f32 at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -2327,23 +2535,27 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm() #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 1.0 :: v_dual_mov_b32 v1, 2.0
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_v2f32 at rel32@lo+4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_v2f32 at rel32@hi+12
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -2357,23 +2569,26 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 1.0
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, 2.0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v41, s0, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_v2f32 at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v2f32 at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -2392,13 +2607,14 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm() #0 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 1.0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 2.0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 4.0
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[34:35]
 ; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v3f32 at rel32@lo+4
@@ -2406,9 +2622,10 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm() #0 {
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -2422,24 +2639,27 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm() #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 1.0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 2.0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 4.0
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_getpc_b64 s[34:35]
 ; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v3f32 at rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v3f32 at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -2453,24 +2673,28 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm() #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 1.0 :: v_dual_mov_b32 v1, 2.0
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 4.0
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_v3f32 at rel32@lo+4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_v3f32 at rel32@hi+12
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -2484,24 +2708,27 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 1.0
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, 2.0
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v2, 4.0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v41, s0, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_v3f32 at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v3f32 at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -2520,8 +2747,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm() #0 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 1.0
@@ -2529,6 +2756,7 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 4.0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, -1.0
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0.5
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[34:35]
 ; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v5f32 at rel32@lo+4
@@ -2536,9 +2764,10 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm() #0 {
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -2552,26 +2781,29 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm() #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 1.0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 2.0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 4.0
 ; GFX10-NEXT:    v_mov_b32_e32 v3, -1.0
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0.5
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_getpc_b64 s[34:35]
 ; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v5f32 at rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v5f32 at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -2585,25 +2817,29 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm() #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 1.0 :: v_dual_mov_b32 v1, 2.0
 ; GFX11-NEXT:    v_dual_mov_b32 v2, 4.0 :: v_dual_mov_b32 v3, -1.0
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    v_mov_b32_e32 v4, 0.5
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_v5f32 at rel32@lo+4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_v5f32 at rel32@hi+12
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -2617,26 +2853,29 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 1.0
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, 2.0
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v2, 4.0
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v3, -1.0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v4, 0.5
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v41, s0, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_v5f32 at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v5f32 at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -2655,12 +2894,13 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm() #0 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x40100000
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[34:35]
 ; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_f64 at rel32@lo+4
@@ -2668,9 +2908,10 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm() #0 {
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -2684,23 +2925,26 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm() #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0x40100000
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_getpc_b64 s[34:35]
 ; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_f64 at rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_f64 at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -2714,23 +2958,27 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm() #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x40100000
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_f64 at rel32@lo+4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_f64 at rel32@hi+12
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -2744,23 +2992,26 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, 0x40100000
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v41, s0, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_f64 at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_f64 at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -2779,14 +3030,15 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm() #0 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 2.0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0x40100000
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[34:35]
 ; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v2f64 at rel32@lo+4
@@ -2794,9 +3046,10 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm() #0 {
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -2810,25 +3063,28 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm() #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 2.0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v3, 0x40100000
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_getpc_b64 s[34:35]
 ; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v2f64 at rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v2f64 at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -2842,24 +3098,28 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm() #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 2.0
 ; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0x40100000
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_v2f64 at rel32@lo+4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_v2f64 at rel32@hi+12
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -2873,25 +3133,28 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, 2.0
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v3, 0x40100000
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v41, s0, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_v2f64 at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v2f64 at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -2910,8 +3173,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm() #0 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
@@ -2920,6 +3183,7 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0x40100000
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 0x40200000
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[34:35]
 ; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v3f64 at rel32@lo+4
@@ -2927,9 +3191,10 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm() #0 {
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -2943,27 +3208,30 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm() #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 2.0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v3, 0x40100000
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v5, 0x40200000
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_getpc_b64 s[34:35]
 ; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v3f64 at rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v3f64 at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -2977,25 +3245,29 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm() #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 2.0
 ; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0x40100000
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 0x40200000
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_v3f64 at rel32@lo+4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_v3f64 at rel32@hi+12
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -3009,27 +3281,30 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, 2.0
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v3, 0x40100000
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v5, 0x40200000
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v41, s0, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_v3f64 at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v3f64 at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -3048,11 +3323,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16() #0 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    global_load_dword v0, v[0:1], off
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[34:35]
 ; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v2i16 at rel32@lo+4
@@ -3060,9 +3336,10 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16() #0 {
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -3076,22 +3353,25 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16() #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    global_load_dword v0, v[0:1], off
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX10-NEXT:    s_getpc_b64 s[34:35]
 ; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v2i16 at rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v2i16 at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -3105,23 +3385,27 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16() #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_v2i16 at rel32@lo+4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_v2i16 at rel32@hi+12
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -3135,22 +3419,25 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    global_load_dword v0, v[0:1], off
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_v2i16 at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v2i16 at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -3170,11 +3457,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16() #0 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[34:35]
 ; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v3i16 at rel32@lo+4
@@ -3182,9 +3470,10 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16() #0 {
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -3198,22 +3487,25 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16() #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX10-NEXT:    s_getpc_b64 s[34:35]
 ; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v3i16 at rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v3i16 at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -3227,23 +3519,27 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16() #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_v3i16 at rel32@lo+4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_v3i16 at rel32@hi+12
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -3257,22 +3553,25 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_v3i16 at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v3i16 at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -3292,11 +3591,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16() #0 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[34:35]
 ; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v3f16 at rel32@lo+4
@@ -3304,9 +3604,10 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16() #0 {
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -3320,22 +3621,25 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16() #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX10-NEXT:    s_getpc_b64 s[34:35]
 ; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v3f16 at rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v3f16 at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -3349,23 +3653,27 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16() #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_v3f16 at rel32@lo+4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_v3f16 at rel32@hi+12
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -3379,22 +3687,25 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_v3f16 at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v3f16 at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -3414,12 +3725,13 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm() #0 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x20001
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 3
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[34:35]
 ; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v3i16 at rel32@lo+4
@@ -3427,9 +3739,10 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm() #0 {
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -3443,23 +3756,26 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm() #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0x20001
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 3
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_getpc_b64 s[34:35]
 ; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v3i16 at rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v3i16 at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -3473,23 +3789,27 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm() #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0x20001 :: v_dual_mov_b32 v1, 3
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_v3i16 at rel32@lo+4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_v3i16 at rel32@hi+12
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -3503,23 +3823,26 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 0x20001
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, 3
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v41, s0, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_v3i16 at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v3i16 at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -3538,12 +3861,13 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm() #0 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x40003c00
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x4400
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[34:35]
 ; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v3f16 at rel32@lo+4
@@ -3551,9 +3875,10 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm() #0 {
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -3567,23 +3892,26 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm() #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0x40003c00
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0x4400
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_getpc_b64 s[34:35]
 ; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v3f16 at rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v3f16 at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -3597,24 +3925,28 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm() #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0x40003c00
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0x4400
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_v3f16 at rel32@lo+4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_v3f16 at rel32@hi+12
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -3628,23 +3960,26 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 0x40003c00
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, 0x4400
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v41, s0, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_v3f16 at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v3f16 at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -3663,11 +3998,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16() #0 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[34:35]
 ; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v4i16 at rel32@lo+4
@@ -3675,9 +4011,10 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16() #0 {
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -3691,22 +4028,25 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16() #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX10-NEXT:    s_getpc_b64 s[34:35]
 ; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v4i16 at rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v4i16 at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -3720,23 +4060,27 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16() #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_v4i16 at rel32@lo+4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_v4i16 at rel32@hi+12
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -3750,22 +4094,25 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_v4i16 at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v4i16 at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -3785,12 +4132,13 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm() #0 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x20001
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x40003
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[34:35]
 ; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v4i16 at rel32@lo+4
@@ -3798,9 +4146,10 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm() #0 {
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -3814,23 +4163,26 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm() #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0x20001
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0x40003
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_getpc_b64 s[34:35]
 ; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v4i16 at rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v4i16 at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -3844,24 +4196,28 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm() #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0x20001
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0x40003
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_v4i16 at rel32@lo+4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_v4i16 at rel32@hi+12
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -3875,23 +4231,26 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 0x20001
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, 0x40003
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v41, s0, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_v4i16 at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v4i16 at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -3910,11 +4269,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16() #0 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    global_load_dword v0, v[0:1], off
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[34:35]
 ; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v2f16 at rel32@lo+4
@@ -3922,9 +4282,10 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16() #0 {
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -3938,22 +4299,25 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16() #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    global_load_dword v0, v[0:1], off
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX10-NEXT:    s_getpc_b64 s[34:35]
 ; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v2f16 at rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v2f16 at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -3967,23 +4331,27 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16() #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_v2f16 at rel32@lo+4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_v2f16 at rel32@hi+12
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -3997,22 +4365,25 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    global_load_dword v0, v[0:1], off
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_v2f16 at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v2f16 at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -4032,11 +4403,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32() #0 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[34:35]
 ; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v2i32 at rel32@lo+4
@@ -4044,9 +4416,10 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32() #0 {
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -4060,22 +4433,25 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32() #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX10-NEXT:    s_getpc_b64 s[34:35]
 ; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v2i32 at rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v2i32 at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -4089,23 +4465,27 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32() #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_v2i32 at rel32@lo+4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_v2i32 at rel32@hi+12
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -4119,22 +4499,25 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_v2i32 at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v2i32 at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -4154,12 +4537,13 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm() #0 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 1
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 2
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[34:35]
 ; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v2i32 at rel32@lo+4
@@ -4167,9 +4551,10 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm() #0 {
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -4183,23 +4568,26 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm() #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 1
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 2
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_getpc_b64 s[34:35]
 ; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v2i32 at rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v2i32 at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -4213,23 +4601,27 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm() #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_v2i32 at rel32@lo+4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_v2i32 at rel32@hi+12
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -4243,23 +4635,26 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 1
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, 2
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v41, s0, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_v2i32 at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v2i32 at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -4278,13 +4673,14 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm(i32) #0 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 3
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 4
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 5
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[34:35]
 ; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v3i32 at rel32@lo+4
@@ -4292,9 +4688,10 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm(i32) #0 {
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -4308,24 +4705,27 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm(i32) #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 3
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 4
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 5
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_getpc_b64 s[34:35]
 ; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v3i32 at rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v3i32 at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -4339,24 +4739,28 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm(i32) #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 4
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 5
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_v3i32 at rel32@lo+4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_v3i32 at rel32@hi+12
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -4370,24 +4774,27 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 3
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, 4
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v2, 5
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v41, s0, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_v3i32 at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v3i32 at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -4406,14 +4813,15 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32(i32) #0 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 3
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 4
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 5
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 6
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[34:35]
 ; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v3i32_i32 at rel32@lo+4
@@ -4421,9 +4829,10 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32(i32) #0 {
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -4437,25 +4846,28 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32(i32) #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 3
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 4
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 5
 ; GFX10-NEXT:    v_mov_b32_e32 v3, 6
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_getpc_b64 s[34:35]
 ; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v3i32_i32 at rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v3i32_i32 at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -4469,24 +4881,28 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32(i32) #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 4
 ; GFX11-NEXT:    v_dual_mov_b32 v2, 5 :: v_dual_mov_b32 v3, 6
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_v3i32_i32 at rel32@lo+4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_v3i32_i32 at rel32@hi+12
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -4500,25 +4916,28 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 3
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, 4
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v2, 5
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v3, 6
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v41, s0, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_v3i32_i32 at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v3i32_i32 at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -4537,11 +4956,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32() #0 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[34:35]
 ; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v4i32 at rel32@lo+4
@@ -4549,9 +4969,10 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32() #0 {
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -4565,22 +4986,25 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32() #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX10-NEXT:    s_getpc_b64 s[34:35]
 ; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v4i32 at rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v4i32 at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -4594,23 +5018,27 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32() #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_v4i32 at rel32@lo+4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_v4i32 at rel32@hi+12
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -4624,22 +5052,25 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_v4i32 at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v4i32 at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -4659,14 +5090,15 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm() #0 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 1
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 2
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 3
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 4
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[34:35]
 ; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v4i32 at rel32@lo+4
@@ -4674,9 +5106,10 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm() #0 {
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -4690,25 +5123,28 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm() #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 1
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 2
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 3
 ; GFX10-NEXT:    v_mov_b32_e32 v3, 4
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_getpc_b64 s[34:35]
 ; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v4i32 at rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v4i32 at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -4722,24 +5158,28 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm() #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2
 ; GFX11-NEXT:    v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_v4i32 at rel32@lo+4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_v4i32 at rel32@hi+12
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -4753,25 +5193,28 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 1
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, 2
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v2, 3
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v3, 4
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v41, s0, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_v4i32 at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v4i32 at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -4790,8 +5233,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm() #0 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 1
@@ -4799,6 +5242,7 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 3
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 4
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 5
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[34:35]
 ; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v5i32 at rel32@lo+4
@@ -4806,9 +5250,10 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm() #0 {
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -4822,26 +5267,29 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm() #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 1
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 2
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 3
 ; GFX10-NEXT:    v_mov_b32_e32 v3, 4
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 5
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_getpc_b64 s[34:35]
 ; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v5i32 at rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v5i32 at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -4855,25 +5303,29 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm() #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2
 ; GFX11-NEXT:    v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    v_mov_b32_e32 v4, 5
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_v5i32 at rel32@lo+4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_v5i32 at rel32@hi+12
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -4887,26 +5339,29 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 1
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, 2
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v2, 3
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v3, 4
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v4, 5
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v41, s0, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_v5i32 at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v5i32 at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -4925,8 +5380,9 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32() #0 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
@@ -4941,9 +5397,10 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32() #0 {
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -4957,27 +5414,30 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32() #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX10-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v8, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    global_load_dwordx4 v[0:3], v8, s[34:35]
 ; GFX10-NEXT:    global_load_dwordx4 v[4:7], v8, s[34:35] offset:16
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_getpc_b64 s[34:35]
 ; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v8i32 at rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v8i32 at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -4991,28 +5451,32 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32() #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX11-NEXT:    v_mov_b32_e32 v4, 0
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_load_b128 v[0:3], v4, s[0:1]
 ; GFX11-NEXT:    global_load_b128 v[4:7], v4, s[0:1] offset:16
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_v8i32 at rel32@lo+4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_v8i32 at rel32@hi+12
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -5026,27 +5490,30 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX10-SCRATCH-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v8, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_clause 0x1
 ; GFX10-SCRATCH-NEXT:    global_load_dwordx4 v[0:3], v8, s[0:1]
 ; GFX10-SCRATCH-NEXT:    global_load_dwordx4 v[4:7], v8, s[0:1] offset:16
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_v8i32 at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v8i32 at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -5067,8 +5534,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm() #0 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 1
@@ -5079,6 +5546,7 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 6
 ; GFX9-NEXT:    v_mov_b32_e32 v6, 7
 ; GFX9-NEXT:    v_mov_b32_e32 v7, 8
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[34:35]
 ; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v8i32 at rel32@lo+4
@@ -5086,9 +5554,10 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm() #0 {
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -5102,19 +5571,20 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm() #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 1
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 2
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 3
 ; GFX10-NEXT:    v_mov_b32_e32 v3, 4
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 5
 ; GFX10-NEXT:    v_mov_b32_e32 v5, 6
 ; GFX10-NEXT:    v_mov_b32_e32 v6, 7
 ; GFX10-NEXT:    v_mov_b32_e32 v7, 8
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_getpc_b64 s[34:35]
 ; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v8i32 at rel32@lo+4
@@ -5122,9 +5592,11 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm() #0 {
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -5138,15 +5610,17 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm() #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2
 ; GFX11-NEXT:    v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    v_dual_mov_b32 v4, 5 :: v_dual_mov_b32 v5, 6
 ; GFX11-NEXT:    v_dual_mov_b32 v6, 7 :: v_dual_mov_b32 v7, 8
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_v8i32 at rel32@lo+4
@@ -5155,9 +5629,11 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm() #0 {
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -5171,19 +5647,20 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 1
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, 2
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v2, 3
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v3, 4
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v4, 5
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v5, 6
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v6, 7
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v7, 8
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_v8i32 at rel32@lo+4
@@ -5191,9 +5668,11 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -5212,8 +5691,9 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32() #0 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v16, 0
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
@@ -5230,9 +5710,10 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32() #0 {
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -5246,11 +5727,13 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32() #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX10-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v16, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x3
@@ -5258,17 +5741,18 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32() #0 {
 ; GFX10-NEXT:    global_load_dwordx4 v[4:7], v16, s[34:35] offset:16
 ; GFX10-NEXT:    global_load_dwordx4 v[8:11], v16, s[34:35] offset:32
 ; GFX10-NEXT:    global_load_dwordx4 v[12:15], v16, s[34:35] offset:48
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_getpc_b64 s[34:35]
 ; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v16i32 at rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v16i32 at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -5282,11 +5766,14 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32() #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX11-NEXT:    v_mov_b32_e32 v12, 0
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x3
@@ -5294,18 +5781,19 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32() #0 {
 ; GFX11-NEXT:    global_load_b128 v[4:7], v12, s[0:1] offset:16
 ; GFX11-NEXT:    global_load_b128 v[8:11], v12, s[0:1] offset:32
 ; GFX11-NEXT:    global_load_b128 v[12:15], v12, s[0:1] offset:48
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_v16i32 at rel32@lo+4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_v16i32 at rel32@hi+12
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -5319,11 +5807,13 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX10-SCRATCH-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v16, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_clause 0x3
@@ -5331,17 +5821,18 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32() #0 {
 ; GFX10-SCRATCH-NEXT:    global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
 ; GFX10-SCRATCH-NEXT:    global_load_dwordx4 v[8:11], v16, s[0:1] offset:32
 ; GFX10-SCRATCH-NEXT:    global_load_dwordx4 v[12:15], v16, s[0:1] offset:48
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_v16i32 at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v16i32 at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -5362,8 +5853,9 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v28, 0
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
@@ -5385,9 +5877,10 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 {
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -5401,11 +5894,13 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX10-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v32, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x7
@@ -5417,17 +5912,18 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 {
 ; GFX10-NEXT:    global_load_dwordx4 v[20:23], v32, s[34:35] offset:80
 ; GFX10-NEXT:    global_load_dwordx4 v[24:27], v32, s[34:35] offset:96
 ; GFX10-NEXT:    global_load_dwordx4 v[28:31], v32, s[34:35] offset:112
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_getpc_b64 s[34:35]
 ; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v32i32 at rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v32i32 at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -5441,11 +5937,14 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX11-NEXT:    v_mov_b32_e32 v28, 0
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x7
@@ -5457,18 +5956,19 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 {
 ; GFX11-NEXT:    global_load_b128 v[20:23], v28, s[0:1] offset:80
 ; GFX11-NEXT:    global_load_b128 v[24:27], v28, s[0:1] offset:96
 ; GFX11-NEXT:    global_load_b128 v[28:31], v28, s[0:1] offset:112
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_v32i32 at rel32@lo+4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_v32i32 at rel32@hi+12
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -5482,11 +5982,13 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX10-SCRATCH-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v32, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_clause 0x7
@@ -5498,17 +6000,18 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 {
 ; GFX10-SCRATCH-NEXT:    global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
 ; GFX10-SCRATCH-NEXT:    global_load_dwordx4 v[24:27], v32, s[0:1] offset:96
 ; GFX10-SCRATCH-NEXT:    global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_v32i32 at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v32i32 at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -5529,8 +6032,9 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v28, 0
 ; GFX9-NEXT:    global_load_dword v32, v[0:1], off
@@ -5555,9 +6059,10 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 {
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -5571,11 +6076,13 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX10-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v32, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    global_load_dword v33, v[0:1], off
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
@@ -5588,19 +6095,20 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 {
 ; GFX10-NEXT:    global_load_dwordx4 v[20:23], v32, s[34:35] offset:80
 ; GFX10-NEXT:    global_load_dwordx4 v[24:27], v32, s[34:35] offset:96
 ; GFX10-NEXT:    global_load_dwordx4 v[28:31], v32, s[34:35] offset:112
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_getpc_b64 s[34:35]
 ; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v32i32_i32 at rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v32i32_i32 at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_waitcnt vmcnt(8)
 ; GFX10-NEXT:    buffer_store_dword v33, off, s[0:3], s32
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -5614,11 +6122,14 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX11-NEXT:    v_mov_b32_e32 v28, 0
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
 ; GFX11-NEXT:    global_load_b32 v32, v[0:1], off
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
@@ -5631,19 +6142,20 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 {
 ; GFX11-NEXT:    global_load_b128 v[20:23], v28, s[0:1] offset:80
 ; GFX11-NEXT:    global_load_b128 v[24:27], v28, s[0:1] offset:96
 ; GFX11-NEXT:    global_load_b128 v[28:31], v28, s[0:1] offset:112
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_v32i32_i32 at rel32@lo+4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_v32i32_i32 at rel32@hi+12
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_waitcnt vmcnt(8)
 ; GFX11-NEXT:    scratch_store_b32 off, v32, s32
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -5657,11 +6169,13 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX10-SCRATCH-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v32, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    global_load_dword v33, v[0:1], off
 ; GFX10-SCRATCH-NEXT:    s_waitcnt lgkmcnt(0)
@@ -5674,19 +6188,20 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
 ; GFX10-SCRATCH-NEXT:    global_load_dwordx4 v[24:27], v32, s[0:1] offset:96
 ; GFX10-SCRATCH-NEXT:    global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_v32i32_i32 at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v32i32_i32 at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(8)
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v33, s32
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -5708,14 +6223,15 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(ptr addrspace(1) %ou
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
-; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    s_addk_i32 s32, 0x800
 ; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v41, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 42
+; GFX9-NEXT:    v_writelane_b32 v43, s34, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    v_mov_b32_e32 v42, v1
 ; GFX9-NEXT:    s_getpc_b64 s[34:35]
@@ -5728,11 +6244,12 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(ptr addrspace(1) %ou
 ; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s34, v43, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX9-NEXT:    s_addk_i32 s32, 0xf800
 ; GFX9-NEXT:    s_mov_b32 s33, s34
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -5744,20 +6261,21 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(ptr addrspace(1) %ou
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
 ; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v41, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 42
-; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    s_addk_i32 s32, 0x400
+; GFX10-NEXT:    v_writelane_b32 v43, s34, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    v_mov_b32_e32 v42, v1
 ; GFX10-NEXT:    s_getpc_b64 s[34:35]
 ; GFX10-NEXT:    s_add_u32 s34, s34, external_i32_func_i32 at rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s35, s35, external_i32_func_i32 at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    global_store_dword v[41:42], v0, off
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -5766,12 +6284,14 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(ptr addrspace(1) %ou
 ; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s34, v43, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8
+; GFX10-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:12
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
+; GFX10-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX10-NEXT:    s_mov_b32 s33, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -5782,20 +6302,23 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(ptr addrspace(1) %ou
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:8 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:8
+; GFX11-NEXT:    scratch_store_b32 off, v43, s33 offset:12
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    scratch_store_b32 off, v42, s33
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    v_dual_mov_b32 v42, v1 :: v_dual_mov_b32 v41, v0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 42
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    s_add_i32 s32, s32, 32
+; GFX11-NEXT:    v_writelane_b32 v43, s0, 0
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, external_i32_func_i32 at rel32@lo+4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, external_i32_func_i32 at rel32@hi+12
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    global_store_b32 v[41:42], v0, off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -5804,11 +6327,13 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(ptr addrspace(1) %ou
 ; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s0, v43, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:8 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:8
+; GFX11-NEXT:    scratch_load_b32 v43, off, s33 offset:12
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    s_add_i32 s32, s32, -16
+; GFX11-NEXT:    s_addk_i32 s32, 0xffe0
 ; GFX11-NEXT:    s_mov_b32 s33, s0
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
@@ -5820,20 +6345,21 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(ptr addrspace(1) %ou
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 offset:8 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v43, s33 offset:12 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v42, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v41, v0
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 42
-; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 32
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v43, s0, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v42, v1
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_i32_func_i32 at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_i32_func_i32 at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT:    global_store_dword v[41:42], v0, off
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -5842,12 +6368,14 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(ptr addrspace(1) %ou
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v43, 0
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 offset:8 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 offset:8
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v43, off, s33 offset:12
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
+; GFX10-SCRATCH-NEXT:    s_addk_i32 s32, 0xffe0
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
@@ -5864,8 +6392,9 @@ define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
@@ -5880,9 +6409,10 @@ define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 {
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -5896,27 +6426,30 @@ define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX10-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    global_load_ubyte v0, v2, s[34:35]
 ; GFX10-NEXT:    global_load_dword v1, v2, s[34:35] offset:4
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_getpc_b64 s[34:35]
 ; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_struct_i8_i32 at rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_struct_i8_i32 at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -5930,28 +6463,32 @@ define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_load_u8 v0, v1, s[0:1]
 ; GFX11-NEXT:    global_load_b32 v1, v1, s[0:1] offset:4
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_struct_i8_i32 at rel32@lo+4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_struct_i8_i32 at rel32@hi+12
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -5965,27 +6502,30 @@ define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX10-SCRATCH-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_clause 0x1
 ; GFX10-SCRATCH-NEXT:    global_load_ubyte v0, v2, s[0:1]
 ; GFX10-SCRATCH-NEXT:    global_load_dword v1, v2, s[0:1] offset:4
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_struct_i8_i32 at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_struct_i8_i32 at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -6006,15 +6546,16 @@ define amdgpu_gfx void @test_call_external_void_func_byval_struct_i8_i32() #0 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 3
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
 ; GFX9-NEXT:    buffer_store_byte v0, off, s[0:3], s33
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 8
-; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    s_addk_i32 s32, 0x800
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:4
 ; GFX9-NEXT:    v_lshrrev_b32_e64 v0, 6, s33
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[34:35]
 ; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_byval_struct_i8_i32 at rel32@lo+4
@@ -6022,11 +6563,12 @@ define amdgpu_gfx void @test_call_external_void_func_byval_struct_i8_i32() #0 {
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX9-NEXT:    s_addk_i32 s32, 0xf800
 ; GFX9-NEXT:    s_mov_b32 s33, s34
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -6038,29 +6580,32 @@ define amdgpu_gfx void @test_call_external_void_func_byval_struct_i8_i32() #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 3
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 8
-; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_byval_struct_i8_i32 at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_byval_struct_i8_i32 at rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    s_addk_i32 s32, 0x400
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX10-NEXT:    buffer_store_byte v0, off, s[0:3], s33
 ; GFX10-NEXT:    buffer_store_dword v1, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    v_lshrrev_b32_e64 v0, 5, s33
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_getpc_b64 s[34:35]
+; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_byval_struct_i8_i32 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_byval_struct_i8_i32 at rel32@hi+12
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:12
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
+; GFX10-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX10-NEXT:    s_mov_b32 s33, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -6071,29 +6616,33 @@ define amdgpu_gfx void @test_call_external_void_func_byval_struct_i8_i32() #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:8 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:8
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:12
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 8
-; GFX11-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-NEXT:    s_getpc_b64 s[0:1]
-; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_byval_struct_i8_i32 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_byval_struct_i8_i32 at rel32@hi+12
 ; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-NEXT:    s_add_i32 s32, s32, 32
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    scratch_store_b8 off, v0, s33
 ; GFX11-NEXT:    scratch_store_b32 off, v1, s33 offset:4
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s33
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_byval_struct_i8_i32 at rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_byval_struct_i8_i32 at rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:8 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:8
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:12
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    s_add_i32 s32, s32, -16
+; GFX11-NEXT:    s_addk_i32 s32, 0xffe0
 ; GFX11-NEXT:    s_mov_b32 s33, s0
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
@@ -6105,29 +6654,32 @@ define amdgpu_gfx void @test_call_external_void_func_byval_struct_i8_i32() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 offset:8 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:12 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 3
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, 8
-; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
-; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_byval_struct_i8_i32 at rel32@lo+4
-; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_byval_struct_i8_i32 at rel32@hi+12
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 32
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX10-SCRATCH-NEXT:    scratch_store_byte off, v0, s33
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v1, s33 offset:4
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, s33
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
+; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_byval_struct_i8_i32 at rel32@lo+4
+; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_byval_struct_i8_i32 at rel32@hi+12
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 offset:8 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 offset:8
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:12
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
+; GFX10-SCRATCH-NEXT:    s_addk_i32 s32, 0xffe0
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
@@ -6148,17 +6700,18 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 3
 ; GFX9-NEXT:    buffer_store_byte v0, off, s[0:3], s33
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 8
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:4
 ; GFX9-NEXT:    v_lshrrev_b32_e64 v0, 6, s33
 ; GFX9-NEXT:    s_addk_i32 s32, 0x800
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_add_u32_e32 v0, 8, v0
 ; GFX9-NEXT:    v_lshrrev_b32_e64 v1, 6, s33
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[34:35]
 ; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32 at rel32@lo+4
@@ -6168,7 +6721,7 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st
 ; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s33 offset:12
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    global_store_byte v[0:1], v0, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -6176,6 +6729,7 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xf800
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -6189,36 +6743,39 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 3
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 8
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0x400
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32 at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32 at rel32@hi+12
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX10-NEXT:    buffer_store_byte v0, off, s[0:3], s33
 ; GFX10-NEXT:    buffer_store_dword v1, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    v_lshrrev_b32_e64 v0, 5, s33
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_lshrrev_b32_e64 v1, 5, s33
-; GFX10-NEXT:    v_add_nc_u32_e32 v0, 8, v0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_getpc_b64 s[34:35]
+; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32 at rel32@hi+12
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, 8, v0
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    buffer_load_ubyte v0, off, s[0:3], s33 offset:8
 ; GFX10-NEXT:    buffer_load_dword v1, off, s[0:3], s33 offset:12
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    global_store_byte v[0:1], v0, off
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    global_store_dword v[0:1], v1, off
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:16
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:20
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfc00
@@ -6232,11 +6789,13 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:16 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:16
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:20
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 8
 ; GFX11-NEXT:    s_add_i32 s32, s32, 32
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32 at rel32@lo+4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32 at rel32@hi+12
@@ -6253,14 +6812,16 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st
 ; GFX11-NEXT:    scratch_load_b32 v1, off, s33 offset:12
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b8 v[0:1], v0, off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    global_store_b32 v[0:1], v1, off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:16 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:16
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:20
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_addk_i32 s32, 0xffe0
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -6274,12 +6835,13 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 offset:16 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:20 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 3
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 32
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, 8
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32 at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32 at rel32@hi+12
@@ -6296,14 +6858,16 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v1, off, s33 offset:12
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT:    global_store_byte v[0:1], v0, off
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-SCRATCH-NEXT:    global_store_dword v[0:1], v1, off
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 offset:16 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 offset:16
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:20
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    s_addk_i32 s32, 0xffe0
@@ -6335,8 +6899,9 @@ define amdgpu_gfx void @test_call_external_void_func_v16i8() #0 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
@@ -6369,9 +6934,10 @@ define amdgpu_gfx void @test_call_external_void_func_v16i8() #0 {
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -6385,13 +6951,14 @@ define amdgpu_gfx void @test_call_external_void_func_v16i8() #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX10-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
-; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx4 v[0:3], v0, s[34:35]
@@ -6421,9 +6988,11 @@ define amdgpu_gfx void @test_call_external_void_func_v16i8() #0 {
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -6437,13 +7006,15 @@ define amdgpu_gfx void @test_call_external_void_func_v16i8() #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-NEXT:    s_add_i32 s32, s32, 16
 ; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-NEXT:    s_add_i32 s32, s32, 16
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b128 v[0:3], v0, s[0:1]
@@ -6470,9 +7041,11 @@ define amdgpu_gfx void @test_call_external_void_func_v16i8() #0 {
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -6486,13 +7059,14 @@ define amdgpu_gfx void @test_call_external_void_func_v16i8() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX10-SCRATCH-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 0
-; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    global_load_dwordx4 v[0:3], v0, s[0:1]
@@ -6522,9 +7096,11 @@ define amdgpu_gfx void @test_call_external_void_func_v16i8() #0 {
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -6541,7 +7117,7 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
 ; GFX9-LABEL: tail_call_byval_align16:
 ; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s6, s33
+; GFX9-NEXT:    s_mov_b32 s8, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
@@ -6626,14 +7202,14 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xf800
-; GFX9-NEXT:    s_mov_b32 s33, s6
+; GFX9-NEXT:    s_mov_b32 s33, s8
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: tail_call_byval_align16:
 ; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    s_mov_b32 s6, s33
+; GFX10-NEXT:    s_mov_b32 s7, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
@@ -6721,14 +7297,14 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s4
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX10-NEXT:    s_mov_b32 s33, s6
+; GFX10-NEXT:    s_mov_b32 s33, s7
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: tail_call_byval_align16:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s4, s33
+; GFX11-NEXT:    s_mov_b32 s5, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:24 ; 4-byte Folded Spill
@@ -6811,14 +7387,14 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
 ; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:24 ; 4-byte Folded Reload
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX11-NEXT:    s_addk_i32 s32, 0xffe0
-; GFX11-NEXT:    s_mov_b32 s33, s4
+; GFX11-NEXT:    s_mov_b32 s33, s5
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: tail_call_byval_align16:
 ; GFX10-SCRATCH:       ; %bb.0: ; %entry
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, s33
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s5, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 offset:24 ; 4-byte Folded Spill
@@ -6903,7 +7479,7 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_addk_i32 s32, 0xffe0
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s4
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s5
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -6921,11 +7497,12 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm_inreg() #0 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 1
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[34:35]
 ; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_i1_inreg at rel32@lo+4
@@ -6934,9 +7511,10 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm_inreg() #0 {
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -6950,23 +7528,26 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm_inreg() #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX10-NEXT:    s_getpc_b64 s[34:35]
 ; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_i1_inreg at rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_i1_inreg at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-NEXT:    buffer_store_byte v0, off, s[0:3], s32
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    buffer_store_byte v0, off, s[0:3], s32
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -6980,24 +7561,27 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm_inreg() #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 1
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_i1_inreg at rel32@lo+4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_i1_inreg at rel32@hi+12
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX11-NEXT:    scratch_store_b8 off, v0, s32
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-NEXT:    scratch_store_b8 off, v0, s32
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -7011,23 +7595,26 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_i1_inreg at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_i1_inreg at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-SCRATCH-NEXT:    scratch_store_byte off, v0, s32
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT:    scratch_store_byte off, v0, s32
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -7046,12 +7633,13 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm_inreg(i32) #0 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 3
 ; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX9-NEXT:    s_movk_i32 s4, 0x7b
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX9-NEXT:    s_getpc_b64 s[34:35]
 ; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_i8_inreg at rel32@lo+4
@@ -7060,9 +7648,10 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm_inreg(i32) #0 {
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 3
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -7076,24 +7665,27 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm_inreg(i32) #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 3
+; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-NEXT:    s_movk_i32 s4, 0x7b
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX10-NEXT:    s_getpc_b64 s[34:35]
 ; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_i8_inreg at rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_i8_inreg at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-NEXT:    s_movk_i32 s4, 0x7b
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 3
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -7107,15 +7699,17 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm_inreg(i32) #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 3
+; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX11-NEXT:    s_movk_i32 s4, 0x7b
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_i8_inreg at rel32@lo+4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_i8_inreg at rel32@hi+12
-; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX11-NEXT:    s_movk_i32 s4, 0x7b
 ; GFX11-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
@@ -7123,9 +7717,11 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm_inreg(i32) #0 {
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 3
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -7139,24 +7735,27 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm_inreg(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 3
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT:    s_movk_i32 s4, 0x7b
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_i8_inreg at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_i8_inreg at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT:    s_movk_i32 s4, 0x7b
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 3
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -7175,12 +7774,13 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 3
 ; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX9-NEXT:    s_movk_i32 s4, 0x7b
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX9-NEXT:    s_getpc_b64 s[34:35]
 ; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_i16_inreg at rel32@lo+4
@@ -7189,9 +7789,10 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 {
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 3
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -7205,24 +7806,27 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 3
+; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-NEXT:    s_movk_i32 s4, 0x7b
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX10-NEXT:    s_getpc_b64 s[34:35]
 ; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_i16_inreg at rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_i16_inreg at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-NEXT:    s_movk_i32 s4, 0x7b
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 3
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -7236,15 +7840,17 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 3
+; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX11-NEXT:    s_movk_i32 s4, 0x7b
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_i16_inreg at rel32@lo+4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_i16_inreg at rel32@hi+12
-; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX11-NEXT:    s_movk_i32 s4, 0x7b
 ; GFX11-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
@@ -7252,9 +7858,11 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 {
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 3
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -7268,24 +7876,27 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 3
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT:    s_movk_i32 s4, 0x7b
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_i16_inreg at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_i16_inreg at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT:    s_movk_i32 s4, 0x7b
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 3
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -7304,12 +7915,13 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 3
 ; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX9-NEXT:    s_mov_b32 s4, 42
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX9-NEXT:    s_getpc_b64 s[34:35]
 ; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_i32_inreg at rel32@lo+4
@@ -7318,9 +7930,10 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 {
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 3
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -7334,24 +7947,27 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 3
+; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-NEXT:    s_mov_b32 s4, 42
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX10-NEXT:    s_getpc_b64 s[34:35]
 ; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_i32_inreg at rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_i32_inreg at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-NEXT:    s_mov_b32 s4, 42
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 3
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -7365,15 +7981,17 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 3
+; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX11-NEXT:    s_mov_b32 s4, 42
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_i32_inreg at rel32@lo+4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_i32_inreg at rel32@hi+12
-; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX11-NEXT:    s_mov_b32 s4, 42
 ; GFX11-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
@@ -7381,9 +7999,11 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 {
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 3
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -7397,24 +8017,27 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 3
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 42
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_i32_inreg at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_i32_inreg at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 42
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 3
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -7433,14 +8056,15 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 4
 ; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX9-NEXT:    s_movk_i32 s4, 0x7b
 ; GFX9-NEXT:    s_mov_b32 s5, 0
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX9-NEXT:    s_getpc_b64 s[34:35]
 ; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_i64_inreg at rel32@lo+4
@@ -7450,9 +8074,10 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 {
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 4
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -7466,15 +8091,16 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 4
+; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-NEXT:    s_movk_i32 s4, 0x7b
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX10-NEXT:    s_getpc_b64 s[34:35]
 ; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_i64_inreg at rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_i64_inreg at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-NEXT:    s_movk_i32 s4, 0x7b
 ; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX10-NEXT:    s_mov_b32 s5, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 2
@@ -7484,9 +8110,11 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 {
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 4
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -7500,15 +8128,17 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 4
+; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX11-NEXT:    s_movk_i32 s4, 0x7b
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_i64_inreg at rel32@lo+4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_i64_inreg at rel32@hi+12
-; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX11-NEXT:    s_movk_i32 s4, 0x7b
 ; GFX11-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX11-NEXT:    s_mov_b32 s5, 0
 ; GFX11-NEXT:    v_writelane_b32 v40, s30, 2
@@ -7519,9 +8149,11 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 {
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 4
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -7535,15 +8167,16 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 4
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT:    s_movk_i32 s4, 0x7b
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_i64_inreg at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_i64_inreg at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT:    s_movk_i32 s4, 0x7b
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s5, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 2
@@ -7553,9 +8186,11 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 4
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -7574,10 +8209,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 6
 ; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s6, 2
 ; GFX9-NEXT:    s_mov_b64 s[34:35], 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s7, 3
@@ -7595,9 +8231,10 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 {
 ; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 6
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -7611,12 +8248,13 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 6
+; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX10-NEXT:    s_mov_b64 s[34:35], 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX10-NEXT:    v_writelane_b32 v40, s6, 2
 ; GFX10-NEXT:    v_writelane_b32 v40, s7, 3
@@ -7633,9 +8271,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 {
 ; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 6
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -7649,12 +8289,14 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 6
+; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX11-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX11-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX11-NEXT:    v_writelane_b32 v40, s6, 2
 ; GFX11-NEXT:    v_writelane_b32 v40, s7, 3
@@ -7672,9 +8314,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 {
 ; GFX11-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 6
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -7688,12 +8332,13 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 6
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX10-SCRATCH-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s6, 2
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s7, 3
@@ -7710,9 +8355,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 6
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -7732,8 +8379,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 6
 ; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX9-NEXT:    v_writelane_b32 v40, s6, 2
@@ -7744,6 +8391,7 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 {
 ; GFX9-NEXT:    s_mov_b32 s5, 2
 ; GFX9-NEXT:    s_mov_b32 s6, 3
 ; GFX9-NEXT:    s_mov_b32 s7, 4
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX9-NEXT:    s_getpc_b64 s[34:35]
 ; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v2i64_inreg at rel32@lo+4
@@ -7755,9 +8403,10 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 {
 ; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 6
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -7771,15 +8420,16 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 6
+; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-NEXT:    s_mov_b32 s4, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX10-NEXT:    s_getpc_b64 s[34:35]
 ; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v2i64_inreg at rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v2i64_inreg at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-NEXT:    s_mov_b32 s4, 1
 ; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX10-NEXT:    s_mov_b32 s5, 2
 ; GFX10-NEXT:    v_writelane_b32 v40, s6, 2
@@ -7795,9 +8445,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 {
 ; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 6
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -7811,15 +8463,17 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 6
+; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX11-NEXT:    s_mov_b32 s4, 1
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_v2i64_inreg at rel32@lo+4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_v2i64_inreg at rel32@hi+12
-; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX11-NEXT:    s_mov_b32 s4, 1
 ; GFX11-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX11-NEXT:    s_mov_b32 s5, 2
 ; GFX11-NEXT:    v_writelane_b32 v40, s6, 2
@@ -7836,9 +8490,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 {
 ; GFX11-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 6
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -7852,15 +8508,16 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 6
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_v2i64_inreg at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v2i64_inreg at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 1
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s5, 2
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s6, 2
@@ -7876,9 +8533,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 6
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -7897,10 +8556,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 8
 ; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s6, 2
 ; GFX9-NEXT:    s_mov_b64 s[34:35], 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s7, 3
@@ -7924,9 +8584,10 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 {
 ; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 8
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -7940,12 +8601,13 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 8
+; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX10-NEXT:    s_mov_b64 s[34:35], 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX10-NEXT:    v_writelane_b32 v40, s6, 2
 ; GFX10-NEXT:    v_writelane_b32 v40, s7, 3
@@ -7968,9 +8630,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 {
 ; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 8
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -7984,12 +8648,14 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 8
+; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX11-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX11-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX11-NEXT:    v_writelane_b32 v40, s6, 2
 ; GFX11-NEXT:    v_writelane_b32 v40, s7, 3
@@ -8013,9 +8679,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 {
 ; GFX11-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 8
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -8029,12 +8697,13 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 8
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX10-SCRATCH-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s6, 2
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s7, 3
@@ -8057,9 +8726,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 8
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -8081,11 +8752,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 10
 ; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX9-NEXT:    v_writelane_b32 v40, s6, 2
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s7, 3
 ; GFX9-NEXT:    s_mov_b64 s[34:35], 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s8, 4
@@ -8114,9 +8786,10 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 {
 ; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 10
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -8130,12 +8803,13 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 10
+; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX10-NEXT:    s_mov_b64 s[34:35], 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX10-NEXT:    v_writelane_b32 v40, s6, 2
 ; GFX10-NEXT:    v_writelane_b32 v40, s7, 3
@@ -8164,9 +8838,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 {
 ; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 10
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -8180,12 +8856,14 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 10
+; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX11-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX11-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX11-NEXT:    v_writelane_b32 v40, s6, 2
 ; GFX11-NEXT:    v_writelane_b32 v40, s7, 3
@@ -8215,9 +8893,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 {
 ; GFX11-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 10
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -8231,12 +8911,13 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 10
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX10-SCRATCH-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s6, 2
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s7, 3
@@ -8265,9 +8946,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 10
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -8288,12 +8971,13 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 3
 ; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX9-NEXT:    s_movk_i32 s4, 0x4400
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX9-NEXT:    s_getpc_b64 s[34:35]
 ; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_f16_inreg at rel32@lo+4
@@ -8302,9 +8986,10 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 {
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 3
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -8318,24 +9003,27 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 3
+; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-NEXT:    s_movk_i32 s4, 0x4400
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX10-NEXT:    s_getpc_b64 s[34:35]
 ; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_f16_inreg at rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_f16_inreg at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-NEXT:    s_movk_i32 s4, 0x4400
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 3
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -8349,15 +9037,17 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 3
+; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX11-NEXT:    s_movk_i32 s4, 0x4400
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_f16_inreg at rel32@lo+4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_f16_inreg at rel32@hi+12
-; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX11-NEXT:    s_movk_i32 s4, 0x4400
 ; GFX11-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
@@ -8365,9 +9055,11 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 {
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 3
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -8381,24 +9073,27 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 3
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT:    s_movk_i32 s4, 0x4400
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_f16_inreg at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_f16_inreg at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT:    s_movk_i32 s4, 0x4400
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 3
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -8417,12 +9112,13 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 3
 ; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX9-NEXT:    s_mov_b32 s4, 4.0
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX9-NEXT:    s_getpc_b64 s[34:35]
 ; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_f32_inreg at rel32@lo+4
@@ -8431,9 +9127,10 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 {
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 3
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -8447,24 +9144,27 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 3
+; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-NEXT:    s_mov_b32 s4, 4.0
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX10-NEXT:    s_getpc_b64 s[34:35]
 ; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_f32_inreg at rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_f32_inreg at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-NEXT:    s_mov_b32 s4, 4.0
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 3
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -8478,15 +9178,17 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 3
+; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX11-NEXT:    s_mov_b32 s4, 4.0
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_f32_inreg at rel32@lo+4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_f32_inreg at rel32@hi+12
-; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX11-NEXT:    s_mov_b32 s4, 4.0
 ; GFX11-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
@@ -8494,9 +9196,11 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 {
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 3
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -8510,24 +9214,27 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 3
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 4.0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_f32_inreg at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_f32_inreg at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 4.0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 3
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -8546,14 +9253,15 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 4
 ; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX9-NEXT:    s_mov_b32 s4, 1.0
 ; GFX9-NEXT:    s_mov_b32 s5, 2.0
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX9-NEXT:    s_getpc_b64 s[34:35]
 ; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v2f32_inreg at rel32@lo+4
@@ -8563,9 +9271,10 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 {
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 4
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -8579,15 +9288,16 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 4
+; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-NEXT:    s_mov_b32 s4, 1.0
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX10-NEXT:    s_getpc_b64 s[34:35]
 ; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v2f32_inreg at rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v2f32_inreg at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-NEXT:    s_mov_b32 s4, 1.0
 ; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX10-NEXT:    s_mov_b32 s5, 2.0
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 2
@@ -8597,9 +9307,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 {
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 4
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -8613,15 +9325,17 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 4
+; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX11-NEXT:    s_mov_b32 s4, 1.0
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_v2f32_inreg at rel32@lo+4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_v2f32_inreg at rel32@hi+12
-; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX11-NEXT:    s_mov_b32 s4, 1.0
 ; GFX11-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX11-NEXT:    s_mov_b32 s5, 2.0
 ; GFX11-NEXT:    v_writelane_b32 v40, s30, 2
@@ -8632,9 +9346,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 {
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 4
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -8648,15 +9364,16 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 4
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 1.0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_v2f32_inreg at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v2f32_inreg at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 1.0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s5, 2.0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 2
@@ -8666,9 +9383,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 4
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -8687,8 +9406,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 5
 ; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX9-NEXT:    v_writelane_b32 v40, s6, 2
@@ -8697,6 +9416,7 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 {
 ; GFX9-NEXT:    s_mov_b32 s4, 1.0
 ; GFX9-NEXT:    s_mov_b32 s5, 2.0
 ; GFX9-NEXT:    s_mov_b32 s6, 4.0
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 4
 ; GFX9-NEXT:    s_getpc_b64 s[34:35]
 ; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v3f32_inreg at rel32@lo+4
@@ -8707,9 +9427,10 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 {
 ; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 5
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -8723,15 +9444,16 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 5
+; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-NEXT:    s_mov_b32 s4, 1.0
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX10-NEXT:    s_getpc_b64 s[34:35]
 ; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v3f32_inreg at rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v3f32_inreg at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-NEXT:    s_mov_b32 s4, 1.0
 ; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX10-NEXT:    s_mov_b32 s5, 2.0
 ; GFX10-NEXT:    v_writelane_b32 v40, s6, 2
@@ -8744,9 +9466,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 {
 ; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 5
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -8760,15 +9484,17 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 5
+; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX11-NEXT:    s_mov_b32 s4, 1.0
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_v3f32_inreg at rel32@lo+4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_v3f32_inreg at rel32@hi+12
-; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX11-NEXT:    s_mov_b32 s4, 1.0
 ; GFX11-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX11-NEXT:    s_mov_b32 s5, 2.0
 ; GFX11-NEXT:    v_writelane_b32 v40, s6, 2
@@ -8782,9 +9508,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 {
 ; GFX11-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 5
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -8798,15 +9526,16 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 5
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 1.0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_v3f32_inreg at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v3f32_inreg at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 1.0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s5, 2.0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s6, 2
@@ -8819,9 +9548,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 5
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -8840,8 +9571,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 7
 ; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX9-NEXT:    v_writelane_b32 v40, s6, 2
@@ -8854,6 +9585,7 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 {
 ; GFX9-NEXT:    s_mov_b32 s6, 4.0
 ; GFX9-NEXT:    s_mov_b32 s7, -1.0
 ; GFX9-NEXT:    s_mov_b32 s8, 0.5
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 6
 ; GFX9-NEXT:    s_getpc_b64 s[34:35]
 ; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v5f32_inreg at rel32@lo+4
@@ -8866,9 +9598,10 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 {
 ; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 7
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -8882,15 +9615,16 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 7
+; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-NEXT:    s_mov_b32 s4, 1.0
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX10-NEXT:    s_getpc_b64 s[34:35]
 ; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v5f32_inreg at rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v5f32_inreg at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-NEXT:    s_mov_b32 s4, 1.0
 ; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX10-NEXT:    s_mov_b32 s5, 2.0
 ; GFX10-NEXT:    v_writelane_b32 v40, s6, 2
@@ -8909,9 +9643,11 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 {
 ; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 7
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -8925,15 +9661,17 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 7
+; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX11-NEXT:    s_mov_b32 s4, 1.0
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_v5f32_inreg at rel32@lo+4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_v5f32_inreg at rel32@hi+12
-; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX11-NEXT:    s_mov_b32 s4, 1.0
 ; GFX11-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX11-NEXT:    s_mov_b32 s5, 2.0
 ; GFX11-NEXT:    v_writelane_b32 v40, s6, 2
@@ -8953,9 +9691,11 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 {
 ; GFX11-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 7
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -8969,15 +9709,16 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 7
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 1.0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_v5f32_inreg at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v5f32_inreg at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 1.0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s5, 2.0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s6, 2
@@ -8996,9 +9737,11 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 7
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -9017,14 +9760,15 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 4
 ; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX9-NEXT:    s_mov_b32 s4, 0
 ; GFX9-NEXT:    s_mov_b32 s5, 0x40100000
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX9-NEXT:    s_getpc_b64 s[34:35]
 ; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_f64_inreg at rel32@lo+4
@@ -9034,9 +9778,10 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 {
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 4
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -9050,15 +9795,16 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 4
+; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-NEXT:    s_mov_b32 s4, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX10-NEXT:    s_getpc_b64 s[34:35]
 ; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_f64_inreg at rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_f64_inreg at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-NEXT:    s_mov_b32 s4, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX10-NEXT:    s_mov_b32 s5, 0x40100000
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 2
@@ -9068,9 +9814,11 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 {
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 4
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -9084,15 +9832,17 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 4
+; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX11-NEXT:    s_mov_b32 s4, 0
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_f64_inreg at rel32@lo+4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_f64_inreg at rel32@hi+12
-; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX11-NEXT:    s_mov_b32 s4, 0
 ; GFX11-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX11-NEXT:    s_mov_b32 s5, 0x40100000
 ; GFX11-NEXT:    v_writelane_b32 v40, s30, 2
@@ -9103,9 +9853,11 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 {
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 4
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -9119,15 +9871,16 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 4
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_f64_inreg at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_f64_inreg at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s5, 0x40100000
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 2
@@ -9137,9 +9890,11 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 4
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -9158,8 +9913,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 6
 ; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX9-NEXT:    v_writelane_b32 v40, s6, 2
@@ -9170,6 +9925,7 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 {
 ; GFX9-NEXT:    s_mov_b32 s5, 2.0
 ; GFX9-NEXT:    s_mov_b32 s6, 0
 ; GFX9-NEXT:    s_mov_b32 s7, 0x40100000
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX9-NEXT:    s_getpc_b64 s[34:35]
 ; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v2f64_inreg at rel32@lo+4
@@ -9181,9 +9937,10 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 {
 ; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 6
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -9197,15 +9954,16 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 6
+; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-NEXT:    s_mov_b32 s4, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX10-NEXT:    s_getpc_b64 s[34:35]
 ; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v2f64_inreg at rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v2f64_inreg at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-NEXT:    s_mov_b32 s4, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX10-NEXT:    s_mov_b32 s5, 2.0
 ; GFX10-NEXT:    v_writelane_b32 v40, s6, 2
@@ -9221,9 +9979,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 {
 ; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 6
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -9237,15 +9997,17 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 6
+; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX11-NEXT:    s_mov_b32 s4, 0
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_v2f64_inreg at rel32@lo+4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_v2f64_inreg at rel32@hi+12
-; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX11-NEXT:    s_mov_b32 s4, 0
 ; GFX11-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX11-NEXT:    s_mov_b32 s5, 2.0
 ; GFX11-NEXT:    v_writelane_b32 v40, s6, 2
@@ -9262,9 +10024,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 {
 ; GFX11-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 6
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -9278,15 +10042,16 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 6
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_v2f64_inreg at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v2f64_inreg at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s5, 2.0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s6, 2
@@ -9302,9 +10067,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 6
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -9323,8 +10090,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 8
 ; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX9-NEXT:    v_writelane_b32 v40, s6, 2
@@ -9339,6 +10106,7 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 {
 ; GFX9-NEXT:    s_mov_b32 s7, 0x40100000
 ; GFX9-NEXT:    s_mov_b32 s8, 0
 ; GFX9-NEXT:    s_mov_b32 s9, 0x40200000
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 7
 ; GFX9-NEXT:    s_getpc_b64 s[34:35]
 ; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v3f64_inreg at rel32@lo+4
@@ -9352,9 +10120,10 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 {
 ; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 8
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -9368,15 +10137,16 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 8
+; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-NEXT:    s_mov_b32 s4, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX10-NEXT:    s_getpc_b64 s[34:35]
 ; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v3f64_inreg at rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v3f64_inreg at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-NEXT:    s_mov_b32 s4, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX10-NEXT:    s_mov_b32 s5, 2.0
 ; GFX10-NEXT:    v_writelane_b32 v40, s6, 2
@@ -9398,9 +10168,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 {
 ; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 8
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -9414,15 +10186,17 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 8
+; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX11-NEXT:    s_mov_b32 s4, 0
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_v3f64_inreg at rel32@lo+4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_v3f64_inreg at rel32@hi+12
-; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX11-NEXT:    s_mov_b32 s4, 0
 ; GFX11-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX11-NEXT:    s_mov_b32 s5, 2.0
 ; GFX11-NEXT:    v_writelane_b32 v40, s6, 2
@@ -9445,9 +10219,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 {
 ; GFX11-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 8
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -9461,15 +10237,16 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 8
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_v3f64_inreg at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v3f64_inreg at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s5, 2.0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s6, 2
@@ -9491,9 +10268,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 8
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -9512,12 +10291,13 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 3
 ; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX9-NEXT:    s_load_dword s4, s[34:35], 0x0
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 1
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX9-NEXT:    s_getpc_b64 s[34:35]
 ; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v2i16_inreg at rel32@lo+4
@@ -9526,9 +10306,10 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 {
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 3
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -9542,12 +10323,13 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 3
-; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX10-NEXT:    s_load_dword s4, s[34:35], 0x0
+; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX10-NEXT:    s_getpc_b64 s[34:35]
 ; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v2i16_inreg at rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v2i16_inreg at rel32@hi+12
@@ -9557,9 +10339,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 {
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 3
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -9573,12 +10357,14 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 3
-; GFX11-NEXT:    s_add_i32 s32, s32, 16
 ; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX11-NEXT:    s_load_b32 s4, s[0:1], 0x0
+; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_v2i16_inreg at rel32@lo+4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_v2i16_inreg at rel32@hi+12
@@ -9589,9 +10375,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 {
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 3
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -9605,12 +10393,13 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 3
-; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX10-SCRATCH-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_v2i16_inreg at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v2i16_inreg at rel32@hi+12
@@ -9620,9 +10409,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 3
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -9642,13 +10433,14 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 4
 ; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[34:35], 0x0
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 2
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX9-NEXT:    s_getpc_b64 s[34:35]
 ; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v3i16_inreg at rel32@lo+4
@@ -9658,9 +10450,10 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 {
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 4
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -9674,11 +10467,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 4
-; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[34:35], 0x0
 ; GFX10-NEXT:    s_getpc_b64 s[34:35]
@@ -9691,9 +10485,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 {
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 4
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -9707,11 +10503,13 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 4
-; GFX11-NEXT:    s_add_i32 s32, s32, 16
 ; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX11-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX11-NEXT:    s_load_b64 s[4:5], s[0:1], 0x0
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
@@ -9725,9 +10523,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 {
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 4
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -9741,11 +10541,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 4
-; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX10-SCRATCH-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
@@ -9758,9 +10559,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 4
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -9780,13 +10583,14 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 4
 ; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[34:35], 0x0
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 2
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX9-NEXT:    s_getpc_b64 s[34:35]
 ; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v3f16_inreg at rel32@lo+4
@@ -9796,9 +10600,10 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 {
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 4
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -9812,11 +10617,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 4
-; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[34:35], 0x0
 ; GFX10-NEXT:    s_getpc_b64 s[34:35]
@@ -9829,9 +10635,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 {
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 4
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -9845,11 +10653,13 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 4
-; GFX11-NEXT:    s_add_i32 s32, s32, 16
 ; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX11-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX11-NEXT:    s_load_b64 s[4:5], s[0:1], 0x0
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
@@ -9863,9 +10673,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 {
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 4
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -9879,11 +10691,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 4
-; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX10-SCRATCH-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
@@ -9896,9 +10709,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 4
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -9918,14 +10733,15 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 4
 ; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX9-NEXT:    s_mov_b32 s4, 0x20001
 ; GFX9-NEXT:    s_mov_b32 s5, 3
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX9-NEXT:    s_getpc_b64 s[34:35]
 ; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v3i16_inreg at rel32@lo+4
@@ -9935,9 +10751,10 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 {
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 4
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -9951,15 +10768,16 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 4
+; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-NEXT:    s_mov_b32 s4, 0x20001
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX10-NEXT:    s_getpc_b64 s[34:35]
 ; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v3i16_inreg at rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v3i16_inreg at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-NEXT:    s_mov_b32 s4, 0x20001
 ; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX10-NEXT:    s_mov_b32 s5, 3
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 2
@@ -9969,9 +10787,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 {
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 4
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -9985,15 +10805,17 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 4
+; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX11-NEXT:    s_mov_b32 s4, 0x20001
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_v3i16_inreg at rel32@lo+4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_v3i16_inreg at rel32@hi+12
-; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX11-NEXT:    s_mov_b32 s4, 0x20001
 ; GFX11-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX11-NEXT:    s_mov_b32 s5, 3
 ; GFX11-NEXT:    v_writelane_b32 v40, s30, 2
@@ -10004,9 +10826,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 {
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 4
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -10020,15 +10844,16 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 4
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 0x20001
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_v3i16_inreg at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v3i16_inreg at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 0x20001
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s5, 3
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 2
@@ -10038,9 +10863,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 4
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -10059,14 +10886,15 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 4
 ; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX9-NEXT:    s_mov_b32 s4, 0x40003c00
 ; GFX9-NEXT:    s_movk_i32 s5, 0x4400
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX9-NEXT:    s_getpc_b64 s[34:35]
 ; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v3f16_inreg at rel32@lo+4
@@ -10076,9 +10904,10 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 {
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 4
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -10092,15 +10921,16 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 4
+; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-NEXT:    s_mov_b32 s4, 0x40003c00
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX10-NEXT:    s_getpc_b64 s[34:35]
 ; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v3f16_inreg at rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v3f16_inreg at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-NEXT:    s_mov_b32 s4, 0x40003c00
 ; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX10-NEXT:    s_movk_i32 s5, 0x4400
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 2
@@ -10110,9 +10940,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 {
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 4
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -10126,15 +10958,17 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 4
+; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX11-NEXT:    s_mov_b32 s4, 0x40003c00
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_v3f16_inreg at rel32@lo+4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_v3f16_inreg at rel32@hi+12
-; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX11-NEXT:    s_mov_b32 s4, 0x40003c00
 ; GFX11-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX11-NEXT:    s_movk_i32 s5, 0x4400
 ; GFX11-NEXT:    v_writelane_b32 v40, s30, 2
@@ -10145,9 +10979,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 {
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 4
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -10161,15 +10997,16 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 4
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 0x40003c00
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_v3f16_inreg at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v3f16_inreg at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 0x40003c00
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX10-SCRATCH-NEXT:    s_movk_i32 s5, 0x4400
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 2
@@ -10179,9 +11016,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 4
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -10200,13 +11039,14 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 4
 ; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[34:35], 0x0
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 2
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX9-NEXT:    s_getpc_b64 s[34:35]
 ; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v4i16_inreg at rel32@lo+4
@@ -10216,9 +11056,10 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 {
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 4
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -10232,11 +11073,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 4
-; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[34:35], 0x0
 ; GFX10-NEXT:    s_getpc_b64 s[34:35]
@@ -10249,9 +11091,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 {
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 4
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -10265,11 +11109,13 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 4
-; GFX11-NEXT:    s_add_i32 s32, s32, 16
 ; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX11-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX11-NEXT:    s_load_b64 s[4:5], s[0:1], 0x0
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
@@ -10283,9 +11129,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 {
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 4
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -10299,11 +11147,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 4
-; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX10-SCRATCH-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
@@ -10316,9 +11165,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 4
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -10338,14 +11189,15 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 4
 ; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX9-NEXT:    s_mov_b32 s4, 0x20001
 ; GFX9-NEXT:    s_mov_b32 s5, 0x40003
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX9-NEXT:    s_getpc_b64 s[34:35]
 ; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v4i16_inreg at rel32@lo+4
@@ -10355,9 +11207,10 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 {
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 4
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -10371,15 +11224,16 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 4
+; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-NEXT:    s_mov_b32 s4, 0x20001
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX10-NEXT:    s_getpc_b64 s[34:35]
 ; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v4i16_inreg at rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v4i16_inreg at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-NEXT:    s_mov_b32 s4, 0x20001
 ; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX10-NEXT:    s_mov_b32 s5, 0x40003
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 2
@@ -10389,9 +11243,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 {
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 4
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -10405,15 +11261,17 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 4
+; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX11-NEXT:    s_mov_b32 s4, 0x20001
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_v4i16_inreg at rel32@lo+4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_v4i16_inreg at rel32@hi+12
-; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX11-NEXT:    s_mov_b32 s4, 0x20001
 ; GFX11-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX11-NEXT:    s_mov_b32 s5, 0x40003
 ; GFX11-NEXT:    v_writelane_b32 v40, s30, 2
@@ -10424,9 +11282,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 {
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 4
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -10440,15 +11300,16 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 4
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 0x20001
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_v4i16_inreg at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v4i16_inreg at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 0x20001
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s5, 0x40003
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 2
@@ -10458,9 +11319,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 4
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -10479,12 +11342,13 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 3
 ; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX9-NEXT:    s_load_dword s4, s[34:35], 0x0
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 1
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX9-NEXT:    s_getpc_b64 s[34:35]
 ; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v2f16_inreg at rel32@lo+4
@@ -10493,9 +11357,10 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 {
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 3
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -10509,12 +11374,13 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 3
-; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX10-NEXT:    s_load_dword s4, s[34:35], 0x0
+; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX10-NEXT:    s_getpc_b64 s[34:35]
 ; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v2f16_inreg at rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v2f16_inreg at rel32@hi+12
@@ -10524,9 +11390,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 {
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 3
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -10540,12 +11408,14 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 3
-; GFX11-NEXT:    s_add_i32 s32, s32, 16
 ; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX11-NEXT:    s_load_b32 s4, s[0:1], 0x0
+; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_v2f16_inreg at rel32@lo+4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_v2f16_inreg at rel32@hi+12
@@ -10556,9 +11426,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 {
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 3
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -10572,12 +11444,13 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 3
-; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX10-SCRATCH-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_v2f16_inreg at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v2f16_inreg at rel32@hi+12
@@ -10587,9 +11460,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 3
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -10609,13 +11484,14 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 4
 ; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[34:35], 0x0
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 2
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX9-NEXT:    s_getpc_b64 s[34:35]
 ; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v2i32_inreg at rel32@lo+4
@@ -10625,9 +11501,10 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 {
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 4
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -10641,11 +11518,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 4
-; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[34:35], 0x0
 ; GFX10-NEXT:    s_getpc_b64 s[34:35]
@@ -10658,9 +11536,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 {
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 4
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -10674,11 +11554,13 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 4
-; GFX11-NEXT:    s_add_i32 s32, s32, 16
 ; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX11-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX11-NEXT:    s_load_b64 s[4:5], s[0:1], 0x0
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
@@ -10692,9 +11574,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 {
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 4
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -10708,11 +11592,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 4
-; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX10-SCRATCH-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
@@ -10725,9 +11610,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 4
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -10747,14 +11634,15 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 4
 ; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX9-NEXT:    s_mov_b32 s4, 1
 ; GFX9-NEXT:    s_mov_b32 s5, 2
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX9-NEXT:    s_getpc_b64 s[34:35]
 ; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v2i32_inreg at rel32@lo+4
@@ -10764,9 +11652,10 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 {
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 4
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -10780,15 +11669,16 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 4
+; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-NEXT:    s_mov_b32 s4, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX10-NEXT:    s_getpc_b64 s[34:35]
 ; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v2i32_inreg at rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v2i32_inreg at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-NEXT:    s_mov_b32 s4, 1
 ; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX10-NEXT:    s_mov_b32 s5, 2
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 2
@@ -10798,9 +11688,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 {
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 4
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -10814,15 +11706,17 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 4
+; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX11-NEXT:    s_mov_b32 s4, 1
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_v2i32_inreg at rel32@lo+4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_v2i32_inreg at rel32@hi+12
-; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX11-NEXT:    s_mov_b32 s4, 1
 ; GFX11-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX11-NEXT:    s_mov_b32 s5, 2
 ; GFX11-NEXT:    v_writelane_b32 v40, s30, 2
@@ -10833,9 +11727,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 {
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 4
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -10849,15 +11745,16 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 4
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_v2i32_inreg at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v2i32_inreg at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 1
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s5, 2
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 2
@@ -10867,9 +11764,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 4
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -10888,8 +11787,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 5
 ; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX9-NEXT:    v_writelane_b32 v40, s6, 2
@@ -10898,6 +11797,7 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 {
 ; GFX9-NEXT:    s_mov_b32 s4, 3
 ; GFX9-NEXT:    s_mov_b32 s5, 4
 ; GFX9-NEXT:    s_mov_b32 s6, 5
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 4
 ; GFX9-NEXT:    s_getpc_b64 s[34:35]
 ; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v3i32_inreg at rel32@lo+4
@@ -10908,9 +11808,10 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 {
 ; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 5
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -10924,15 +11825,16 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 5
+; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-NEXT:    s_mov_b32 s4, 3
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX10-NEXT:    s_getpc_b64 s[34:35]
 ; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v3i32_inreg at rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v3i32_inreg at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-NEXT:    s_mov_b32 s4, 3
 ; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX10-NEXT:    s_mov_b32 s5, 4
 ; GFX10-NEXT:    v_writelane_b32 v40, s6, 2
@@ -10945,9 +11847,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 {
 ; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 5
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -10961,15 +11865,17 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 5
+; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX11-NEXT:    s_mov_b32 s4, 3
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_v3i32_inreg at rel32@lo+4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_v3i32_inreg at rel32@hi+12
-; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX11-NEXT:    s_mov_b32 s4, 3
 ; GFX11-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX11-NEXT:    s_mov_b32 s5, 4
 ; GFX11-NEXT:    v_writelane_b32 v40, s6, 2
@@ -10983,9 +11889,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 {
 ; GFX11-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 5
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -10999,15 +11907,16 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 5
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 3
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_v3i32_inreg at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v3i32_inreg at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 3
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s5, 4
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s6, 2
@@ -11020,9 +11929,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 5
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -11041,8 +11952,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 6
 ; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX9-NEXT:    v_writelane_b32 v40, s6, 2
@@ -11053,6 +11964,7 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 {
 ; GFX9-NEXT:    s_mov_b32 s5, 4
 ; GFX9-NEXT:    s_mov_b32 s6, 5
 ; GFX9-NEXT:    s_mov_b32 s7, 6
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX9-NEXT:    s_getpc_b64 s[34:35]
 ; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v3i32_i32_inreg at rel32@lo+4
@@ -11064,9 +11976,10 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 {
 ; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 6
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -11080,15 +11993,16 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 6
+; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-NEXT:    s_mov_b32 s4, 3
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX10-NEXT:    s_getpc_b64 s[34:35]
 ; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v3i32_i32_inreg at rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v3i32_i32_inreg at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-NEXT:    s_mov_b32 s4, 3
 ; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX10-NEXT:    s_mov_b32 s5, 4
 ; GFX10-NEXT:    v_writelane_b32 v40, s6, 2
@@ -11104,9 +12018,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 {
 ; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 6
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -11120,15 +12036,17 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 6
+; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX11-NEXT:    s_mov_b32 s4, 3
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_v3i32_i32_inreg at rel32@lo+4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_v3i32_i32_inreg at rel32@hi+12
-; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX11-NEXT:    s_mov_b32 s4, 3
 ; GFX11-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX11-NEXT:    s_mov_b32 s5, 4
 ; GFX11-NEXT:    v_writelane_b32 v40, s6, 2
@@ -11145,9 +12063,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 {
 ; GFX11-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 6
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -11161,15 +12081,16 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 6
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 3
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_v3i32_i32_inreg at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v3i32_i32_inreg at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 3
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s5, 4
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s6, 2
@@ -11185,9 +12106,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 6
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -11206,8 +12129,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 6
 ; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX9-NEXT:    v_writelane_b32 v40, s6, 2
@@ -11215,6 +12138,7 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 {
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[34:35], 0x0
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 4
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX9-NEXT:    s_getpc_b64 s[34:35]
 ; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v4i32_inreg at rel32@lo+4
@@ -11226,9 +12150,10 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 {
 ; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 6
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -11242,11 +12167,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 6
-; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX10-NEXT:    v_writelane_b32 v40, s6, 2
 ; GFX10-NEXT:    v_writelane_b32 v40, s7, 3
@@ -11263,9 +12189,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 {
 ; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 6
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -11279,11 +12207,13 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 6
-; GFX11-NEXT:    s_add_i32 s32, s32, 16
 ; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX11-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX11-NEXT:    v_writelane_b32 v40, s6, 2
 ; GFX11-NEXT:    v_writelane_b32 v40, s7, 3
@@ -11301,9 +12231,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 {
 ; GFX11-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 6
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -11317,11 +12249,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 6
-; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s6, 2
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s7, 3
@@ -11338,9 +12271,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 6
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -11360,8 +12295,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 6
 ; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX9-NEXT:    v_writelane_b32 v40, s6, 2
@@ -11372,6 +12307,7 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 {
 ; GFX9-NEXT:    s_mov_b32 s5, 2
 ; GFX9-NEXT:    s_mov_b32 s6, 3
 ; GFX9-NEXT:    s_mov_b32 s7, 4
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX9-NEXT:    s_getpc_b64 s[34:35]
 ; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v4i32_inreg at rel32@lo+4
@@ -11383,9 +12319,10 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 {
 ; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 6
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -11399,15 +12336,16 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 6
+; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-NEXT:    s_mov_b32 s4, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX10-NEXT:    s_getpc_b64 s[34:35]
 ; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v4i32_inreg at rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v4i32_inreg at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-NEXT:    s_mov_b32 s4, 1
 ; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX10-NEXT:    s_mov_b32 s5, 2
 ; GFX10-NEXT:    v_writelane_b32 v40, s6, 2
@@ -11423,9 +12361,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 {
 ; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 6
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -11439,15 +12379,17 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 6
+; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX11-NEXT:    s_mov_b32 s4, 1
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_v4i32_inreg at rel32@lo+4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_v4i32_inreg at rel32@hi+12
-; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX11-NEXT:    s_mov_b32 s4, 1
 ; GFX11-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX11-NEXT:    s_mov_b32 s5, 2
 ; GFX11-NEXT:    v_writelane_b32 v40, s6, 2
@@ -11464,9 +12406,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 {
 ; GFX11-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 6
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -11480,15 +12424,16 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 6
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_v4i32_inreg at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v4i32_inreg at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 1
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s5, 2
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s6, 2
@@ -11504,9 +12449,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 6
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -11525,8 +12472,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 7
 ; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX9-NEXT:    v_writelane_b32 v40, s6, 2
@@ -11539,6 +12486,7 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 {
 ; GFX9-NEXT:    s_mov_b32 s6, 3
 ; GFX9-NEXT:    s_mov_b32 s7, 4
 ; GFX9-NEXT:    s_mov_b32 s8, 5
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 6
 ; GFX9-NEXT:    s_getpc_b64 s[34:35]
 ; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v5i32_inreg at rel32@lo+4
@@ -11551,9 +12499,10 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 {
 ; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 7
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -11567,15 +12516,16 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 7
+; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-NEXT:    s_mov_b32 s4, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX10-NEXT:    s_getpc_b64 s[34:35]
 ; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v5i32_inreg at rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v5i32_inreg at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-NEXT:    s_mov_b32 s4, 1
 ; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX10-NEXT:    s_mov_b32 s5, 2
 ; GFX10-NEXT:    v_writelane_b32 v40, s6, 2
@@ -11594,9 +12544,11 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 {
 ; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 7
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -11610,15 +12562,17 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 7
+; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX11-NEXT:    s_mov_b32 s4, 1
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_v5i32_inreg at rel32@lo+4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_v5i32_inreg at rel32@hi+12
-; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX11-NEXT:    s_mov_b32 s4, 1
 ; GFX11-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX11-NEXT:    s_mov_b32 s5, 2
 ; GFX11-NEXT:    v_writelane_b32 v40, s6, 2
@@ -11638,9 +12592,11 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 {
 ; GFX11-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 7
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -11654,15 +12610,16 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 7
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_v5i32_inreg at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v5i32_inreg at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 1
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s5, 2
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s6, 2
@@ -11681,9 +12638,11 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 7
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -11702,10 +12661,11 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 10
 ; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s6, 2
 ; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
 ; GFX9-NEXT:    v_writelane_b32 v40, s7, 3
@@ -11732,9 +12692,10 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 {
 ; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 10
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -11748,12 +12709,13 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 10
+; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX10-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX10-NEXT:    v_writelane_b32 v40, s6, 2
 ; GFX10-NEXT:    v_writelane_b32 v40, s7, 3
@@ -11779,9 +12741,11 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 {
 ; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 10
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -11795,12 +12759,14 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 10
+; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX11-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX11-NEXT:    v_writelane_b32 v40, s6, 2
 ; GFX11-NEXT:    v_writelane_b32 v40, s7, 3
@@ -11827,9 +12793,11 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 {
 ; GFX11-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 10
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -11843,12 +12811,13 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 10
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX10-SCRATCH-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s6, 2
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s7, 3
@@ -11874,9 +12843,11 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 10
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -11897,8 +12868,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 10
 ; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX9-NEXT:    v_writelane_b32 v40, s6, 2
@@ -11917,6 +12888,7 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 {
 ; GFX9-NEXT:    s_mov_b32 s9, 6
 ; GFX9-NEXT:    s_mov_b32 s10, 7
 ; GFX9-NEXT:    s_mov_b32 s11, 8
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 9
 ; GFX9-NEXT:    s_getpc_b64 s[34:35]
 ; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v8i32_inreg at rel32@lo+4
@@ -11932,9 +12904,10 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 {
 ; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 10
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -11948,15 +12921,16 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 10
+; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-NEXT:    s_mov_b32 s4, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX10-NEXT:    s_getpc_b64 s[34:35]
 ; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v8i32_inreg at rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v8i32_inreg at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-NEXT:    s_mov_b32 s4, 1
 ; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX10-NEXT:    s_mov_b32 s5, 2
 ; GFX10-NEXT:    v_writelane_b32 v40, s6, 2
@@ -11984,9 +12958,11 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 {
 ; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 10
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -12000,15 +12976,17 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 10
+; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX11-NEXT:    s_mov_b32 s4, 1
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_v8i32_inreg at rel32@lo+4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_v8i32_inreg at rel32@hi+12
-; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX11-NEXT:    s_mov_b32 s4, 1
 ; GFX11-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX11-NEXT:    s_mov_b32 s5, 2
 ; GFX11-NEXT:    v_writelane_b32 v40, s6, 2
@@ -12037,9 +13015,11 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 {
 ; GFX11-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 10
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -12053,15 +13033,16 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 10
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_v8i32_inreg at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v8i32_inreg at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 1
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s5, 2
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s6, 2
@@ -12089,9 +13070,11 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 10
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -12110,8 +13093,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 18
 ; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX9-NEXT:    v_writelane_b32 v40, s6, 2
@@ -12122,6 +13105,7 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 {
 ; GFX9-NEXT:    v_writelane_b32 v40, s11, 7
 ; GFX9-NEXT:    v_writelane_b32 v40, s12, 8
 ; GFX9-NEXT:    v_writelane_b32 v40, s13, 9
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s14, 10
 ; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
 ; GFX9-NEXT:    v_writelane_b32 v40, s15, 11
@@ -12156,9 +13140,10 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 {
 ; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 18
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -12172,12 +13157,13 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 18
+; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX10-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX10-NEXT:    v_writelane_b32 v40, s6, 2
 ; GFX10-NEXT:    v_writelane_b32 v40, s7, 3
@@ -12219,9 +13205,11 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 {
 ; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 18
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -12235,12 +13223,14 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 18
+; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX11-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX11-NEXT:    v_writelane_b32 v40, s6, 2
 ; GFX11-NEXT:    v_writelane_b32 v40, s7, 3
@@ -12283,9 +13273,11 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 {
 ; GFX11-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 18
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -12299,12 +13291,13 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 18
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX10-SCRATCH-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s6, 2
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s7, 3
@@ -12346,9 +13339,11 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 18
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -12369,8 +13364,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 28
 ; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX9-NEXT:    v_writelane_b32 v40, s6, 2
@@ -12386,6 +13381,7 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
 ; GFX9-NEXT:    v_writelane_b32 v40, s16, 12
 ; GFX9-NEXT:    v_writelane_b32 v40, s17, 13
 ; GFX9-NEXT:    v_writelane_b32 v40, s18, 14
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s19, 15
 ; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
 ; GFX9-NEXT:    v_writelane_b32 v40, s20, 16
@@ -12459,9 +13455,10 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
 ; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 28
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -12475,12 +13472,13 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 28
+; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX10-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX10-NEXT:    v_writelane_b32 v40, s6, 2
 ; GFX10-NEXT:    v_writelane_b32 v40, s7, 3
@@ -12567,9 +13565,11 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
 ; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 28
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -12583,14 +13583,16 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 28
+; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_add_i32 s2, s32, 16
-; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX11-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX11-NEXT:    v_writelane_b32 v40, s6, 2
 ; GFX11-NEXT:    v_writelane_b32 v40, s7, 3
@@ -12671,9 +13673,11 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
 ; GFX11-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 28
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -12687,13 +13691,14 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 28
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX10-SCRATCH-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s2, s32, 16
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s6, 2
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s7, 3
@@ -12776,9 +13781,11 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 28
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -12799,8 +13806,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 28
 ; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX9-NEXT:    v_writelane_b32 v40, s6, 2
@@ -12815,6 +13822,7 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
 ; GFX9-NEXT:    v_writelane_b32 v40, s15, 11
 ; GFX9-NEXT:    v_writelane_b32 v40, s16, 12
 ; GFX9-NEXT:    v_writelane_b32 v40, s17, 13
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s18, 14
 ; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
 ; GFX9-NEXT:    v_writelane_b32 v40, s19, 15
@@ -12894,9 +13902,10 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
 ; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 28
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -12910,12 +13919,13 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 28
+; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX10-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX10-NEXT:    v_writelane_b32 v40, s6, 2
 ; GFX10-NEXT:    v_writelane_b32 v40, s7, 3
@@ -13007,9 +14017,11 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
 ; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 28
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -13023,12 +14035,14 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 28
+; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX11-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX11-NEXT:    v_writelane_b32 v40, s6, 2
 ; GFX11-NEXT:    v_writelane_b32 v40, s7, 3
@@ -13114,9 +14128,11 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
 ; GFX11-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 28
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -13130,12 +14146,13 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 28
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX10-SCRATCH-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s6, 2
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s7, 3
@@ -13225,9 +14242,11 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 28
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -13249,12 +14268,13 @@ define amdgpu_gfx void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, d
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s33
 ; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s33 offset:4
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
-; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    s_addk_i32 s32, 0x800
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[34:35]
 ; GFX9-NEXT:    s_add_u32 s34, s34, stack_passed_f64_arg at rel32@lo+4
@@ -13266,11 +14286,12 @@ define amdgpu_gfx void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, d
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX9-NEXT:    s_addk_i32 s32, 0xf800
 ; GFX9-NEXT:    s_mov_b32 s33, s34
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -13282,13 +14303,15 @@ define amdgpu_gfx void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, d
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    buffer_load_dword v32, off, s[0:3], s33
 ; GFX10-NEXT:    buffer_load_dword v33, off, s[0:3], s33 offset:4
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
-; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    s_addk_i32 s32, 0x400
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX10-NEXT:    s_getpc_b64 s[34:35]
 ; GFX10-NEXT:    s_add_u32 s34, s34, stack_passed_f64_arg at rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s35, s35, stack_passed_f64_arg at rel32@hi+12
@@ -13296,17 +14319,18 @@ define amdgpu_gfx void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, d
 ; GFX10-NEXT:    buffer_store_dword v32, off, s[0:3], s32
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:4
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:12
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
+; GFX10-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX10-NEXT:    s_mov_b32 s33, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -13317,26 +14341,30 @@ define amdgpu_gfx void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, d
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:8 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:8
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:12
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    scratch_load_b64 v[32:33], off, s33
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
-; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-NEXT:    s_add_i32 s32, s32, 32
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, stack_passed_f64_arg at rel32@lo+4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, stack_passed_f64_arg at rel32@hi+12
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    scratch_store_b64 off, v[32:33], s32
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:8 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:8
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:12
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    s_add_i32 s32, s32, -16
+; GFX11-NEXT:    s_addk_i32 s32, 0xffe0
 ; GFX11-NEXT:    s_mov_b32 s33, s0
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
@@ -13348,27 +14376,30 @@ define amdgpu_gfx void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, d
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 offset:8 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:12 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dwordx2 v[32:33], off, s33
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
-; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 32
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, stack_passed_f64_arg at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, stack_passed_f64_arg at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT:    scratch_store_dwordx2 off, v[32:33], s32
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 offset:8 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 offset:8
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:12
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
+; GFX10-SCRATCH-NEXT:    s_addk_i32 s32, 0xffe0
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
@@ -13385,6 +14416,7 @@ define amdgpu_gfx void @stack_12xv3i32() #0 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 12
@@ -13392,7 +14424,6 @@ define amdgpu_gfx void @stack_12xv3i32() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 13
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 14
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:8
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 15
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
@@ -13429,6 +14460,7 @@ define amdgpu_gfx void @stack_12xv3i32() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v29, 9
 ; GFX9-NEXT:    v_mov_b32_e32 v30, 10
 ; GFX9-NEXT:    v_mov_b32_e32 v31, 11
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[34:35]
 ; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_12xv3i32 at rel32@lo+4
@@ -13436,9 +14468,10 @@ define amdgpu_gfx void @stack_12xv3i32() #0 {
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -13452,9 +14485,9 @@ define amdgpu_gfx void @stack_12xv3i32() #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 12
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 13
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 14
@@ -13497,6 +14530,7 @@ define amdgpu_gfx void @stack_12xv3i32() #0 {
 ; GFX10-NEXT:    v_mov_b32_e32 v29, 9
 ; GFX10-NEXT:    v_mov_b32_e32 v30, 10
 ; GFX10-NEXT:    v_mov_b32_e32 v31, 11
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_getpc_b64 s[34:35]
 ; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_12xv3i32 at rel32@lo+4
@@ -13504,9 +14538,11 @@ define amdgpu_gfx void @stack_12xv3i32() #0 {
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -13520,9 +14556,10 @@ define amdgpu_gfx void @stack_12xv3i32() #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 12 :: v_dual_mov_b32 v1, 13
 ; GFX11-NEXT:    v_dual_mov_b32 v2, 14 :: v_dual_mov_b32 v3, 15
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
@@ -13544,6 +14581,7 @@ define amdgpu_gfx void @stack_12xv3i32() #0 {
 ; GFX11-NEXT:    v_dual_mov_b32 v26, 8 :: v_dual_mov_b32 v27, 9
 ; GFX11-NEXT:    v_dual_mov_b32 v28, 9 :: v_dual_mov_b32 v29, 9
 ; GFX11-NEXT:    v_dual_mov_b32 v30, 10 :: v_dual_mov_b32 v31, 11
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_12xv3i32 at rel32@lo+4
@@ -13552,9 +14590,11 @@ define amdgpu_gfx void @stack_12xv3i32() #0 {
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -13568,9 +14608,9 @@ define amdgpu_gfx void @stack_12xv3i32() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 12
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, 13
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v2, 14
@@ -13610,6 +14650,7 @@ define amdgpu_gfx void @stack_12xv3i32() #0 {
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v29, 9
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v30, 10
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v31, 11
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_12xv3i32 at rel32@lo+4
@@ -13617,9 +14658,11 @@ define amdgpu_gfx void @stack_12xv3i32() #0 {
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -13651,6 +14694,7 @@ define amdgpu_gfx void @stack_8xv5i32() #0 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 8
@@ -13666,7 +14710,6 @@ define amdgpu_gfx void @stack_8xv5i32() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 13
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:20
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 14
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 15
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
@@ -13703,6 +14746,7 @@ define amdgpu_gfx void @stack_8xv5i32() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v29, 5
 ; GFX9-NEXT:    v_mov_b32_e32 v30, 6
 ; GFX9-NEXT:    v_mov_b32_e32 v31, 7
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[34:35]
 ; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_8xv5i32 at rel32@lo+4
@@ -13710,9 +14754,10 @@ define amdgpu_gfx void @stack_8xv5i32() #0 {
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -13726,20 +14771,20 @@ define amdgpu_gfx void @stack_8xv5i32() #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 9
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 10
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX10-NEXT:    v_mov_b32_e32 v3, 14
 ; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s32
 ; GFX10-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
 ; GFX10-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:8
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 11
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 12
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 13
-; GFX10-NEXT:    v_mov_b32_e32 v3, 14
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 15
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:12
@@ -13779,6 +14824,7 @@ define amdgpu_gfx void @stack_8xv5i32() #0 {
 ; GFX10-NEXT:    v_mov_b32_e32 v29, 5
 ; GFX10-NEXT:    v_mov_b32_e32 v30, 6
 ; GFX10-NEXT:    v_mov_b32_e32 v31, 7
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_getpc_b64 s[34:35]
 ; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_8xv5i32 at rel32@lo+4
@@ -13786,9 +14832,11 @@ define amdgpu_gfx void @stack_8xv5i32() #0 {
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -13802,16 +14850,18 @@ define amdgpu_gfx void @stack_8xv5i32() #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 9
 ; GFX11-NEXT:    v_dual_mov_b32 v2, 10 :: v_dual_mov_b32 v3, 11
 ; GFX11-NEXT:    v_dual_mov_b32 v4, 12 :: v_dual_mov_b32 v5, 13
 ; GFX11-NEXT:    v_dual_mov_b32 v6, 14 :: v_dual_mov_b32 v7, 15
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX11-NEXT:    s_add_i32 s0, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32
 ; GFX11-NEXT:    scratch_store_b128 off, v[4:7], s0
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
@@ -13838,9 +14888,11 @@ define amdgpu_gfx void @stack_8xv5i32() #0 {
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -13854,9 +14906,9 @@ define amdgpu_gfx void @stack_8xv5i32() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, 9
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v2, 10
@@ -13866,8 +14918,9 @@ define amdgpu_gfx void @stack_8xv5i32() #0 {
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v6, 14
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v7, 15
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s0, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
 ; GFX10-SCRATCH-NEXT:    scratch_store_dwordx4 off, v[4:7], s0
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 0
@@ -13909,9 +14962,11 @@ define amdgpu_gfx void @stack_8xv5i32() #0 {
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -13939,6 +14994,7 @@ define amdgpu_gfx void @stack_8xv5f32() #0 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x41000000
@@ -13954,7 +15010,6 @@ define amdgpu_gfx void @stack_8xv5f32() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x41500000
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:20
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x41600000
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x41700000
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
@@ -13991,6 +15046,7 @@ define amdgpu_gfx void @stack_8xv5f32() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v29, 0x40a00000
 ; GFX9-NEXT:    v_mov_b32_e32 v30, 0x40c00000
 ; GFX9-NEXT:    v_mov_b32_e32 v31, 0x40e00000
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[34:35]
 ; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_8xv5f32 at rel32@lo+4
@@ -13998,9 +15054,10 @@ define amdgpu_gfx void @stack_8xv5f32() #0 {
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -14014,20 +15071,20 @@ define amdgpu_gfx void @stack_8xv5f32() #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0x41000000
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0x41100000
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0x41200000
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX10-NEXT:    v_mov_b32_e32 v3, 0x41600000
 ; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s32
 ; GFX10-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
 ; GFX10-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:8
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0x41300000
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0x41400000
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0x41500000
-; GFX10-NEXT:    v_mov_b32_e32 v3, 0x41600000
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0x41700000
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:12
@@ -14067,6 +15124,7 @@ define amdgpu_gfx void @stack_8xv5f32() #0 {
 ; GFX10-NEXT:    v_mov_b32_e32 v29, 0x40a00000
 ; GFX10-NEXT:    v_mov_b32_e32 v30, 0x40c00000
 ; GFX10-NEXT:    v_mov_b32_e32 v31, 0x40e00000
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_getpc_b64 s[34:35]
 ; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_8xv5f32 at rel32@lo+4
@@ -14074,9 +15132,11 @@ define amdgpu_gfx void @stack_8xv5f32() #0 {
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -14090,9 +15150,10 @@ define amdgpu_gfx void @stack_8xv5f32() #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0x41000000
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0x41100000
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0x41200000
@@ -14102,8 +15163,9 @@ define amdgpu_gfx void @stack_8xv5f32() #0 {
 ; GFX11-NEXT:    v_mov_b32_e32 v6, 0x41600000
 ; GFX11-NEXT:    v_mov_b32_e32 v7, 0x41700000
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX11-NEXT:    s_add_i32 s0, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32
 ; GFX11-NEXT:    scratch_store_b128 off, v[4:7], s0
 ; GFX11-NEXT:    v_mov_b32_e32 v6, 1.0
@@ -14132,9 +15194,11 @@ define amdgpu_gfx void @stack_8xv5f32() #0 {
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -14148,9 +15212,9 @@ define amdgpu_gfx void @stack_8xv5f32() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 0x41000000
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, 0x41100000
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v2, 0x41200000
@@ -14160,8 +15224,9 @@ define amdgpu_gfx void @stack_8xv5f32() #0 {
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v6, 0x41600000
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v7, 0x41700000
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s0, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
 ; GFX10-SCRATCH-NEXT:    scratch_store_dwordx4 off, v[4:7], s0
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 0
@@ -14203,9 +15268,11 @@ define amdgpu_gfx void @stack_8xv5f32() #0 {
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16

diff  --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll
index 09c57543ecd291..fbb893220a8e8e 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll
@@ -13,12 +13,13 @@ define amdgpu_gfx void @test_call_external_void_func_void_clobber_s30_s31_call_e
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 4
 ; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 2
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_void at rel32@lo+4
@@ -31,9 +32,10 @@ define amdgpu_gfx void @test_call_external_void_func_void_clobber_s30_s31_call_e
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 4
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -47,11 +49,12 @@ define amdgpu_gfx void @test_call_external_void_func_void_clobber_s30_s31_call_e
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 4
-; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX10-NEXT:    s_getpc_b64 s[4:5]
 ; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_void at rel32@lo+4
@@ -66,9 +69,11 @@ define amdgpu_gfx void @test_call_external_void_func_void_clobber_s30_s31_call_e
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 4
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -82,11 +87,13 @@ define amdgpu_gfx void @test_call_external_void_func_void_clobber_s30_s31_call_e
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 4
-; GFX11-NEXT:    s_add_i32 s32, s32, 16
 ; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX11-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX11-NEXT:    s_getpc_b64 s[4:5]
 ; GFX11-NEXT:    s_add_u32 s4, s4, external_void_func_void at rel32@lo+4
@@ -102,9 +109,11 @@ define amdgpu_gfx void @test_call_external_void_func_void_clobber_s30_s31_call_e
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 4
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -174,9 +183,9 @@ define amdgpu_gfx void @void_func_void_clobber_s28_s29() #1 {
 ; GFX11-LABEL: void_func_void_clobber_s28_s29:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT:    s_xor_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill
-; GFX11-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    v_writelane_b32 v0, s28, 0
 ; GFX11-NEXT:    v_writelane_b32 v0, s29, 1
 ; GFX11-NEXT:    v_writelane_b32 v0, s30, 2
@@ -192,9 +201,9 @@ define amdgpu_gfx void @void_func_void_clobber_s28_s29() #1 {
 ; GFX11-NEXT:    v_readlane_b32 s30, v0, 2
 ; GFX11-NEXT:    v_readlane_b32 s29, v0, 1
 ; GFX11-NEXT:    v_readlane_b32 s28, v0, 0
-; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT:    s_xor_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload
-; GFX11-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   call void asm sideeffect "; clobber", "~{s[30:31]}"() #0
@@ -210,11 +219,12 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_s31(ptr addrspace(1)
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 3
 ; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 1
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ; def s31
@@ -231,9 +241,10 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_s31(ptr addrspace(1)
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 3
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -247,14 +258,15 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_s31(ptr addrspace(1)
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 3
+; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX10-NEXT:    s_getpc_b64 s[34:35]
 ; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_void at rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_void at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX10-NEXT:    ;;#ASMSTART
@@ -269,9 +281,11 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_s31(ptr addrspace(1)
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 3
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -285,14 +299,16 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_s31(ptr addrspace(1)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 3
+; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_void at rel32@lo+4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_void at rel32@hi+12
-; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX11-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX11-NEXT:    ;;#ASMSTART
@@ -307,9 +323,11 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_s31(ptr addrspace(1)
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 3
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -329,10 +347,11 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_v31(ptr addrspace(1)
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v42, s34, 0
 ; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    ;;#ASMSTART
@@ -350,9 +369,10 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_v31(ptr addrspace(1)
 ; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s34, v42, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -366,20 +386,21 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_v31(ptr addrspace(1)
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v42, s34, 0
 ; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ; def v31
 ; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    v_mov_b32_e32 v41, v31
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    s_getpc_b64 s[34:35]
 ; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_void at rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_void at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    v_mov_b32_e32 v31, v41
 ; GFX10-NEXT:    ;;#ASMSTART
@@ -388,9 +409,11 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_v31(ptr addrspace(1)
 ; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s34, v42, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4
+; GFX10-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:8
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -404,20 +427,23 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_v31(ptr addrspace(1)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:4 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:4
+; GFX11-NEXT:    scratch_store_b32 off, v42, s33 offset:8
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v42, s0, 0
 ; GFX11-NEXT:    scratch_store_b32 off, v41, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    ;;#ASMSTART
 ; GFX11-NEXT:    ; def v31
 ; GFX11-NEXT:    ;;#ASMEND
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    v_mov_b32_e32 v41, v31
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_void at rel32@lo+4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_void at rel32@hi+12
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    v_mov_b32_e32 v31, v41
 ; GFX11-NEXT:    ;;#ASMSTART
@@ -426,9 +452,11 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_v31(ptr addrspace(1)
 ; GFX11-NEXT:    scratch_load_b32 v41, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s0, v42, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:4 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:4
+; GFX11-NEXT:    scratch_load_b32 v42, off, s33 offset:8
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -449,11 +477,12 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s33(ptr addrspace(1)
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 3
 ; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 1
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ; def s33
@@ -470,9 +499,10 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s33(ptr addrspace(1)
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 3
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -486,19 +516,20 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s33(ptr addrspace(1)
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 3
+; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX10-NEXT:    s_getpc_b64 s[34:35]
 ; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_void at rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_void at rel32@hi+12
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ; def s33
 ; GFX10-NEXT:    ;;#ASMEND
-; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-NEXT:    s_mov_b32 s4, s33
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 1
+; GFX10-NEXT:    s_mov_b32 s4, s33
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    s_mov_b32 s33, s4
@@ -508,9 +539,11 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s33(ptr addrspace(1)
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 3
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -524,19 +557,21 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s33(ptr addrspace(1)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 3
+; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_void at rel32@lo+4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_void at rel32@hi+12
 ; GFX11-NEXT:    ;;#ASMSTART
 ; GFX11-NEXT:    ; def s33
 ; GFX11-NEXT:    ;;#ASMEND
-; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX11-NEXT:    s_mov_b32 s4, s33
 ; GFX11-NEXT:    v_writelane_b32 v40, s30, 1
+; GFX11-NEXT:    s_mov_b32 s4, s33
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_mov_b32 s33, s4
@@ -547,9 +582,11 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s33(ptr addrspace(1)
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 3
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -569,9 +606,10 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s34(ptr addrspace(1)
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 3
 ; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX9-NEXT:    ;;#ASMSTART
@@ -590,9 +628,10 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s34(ptr addrspace(1)
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 3
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -606,19 +645,20 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s34(ptr addrspace(1)
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 3
+; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX10-NEXT:    s_getpc_b64 s[36:37]
 ; GFX10-NEXT:    s_add_u32 s36, s36, external_void_func_void at rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s37, s37, external_void_func_void at rel32@hi+12
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ; def s34
 ; GFX10-NEXT:    ;;#ASMEND
-; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-NEXT:    s_mov_b32 s4, s34
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 1
+; GFX10-NEXT:    s_mov_b32 s4, s34
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[36:37]
 ; GFX10-NEXT:    s_mov_b32 s34, s4
@@ -628,9 +668,11 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s34(ptr addrspace(1)
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 3
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -644,19 +686,21 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s34(ptr addrspace(1)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 3
+; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_void at rel32@lo+4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_void at rel32@hi+12
 ; GFX11-NEXT:    ;;#ASMSTART
 ; GFX11-NEXT:    ; def s34
 ; GFX11-NEXT:    ;;#ASMEND
-; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX11-NEXT:    s_mov_b32 s4, s34
 ; GFX11-NEXT:    v_writelane_b32 v40, s30, 1
+; GFX11-NEXT:    s_mov_b32 s4, s34
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_mov_b32 s34, s4
@@ -667,9 +711,11 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s34(ptr addrspace(1)
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 3
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -689,10 +735,11 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_v40(ptr addrspace(1)
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v41, s34, 2
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v41, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v42, s34, 0
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    v_writelane_b32 v41, s31, 1
 ; GFX9-NEXT:    ;;#ASMSTART
@@ -708,9 +755,10 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_v40(ptr addrspace(1)
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT:    v_readlane_b32 s31, v41, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v41, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v41, 2
+; GFX9-NEXT:    v_readlane_b32 s34, v42, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -724,19 +772,20 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_v40(ptr addrspace(1)
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v41, s34, 2
+; GFX10-NEXT:    v_writelane_b32 v41, s30, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v42, s34, 0
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ; def v40
 ; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    v_writelane_b32 v41, s31, 1
 ; GFX10-NEXT:    s_getpc_b64 s[34:35]
 ; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_void at rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_void at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v41, s30, 0
-; GFX10-NEXT:    v_writelane_b32 v41, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ; use v40
@@ -744,9 +793,11 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_v40(ptr addrspace(1)
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT:    v_readlane_b32 s31, v41, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v41, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v41, 2
+; GFX10-NEXT:    v_readlane_b32 s34, v42, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
+; GFX10-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:8
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -760,19 +811,22 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_v40(ptr addrspace(1)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
+; GFX11-NEXT:    scratch_store_b32 off, v42, s33 offset:8
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v41, s0, 2
+; GFX11-NEXT:    v_writelane_b32 v41, s30, 0
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v42, s0, 0
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    ;;#ASMSTART
 ; GFX11-NEXT:    ; def v40
 ; GFX11-NEXT:    ;;#ASMEND
+; GFX11-NEXT:    v_writelane_b32 v41, s31, 1
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_void at rel32@lo+4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_void at rel32@hi+12
-; GFX11-NEXT:    v_writelane_b32 v41, s30, 0
-; GFX11-NEXT:    v_writelane_b32 v41, s31, 1
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    ;;#ASMSTART
 ; GFX11-NEXT:    ; use v40
@@ -780,9 +834,11 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_v40(ptr addrspace(1)
 ; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT:    v_readlane_b32 s31, v41, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v41, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v41, 2
+; GFX11-NEXT:    v_readlane_b32 s0, v42, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
+; GFX11-NEXT:    scratch_load_b32 v42, off, s33 offset:8
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -798,54 +854,54 @@ define hidden void @void_func_void_clobber_s33() #1 {
 ; GFX9-LABEL: void_func_void_clobber_s33:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_xor_saveexec_b64 s[6:7], -1
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX9-NEXT:    v_writelane_b32 v0, s33, 0
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ; clobber
 ; GFX9-NEXT:    ;;#ASMEND
 ; GFX9-NEXT:    v_readlane_b32 s33, v0, 0
-; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_xor_saveexec_b64 s[6:7], -1
 ; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: void_func_void_clobber_s33:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_xor_saveexec_b32 s5, -1
 ; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_mov_b32 exec_lo, s5
 ; GFX10-NEXT:    v_writelane_b32 v0, s33, 0
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ; clobber
 ; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    v_readlane_b32 s33, v0, 0
-; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_xor_saveexec_b32 s5, -1
 ; GFX10-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_mov_b32 exec_lo, s5
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_void_clobber_s33:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT:    s_xor_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill
-; GFX11-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    v_writelane_b32 v0, s33, 0
 ; GFX11-NEXT:    ;;#ASMSTART
 ; GFX11-NEXT:    ; clobber
 ; GFX11-NEXT:    ;;#ASMEND
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_readlane_b32 s33, v0, 0
-; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT:    s_xor_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload
-; GFX11-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   call void asm sideeffect "; clobber", "~{s33}"() #0
@@ -856,54 +912,54 @@ define hidden void @void_func_void_clobber_s34() #1 {
 ; GFX9-LABEL: void_func_void_clobber_s34:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_xor_saveexec_b64 s[6:7], -1
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX9-NEXT:    v_writelane_b32 v0, s34, 0
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ; clobber
 ; GFX9-NEXT:    ;;#ASMEND
 ; GFX9-NEXT:    v_readlane_b32 s34, v0, 0
-; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_xor_saveexec_b64 s[6:7], -1
 ; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: void_func_void_clobber_s34:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_xor_saveexec_b32 s5, -1
 ; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_mov_b32 exec_lo, s5
 ; GFX10-NEXT:    v_writelane_b32 v0, s34, 0
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ; clobber
 ; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    v_readlane_b32 s34, v0, 0
-; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_xor_saveexec_b32 s5, -1
 ; GFX10-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_mov_b32 exec_lo, s5
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_void_clobber_s34:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT:    s_xor_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill
-; GFX11-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    v_writelane_b32 v0, s34, 0
 ; GFX11-NEXT:    ;;#ASMSTART
 ; GFX11-NEXT:    ; clobber
 ; GFX11-NEXT:    ;;#ASMEND
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_readlane_b32 s34, v0, 0
-; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT:    s_xor_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload
-; GFX11-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   call void asm sideeffect "; clobber", "~{s34}"() #0
@@ -918,10 +974,11 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s33() #0 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[34:35]
 ; GFX9-NEXT:    s_add_u32 s34, s34, void_func_void_clobber_s33 at rel32@lo+4
@@ -929,9 +986,10 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s33() #0 {
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -945,21 +1003,24 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s33() #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX10-NEXT:    s_getpc_b64 s[34:35]
 ; GFX10-NEXT:    s_add_u32 s34, s34, void_func_void_clobber_s33 at rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s35, s35, void_func_void_clobber_s33 at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -973,22 +1034,26 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s33() #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, void_func_void_clobber_s33 at rel32@lo+4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, void_func_void_clobber_s33 at rel32@hi+12
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -1006,10 +1071,11 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s34() #0 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[34:35]
 ; GFX9-NEXT:    s_add_u32 s34, s34, void_func_void_clobber_s34 at rel32@lo+4
@@ -1017,9 +1083,10 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s34() #0 {
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -1033,21 +1100,24 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s34() #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX10-NEXT:    s_getpc_b64 s[34:35]
 ; GFX10-NEXT:    s_add_u32 s34, s34, void_func_void_clobber_s34 at rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s35, s35, void_func_void_clobber_s34 at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -1061,22 +1131,26 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s34() #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, void_func_void_clobber_s34 at rel32@lo+4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, void_func_void_clobber_s34 at rel32@hi+12
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -1094,11 +1168,12 @@ define amdgpu_gfx void @callee_saved_sgpr_kernel() #1 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 3
 ; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 1
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ; def s40
@@ -1114,9 +1189,10 @@ define amdgpu_gfx void @callee_saved_sgpr_kernel() #1 {
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 3
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -1130,19 +1206,20 @@ define amdgpu_gfx void @callee_saved_sgpr_kernel() #1 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 3
+; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 0
 ; GFX10-NEXT:    s_getpc_b64 s[34:35]
 ; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_void at rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_void at rel32@hi+12
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ; def s40
 ; GFX10-NEXT:    ;;#ASMEND
-; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-NEXT:    s_mov_b32 s4, s40
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 1
+; GFX10-NEXT:    s_mov_b32 s4, s40
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    ;;#ASMSTART
@@ -1151,9 +1228,11 @@ define amdgpu_gfx void @callee_saved_sgpr_kernel() #1 {
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 3
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -1167,19 +1246,21 @@ define amdgpu_gfx void @callee_saved_sgpr_kernel() #1 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 3
+; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_void at rel32@lo+4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_void at rel32@hi+12
 ; GFX11-NEXT:    ;;#ASMSTART
 ; GFX11-NEXT:    ; def s40
 ; GFX11-NEXT:    ;;#ASMEND
-; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX11-NEXT:    s_mov_b32 s4, s40
 ; GFX11-NEXT:    v_writelane_b32 v40, s30, 1
+; GFX11-NEXT:    s_mov_b32 s4, s40
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    ;;#ASMSTART
@@ -1189,9 +1270,11 @@ define amdgpu_gfx void @callee_saved_sgpr_kernel() #1 {
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 3
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -1211,11 +1294,12 @@ define amdgpu_gfx void @callee_saved_sgpr_vgpr_kernel() #1 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 3
 ; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 1
+; GFX9-NEXT:    v_writelane_b32 v42, s34, 0
 ; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX9-NEXT:    ;;#ASMSTART
@@ -1240,9 +1324,10 @@ define amdgpu_gfx void @callee_saved_sgpr_vgpr_kernel() #1 {
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 3
+; GFX9-NEXT:    v_readlane_b32 s34, v42, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -1256,24 +1341,25 @@ define amdgpu_gfx void @callee_saved_sgpr_vgpr_kernel() #1 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 3
+; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v42, s34, 0
 ; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ; def s40
 ; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 1
+; GFX10-NEXT:    s_mov_b32 s4, s40
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ; def v32
 ; GFX10-NEXT:    ;;#ASMEND
-; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-NEXT:    s_mov_b32 s4, s40
 ; GFX10-NEXT:    v_mov_b32_e32 v41, v32
 ; GFX10-NEXT:    s_getpc_b64 s[34:35]
 ; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_void at rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_void at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    ;;#ASMSTART
@@ -1286,9 +1372,11 @@ define amdgpu_gfx void @callee_saved_sgpr_vgpr_kernel() #1 {
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 3
+; GFX10-NEXT:    v_readlane_b32 s34, v42, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4
+; GFX10-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:8
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -1302,24 +1390,26 @@ define amdgpu_gfx void @callee_saved_sgpr_vgpr_kernel() #1 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:4 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:4
+; GFX11-NEXT:    scratch_store_b32 off, v42, s33 offset:8
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 3
+; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v42, s0, 0
 ; GFX11-NEXT:    scratch_store_b32 off, v41, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    ;;#ASMSTART
 ; GFX11-NEXT:    ; def s40
 ; GFX11-NEXT:    ;;#ASMEND
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 1
+; GFX11-NEXT:    s_mov_b32 s4, s40
 ; GFX11-NEXT:    ;;#ASMSTART
 ; GFX11-NEXT:    ; def v32
 ; GFX11-NEXT:    ;;#ASMEND
-; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX11-NEXT:    s_mov_b32 s4, s40
 ; GFX11-NEXT:    v_mov_b32_e32 v41, v32
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_void at rel32@lo+4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_void at rel32@hi+12
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    ;;#ASMSTART
@@ -1332,9 +1422,11 @@ define amdgpu_gfx void @callee_saved_sgpr_vgpr_kernel() #1 {
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 3
+; GFX11-NEXT:    v_readlane_b32 s0, v42, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:4 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:4
+; GFX11-NEXT:    scratch_load_b32 v42, off, s33 offset:8
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0

diff  --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
index 780d6a8680ff8e..314c07ad9b9648 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
@@ -79,7 +79,7 @@ define amdgpu_gfx void @call_i1() #0 {
 ; GFX11-LABEL: call_i1:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s2, s33
+; GFX11-NEXT:    s_mov_b32 s3, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v1, s33 ; 4-byte Folded Spill
@@ -100,7 +100,7 @@ define amdgpu_gfx void @call_i1() #0 {
 ; GFX11-NEXT:    scratch_load_b32 v1, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
-; GFX11-NEXT:    s_mov_b32 s33, s2
+; GFX11-NEXT:    s_mov_b32 s33, s3
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -184,7 +184,7 @@ define amdgpu_gfx void @call_i16() #0 {
 ; GFX11-LABEL: call_i16:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s2, s33
+; GFX11-NEXT:    s_mov_b32 s3, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v1, s33 ; 4-byte Folded Spill
@@ -205,7 +205,7 @@ define amdgpu_gfx void @call_i16() #0 {
 ; GFX11-NEXT:    scratch_load_b32 v1, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
-; GFX11-NEXT:    s_mov_b32 s33, s2
+; GFX11-NEXT:    s_mov_b32 s33, s3
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -289,7 +289,7 @@ define amdgpu_gfx void @call_2xi16() #0 {
 ; GFX11-LABEL: call_2xi16:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s2, s33
+; GFX11-NEXT:    s_mov_b32 s3, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v1, s33 ; 4-byte Folded Spill
@@ -310,7 +310,7 @@ define amdgpu_gfx void @call_2xi16() #0 {
 ; GFX11-NEXT:    scratch_load_b32 v1, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
-; GFX11-NEXT:    s_mov_b32 s33, s2
+; GFX11-NEXT:    s_mov_b32 s33, s3
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -402,7 +402,7 @@ define amdgpu_gfx void @call_3xi16() #0 {
 ; GFX11-LABEL: call_3xi16:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s2, s33
+; GFX11-NEXT:    s_mov_b32 s3, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v2, s33 ; 4-byte Folded Spill
@@ -423,7 +423,7 @@ define amdgpu_gfx void @call_3xi16() #0 {
 ; GFX11-NEXT:    scratch_load_b32 v2, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
-; GFX11-NEXT:    s_mov_b32 s33, s2
+; GFX11-NEXT:    s_mov_b32 s33, s3
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -890,7 +890,7 @@ define amdgpu_gfx void @call_100xi32() #0 {
 ; GFX11-LABEL: call_100xi32:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s2, s33
+; GFX11-NEXT:    s_mov_b32 s3, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v100, s33 offset:128 ; 4-byte Folded Spill
@@ -976,7 +976,7 @@ define amdgpu_gfx void @call_100xi32() #0 {
 ; GFX11-NEXT:    scratch_load_b32 v100, off, s33 offset:128 ; 4-byte Folded Reload
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX11-NEXT:    s_addk_i32 s32, 0xff70
-; GFX11-NEXT:    s_mov_b32 s33, s2
+; GFX11-NEXT:    s_mov_b32 s33, s3
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -2835,7 +2835,7 @@ define amdgpu_gfx void @call_72xi32() #1 {
 ; GFX9-NEXT:    s_add_i32 s33, s32, 0x7fc0
 ; GFX9-NEXT:    s_and_b32 s33, s33, 0xffff8000
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s33 offset:1536 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s33 offset:1568 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_add_i32 s32, s32, 0x28000
 ; GFX9-NEXT:    s_getpc_b64 s[34:35]
@@ -3001,28 +3001,28 @@ define amdgpu_gfx void @call_72xi32() #1 {
 ; GFX9-NEXT:    buffer_load_dword v23, off, s[0:3], s33 offset:600
 ; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s33 offset:604
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_dword v31, off, s[0:3], s33 offset:1568 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v31, off, s[0:3], s33 offset:1564 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s33 offset:608
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_dword v31, off, s[0:3], s33 offset:1564 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v31, off, s[0:3], s33 offset:1560 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s33 offset:612
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_dword v31, off, s[0:3], s33 offset:1560 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v31, off, s[0:3], s33 offset:1556 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s33 offset:616
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_dword v31, off, s[0:3], s33 offset:1556 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v31, off, s[0:3], s33 offset:1552 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s33 offset:620
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_dword v31, off, s[0:3], s33 offset:1552 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v31, off, s[0:3], s33 offset:1548 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s33 offset:624
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_dword v31, off, s[0:3], s33 offset:1548 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v31, off, s[0:3], s33 offset:1544 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s33 offset:628
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_dword v31, off, s[0:3], s33 offset:1544 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v31, off, s[0:3], s33 offset:1540 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s33 offset:632
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_dword v31, off, s[0:3], s33 offset:1540 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v31, off, s[0:3], s33 offset:1536 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32
 ; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
 ; GFX9-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:8
@@ -3072,14 +3072,14 @@ define amdgpu_gfx void @call_72xi32() #1 {
 ; GFX9-NEXT:    v_mov_b32_e32 v6, v28
 ; GFX9-NEXT:    v_mov_b32_e32 v7, v29
 ; GFX9-NEXT:    v_mov_b32_e32 v8, v30
-; GFX9-NEXT:    buffer_load_dword v24, off, s[0:3], s33 offset:1568 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v25, off, s[0:3], s33 offset:1564 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v26, off, s[0:3], s33 offset:1560 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v27, off, s[0:3], s33 offset:1556 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v28, off, s[0:3], s33 offset:1552 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v29, off, s[0:3], s33 offset:1548 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v30, off, s[0:3], s33 offset:1544 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s33 offset:1540 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v24, off, s[0:3], s33 offset:1564 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v25, off, s[0:3], s33 offset:1560 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v26, off, s[0:3], s33 offset:1556 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v27, off, s[0:3], s33 offset:1552 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v28, off, s[0:3], s33 offset:1548 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v29, off, s[0:3], s33 offset:1544 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v30, off, s[0:3], s33 offset:1540 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s33 offset:1536 ; 4-byte Folded Reload
 ; GFX9-NEXT:    v_lshrrev_b32_e64 v0, 6, s33
 ; GFX9-NEXT:    v_add_u32_e32 v0, 0x400, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 42
@@ -3103,7 +3103,7 @@ define amdgpu_gfx void @call_72xi32() #1 {
 ; GFX9-NEXT:    v_readlane_b32 s31, v33, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v33, 0
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s33 offset:1536 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s33 offset:1568 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_add_i32 s32, s32, 0xfffd8000
 ; GFX9-NEXT:    s_mov_b32 s33, s36
@@ -3117,7 +3117,7 @@ define amdgpu_gfx void @call_72xi32() #1 {
 ; GFX10-NEXT:    s_add_i32 s33, s32, 0x3fe0
 ; GFX10-NEXT:    s_and_b32 s33, s33, 0xffffc000
 ; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
-; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:1536 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:1568 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_add_i32 s32, s32, 0x14000
@@ -3262,28 +3262,28 @@ define amdgpu_gfx void @call_72xi32() #1 {
 ; GFX10-NEXT:    buffer_load_dword v8, off, s[0:3], s33 offset:796
 ; GFX10-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:516
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:1540 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:1536 ; 4-byte Folded Spill
 ; GFX10-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:520
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:1544 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:1540 ; 4-byte Folded Spill
 ; GFX10-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:524
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:1548 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:1544 ; 4-byte Folded Spill
 ; GFX10-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:528
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:1552 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:1548 ; 4-byte Folded Spill
 ; GFX10-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:532
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:1556 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:1552 ; 4-byte Folded Spill
 ; GFX10-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:536
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:1560 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:1556 ; 4-byte Folded Spill
 ; GFX10-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:540
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:1564 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:1560 ; 4-byte Folded Spill
 ; GFX10-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:544
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:1568 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:1564 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_clause 0x15
 ; GFX10-NEXT:    buffer_load_dword v10, off, s[0:3], s33 offset:548
 ; GFX10-NEXT:    buffer_load_dword v11, off, s[0:3], s33 offset:552
@@ -3350,14 +3350,14 @@ define amdgpu_gfx void @call_72xi32() #1 {
 ; GFX10-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:156
 ; GFX10-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:160
 ; GFX10-NEXT:    s_clause 0x7
-; GFX10-NEXT:    buffer_load_dword v2, off, s[0:3], s33 offset:1540
-; GFX10-NEXT:    buffer_load_dword v3, off, s[0:3], s33 offset:1544
-; GFX10-NEXT:    buffer_load_dword v4, off, s[0:3], s33 offset:1548
-; GFX10-NEXT:    buffer_load_dword v5, off, s[0:3], s33 offset:1552
-; GFX10-NEXT:    buffer_load_dword v6, off, s[0:3], s33 offset:1556
-; GFX10-NEXT:    buffer_load_dword v7, off, s[0:3], s33 offset:1560
-; GFX10-NEXT:    buffer_load_dword v8, off, s[0:3], s33 offset:1564
-; GFX10-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:1568
+; GFX10-NEXT:    buffer_load_dword v2, off, s[0:3], s33 offset:1536
+; GFX10-NEXT:    buffer_load_dword v3, off, s[0:3], s33 offset:1540
+; GFX10-NEXT:    buffer_load_dword v4, off, s[0:3], s33 offset:1544
+; GFX10-NEXT:    buffer_load_dword v5, off, s[0:3], s33 offset:1548
+; GFX10-NEXT:    buffer_load_dword v6, off, s[0:3], s33 offset:1552
+; GFX10-NEXT:    buffer_load_dword v7, off, s[0:3], s33 offset:1556
+; GFX10-NEXT:    buffer_load_dword v8, off, s[0:3], s33 offset:1560
+; GFX10-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:1564
 ; GFX10-NEXT:    v_lshrrev_b32_e64 v0, 5, s33
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 42
 ; GFX10-NEXT:    v_add_nc_u32_e32 v0, 0x400, v0
@@ -3381,7 +3381,7 @@ define amdgpu_gfx void @call_72xi32() #1 {
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:1536 ; 4-byte Folded Reload
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:1568 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_add_i32 s32, s32, 0xfffec000
@@ -3397,7 +3397,7 @@ define amdgpu_gfx void @call_72xi32() #1 {
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_b32 s33, s33, 0xfffffe00
 ; GFX11-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:1536 ; 4-byte Folded Spill
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:1600 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX11-NEXT:    s_mov_b32 s0, 0
 ; GFX11-NEXT:    v_mov_b32_e32 v4, 0
@@ -3495,7 +3495,7 @@ define amdgpu_gfx void @call_72xi32() #1 {
 ; GFX11-NEXT:    s_waitcnt vmcnt(4)
 ; GFX11-NEXT:    v_dual_mov_b32 v44, v62 :: v_dual_mov_b32 v57, v16
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    scratch_store_b128 off, v[12:15], s33 offset:1588 ; 16-byte Folded Spill
+; GFX11-NEXT:    scratch_store_b128 off, v[12:15], s33 offset:1584 ; 16-byte Folded Spill
 ; GFX11-NEXT:    s_clause 0x3
 ; GFX11-NEXT:    scratch_load_b128 v[12:15], off, s33 offset:528
 ; GFX11-NEXT:    scratch_load_b128 v[20:23], off, s33 offset:544
@@ -3511,13 +3511,13 @@ define amdgpu_gfx void @call_72xi32() #1 {
 ; GFX11-NEXT:    s_waitcnt vmcnt(1)
 ; GFX11-NEXT:    v_dual_mov_b32 v10, v21 :: v_dual_mov_b32 v15, v26
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    scratch_store_b128 off, v[28:31], s33 offset:1572 ; 16-byte Folded Spill
+; GFX11-NEXT:    scratch_store_b128 off, v[28:31], s33 offset:1568 ; 16-byte Folded Spill
 ; GFX11-NEXT:    scratch_load_b128 v[28:31], off, s33 offset:592
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    scratch_store_b128 off, v[28:31], s33 offset:1556 ; 16-byte Folded Spill
+; GFX11-NEXT:    scratch_store_b128 off, v[28:31], s33 offset:1552 ; 16-byte Folded Spill
 ; GFX11-NEXT:    scratch_load_b128 v[28:31], off, s33 offset:608
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    scratch_store_b128 off, v[28:31], s33 offset:1540 ; 16-byte Folded Spill
+; GFX11-NEXT:    scratch_store_b128 off, v[28:31], s33 offset:1536 ; 16-byte Folded Spill
 ; GFX11-NEXT:    scratch_store_b128 off, v[32:35], s32
 ; GFX11-NEXT:    v_mov_b32_e32 v32, v36
 ; GFX11-NEXT:    v_dual_mov_b32 v33, v48 :: v_dual_mov_b32 v34, v49
@@ -3560,13 +3560,13 @@ define amdgpu_gfx void @call_72xi32() #1 {
 ; GFX11-NEXT:    scratch_store_b128 off, v[36:39], s0
 ; GFX11-NEXT:    s_add_i32 s0, s32, 16
 ; GFX11-NEXT:    scratch_store_b128 off, v[32:35], s0
-; GFX11-NEXT:    scratch_load_b128 v[1:4], off, s33 offset:1588 ; 16-byte Folded Reload
+; GFX11-NEXT:    scratch_load_b128 v[1:4], off, s33 offset:1584 ; 16-byte Folded Reload
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 42
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    scratch_load_b128 v[17:20], off, s33 offset:1572
-; GFX11-NEXT:    scratch_load_b128 v[21:24], off, s33 offset:1556
-; GFX11-NEXT:    scratch_load_b128 v[25:28], off, s33 offset:1540
+; GFX11-NEXT:    scratch_load_b128 v[17:20], off, s33 offset:1568
+; GFX11-NEXT:    scratch_load_b128 v[21:24], off, s33 offset:1552
+; GFX11-NEXT:    scratch_load_b128 v[25:28], off, s33 offset:1536
 ; GFX11-NEXT:    s_add_i32 s0, s33, 0x400
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s0
@@ -3590,7 +3590,7 @@ define amdgpu_gfx void @call_72xi32() #1 {
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:1536 ; 4-byte Folded Reload
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:1600 ; 4-byte Folded Reload
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX11-NEXT:    s_addk_i32 s32, 0xf600
 ; GFX11-NEXT:    s_mov_b32 s33, s45

diff  --git a/llvm/test/CodeGen/AMDGPU/indirect-call.ll b/llvm/test/CodeGen/AMDGPU/indirect-call.ll
index 1eebc8e7953e3e..fc5af655f93edd 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-call.ll
@@ -397,8 +397,9 @@ define void @test_indirect_call_vgpr_ptr(ptr %fptr) {
 ; GCN-NEXT:    s_mov_b32 s33, s32
 ; GCN-NEXT:    s_or_saveexec_b64 s[18:19], -1
 ; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 exec, s[18:19]
-; GCN-NEXT:    v_writelane_b32 v40, s16, 18
+; GCN-NEXT:    v_writelane_b32 v41, s16, 0
 ; GCN-NEXT:    s_addk_i32 s32, 0x400
 ; GCN-NEXT:    v_writelane_b32 v40, s30, 0
 ; GCN-NEXT:    v_writelane_b32 v40, s31, 1
@@ -465,9 +466,10 @@ define void @test_indirect_call_vgpr_ptr(ptr %fptr) {
 ; GCN-NEXT:    v_readlane_b32 s34, v40, 2
 ; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v40, 0
-; GCN-NEXT:    v_readlane_b32 s4, v40, 18
+; GCN-NEXT:    v_readlane_b32 s4, v41, 0
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_mov_b64 exec, s[6:7]
 ; GCN-NEXT:    s_addk_i32 s32, 0xfc00
 ; GCN-NEXT:    s_mov_b32 s33, s4
@@ -481,8 +483,9 @@ define void @test_indirect_call_vgpr_ptr(ptr %fptr) {
 ; GISEL-NEXT:    s_mov_b32 s33, s32
 ; GISEL-NEXT:    s_or_saveexec_b64 s[18:19], -1
 ; GISEL-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GISEL-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GISEL-NEXT:    s_mov_b64 exec, s[18:19]
-; GISEL-NEXT:    v_writelane_b32 v40, s16, 18
+; GISEL-NEXT:    v_writelane_b32 v41, s16, 0
 ; GISEL-NEXT:    s_addk_i32 s32, 0x400
 ; GISEL-NEXT:    v_writelane_b32 v40, s30, 0
 ; GISEL-NEXT:    v_writelane_b32 v40, s31, 1
@@ -549,9 +552,10 @@ define void @test_indirect_call_vgpr_ptr(ptr %fptr) {
 ; GISEL-NEXT:    v_readlane_b32 s34, v40, 2
 ; GISEL-NEXT:    v_readlane_b32 s31, v40, 1
 ; GISEL-NEXT:    v_readlane_b32 s30, v40, 0
-; GISEL-NEXT:    v_readlane_b32 s4, v40, 18
+; GISEL-NEXT:    v_readlane_b32 s4, v41, 0
 ; GISEL-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GISEL-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GISEL-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GISEL-NEXT:    s_mov_b64 exec, s[6:7]
 ; GISEL-NEXT:    s_addk_i32 s32, 0xfc00
 ; GISEL-NEXT:    s_mov_b32 s33, s4
@@ -569,8 +573,9 @@ define void @test_indirect_call_vgpr_ptr_arg(ptr %fptr) {
 ; GCN-NEXT:    s_mov_b32 s33, s32
 ; GCN-NEXT:    s_or_saveexec_b64 s[18:19], -1
 ; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 exec, s[18:19]
-; GCN-NEXT:    v_writelane_b32 v40, s16, 18
+; GCN-NEXT:    v_writelane_b32 v41, s16, 0
 ; GCN-NEXT:    s_addk_i32 s32, 0x400
 ; GCN-NEXT:    v_writelane_b32 v40, s30, 0
 ; GCN-NEXT:    v_writelane_b32 v40, s31, 1
@@ -640,9 +645,10 @@ define void @test_indirect_call_vgpr_ptr_arg(ptr %fptr) {
 ; GCN-NEXT:    v_readlane_b32 s34, v40, 2
 ; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v40, 0
-; GCN-NEXT:    v_readlane_b32 s4, v40, 18
+; GCN-NEXT:    v_readlane_b32 s4, v41, 0
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_mov_b64 exec, s[6:7]
 ; GCN-NEXT:    s_addk_i32 s32, 0xfc00
 ; GCN-NEXT:    s_mov_b32 s33, s4
@@ -656,8 +662,9 @@ define void @test_indirect_call_vgpr_ptr_arg(ptr %fptr) {
 ; GISEL-NEXT:    s_mov_b32 s33, s32
 ; GISEL-NEXT:    s_or_saveexec_b64 s[18:19], -1
 ; GISEL-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GISEL-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GISEL-NEXT:    s_mov_b64 exec, s[18:19]
-; GISEL-NEXT:    v_writelane_b32 v40, s16, 18
+; GISEL-NEXT:    v_writelane_b32 v41, s16, 0
 ; GISEL-NEXT:    s_addk_i32 s32, 0x400
 ; GISEL-NEXT:    v_writelane_b32 v40, s30, 0
 ; GISEL-NEXT:    v_writelane_b32 v40, s31, 1
@@ -725,9 +732,10 @@ define void @test_indirect_call_vgpr_ptr_arg(ptr %fptr) {
 ; GISEL-NEXT:    v_readlane_b32 s34, v40, 2
 ; GISEL-NEXT:    v_readlane_b32 s31, v40, 1
 ; GISEL-NEXT:    v_readlane_b32 s30, v40, 0
-; GISEL-NEXT:    v_readlane_b32 s4, v40, 18
+; GISEL-NEXT:    v_readlane_b32 s4, v41, 0
 ; GISEL-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GISEL-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GISEL-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GISEL-NEXT:    s_mov_b64 exec, s[6:7]
 ; GISEL-NEXT:    s_addk_i32 s32, 0xfc00
 ; GISEL-NEXT:    s_mov_b32 s33, s4
@@ -745,8 +753,9 @@ define i32 @test_indirect_call_vgpr_ptr_ret(ptr %fptr) {
 ; GCN-NEXT:    s_mov_b32 s33, s32
 ; GCN-NEXT:    s_or_saveexec_b64 s[18:19], -1
 ; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 exec, s[18:19]
-; GCN-NEXT:    v_writelane_b32 v40, s16, 18
+; GCN-NEXT:    v_writelane_b32 v41, s16, 0
 ; GCN-NEXT:    s_addk_i32 s32, 0x400
 ; GCN-NEXT:    v_writelane_b32 v40, s30, 0
 ; GCN-NEXT:    v_writelane_b32 v40, s31, 1
@@ -815,9 +824,10 @@ define i32 @test_indirect_call_vgpr_ptr_ret(ptr %fptr) {
 ; GCN-NEXT:    v_readlane_b32 s34, v40, 2
 ; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v40, 0
-; GCN-NEXT:    v_readlane_b32 s4, v40, 18
+; GCN-NEXT:    v_readlane_b32 s4, v41, 0
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_mov_b64 exec, s[6:7]
 ; GCN-NEXT:    s_addk_i32 s32, 0xfc00
 ; GCN-NEXT:    s_mov_b32 s33, s4
@@ -831,8 +841,9 @@ define i32 @test_indirect_call_vgpr_ptr_ret(ptr %fptr) {
 ; GISEL-NEXT:    s_mov_b32 s33, s32
 ; GISEL-NEXT:    s_or_saveexec_b64 s[18:19], -1
 ; GISEL-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GISEL-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GISEL-NEXT:    s_mov_b64 exec, s[18:19]
-; GISEL-NEXT:    v_writelane_b32 v40, s16, 18
+; GISEL-NEXT:    v_writelane_b32 v41, s16, 0
 ; GISEL-NEXT:    s_addk_i32 s32, 0x400
 ; GISEL-NEXT:    v_writelane_b32 v40, s30, 0
 ; GISEL-NEXT:    v_writelane_b32 v40, s31, 1
@@ -901,9 +912,10 @@ define i32 @test_indirect_call_vgpr_ptr_ret(ptr %fptr) {
 ; GISEL-NEXT:    v_readlane_b32 s34, v40, 2
 ; GISEL-NEXT:    v_readlane_b32 s31, v40, 1
 ; GISEL-NEXT:    v_readlane_b32 s30, v40, 0
-; GISEL-NEXT:    v_readlane_b32 s4, v40, 18
+; GISEL-NEXT:    v_readlane_b32 s4, v41, 0
 ; GISEL-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GISEL-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GISEL-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GISEL-NEXT:    s_mov_b64 exec, s[6:7]
 ; GISEL-NEXT:    s_addk_i32 s32, 0xfc00
 ; GISEL-NEXT:    s_mov_b32 s33, s4
@@ -922,8 +934,9 @@ define void @test_indirect_call_vgpr_ptr_in_branch(ptr %fptr, i1 %cond) {
 ; GCN-NEXT:    s_mov_b32 s33, s32
 ; GCN-NEXT:    s_or_saveexec_b64 s[18:19], -1
 ; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 exec, s[18:19]
-; GCN-NEXT:    v_writelane_b32 v40, s16, 20
+; GCN-NEXT:    v_writelane_b32 v41, s16, 0
 ; GCN-NEXT:    s_addk_i32 s32, 0x400
 ; GCN-NEXT:    v_writelane_b32 v40, s30, 0
 ; GCN-NEXT:    v_writelane_b32 v40, s31, 1
@@ -1001,9 +1014,10 @@ define void @test_indirect_call_vgpr_ptr_in_branch(ptr %fptr, i1 %cond) {
 ; GCN-NEXT:    v_readlane_b32 s34, v40, 2
 ; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v40, 0
-; GCN-NEXT:    v_readlane_b32 s4, v40, 20
+; GCN-NEXT:    v_readlane_b32 s4, v41, 0
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_mov_b64 exec, s[6:7]
 ; GCN-NEXT:    s_addk_i32 s32, 0xfc00
 ; GCN-NEXT:    s_mov_b32 s33, s4
@@ -1017,8 +1031,9 @@ define void @test_indirect_call_vgpr_ptr_in_branch(ptr %fptr, i1 %cond) {
 ; GISEL-NEXT:    s_mov_b32 s33, s32
 ; GISEL-NEXT:    s_or_saveexec_b64 s[18:19], -1
 ; GISEL-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GISEL-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GISEL-NEXT:    s_mov_b64 exec, s[18:19]
-; GISEL-NEXT:    v_writelane_b32 v40, s16, 20
+; GISEL-NEXT:    v_writelane_b32 v41, s16, 0
 ; GISEL-NEXT:    s_addk_i32 s32, 0x400
 ; GISEL-NEXT:    v_writelane_b32 v40, s30, 0
 ; GISEL-NEXT:    v_writelane_b32 v40, s31, 1
@@ -1096,9 +1111,10 @@ define void @test_indirect_call_vgpr_ptr_in_branch(ptr %fptr, i1 %cond) {
 ; GISEL-NEXT:    v_readlane_b32 s34, v40, 2
 ; GISEL-NEXT:    v_readlane_b32 s31, v40, 1
 ; GISEL-NEXT:    v_readlane_b32 s30, v40, 0
-; GISEL-NEXT:    v_readlane_b32 s4, v40, 20
+; GISEL-NEXT:    v_readlane_b32 s4, v41, 0
 ; GISEL-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GISEL-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GISEL-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GISEL-NEXT:    s_mov_b64 exec, s[6:7]
 ; GISEL-NEXT:    s_addk_i32 s32, 0xfc00
 ; GISEL-NEXT:    s_mov_b32 s33, s4
@@ -1311,7 +1327,7 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, ptr %fptr) {
 ; GCN-LABEL: test_indirect_call_vgpr_ptr_arg_and_reuse:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s10, s33
+; GCN-NEXT:    s_mov_b32 s12, s33
 ; GCN-NEXT:    s_mov_b32 s33, s32
 ; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
@@ -1402,14 +1418,14 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, ptr %fptr) {
 ; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    s_addk_i32 s32, 0xfc00
-; GCN-NEXT:    s_mov_b32 s33, s10
+; GCN-NEXT:    s_mov_b32 s33, s12
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_indirect_call_vgpr_ptr_arg_and_reuse:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s10, s33
+; GISEL-NEXT:    s_mov_b32 s12, s33
 ; GISEL-NEXT:    s_mov_b32 s33, s32
 ; GISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GISEL-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
@@ -1500,7 +1516,7 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, ptr %fptr) {
 ; GISEL-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GISEL-NEXT:    s_mov_b64 exec, s[4:5]
 ; GISEL-NEXT:    s_addk_i32 s32, 0xfc00
-; GISEL-NEXT:    s_mov_b32 s33, s10
+; GISEL-NEXT:    s_mov_b32 s33, s12
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   call amdgpu_gfx void %fptr(i32 %i)
@@ -1515,7 +1531,7 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_return(i32 %i, ptr %fptr) {
 ; GCN-LABEL: test_indirect_call_vgpr_ptr_arg_and_return:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s10, s33
+; GCN-NEXT:    s_mov_b32 s12, s33
 ; GCN-NEXT:    s_mov_b32 s33, s32
 ; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
@@ -1604,14 +1620,14 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_return(i32 %i, ptr %fptr) {
 ; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    s_addk_i32 s32, 0xfc00
-; GCN-NEXT:    s_mov_b32 s33, s10
+; GCN-NEXT:    s_mov_b32 s33, s12
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_indirect_call_vgpr_ptr_arg_and_return:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s10, s33
+; GISEL-NEXT:    s_mov_b32 s12, s33
 ; GISEL-NEXT:    s_mov_b32 s33, s32
 ; GISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GISEL-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
@@ -1700,7 +1716,7 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_return(i32 %i, ptr %fptr) {
 ; GISEL-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GISEL-NEXT:    s_mov_b64 exec, s[4:5]
 ; GISEL-NEXT:    s_addk_i32 s32, 0xfc00
-; GISEL-NEXT:    s_mov_b32 s33, s10
+; GISEL-NEXT:    s_mov_b32 s33, s12
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %ret = call amdgpu_gfx i32 %fptr(i32 %i)
@@ -1712,7 +1728,7 @@ define void @test_indirect_tail_call_vgpr_ptr(ptr %fptr) {
 ; GCN-LABEL: test_indirect_tail_call_vgpr_ptr:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s10, s33
+; GCN-NEXT:    s_mov_b32 s12, s33
 ; GCN-NEXT:    s_mov_b32 s33, s32
 ; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
@@ -1798,14 +1814,14 @@ define void @test_indirect_tail_call_vgpr_ptr(ptr %fptr) {
 ; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    s_addk_i32 s32, 0xfc00
-; GCN-NEXT:    s_mov_b32 s33, s10
+; GCN-NEXT:    s_mov_b32 s33, s12
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_indirect_tail_call_vgpr_ptr:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s10, s33
+; GISEL-NEXT:    s_mov_b32 s12, s33
 ; GISEL-NEXT:    s_mov_b32 s33, s32
 ; GISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GISEL-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
@@ -1891,7 +1907,7 @@ define void @test_indirect_tail_call_vgpr_ptr(ptr %fptr) {
 ; GISEL-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GISEL-NEXT:    s_mov_b64 exec, s[4:5]
 ; GISEL-NEXT:    s_addk_i32 s32, 0xfc00
-; GISEL-NEXT:    s_mov_b32 s33, s10
+; GISEL-NEXT:    s_mov_b32 s33, s12
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   tail call amdgpu_gfx void %fptr()

diff  --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
index 760601d3c70e68..3aa05207c62f94 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
@@ -7,7 +7,7 @@ define void @f0() {
 ; GFX11-LABEL: f0:
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s2, s33
+; GFX11-NEXT:    s_mov_b32 s3, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v4, s33 ; 4-byte Folded Spill
@@ -28,7 +28,7 @@ define void @f0() {
 ; GFX11-NEXT:    scratch_load_b32 v4, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
-; GFX11-NEXT:    s_mov_b32 s33, s2
+; GFX11-NEXT:    s_mov_b32 s33, s3
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 bb:
@@ -53,20 +53,20 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg
 ; GFX11-LABEL: f2:
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    s_mov_b64 s[16:17], s[4:5]
+; GFX11-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX11-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX11-NEXT:    s_load_b32 s2, s[16:17], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v31, v0
-; GFX11-NEXT:    s_load_b32 s24, s[16:17], 0x24
 ; GFX11-NEXT:    s_mov_b32 s18, s14
 ; GFX11-NEXT:    s_mov_b32 s12, s13
-; GFX11-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
-; GFX11-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX11-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX11-NEXT:    s_mov_b32 s3, 0
+; GFX11-NEXT:    s_mov_b32 s20, 0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
 ; GFX11-NEXT:    s_mov_b32 s0, -1
 ; GFX11-NEXT:    s_mov_b32 s19, exec_lo
 ; GFX11-NEXT:    s_mov_b32 s32, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_mul_lo_u32 v0, s24, v0
+; GFX11-NEXT:    v_mul_lo_u32 v0, s2, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX11-NEXT:    s_cbranch_execz .LBB2_13
@@ -74,7 +74,7 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg
 ; GFX11-NEXT:    s_load_b128 s[20:23], s[16:17], 0x2c
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_bitcmp1_b32 s21, 0
-; GFX11-NEXT:    s_cselect_b32 s25, -1, 0
+; GFX11-NEXT:    s_cselect_b32 s24, -1, 0
 ; GFX11-NEXT:    s_bitcmp0_b32 s21, 0
 ; GFX11-NEXT:    s_mov_b32 s21, 0
 ; GFX11-NEXT:    s_cbranch_scc0 .LBB2_3
@@ -90,41 +90,40 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_mov_b32 s1, -1
-; GFX11-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s3
-; GFX11-NEXT:    s_cbranch_vccz .LBB2_4
+; GFX11-NEXT:    s_cbranch_execz .LBB2_4
 ; GFX11-NEXT:    s_branch .LBB2_12
 ; GFX11-NEXT:  .LBB2_3:
 ; GFX11-NEXT:    s_mov_b32 s1, 0
 ; GFX11-NEXT:  .LBB2_4: ; %bb16
-; GFX11-NEXT:    s_load_b32 s2, s[16:17], 0x54
+; GFX11-NEXT:    s_load_b32 s3, s[16:17], 0x54
 ; GFX11-NEXT:    s_bitcmp1_b32 s23, 0
 ; GFX11-NEXT:    s_cselect_b32 s0, -1, 0
-; GFX11-NEXT:    s_and_b32 s3, s23, 1
+; GFX11-NEXT:    s_and_b32 s9, s23, 1
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_bitcmp1_b32 s2, 0
-; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_bitcmp1_b32 s3, 0
+; GFX11-NEXT:    s_mov_b32 s3, -1
 ; GFX11-NEXT:    s_cselect_b32 s8, -1, 0
-; GFX11-NEXT:    s_cmp_eq_u32 s3, 0
+; GFX11-NEXT:    s_cmp_eq_u32 s9, 0
 ; GFX11-NEXT:    s_cbranch_scc0 .LBB2_8
 ; GFX11-NEXT:  ; %bb.5: ; %bb18.preheader
 ; GFX11-NEXT:    s_load_b128 s[28:31], s[16:17], 0x44
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_mul_hi_u32 s2, s29, s28
-; GFX11-NEXT:    s_mul_i32 s3, s29, s28
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_alignbit_b32 v0, s2, s3, 1
-; GFX11-NEXT:    s_mov_b32 s3, 0
-; GFX11-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s25
+; GFX11-NEXT:    s_mul_hi_u32 s3, s29, s28
+; GFX11-NEXT:    s_mul_i32 s9, s29, s28
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_alignbit_b32 v0, s3, s9, 1
+; GFX11-NEXT:    v_readfirstlane_b32 s3, v0
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s24
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_or_b32 s2, s2, 1
-; GFX11-NEXT:    s_lshr_b32 s2, s2, s30
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_mul_i32 s2, s2, s22
-; GFX11-NEXT:    s_mul_i32 s2, s2, s20
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_or_b32 s2, s24, s2
+; GFX11-NEXT:    s_or_b32 s3, s3, 1
+; GFX11-NEXT:    s_lshr_b32 s3, s3, s30
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_mul_i32 s9, s3, s22
+; GFX11-NEXT:    s_mov_b32 s3, 0
+; GFX11-NEXT:    s_mul_i32 s9, s9, s20
+; GFX11-NEXT:    s_or_b32 s2, s2, s9
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_lshl_b64 s[22:23], s[2:3], 1
 ; GFX11-NEXT:    global_load_u16 v2, v1, s[22:23]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
@@ -152,10 +151,10 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg
 ; GFX11-NEXT:    s_or_b32 s3, s2, s3
 ; GFX11-NEXT:    s_cbranch_vccz .LBB2_6
 ; GFX11-NEXT:  ; %bb.7: ; %Flow
-; GFX11-NEXT:    s_mov_b32 s2, 0
+; GFX11-NEXT:    s_mov_b32 s3, 0
 ; GFX11-NEXT:  .LBB2_8: ; %Flow12
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_b32 vcc_lo, exec_lo, s2
+; GFX11-NEXT:    s_and_b32 vcc_lo, exec_lo, s3
 ; GFX11-NEXT:    s_cbranch_vccz .LBB2_12
 ; GFX11-NEXT:  ; %bb.9:
 ; GFX11-NEXT:    s_xor_b32 s0, s8, -1
@@ -167,11 +166,11 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg
 ; GFX11-NEXT:  ; %bb.11: ; %Flow6
 ; GFX11-NEXT:    s_mov_b32 s21, -1
 ; GFX11-NEXT:  .LBB2_12: ; %Flow11
-; GFX11-NEXT:    s_and_b32 s3, s1, exec_lo
+; GFX11-NEXT:    s_and_b32 s20, s1, exec_lo
 ; GFX11-NEXT:    s_or_not1_b32 s0, s21, exec_lo
 ; GFX11-NEXT:  .LBB2_13: ; %Flow9
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s19
-; GFX11-NEXT:    s_and_saveexec_b32 s19, s0
+; GFX11-NEXT:    s_and_saveexec_b32 s2, s0
 ; GFX11-NEXT:    s_cbranch_execz .LBB2_15
 ; GFX11-NEXT:  ; %bb.14: ; %bb43
 ; GFX11-NEXT:    s_add_u32 s8, s16, 0x58
@@ -184,10 +183,10 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg
 ; GFX11-NEXT:    s_mov_b32 s14, s15
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    s_or_b32 s3, s3, exec_lo
+; GFX11-NEXT:    s_or_b32 s20, s20, exec_lo
 ; GFX11-NEXT:  .LBB2_15: ; %Flow14
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s19
-; GFX11-NEXT:    s_and_saveexec_b32 s0, s3
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s2
+; GFX11-NEXT:    s_and_saveexec_b32 s0, s20
 ; GFX11-NEXT:  ; %bb.16: ; %UnifiedUnreachableBlock
 ; GFX11-NEXT:    ; divergent unreachable
 ; GFX11-NEXT:  ; %bb.17: ; %UnifiedReturnBlock

diff  --git a/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll b/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll
index 8607535dd849cd..f196192f723f94 100644
--- a/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll
@@ -13,30 +13,17 @@ define amdgpu_kernel void @test_kernel(i32 %val) #0 {
 ; CHECK-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
 ; CHECK-NEXT:    s_add_u32 s0, s0, s17
 ; CHECK-NEXT:    s_addc_u32 s1, s1, 0
-; CHECK-NEXT:    ; implicit-def: $vgpr3
-; CHECK-NEXT:    v_writelane_b32 v3, s16, 0
-; CHECK-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; CHECK-NEXT:    s_add_i32 s12, s33, 0x100200
-; CHECK-NEXT:    buffer_store_dword v3, off, s[0:3], s12 ; 4-byte Folded Spill
-; CHECK-NEXT:    s_mov_b64 exec, s[34:35]
+; CHECK-NEXT:    v_writelane_b32 v40, s16, 0
 ; CHECK-NEXT:    s_mov_b32 s13, s15
 ; CHECK-NEXT:    s_mov_b32 s12, s14
-; CHECK-NEXT:    v_readlane_b32 s14, v3, 0
+; CHECK-NEXT:    v_readlane_b32 s14, v40, 0
 ; CHECK-NEXT:    s_mov_b64 s[16:17], s[8:9]
 ; CHECK-NEXT:    v_mov_b32_e32 v3, v2
 ; CHECK-NEXT:    v_mov_b32_e32 v2, v1
 ; CHECK-NEXT:    v_mov_b32_e32 v1, v0
-; CHECK-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; CHECK-NEXT:    s_add_i32 s8, s33, 0x100200
-; CHECK-NEXT:    buffer_load_dword v0, off, s[0:3], s8 ; 4-byte Folded Reload
-; CHECK-NEXT:    s_mov_b64 exec, s[34:35]
 ; CHECK-NEXT:    s_load_dword s8, s[16:17], 0x0
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_writelane_b32 v0, s8, 1
-; CHECK-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; CHECK-NEXT:    s_add_i32 s8, s33, 0x100200
-; CHECK-NEXT:    buffer_store_dword v0, off, s[0:3], s8 ; 4-byte Folded Spill
-; CHECK-NEXT:    s_mov_b64 exec, s[34:35]
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_writelane_b32 v40, s8, 1
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def vgpr10
 ; CHECK-NEXT:    ;;#ASMEND
@@ -69,14 +56,9 @@ define amdgpu_kernel void @test_kernel(i32 %val) #0 {
 ; CHECK-NEXT:    s_mov_b64 s[2:3], s[22:23]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; CHECK-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; CHECK-NEXT:    s_add_i32 s4, s33, 0x100200
-; CHECK-NEXT:    buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload
-; CHECK-NEXT:    s_mov_b64 exec, s[34:35]
 ; CHECK-NEXT:    s_add_i32 s4, s33, 0x100100
 ; CHECK-NEXT:    buffer_load_dword v10, off, s[0:3], s4 ; 4-byte Folded Reload
-; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    v_readlane_b32 s4, v0, 1
+; CHECK-NEXT:    v_readlane_b32 s4, v40, 1
 ; CHECK-NEXT:    s_mov_b32 s5, 0
 ; CHECK-NEXT:    s_cmp_eq_u32 s4, s5
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0x4000
@@ -84,24 +66,14 @@ define amdgpu_kernel void @test_kernel(i32 %val) #0 {
 ; CHECK-NEXT:    buffer_store_dword v10, v0, s[0:3], s33 offen ; 4-byte Folded Spill
 ; CHECK-NEXT:    s_cbranch_scc1 .LBB0_2
 ; CHECK-NEXT:  ; %bb.1: ; %store
-; CHECK-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; CHECK-NEXT:    s_add_i32 s4, s33, 0x100200
-; CHECK-NEXT:    buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload
-; CHECK-NEXT:    s_mov_b64 exec, s[34:35]
 ; CHECK-NEXT:    s_add_i32 s4, s33, 0x100000
-; CHECK-NEXT:    buffer_load_dword v2, off, s[0:3], s4 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v1, off, s[0:3], s4 ; 4-byte Folded Reload
 ; CHECK-NEXT:    ; implicit-def: $sgpr4
-; CHECK-NEXT:    v_mov_b32_e32 v1, s4
+; CHECK-NEXT:    v_mov_b32_e32 v0, s4
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    ds_write_b32 v1, v2
-; CHECK-NEXT:    ; kill: killed $vgpr0
+; CHECK-NEXT:    ds_write_b32 v0, v1
 ; CHECK-NEXT:    s_endpgm
 ; CHECK-NEXT:  .LBB0_2: ; %end
-; CHECK-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; CHECK-NEXT:    s_add_i32 s4, s33, 0x100200
-; CHECK-NEXT:    buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload
-; CHECK-NEXT:    s_mov_b64 exec, s[34:35]
-; CHECK-NEXT:    ; kill: killed $vgpr0
 ; CHECK-NEXT:    s_endpgm
   %arr = alloca < 1339 x i32>, align 8192, addrspace(5)
   %cmp = icmp ne i32 %val, 0

diff  --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index e96c8a9fafa4c6..63821c01e5b3c5 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -116,7 +116,6 @@
 ; GCN-O0-NEXT:        Fast Register Allocator
 ; GCN-O0-NEXT:        SI lower SGPR spill instructions
 ; GCN-O0-NEXT:        Fast Register Allocator
-; GCN-O0-NEXT:        SI Lower WWM Copies
 ; GCN-O0-NEXT:        SI Fix VGPR copies
 ; GCN-O0-NEXT:        Remove Redundant DEBUG_VALUE analysis
 ; GCN-O0-NEXT:        Fixup Statepoint Caller Saved
@@ -361,7 +360,6 @@
 ; GCN-O1-NEXT:        Virtual Register Map
 ; GCN-O1-NEXT:        Live Register Matrix
 ; GCN-O1-NEXT:        Greedy Register Allocator
-; GCN-O1-NEXT:        SI Lower WWM Copies
 ; GCN-O1-NEXT:        GCN NSA Reassign
 ; GCN-O1-NEXT:        Virtual Register Rewriter
 ; GCN-O1-NEXT:        Stack Slot Coloring
@@ -655,7 +653,6 @@
 ; GCN-O1-OPTS-NEXT:        Virtual Register Map
 ; GCN-O1-OPTS-NEXT:        Live Register Matrix
 ; GCN-O1-OPTS-NEXT:        Greedy Register Allocator
-; GCN-O1-OPTS-NEXT:        SI Lower WWM Copies
 ; GCN-O1-OPTS-NEXT:        GCN NSA Reassign
 ; GCN-O1-OPTS-NEXT:        Virtual Register Rewriter
 ; GCN-O1-OPTS-NEXT:        Stack Slot Coloring
@@ -959,7 +956,6 @@
 ; GCN-O2-NEXT:        Virtual Register Map
 ; GCN-O2-NEXT:        Live Register Matrix
 ; GCN-O2-NEXT:        Greedy Register Allocator
-; GCN-O2-NEXT:        SI Lower WWM Copies
 ; GCN-O2-NEXT:        GCN NSA Reassign
 ; GCN-O2-NEXT:        Virtual Register Rewriter
 ; GCN-O2-NEXT:        Stack Slot Coloring
@@ -1275,7 +1271,6 @@
 ; GCN-O3-NEXT:        Virtual Register Map
 ; GCN-O3-NEXT:        Live Register Matrix
 ; GCN-O3-NEXT:        Greedy Register Allocator
-; GCN-O3-NEXT:        SI Lower WWM Copies
 ; GCN-O3-NEXT:        GCN NSA Reassign
 ; GCN-O3-NEXT:        Virtual Register Rewriter
 ; GCN-O3-NEXT:        Stack Slot Coloring

diff  --git a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll
index f660d8df840522..9b596205851c3c 100644
--- a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll
+++ b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll
@@ -141,108 +141,82 @@ define float @mubuf_vgpr(<4 x i32> %i, i32 %c) #0 {
 ; W64-O0:       ; %bb.0:
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; W64-O0-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; W64-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; W64-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    s_mov_b64 exec, s[4:5]
-; W64-O0-NEXT:    ; implicit-def: $vgpr5
-; W64-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; W64-O0-NEXT:    v_mov_b32_e32 v5, v3
+; W64-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; W64-O0-NEXT:    v_mov_b32_e32 v4, v3
 ; W64-O0-NEXT:    v_mov_b32_e32 v6, v2
 ; W64-O0-NEXT:    v_mov_b32_e32 v7, v1
-; W64-O0-NEXT:    v_mov_b32_e32 v1, v0
-; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
-; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
 ; W64-O0-NEXT:    ; implicit-def: $sgpr4
 ; W64-O0-NEXT:    ; implicit-def: $sgpr4
 ; W64-O0-NEXT:    ; implicit-def: $sgpr4
 ; W64-O0-NEXT:    ; implicit-def: $sgpr4
-; W64-O0-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2_vgpr3_vgpr4 killed $exec
-; W64-O0-NEXT:    v_mov_b32_e32 v2, v7
-; W64-O0-NEXT:    v_mov_b32_e32 v3, v6
-; W64-O0-NEXT:    v_mov_b32_e32 v4, v5
-; W64-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; W64-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
+; W64-O0-NEXT:    v_mov_b32_e32 v1, v7
+; W64-O0-NEXT:    v_mov_b32_e32 v2, v6
+; W64-O0-NEXT:    v_mov_b32_e32 v3, v4
+; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
+; W64-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; W64-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7
 ; W64-O0-NEXT:    s_mov_b32 s4, 0
-; W64-O0-NEXT:    v_writelane_b32 v0, s4, 0
+; W64-O0-NEXT:    v_writelane_b32 v5, s4, 0
 ; W64-O0-NEXT:    s_mov_b64 s[4:5], exec
-; W64-O0-NEXT:    v_writelane_b32 v0, s4, 1
-; W64-O0-NEXT:    v_writelane_b32 v0, s5, 2
-; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
-; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
-; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
+; W64-O0-NEXT:    v_writelane_b32 v5, s4, 1
+; W64-O0-NEXT:    v_writelane_b32 v5, s5, 2
 ; W64-O0-NEXT:  .LBB0_1: ; =>This Inner Loop Header: Depth=1
-; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
 ; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
 ; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; W64-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
-; W64-O0-NEXT:    v_readfirstlane_b32 s8, v1
-; W64-O0-NEXT:    v_readfirstlane_b32 s12, v2
+; W64-O0-NEXT:    v_readfirstlane_b32 s8, v0
+; W64-O0-NEXT:    v_readfirstlane_b32 s12, v1
 ; W64-O0-NEXT:    s_mov_b32 s4, s8
 ; W64-O0-NEXT:    s_mov_b32 s5, s12
-; W64-O0-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[4:5], v[1:2]
-; W64-O0-NEXT:    v_readfirstlane_b32 s7, v3
-; W64-O0-NEXT:    v_readfirstlane_b32 s6, v4
+; W64-O0-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[4:5], v[0:1]
+; W64-O0-NEXT:    v_readfirstlane_b32 s7, v2
+; W64-O0-NEXT:    v_readfirstlane_b32 s6, v3
 ; W64-O0-NEXT:    s_mov_b32 s10, s7
 ; W64-O0-NEXT:    s_mov_b32 s11, s6
-; W64-O0-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[10:11], v[3:4]
+; W64-O0-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[10:11], v[2:3]
 ; W64-O0-NEXT:    s_and_b64 s[4:5], s[4:5], s[10:11]
 ; W64-O0-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
 ; W64-O0-NEXT:    s_mov_b32 s9, s12
 ; W64-O0-NEXT:    s_mov_b32 s10, s7
 ; W64-O0-NEXT:    s_mov_b32 s11, s6
-; W64-O0-NEXT:    v_writelane_b32 v0, s8, 3
-; W64-O0-NEXT:    v_writelane_b32 v0, s9, 4
-; W64-O0-NEXT:    v_writelane_b32 v0, s10, 5
-; W64-O0-NEXT:    v_writelane_b32 v0, s11, 6
+; W64-O0-NEXT:    v_writelane_b32 v5, s8, 3
+; W64-O0-NEXT:    v_writelane_b32 v5, s9, 4
+; W64-O0-NEXT:    v_writelane_b32 v5, s10, 5
+; W64-O0-NEXT:    v_writelane_b32 v5, s11, 6
 ; W64-O0-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
-; W64-O0-NEXT:    v_writelane_b32 v0, s4, 7
-; W64-O0-NEXT:    v_writelane_b32 v0, s5, 8
-; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
-; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
-; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
+; W64-O0-NEXT:    v_writelane_b32 v5, s4, 7
+; W64-O0-NEXT:    v_writelane_b32 v5, s5, 8
 ; W64-O0-NEXT:  ; %bb.2: ; in Loop: Header=BB0_1 Depth=1
-; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
-; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
+; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; W64-O0-NEXT:    v_readlane_b32 s4, v5, 7
+; W64-O0-NEXT:    v_readlane_b32 s5, v5, 8
+; W64-O0-NEXT:    v_readlane_b32 s8, v5, 3
+; W64-O0-NEXT:    v_readlane_b32 s9, v5, 4
+; W64-O0-NEXT:    v_readlane_b32 s10, v5, 5
+; W64-O0-NEXT:    v_readlane_b32 s11, v5, 6
+; W64-O0-NEXT:    v_readlane_b32 s6, v5, 0
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
-; W64-O0-NEXT:    v_readlane_b32 s4, v1, 7
-; W64-O0-NEXT:    v_readlane_b32 s5, v1, 8
-; W64-O0-NEXT:    v_readlane_b32 s8, v1, 3
-; W64-O0-NEXT:    v_readlane_b32 s9, v1, 4
-; W64-O0-NEXT:    v_readlane_b32 s10, v1, 5
-; W64-O0-NEXT:    v_readlane_b32 s11, v1, 6
-; W64-O0-NEXT:    v_readlane_b32 s6, v1, 0
-; W64-O0-NEXT:    s_nop 4
+; W64-O0-NEXT:    s_nop 3
 ; W64-O0-NEXT:    buffer_load_format_x v0, v0, s[8:11], s6 idxen
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
-; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    s_xor_b64 exec, exec, s[4:5]
 ; W64-O0-NEXT:    s_cbranch_execnz .LBB0_1
 ; W64-O0-NEXT:  ; %bb.3:
-; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
-; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
-; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_waitcnt vmcnt(1)
-; W64-O0-NEXT:    v_readlane_b32 s4, v1, 1
-; W64-O0-NEXT:    v_readlane_b32 s5, v1, 2
+; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; W64-O0-NEXT:    v_readlane_b32 s4, v5, 1
+; W64-O0-NEXT:    v_readlane_b32 s5, v5, 2
 ; W64-O0-NEXT:    s_mov_b64 exec, s[4:5]
-; W64-O0-NEXT:    ; kill: killed $vgpr1
 ; W64-O0-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; W64-O0-NEXT:    s_waitcnt vmcnt(0)
-; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_nop 0
-; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; W64-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    s_mov_b64 exec, s[4:5]
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
 ; W64-O0-NEXT:    s_setpc_b64 s[30:31]
@@ -494,221 +468,185 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt
 ; W64-O0:       ; %bb.0: ; %entry
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; W64-O0-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; W64-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; W64-O0-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    s_mov_b64 exec, s[4:5]
-; W64-O0-NEXT:    ; implicit-def: $vgpr13
-; W64-O0-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; W64-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; W64-O0-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; W64-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; W64-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; W64-O0-NEXT:    v_mov_b32_e32 v13, v4
+; W64-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; W64-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; W64-O0-NEXT:    v_mov_b32_e32 v14, v4
 ; W64-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; W64-O0-NEXT:    v_mov_b32_e32 v7, v3
-; W64-O0-NEXT:    v_mov_b32_e32 v8, v2
-; W64-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; W64-O0-NEXT:    v_mov_b32_e32 v9, v1
+; W64-O0-NEXT:    v_mov_b32_e32 v6, v3
+; W64-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; W64-O0-NEXT:    v_mov_b32_e32 v7, v2
+; W64-O0-NEXT:    v_mov_b32_e32 v8, v1
 ; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; W64-O0-NEXT:    v_mov_b32_e32 v3, v0
-; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
-; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
+; W64-O0-NEXT:    v_mov_b32_e32 v2, v0
+; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    ; implicit-def: $sgpr4
 ; W64-O0-NEXT:    ; implicit-def: $sgpr4
 ; W64-O0-NEXT:    ; implicit-def: $sgpr4
 ; W64-O0-NEXT:    ; implicit-def: $sgpr4
-; W64-O0-NEXT:    ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14_vgpr15_vgpr16 killed $exec
-; W64-O0-NEXT:    v_mov_b32_e32 v14, v5
-; W64-O0-NEXT:    v_mov_b32_e32 v15, v6
+; W64-O0-NEXT:    ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15_vgpr16_vgpr17 killed $exec
+; W64-O0-NEXT:    v_mov_b32_e32 v15, v5
 ; W64-O0-NEXT:    s_waitcnt vmcnt(3)
 ; W64-O0-NEXT:    v_mov_b32_e32 v16, v4
-; W64-O0-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; W64-O0-NEXT:    s_waitcnt vmcnt(2)
+; W64-O0-NEXT:    v_mov_b32_e32 v17, v3
+; W64-O0-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
-; W64-O0-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; W64-O0-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; W64-O0-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; W64-O0-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; W64-O0-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; W64-O0-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    ; implicit-def: $sgpr4
 ; W64-O0-NEXT:    ; implicit-def: $sgpr4
 ; W64-O0-NEXT:    ; implicit-def: $sgpr4
 ; W64-O0-NEXT:    ; implicit-def: $sgpr4
-; W64-O0-NEXT:    ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4_vgpr5_vgpr6 killed $exec
-; W64-O0-NEXT:    v_mov_b32_e32 v4, v9
-; W64-O0-NEXT:    v_mov_b32_e32 v5, v8
-; W64-O0-NEXT:    v_mov_b32_e32 v6, v7
-; W64-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; W64-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec
+; W64-O0-NEXT:    v_mov_b32_e32 v3, v8
+; W64-O0-NEXT:    v_mov_b32_e32 v4, v7
+; W64-O0-NEXT:    v_mov_b32_e32 v5, v6
+; W64-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
+; W64-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; W64-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    ; implicit-def: $sgpr4
 ; W64-O0-NEXT:    ; implicit-def: $sgpr4
-; W64-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; W64-O0-NEXT:    v_mov_b32_e32 v3, v12
-; W64-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; W64-O0-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; W64-O0-NEXT:    v_mov_b32_e32 v2, v12
+; W64-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
-; W64-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; W64-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    ; implicit-def: $sgpr4
 ; W64-O0-NEXT:    ; implicit-def: $sgpr4
-; W64-O0-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; W64-O0-NEXT:    v_mov_b32_e32 v2, v10
-; W64-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; W64-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; W64-O0-NEXT:    v_mov_b32_e32 v1, v10
+; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
-; W64-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; W64-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    ; implicit-def: $sgpr4_sgpr5
 ; W64-O0-NEXT:    ; implicit-def: $sgpr4_sgpr5
 ; W64-O0-NEXT:    ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7
 ; W64-O0-NEXT:    ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7
 ; W64-O0-NEXT:    s_mov_b32 s4, 0
-; W64-O0-NEXT:    v_writelane_b32 v0, s4, 0
+; W64-O0-NEXT:    v_writelane_b32 v13, s4, 0
 ; W64-O0-NEXT:    s_mov_b64 s[4:5], exec
-; W64-O0-NEXT:    v_writelane_b32 v0, s4, 1
-; W64-O0-NEXT:    v_writelane_b32 v0, s5, 2
-; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
-; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
-; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
+; W64-O0-NEXT:    v_writelane_b32 v13, s4, 1
+; W64-O0-NEXT:    v_writelane_b32 v13, s5, 2
 ; W64-O0-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
-; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
-; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
+; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; W64-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
-; W64-O0-NEXT:    v_readfirstlane_b32 s8, v1
-; W64-O0-NEXT:    v_readfirstlane_b32 s12, v2
+; W64-O0-NEXT:    v_readfirstlane_b32 s8, v0
+; W64-O0-NEXT:    v_readfirstlane_b32 s12, v1
 ; W64-O0-NEXT:    s_mov_b32 s4, s8
 ; W64-O0-NEXT:    s_mov_b32 s5, s12
-; W64-O0-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[4:5], v[1:2]
-; W64-O0-NEXT:    v_readfirstlane_b32 s7, v3
-; W64-O0-NEXT:    v_readfirstlane_b32 s6, v4
+; W64-O0-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[4:5], v[0:1]
+; W64-O0-NEXT:    v_readfirstlane_b32 s7, v2
+; W64-O0-NEXT:    v_readfirstlane_b32 s6, v3
 ; W64-O0-NEXT:    s_mov_b32 s10, s7
 ; W64-O0-NEXT:    s_mov_b32 s11, s6
-; W64-O0-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[10:11], v[3:4]
+; W64-O0-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[10:11], v[2:3]
 ; W64-O0-NEXT:    s_and_b64 s[4:5], s[4:5], s[10:11]
 ; W64-O0-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
 ; W64-O0-NEXT:    s_mov_b32 s9, s12
 ; W64-O0-NEXT:    s_mov_b32 s10, s7
 ; W64-O0-NEXT:    s_mov_b32 s11, s6
-; W64-O0-NEXT:    v_writelane_b32 v0, s8, 3
-; W64-O0-NEXT:    v_writelane_b32 v0, s9, 4
-; W64-O0-NEXT:    v_writelane_b32 v0, s10, 5
-; W64-O0-NEXT:    v_writelane_b32 v0, s11, 6
+; W64-O0-NEXT:    v_writelane_b32 v13, s8, 3
+; W64-O0-NEXT:    v_writelane_b32 v13, s9, 4
+; W64-O0-NEXT:    v_writelane_b32 v13, s10, 5
+; W64-O0-NEXT:    v_writelane_b32 v13, s11, 6
 ; W64-O0-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
-; W64-O0-NEXT:    v_writelane_b32 v0, s4, 7
-; W64-O0-NEXT:    v_writelane_b32 v0, s5, 8
-; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
-; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
-; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
+; W64-O0-NEXT:    v_writelane_b32 v13, s4, 7
+; W64-O0-NEXT:    v_writelane_b32 v13, s5, 8
 ; W64-O0-NEXT:  ; %bb.2: ; in Loop: Header=BB1_1 Depth=1
 ; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
-; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
+; W64-O0-NEXT:    v_readlane_b32 s4, v13, 7
+; W64-O0-NEXT:    v_readlane_b32 s5, v13, 8
+; W64-O0-NEXT:    v_readlane_b32 s8, v13, 3
+; W64-O0-NEXT:    v_readlane_b32 s9, v13, 4
+; W64-O0-NEXT:    v_readlane_b32 s10, v13, 5
+; W64-O0-NEXT:    v_readlane_b32 s11, v13, 6
+; W64-O0-NEXT:    v_readlane_b32 s6, v13, 0
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
-; W64-O0-NEXT:    v_readlane_b32 s4, v1, 7
-; W64-O0-NEXT:    v_readlane_b32 s5, v1, 8
-; W64-O0-NEXT:    v_readlane_b32 s8, v1, 3
-; W64-O0-NEXT:    v_readlane_b32 s9, v1, 4
-; W64-O0-NEXT:    v_readlane_b32 s10, v1, 5
-; W64-O0-NEXT:    v_readlane_b32 s11, v1, 6
-; W64-O0-NEXT:    v_readlane_b32 s6, v1, 0
-; W64-O0-NEXT:    s_nop 4
+; W64-O0-NEXT:    s_nop 3
 ; W64-O0-NEXT:    buffer_load_format_x v0, v0, s[8:11], s6 idxen
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
 ; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    s_xor_b64 exec, exec, s[4:5]
 ; W64-O0-NEXT:    s_cbranch_execnz .LBB1_1
 ; W64-O0-NEXT:  ; %bb.3:
-; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
-; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
-; W64-O0-NEXT:    s_waitcnt vmcnt(0)
-; W64-O0-NEXT:    v_readlane_b32 s4, v0, 1
-; W64-O0-NEXT:    v_readlane_b32 s5, v0, 2
+; W64-O0-NEXT:    v_readlane_b32 s4, v13, 1
+; W64-O0-NEXT:    v_readlane_b32 s5, v13, 2
 ; W64-O0-NEXT:    s_mov_b64 exec, s[4:5]
 ; W64-O0-NEXT:    s_mov_b64 s[4:5], exec
-; W64-O0-NEXT:    v_writelane_b32 v0, s4, 9
-; W64-O0-NEXT:    v_writelane_b32 v0, s5, 10
-; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
-; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
-; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
+; W64-O0-NEXT:    v_writelane_b32 v13, s4, 9
+; W64-O0-NEXT:    v_writelane_b32 v13, s5, 10
 ; W64-O0-NEXT:  .LBB1_4: ; =>This Inner Loop Header: Depth=1
-; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
-; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
+; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; W64-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
-; W64-O0-NEXT:    v_readfirstlane_b32 s8, v1
-; W64-O0-NEXT:    v_readfirstlane_b32 s12, v2
+; W64-O0-NEXT:    v_readfirstlane_b32 s8, v0
+; W64-O0-NEXT:    v_readfirstlane_b32 s12, v1
 ; W64-O0-NEXT:    s_mov_b32 s4, s8
 ; W64-O0-NEXT:    s_mov_b32 s5, s12
-; W64-O0-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[4:5], v[1:2]
-; W64-O0-NEXT:    v_readfirstlane_b32 s7, v3
-; W64-O0-NEXT:    v_readfirstlane_b32 s6, v4
+; W64-O0-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[4:5], v[0:1]
+; W64-O0-NEXT:    v_readfirstlane_b32 s7, v2
+; W64-O0-NEXT:    v_readfirstlane_b32 s6, v3
 ; W64-O0-NEXT:    s_mov_b32 s10, s7
 ; W64-O0-NEXT:    s_mov_b32 s11, s6
-; W64-O0-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[10:11], v[3:4]
+; W64-O0-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[10:11], v[2:3]
 ; W64-O0-NEXT:    s_and_b64 s[4:5], s[4:5], s[10:11]
 ; W64-O0-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
 ; W64-O0-NEXT:    s_mov_b32 s9, s12
 ; W64-O0-NEXT:    s_mov_b32 s10, s7
 ; W64-O0-NEXT:    s_mov_b32 s11, s6
-; W64-O0-NEXT:    v_writelane_b32 v0, s8, 11
-; W64-O0-NEXT:    v_writelane_b32 v0, s9, 12
-; W64-O0-NEXT:    v_writelane_b32 v0, s10, 13
-; W64-O0-NEXT:    v_writelane_b32 v0, s11, 14
+; W64-O0-NEXT:    v_writelane_b32 v13, s8, 11
+; W64-O0-NEXT:    v_writelane_b32 v13, s9, 12
+; W64-O0-NEXT:    v_writelane_b32 v13, s10, 13
+; W64-O0-NEXT:    v_writelane_b32 v13, s11, 14
 ; W64-O0-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
-; W64-O0-NEXT:    v_writelane_b32 v0, s4, 15
-; W64-O0-NEXT:    v_writelane_b32 v0, s5, 16
-; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
-; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
-; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
+; W64-O0-NEXT:    v_writelane_b32 v13, s4, 15
+; W64-O0-NEXT:    v_writelane_b32 v13, s5, 16
 ; W64-O0-NEXT:  ; %bb.5: ; in Loop: Header=BB1_4 Depth=1
 ; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
-; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
+; W64-O0-NEXT:    v_readlane_b32 s4, v13, 15
+; W64-O0-NEXT:    v_readlane_b32 s5, v13, 16
+; W64-O0-NEXT:    v_readlane_b32 s8, v13, 11
+; W64-O0-NEXT:    v_readlane_b32 s9, v13, 12
+; W64-O0-NEXT:    v_readlane_b32 s10, v13, 13
+; W64-O0-NEXT:    v_readlane_b32 s11, v13, 14
+; W64-O0-NEXT:    v_readlane_b32 s6, v13, 0
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
-; W64-O0-NEXT:    v_readlane_b32 s4, v1, 15
-; W64-O0-NEXT:    v_readlane_b32 s5, v1, 16
-; W64-O0-NEXT:    v_readlane_b32 s8, v1, 11
-; W64-O0-NEXT:    v_readlane_b32 s9, v1, 12
-; W64-O0-NEXT:    v_readlane_b32 s10, v1, 13
-; W64-O0-NEXT:    v_readlane_b32 s11, v1, 14
-; W64-O0-NEXT:    v_readlane_b32 s6, v1, 0
-; W64-O0-NEXT:    s_nop 4
+; W64-O0-NEXT:    s_nop 3
 ; W64-O0-NEXT:    buffer_load_format_x v0, v0, s[8:11], s6 idxen
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
 ; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    s_xor_b64 exec, exec, s[4:5]
 ; W64-O0-NEXT:    s_cbranch_execnz .LBB1_4
 ; W64-O0-NEXT:  ; %bb.6:
-; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
-; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
-; W64-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; W64-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; W64-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; W64-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; W64-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; W64-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; W64-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_waitcnt vmcnt(6)
-; W64-O0-NEXT:    v_readlane_b32 s4, v0, 9
-; W64-O0-NEXT:    v_readlane_b32 s5, v0, 10
+; W64-O0-NEXT:    v_readlane_b32 s4, v13, 9
+; W64-O0-NEXT:    v_readlane_b32 s5, v13, 10
 ; W64-O0-NEXT:    s_mov_b64 exec, s[4:5]
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
-; W64-O0-NEXT:    global_store_dword v[4:5], v6, off
+; W64-O0-NEXT:    global_store_dword v[3:4], v5, off
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
-; W64-O0-NEXT:    global_store_dword v[1:2], v3, off
+; W64-O0-NEXT:    global_store_dword v[0:1], v2, off
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
-; W64-O0-NEXT:    ; kill: killed $vgpr0
 ; W64-O0-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; W64-O0-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    s_mov_b64 exec, s[4:5]
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
 ; W64-O0-NEXT:    s_setpc_b64 s[30:31]
@@ -1016,250 +954,204 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad
 ; W64-O0:       ; %bb.0: ; %entry
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; W64-O0-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; W64-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; W64-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    s_mov_b64 exec, s[4:5]
-; W64-O0-NEXT:    ; implicit-def: $vgpr8
-; W64-O0-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; W64-O0-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; W64-O0-NEXT:    v_mov_b32_e32 v8, v6
-; W64-O0-NEXT:    v_mov_b32_e32 v9, v5
-; W64-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; W64-O0-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; W64-O0-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; W64-O0-NEXT:    v_mov_b32_e32 v9, v7
+; W64-O0-NEXT:    v_mov_b32_e32 v10, v6
+; W64-O0-NEXT:    v_mov_b32_e32 v11, v5
+; W64-O0-NEXT:    v_mov_b32_e32 v5, v4
+; W64-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    s_nop 0
-; W64-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; W64-O0-NEXT:    v_mov_b32_e32 v10, v3
-; W64-O0-NEXT:    v_mov_b32_e32 v11, v2
-; W64-O0-NEXT:    v_mov_b32_e32 v13, v1
-; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; W64-O0-NEXT:    v_mov_b32_e32 v6, v0
-; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
-; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
+; W64-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; W64-O0-NEXT:    v_mov_b32_e32 v5, v3
+; W64-O0-NEXT:    v_mov_b32_e32 v6, v2
+; W64-O0-NEXT:    v_mov_b32_e32 v7, v1
+; W64-O0-NEXT:    v_mov_b32_e32 v13, v0
+; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    ; implicit-def: $sgpr4
 ; W64-O0-NEXT:    ; implicit-def: $sgpr4
 ; W64-O0-NEXT:    ; implicit-def: $sgpr4
 ; W64-O0-NEXT:    ; implicit-def: $sgpr4
-; W64-O0-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2_vgpr3_vgpr4 killed $exec
-; W64-O0-NEXT:    v_mov_b32_e32 v2, v9
-; W64-O0-NEXT:    v_mov_b32_e32 v3, v8
-; W64-O0-NEXT:    v_mov_b32_e32 v4, v7
+; W64-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
+; W64-O0-NEXT:    v_mov_b32_e32 v1, v11
+; W64-O0-NEXT:    v_mov_b32_e32 v2, v10
+; W64-O0-NEXT:    v_mov_b32_e32 v3, v9
 ; W64-O0-NEXT:    ; implicit-def: $sgpr4
 ; W64-O0-NEXT:    ; implicit-def: $sgpr4
 ; W64-O0-NEXT:    ; implicit-def: $sgpr4
 ; W64-O0-NEXT:    ; implicit-def: $sgpr4
-; W64-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7_vgpr8_vgpr9 killed $exec
-; W64-O0-NEXT:    v_mov_b32_e32 v7, v13
-; W64-O0-NEXT:    v_mov_b32_e32 v8, v11
-; W64-O0-NEXT:    v_mov_b32_e32 v9, v10
-; W64-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; W64-O0-NEXT:    ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14_vgpr15_vgpr16 killed $exec
+; W64-O0-NEXT:    v_mov_b32_e32 v14, v7
+; W64-O0-NEXT:    v_mov_b32_e32 v15, v6
+; W64-O0-NEXT:    v_mov_b32_e32 v16, v5
+; W64-O0-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
-; W64-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; W64-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; W64-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; W64-O0-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; W64-O0-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; W64-O0-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    ; implicit-def: $sgpr4
 ; W64-O0-NEXT:    ; implicit-def: $sgpr4
-; W64-O0-NEXT:    ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
-; W64-O0-NEXT:    v_mov_b32_e32 v6, v12
+; W64-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; W64-O0-NEXT:    v_mov_b32_e32 v5, v12
+; W64-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; W64-O0-NEXT:    s_waitcnt vmcnt(0)
 ; W64-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
-; W64-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; W64-O0-NEXT:    s_waitcnt vmcnt(0)
 ; W64-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; W64-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7
 ; W64-O0-NEXT:    ;;#ASMSTART
 ; W64-O0-NEXT:    s_mov_b32 s4, 17
 ; W64-O0-NEXT:    ;;#ASMEND
 ; W64-O0-NEXT:    s_mov_b32 s5, s4
-; W64-O0-NEXT:    v_writelane_b32 v0, s5, 0
+; W64-O0-NEXT:    v_writelane_b32 v8, s5, 0
 ; W64-O0-NEXT:    s_mov_b32 s5, 0
-; W64-O0-NEXT:    v_writelane_b32 v0, s5, 1
-; W64-O0-NEXT:    v_mov_b32_e32 v1, s4
-; W64-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; W64-O0-NEXT:    s_mov_b64 s[4:5], exec
-; W64-O0-NEXT:    v_writelane_b32 v0, s4, 2
-; W64-O0-NEXT:    v_writelane_b32 v0, s5, 3
-; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
+; W64-O0-NEXT:    v_writelane_b32 v8, s5, 1
+; W64-O0-NEXT:    v_mov_b32_e32 v0, s4
 ; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
-; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
+; W64-O0-NEXT:    s_mov_b64 s[4:5], exec
+; W64-O0-NEXT:    v_writelane_b32 v8, s4, 2
+; W64-O0-NEXT:    v_writelane_b32 v8, s5, 3
 ; W64-O0-NEXT:  .LBB2_1: ; =>This Inner Loop Header: Depth=1
-; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
-; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
+; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; W64-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
-; W64-O0-NEXT:    v_readfirstlane_b32 s8, v1
-; W64-O0-NEXT:    v_readfirstlane_b32 s12, v2
+; W64-O0-NEXT:    v_readfirstlane_b32 s8, v0
+; W64-O0-NEXT:    v_readfirstlane_b32 s12, v1
 ; W64-O0-NEXT:    s_mov_b32 s4, s8
 ; W64-O0-NEXT:    s_mov_b32 s5, s12
-; W64-O0-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[4:5], v[1:2]
-; W64-O0-NEXT:    v_readfirstlane_b32 s7, v3
-; W64-O0-NEXT:    v_readfirstlane_b32 s6, v4
+; W64-O0-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[4:5], v[0:1]
+; W64-O0-NEXT:    v_readfirstlane_b32 s7, v2
+; W64-O0-NEXT:    v_readfirstlane_b32 s6, v3
 ; W64-O0-NEXT:    s_mov_b32 s10, s7
 ; W64-O0-NEXT:    s_mov_b32 s11, s6
-; W64-O0-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[10:11], v[3:4]
+; W64-O0-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[10:11], v[2:3]
 ; W64-O0-NEXT:    s_and_b64 s[4:5], s[4:5], s[10:11]
 ; W64-O0-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
 ; W64-O0-NEXT:    s_mov_b32 s9, s12
 ; W64-O0-NEXT:    s_mov_b32 s10, s7
 ; W64-O0-NEXT:    s_mov_b32 s11, s6
-; W64-O0-NEXT:    v_writelane_b32 v0, s8, 4
-; W64-O0-NEXT:    v_writelane_b32 v0, s9, 5
-; W64-O0-NEXT:    v_writelane_b32 v0, s10, 6
-; W64-O0-NEXT:    v_writelane_b32 v0, s11, 7
+; W64-O0-NEXT:    v_writelane_b32 v8, s8, 4
+; W64-O0-NEXT:    v_writelane_b32 v8, s9, 5
+; W64-O0-NEXT:    v_writelane_b32 v8, s10, 6
+; W64-O0-NEXT:    v_writelane_b32 v8, s11, 7
 ; W64-O0-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
-; W64-O0-NEXT:    v_writelane_b32 v0, s4, 8
-; W64-O0-NEXT:    v_writelane_b32 v0, s5, 9
-; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
-; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
-; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
+; W64-O0-NEXT:    v_writelane_b32 v8, s4, 8
+; W64-O0-NEXT:    v_writelane_b32 v8, s5, 9
 ; W64-O0-NEXT:  ; %bb.2: ; in Loop: Header=BB2_1 Depth=1
-; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
-; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
+; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; W64-O0-NEXT:    v_readlane_b32 s4, v8, 8
+; W64-O0-NEXT:    v_readlane_b32 s5, v8, 9
+; W64-O0-NEXT:    v_readlane_b32 s8, v8, 4
+; W64-O0-NEXT:    v_readlane_b32 s9, v8, 5
+; W64-O0-NEXT:    v_readlane_b32 s10, v8, 6
+; W64-O0-NEXT:    v_readlane_b32 s11, v8, 7
+; W64-O0-NEXT:    v_readlane_b32 s6, v8, 1
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
-; W64-O0-NEXT:    v_readlane_b32 s4, v1, 8
-; W64-O0-NEXT:    v_readlane_b32 s5, v1, 9
-; W64-O0-NEXT:    v_readlane_b32 s8, v1, 4
-; W64-O0-NEXT:    v_readlane_b32 s9, v1, 5
-; W64-O0-NEXT:    v_readlane_b32 s10, v1, 6
-; W64-O0-NEXT:    v_readlane_b32 s11, v1, 7
-; W64-O0-NEXT:    v_readlane_b32 s6, v1, 1
-; W64-O0-NEXT:    s_nop 4
+; W64-O0-NEXT:    s_nop 3
 ; W64-O0-NEXT:    buffer_load_format_x v0, v0, s[8:11], s6 idxen
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
-; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    s_xor_b64 exec, exec, s[4:5]
 ; W64-O0-NEXT:    s_cbranch_execnz .LBB2_1
 ; W64-O0-NEXT:  ; %bb.3:
-; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
-; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
-; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; W64-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_waitcnt vmcnt(2)
-; W64-O0-NEXT:    v_readlane_b32 s6, v0, 2
-; W64-O0-NEXT:    v_readlane_b32 s7, v0, 3
+; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; W64-O0-NEXT:    v_readlane_b32 s6, v8, 2
+; W64-O0-NEXT:    v_readlane_b32 s7, v8, 3
 ; W64-O0-NEXT:    s_mov_b64 exec, s[6:7]
-; W64-O0-NEXT:    v_readlane_b32 s4, v0, 1
+; W64-O0-NEXT:    v_readlane_b32 s4, v8, 1
 ; W64-O0-NEXT:    s_mov_b32 s5, 0x3ff
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
-; W64-O0-NEXT:    v_and_b32_e64 v2, v2, s5
-; W64-O0-NEXT:    v_cmp_eq_u32_e64 s[6:7], v2, s4
-; W64-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; W64-O0-NEXT:    v_and_b32_e64 v1, v1, s5
+; W64-O0-NEXT:    v_cmp_eq_u32_e64 s[6:7], v1, s4
+; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    s_mov_b64 s[4:5], exec
-; W64-O0-NEXT:    v_writelane_b32 v0, s4, 10
-; W64-O0-NEXT:    v_writelane_b32 v0, s5, 11
-; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
-; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
-; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
+; W64-O0-NEXT:    v_writelane_b32 v8, s4, 10
+; W64-O0-NEXT:    v_writelane_b32 v8, s5, 11
 ; W64-O0-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
 ; W64-O0-NEXT:    s_mov_b64 exec, s[4:5]
 ; W64-O0-NEXT:    s_cbranch_execz .LBB2_8
 ; W64-O0-NEXT:  ; %bb.4: ; %bb1
-; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
-; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
-; W64-O0-NEXT:    s_waitcnt vmcnt(0)
-; W64-O0-NEXT:    v_readlane_b32 s4, v0, 0
+; W64-O0-NEXT:    v_readlane_b32 s4, v8, 0
 ; W64-O0-NEXT:    s_mov_b32 s5, 0
-; W64-O0-NEXT:    v_writelane_b32 v0, s5, 12
-; W64-O0-NEXT:    v_mov_b32_e32 v1, s4
-; W64-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; W64-O0-NEXT:    v_writelane_b32 v8, s5, 12
+; W64-O0-NEXT:    v_mov_b32_e32 v0, s4
+; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7
 ; W64-O0-NEXT:    s_mov_b64 s[4:5], exec
-; W64-O0-NEXT:    v_writelane_b32 v0, s4, 13
-; W64-O0-NEXT:    v_writelane_b32 v0, s5, 14
-; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
-; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
-; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
+; W64-O0-NEXT:    v_writelane_b32 v8, s4, 13
+; W64-O0-NEXT:    v_writelane_b32 v8, s5, 14
 ; W64-O0-NEXT:  .LBB2_5: ; =>This Inner Loop Header: Depth=1
-; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
-; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
+; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; W64-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
-; W64-O0-NEXT:    v_readfirstlane_b32 s8, v1
-; W64-O0-NEXT:    v_readfirstlane_b32 s12, v2
+; W64-O0-NEXT:    v_readfirstlane_b32 s8, v0
+; W64-O0-NEXT:    v_readfirstlane_b32 s12, v1
 ; W64-O0-NEXT:    s_mov_b32 s4, s8
 ; W64-O0-NEXT:    s_mov_b32 s5, s12
-; W64-O0-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[4:5], v[1:2]
-; W64-O0-NEXT:    v_readfirstlane_b32 s7, v3
-; W64-O0-NEXT:    v_readfirstlane_b32 s6, v4
+; W64-O0-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[4:5], v[0:1]
+; W64-O0-NEXT:    v_readfirstlane_b32 s7, v2
+; W64-O0-NEXT:    v_readfirstlane_b32 s6, v3
 ; W64-O0-NEXT:    s_mov_b32 s10, s7
 ; W64-O0-NEXT:    s_mov_b32 s11, s6
-; W64-O0-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[10:11], v[3:4]
+; W64-O0-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[10:11], v[2:3]
 ; W64-O0-NEXT:    s_and_b64 s[4:5], s[4:5], s[10:11]
 ; W64-O0-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
 ; W64-O0-NEXT:    s_mov_b32 s9, s12
 ; W64-O0-NEXT:    s_mov_b32 s10, s7
 ; W64-O0-NEXT:    s_mov_b32 s11, s6
-; W64-O0-NEXT:    v_writelane_b32 v0, s8, 15
-; W64-O0-NEXT:    v_writelane_b32 v0, s9, 16
-; W64-O0-NEXT:    v_writelane_b32 v0, s10, 17
-; W64-O0-NEXT:    v_writelane_b32 v0, s11, 18
+; W64-O0-NEXT:    v_writelane_b32 v8, s8, 15
+; W64-O0-NEXT:    v_writelane_b32 v8, s9, 16
+; W64-O0-NEXT:    v_writelane_b32 v8, s10, 17
+; W64-O0-NEXT:    v_writelane_b32 v8, s11, 18
 ; W64-O0-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
-; W64-O0-NEXT:    v_writelane_b32 v0, s4, 19
-; W64-O0-NEXT:    v_writelane_b32 v0, s5, 20
-; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
-; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
-; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
+; W64-O0-NEXT:    v_writelane_b32 v8, s4, 19
+; W64-O0-NEXT:    v_writelane_b32 v8, s5, 20
 ; W64-O0-NEXT:  ; %bb.6: ; in Loop: Header=BB2_5 Depth=1
-; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
-; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
+; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; W64-O0-NEXT:    v_readlane_b32 s4, v8, 19
+; W64-O0-NEXT:    v_readlane_b32 s5, v8, 20
+; W64-O0-NEXT:    v_readlane_b32 s8, v8, 15
+; W64-O0-NEXT:    v_readlane_b32 s9, v8, 16
+; W64-O0-NEXT:    v_readlane_b32 s10, v8, 17
+; W64-O0-NEXT:    v_readlane_b32 s11, v8, 18
+; W64-O0-NEXT:    v_readlane_b32 s6, v8, 12
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
-; W64-O0-NEXT:    v_readlane_b32 s4, v1, 19
-; W64-O0-NEXT:    v_readlane_b32 s5, v1, 20
-; W64-O0-NEXT:    v_readlane_b32 s8, v1, 15
-; W64-O0-NEXT:    v_readlane_b32 s9, v1, 16
-; W64-O0-NEXT:    v_readlane_b32 s10, v1, 17
-; W64-O0-NEXT:    v_readlane_b32 s11, v1, 18
-; W64-O0-NEXT:    v_readlane_b32 s6, v1, 12
-; W64-O0-NEXT:    s_nop 4
+; W64-O0-NEXT:    s_nop 3
 ; W64-O0-NEXT:    buffer_load_format_x v0, v0, s[8:11], s6 idxen
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
-; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    s_xor_b64 exec, exec, s[4:5]
 ; W64-O0-NEXT:    s_cbranch_execnz .LBB2_5
 ; W64-O0-NEXT:  ; %bb.7:
-; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
-; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
-; W64-O0-NEXT:    s_waitcnt vmcnt(0)
-; W64-O0-NEXT:    v_readlane_b32 s4, v1, 13
-; W64-O0-NEXT:    v_readlane_b32 s5, v1, 14
+; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; W64-O0-NEXT:    v_readlane_b32 s4, v8, 13
+; W64-O0-NEXT:    v_readlane_b32 s5, v8, 14
 ; W64-O0-NEXT:    s_mov_b64 exec, s[4:5]
-; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; W64-O0-NEXT:    s_waitcnt vmcnt(0)
+; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
 ; W64-O0-NEXT:  .LBB2_8: ; %bb2
-; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
-; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
+; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; W64-O0-NEXT:    s_nop 0
 ; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; W64-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; W64-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_waitcnt vmcnt(3)
-; W64-O0-NEXT:    v_readlane_b32 s4, v0, 10
-; W64-O0-NEXT:    v_readlane_b32 s5, v0, 11
+; W64-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; W64-O0-NEXT:    v_readlane_b32 s4, v8, 10
+; W64-O0-NEXT:    v_readlane_b32 s5, v8, 11
 ; W64-O0-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
-; W64-O0-NEXT:    global_store_dword v[1:2], v3, off
+; W64-O0-NEXT:    global_store_dword v[0:1], v2, off
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
-; W64-O0-NEXT:    ; kill: killed $vgpr0
 ; W64-O0-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; W64-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    s_mov_b64 exec, s[4:5]
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
 ; W64-O0-NEXT:    s_setpc_b64 s[30:31]

diff  --git a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll
index 7df4f98ddebfa5..9f8f3055cb4e76 100644
--- a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll
+++ b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll
@@ -140,123 +140,94 @@ define float @mubuf_vgpr(ptr addrspace(8) %i, i32 %c) #0 {
 ; W64-O0:       ; %bb.0:
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; W64-O0-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; W64-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; W64-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    s_mov_b64 exec, s[4:5]
-; W64-O0-NEXT:    ; implicit-def: $vgpr5
-; W64-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; W64-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; W64-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    v_mov_b32_e32 v6, v2
-; W64-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; W64-O0-NEXT:    v_mov_b32_e32 v3, v1
-; W64-O0-NEXT:    v_mov_b32_e32 v1, v0
-; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
-; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
+; W64-O0-NEXT:    v_mov_b32_e32 v2, v1
 ; W64-O0-NEXT:    ; implicit-def: $sgpr4
 ; W64-O0-NEXT:    ; implicit-def: $sgpr4
 ; W64-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
-; W64-O0-NEXT:    s_waitcnt vmcnt(1)
-; W64-O0-NEXT:    v_mov_b32_e32 v7, v2
-; W64-O0-NEXT:    v_mov_b32_e32 v5, v7
+; W64-O0-NEXT:    v_mov_b32_e32 v7, v3
+; W64-O0-NEXT:    v_mov_b32_e32 v4, v7
 ; W64-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec
 ; W64-O0-NEXT:    ; implicit-def: $sgpr4
 ; W64-O0-NEXT:    ; implicit-def: $sgpr4
-; W64-O0-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; W64-O0-NEXT:    v_mov_b32_e32 v2, v3
-; W64-O0-NEXT:    v_mov_b32_e32 v7, v2
-; W64-O0-NEXT:    ; kill: def $vgpr1 killed $vgpr1 killed $vgpr1_vgpr2 killed $exec
+; W64-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; W64-O0-NEXT:    v_mov_b32_e32 v1, v2
+; W64-O0-NEXT:    v_mov_b32_e32 v7, v1
+; W64-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; W64-O0-NEXT:    ; implicit-def: $sgpr4
 ; W64-O0-NEXT:    ; implicit-def: $sgpr4
 ; W64-O0-NEXT:    ; implicit-def: $sgpr4
 ; W64-O0-NEXT:    ; implicit-def: $sgpr4
-; W64-O0-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2_vgpr3_vgpr4 killed $exec
-; W64-O0-NEXT:    v_mov_b32_e32 v2, v7
-; W64-O0-NEXT:    v_mov_b32_e32 v3, v6
-; W64-O0-NEXT:    v_mov_b32_e32 v4, v5
-; W64-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; W64-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
+; W64-O0-NEXT:    v_mov_b32_e32 v1, v7
+; W64-O0-NEXT:    v_mov_b32_e32 v2, v6
+; W64-O0-NEXT:    v_mov_b32_e32 v3, v4
+; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
+; W64-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; W64-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    ; implicit-def: $sgpr4_sgpr5
 ; W64-O0-NEXT:    ; implicit-def: $sgpr4_sgpr5
 ; W64-O0-NEXT:    s_mov_b32 s4, 0
-; W64-O0-NEXT:    v_writelane_b32 v0, s4, 0
+; W64-O0-NEXT:    v_writelane_b32 v5, s4, 0
 ; W64-O0-NEXT:    s_mov_b64 s[4:5], exec
-; W64-O0-NEXT:    v_writelane_b32 v0, s4, 1
-; W64-O0-NEXT:    v_writelane_b32 v0, s5, 2
-; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
-; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
-; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
+; W64-O0-NEXT:    v_writelane_b32 v5, s4, 1
+; W64-O0-NEXT:    v_writelane_b32 v5, s5, 2
 ; W64-O0-NEXT:  .LBB0_1: ; =>This Inner Loop Header: Depth=1
-; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
 ; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
 ; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; W64-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
-; W64-O0-NEXT:    v_readfirstlane_b32 s8, v1
-; W64-O0-NEXT:    v_readfirstlane_b32 s12, v2
+; W64-O0-NEXT:    v_readfirstlane_b32 s8, v0
+; W64-O0-NEXT:    v_readfirstlane_b32 s12, v1
 ; W64-O0-NEXT:    s_mov_b32 s4, s8
 ; W64-O0-NEXT:    s_mov_b32 s5, s12
-; W64-O0-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[4:5], v[1:2]
-; W64-O0-NEXT:    v_readfirstlane_b32 s7, v3
-; W64-O0-NEXT:    v_readfirstlane_b32 s6, v4
+; W64-O0-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[4:5], v[0:1]
+; W64-O0-NEXT:    v_readfirstlane_b32 s7, v2
+; W64-O0-NEXT:    v_readfirstlane_b32 s6, v3
 ; W64-O0-NEXT:    s_mov_b32 s10, s7
 ; W64-O0-NEXT:    s_mov_b32 s11, s6
-; W64-O0-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[10:11], v[3:4]
+; W64-O0-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[10:11], v[2:3]
 ; W64-O0-NEXT:    s_and_b64 s[4:5], s[4:5], s[10:11]
 ; W64-O0-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
 ; W64-O0-NEXT:    s_mov_b32 s9, s12
 ; W64-O0-NEXT:    s_mov_b32 s10, s7
 ; W64-O0-NEXT:    s_mov_b32 s11, s6
-; W64-O0-NEXT:    v_writelane_b32 v0, s8, 3
-; W64-O0-NEXT:    v_writelane_b32 v0, s9, 4
-; W64-O0-NEXT:    v_writelane_b32 v0, s10, 5
-; W64-O0-NEXT:    v_writelane_b32 v0, s11, 6
+; W64-O0-NEXT:    v_writelane_b32 v5, s8, 3
+; W64-O0-NEXT:    v_writelane_b32 v5, s9, 4
+; W64-O0-NEXT:    v_writelane_b32 v5, s10, 5
+; W64-O0-NEXT:    v_writelane_b32 v5, s11, 6
 ; W64-O0-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
-; W64-O0-NEXT:    v_writelane_b32 v0, s4, 7
-; W64-O0-NEXT:    v_writelane_b32 v0, s5, 8
-; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
-; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
-; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
+; W64-O0-NEXT:    v_writelane_b32 v5, s4, 7
+; W64-O0-NEXT:    v_writelane_b32 v5, s5, 8
 ; W64-O0-NEXT:  ; %bb.2: ; in Loop: Header=BB0_1 Depth=1
-; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
-; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
+; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; W64-O0-NEXT:    v_readlane_b32 s4, v5, 7
+; W64-O0-NEXT:    v_readlane_b32 s5, v5, 8
+; W64-O0-NEXT:    v_readlane_b32 s8, v5, 3
+; W64-O0-NEXT:    v_readlane_b32 s9, v5, 4
+; W64-O0-NEXT:    v_readlane_b32 s10, v5, 5
+; W64-O0-NEXT:    v_readlane_b32 s11, v5, 6
+; W64-O0-NEXT:    v_readlane_b32 s6, v5, 0
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
-; W64-O0-NEXT:    v_readlane_b32 s4, v1, 7
-; W64-O0-NEXT:    v_readlane_b32 s5, v1, 8
-; W64-O0-NEXT:    v_readlane_b32 s8, v1, 3
-; W64-O0-NEXT:    v_readlane_b32 s9, v1, 4
-; W64-O0-NEXT:    v_readlane_b32 s10, v1, 5
-; W64-O0-NEXT:    v_readlane_b32 s11, v1, 6
-; W64-O0-NEXT:    v_readlane_b32 s6, v1, 0
-; W64-O0-NEXT:    s_nop 4
+; W64-O0-NEXT:    s_nop 3
 ; W64-O0-NEXT:    buffer_load_format_x v0, v0, s[8:11], s6 idxen
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
-; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    s_xor_b64 exec, exec, s[4:5]
 ; W64-O0-NEXT:    s_cbranch_execnz .LBB0_1
 ; W64-O0-NEXT:  ; %bb.3:
-; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
-; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
-; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_waitcnt vmcnt(1)
-; W64-O0-NEXT:    v_readlane_b32 s4, v1, 1
-; W64-O0-NEXT:    v_readlane_b32 s5, v1, 2
+; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; W64-O0-NEXT:    v_readlane_b32 s4, v5, 1
+; W64-O0-NEXT:    v_readlane_b32 s5, v5, 2
 ; W64-O0-NEXT:    s_mov_b64 exec, s[4:5]
-; W64-O0-NEXT:    ; kill: killed $vgpr1
 ; W64-O0-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; W64-O0-NEXT:    s_waitcnt vmcnt(0)
-; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_nop 0
-; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; W64-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    s_mov_b64 exec, s[4:5]
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
 ; W64-O0-NEXT:    s_setpc_b64 s[30:31]
@@ -508,89 +479,86 @@ define void @mubuf_vgpr_adjacent_in_block(ptr addrspace(8) %i, ptr addrspace(8)
 ; W64-O0:       ; %bb.0: ; %entry
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; W64-O0-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; W64-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; W64-O0-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    s_mov_b64 exec, s[4:5]
-; W64-O0-NEXT:    ; implicit-def: $vgpr13
-; W64-O0-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; W64-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; W64-O0-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; W64-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; W64-O0-NEXT:    v_mov_b32_e32 v14, v6
-; W64-O0-NEXT:    v_mov_b32_e32 v9, v5
-; W64-O0-NEXT:    v_mov_b32_e32 v13, v4
-; W64-O0-NEXT:    v_mov_b32_e32 v4, v3
-; W64-O0-NEXT:    v_mov_b32_e32 v8, v2
-; W64-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; W64-O0-NEXT:    v_mov_b32_e32 v5, v1
+; W64-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; W64-O0-NEXT:    v_mov_b32_e32 v15, v6
+; W64-O0-NEXT:    v_mov_b32_e32 v8, v5
+; W64-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; W64-O0-NEXT:    v_mov_b32_e32 v14, v4
+; W64-O0-NEXT:    v_mov_b32_e32 v7, v2
+; W64-O0-NEXT:    v_mov_b32_e32 v4, v1
 ; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; W64-O0-NEXT:    v_mov_b32_e32 v3, v0
-; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
-; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
+; W64-O0-NEXT:    v_mov_b32_e32 v2, v0
+; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    ; implicit-def: $sgpr4
 ; W64-O0-NEXT:    ; implicit-def: $sgpr4
-; W64-O0-NEXT:    ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec
-; W64-O0-NEXT:    v_mov_b32_e32 v15, v7
+; W64-O0-NEXT:    ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec
+; W64-O0-NEXT:    s_waitcnt vmcnt(2)
+; W64-O0-NEXT:    v_mov_b32_e32 v16, v5
+; W64-O0-NEXT:    v_mov_b32_e32 v5, v16
 ; W64-O0-NEXT:    v_mov_b32_e32 v6, v15
-; W64-O0-NEXT:    v_mov_b32_e32 v7, v14
 ; W64-O0-NEXT:    ; implicit-def: $sgpr4
 ; W64-O0-NEXT:    ; implicit-def: $sgpr4
-; W64-O0-NEXT:    ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec
-; W64-O0-NEXT:    v_mov_b32_e32 v14, v9
-; W64-O0-NEXT:    v_mov_b32_e32 v9, v14
-; W64-O0-NEXT:    ; kill: def $vgpr13 killed $vgpr13 killed $vgpr13_vgpr14 killed $exec
+; W64-O0-NEXT:    ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec
+; W64-O0-NEXT:    v_mov_b32_e32 v15, v8
+; W64-O0-NEXT:    v_mov_b32_e32 v8, v15
+; W64-O0-NEXT:    ; kill: def $vgpr14 killed $vgpr14 killed $vgpr14_vgpr15 killed $exec
 ; W64-O0-NEXT:    ; implicit-def: $sgpr4
 ; W64-O0-NEXT:    ; implicit-def: $sgpr4
 ; W64-O0-NEXT:    ; implicit-def: $sgpr4
 ; W64-O0-NEXT:    ; implicit-def: $sgpr4
-; W64-O0-NEXT:    ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14_vgpr15_vgpr16 killed $exec
-; W64-O0-NEXT:    v_mov_b32_e32 v14, v9
-; W64-O0-NEXT:    v_mov_b32_e32 v15, v7
+; W64-O0-NEXT:    ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15_vgpr16_vgpr17 killed $exec
+; W64-O0-NEXT:    v_mov_b32_e32 v15, v8
 ; W64-O0-NEXT:    v_mov_b32_e32 v16, v6
-; W64-O0-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; W64-O0-NEXT:    v_mov_b32_e32 v17, v5
+; W64-O0-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
-; W64-O0-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; W64-O0-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; W64-O0-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; W64-O0-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; W64-O0-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; W64-O0-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    ; implicit-def: $sgpr4
 ; W64-O0-NEXT:    ; implicit-def: $sgpr4
-; W64-O0-NEXT:    ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
-; W64-O0-NEXT:    v_mov_b32_e32 v9, v4
-; W64-O0-NEXT:    v_mov_b32_e32 v7, v9
-; W64-O0-NEXT:    ; kill: def $vgpr8 killed $vgpr8 killed $vgpr8_vgpr9 killed $exec
+; W64-O0-NEXT:    ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
+; W64-O0-NEXT:    v_mov_b32_e32 v8, v3
+; W64-O0-NEXT:    v_mov_b32_e32 v6, v8
+; W64-O0-NEXT:    ; kill: def $vgpr7 killed $vgpr7 killed $vgpr7_vgpr8 killed $exec
 ; W64-O0-NEXT:    ; implicit-def: $sgpr4
 ; W64-O0-NEXT:    ; implicit-def: $sgpr4
-; W64-O0-NEXT:    ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec
-; W64-O0-NEXT:    v_mov_b32_e32 v4, v5
-; W64-O0-NEXT:    v_mov_b32_e32 v9, v4
-; W64-O0-NEXT:    ; kill: def $vgpr3 killed $vgpr3 killed $vgpr3_vgpr4 killed $exec
+; W64-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; W64-O0-NEXT:    v_mov_b32_e32 v3, v4
+; W64-O0-NEXT:    v_mov_b32_e32 v8, v3
+; W64-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 killed $vgpr2_vgpr3 killed $exec
 ; W64-O0-NEXT:    ; implicit-def: $sgpr4
 ; W64-O0-NEXT:    ; implicit-def: $sgpr4
 ; W64-O0-NEXT:    ; implicit-def: $sgpr4
 ; W64-O0-NEXT:    ; implicit-def: $sgpr4
-; W64-O0-NEXT:    ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4_vgpr5_vgpr6 killed $exec
-; W64-O0-NEXT:    v_mov_b32_e32 v4, v9
-; W64-O0-NEXT:    v_mov_b32_e32 v5, v8
-; W64-O0-NEXT:    v_mov_b32_e32 v6, v7
-; W64-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; W64-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec
+; W64-O0-NEXT:    v_mov_b32_e32 v3, v8
+; W64-O0-NEXT:    v_mov_b32_e32 v4, v7
+; W64-O0-NEXT:    v_mov_b32_e32 v5, v6
+; W64-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
+; W64-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; W64-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    ; implicit-def: $sgpr4
 ; W64-O0-NEXT:    ; implicit-def: $sgpr4
-; W64-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; W64-O0-NEXT:    v_mov_b32_e32 v3, v12
-; W64-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; W64-O0-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; W64-O0-NEXT:    v_mov_b32_e32 v2, v12
+; W64-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
-; W64-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; W64-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    ; implicit-def: $sgpr4
 ; W64-O0-NEXT:    ; implicit-def: $sgpr4
-; W64-O0-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; W64-O0-NEXT:    v_mov_b32_e32 v2, v10
-; W64-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; W64-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; W64-O0-NEXT:    v_mov_b32_e32 v1, v10
+; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
-; W64-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; W64-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    ; implicit-def: $sgpr4_sgpr5
 ; W64-O0-NEXT:    ; implicit-def: $sgpr4_sgpr5
 ; W64-O0-NEXT:    ; implicit-def: $sgpr4_sgpr5
@@ -598,156 +566,122 @@ define void @mubuf_vgpr_adjacent_in_block(ptr addrspace(8) %i, ptr addrspace(8)
 ; W64-O0-NEXT:    ; implicit-def: $sgpr4_sgpr5
 ; W64-O0-NEXT:    ; implicit-def: $sgpr4_sgpr5
 ; W64-O0-NEXT:    s_mov_b32 s4, 0
-; W64-O0-NEXT:    v_writelane_b32 v0, s4, 0
+; W64-O0-NEXT:    v_writelane_b32 v13, s4, 0
 ; W64-O0-NEXT:    s_mov_b64 s[4:5], exec
-; W64-O0-NEXT:    v_writelane_b32 v0, s4, 1
-; W64-O0-NEXT:    v_writelane_b32 v0, s5, 2
-; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
-; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
-; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
+; W64-O0-NEXT:    v_writelane_b32 v13, s4, 1
+; W64-O0-NEXT:    v_writelane_b32 v13, s5, 2
 ; W64-O0-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
-; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
-; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
+; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; W64-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
-; W64-O0-NEXT:    v_readfirstlane_b32 s8, v1
-; W64-O0-NEXT:    v_readfirstlane_b32 s12, v2
+; W64-O0-NEXT:    v_readfirstlane_b32 s8, v0
+; W64-O0-NEXT:    v_readfirstlane_b32 s12, v1
 ; W64-O0-NEXT:    s_mov_b32 s4, s8
 ; W64-O0-NEXT:    s_mov_b32 s5, s12
-; W64-O0-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[4:5], v[1:2]
-; W64-O0-NEXT:    v_readfirstlane_b32 s7, v3
-; W64-O0-NEXT:    v_readfirstlane_b32 s6, v4
+; W64-O0-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[4:5], v[0:1]
+; W64-O0-NEXT:    v_readfirstlane_b32 s7, v2
+; W64-O0-NEXT:    v_readfirstlane_b32 s6, v3
 ; W64-O0-NEXT:    s_mov_b32 s10, s7
 ; W64-O0-NEXT:    s_mov_b32 s11, s6
-; W64-O0-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[10:11], v[3:4]
+; W64-O0-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[10:11], v[2:3]
 ; W64-O0-NEXT:    s_and_b64 s[4:5], s[4:5], s[10:11]
 ; W64-O0-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
 ; W64-O0-NEXT:    s_mov_b32 s9, s12
 ; W64-O0-NEXT:    s_mov_b32 s10, s7
 ; W64-O0-NEXT:    s_mov_b32 s11, s6
-; W64-O0-NEXT:    v_writelane_b32 v0, s8, 3
-; W64-O0-NEXT:    v_writelane_b32 v0, s9, 4
-; W64-O0-NEXT:    v_writelane_b32 v0, s10, 5
-; W64-O0-NEXT:    v_writelane_b32 v0, s11, 6
+; W64-O0-NEXT:    v_writelane_b32 v13, s8, 3
+; W64-O0-NEXT:    v_writelane_b32 v13, s9, 4
+; W64-O0-NEXT:    v_writelane_b32 v13, s10, 5
+; W64-O0-NEXT:    v_writelane_b32 v13, s11, 6
 ; W64-O0-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
-; W64-O0-NEXT:    v_writelane_b32 v0, s4, 7
-; W64-O0-NEXT:    v_writelane_b32 v0, s5, 8
-; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
-; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
-; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
+; W64-O0-NEXT:    v_writelane_b32 v13, s4, 7
+; W64-O0-NEXT:    v_writelane_b32 v13, s5, 8
 ; W64-O0-NEXT:  ; %bb.2: ; in Loop: Header=BB1_1 Depth=1
 ; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
-; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
+; W64-O0-NEXT:    v_readlane_b32 s4, v13, 7
+; W64-O0-NEXT:    v_readlane_b32 s5, v13, 8
+; W64-O0-NEXT:    v_readlane_b32 s8, v13, 3
+; W64-O0-NEXT:    v_readlane_b32 s9, v13, 4
+; W64-O0-NEXT:    v_readlane_b32 s10, v13, 5
+; W64-O0-NEXT:    v_readlane_b32 s11, v13, 6
+; W64-O0-NEXT:    v_readlane_b32 s6, v13, 0
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
-; W64-O0-NEXT:    v_readlane_b32 s4, v1, 7
-; W64-O0-NEXT:    v_readlane_b32 s5, v1, 8
-; W64-O0-NEXT:    v_readlane_b32 s8, v1, 3
-; W64-O0-NEXT:    v_readlane_b32 s9, v1, 4
-; W64-O0-NEXT:    v_readlane_b32 s10, v1, 5
-; W64-O0-NEXT:    v_readlane_b32 s11, v1, 6
-; W64-O0-NEXT:    v_readlane_b32 s6, v1, 0
-; W64-O0-NEXT:    s_nop 4
+; W64-O0-NEXT:    s_nop 3
 ; W64-O0-NEXT:    buffer_load_format_x v0, v0, s[8:11], s6 idxen
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
 ; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    s_xor_b64 exec, exec, s[4:5]
 ; W64-O0-NEXT:    s_cbranch_execnz .LBB1_1
 ; W64-O0-NEXT:  ; %bb.3:
-; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
-; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
-; W64-O0-NEXT:    s_waitcnt vmcnt(0)
-; W64-O0-NEXT:    v_readlane_b32 s4, v0, 1
-; W64-O0-NEXT:    v_readlane_b32 s5, v0, 2
+; W64-O0-NEXT:    v_readlane_b32 s4, v13, 1
+; W64-O0-NEXT:    v_readlane_b32 s5, v13, 2
 ; W64-O0-NEXT:    s_mov_b64 exec, s[4:5]
 ; W64-O0-NEXT:    s_mov_b64 s[4:5], exec
-; W64-O0-NEXT:    v_writelane_b32 v0, s4, 9
-; W64-O0-NEXT:    v_writelane_b32 v0, s5, 10
-; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
-; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
-; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
+; W64-O0-NEXT:    v_writelane_b32 v13, s4, 9
+; W64-O0-NEXT:    v_writelane_b32 v13, s5, 10
 ; W64-O0-NEXT:  .LBB1_4: ; =>This Inner Loop Header: Depth=1
-; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
-; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
+; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; W64-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
-; W64-O0-NEXT:    v_readfirstlane_b32 s8, v1
-; W64-O0-NEXT:    v_readfirstlane_b32 s12, v2
+; W64-O0-NEXT:    v_readfirstlane_b32 s8, v0
+; W64-O0-NEXT:    v_readfirstlane_b32 s12, v1
 ; W64-O0-NEXT:    s_mov_b32 s4, s8
 ; W64-O0-NEXT:    s_mov_b32 s5, s12
-; W64-O0-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[4:5], v[1:2]
-; W64-O0-NEXT:    v_readfirstlane_b32 s7, v3
-; W64-O0-NEXT:    v_readfirstlane_b32 s6, v4
+; W64-O0-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[4:5], v[0:1]
+; W64-O0-NEXT:    v_readfirstlane_b32 s7, v2
+; W64-O0-NEXT:    v_readfirstlane_b32 s6, v3
 ; W64-O0-NEXT:    s_mov_b32 s10, s7
 ; W64-O0-NEXT:    s_mov_b32 s11, s6
-; W64-O0-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[10:11], v[3:4]
+; W64-O0-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[10:11], v[2:3]
 ; W64-O0-NEXT:    s_and_b64 s[4:5], s[4:5], s[10:11]
 ; W64-O0-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
 ; W64-O0-NEXT:    s_mov_b32 s9, s12
 ; W64-O0-NEXT:    s_mov_b32 s10, s7
 ; W64-O0-NEXT:    s_mov_b32 s11, s6
-; W64-O0-NEXT:    v_writelane_b32 v0, s8, 11
-; W64-O0-NEXT:    v_writelane_b32 v0, s9, 12
-; W64-O0-NEXT:    v_writelane_b32 v0, s10, 13
-; W64-O0-NEXT:    v_writelane_b32 v0, s11, 14
+; W64-O0-NEXT:    v_writelane_b32 v13, s8, 11
+; W64-O0-NEXT:    v_writelane_b32 v13, s9, 12
+; W64-O0-NEXT:    v_writelane_b32 v13, s10, 13
+; W64-O0-NEXT:    v_writelane_b32 v13, s11, 14
 ; W64-O0-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
-; W64-O0-NEXT:    v_writelane_b32 v0, s4, 15
-; W64-O0-NEXT:    v_writelane_b32 v0, s5, 16
-; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
-; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
-; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
+; W64-O0-NEXT:    v_writelane_b32 v13, s4, 15
+; W64-O0-NEXT:    v_writelane_b32 v13, s5, 16
 ; W64-O0-NEXT:  ; %bb.5: ; in Loop: Header=BB1_4 Depth=1
 ; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
-; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
+; W64-O0-NEXT:    v_readlane_b32 s4, v13, 15
+; W64-O0-NEXT:    v_readlane_b32 s5, v13, 16
+; W64-O0-NEXT:    v_readlane_b32 s8, v13, 11
+; W64-O0-NEXT:    v_readlane_b32 s9, v13, 12
+; W64-O0-NEXT:    v_readlane_b32 s10, v13, 13
+; W64-O0-NEXT:    v_readlane_b32 s11, v13, 14
+; W64-O0-NEXT:    v_readlane_b32 s6, v13, 0
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
-; W64-O0-NEXT:    v_readlane_b32 s4, v1, 15
-; W64-O0-NEXT:    v_readlane_b32 s5, v1, 16
-; W64-O0-NEXT:    v_readlane_b32 s8, v1, 11
-; W64-O0-NEXT:    v_readlane_b32 s9, v1, 12
-; W64-O0-NEXT:    v_readlane_b32 s10, v1, 13
-; W64-O0-NEXT:    v_readlane_b32 s11, v1, 14
-; W64-O0-NEXT:    v_readlane_b32 s6, v1, 0
-; W64-O0-NEXT:    s_nop 4
+; W64-O0-NEXT:    s_nop 3
 ; W64-O0-NEXT:    buffer_load_format_x v0, v0, s[8:11], s6 idxen
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
 ; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    s_xor_b64 exec, exec, s[4:5]
 ; W64-O0-NEXT:    s_cbranch_execnz .LBB1_4
 ; W64-O0-NEXT:  ; %bb.6:
-; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
-; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
-; W64-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; W64-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; W64-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; W64-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; W64-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; W64-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; W64-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_waitcnt vmcnt(6)
-; W64-O0-NEXT:    v_readlane_b32 s4, v0, 9
-; W64-O0-NEXT:    v_readlane_b32 s5, v0, 10
+; W64-O0-NEXT:    v_readlane_b32 s4, v13, 9
+; W64-O0-NEXT:    v_readlane_b32 s5, v13, 10
 ; W64-O0-NEXT:    s_mov_b64 exec, s[4:5]
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
-; W64-O0-NEXT:    global_store_dword v[4:5], v6, off
+; W64-O0-NEXT:    global_store_dword v[3:4], v5, off
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
-; W64-O0-NEXT:    global_store_dword v[1:2], v3, off
+; W64-O0-NEXT:    global_store_dword v[0:1], v2, off
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
-; W64-O0-NEXT:    ; kill: killed $vgpr0
 ; W64-O0-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; W64-O0-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    s_mov_b64 exec, s[4:5]
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
 ; W64-O0-NEXT:    s_setpc_b64 s[30:31]
@@ -1055,288 +989,239 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j,
 ; W64-O0:       ; %bb.0: ; %entry
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; W64-O0-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; W64-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; W64-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    s_mov_b64 exec, s[4:5]
-; W64-O0-NEXT:    ; implicit-def: $vgpr8
-; W64-O0-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; W64-O0-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; W64-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; W64-O0-NEXT:    v_mov_b32_e32 v6, v5
-; W64-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; W64-O0-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; W64-O0-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; W64-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; W64-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; W64-O0-NEXT:    v_mov_b32_e32 v5, v4
+; W64-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    s_nop 0
-; W64-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; W64-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; W64-O0-NEXT:    v_mov_b32_e32 v4, v3
-; W64-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; W64-O0-NEXT:    v_mov_b32_e32 v13, v2
+; W64-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; W64-O0-NEXT:    v_mov_b32_e32 v10, v2
 ; W64-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; W64-O0-NEXT:    v_mov_b32_e32 v10, v1
+; W64-O0-NEXT:    v_mov_b32_e32 v6, v1
 ; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; W64-O0-NEXT:    v_mov_b32_e32 v8, v0
-; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
-; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
+; W64-O0-NEXT:    v_mov_b32_e32 v9, v0
+; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    ; implicit-def: $sgpr4
 ; W64-O0-NEXT:    ; implicit-def: $sgpr4
-; W64-O0-NEXT:    ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec
-; W64-O0-NEXT:    v_mov_b32_e32 v14, v4
-; W64-O0-NEXT:    v_mov_b32_e32 v4, v14
-; W64-O0-NEXT:    v_mov_b32_e32 v6, v13
+; W64-O0-NEXT:    ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec
+; W64-O0-NEXT:    v_mov_b32_e32 v11, v3
+; W64-O0-NEXT:    v_mov_b32_e32 v3, v11
+; W64-O0-NEXT:    v_mov_b32_e32 v5, v10
 ; W64-O0-NEXT:    ; implicit-def: $sgpr4
 ; W64-O0-NEXT:    ; implicit-def: $sgpr4
-; W64-O0-NEXT:    ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
-; W64-O0-NEXT:    v_mov_b32_e32 v9, v10
+; W64-O0-NEXT:    ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec
+; W64-O0-NEXT:    v_mov_b32_e32 v10, v6
+; W64-O0-NEXT:    v_mov_b32_e32 v6, v10
 ; W64-O0-NEXT:    v_mov_b32_e32 v13, v9
-; W64-O0-NEXT:    ; kill: def $vgpr8 killed $vgpr8 killed $vgpr8_vgpr9 killed $exec
 ; W64-O0-NEXT:    ; implicit-def: $sgpr4
 ; W64-O0-NEXT:    ; implicit-def: $sgpr4
 ; W64-O0-NEXT:    ; implicit-def: $sgpr4
 ; W64-O0-NEXT:    ; implicit-def: $sgpr4
-; W64-O0-NEXT:    ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9_vgpr10_vgpr11 killed $exec
-; W64-O0-NEXT:    v_mov_b32_e32 v9, v13
-; W64-O0-NEXT:    v_mov_b32_e32 v10, v6
-; W64-O0-NEXT:    v_mov_b32_e32 v11, v4
-; W64-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; W64-O0-NEXT:    ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14_vgpr15_vgpr16 killed $exec
+; W64-O0-NEXT:    v_mov_b32_e32 v14, v6
+; W64-O0-NEXT:    v_mov_b32_e32 v15, v5
+; W64-O0-NEXT:    v_mov_b32_e32 v16, v3
+; W64-O0-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
-; W64-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; W64-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; W64-O0-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; W64-O0-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; W64-O0-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; W64-O0-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    ; implicit-def: $sgpr4
 ; W64-O0-NEXT:    ; implicit-def: $sgpr4
-; W64-O0-NEXT:    ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
-; W64-O0-NEXT:    v_mov_b32_e32 v6, v7
+; W64-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; W64-O0-NEXT:    v_mov_b32_e32 v5, v7
 ; W64-O0-NEXT:    ; implicit-def: $sgpr4
 ; W64-O0-NEXT:    ; implicit-def: $sgpr4
-; W64-O0-NEXT:    ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec
-; W64-O0-NEXT:    v_mov_b32_e32 v4, v2
+; W64-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; W64-O0-NEXT:    v_mov_b32_e32 v3, v1
 ; W64-O0-NEXT:    ; implicit-def: $sgpr4
 ; W64-O0-NEXT:    ; implicit-def: $sgpr4
-; W64-O0-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; W64-O0-NEXT:    v_mov_b32_e32 v2, v12
+; W64-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; W64-O0-NEXT:    v_mov_b32_e32 v1, v12
+; W64-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; W64-O0-NEXT:    s_waitcnt vmcnt(0)
 ; W64-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; W64-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
-; W64-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; W64-O0-NEXT:    s_waitcnt vmcnt(0)
-; W64-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    ; implicit-def: $sgpr4_sgpr5
 ; W64-O0-NEXT:    ; implicit-def: $sgpr4_sgpr5
-; W64-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
-; W64-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; W64-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    ;;#ASMSTART
 ; W64-O0-NEXT:    s_mov_b32 s4, 17
 ; W64-O0-NEXT:    ;;#ASMEND
 ; W64-O0-NEXT:    s_mov_b32 s5, s4
-; W64-O0-NEXT:    v_writelane_b32 v0, s5, 0
+; W64-O0-NEXT:    v_writelane_b32 v8, s5, 0
 ; W64-O0-NEXT:    s_mov_b32 s5, 0
-; W64-O0-NEXT:    v_writelane_b32 v0, s5, 1
-; W64-O0-NEXT:    v_mov_b32_e32 v1, s4
-; W64-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; W64-O0-NEXT:    s_mov_b64 s[4:5], exec
-; W64-O0-NEXT:    v_writelane_b32 v0, s4, 2
-; W64-O0-NEXT:    v_writelane_b32 v0, s5, 3
-; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
+; W64-O0-NEXT:    v_writelane_b32 v8, s5, 1
+; W64-O0-NEXT:    v_mov_b32_e32 v0, s4
 ; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
-; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
+; W64-O0-NEXT:    s_mov_b64 s[4:5], exec
+; W64-O0-NEXT:    v_writelane_b32 v8, s4, 2
+; W64-O0-NEXT:    v_writelane_b32 v8, s5, 3
 ; W64-O0-NEXT:  .LBB2_1: ; =>This Inner Loop Header: Depth=1
-; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
-; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
+; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; W64-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
-; W64-O0-NEXT:    v_readfirstlane_b32 s8, v1
-; W64-O0-NEXT:    v_readfirstlane_b32 s12, v2
+; W64-O0-NEXT:    v_readfirstlane_b32 s8, v0
+; W64-O0-NEXT:    v_readfirstlane_b32 s12, v1
 ; W64-O0-NEXT:    s_mov_b32 s4, s8
 ; W64-O0-NEXT:    s_mov_b32 s5, s12
-; W64-O0-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[4:5], v[1:2]
-; W64-O0-NEXT:    v_readfirstlane_b32 s7, v3
-; W64-O0-NEXT:    v_readfirstlane_b32 s6, v4
+; W64-O0-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[4:5], v[0:1]
+; W64-O0-NEXT:    v_readfirstlane_b32 s7, v2
+; W64-O0-NEXT:    v_readfirstlane_b32 s6, v3
 ; W64-O0-NEXT:    s_mov_b32 s10, s7
 ; W64-O0-NEXT:    s_mov_b32 s11, s6
-; W64-O0-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[10:11], v[3:4]
+; W64-O0-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[10:11], v[2:3]
 ; W64-O0-NEXT:    s_and_b64 s[4:5], s[4:5], s[10:11]
 ; W64-O0-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
 ; W64-O0-NEXT:    s_mov_b32 s9, s12
 ; W64-O0-NEXT:    s_mov_b32 s10, s7
 ; W64-O0-NEXT:    s_mov_b32 s11, s6
-; W64-O0-NEXT:    v_writelane_b32 v0, s8, 4
-; W64-O0-NEXT:    v_writelane_b32 v0, s9, 5
-; W64-O0-NEXT:    v_writelane_b32 v0, s10, 6
-; W64-O0-NEXT:    v_writelane_b32 v0, s11, 7
+; W64-O0-NEXT:    v_writelane_b32 v8, s8, 4
+; W64-O0-NEXT:    v_writelane_b32 v8, s9, 5
+; W64-O0-NEXT:    v_writelane_b32 v8, s10, 6
+; W64-O0-NEXT:    v_writelane_b32 v8, s11, 7
 ; W64-O0-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
-; W64-O0-NEXT:    v_writelane_b32 v0, s4, 8
-; W64-O0-NEXT:    v_writelane_b32 v0, s5, 9
-; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
-; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
-; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
+; W64-O0-NEXT:    v_writelane_b32 v8, s4, 8
+; W64-O0-NEXT:    v_writelane_b32 v8, s5, 9
 ; W64-O0-NEXT:  ; %bb.2: ; in Loop: Header=BB2_1 Depth=1
-; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
-; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
+; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; W64-O0-NEXT:    v_readlane_b32 s4, v8, 8
+; W64-O0-NEXT:    v_readlane_b32 s5, v8, 9
+; W64-O0-NEXT:    v_readlane_b32 s8, v8, 4
+; W64-O0-NEXT:    v_readlane_b32 s9, v8, 5
+; W64-O0-NEXT:    v_readlane_b32 s10, v8, 6
+; W64-O0-NEXT:    v_readlane_b32 s11, v8, 7
+; W64-O0-NEXT:    v_readlane_b32 s6, v8, 1
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
-; W64-O0-NEXT:    v_readlane_b32 s4, v1, 8
-; W64-O0-NEXT:    v_readlane_b32 s5, v1, 9
-; W64-O0-NEXT:    v_readlane_b32 s8, v1, 4
-; W64-O0-NEXT:    v_readlane_b32 s9, v1, 5
-; W64-O0-NEXT:    v_readlane_b32 s10, v1, 6
-; W64-O0-NEXT:    v_readlane_b32 s11, v1, 7
-; W64-O0-NEXT:    v_readlane_b32 s6, v1, 1
-; W64-O0-NEXT:    s_nop 4
+; W64-O0-NEXT:    s_nop 3
 ; W64-O0-NEXT:    buffer_load_format_x v0, v0, s[8:11], s6 idxen
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
-; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    s_xor_b64 exec, exec, s[4:5]
 ; W64-O0-NEXT:    s_cbranch_execnz .LBB2_1
 ; W64-O0-NEXT:  ; %bb.3:
-; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
-; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
-; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; W64-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_waitcnt vmcnt(2)
-; W64-O0-NEXT:    v_readlane_b32 s6, v0, 2
-; W64-O0-NEXT:    v_readlane_b32 s7, v0, 3
+; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; W64-O0-NEXT:    v_readlane_b32 s6, v8, 2
+; W64-O0-NEXT:    v_readlane_b32 s7, v8, 3
 ; W64-O0-NEXT:    s_mov_b64 exec, s[6:7]
-; W64-O0-NEXT:    v_readlane_b32 s4, v0, 1
+; W64-O0-NEXT:    v_readlane_b32 s4, v8, 1
 ; W64-O0-NEXT:    s_mov_b32 s5, 0x3ff
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
-; W64-O0-NEXT:    v_and_b32_e64 v2, v2, s5
-; W64-O0-NEXT:    v_cmp_eq_u32_e64 s[6:7], v2, s4
-; W64-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; W64-O0-NEXT:    v_and_b32_e64 v1, v1, s5
+; W64-O0-NEXT:    v_cmp_eq_u32_e64 s[6:7], v1, s4
+; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    s_mov_b64 s[4:5], exec
-; W64-O0-NEXT:    v_writelane_b32 v0, s4, 10
-; W64-O0-NEXT:    v_writelane_b32 v0, s5, 11
-; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
-; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
-; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
+; W64-O0-NEXT:    v_writelane_b32 v8, s4, 10
+; W64-O0-NEXT:    v_writelane_b32 v8, s5, 11
 ; W64-O0-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
 ; W64-O0-NEXT:    s_mov_b64 exec, s[4:5]
 ; W64-O0-NEXT:    s_cbranch_execz .LBB2_8
 ; W64-O0-NEXT:  ; %bb.4: ; %bb1
-; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
-; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
+; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; W64-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; W64-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; W64-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_waitcnt vmcnt(4)
-; W64-O0-NEXT:    v_readlane_b32 s4, v0, 0
+; W64-O0-NEXT:    v_readlane_b32 s4, v8, 0
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
-; W64-O0-NEXT:    v_mov_b32_e32 v7, v5
-; W64-O0-NEXT:    v_mov_b32_e32 v1, v4
-; W64-O0-NEXT:    v_mov_b32_e32 v5, v3
-; W64-O0-NEXT:    v_mov_b32_e32 v6, v2
+; W64-O0-NEXT:    v_mov_b32_e32 v6, v4
+; W64-O0-NEXT:    v_mov_b32_e32 v0, v3
+; W64-O0-NEXT:    v_mov_b32_e32 v4, v2
+; W64-O0-NEXT:    v_mov_b32_e32 v5, v1
 ; W64-O0-NEXT:    ; implicit-def: $sgpr5
 ; W64-O0-NEXT:    ; implicit-def: $sgpr5
 ; W64-O0-NEXT:    ; implicit-def: $sgpr5
 ; W64-O0-NEXT:    ; implicit-def: $sgpr5
-; W64-O0-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2_vgpr3_vgpr4 killed $exec
-; W64-O0-NEXT:    v_mov_b32_e32 v2, v7
-; W64-O0-NEXT:    v_mov_b32_e32 v3, v6
-; W64-O0-NEXT:    v_mov_b32_e32 v4, v5
-; W64-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; W64-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
+; W64-O0-NEXT:    v_mov_b32_e32 v1, v6
+; W64-O0-NEXT:    v_mov_b32_e32 v2, v5
+; W64-O0-NEXT:    v_mov_b32_e32 v3, v4
+; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
+; W64-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; W64-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    s_mov_b32 s5, 0
-; W64-O0-NEXT:    v_writelane_b32 v0, s5, 12
-; W64-O0-NEXT:    v_mov_b32_e32 v1, s4
-; W64-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; W64-O0-NEXT:    v_writelane_b32 v8, s5, 12
+; W64-O0-NEXT:    v_mov_b32_e32 v0, s4
+; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    s_mov_b64 s[4:5], exec
-; W64-O0-NEXT:    v_writelane_b32 v0, s4, 13
-; W64-O0-NEXT:    v_writelane_b32 v0, s5, 14
-; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
-; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
-; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
+; W64-O0-NEXT:    v_writelane_b32 v8, s4, 13
+; W64-O0-NEXT:    v_writelane_b32 v8, s5, 14
 ; W64-O0-NEXT:  .LBB2_5: ; =>This Inner Loop Header: Depth=1
-; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
-; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
+; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; W64-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
-; W64-O0-NEXT:    v_readfirstlane_b32 s8, v1
-; W64-O0-NEXT:    v_readfirstlane_b32 s12, v2
+; W64-O0-NEXT:    v_readfirstlane_b32 s8, v0
+; W64-O0-NEXT:    v_readfirstlane_b32 s12, v1
 ; W64-O0-NEXT:    s_mov_b32 s4, s8
 ; W64-O0-NEXT:    s_mov_b32 s5, s12
-; W64-O0-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[4:5], v[1:2]
-; W64-O0-NEXT:    v_readfirstlane_b32 s7, v3
-; W64-O0-NEXT:    v_readfirstlane_b32 s6, v4
+; W64-O0-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[4:5], v[0:1]
+; W64-O0-NEXT:    v_readfirstlane_b32 s7, v2
+; W64-O0-NEXT:    v_readfirstlane_b32 s6, v3
 ; W64-O0-NEXT:    s_mov_b32 s10, s7
 ; W64-O0-NEXT:    s_mov_b32 s11, s6
-; W64-O0-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[10:11], v[3:4]
+; W64-O0-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[10:11], v[2:3]
 ; W64-O0-NEXT:    s_and_b64 s[4:5], s[4:5], s[10:11]
 ; W64-O0-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
 ; W64-O0-NEXT:    s_mov_b32 s9, s12
 ; W64-O0-NEXT:    s_mov_b32 s10, s7
 ; W64-O0-NEXT:    s_mov_b32 s11, s6
-; W64-O0-NEXT:    v_writelane_b32 v0, s8, 15
-; W64-O0-NEXT:    v_writelane_b32 v0, s9, 16
-; W64-O0-NEXT:    v_writelane_b32 v0, s10, 17
-; W64-O0-NEXT:    v_writelane_b32 v0, s11, 18
+; W64-O0-NEXT:    v_writelane_b32 v8, s8, 15
+; W64-O0-NEXT:    v_writelane_b32 v8, s9, 16
+; W64-O0-NEXT:    v_writelane_b32 v8, s10, 17
+; W64-O0-NEXT:    v_writelane_b32 v8, s11, 18
 ; W64-O0-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
-; W64-O0-NEXT:    v_writelane_b32 v0, s4, 19
-; W64-O0-NEXT:    v_writelane_b32 v0, s5, 20
-; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
-; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
-; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
+; W64-O0-NEXT:    v_writelane_b32 v8, s4, 19
+; W64-O0-NEXT:    v_writelane_b32 v8, s5, 20
 ; W64-O0-NEXT:  ; %bb.6: ; in Loop: Header=BB2_5 Depth=1
-; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
-; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
+; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; W64-O0-NEXT:    v_readlane_b32 s4, v8, 19
+; W64-O0-NEXT:    v_readlane_b32 s5, v8, 20
+; W64-O0-NEXT:    v_readlane_b32 s8, v8, 15
+; W64-O0-NEXT:    v_readlane_b32 s9, v8, 16
+; W64-O0-NEXT:    v_readlane_b32 s10, v8, 17
+; W64-O0-NEXT:    v_readlane_b32 s11, v8, 18
+; W64-O0-NEXT:    v_readlane_b32 s6, v8, 12
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
-; W64-O0-NEXT:    v_readlane_b32 s4, v1, 19
-; W64-O0-NEXT:    v_readlane_b32 s5, v1, 20
-; W64-O0-NEXT:    v_readlane_b32 s8, v1, 15
-; W64-O0-NEXT:    v_readlane_b32 s9, v1, 16
-; W64-O0-NEXT:    v_readlane_b32 s10, v1, 17
-; W64-O0-NEXT:    v_readlane_b32 s11, v1, 18
-; W64-O0-NEXT:    v_readlane_b32 s6, v1, 12
-; W64-O0-NEXT:    s_nop 4
+; W64-O0-NEXT:    s_nop 3
 ; W64-O0-NEXT:    buffer_load_format_x v0, v0, s[8:11], s6 idxen
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
-; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    s_xor_b64 exec, exec, s[4:5]
 ; W64-O0-NEXT:    s_cbranch_execnz .LBB2_5
 ; W64-O0-NEXT:  ; %bb.7:
-; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
-; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
-; W64-O0-NEXT:    s_waitcnt vmcnt(0)
-; W64-O0-NEXT:    v_readlane_b32 s4, v1, 13
-; W64-O0-NEXT:    v_readlane_b32 s5, v1, 14
+; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; W64-O0-NEXT:    v_readlane_b32 s4, v8, 13
+; W64-O0-NEXT:    v_readlane_b32 s5, v8, 14
 ; W64-O0-NEXT:    s_mov_b64 exec, s[4:5]
-; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; W64-O0-NEXT:    s_waitcnt vmcnt(0)
+; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
 ; W64-O0-NEXT:  .LBB2_8: ; %bb2
-; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
-; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
+; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; W64-O0-NEXT:    s_nop 0
 ; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; W64-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; W64-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_waitcnt vmcnt(3)
-; W64-O0-NEXT:    v_readlane_b32 s4, v0, 10
-; W64-O0-NEXT:    v_readlane_b32 s5, v0, 11
+; W64-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; W64-O0-NEXT:    v_readlane_b32 s4, v8, 10
+; W64-O0-NEXT:    v_readlane_b32 s5, v8, 11
 ; W64-O0-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
-; W64-O0-NEXT:    global_store_dword v[1:2], v3, off
+; W64-O0-NEXT:    global_store_dword v[0:1], v2, off
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
-; W64-O0-NEXT:    ; kill: killed $vgpr0
 ; W64-O0-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; W64-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    s_mov_b64 exec, s[4:5]
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
 ; W64-O0-NEXT:    s_setpc_b64 s[30:31]

diff  --git a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
index 797b13044e7226..ba287349c5756f 100644
--- a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
@@ -192,12 +192,13 @@ define void @slsr1_1(i32 %b.arg, i32 %s.arg) #0 {
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
-; GFX9-NEXT:    v_writelane_b32 v40, s4, 5
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0x800
 ; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX9-NEXT:    v_writelane_b32 v44, s4, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s36, 3
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, foo at gotpcrel32@lo+4
@@ -229,9 +230,10 @@ define void @slsr1_1(i32 %b.arg, i32 %s.arg) #0 {
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 5
+; GFX9-NEXT:    v_readlane_b32 s4, v44, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v44, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xf800
 ; GFX9-NEXT:    s_mov_b32 s33, s4

diff  --git a/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll
index b04069e5003a27..c2c6c84e8f972d 100644
--- a/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll
+++ b/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll
@@ -27,7 +27,7 @@ define internal fastcc void @csr_vgpr_spill_fp_callee() #0 {
 ; CHECK-LABEL: csr_vgpr_spill_fp_callee:
 ; CHECK:       ; %bb.0: ; %bb
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_mov_b32 s18, s33
+; CHECK-NEXT:    s_mov_b32 s24, s33
 ; CHECK-NEXT:    s_mov_b32 s33, s32
 ; CHECK-NEXT:    s_xor_saveexec_b64 s[16:17], -1
 ; CHECK-NEXT:    buffer_store_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
@@ -54,7 +54,7 @@ define internal fastcc void @csr_vgpr_spill_fp_callee() #0 {
 ; CHECK-NEXT:    buffer_load_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; CHECK-NEXT:    s_mov_b64 exec, s[4:5]
 ; CHECK-NEXT:    s_add_i32 s32, s32, 0xfffffc00
-; CHECK-NEXT:    s_mov_b32 s33, s18
+; CHECK-NEXT:    s_mov_b32 s33, s24
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 bb:
@@ -66,16 +66,12 @@ bb:
 define amdgpu_kernel void @kernel_call() {
 ; CHECK-LABEL: kernel_call:
 ; CHECK:       ; %bb.0: ; %bb
-; CHECK-NEXT:    s_mov_b32 s32, 0x400
+; CHECK-NEXT:    s_mov_b32 s32, 0
 ; CHECK-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
 ; CHECK-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
 ; CHECK-NEXT:    s_add_u32 s0, s0, s17
 ; CHECK-NEXT:    s_addc_u32 s1, s1, 0
-; CHECK-NEXT:    ; implicit-def: $vgpr3
 ; CHECK-NEXT:    v_writelane_b32 v3, s16, 0
-; CHECK-NEXT:    s_or_saveexec_b64 s[24:25], -1
-; CHECK-NEXT:    buffer_store_dword v3, off, s[0:3], 0 offset:4 ; 4-byte Folded Spill
-; CHECK-NEXT:    s_mov_b64 exec, s[24:25]
 ; CHECK-NEXT:    s_mov_b32 s13, s15
 ; CHECK-NEXT:    s_mov_b32 s12, s14
 ; CHECK-NEXT:    v_readlane_b32 s14, v3, 0
@@ -93,10 +89,6 @@ define amdgpu_kernel void @kernel_call() {
 ; CHECK-NEXT:    s_mov_b64 s[0:1], s[20:21]
 ; CHECK-NEXT:    s_mov_b64 s[2:3], s[22:23]
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; CHECK-NEXT:    s_or_saveexec_b64 s[24:25], -1
-; CHECK-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload
-; CHECK-NEXT:    s_mov_b64 exec, s[24:25]
-; CHECK-NEXT:    ; kill: killed $vgpr0
 ; CHECK-NEXT:    s_endpgm
 bb:
   tail call fastcc void @csr_vgpr_spill_fp_callee()
@@ -121,9 +113,9 @@ define internal fastcc void @csr_vgpr_spill_fp_tailcall_callee() #0 {
 ; CHECK-NEXT:    s_addc_u32 s17, s17, callee_has_fp at rel32@hi+12
 ; CHECK-NEXT:    v_readlane_b32 s33, v1, 0
 ; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; CHECK-NEXT:    s_xor_saveexec_b64 s[18:19], -1
+; CHECK-NEXT:    s_xor_saveexec_b64 s[20:21], -1
 ; CHECK-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; CHECK-NEXT:    s_mov_b64 exec, s[18:19]
+; CHECK-NEXT:    s_mov_b64 exec, s[20:21]
 ; CHECK-NEXT:    s_setpc_b64 s[16:17]
 bb:
   call void asm sideeffect "; clobber csr v40", "~{v40}"()
@@ -134,16 +126,12 @@ bb:
 define amdgpu_kernel void @kernel_tailcall() {
 ; CHECK-LABEL: kernel_tailcall:
 ; CHECK:       ; %bb.0: ; %bb
-; CHECK-NEXT:    s_mov_b32 s32, 0x400
+; CHECK-NEXT:    s_mov_b32 s32, 0
 ; CHECK-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
 ; CHECK-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
 ; CHECK-NEXT:    s_add_u32 s0, s0, s17
 ; CHECK-NEXT:    s_addc_u32 s1, s1, 0
-; CHECK-NEXT:    ; implicit-def: $vgpr3
 ; CHECK-NEXT:    v_writelane_b32 v3, s16, 0
-; CHECK-NEXT:    s_or_saveexec_b64 s[24:25], -1
-; CHECK-NEXT:    buffer_store_dword v3, off, s[0:3], 0 offset:4 ; 4-byte Folded Spill
-; CHECK-NEXT:    s_mov_b64 exec, s[24:25]
 ; CHECK-NEXT:    s_mov_b32 s13, s15
 ; CHECK-NEXT:    s_mov_b32 s12, s14
 ; CHECK-NEXT:    v_readlane_b32 s14, v3, 0
@@ -161,10 +149,6 @@ define amdgpu_kernel void @kernel_tailcall() {
 ; CHECK-NEXT:    s_mov_b64 s[0:1], s[20:21]
 ; CHECK-NEXT:    s_mov_b64 s[2:3], s[22:23]
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; CHECK-NEXT:    s_or_saveexec_b64 s[24:25], -1
-; CHECK-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload
-; CHECK-NEXT:    s_mov_b64 exec, s[24:25]
-; CHECK-NEXT:    ; kill: killed $vgpr0
 ; CHECK-NEXT:    s_endpgm
 bb:
   tail call fastcc void @csr_vgpr_spill_fp_tailcall_callee()
@@ -188,7 +172,7 @@ define hidden i32 @caller_save_vgpr_spill_fp_tail_call() #0 {
 ; CHECK-LABEL: caller_save_vgpr_spill_fp_tail_call:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_mov_b32 s18, s33
+; CHECK-NEXT:    s_mov_b32 s24, s33
 ; CHECK-NEXT:    s_mov_b32 s33, s32
 ; CHECK-NEXT:    s_xor_saveexec_b64 s[16:17], -1
 ; CHECK-NEXT:    buffer_store_dword v1, off, s[0:3], s33 ; 4-byte Folded Spill
@@ -210,7 +194,7 @@ define hidden i32 @caller_save_vgpr_spill_fp_tail_call() #0 {
 ; CHECK-NEXT:    buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload
 ; CHECK-NEXT:    s_mov_b64 exec, s[4:5]
 ; CHECK-NEXT:    s_add_i32 s32, s32, 0xfffffc00
-; CHECK-NEXT:    s_mov_b32 s33, s18
+; CHECK-NEXT:    s_mov_b32 s33, s24
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -222,7 +206,7 @@ define hidden i32 @caller_save_vgpr_spill_fp() #0 {
 ; CHECK-LABEL: caller_save_vgpr_spill_fp:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_mov_b32 s19, s33
+; CHECK-NEXT:    s_mov_b32 s25, s33
 ; CHECK-NEXT:    s_mov_b32 s33, s32
 ; CHECK-NEXT:    s_xor_saveexec_b64 s[16:17], -1
 ; CHECK-NEXT:    buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
@@ -244,7 +228,7 @@ define hidden i32 @caller_save_vgpr_spill_fp() #0 {
 ; CHECK-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
 ; CHECK-NEXT:    s_mov_b64 exec, s[4:5]
 ; CHECK-NEXT:    s_add_i32 s32, s32, 0xfffffc00
-; CHECK-NEXT:    s_mov_b32 s33, s19
+; CHECK-NEXT:    s_mov_b32 s33, s25
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -255,16 +239,12 @@ entry:
 define protected amdgpu_kernel void @kernel() {
 ; CHECK-LABEL: kernel:
 ; CHECK:       ; %bb.0: ; %entry
-; CHECK-NEXT:    s_mov_b32 s32, 0x400
+; CHECK-NEXT:    s_mov_b32 s32, 0
 ; CHECK-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
 ; CHECK-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
 ; CHECK-NEXT:    s_add_u32 s0, s0, s17
 ; CHECK-NEXT:    s_addc_u32 s1, s1, 0
-; CHECK-NEXT:    ; implicit-def: $vgpr3
 ; CHECK-NEXT:    v_writelane_b32 v3, s16, 0
-; CHECK-NEXT:    s_or_saveexec_b64 s[24:25], -1
-; CHECK-NEXT:    buffer_store_dword v3, off, s[0:3], 0 offset:4 ; 4-byte Folded Spill
-; CHECK-NEXT:    s_mov_b64 exec, s[24:25]
 ; CHECK-NEXT:    s_mov_b32 s13, s15
 ; CHECK-NEXT:    s_mov_b32 s12, s14
 ; CHECK-NEXT:    v_readlane_b32 s14, v3, 0
@@ -282,11 +262,6 @@ define protected amdgpu_kernel void @kernel() {
 ; CHECK-NEXT:    s_mov_b64 s[0:1], s[20:21]
 ; CHECK-NEXT:    s_mov_b64 s[2:3], s[22:23]
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; CHECK-NEXT:    ; kill: def $vgpr1 killed $vgpr0 killed $exec
-; CHECK-NEXT:    s_or_saveexec_b64 s[24:25], -1
-; CHECK-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload
-; CHECK-NEXT:    s_mov_b64 exec, s[24:25]
-; CHECK-NEXT:    ; kill: killed $vgpr0
 ; CHECK-NEXT:    s_endpgm
 entry:
   %call = call i32 @caller_save_vgpr_spill_fp()

diff  --git a/llvm/test/CodeGen/AMDGPU/nested-calls.ll b/llvm/test/CodeGen/AMDGPU/nested-calls.ll
index 7b6ba788fd3ed0..4b592f61c8ce74 100644
--- a/llvm/test/CodeGen/AMDGPU/nested-calls.ll
+++ b/llvm/test/CodeGen/AMDGPU/nested-calls.ll
@@ -15,8 +15,9 @@ declare void @external_void_func_i32(i32) #0
 ; GCN-NEXT: s_mov_b32 s33, s32
 ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}}
 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
-; GCN-DAG: v_writelane_b32 v40, [[FP_SCRATCH_COPY]], 2
+; GCN-DAG: v_writelane_b32 v41, [[FP_SCRATCH_COPY]], 0
 ; GCN-DAG: v_writelane_b32 v40, s30, 0
 ; GCN-DAG: v_writelane_b32 v40, s31, 1
 
@@ -25,9 +26,10 @@ declare void @external_void_func_i32(i32) #0
 ; GCN: v_readlane_b32 s31, v40, 1
 ; GCN: v_readlane_b32 s30, v40, 0
 
-; GCN-NEXT: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], v40, 2
+; GCN-NEXT: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], v41, 0
 ; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
 ; GCN-NEXT: s_addk_i32 s32, 0xfc00
 ; GCN-NEXT: s_mov_b32 s33, [[FP_SCRATCH_COPY]]

diff  --git a/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll b/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll
index 9999cb9173b5db..c71938eb818828 100644
--- a/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll
+++ b/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll
@@ -17,8 +17,9 @@ define hidden void @_ZL3barv() #0 !dbg !1644 {
 ; CHECK-NEXT:    s_mov_b32 s33, s32
 ; CHECK-NEXT:    s_or_saveexec_b64 s[18:19], -1
 ; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; CHECK-NEXT:    s_mov_b64 exec, s[18:19]
-; CHECK-NEXT:    v_writelane_b32 v40, s16, 2
+; CHECK-NEXT:    v_writelane_b32 v41, s16, 0
 ; CHECK-NEXT:    s_add_i32 s32, s32, 0x400
 ; CHECK-NEXT:    v_writelane_b32 v40, s30, 0
 ; CHECK-NEXT:    v_writelane_b32 v40, s31, 1
@@ -38,11 +39,12 @@ define hidden void @_ZL3barv() #0 !dbg !1644 {
 ; CHECK-NEXT:    .loc 0 32 1 ; lane-info.cpp:32:1
 ; CHECK-NEXT:    v_readlane_b32 s31, v40, 1
 ; CHECK-NEXT:    v_readlane_b32 s30, v40, 0
-; CHECK-NEXT:    v_readlane_b32 s4, v40, 2
+; CHECK-NEXT:    v_readlane_b32 s4, v41, 0
 ; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; CHECK-NEXT:    s_mov_b64 exec, s[6:7]
-; CHECK-NEXT:    .loc 0 32 1 epilogue_begin is_stmt 0 ; lane-info.cpp:32:1
+; CHECK-NEXT:    .loc    0 32 1 epilogue_begin is_stmt 0 ; lane-info.cpp:32:1
 ; CHECK-NEXT:    s_add_i32 s32, s32, 0xfffffc00
 ; CHECK-NEXT:    s_mov_b32 s33, s4
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)

diff  --git a/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll
index f223a447318112..1a3f42b115a8dc 100644
--- a/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll
+++ b/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll
@@ -11,107 +11,95 @@
 define amdgpu_kernel void @spill_sgprs_to_multiple_vgprs(ptr addrspace(1) %out, i32 %in) #0 {
 ; GCN-LABEL: spill_sgprs_to_multiple_vgprs:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_mov_b32 s92, SCRATCH_RSRC_DWORD0
-; GCN-NEXT:    s_mov_b32 s93, SCRATCH_RSRC_DWORD1
-; GCN-NEXT:    s_mov_b32 s94, -1
-; GCN-NEXT:    s_mov_b32 s95, 0xe8f000
-; GCN-NEXT:    s_add_u32 s92, s92, s11
-; GCN-NEXT:    s_addc_u32 s93, s93, 0
-; GCN-NEXT:    ; implicit-def: $vgpr0
-; GCN-NEXT:    ; implicit-def: $vgpr1
-; GCN-NEXT:    ; implicit-def: $vgpr2
 ; GCN-NEXT:    s_load_dword s0, s[4:5], 0xb
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ; def s[4:11]
 ; GCN-NEXT:    ;;#ASMEND
-; GCN-NEXT:    v_writelane_b32 v2, s4, 0
-; GCN-NEXT:    v_writelane_b32 v2, s5, 1
-; GCN-NEXT:    v_writelane_b32 v2, s6, 2
-; GCN-NEXT:    v_writelane_b32 v2, s7, 3
-; GCN-NEXT:    v_writelane_b32 v2, s8, 4
-; GCN-NEXT:    v_writelane_b32 v2, s9, 5
-; GCN-NEXT:    v_writelane_b32 v2, s10, 6
-; GCN-NEXT:    v_writelane_b32 v2, s11, 7
+; GCN-NEXT:    v_writelane_b32 v0, s4, 0
+; GCN-NEXT:    v_writelane_b32 v0, s5, 1
+; GCN-NEXT:    v_writelane_b32 v0, s6, 2
+; GCN-NEXT:    v_writelane_b32 v0, s7, 3
+; GCN-NEXT:    v_writelane_b32 v0, s8, 4
+; GCN-NEXT:    v_writelane_b32 v0, s9, 5
+; GCN-NEXT:    v_writelane_b32 v0, s10, 6
+; GCN-NEXT:    v_writelane_b32 v0, s11, 7
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ; def s[4:11]
 ; GCN-NEXT:    ;;#ASMEND
-; GCN-NEXT:    v_writelane_b32 v2, s4, 8
-; GCN-NEXT:    v_writelane_b32 v2, s5, 9
-; GCN-NEXT:    v_writelane_b32 v2, s6, 10
-; GCN-NEXT:    v_writelane_b32 v2, s7, 11
-; GCN-NEXT:    v_writelane_b32 v2, s8, 12
-; GCN-NEXT:    v_writelane_b32 v2, s9, 13
-; GCN-NEXT:    v_writelane_b32 v2, s10, 14
-; GCN-NEXT:    v_writelane_b32 v2, s11, 15
+; GCN-NEXT:    v_writelane_b32 v0, s4, 8
+; GCN-NEXT:    v_writelane_b32 v0, s5, 9
+; GCN-NEXT:    v_writelane_b32 v0, s6, 10
+; GCN-NEXT:    v_writelane_b32 v0, s7, 11
+; GCN-NEXT:    v_writelane_b32 v0, s8, 12
+; GCN-NEXT:    v_writelane_b32 v0, s9, 13
+; GCN-NEXT:    v_writelane_b32 v0, s10, 14
+; GCN-NEXT:    v_writelane_b32 v0, s11, 15
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ; def s[4:11]
 ; GCN-NEXT:    ;;#ASMEND
-; GCN-NEXT:    v_writelane_b32 v2, s4, 16
-; GCN-NEXT:    v_writelane_b32 v2, s5, 17
-; GCN-NEXT:    v_writelane_b32 v2, s6, 18
-; GCN-NEXT:    v_writelane_b32 v2, s7, 19
-; GCN-NEXT:    v_writelane_b32 v2, s8, 20
-; GCN-NEXT:    v_writelane_b32 v2, s9, 21
-; GCN-NEXT:    v_writelane_b32 v2, s10, 22
-; GCN-NEXT:    v_writelane_b32 v2, s11, 23
+; GCN-NEXT:    v_writelane_b32 v0, s4, 16
+; GCN-NEXT:    v_writelane_b32 v0, s5, 17
+; GCN-NEXT:    v_writelane_b32 v0, s6, 18
+; GCN-NEXT:    v_writelane_b32 v0, s7, 19
+; GCN-NEXT:    v_writelane_b32 v0, s8, 20
+; GCN-NEXT:    v_writelane_b32 v0, s9, 21
+; GCN-NEXT:    v_writelane_b32 v0, s10, 22
+; GCN-NEXT:    v_writelane_b32 v0, s11, 23
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ; def s[4:11]
 ; GCN-NEXT:    ;;#ASMEND
-; GCN-NEXT:    v_writelane_b32 v2, s4, 24
-; GCN-NEXT:    v_writelane_b32 v2, s5, 25
-; GCN-NEXT:    v_writelane_b32 v2, s6, 26
-; GCN-NEXT:    v_writelane_b32 v2, s7, 27
-; GCN-NEXT:    v_writelane_b32 v2, s8, 28
-; GCN-NEXT:    v_writelane_b32 v2, s9, 29
-; GCN-NEXT:    v_writelane_b32 v2, s10, 30
-; GCN-NEXT:    v_writelane_b32 v2, s11, 31
+; GCN-NEXT:    v_writelane_b32 v0, s4, 24
+; GCN-NEXT:    v_writelane_b32 v0, s5, 25
+; GCN-NEXT:    v_writelane_b32 v0, s6, 26
+; GCN-NEXT:    v_writelane_b32 v0, s7, 27
+; GCN-NEXT:    v_writelane_b32 v0, s8, 28
+; GCN-NEXT:    v_writelane_b32 v0, s9, 29
+; GCN-NEXT:    v_writelane_b32 v0, s10, 30
+; GCN-NEXT:    v_writelane_b32 v0, s11, 31
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ; def s[4:11]
 ; GCN-NEXT:    ;;#ASMEND
-; GCN-NEXT:    v_writelane_b32 v2, s4, 32
-; GCN-NEXT:    v_writelane_b32 v2, s5, 33
-; GCN-NEXT:    v_writelane_b32 v2, s6, 34
-; GCN-NEXT:    v_writelane_b32 v2, s7, 35
-; GCN-NEXT:    v_writelane_b32 v2, s8, 36
-; GCN-NEXT:    v_writelane_b32 v2, s9, 37
-; GCN-NEXT:    v_writelane_b32 v2, s10, 38
-; GCN-NEXT:    v_writelane_b32 v2, s11, 39
+; GCN-NEXT:    v_writelane_b32 v0, s4, 32
+; GCN-NEXT:    v_writelane_b32 v0, s5, 33
+; GCN-NEXT:    v_writelane_b32 v0, s6, 34
+; GCN-NEXT:    v_writelane_b32 v0, s7, 35
+; GCN-NEXT:    v_writelane_b32 v0, s8, 36
+; GCN-NEXT:    v_writelane_b32 v0, s9, 37
+; GCN-NEXT:    v_writelane_b32 v0, s10, 38
+; GCN-NEXT:    v_writelane_b32 v0, s11, 39
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ; def s[4:11]
 ; GCN-NEXT:    ;;#ASMEND
-; GCN-NEXT:    v_writelane_b32 v2, s4, 40
-; GCN-NEXT:    v_writelane_b32 v2, s5, 41
-; GCN-NEXT:    v_writelane_b32 v2, s6, 42
-; GCN-NEXT:    v_writelane_b32 v2, s7, 43
-; GCN-NEXT:    v_writelane_b32 v2, s8, 44
-; GCN-NEXT:    v_writelane_b32 v2, s9, 45
-; GCN-NEXT:    v_writelane_b32 v2, s10, 46
-; GCN-NEXT:    v_writelane_b32 v2, s11, 47
+; GCN-NEXT:    v_writelane_b32 v0, s4, 40
+; GCN-NEXT:    v_writelane_b32 v0, s5, 41
+; GCN-NEXT:    v_writelane_b32 v0, s6, 42
+; GCN-NEXT:    v_writelane_b32 v0, s7, 43
+; GCN-NEXT:    v_writelane_b32 v0, s8, 44
+; GCN-NEXT:    v_writelane_b32 v0, s9, 45
+; GCN-NEXT:    v_writelane_b32 v0, s10, 46
+; GCN-NEXT:    v_writelane_b32 v0, s11, 47
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ; def s[4:11]
 ; GCN-NEXT:    ;;#ASMEND
-; GCN-NEXT:    v_writelane_b32 v2, s4, 48
-; GCN-NEXT:    v_writelane_b32 v2, s5, 49
-; GCN-NEXT:    v_writelane_b32 v2, s6, 50
-; GCN-NEXT:    v_writelane_b32 v2, s7, 51
-; GCN-NEXT:    v_writelane_b32 v2, s8, 52
-; GCN-NEXT:    v_writelane_b32 v2, s9, 53
-; GCN-NEXT:    v_writelane_b32 v2, s10, 54
-; GCN-NEXT:    v_writelane_b32 v2, s11, 55
+; GCN-NEXT:    v_writelane_b32 v0, s4, 48
+; GCN-NEXT:    v_writelane_b32 v0, s5, 49
+; GCN-NEXT:    v_writelane_b32 v0, s6, 50
+; GCN-NEXT:    v_writelane_b32 v0, s7, 51
+; GCN-NEXT:    v_writelane_b32 v0, s8, 52
+; GCN-NEXT:    v_writelane_b32 v0, s9, 53
+; GCN-NEXT:    v_writelane_b32 v0, s10, 54
+; GCN-NEXT:    v_writelane_b32 v0, s11, 55
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ; def s[4:11]
 ; GCN-NEXT:    ;;#ASMEND
-; GCN-NEXT:    v_writelane_b32 v2, s4, 56
-; GCN-NEXT:    v_writelane_b32 v2, s5, 57
-; GCN-NEXT:    v_writelane_b32 v2, s6, 58
-; GCN-NEXT:    v_writelane_b32 v2, s7, 59
-; GCN-NEXT:    v_writelane_b32 v2, s8, 60
-; GCN-NEXT:    v_writelane_b32 v2, s9, 61
-; GCN-NEXT:    v_writelane_b32 v2, s10, 62
-; GCN-NEXT:    v_writelane_b32 v2, s11, 63
-; GCN-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GCN-NEXT:    buffer_store_dword v2, off, s[92:95], 0 offset:12 ; 4-byte Folded Spill
-; GCN-NEXT:    s_mov_b64 exec, s[34:35]
+; GCN-NEXT:    v_writelane_b32 v0, s4, 56
+; GCN-NEXT:    v_writelane_b32 v0, s5, 57
+; GCN-NEXT:    v_writelane_b32 v0, s6, 58
+; GCN-NEXT:    v_writelane_b32 v0, s7, 59
+; GCN-NEXT:    v_writelane_b32 v0, s8, 60
+; GCN-NEXT:    v_writelane_b32 v0, s9, 61
+; GCN-NEXT:    v_writelane_b32 v0, s10, 62
+; GCN-NEXT:    v_writelane_b32 v0, s11, 63
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ; def s[4:11]
 ; GCN-NEXT:    ;;#ASMEND
@@ -200,192 +188,176 @@ define amdgpu_kernel void @spill_sgprs_to_multiple_vgprs(ptr addrspace(1) %out,
 ; GCN-NEXT:    v_writelane_b32 v1, s9, 61
 ; GCN-NEXT:    v_writelane_b32 v1, s10, 62
 ; GCN-NEXT:    v_writelane_b32 v1, s11, 63
-; GCN-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GCN-NEXT:    buffer_store_dword v1, off, s[92:95], 0 offset:8 ; 4-byte Folded Spill
-; GCN-NEXT:    s_mov_b64 exec, s[34:35]
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ; def s[4:11]
 ; GCN-NEXT:    ;;#ASMEND
-; GCN-NEXT:    v_writelane_b32 v0, s4, 0
-; GCN-NEXT:    v_writelane_b32 v0, s5, 1
-; GCN-NEXT:    v_writelane_b32 v0, s6, 2
-; GCN-NEXT:    v_writelane_b32 v0, s7, 3
-; GCN-NEXT:    v_writelane_b32 v0, s8, 4
-; GCN-NEXT:    v_writelane_b32 v0, s9, 5
-; GCN-NEXT:    v_writelane_b32 v0, s10, 6
-; GCN-NEXT:    v_writelane_b32 v0, s11, 7
-; GCN-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GCN-NEXT:    buffer_store_dword v0, off, s[92:95], 0 offset:4 ; 4-byte Folded Spill
-; GCN-NEXT:    s_mov_b64 exec, s[34:35]
+; GCN-NEXT:    v_writelane_b32 v2, s4, 0
+; GCN-NEXT:    v_writelane_b32 v2, s5, 1
+; GCN-NEXT:    v_writelane_b32 v2, s6, 2
+; GCN-NEXT:    v_writelane_b32 v2, s7, 3
+; GCN-NEXT:    v_writelane_b32 v2, s8, 4
+; GCN-NEXT:    v_writelane_b32 v2, s9, 5
+; GCN-NEXT:    v_writelane_b32 v2, s10, 6
+; GCN-NEXT:    v_writelane_b32 v2, s11, 7
 ; GCN-NEXT:    s_mov_b32 s1, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_cmp_lg_u32 s0, s1
 ; GCN-NEXT:    s_cbranch_scc1 .LBB0_2
 ; GCN-NEXT:  ; %bb.1: ; %bb0
-; GCN-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GCN-NEXT:    buffer_load_dword v0, off, s[92:95], 0 offset:4 ; 4-byte Folded Reload
-; GCN-NEXT:    s_mov_b64 exec, s[34:35]
-; GCN-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GCN-NEXT:    buffer_load_dword v1, off, s[92:95], 0 offset:12 ; 4-byte Folded Reload
-; GCN-NEXT:    s_mov_b64 exec, s[34:35]
-; GCN-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GCN-NEXT:    buffer_load_dword v2, off, s[92:95], 0 offset:8 ; 4-byte Folded Reload
-; GCN-NEXT:    s_mov_b64 exec, s[34:35]
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_readlane_b32 s8, v2, 56
-; GCN-NEXT:    v_readlane_b32 s9, v2, 57
-; GCN-NEXT:    v_readlane_b32 s10, v2, 58
-; GCN-NEXT:    v_readlane_b32 s11, v2, 59
-; GCN-NEXT:    v_readlane_b32 s12, v2, 60
-; GCN-NEXT:    v_readlane_b32 s13, v2, 61
-; GCN-NEXT:    v_readlane_b32 s14, v2, 62
-; GCN-NEXT:    v_readlane_b32 s15, v2, 63
-; GCN-NEXT:    v_readlane_b32 s16, v2, 48
-; GCN-NEXT:    v_readlane_b32 s17, v2, 49
-; GCN-NEXT:    v_readlane_b32 s18, v2, 50
-; GCN-NEXT:    v_readlane_b32 s19, v2, 51
-; GCN-NEXT:    v_readlane_b32 s20, v2, 52
-; GCN-NEXT:    v_readlane_b32 s21, v2, 53
-; GCN-NEXT:    v_readlane_b32 s22, v2, 54
-; GCN-NEXT:    v_readlane_b32 s23, v2, 55
-; GCN-NEXT:    v_readlane_b32 s24, v2, 40
-; GCN-NEXT:    v_readlane_b32 s25, v2, 41
-; GCN-NEXT:    v_readlane_b32 s26, v2, 42
-; GCN-NEXT:    v_readlane_b32 s27, v2, 43
-; GCN-NEXT:    v_readlane_b32 s28, v2, 44
-; GCN-NEXT:    v_readlane_b32 s29, v2, 45
-; GCN-NEXT:    v_readlane_b32 s30, v2, 46
-; GCN-NEXT:    v_readlane_b32 s31, v2, 47
-; GCN-NEXT:    v_readlane_b32 s36, v2, 32
-; GCN-NEXT:    v_readlane_b32 s37, v2, 33
-; GCN-NEXT:    v_readlane_b32 s38, v2, 34
-; GCN-NEXT:    v_readlane_b32 s39, v2, 35
-; GCN-NEXT:    v_readlane_b32 s40, v2, 36
-; GCN-NEXT:    v_readlane_b32 s41, v2, 37
-; GCN-NEXT:    v_readlane_b32 s42, v2, 38
-; GCN-NEXT:    v_readlane_b32 s43, v2, 39
-; GCN-NEXT:    v_readlane_b32 s44, v2, 24
-; GCN-NEXT:    v_readlane_b32 s45, v2, 25
-; GCN-NEXT:    v_readlane_b32 s46, v2, 26
-; GCN-NEXT:    v_readlane_b32 s47, v2, 27
-; GCN-NEXT:    v_readlane_b32 s48, v2, 28
-; GCN-NEXT:    v_readlane_b32 s49, v2, 29
-; GCN-NEXT:    v_readlane_b32 s50, v2, 30
-; GCN-NEXT:    v_readlane_b32 s51, v2, 31
-; GCN-NEXT:    v_readlane_b32 s52, v2, 16
-; GCN-NEXT:    v_readlane_b32 s53, v2, 17
-; GCN-NEXT:    v_readlane_b32 s54, v2, 18
-; GCN-NEXT:    v_readlane_b32 s55, v2, 19
-; GCN-NEXT:    v_readlane_b32 s56, v2, 20
-; GCN-NEXT:    v_readlane_b32 s57, v2, 21
-; GCN-NEXT:    v_readlane_b32 s58, v2, 22
-; GCN-NEXT:    v_readlane_b32 s59, v2, 23
-; GCN-NEXT:    v_readlane_b32 s60, v2, 8
-; GCN-NEXT:    v_readlane_b32 s61, v2, 9
-; GCN-NEXT:    v_readlane_b32 s62, v2, 10
-; GCN-NEXT:    v_readlane_b32 s63, v2, 11
-; GCN-NEXT:    v_readlane_b32 s64, v2, 12
-; GCN-NEXT:    v_readlane_b32 s65, v2, 13
-; GCN-NEXT:    v_readlane_b32 s66, v2, 14
-; GCN-NEXT:    v_readlane_b32 s67, v2, 15
-; GCN-NEXT:    v_readlane_b32 s68, v2, 0
-; GCN-NEXT:    v_readlane_b32 s69, v2, 1
-; GCN-NEXT:    v_readlane_b32 s70, v2, 2
-; GCN-NEXT:    v_readlane_b32 s71, v2, 3
-; GCN-NEXT:    v_readlane_b32 s72, v2, 4
-; GCN-NEXT:    v_readlane_b32 s73, v2, 5
-; GCN-NEXT:    v_readlane_b32 s74, v2, 6
-; GCN-NEXT:    v_readlane_b32 s75, v2, 7
-; GCN-NEXT:    v_readlane_b32 s76, v1, 56
-; GCN-NEXT:    v_readlane_b32 s77, v1, 57
-; GCN-NEXT:    v_readlane_b32 s78, v1, 58
-; GCN-NEXT:    v_readlane_b32 s79, v1, 59
-; GCN-NEXT:    v_readlane_b32 s80, v1, 60
-; GCN-NEXT:    v_readlane_b32 s81, v1, 61
-; GCN-NEXT:    v_readlane_b32 s82, v1, 62
-; GCN-NEXT:    v_readlane_b32 s83, v1, 63
-; GCN-NEXT:    v_readlane_b32 s84, v1, 48
-; GCN-NEXT:    v_readlane_b32 s85, v1, 49
-; GCN-NEXT:    v_readlane_b32 s86, v1, 50
-; GCN-NEXT:    v_readlane_b32 s87, v1, 51
-; GCN-NEXT:    v_readlane_b32 s88, v1, 52
-; GCN-NEXT:    v_readlane_b32 s89, v1, 53
-; GCN-NEXT:    v_readlane_b32 s90, v1, 54
-; GCN-NEXT:    v_readlane_b32 s91, v1, 55
-; GCN-NEXT:    v_readlane_b32 s0, v1, 0
-; GCN-NEXT:    v_readlane_b32 s1, v1, 1
-; GCN-NEXT:    v_readlane_b32 s2, v1, 2
-; GCN-NEXT:    v_readlane_b32 s3, v1, 3
-; GCN-NEXT:    v_readlane_b32 s4, v1, 4
-; GCN-NEXT:    v_readlane_b32 s5, v1, 5
-; GCN-NEXT:    v_readlane_b32 s6, v1, 6
-; GCN-NEXT:    v_readlane_b32 s7, v1, 7
+; GCN-NEXT:    v_readlane_b32 s8, v1, 56
+; GCN-NEXT:    v_readlane_b32 s9, v1, 57
+; GCN-NEXT:    v_readlane_b32 s10, v1, 58
+; GCN-NEXT:    v_readlane_b32 s11, v1, 59
+; GCN-NEXT:    v_readlane_b32 s12, v1, 60
+; GCN-NEXT:    v_readlane_b32 s13, v1, 61
+; GCN-NEXT:    v_readlane_b32 s14, v1, 62
+; GCN-NEXT:    v_readlane_b32 s15, v1, 63
+; GCN-NEXT:    v_readlane_b32 s16, v1, 48
+; GCN-NEXT:    v_readlane_b32 s17, v1, 49
+; GCN-NEXT:    v_readlane_b32 s18, v1, 50
+; GCN-NEXT:    v_readlane_b32 s19, v1, 51
+; GCN-NEXT:    v_readlane_b32 s20, v1, 52
+; GCN-NEXT:    v_readlane_b32 s21, v1, 53
+; GCN-NEXT:    v_readlane_b32 s22, v1, 54
+; GCN-NEXT:    v_readlane_b32 s23, v1, 55
+; GCN-NEXT:    v_readlane_b32 s24, v1, 40
+; GCN-NEXT:    v_readlane_b32 s25, v1, 41
+; GCN-NEXT:    v_readlane_b32 s26, v1, 42
+; GCN-NEXT:    v_readlane_b32 s27, v1, 43
+; GCN-NEXT:    v_readlane_b32 s28, v1, 44
+; GCN-NEXT:    v_readlane_b32 s29, v1, 45
+; GCN-NEXT:    v_readlane_b32 s30, v1, 46
+; GCN-NEXT:    v_readlane_b32 s31, v1, 47
+; GCN-NEXT:    v_readlane_b32 s36, v1, 32
+; GCN-NEXT:    v_readlane_b32 s37, v1, 33
+; GCN-NEXT:    v_readlane_b32 s38, v1, 34
+; GCN-NEXT:    v_readlane_b32 s39, v1, 35
+; GCN-NEXT:    v_readlane_b32 s40, v1, 36
+; GCN-NEXT:    v_readlane_b32 s41, v1, 37
+; GCN-NEXT:    v_readlane_b32 s42, v1, 38
+; GCN-NEXT:    v_readlane_b32 s43, v1, 39
+; GCN-NEXT:    v_readlane_b32 s44, v1, 24
+; GCN-NEXT:    v_readlane_b32 s45, v1, 25
+; GCN-NEXT:    v_readlane_b32 s46, v1, 26
+; GCN-NEXT:    v_readlane_b32 s47, v1, 27
+; GCN-NEXT:    v_readlane_b32 s48, v1, 28
+; GCN-NEXT:    v_readlane_b32 s49, v1, 29
+; GCN-NEXT:    v_readlane_b32 s50, v1, 30
+; GCN-NEXT:    v_readlane_b32 s51, v1, 31
+; GCN-NEXT:    v_readlane_b32 s52, v1, 16
+; GCN-NEXT:    v_readlane_b32 s53, v1, 17
+; GCN-NEXT:    v_readlane_b32 s54, v1, 18
+; GCN-NEXT:    v_readlane_b32 s55, v1, 19
+; GCN-NEXT:    v_readlane_b32 s56, v1, 20
+; GCN-NEXT:    v_readlane_b32 s57, v1, 21
+; GCN-NEXT:    v_readlane_b32 s58, v1, 22
+; GCN-NEXT:    v_readlane_b32 s59, v1, 23
+; GCN-NEXT:    v_readlane_b32 s60, v1, 8
+; GCN-NEXT:    v_readlane_b32 s61, v1, 9
+; GCN-NEXT:    v_readlane_b32 s62, v1, 10
+; GCN-NEXT:    v_readlane_b32 s63, v1, 11
+; GCN-NEXT:    v_readlane_b32 s64, v1, 12
+; GCN-NEXT:    v_readlane_b32 s65, v1, 13
+; GCN-NEXT:    v_readlane_b32 s66, v1, 14
+; GCN-NEXT:    v_readlane_b32 s67, v1, 15
+; GCN-NEXT:    v_readlane_b32 s68, v1, 0
+; GCN-NEXT:    v_readlane_b32 s69, v1, 1
+; GCN-NEXT:    v_readlane_b32 s70, v1, 2
+; GCN-NEXT:    v_readlane_b32 s71, v1, 3
+; GCN-NEXT:    v_readlane_b32 s72, v1, 4
+; GCN-NEXT:    v_readlane_b32 s73, v1, 5
+; GCN-NEXT:    v_readlane_b32 s74, v1, 6
+; GCN-NEXT:    v_readlane_b32 s75, v1, 7
+; GCN-NEXT:    v_readlane_b32 s76, v0, 56
+; GCN-NEXT:    v_readlane_b32 s77, v0, 57
+; GCN-NEXT:    v_readlane_b32 s78, v0, 58
+; GCN-NEXT:    v_readlane_b32 s79, v0, 59
+; GCN-NEXT:    v_readlane_b32 s80, v0, 60
+; GCN-NEXT:    v_readlane_b32 s81, v0, 61
+; GCN-NEXT:    v_readlane_b32 s82, v0, 62
+; GCN-NEXT:    v_readlane_b32 s83, v0, 63
+; GCN-NEXT:    v_readlane_b32 s84, v0, 48
+; GCN-NEXT:    v_readlane_b32 s85, v0, 49
+; GCN-NEXT:    v_readlane_b32 s86, v0, 50
+; GCN-NEXT:    v_readlane_b32 s87, v0, 51
+; GCN-NEXT:    v_readlane_b32 s88, v0, 52
+; GCN-NEXT:    v_readlane_b32 s89, v0, 53
+; GCN-NEXT:    v_readlane_b32 s90, v0, 54
+; GCN-NEXT:    v_readlane_b32 s91, v0, 55
+; GCN-NEXT:    v_readlane_b32 s0, v0, 0
+; GCN-NEXT:    v_readlane_b32 s1, v0, 1
+; GCN-NEXT:    v_readlane_b32 s2, v0, 2
+; GCN-NEXT:    v_readlane_b32 s3, v0, 3
+; GCN-NEXT:    v_readlane_b32 s4, v0, 4
+; GCN-NEXT:    v_readlane_b32 s5, v0, 5
+; GCN-NEXT:    v_readlane_b32 s6, v0, 6
+; GCN-NEXT:    v_readlane_b32 s7, v0, 7
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ; use s[0:7]
 ; GCN-NEXT:    ;;#ASMEND
-; GCN-NEXT:    v_readlane_b32 s0, v1, 8
-; GCN-NEXT:    v_readlane_b32 s1, v1, 9
-; GCN-NEXT:    v_readlane_b32 s2, v1, 10
-; GCN-NEXT:    v_readlane_b32 s3, v1, 11
-; GCN-NEXT:    v_readlane_b32 s4, v1, 12
-; GCN-NEXT:    v_readlane_b32 s5, v1, 13
-; GCN-NEXT:    v_readlane_b32 s6, v1, 14
-; GCN-NEXT:    v_readlane_b32 s7, v1, 15
+; GCN-NEXT:    v_readlane_b32 s0, v0, 8
+; GCN-NEXT:    v_readlane_b32 s1, v0, 9
+; GCN-NEXT:    v_readlane_b32 s2, v0, 10
+; GCN-NEXT:    v_readlane_b32 s3, v0, 11
+; GCN-NEXT:    v_readlane_b32 s4, v0, 12
+; GCN-NEXT:    v_readlane_b32 s5, v0, 13
+; GCN-NEXT:    v_readlane_b32 s6, v0, 14
+; GCN-NEXT:    v_readlane_b32 s7, v0, 15
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ; use s[0:7]
 ; GCN-NEXT:    ;;#ASMEND
-; GCN-NEXT:    v_readlane_b32 s0, v1, 16
-; GCN-NEXT:    v_readlane_b32 s1, v1, 17
-; GCN-NEXT:    v_readlane_b32 s2, v1, 18
-; GCN-NEXT:    v_readlane_b32 s3, v1, 19
-; GCN-NEXT:    v_readlane_b32 s4, v1, 20
-; GCN-NEXT:    v_readlane_b32 s5, v1, 21
-; GCN-NEXT:    v_readlane_b32 s6, v1, 22
-; GCN-NEXT:    v_readlane_b32 s7, v1, 23
+; GCN-NEXT:    v_readlane_b32 s0, v0, 16
+; GCN-NEXT:    v_readlane_b32 s1, v0, 17
+; GCN-NEXT:    v_readlane_b32 s2, v0, 18
+; GCN-NEXT:    v_readlane_b32 s3, v0, 19
+; GCN-NEXT:    v_readlane_b32 s4, v0, 20
+; GCN-NEXT:    v_readlane_b32 s5, v0, 21
+; GCN-NEXT:    v_readlane_b32 s6, v0, 22
+; GCN-NEXT:    v_readlane_b32 s7, v0, 23
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ; use s[0:7]
 ; GCN-NEXT:    ;;#ASMEND
-; GCN-NEXT:    v_readlane_b32 s0, v1, 24
-; GCN-NEXT:    v_readlane_b32 s1, v1, 25
-; GCN-NEXT:    v_readlane_b32 s2, v1, 26
-; GCN-NEXT:    v_readlane_b32 s3, v1, 27
-; GCN-NEXT:    v_readlane_b32 s4, v1, 28
-; GCN-NEXT:    v_readlane_b32 s5, v1, 29
-; GCN-NEXT:    v_readlane_b32 s6, v1, 30
-; GCN-NEXT:    v_readlane_b32 s7, v1, 31
+; GCN-NEXT:    v_readlane_b32 s0, v0, 24
+; GCN-NEXT:    v_readlane_b32 s1, v0, 25
+; GCN-NEXT:    v_readlane_b32 s2, v0, 26
+; GCN-NEXT:    v_readlane_b32 s3, v0, 27
+; GCN-NEXT:    v_readlane_b32 s4, v0, 28
+; GCN-NEXT:    v_readlane_b32 s5, v0, 29
+; GCN-NEXT:    v_readlane_b32 s6, v0, 30
+; GCN-NEXT:    v_readlane_b32 s7, v0, 31
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ; use s[0:7]
 ; GCN-NEXT:    ;;#ASMEND
-; GCN-NEXT:    v_readlane_b32 s0, v1, 32
-; GCN-NEXT:    v_readlane_b32 s1, v1, 33
-; GCN-NEXT:    v_readlane_b32 s2, v1, 34
-; GCN-NEXT:    v_readlane_b32 s3, v1, 35
-; GCN-NEXT:    v_readlane_b32 s4, v1, 36
-; GCN-NEXT:    v_readlane_b32 s5, v1, 37
-; GCN-NEXT:    v_readlane_b32 s6, v1, 38
-; GCN-NEXT:    v_readlane_b32 s7, v1, 39
+; GCN-NEXT:    v_readlane_b32 s0, v0, 32
+; GCN-NEXT:    v_readlane_b32 s1, v0, 33
+; GCN-NEXT:    v_readlane_b32 s2, v0, 34
+; GCN-NEXT:    v_readlane_b32 s3, v0, 35
+; GCN-NEXT:    v_readlane_b32 s4, v0, 36
+; GCN-NEXT:    v_readlane_b32 s5, v0, 37
+; GCN-NEXT:    v_readlane_b32 s6, v0, 38
+; GCN-NEXT:    v_readlane_b32 s7, v0, 39
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ; use s[0:7]
 ; GCN-NEXT:    ;;#ASMEND
-; GCN-NEXT:    v_readlane_b32 s0, v1, 40
-; GCN-NEXT:    v_readlane_b32 s1, v1, 41
-; GCN-NEXT:    v_readlane_b32 s2, v1, 42
-; GCN-NEXT:    v_readlane_b32 s3, v1, 43
-; GCN-NEXT:    v_readlane_b32 s4, v1, 44
-; GCN-NEXT:    v_readlane_b32 s5, v1, 45
-; GCN-NEXT:    v_readlane_b32 s6, v1, 46
-; GCN-NEXT:    v_readlane_b32 s7, v1, 47
+; GCN-NEXT:    v_readlane_b32 s0, v0, 40
+; GCN-NEXT:    v_readlane_b32 s1, v0, 41
+; GCN-NEXT:    v_readlane_b32 s2, v0, 42
+; GCN-NEXT:    v_readlane_b32 s3, v0, 43
+; GCN-NEXT:    v_readlane_b32 s4, v0, 44
+; GCN-NEXT:    v_readlane_b32 s5, v0, 45
+; GCN-NEXT:    v_readlane_b32 s6, v0, 46
+; GCN-NEXT:    v_readlane_b32 s7, v0, 47
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ; use s[0:7]
 ; GCN-NEXT:    ;;#ASMEND
-; GCN-NEXT:    v_readlane_b32 s0, v0, 0
-; GCN-NEXT:    v_readlane_b32 s1, v0, 1
-; GCN-NEXT:    v_readlane_b32 s2, v0, 2
-; GCN-NEXT:    v_readlane_b32 s3, v0, 3
-; GCN-NEXT:    v_readlane_b32 s4, v0, 4
-; GCN-NEXT:    v_readlane_b32 s5, v0, 5
-; GCN-NEXT:    v_readlane_b32 s6, v0, 6
-; GCN-NEXT:    v_readlane_b32 s7, v0, 7
+; GCN-NEXT:    v_readlane_b32 s0, v2, 0
+; GCN-NEXT:    v_readlane_b32 s1, v2, 1
+; GCN-NEXT:    v_readlane_b32 s2, v2, 2
+; GCN-NEXT:    v_readlane_b32 s3, v2, 3
+; GCN-NEXT:    v_readlane_b32 s4, v2, 4
+; GCN-NEXT:    v_readlane_b32 s5, v2, 5
+; GCN-NEXT:    v_readlane_b32 s6, v2, 6
+; GCN-NEXT:    v_readlane_b32 s7, v2, 7
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ; use s[84:91]
 ; GCN-NEXT:    ;;#ASMEND
@@ -420,18 +392,6 @@ define amdgpu_kernel void @spill_sgprs_to_multiple_vgprs(ptr addrspace(1) %out,
 ; GCN-NEXT:    ; use s[0:7]
 ; GCN-NEXT:    ;;#ASMEND
 ; GCN-NEXT:  .LBB0_2: ; %ret
-; GCN-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GCN-NEXT:    buffer_load_dword v0, off, s[92:95], 0 offset:4 ; 4-byte Folded Reload
-; GCN-NEXT:    s_mov_b64 exec, s[34:35]
-; GCN-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GCN-NEXT:    buffer_load_dword v1, off, s[92:95], 0 offset:8 ; 4-byte Folded Reload
-; GCN-NEXT:    s_mov_b64 exec, s[34:35]
-; GCN-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GCN-NEXT:    buffer_load_dword v2, off, s[92:95], 0 offset:12 ; 4-byte Folded Reload
-; GCN-NEXT:    s_mov_b64 exec, s[34:35]
-; GCN-NEXT:    ; kill: killed $vgpr2
-; GCN-NEXT:    ; kill: killed $vgpr1
-; GCN-NEXT:    ; kill: killed $vgpr0
 ; GCN-NEXT:    s_endpgm
   %wide.sgpr0 = call <8 x i32> asm sideeffect "; def $0", "=s" () #0
   %wide.sgpr1 = call <8 x i32> asm sideeffect "; def $0", "=s" () #0
@@ -482,125 +442,104 @@ ret:
 define amdgpu_kernel void @split_sgpr_spill_2_vgprs(ptr addrspace(1) %out, i32 %in) #1 {
 ; GCN-LABEL: split_sgpr_spill_2_vgprs:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_mov_b32 s52, SCRATCH_RSRC_DWORD0
-; GCN-NEXT:    s_mov_b32 s53, SCRATCH_RSRC_DWORD1
-; GCN-NEXT:    s_mov_b32 s54, -1
-; GCN-NEXT:    s_mov_b32 s55, 0xe8f000
-; GCN-NEXT:    s_add_u32 s52, s52, s11
-; GCN-NEXT:    s_addc_u32 s53, s53, 0
-; GCN-NEXT:    ; implicit-def: $vgpr0
-; GCN-NEXT:    ; implicit-def: $vgpr1
 ; GCN-NEXT:    s_load_dword s0, s[4:5], 0xb
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ; def s[4:19]
 ; GCN-NEXT:    ;;#ASMEND
-; GCN-NEXT:    v_writelane_b32 v1, s4, 0
-; GCN-NEXT:    v_writelane_b32 v1, s5, 1
-; GCN-NEXT:    v_writelane_b32 v1, s6, 2
-; GCN-NEXT:    v_writelane_b32 v1, s7, 3
-; GCN-NEXT:    v_writelane_b32 v1, s8, 4
-; GCN-NEXT:    v_writelane_b32 v1, s9, 5
-; GCN-NEXT:    v_writelane_b32 v1, s10, 6
-; GCN-NEXT:    v_writelane_b32 v1, s11, 7
-; GCN-NEXT:    v_writelane_b32 v1, s12, 8
-; GCN-NEXT:    v_writelane_b32 v1, s13, 9
-; GCN-NEXT:    v_writelane_b32 v1, s14, 10
-; GCN-NEXT:    v_writelane_b32 v1, s15, 11
-; GCN-NEXT:    v_writelane_b32 v1, s16, 12
-; GCN-NEXT:    v_writelane_b32 v1, s17, 13
-; GCN-NEXT:    v_writelane_b32 v1, s18, 14
-; GCN-NEXT:    v_writelane_b32 v1, s19, 15
+; GCN-NEXT:    v_writelane_b32 v0, s4, 0
+; GCN-NEXT:    v_writelane_b32 v0, s5, 1
+; GCN-NEXT:    v_writelane_b32 v0, s6, 2
+; GCN-NEXT:    v_writelane_b32 v0, s7, 3
+; GCN-NEXT:    v_writelane_b32 v0, s8, 4
+; GCN-NEXT:    v_writelane_b32 v0, s9, 5
+; GCN-NEXT:    v_writelane_b32 v0, s10, 6
+; GCN-NEXT:    v_writelane_b32 v0, s11, 7
+; GCN-NEXT:    v_writelane_b32 v0, s12, 8
+; GCN-NEXT:    v_writelane_b32 v0, s13, 9
+; GCN-NEXT:    v_writelane_b32 v0, s14, 10
+; GCN-NEXT:    v_writelane_b32 v0, s15, 11
+; GCN-NEXT:    v_writelane_b32 v0, s16, 12
+; GCN-NEXT:    v_writelane_b32 v0, s17, 13
+; GCN-NEXT:    v_writelane_b32 v0, s18, 14
+; GCN-NEXT:    v_writelane_b32 v0, s19, 15
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ; def s[4:19]
 ; GCN-NEXT:    ;;#ASMEND
-; GCN-NEXT:    v_writelane_b32 v1, s4, 16
-; GCN-NEXT:    v_writelane_b32 v1, s5, 17
-; GCN-NEXT:    v_writelane_b32 v1, s6, 18
-; GCN-NEXT:    v_writelane_b32 v1, s7, 19
-; GCN-NEXT:    v_writelane_b32 v1, s8, 20
-; GCN-NEXT:    v_writelane_b32 v1, s9, 21
-; GCN-NEXT:    v_writelane_b32 v1, s10, 22
-; GCN-NEXT:    v_writelane_b32 v1, s11, 23
-; GCN-NEXT:    v_writelane_b32 v1, s12, 24
-; GCN-NEXT:    v_writelane_b32 v1, s13, 25
-; GCN-NEXT:    v_writelane_b32 v1, s14, 26
-; GCN-NEXT:    v_writelane_b32 v1, s15, 27
-; GCN-NEXT:    v_writelane_b32 v1, s16, 28
-; GCN-NEXT:    v_writelane_b32 v1, s17, 29
-; GCN-NEXT:    v_writelane_b32 v1, s18, 30
-; GCN-NEXT:    v_writelane_b32 v1, s19, 31
+; GCN-NEXT:    v_writelane_b32 v0, s4, 16
+; GCN-NEXT:    v_writelane_b32 v0, s5, 17
+; GCN-NEXT:    v_writelane_b32 v0, s6, 18
+; GCN-NEXT:    v_writelane_b32 v0, s7, 19
+; GCN-NEXT:    v_writelane_b32 v0, s8, 20
+; GCN-NEXT:    v_writelane_b32 v0, s9, 21
+; GCN-NEXT:    v_writelane_b32 v0, s10, 22
+; GCN-NEXT:    v_writelane_b32 v0, s11, 23
+; GCN-NEXT:    v_writelane_b32 v0, s12, 24
+; GCN-NEXT:    v_writelane_b32 v0, s13, 25
+; GCN-NEXT:    v_writelane_b32 v0, s14, 26
+; GCN-NEXT:    v_writelane_b32 v0, s15, 27
+; GCN-NEXT:    v_writelane_b32 v0, s16, 28
+; GCN-NEXT:    v_writelane_b32 v0, s17, 29
+; GCN-NEXT:    v_writelane_b32 v0, s18, 30
+; GCN-NEXT:    v_writelane_b32 v0, s19, 31
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ; def s[4:19]
 ; GCN-NEXT:    ;;#ASMEND
-; GCN-NEXT:    v_writelane_b32 v1, s4, 32
-; GCN-NEXT:    v_writelane_b32 v1, s5, 33
-; GCN-NEXT:    v_writelane_b32 v1, s6, 34
-; GCN-NEXT:    v_writelane_b32 v1, s7, 35
-; GCN-NEXT:    v_writelane_b32 v1, s8, 36
-; GCN-NEXT:    v_writelane_b32 v1, s9, 37
-; GCN-NEXT:    v_writelane_b32 v1, s10, 38
-; GCN-NEXT:    v_writelane_b32 v1, s11, 39
-; GCN-NEXT:    v_writelane_b32 v1, s12, 40
-; GCN-NEXT:    v_writelane_b32 v1, s13, 41
-; GCN-NEXT:    v_writelane_b32 v1, s14, 42
-; GCN-NEXT:    v_writelane_b32 v1, s15, 43
-; GCN-NEXT:    v_writelane_b32 v1, s16, 44
-; GCN-NEXT:    v_writelane_b32 v1, s17, 45
-; GCN-NEXT:    v_writelane_b32 v1, s18, 46
-; GCN-NEXT:    v_writelane_b32 v1, s19, 47
+; GCN-NEXT:    v_writelane_b32 v0, s4, 32
+; GCN-NEXT:    v_writelane_b32 v0, s5, 33
+; GCN-NEXT:    v_writelane_b32 v0, s6, 34
+; GCN-NEXT:    v_writelane_b32 v0, s7, 35
+; GCN-NEXT:    v_writelane_b32 v0, s8, 36
+; GCN-NEXT:    v_writelane_b32 v0, s9, 37
+; GCN-NEXT:    v_writelane_b32 v0, s10, 38
+; GCN-NEXT:    v_writelane_b32 v0, s11, 39
+; GCN-NEXT:    v_writelane_b32 v0, s12, 40
+; GCN-NEXT:    v_writelane_b32 v0, s13, 41
+; GCN-NEXT:    v_writelane_b32 v0, s14, 42
+; GCN-NEXT:    v_writelane_b32 v0, s15, 43
+; GCN-NEXT:    v_writelane_b32 v0, s16, 44
+; GCN-NEXT:    v_writelane_b32 v0, s17, 45
+; GCN-NEXT:    v_writelane_b32 v0, s18, 46
+; GCN-NEXT:    v_writelane_b32 v0, s19, 47
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ; def s[4:19]
 ; GCN-NEXT:    ;;#ASMEND
-; GCN-NEXT:    v_writelane_b32 v1, s4, 48
-; GCN-NEXT:    v_writelane_b32 v1, s5, 49
-; GCN-NEXT:    v_writelane_b32 v1, s6, 50
-; GCN-NEXT:    v_writelane_b32 v1, s7, 51
-; GCN-NEXT:    v_writelane_b32 v1, s8, 52
-; GCN-NEXT:    v_writelane_b32 v1, s9, 53
-; GCN-NEXT:    v_writelane_b32 v1, s10, 54
-; GCN-NEXT:    v_writelane_b32 v1, s11, 55
-; GCN-NEXT:    v_writelane_b32 v1, s12, 56
-; GCN-NEXT:    v_writelane_b32 v1, s13, 57
-; GCN-NEXT:    v_writelane_b32 v1, s14, 58
-; GCN-NEXT:    v_writelane_b32 v1, s15, 59
-; GCN-NEXT:    v_writelane_b32 v1, s16, 60
-; GCN-NEXT:    v_writelane_b32 v1, s17, 61
-; GCN-NEXT:    v_writelane_b32 v1, s18, 62
-; GCN-NEXT:    v_writelane_b32 v1, s19, 63
-; GCN-NEXT:    s_or_saveexec_b64 s[28:29], -1
-; GCN-NEXT:    buffer_store_dword v1, off, s[52:55], 0 offset:8 ; 4-byte Folded Spill
-; GCN-NEXT:    s_mov_b64 exec, s[28:29]
+; GCN-NEXT:    v_writelane_b32 v0, s4, 48
+; GCN-NEXT:    v_writelane_b32 v0, s5, 49
+; GCN-NEXT:    v_writelane_b32 v0, s6, 50
+; GCN-NEXT:    v_writelane_b32 v0, s7, 51
+; GCN-NEXT:    v_writelane_b32 v0, s8, 52
+; GCN-NEXT:    v_writelane_b32 v0, s9, 53
+; GCN-NEXT:    v_writelane_b32 v0, s10, 54
+; GCN-NEXT:    v_writelane_b32 v0, s11, 55
+; GCN-NEXT:    v_writelane_b32 v0, s12, 56
+; GCN-NEXT:    v_writelane_b32 v0, s13, 57
+; GCN-NEXT:    v_writelane_b32 v0, s14, 58
+; GCN-NEXT:    v_writelane_b32 v0, s15, 59
+; GCN-NEXT:    v_writelane_b32 v0, s16, 60
+; GCN-NEXT:    v_writelane_b32 v0, s17, 61
+; GCN-NEXT:    v_writelane_b32 v0, s18, 62
+; GCN-NEXT:    v_writelane_b32 v0, s19, 63
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ; def s[4:11]
 ; GCN-NEXT:    ;;#ASMEND
-; GCN-NEXT:    v_writelane_b32 v0, s4, 0
-; GCN-NEXT:    v_writelane_b32 v0, s5, 1
-; GCN-NEXT:    v_writelane_b32 v0, s6, 2
-; GCN-NEXT:    v_writelane_b32 v0, s7, 3
-; GCN-NEXT:    v_writelane_b32 v0, s8, 4
-; GCN-NEXT:    v_writelane_b32 v0, s9, 5
-; GCN-NEXT:    v_writelane_b32 v0, s10, 6
-; GCN-NEXT:    v_writelane_b32 v0, s11, 7
+; GCN-NEXT:    v_writelane_b32 v1, s4, 0
+; GCN-NEXT:    v_writelane_b32 v1, s5, 1
+; GCN-NEXT:    v_writelane_b32 v1, s6, 2
+; GCN-NEXT:    v_writelane_b32 v1, s7, 3
+; GCN-NEXT:    v_writelane_b32 v1, s8, 4
+; GCN-NEXT:    v_writelane_b32 v1, s9, 5
+; GCN-NEXT:    v_writelane_b32 v1, s10, 6
+; GCN-NEXT:    v_writelane_b32 v1, s11, 7
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ; def s[2:3]
 ; GCN-NEXT:    ;;#ASMEND
-; GCN-NEXT:    v_writelane_b32 v0, s2, 8
-; GCN-NEXT:    v_writelane_b32 v0, s3, 9
-; GCN-NEXT:    s_or_saveexec_b64 s[28:29], -1
-; GCN-NEXT:    buffer_store_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Spill
-; GCN-NEXT:    s_mov_b64 exec, s[28:29]
+; GCN-NEXT:    v_writelane_b32 v1, s2, 8
+; GCN-NEXT:    v_writelane_b32 v1, s3, 9
 ; GCN-NEXT:    s_mov_b32 s1, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_cmp_lg_u32 s0, s1
 ; GCN-NEXT:    s_cbranch_scc1 .LBB1_2
 ; GCN-NEXT:  ; %bb.1: ; %bb0
-; GCN-NEXT:    s_or_saveexec_b64 s[28:29], -1
-; GCN-NEXT:    buffer_load_dword v0, off, s[52:55], 0 offset:8 ; 4-byte Folded Reload
-; GCN-NEXT:    s_mov_b64 exec, s[28:29]
-; GCN-NEXT:    s_or_saveexec_b64 s[28:29], -1
-; GCN-NEXT:    buffer_load_dword v1, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload
-; GCN-NEXT:    s_mov_b64 exec, s[28:29]
-; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_readlane_b32 s16, v1, 8
 ; GCN-NEXT:    v_readlane_b32 s17, v1, 9
 ; GCN-NEXT:    v_readlane_b32 s20, v1, 0
@@ -694,14 +633,6 @@ define amdgpu_kernel void @split_sgpr_spill_2_vgprs(ptr addrspace(1) %out, i32 %
 ; GCN-NEXT:    ; use s[0:15]
 ; GCN-NEXT:    ;;#ASMEND
 ; GCN-NEXT:  .LBB1_2: ; %ret
-; GCN-NEXT:    s_or_saveexec_b64 s[28:29], -1
-; GCN-NEXT:    buffer_load_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload
-; GCN-NEXT:    s_mov_b64 exec, s[28:29]
-; GCN-NEXT:    s_or_saveexec_b64 s[28:29], -1
-; GCN-NEXT:    buffer_load_dword v1, off, s[52:55], 0 offset:8 ; 4-byte Folded Reload
-; GCN-NEXT:    s_mov_b64 exec, s[28:29]
-; GCN-NEXT:    ; kill: killed $vgpr1
-; GCN-NEXT:    ; kill: killed $vgpr0
 ; GCN-NEXT:    s_endpgm
   %wide.sgpr0 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0
   %wide.sgpr1 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0
@@ -738,17 +669,9 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill(ptr addrspace(1) %out, i32 %
 ; GCN-NEXT:    s_mov_b32 s55, 0xe8f000
 ; GCN-NEXT:    s_add_u32 s52, s52, s11
 ; GCN-NEXT:    s_addc_u32 s53, s53, 0
-; GCN-NEXT:    ; implicit-def: $vgpr0
-; GCN-NEXT:    ; implicit-def: $vgpr0
 ; GCN-NEXT:    s_load_dword s0, s[4:5], 0xb
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ;;#ASMEND
-; GCN-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GCN-NEXT:    buffer_load_dword v1, off, s[52:55], 0 offset:8 ; 4-byte Folded Reload
-; GCN-NEXT:    s_mov_b64 exec, s[34:35]
-; GCN-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GCN-NEXT:    buffer_load_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload
-; GCN-NEXT:    s_mov_b64 exec, s[34:35]
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ;;#ASMEND
 ; GCN-NEXT:    ;;#ASMSTART
@@ -762,176 +685,176 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill(ptr addrspace(1) %out, i32 %
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ; def s[4:19]
 ; GCN-NEXT:    ;;#ASMEND
-; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_writelane_b32 v1, s4, 0
-; GCN-NEXT:    v_writelane_b32 v1, s5, 1
-; GCN-NEXT:    v_writelane_b32 v1, s6, 2
-; GCN-NEXT:    v_writelane_b32 v1, s7, 3
-; GCN-NEXT:    v_writelane_b32 v1, s8, 4
-; GCN-NEXT:    v_writelane_b32 v1, s9, 5
-; GCN-NEXT:    v_writelane_b32 v1, s10, 6
-; GCN-NEXT:    v_writelane_b32 v1, s11, 7
-; GCN-NEXT:    v_writelane_b32 v1, s12, 8
-; GCN-NEXT:    v_writelane_b32 v1, s13, 9
-; GCN-NEXT:    v_writelane_b32 v1, s14, 10
-; GCN-NEXT:    v_writelane_b32 v1, s15, 11
-; GCN-NEXT:    v_writelane_b32 v1, s16, 12
-; GCN-NEXT:    v_writelane_b32 v1, s17, 13
-; GCN-NEXT:    v_writelane_b32 v1, s18, 14
-; GCN-NEXT:    v_writelane_b32 v1, s19, 15
+; GCN-NEXT:    v_writelane_b32 v31, s4, 0
+; GCN-NEXT:    v_writelane_b32 v31, s5, 1
+; GCN-NEXT:    v_writelane_b32 v31, s6, 2
+; GCN-NEXT:    v_writelane_b32 v31, s7, 3
+; GCN-NEXT:    v_writelane_b32 v31, s8, 4
+; GCN-NEXT:    v_writelane_b32 v31, s9, 5
+; GCN-NEXT:    v_writelane_b32 v31, s10, 6
+; GCN-NEXT:    v_writelane_b32 v31, s11, 7
+; GCN-NEXT:    v_writelane_b32 v31, s12, 8
+; GCN-NEXT:    v_writelane_b32 v31, s13, 9
+; GCN-NEXT:    v_writelane_b32 v31, s14, 10
+; GCN-NEXT:    v_writelane_b32 v31, s15, 11
+; GCN-NEXT:    v_writelane_b32 v31, s16, 12
+; GCN-NEXT:    v_writelane_b32 v31, s17, 13
+; GCN-NEXT:    v_writelane_b32 v31, s18, 14
+; GCN-NEXT:    v_writelane_b32 v31, s19, 15
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ; def s[4:19]
 ; GCN-NEXT:    ;;#ASMEND
-; GCN-NEXT:    v_writelane_b32 v1, s4, 16
-; GCN-NEXT:    v_writelane_b32 v1, s5, 17
-; GCN-NEXT:    v_writelane_b32 v1, s6, 18
-; GCN-NEXT:    v_writelane_b32 v1, s7, 19
-; GCN-NEXT:    v_writelane_b32 v1, s8, 20
-; GCN-NEXT:    v_writelane_b32 v1, s9, 21
-; GCN-NEXT:    v_writelane_b32 v1, s10, 22
-; GCN-NEXT:    v_writelane_b32 v1, s11, 23
-; GCN-NEXT:    v_writelane_b32 v1, s12, 24
-; GCN-NEXT:    v_writelane_b32 v1, s13, 25
-; GCN-NEXT:    v_writelane_b32 v1, s14, 26
-; GCN-NEXT:    v_writelane_b32 v1, s15, 27
-; GCN-NEXT:    v_writelane_b32 v1, s16, 28
-; GCN-NEXT:    v_writelane_b32 v1, s17, 29
-; GCN-NEXT:    v_writelane_b32 v1, s18, 30
-; GCN-NEXT:    v_writelane_b32 v1, s19, 31
+; GCN-NEXT:    v_writelane_b32 v31, s4, 16
+; GCN-NEXT:    v_writelane_b32 v31, s5, 17
+; GCN-NEXT:    v_writelane_b32 v31, s6, 18
+; GCN-NEXT:    v_writelane_b32 v31, s7, 19
+; GCN-NEXT:    v_writelane_b32 v31, s8, 20
+; GCN-NEXT:    v_writelane_b32 v31, s9, 21
+; GCN-NEXT:    v_writelane_b32 v31, s10, 22
+; GCN-NEXT:    v_writelane_b32 v31, s11, 23
+; GCN-NEXT:    v_writelane_b32 v31, s12, 24
+; GCN-NEXT:    v_writelane_b32 v31, s13, 25
+; GCN-NEXT:    v_writelane_b32 v31, s14, 26
+; GCN-NEXT:    v_writelane_b32 v31, s15, 27
+; GCN-NEXT:    v_writelane_b32 v31, s16, 28
+; GCN-NEXT:    v_writelane_b32 v31, s17, 29
+; GCN-NEXT:    v_writelane_b32 v31, s18, 30
+; GCN-NEXT:    v_writelane_b32 v31, s19, 31
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ; def s[4:19]
 ; GCN-NEXT:    ;;#ASMEND
-; GCN-NEXT:    v_writelane_b32 v1, s4, 32
-; GCN-NEXT:    v_writelane_b32 v1, s5, 33
-; GCN-NEXT:    v_writelane_b32 v1, s6, 34
-; GCN-NEXT:    v_writelane_b32 v1, s7, 35
-; GCN-NEXT:    v_writelane_b32 v1, s8, 36
-; GCN-NEXT:    v_writelane_b32 v1, s9, 37
-; GCN-NEXT:    v_writelane_b32 v1, s10, 38
-; GCN-NEXT:    v_writelane_b32 v1, s11, 39
-; GCN-NEXT:    v_writelane_b32 v1, s12, 40
-; GCN-NEXT:    v_writelane_b32 v1, s13, 41
-; GCN-NEXT:    v_writelane_b32 v1, s14, 42
-; GCN-NEXT:    v_writelane_b32 v1, s15, 43
-; GCN-NEXT:    v_writelane_b32 v1, s16, 44
-; GCN-NEXT:    v_writelane_b32 v1, s17, 45
-; GCN-NEXT:    v_writelane_b32 v1, s18, 46
-; GCN-NEXT:    v_writelane_b32 v1, s19, 47
+; GCN-NEXT:    v_writelane_b32 v31, s4, 32
+; GCN-NEXT:    v_writelane_b32 v31, s5, 33
+; GCN-NEXT:    v_writelane_b32 v31, s6, 34
+; GCN-NEXT:    v_writelane_b32 v31, s7, 35
+; GCN-NEXT:    v_writelane_b32 v31, s8, 36
+; GCN-NEXT:    v_writelane_b32 v31, s9, 37
+; GCN-NEXT:    v_writelane_b32 v31, s10, 38
+; GCN-NEXT:    v_writelane_b32 v31, s11, 39
+; GCN-NEXT:    v_writelane_b32 v31, s12, 40
+; GCN-NEXT:    v_writelane_b32 v31, s13, 41
+; GCN-NEXT:    v_writelane_b32 v31, s14, 42
+; GCN-NEXT:    v_writelane_b32 v31, s15, 43
+; GCN-NEXT:    v_writelane_b32 v31, s16, 44
+; GCN-NEXT:    v_writelane_b32 v31, s17, 45
+; GCN-NEXT:    v_writelane_b32 v31, s18, 46
+; GCN-NEXT:    v_writelane_b32 v31, s19, 47
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ; def s[4:19]
 ; GCN-NEXT:    ;;#ASMEND
-; GCN-NEXT:    v_writelane_b32 v1, s4, 48
-; GCN-NEXT:    v_writelane_b32 v1, s5, 49
-; GCN-NEXT:    v_writelane_b32 v1, s6, 50
-; GCN-NEXT:    v_writelane_b32 v1, s7, 51
-; GCN-NEXT:    v_writelane_b32 v1, s8, 52
-; GCN-NEXT:    v_writelane_b32 v1, s9, 53
-; GCN-NEXT:    v_writelane_b32 v1, s10, 54
-; GCN-NEXT:    v_writelane_b32 v1, s11, 55
-; GCN-NEXT:    v_writelane_b32 v1, s12, 56
-; GCN-NEXT:    v_writelane_b32 v1, s13, 57
-; GCN-NEXT:    v_writelane_b32 v1, s14, 58
-; GCN-NEXT:    v_writelane_b32 v1, s15, 59
-; GCN-NEXT:    v_writelane_b32 v1, s16, 60
-; GCN-NEXT:    v_writelane_b32 v1, s17, 61
-; GCN-NEXT:    v_writelane_b32 v1, s18, 62
-; GCN-NEXT:    v_writelane_b32 v1, s19, 63
-; GCN-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GCN-NEXT:    buffer_store_dword v1, off, s[52:55], 0 offset:8 ; 4-byte Folded Spill
-; GCN-NEXT:    s_mov_b64 exec, s[34:35]
+; GCN-NEXT:    v_writelane_b32 v31, s4, 48
+; GCN-NEXT:    v_writelane_b32 v31, s5, 49
+; GCN-NEXT:    v_writelane_b32 v31, s6, 50
+; GCN-NEXT:    v_writelane_b32 v31, s7, 51
+; GCN-NEXT:    v_writelane_b32 v31, s8, 52
+; GCN-NEXT:    v_writelane_b32 v31, s9, 53
+; GCN-NEXT:    v_writelane_b32 v31, s10, 54
+; GCN-NEXT:    v_writelane_b32 v31, s11, 55
+; GCN-NEXT:    v_writelane_b32 v31, s12, 56
+; GCN-NEXT:    v_writelane_b32 v31, s13, 57
+; GCN-NEXT:    v_writelane_b32 v31, s14, 58
+; GCN-NEXT:    v_writelane_b32 v31, s15, 59
+; GCN-NEXT:    v_writelane_b32 v31, s16, 60
+; GCN-NEXT:    v_writelane_b32 v31, s17, 61
+; GCN-NEXT:    v_writelane_b32 v31, s18, 62
+; GCN-NEXT:    v_writelane_b32 v31, s19, 63
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ; def s[2:3]
 ; GCN-NEXT:    ;;#ASMEND
-; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    s_mov_b64 s[4:5], exec
+; GCN-NEXT:    s_mov_b64 exec, 3
+; GCN-NEXT:    buffer_store_dword v0, off, s[52:55], 0
 ; GCN-NEXT:    v_writelane_b32 v0, s2, 0
 ; GCN-NEXT:    v_writelane_b32 v0, s3, 1
-; GCN-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GCN-NEXT:    buffer_store_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Spill
-; GCN-NEXT:    s_mov_b64 exec, s[34:35]
+; GCN-NEXT:    buffer_load_dword v0, off, s[52:55], 0
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    s_mov_b32 s1, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_cmp_lg_u32 s0, s1
 ; GCN-NEXT:    s_cbranch_scc1 .LBB2_2
 ; GCN-NEXT:  ; %bb.1: ; %bb0
-; GCN-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GCN-NEXT:    buffer_load_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload
-; GCN-NEXT:    s_mov_b64 exec, s[34:35]
-; GCN-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GCN-NEXT:    buffer_load_dword v1, off, s[52:55], 0 offset:8 ; 4-byte Folded Reload
-; GCN-NEXT:    s_mov_b64 exec, s[34:35]
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_readlane_b32 s36, v1, 32
-; GCN-NEXT:    v_readlane_b32 s37, v1, 33
-; GCN-NEXT:    v_readlane_b32 s38, v1, 34
-; GCN-NEXT:    v_readlane_b32 s39, v1, 35
-; GCN-NEXT:    v_readlane_b32 s40, v1, 36
-; GCN-NEXT:    v_readlane_b32 s41, v1, 37
-; GCN-NEXT:    v_readlane_b32 s42, v1, 38
-; GCN-NEXT:    v_readlane_b32 s43, v1, 39
-; GCN-NEXT:    v_readlane_b32 s44, v1, 40
-; GCN-NEXT:    v_readlane_b32 s45, v1, 41
-; GCN-NEXT:    v_readlane_b32 s46, v1, 42
-; GCN-NEXT:    v_readlane_b32 s47, v1, 43
-; GCN-NEXT:    v_readlane_b32 s48, v1, 44
-; GCN-NEXT:    v_readlane_b32 s49, v1, 45
-; GCN-NEXT:    v_readlane_b32 s50, v1, 46
-; GCN-NEXT:    v_readlane_b32 s51, v1, 47
-; GCN-NEXT:    v_readlane_b32 s0, v1, 16
-; GCN-NEXT:    v_readlane_b32 s1, v1, 17
-; GCN-NEXT:    v_readlane_b32 s2, v1, 18
-; GCN-NEXT:    v_readlane_b32 s3, v1, 19
-; GCN-NEXT:    v_readlane_b32 s4, v1, 20
-; GCN-NEXT:    v_readlane_b32 s5, v1, 21
-; GCN-NEXT:    v_readlane_b32 s6, v1, 22
-; GCN-NEXT:    v_readlane_b32 s7, v1, 23
-; GCN-NEXT:    v_readlane_b32 s8, v1, 24
-; GCN-NEXT:    v_readlane_b32 s9, v1, 25
-; GCN-NEXT:    v_readlane_b32 s10, v1, 26
-; GCN-NEXT:    v_readlane_b32 s11, v1, 27
-; GCN-NEXT:    v_readlane_b32 s12, v1, 28
-; GCN-NEXT:    v_readlane_b32 s13, v1, 29
-; GCN-NEXT:    v_readlane_b32 s14, v1, 30
-; GCN-NEXT:    v_readlane_b32 s15, v1, 31
-; GCN-NEXT:    v_readlane_b32 s16, v1, 0
-; GCN-NEXT:    v_readlane_b32 s17, v1, 1
-; GCN-NEXT:    v_readlane_b32 s18, v1, 2
-; GCN-NEXT:    v_readlane_b32 s19, v1, 3
-; GCN-NEXT:    v_readlane_b32 s20, v1, 4
-; GCN-NEXT:    v_readlane_b32 s21, v1, 5
-; GCN-NEXT:    v_readlane_b32 s22, v1, 6
-; GCN-NEXT:    v_readlane_b32 s23, v1, 7
-; GCN-NEXT:    v_readlane_b32 s24, v1, 8
-; GCN-NEXT:    v_readlane_b32 s25, v1, 9
-; GCN-NEXT:    v_readlane_b32 s26, v1, 10
-; GCN-NEXT:    v_readlane_b32 s27, v1, 11
-; GCN-NEXT:    v_readlane_b32 s28, v1, 12
-; GCN-NEXT:    v_readlane_b32 s29, v1, 13
-; GCN-NEXT:    v_readlane_b32 s30, v1, 14
-; GCN-NEXT:    v_readlane_b32 s31, v1, 15
+; GCN-NEXT:    v_readlane_b32 s36, v31, 32
+; GCN-NEXT:    v_readlane_b32 s37, v31, 33
+; GCN-NEXT:    v_readlane_b32 s38, v31, 34
+; GCN-NEXT:    v_readlane_b32 s39, v31, 35
+; GCN-NEXT:    v_readlane_b32 s40, v31, 36
+; GCN-NEXT:    v_readlane_b32 s41, v31, 37
+; GCN-NEXT:    v_readlane_b32 s42, v31, 38
+; GCN-NEXT:    v_readlane_b32 s43, v31, 39
+; GCN-NEXT:    v_readlane_b32 s44, v31, 40
+; GCN-NEXT:    v_readlane_b32 s45, v31, 41
+; GCN-NEXT:    v_readlane_b32 s46, v31, 42
+; GCN-NEXT:    v_readlane_b32 s47, v31, 43
+; GCN-NEXT:    v_readlane_b32 s48, v31, 44
+; GCN-NEXT:    v_readlane_b32 s49, v31, 45
+; GCN-NEXT:    v_readlane_b32 s50, v31, 46
+; GCN-NEXT:    v_readlane_b32 s51, v31, 47
+; GCN-NEXT:    v_readlane_b32 s0, v31, 16
+; GCN-NEXT:    v_readlane_b32 s1, v31, 17
+; GCN-NEXT:    v_readlane_b32 s2, v31, 18
+; GCN-NEXT:    v_readlane_b32 s3, v31, 19
+; GCN-NEXT:    v_readlane_b32 s4, v31, 20
+; GCN-NEXT:    v_readlane_b32 s5, v31, 21
+; GCN-NEXT:    v_readlane_b32 s6, v31, 22
+; GCN-NEXT:    v_readlane_b32 s7, v31, 23
+; GCN-NEXT:    v_readlane_b32 s8, v31, 24
+; GCN-NEXT:    v_readlane_b32 s9, v31, 25
+; GCN-NEXT:    v_readlane_b32 s10, v31, 26
+; GCN-NEXT:    v_readlane_b32 s11, v31, 27
+; GCN-NEXT:    v_readlane_b32 s12, v31, 28
+; GCN-NEXT:    v_readlane_b32 s13, v31, 29
+; GCN-NEXT:    v_readlane_b32 s14, v31, 30
+; GCN-NEXT:    v_readlane_b32 s15, v31, 31
+; GCN-NEXT:    v_readlane_b32 s16, v31, 0
+; GCN-NEXT:    v_readlane_b32 s17, v31, 1
+; GCN-NEXT:    v_readlane_b32 s18, v31, 2
+; GCN-NEXT:    v_readlane_b32 s19, v31, 3
+; GCN-NEXT:    v_readlane_b32 s20, v31, 4
+; GCN-NEXT:    v_readlane_b32 s21, v31, 5
+; GCN-NEXT:    v_readlane_b32 s22, v31, 6
+; GCN-NEXT:    v_readlane_b32 s23, v31, 7
+; GCN-NEXT:    v_readlane_b32 s24, v31, 8
+; GCN-NEXT:    v_readlane_b32 s25, v31, 9
+; GCN-NEXT:    v_readlane_b32 s26, v31, 10
+; GCN-NEXT:    v_readlane_b32 s27, v31, 11
+; GCN-NEXT:    v_readlane_b32 s28, v31, 12
+; GCN-NEXT:    v_readlane_b32 s29, v31, 13
+; GCN-NEXT:    v_readlane_b32 s30, v31, 14
+; GCN-NEXT:    v_readlane_b32 s31, v31, 15
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ; use s[16:31]
 ; GCN-NEXT:    ;;#ASMEND
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ; use s[0:15]
 ; GCN-NEXT:    ;;#ASMEND
-; GCN-NEXT:    v_readlane_b32 s4, v1, 48
-; GCN-NEXT:    v_readlane_b32 s5, v1, 49
-; GCN-NEXT:    v_readlane_b32 s6, v1, 50
-; GCN-NEXT:    v_readlane_b32 s7, v1, 51
-; GCN-NEXT:    v_readlane_b32 s8, v1, 52
-; GCN-NEXT:    v_readlane_b32 s9, v1, 53
-; GCN-NEXT:    v_readlane_b32 s10, v1, 54
-; GCN-NEXT:    v_readlane_b32 s11, v1, 55
-; GCN-NEXT:    v_readlane_b32 s12, v1, 56
-; GCN-NEXT:    v_readlane_b32 s13, v1, 57
-; GCN-NEXT:    v_readlane_b32 s14, v1, 58
-; GCN-NEXT:    v_readlane_b32 s15, v1, 59
-; GCN-NEXT:    v_readlane_b32 s16, v1, 60
-; GCN-NEXT:    v_readlane_b32 s17, v1, 61
-; GCN-NEXT:    v_readlane_b32 s18, v1, 62
-; GCN-NEXT:    v_readlane_b32 s19, v1, 63
+; GCN-NEXT:    v_readlane_b32 s4, v31, 48
+; GCN-NEXT:    v_readlane_b32 s5, v31, 49
+; GCN-NEXT:    v_readlane_b32 s6, v31, 50
+; GCN-NEXT:    v_readlane_b32 s7, v31, 51
+; GCN-NEXT:    v_readlane_b32 s8, v31, 52
+; GCN-NEXT:    v_readlane_b32 s9, v31, 53
+; GCN-NEXT:    v_readlane_b32 s10, v31, 54
+; GCN-NEXT:    v_readlane_b32 s11, v31, 55
+; GCN-NEXT:    v_readlane_b32 s12, v31, 56
+; GCN-NEXT:    v_readlane_b32 s13, v31, 57
+; GCN-NEXT:    v_readlane_b32 s14, v31, 58
+; GCN-NEXT:    v_readlane_b32 s15, v31, 59
+; GCN-NEXT:    v_readlane_b32 s16, v31, 60
+; GCN-NEXT:    v_readlane_b32 s17, v31, 61
+; GCN-NEXT:    v_readlane_b32 s18, v31, 62
+; GCN-NEXT:    v_readlane_b32 s19, v31, 63
+; GCN-NEXT:    s_mov_b64 s[2:3], exec
+; GCN-NEXT:    s_mov_b64 exec, 3
+; GCN-NEXT:    buffer_store_dword v0, off, s[52:55], 0
+; GCN-NEXT:    buffer_load_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_readlane_b32 s0, v0, 0
 ; GCN-NEXT:    v_readlane_b32 s1, v0, 1
+; GCN-NEXT:    buffer_load_dword v0, off, s[52:55], 0
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_mov_b64 exec, s[2:3]
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ; use s[36:51]
 ; GCN-NEXT:    ;;#ASMEND
@@ -942,14 +865,6 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill(ptr addrspace(1) %out, i32 %
 ; GCN-NEXT:    ; use s[0:1]
 ; GCN-NEXT:    ;;#ASMEND
 ; GCN-NEXT:  .LBB2_2: ; %ret
-; GCN-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GCN-NEXT:    buffer_load_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload
-; GCN-NEXT:    s_mov_b64 exec, s[34:35]
-; GCN-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GCN-NEXT:    buffer_load_dword v1, off, s[52:55], 0 offset:8 ; 4-byte Folded Reload
-; GCN-NEXT:    s_mov_b64 exec, s[34:35]
-; GCN-NEXT:    ; kill: killed $vgpr1
-; GCN-NEXT:    ; kill: killed $vgpr0
 ; GCN-NEXT:    s_endpgm
   call void asm sideeffect "", "~{v[0:7]}" () #0
   call void asm sideeffect "", "~{v[8:15]}" () #0
@@ -989,17 +904,9 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill_live_v0(i32 %in) #1 {
 ; GCN-NEXT:    s_mov_b32 s55, 0xe8f000
 ; GCN-NEXT:    s_add_u32 s52, s52, s11
 ; GCN-NEXT:    s_addc_u32 s53, s53, 0
-; GCN-NEXT:    ; implicit-def: $vgpr0
-; GCN-NEXT:    ; implicit-def: $vgpr0
 ; GCN-NEXT:    s_load_dword s0, s[4:5], 0x9
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ;;#ASMEND
-; GCN-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GCN-NEXT:    buffer_load_dword v1, off, s[52:55], 0 offset:8 ; 4-byte Folded Reload
-; GCN-NEXT:    s_mov_b64 exec, s[34:35]
-; GCN-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GCN-NEXT:    buffer_load_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload
-; GCN-NEXT:    s_mov_b64 exec, s[34:35]
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ;;#ASMEND
 ; GCN-NEXT:    ;;#ASMSTART
@@ -1013,152 +920,144 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill_live_v0(i32 %in) #1 {
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ; def s[4:19]
 ; GCN-NEXT:    ;;#ASMEND
-; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_writelane_b32 v1, s4, 0
-; GCN-NEXT:    v_writelane_b32 v1, s5, 1
-; GCN-NEXT:    v_writelane_b32 v1, s6, 2
-; GCN-NEXT:    v_writelane_b32 v1, s7, 3
-; GCN-NEXT:    v_writelane_b32 v1, s8, 4
-; GCN-NEXT:    v_writelane_b32 v1, s9, 5
-; GCN-NEXT:    v_writelane_b32 v1, s10, 6
-; GCN-NEXT:    v_writelane_b32 v1, s11, 7
-; GCN-NEXT:    v_writelane_b32 v1, s12, 8
-; GCN-NEXT:    v_writelane_b32 v1, s13, 9
-; GCN-NEXT:    v_writelane_b32 v1, s14, 10
-; GCN-NEXT:    v_writelane_b32 v1, s15, 11
-; GCN-NEXT:    v_writelane_b32 v1, s16, 12
-; GCN-NEXT:    v_writelane_b32 v1, s17, 13
-; GCN-NEXT:    v_writelane_b32 v1, s18, 14
-; GCN-NEXT:    v_writelane_b32 v1, s19, 15
+; GCN-NEXT:    v_writelane_b32 v31, s4, 0
+; GCN-NEXT:    v_writelane_b32 v31, s5, 1
+; GCN-NEXT:    v_writelane_b32 v31, s6, 2
+; GCN-NEXT:    v_writelane_b32 v31, s7, 3
+; GCN-NEXT:    v_writelane_b32 v31, s8, 4
+; GCN-NEXT:    v_writelane_b32 v31, s9, 5
+; GCN-NEXT:    v_writelane_b32 v31, s10, 6
+; GCN-NEXT:    v_writelane_b32 v31, s11, 7
+; GCN-NEXT:    v_writelane_b32 v31, s12, 8
+; GCN-NEXT:    v_writelane_b32 v31, s13, 9
+; GCN-NEXT:    v_writelane_b32 v31, s14, 10
+; GCN-NEXT:    v_writelane_b32 v31, s15, 11
+; GCN-NEXT:    v_writelane_b32 v31, s16, 12
+; GCN-NEXT:    v_writelane_b32 v31, s17, 13
+; GCN-NEXT:    v_writelane_b32 v31, s18, 14
+; GCN-NEXT:    v_writelane_b32 v31, s19, 15
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ; def s[4:19]
 ; GCN-NEXT:    ;;#ASMEND
-; GCN-NEXT:    v_writelane_b32 v1, s4, 16
-; GCN-NEXT:    v_writelane_b32 v1, s5, 17
-; GCN-NEXT:    v_writelane_b32 v1, s6, 18
-; GCN-NEXT:    v_writelane_b32 v1, s7, 19
-; GCN-NEXT:    v_writelane_b32 v1, s8, 20
-; GCN-NEXT:    v_writelane_b32 v1, s9, 21
-; GCN-NEXT:    v_writelane_b32 v1, s10, 22
-; GCN-NEXT:    v_writelane_b32 v1, s11, 23
-; GCN-NEXT:    v_writelane_b32 v1, s12, 24
-; GCN-NEXT:    v_writelane_b32 v1, s13, 25
-; GCN-NEXT:    v_writelane_b32 v1, s14, 26
-; GCN-NEXT:    v_writelane_b32 v1, s15, 27
-; GCN-NEXT:    v_writelane_b32 v1, s16, 28
-; GCN-NEXT:    v_writelane_b32 v1, s17, 29
-; GCN-NEXT:    v_writelane_b32 v1, s18, 30
-; GCN-NEXT:    v_writelane_b32 v1, s19, 31
+; GCN-NEXT:    v_writelane_b32 v31, s4, 16
+; GCN-NEXT:    v_writelane_b32 v31, s5, 17
+; GCN-NEXT:    v_writelane_b32 v31, s6, 18
+; GCN-NEXT:    v_writelane_b32 v31, s7, 19
+; GCN-NEXT:    v_writelane_b32 v31, s8, 20
+; GCN-NEXT:    v_writelane_b32 v31, s9, 21
+; GCN-NEXT:    v_writelane_b32 v31, s10, 22
+; GCN-NEXT:    v_writelane_b32 v31, s11, 23
+; GCN-NEXT:    v_writelane_b32 v31, s12, 24
+; GCN-NEXT:    v_writelane_b32 v31, s13, 25
+; GCN-NEXT:    v_writelane_b32 v31, s14, 26
+; GCN-NEXT:    v_writelane_b32 v31, s15, 27
+; GCN-NEXT:    v_writelane_b32 v31, s16, 28
+; GCN-NEXT:    v_writelane_b32 v31, s17, 29
+; GCN-NEXT:    v_writelane_b32 v31, s18, 30
+; GCN-NEXT:    v_writelane_b32 v31, s19, 31
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ; def s[4:19]
 ; GCN-NEXT:    ;;#ASMEND
-; GCN-NEXT:    v_writelane_b32 v1, s4, 32
-; GCN-NEXT:    v_writelane_b32 v1, s5, 33
-; GCN-NEXT:    v_writelane_b32 v1, s6, 34
-; GCN-NEXT:    v_writelane_b32 v1, s7, 35
-; GCN-NEXT:    v_writelane_b32 v1, s8, 36
-; GCN-NEXT:    v_writelane_b32 v1, s9, 37
-; GCN-NEXT:    v_writelane_b32 v1, s10, 38
-; GCN-NEXT:    v_writelane_b32 v1, s11, 39
-; GCN-NEXT:    v_writelane_b32 v1, s12, 40
-; GCN-NEXT:    v_writelane_b32 v1, s13, 41
-; GCN-NEXT:    v_writelane_b32 v1, s14, 42
-; GCN-NEXT:    v_writelane_b32 v1, s15, 43
-; GCN-NEXT:    v_writelane_b32 v1, s16, 44
-; GCN-NEXT:    v_writelane_b32 v1, s17, 45
-; GCN-NEXT:    v_writelane_b32 v1, s18, 46
-; GCN-NEXT:    v_writelane_b32 v1, s19, 47
+; GCN-NEXT:    v_writelane_b32 v31, s4, 32
+; GCN-NEXT:    v_writelane_b32 v31, s5, 33
+; GCN-NEXT:    v_writelane_b32 v31, s6, 34
+; GCN-NEXT:    v_writelane_b32 v31, s7, 35
+; GCN-NEXT:    v_writelane_b32 v31, s8, 36
+; GCN-NEXT:    v_writelane_b32 v31, s9, 37
+; GCN-NEXT:    v_writelane_b32 v31, s10, 38
+; GCN-NEXT:    v_writelane_b32 v31, s11, 39
+; GCN-NEXT:    v_writelane_b32 v31, s12, 40
+; GCN-NEXT:    v_writelane_b32 v31, s13, 41
+; GCN-NEXT:    v_writelane_b32 v31, s14, 42
+; GCN-NEXT:    v_writelane_b32 v31, s15, 43
+; GCN-NEXT:    v_writelane_b32 v31, s16, 44
+; GCN-NEXT:    v_writelane_b32 v31, s17, 45
+; GCN-NEXT:    v_writelane_b32 v31, s18, 46
+; GCN-NEXT:    v_writelane_b32 v31, s19, 47
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ; def s[4:19]
 ; GCN-NEXT:    ;;#ASMEND
-; GCN-NEXT:    v_writelane_b32 v1, s4, 48
-; GCN-NEXT:    v_writelane_b32 v1, s5, 49
-; GCN-NEXT:    v_writelane_b32 v1, s6, 50
-; GCN-NEXT:    v_writelane_b32 v1, s7, 51
-; GCN-NEXT:    v_writelane_b32 v1, s8, 52
-; GCN-NEXT:    v_writelane_b32 v1, s9, 53
-; GCN-NEXT:    v_writelane_b32 v1, s10, 54
-; GCN-NEXT:    v_writelane_b32 v1, s11, 55
-; GCN-NEXT:    v_writelane_b32 v1, s12, 56
-; GCN-NEXT:    v_writelane_b32 v1, s13, 57
-; GCN-NEXT:    v_writelane_b32 v1, s14, 58
-; GCN-NEXT:    v_writelane_b32 v1, s15, 59
-; GCN-NEXT:    v_writelane_b32 v1, s16, 60
-; GCN-NEXT:    v_writelane_b32 v1, s17, 61
-; GCN-NEXT:    v_writelane_b32 v1, s18, 62
-; GCN-NEXT:    v_writelane_b32 v1, s19, 63
-; GCN-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GCN-NEXT:    buffer_store_dword v1, off, s[52:55], 0 offset:8 ; 4-byte Folded Spill
-; GCN-NEXT:    s_mov_b64 exec, s[34:35]
+; GCN-NEXT:    v_writelane_b32 v31, s4, 48
+; GCN-NEXT:    v_writelane_b32 v31, s5, 49
+; GCN-NEXT:    v_writelane_b32 v31, s6, 50
+; GCN-NEXT:    v_writelane_b32 v31, s7, 51
+; GCN-NEXT:    v_writelane_b32 v31, s8, 52
+; GCN-NEXT:    v_writelane_b32 v31, s9, 53
+; GCN-NEXT:    v_writelane_b32 v31, s10, 54
+; GCN-NEXT:    v_writelane_b32 v31, s11, 55
+; GCN-NEXT:    v_writelane_b32 v31, s12, 56
+; GCN-NEXT:    v_writelane_b32 v31, s13, 57
+; GCN-NEXT:    v_writelane_b32 v31, s14, 58
+; GCN-NEXT:    v_writelane_b32 v31, s15, 59
+; GCN-NEXT:    v_writelane_b32 v31, s16, 60
+; GCN-NEXT:    v_writelane_b32 v31, s17, 61
+; GCN-NEXT:    v_writelane_b32 v31, s18, 62
+; GCN-NEXT:    v_writelane_b32 v31, s19, 63
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ; def s[2:3]
 ; GCN-NEXT:    ;;#ASMEND
-; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    s_mov_b64 s[4:5], exec
+; GCN-NEXT:    s_mov_b64 exec, 3
+; GCN-NEXT:    buffer_store_dword v0, off, s[52:55], 0
 ; GCN-NEXT:    v_writelane_b32 v0, s2, 0
 ; GCN-NEXT:    v_writelane_b32 v0, s3, 1
-; GCN-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GCN-NEXT:    buffer_store_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Spill
-; GCN-NEXT:    s_mov_b64 exec, s[34:35]
+; GCN-NEXT:    buffer_load_dword v0, off, s[52:55], 0
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    s_mov_b32 s1, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_cmp_lg_u32 s0, s1
 ; GCN-NEXT:    s_cbranch_scc1 .LBB3_2
 ; GCN-NEXT:  ; %bb.1: ; %bb0
-; GCN-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GCN-NEXT:    buffer_load_dword v1, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload
-; GCN-NEXT:    s_mov_b64 exec, s[34:35]
-; GCN-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GCN-NEXT:    buffer_load_dword v2, off, s[52:55], 0 offset:8 ; 4-byte Folded Reload
-; GCN-NEXT:    s_mov_b64 exec, s[34:35]
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_readlane_b32 s36, v2, 32
-; GCN-NEXT:    v_readlane_b32 s37, v2, 33
-; GCN-NEXT:    v_readlane_b32 s38, v2, 34
-; GCN-NEXT:    v_readlane_b32 s39, v2, 35
-; GCN-NEXT:    v_readlane_b32 s40, v2, 36
-; GCN-NEXT:    v_readlane_b32 s41, v2, 37
-; GCN-NEXT:    v_readlane_b32 s42, v2, 38
-; GCN-NEXT:    v_readlane_b32 s43, v2, 39
-; GCN-NEXT:    v_readlane_b32 s44, v2, 40
-; GCN-NEXT:    v_readlane_b32 s45, v2, 41
-; GCN-NEXT:    v_readlane_b32 s46, v2, 42
-; GCN-NEXT:    v_readlane_b32 s47, v2, 43
-; GCN-NEXT:    v_readlane_b32 s48, v2, 44
-; GCN-NEXT:    v_readlane_b32 s49, v2, 45
-; GCN-NEXT:    v_readlane_b32 s50, v2, 46
-; GCN-NEXT:    v_readlane_b32 s51, v2, 47
-; GCN-NEXT:    v_readlane_b32 s0, v2, 16
-; GCN-NEXT:    v_readlane_b32 s1, v2, 17
-; GCN-NEXT:    v_readlane_b32 s2, v2, 18
-; GCN-NEXT:    v_readlane_b32 s3, v2, 19
-; GCN-NEXT:    v_readlane_b32 s4, v2, 20
-; GCN-NEXT:    v_readlane_b32 s5, v2, 21
-; GCN-NEXT:    v_readlane_b32 s6, v2, 22
-; GCN-NEXT:    v_readlane_b32 s7, v2, 23
-; GCN-NEXT:    v_readlane_b32 s8, v2, 24
-; GCN-NEXT:    v_readlane_b32 s9, v2, 25
-; GCN-NEXT:    v_readlane_b32 s10, v2, 26
-; GCN-NEXT:    v_readlane_b32 s11, v2, 27
-; GCN-NEXT:    v_readlane_b32 s12, v2, 28
-; GCN-NEXT:    v_readlane_b32 s13, v2, 29
-; GCN-NEXT:    v_readlane_b32 s14, v2, 30
-; GCN-NEXT:    v_readlane_b32 s15, v2, 31
-; GCN-NEXT:    v_readlane_b32 s16, v2, 0
-; GCN-NEXT:    v_readlane_b32 s17, v2, 1
-; GCN-NEXT:    v_readlane_b32 s18, v2, 2
-; GCN-NEXT:    v_readlane_b32 s19, v2, 3
-; GCN-NEXT:    v_readlane_b32 s20, v2, 4
-; GCN-NEXT:    v_readlane_b32 s21, v2, 5
-; GCN-NEXT:    v_readlane_b32 s22, v2, 6
-; GCN-NEXT:    v_readlane_b32 s23, v2, 7
-; GCN-NEXT:    v_readlane_b32 s24, v2, 8
-; GCN-NEXT:    v_readlane_b32 s25, v2, 9
-; GCN-NEXT:    v_readlane_b32 s26, v2, 10
-; GCN-NEXT:    v_readlane_b32 s27, v2, 11
-; GCN-NEXT:    v_readlane_b32 s28, v2, 12
-; GCN-NEXT:    v_readlane_b32 s29, v2, 13
-; GCN-NEXT:    v_readlane_b32 s30, v2, 14
-; GCN-NEXT:    v_readlane_b32 s31, v2, 15
+; GCN-NEXT:    v_readlane_b32 s36, v31, 32
+; GCN-NEXT:    v_readlane_b32 s37, v31, 33
+; GCN-NEXT:    v_readlane_b32 s38, v31, 34
+; GCN-NEXT:    v_readlane_b32 s39, v31, 35
+; GCN-NEXT:    v_readlane_b32 s40, v31, 36
+; GCN-NEXT:    v_readlane_b32 s41, v31, 37
+; GCN-NEXT:    v_readlane_b32 s42, v31, 38
+; GCN-NEXT:    v_readlane_b32 s43, v31, 39
+; GCN-NEXT:    v_readlane_b32 s44, v31, 40
+; GCN-NEXT:    v_readlane_b32 s45, v31, 41
+; GCN-NEXT:    v_readlane_b32 s46, v31, 42
+; GCN-NEXT:    v_readlane_b32 s47, v31, 43
+; GCN-NEXT:    v_readlane_b32 s48, v31, 44
+; GCN-NEXT:    v_readlane_b32 s49, v31, 45
+; GCN-NEXT:    v_readlane_b32 s50, v31, 46
+; GCN-NEXT:    v_readlane_b32 s51, v31, 47
+; GCN-NEXT:    v_readlane_b32 s0, v31, 16
+; GCN-NEXT:    v_readlane_b32 s1, v31, 17
+; GCN-NEXT:    v_readlane_b32 s2, v31, 18
+; GCN-NEXT:    v_readlane_b32 s3, v31, 19
+; GCN-NEXT:    v_readlane_b32 s4, v31, 20
+; GCN-NEXT:    v_readlane_b32 s5, v31, 21
+; GCN-NEXT:    v_readlane_b32 s6, v31, 22
+; GCN-NEXT:    v_readlane_b32 s7, v31, 23
+; GCN-NEXT:    v_readlane_b32 s8, v31, 24
+; GCN-NEXT:    v_readlane_b32 s9, v31, 25
+; GCN-NEXT:    v_readlane_b32 s10, v31, 26
+; GCN-NEXT:    v_readlane_b32 s11, v31, 27
+; GCN-NEXT:    v_readlane_b32 s12, v31, 28
+; GCN-NEXT:    v_readlane_b32 s13, v31, 29
+; GCN-NEXT:    v_readlane_b32 s14, v31, 30
+; GCN-NEXT:    v_readlane_b32 s15, v31, 31
+; GCN-NEXT:    v_readlane_b32 s16, v31, 0
+; GCN-NEXT:    v_readlane_b32 s17, v31, 1
+; GCN-NEXT:    v_readlane_b32 s18, v31, 2
+; GCN-NEXT:    v_readlane_b32 s19, v31, 3
+; GCN-NEXT:    v_readlane_b32 s20, v31, 4
+; GCN-NEXT:    v_readlane_b32 s21, v31, 5
+; GCN-NEXT:    v_readlane_b32 s22, v31, 6
+; GCN-NEXT:    v_readlane_b32 s23, v31, 7
+; GCN-NEXT:    v_readlane_b32 s24, v31, 8
+; GCN-NEXT:    v_readlane_b32 s25, v31, 9
+; GCN-NEXT:    v_readlane_b32 s26, v31, 10
+; GCN-NEXT:    v_readlane_b32 s27, v31, 11
+; GCN-NEXT:    v_readlane_b32 s28, v31, 12
+; GCN-NEXT:    v_readlane_b32 s29, v31, 13
+; GCN-NEXT:    v_readlane_b32 s30, v31, 14
+; GCN-NEXT:    v_readlane_b32 s31, v31, 15
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ; def v0
 ; GCN-NEXT:    ;;#ASMEND
@@ -1168,24 +1067,32 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill_live_v0(i32 %in) #1 {
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ; use s[0:15]
 ; GCN-NEXT:    ;;#ASMEND
-; GCN-NEXT:    v_readlane_b32 s4, v2, 48
-; GCN-NEXT:    v_readlane_b32 s5, v2, 49
-; GCN-NEXT:    v_readlane_b32 s6, v2, 50
-; GCN-NEXT:    v_readlane_b32 s7, v2, 51
-; GCN-NEXT:    v_readlane_b32 s8, v2, 52
-; GCN-NEXT:    v_readlane_b32 s9, v2, 53
-; GCN-NEXT:    v_readlane_b32 s10, v2, 54
-; GCN-NEXT:    v_readlane_b32 s11, v2, 55
-; GCN-NEXT:    v_readlane_b32 s12, v2, 56
-; GCN-NEXT:    v_readlane_b32 s13, v2, 57
-; GCN-NEXT:    v_readlane_b32 s14, v2, 58
-; GCN-NEXT:    v_readlane_b32 s15, v2, 59
-; GCN-NEXT:    v_readlane_b32 s16, v2, 60
-; GCN-NEXT:    v_readlane_b32 s17, v2, 61
-; GCN-NEXT:    v_readlane_b32 s18, v2, 62
-; GCN-NEXT:    v_readlane_b32 s19, v2, 63
+; GCN-NEXT:    v_readlane_b32 s4, v31, 48
+; GCN-NEXT:    v_readlane_b32 s5, v31, 49
+; GCN-NEXT:    v_readlane_b32 s6, v31, 50
+; GCN-NEXT:    v_readlane_b32 s7, v31, 51
+; GCN-NEXT:    v_readlane_b32 s8, v31, 52
+; GCN-NEXT:    v_readlane_b32 s9, v31, 53
+; GCN-NEXT:    v_readlane_b32 s10, v31, 54
+; GCN-NEXT:    v_readlane_b32 s11, v31, 55
+; GCN-NEXT:    v_readlane_b32 s12, v31, 56
+; GCN-NEXT:    v_readlane_b32 s13, v31, 57
+; GCN-NEXT:    v_readlane_b32 s14, v31, 58
+; GCN-NEXT:    v_readlane_b32 s15, v31, 59
+; GCN-NEXT:    v_readlane_b32 s16, v31, 60
+; GCN-NEXT:    v_readlane_b32 s17, v31, 61
+; GCN-NEXT:    v_readlane_b32 s18, v31, 62
+; GCN-NEXT:    v_readlane_b32 s19, v31, 63
+; GCN-NEXT:    s_mov_b64 s[2:3], exec
+; GCN-NEXT:    s_mov_b64 exec, 3
+; GCN-NEXT:    buffer_store_dword v1, off, s[52:55], 0
+; GCN-NEXT:    buffer_load_dword v1, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_readlane_b32 s0, v1, 0
 ; GCN-NEXT:    v_readlane_b32 s1, v1, 1
+; GCN-NEXT:    buffer_load_dword v1, off, s[52:55], 0
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_mov_b64 exec, s[2:3]
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ; use s[36:51]
 ; GCN-NEXT:    ;;#ASMEND
@@ -1199,14 +1106,6 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill_live_v0(i32 %in) #1 {
 ; GCN-NEXT:    ; use v0
 ; GCN-NEXT:    ;;#ASMEND
 ; GCN-NEXT:  .LBB3_2: ; %ret
-; GCN-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GCN-NEXT:    buffer_load_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload
-; GCN-NEXT:    s_mov_b64 exec, s[34:35]
-; GCN-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GCN-NEXT:    buffer_load_dword v1, off, s[52:55], 0 offset:8 ; 4-byte Folded Reload
-; GCN-NEXT:    s_mov_b64 exec, s[34:35]
-; GCN-NEXT:    ; kill: killed $vgpr1
-; GCN-NEXT:    ; kill: killed $vgpr0
 ; GCN-NEXT:    s_endpgm
   call void asm sideeffect "", "~{v[0:7]}" () #0
   call void asm sideeffect "", "~{v[8:15]}" () #0

diff  --git a/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll b/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll
deleted file mode 100644
index 8f74132271ba68..00000000000000
--- a/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll
+++ /dev/null
@@ -1,816 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX906 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX908 %s
-
-; Due to high register pressure, regalloc would split the liverange of wwm VGPR register used for SGPR spills
-; and introduce a copy. The copy should be of whole-wave with exec mask manipulation around it.
-; FIXME: The destination register involved in the whole-wave copy should be considered for preserving all the lanes
-; with a spill/restore at function prolog/epilog. The copy might otherwise clobber its inactive lanes unwantedly.
-define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 {
-; GFX906-LABEL: preserve_wwm_copy_dstreg:
-; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT:    s_mov_b32 s16, s33
-; GFX906-NEXT:    s_mov_b32 s33, s32
-; GFX906-NEXT:    s_xor_saveexec_b64 s[18:19], -1
-; GFX906-NEXT:    buffer_store_dword v33, off, s[0:3], s33 offset:144 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v2, off, s[0:3], s33 offset:152 ; 4-byte Folded Spill
-; GFX906-NEXT:    s_mov_b64 exec, -1
-; GFX906-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:148 ; 4-byte Folded Spill
-; GFX906-NEXT:    s_mov_b64 exec, s[18:19]
-; GFX906-NEXT:    s_mov_b32 s21, s15
-; GFX906-NEXT:    ; implicit-def: $vgpr2
-; GFX906-NEXT:    s_mov_b32 s22, s14
-; GFX906-NEXT:    v_writelane_b32 v2, s21, 0
-; GFX906-NEXT:    v_writelane_b32 v2, s22, 1
-; GFX906-NEXT:    s_mov_b32 s23, s13
-; GFX906-NEXT:    v_writelane_b32 v2, s23, 2
-; GFX906-NEXT:    s_mov_b32 s24, s12
-; GFX906-NEXT:    v_writelane_b32 v2, s24, 3
-; GFX906-NEXT:    s_mov_b64 s[26:27], s[10:11]
-; GFX906-NEXT:    v_writelane_b32 v2, s26, 4
-; GFX906-NEXT:    v_writelane_b32 v2, s27, 5
-; GFX906-NEXT:    v_writelane_b32 v2, s8, 6
-; GFX906-NEXT:    v_writelane_b32 v2, s9, 7
-; GFX906-NEXT:    v_writelane_b32 v2, s6, 8
-; GFX906-NEXT:    v_writelane_b32 v41, s16, 2
-; GFX906-NEXT:    v_writelane_b32 v2, s7, 9
-; GFX906-NEXT:    v_writelane_b32 v41, s30, 0
-; GFX906-NEXT:    v_writelane_b32 v2, s4, 10
-; GFX906-NEXT:    s_addk_i32 s32, 0x2800
-; GFX906-NEXT:    v_writelane_b32 v41, s31, 1
-; GFX906-NEXT:    v_mov_b32_e32 v32, v31
-; GFX906-NEXT:    v_writelane_b32 v2, s5, 11
-; GFX906-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GFX906-NEXT:    v_mov_b32_e32 v33, v2
-; GFX906-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX906-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    buffer_store_dword v1, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
-; GFX906-NEXT:    ;;#ASMSTART
-; GFX906-NEXT:    ; def v[0:31]
-; GFX906-NEXT:    ;;#ASMEND
-; GFX906-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    buffer_store_dword v1, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v2, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v4, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v5, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v6, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v7, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v8, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v9, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v10, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v11, off, s[0:3], s33 offset:60 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v12, off, s[0:3], s33 offset:64 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v13, off, s[0:3], s33 offset:68 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v14, off, s[0:3], s33 offset:72 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v15, off, s[0:3], s33 offset:76 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v16, off, s[0:3], s33 offset:80 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v17, off, s[0:3], s33 offset:84 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v18, off, s[0:3], s33 offset:88 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v19, off, s[0:3], s33 offset:92 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v20, off, s[0:3], s33 offset:96 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v21, off, s[0:3], s33 offset:100 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v22, off, s[0:3], s33 offset:104 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v23, off, s[0:3], s33 offset:108 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v24, off, s[0:3], s33 offset:112 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v25, off, s[0:3], s33 offset:116 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v26, off, s[0:3], s33 offset:120 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v27, off, s[0:3], s33 offset:124 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v28, off, s[0:3], s33 offset:128 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v29, off, s[0:3], s33 offset:132 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v30, off, s[0:3], s33 offset:136 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v31, off, s[0:3], s33 offset:140 ; 4-byte Folded Spill
-; GFX906-NEXT:    ;;#ASMSTART
-; GFX906-NEXT:    ; def v40
-; GFX906-NEXT:    ;;#ASMEND
-; GFX906-NEXT:    ;;#ASMSTART
-; GFX906-NEXT:    ; def s11
-; GFX906-NEXT:    ;;#ASMEND
-; GFX906-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GFX906-NEXT:    v_mov_b32_e32 v40, v33
-; GFX906-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX906-NEXT:    v_writelane_b32 v40, s11, 12
-; GFX906-NEXT:    ;;#ASMSTART
-; GFX906-NEXT:    ; def s12
-; GFX906-NEXT:    ;;#ASMEND
-; GFX906-NEXT:    v_writelane_b32 v40, s12, 13
-; GFX906-NEXT:    ;;#ASMSTART
-; GFX906-NEXT:    ; def s13
-; GFX906-NEXT:    ;;#ASMEND
-; GFX906-NEXT:    v_writelane_b32 v40, s13, 14
-; GFX906-NEXT:    ;;#ASMSTART
-; GFX906-NEXT:    ; def s14
-; GFX906-NEXT:    ;;#ASMEND
-; GFX906-NEXT:    v_writelane_b32 v40, s14, 15
-; GFX906-NEXT:    ;;#ASMSTART
-; GFX906-NEXT:    ; def s15
-; GFX906-NEXT:    ;;#ASMEND
-; GFX906-NEXT:    v_writelane_b32 v40, s15, 16
-; GFX906-NEXT:    s_getpc_b64 s[10:11]
-; GFX906-NEXT:    s_add_u32 s10, s10, foo at gotpcrel32@lo+4
-; GFX906-NEXT:    s_addc_u32 s11, s11, foo at gotpcrel32@hi+12
-; GFX906-NEXT:    ;;#ASMSTART
-; GFX906-NEXT:    ; def s16
-; GFX906-NEXT:    ;;#ASMEND
-; GFX906-NEXT:    v_writelane_b32 v40, s16, 17
-; GFX906-NEXT:    s_load_dwordx2 s[10:11], s[10:11], 0x0
-; GFX906-NEXT:    ;;#ASMSTART
-; GFX906-NEXT:    ; def s17
-; GFX906-NEXT:    ;;#ASMEND
-; GFX906-NEXT:    v_writelane_b32 v40, s17, 18
-; GFX906-NEXT:    ;;#ASMSTART
-; GFX906-NEXT:    ; def s18
-; GFX906-NEXT:    ;;#ASMEND
-; GFX906-NEXT:    v_writelane_b32 v40, s18, 19
-; GFX906-NEXT:    ;;#ASMSTART
-; GFX906-NEXT:    ; def s19
-; GFX906-NEXT:    ;;#ASMEND
-; GFX906-NEXT:    v_writelane_b32 v40, s19, 20
-; GFX906-NEXT:    ;;#ASMSTART
-; GFX906-NEXT:    ; def s20
-; GFX906-NEXT:    ;;#ASMEND
-; GFX906-NEXT:    v_writelane_b32 v40, s20, 21
-; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    v_writelane_b32 v40, s10, 22
-; GFX906-NEXT:    v_writelane_b32 v40, s11, 23
-; GFX906-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GFX906-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX906-NEXT:    v_readlane_b32 s16, v40, 22
-; GFX906-NEXT:    s_mov_b32 s12, s24
-; GFX906-NEXT:    s_mov_b32 s13, s23
-; GFX906-NEXT:    s_mov_b32 s14, s22
-; GFX906-NEXT:    v_mov_b32_e32 v31, v32
-; GFX906-NEXT:    s_mov_b32 s15, s21
-; GFX906-NEXT:    s_mov_b64 s[10:11], s[26:27]
-; GFX906-NEXT:    v_readlane_b32 s17, v40, 23
-; GFX906-NEXT:    buffer_store_dword v32, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GFX906-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX906-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GFX906-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX906-NEXT:    v_readlane_b32 s11, v40, 12
-; GFX906-NEXT:    ;;#ASMSTART
-; GFX906-NEXT:    ; use s11
-; GFX906-NEXT:    ;;#ASMEND
-; GFX906-NEXT:    v_readlane_b32 s12, v40, 13
-; GFX906-NEXT:    ;;#ASMSTART
-; GFX906-NEXT:    ; use s12
-; GFX906-NEXT:    ;;#ASMEND
-; GFX906-NEXT:    v_readlane_b32 s13, v40, 14
-; GFX906-NEXT:    ;;#ASMSTART
-; GFX906-NEXT:    ; use s13
-; GFX906-NEXT:    ;;#ASMEND
-; GFX906-NEXT:    v_readlane_b32 s14, v40, 15
-; GFX906-NEXT:    ;;#ASMSTART
-; GFX906-NEXT:    ; use s14
-; GFX906-NEXT:    ;;#ASMEND
-; GFX906-NEXT:    v_readlane_b32 s15, v40, 16
-; GFX906-NEXT:    ;;#ASMSTART
-; GFX906-NEXT:    ; use s15
-; GFX906-NEXT:    ;;#ASMEND
-; GFX906-NEXT:    v_readlane_b32 s16, v40, 17
-; GFX906-NEXT:    ;;#ASMSTART
-; GFX906-NEXT:    ; use s16
-; GFX906-NEXT:    ;;#ASMEND
-; GFX906-NEXT:    v_readlane_b32 s17, v40, 18
-; GFX906-NEXT:    ;;#ASMSTART
-; GFX906-NEXT:    ; use s17
-; GFX906-NEXT:    ;;#ASMEND
-; GFX906-NEXT:    v_readlane_b32 s18, v40, 19
-; GFX906-NEXT:    ;;#ASMSTART
-; GFX906-NEXT:    ; use s18
-; GFX906-NEXT:    ;;#ASMEND
-; GFX906-NEXT:    v_readlane_b32 s19, v40, 20
-; GFX906-NEXT:    ;;#ASMSTART
-; GFX906-NEXT:    ; use s19
-; GFX906-NEXT:    ;;#ASMEND
-; GFX906-NEXT:    v_readlane_b32 s20, v40, 21
-; GFX906-NEXT:    ;;#ASMSTART
-; GFX906-NEXT:    ; use s20
-; GFX906-NEXT:    ;;#ASMEND
-; GFX906-NEXT:    ;;#ASMSTART
-; GFX906-NEXT:    ; def s21
-; GFX906-NEXT:    ;;#ASMEND
-; GFX906-NEXT:    ;;#ASMSTART
-; GFX906-NEXT:    ; def s22
-; GFX906-NEXT:    ;;#ASMEND
-; GFX906-NEXT:    ;;#ASMSTART
-; GFX906-NEXT:    ; def s23
-; GFX906-NEXT:    ;;#ASMEND
-; GFX906-NEXT:    ;;#ASMSTART
-; GFX906-NEXT:    ; def s24
-; GFX906-NEXT:    ;;#ASMEND
-; GFX906-NEXT:    ;;#ASMSTART
-; GFX906-NEXT:    ; def s25
-; GFX906-NEXT:    ;;#ASMEND
-; GFX906-NEXT:    ;;#ASMSTART
-; GFX906-NEXT:    ; def s26
-; GFX906-NEXT:    ;;#ASMEND
-; GFX906-NEXT:    ;;#ASMSTART
-; GFX906-NEXT:    ; def s27
-; GFX906-NEXT:    ;;#ASMEND
-; GFX906-NEXT:    ;;#ASMSTART
-; GFX906-NEXT:    ; def s28
-; GFX906-NEXT:    ;;#ASMEND
-; GFX906-NEXT:    ;;#ASMSTART
-; GFX906-NEXT:    ; def s29
-; GFX906-NEXT:    ;;#ASMEND
-; GFX906-NEXT:    buffer_load_dword v31, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_writelane_b32 v40, s21, 24
-; GFX906-NEXT:    v_writelane_b32 v40, s22, 25
-; GFX906-NEXT:    v_writelane_b32 v40, s23, 26
-; GFX906-NEXT:    v_writelane_b32 v40, s24, 27
-; GFX906-NEXT:    v_writelane_b32 v40, s25, 28
-; GFX906-NEXT:    v_writelane_b32 v40, s26, 29
-; GFX906-NEXT:    v_writelane_b32 v40, s27, 30
-; GFX906-NEXT:    v_writelane_b32 v40, s28, 31
-; GFX906-NEXT:    v_writelane_b32 v40, s29, 32
-; GFX906-NEXT:    v_readlane_b32 s4, v40, 10
-; GFX906-NEXT:    v_readlane_b32 s6, v40, 8
-; GFX906-NEXT:    v_readlane_b32 s8, v40, 6
-; GFX906-NEXT:    v_readlane_b32 s10, v40, 4
-; GFX906-NEXT:    v_readlane_b32 s16, v40, 22
-; GFX906-NEXT:    v_readlane_b32 s12, v40, 3
-; GFX906-NEXT:    v_readlane_b32 s13, v40, 2
-; GFX906-NEXT:    v_readlane_b32 s14, v40, 1
-; GFX906-NEXT:    v_readlane_b32 s15, v40, 0
-; GFX906-NEXT:    v_readlane_b32 s5, v40, 11
-; GFX906-NEXT:    v_readlane_b32 s7, v40, 9
-; GFX906-NEXT:    v_readlane_b32 s9, v40, 7
-; GFX906-NEXT:    v_readlane_b32 s11, v40, 5
-; GFX906-NEXT:    v_readlane_b32 s17, v40, 23
-; GFX906-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GFX906-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX906-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX906-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GFX906-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX906-NEXT:    v_readlane_b32 s21, v40, 24
-; GFX906-NEXT:    ;;#ASMSTART
-; GFX906-NEXT:    ; use s21
-; GFX906-NEXT:    ;;#ASMEND
-; GFX906-NEXT:    v_readlane_b32 s22, v40, 25
-; GFX906-NEXT:    ;;#ASMSTART
-; GFX906-NEXT:    ; use s22
-; GFX906-NEXT:    ;;#ASMEND
-; GFX906-NEXT:    v_readlane_b32 s23, v40, 26
-; GFX906-NEXT:    ;;#ASMSTART
-; GFX906-NEXT:    ; use s23
-; GFX906-NEXT:    ;;#ASMEND
-; GFX906-NEXT:    v_readlane_b32 s24, v40, 27
-; GFX906-NEXT:    ;;#ASMSTART
-; GFX906-NEXT:    ; use s24
-; GFX906-NEXT:    ;;#ASMEND
-; GFX906-NEXT:    v_readlane_b32 s25, v40, 28
-; GFX906-NEXT:    ;;#ASMSTART
-; GFX906-NEXT:    ; use s25
-; GFX906-NEXT:    ;;#ASMEND
-; GFX906-NEXT:    v_readlane_b32 s26, v40, 29
-; GFX906-NEXT:    ;;#ASMSTART
-; GFX906-NEXT:    ; use s26
-; GFX906-NEXT:    ;;#ASMEND
-; GFX906-NEXT:    v_readlane_b32 s27, v40, 30
-; GFX906-NEXT:    ;;#ASMSTART
-; GFX906-NEXT:    ; use s27
-; GFX906-NEXT:    ;;#ASMEND
-; GFX906-NEXT:    v_readlane_b32 s28, v40, 31
-; GFX906-NEXT:    ;;#ASMSTART
-; GFX906-NEXT:    ; use s28
-; GFX906-NEXT:    ;;#ASMEND
-; GFX906-NEXT:    v_readlane_b32 s29, v40, 32
-; GFX906-NEXT:    ;;#ASMSTART
-; GFX906-NEXT:    ; use s29
-; GFX906-NEXT:    ;;#ASMEND
-; GFX906-NEXT:    buffer_load_dword v31, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_readlane_b32 s4, v40, 10
-; GFX906-NEXT:    v_readlane_b32 s6, v40, 8
-; GFX906-NEXT:    v_readlane_b32 s8, v40, 6
-; GFX906-NEXT:    v_readlane_b32 s10, v40, 4
-; GFX906-NEXT:    v_readlane_b32 s16, v40, 22
-; GFX906-NEXT:    v_readlane_b32 s5, v40, 11
-; GFX906-NEXT:    v_readlane_b32 s7, v40, 9
-; GFX906-NEXT:    v_readlane_b32 s9, v40, 7
-; GFX906-NEXT:    v_readlane_b32 s11, v40, 5
-; GFX906-NEXT:    v_readlane_b32 s12, v40, 3
-; GFX906-NEXT:    v_readlane_b32 s13, v40, 2
-; GFX906-NEXT:    v_readlane_b32 s14, v40, 1
-; GFX906-NEXT:    v_readlane_b32 s15, v40, 0
-; GFX906-NEXT:    v_readlane_b32 s17, v40, 23
-; GFX906-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GFX906-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX906-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX906-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v1, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v2, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v3, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v4, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v5, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v6, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v7, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v8, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v10, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v11, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v12, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v13, off, s[0:3], s33 offset:60 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v14, off, s[0:3], s33 offset:64 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v15, off, s[0:3], s33 offset:68 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v16, off, s[0:3], s33 offset:72 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v17, off, s[0:3], s33 offset:76 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v18, off, s[0:3], s33 offset:80 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v19, off, s[0:3], s33 offset:84 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v20, off, s[0:3], s33 offset:88 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v21, off, s[0:3], s33 offset:92 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v22, off, s[0:3], s33 offset:96 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v23, off, s[0:3], s33 offset:100 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v24, off, s[0:3], s33 offset:104 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v25, off, s[0:3], s33 offset:108 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v26, off, s[0:3], s33 offset:112 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v27, off, s[0:3], s33 offset:116 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v28, off, s[0:3], s33 offset:120 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v29, off, s[0:3], s33 offset:124 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v30, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v31, off, s[0:3], s33 offset:132 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v32, off, s[0:3], s33 offset:136 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v33, off, s[0:3], s33 offset:140 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_readlane_b32 s31, v41, 1
-; GFX906-NEXT:    v_readlane_b32 s30, v41, 0
-; GFX906-NEXT:    ; kill: killed $vgpr40
-; GFX906-NEXT:    v_readlane_b32 s4, v41, 2
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    flat_store_dwordx4 v[0:1], v[30:33] offset:112
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    flat_store_dwordx4 v[0:1], v[26:29] offset:96
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    flat_store_dwordx4 v[0:1], v[22:25] offset:80
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    flat_store_dwordx4 v[0:1], v[18:21] offset:64
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    flat_store_dwordx4 v[0:1], v[14:17] offset:48
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    flat_store_dwordx4 v[0:1], v[10:13] offset:32
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    flat_store_dwordx4 v[0:1], v[6:9] offset:16
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    s_xor_saveexec_b64 s[6:7], -1
-; GFX906-NEXT:    buffer_load_dword v33, off, s[0:3], s33 offset:144 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v2, off, s[0:3], s33 offset:152 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_mov_b64 exec, -1
-; GFX906-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:148 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_mov_b64 exec, s[6:7]
-; GFX906-NEXT:    s_addk_i32 s32, 0xd800
-; GFX906-NEXT:    s_mov_b32 s33, s4
-; GFX906-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX906-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX908-LABEL: preserve_wwm_copy_dstreg:
-; GFX908:       ; %bb.0:
-; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    s_mov_b32 s16, s33
-; GFX908-NEXT:    s_mov_b32 s33, s32
-; GFX908-NEXT:    s_xor_saveexec_b64 s[18:19], -1
-; GFX908-NEXT:    buffer_store_dword v33, off, s[0:3], s33 offset:148 ; 4-byte Folded Spill
-; GFX908-NEXT:    buffer_store_dword v2, off, s[0:3], s33 offset:156 ; 4-byte Folded Spill
-; GFX908-NEXT:    s_mov_b64 exec, -1
-; GFX908-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:152 ; 4-byte Folded Spill
-; GFX908-NEXT:    s_mov_b64 exec, s[18:19]
-; GFX908-NEXT:    v_mov_b32_e32 v3, s16
-; GFX908-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:160 ; 4-byte Folded Spill
-; GFX908-NEXT:    s_addk_i32 s32, 0x2c00
-; GFX908-NEXT:    s_mov_b64 s[16:17], exec
-; GFX908-NEXT:    s_mov_b64 exec, 1
-; GFX908-NEXT:    buffer_store_dword v2, off, s[0:3], s33 offset:164
-; GFX908-NEXT:    v_writelane_b32 v2, s30, 0
-; GFX908-NEXT:    buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX908-NEXT:    buffer_load_dword v2, off, s[0:3], s33 offset:164
-; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    s_mov_b64 exec, s[16:17]
-; GFX908-NEXT:    s_mov_b64 s[16:17], exec
-; GFX908-NEXT:    s_mov_b64 exec, 1
-; GFX908-NEXT:    buffer_store_dword v2, off, s[0:3], s33 offset:164
-; GFX908-NEXT:    v_writelane_b32 v2, s31, 0
-; GFX908-NEXT:    buffer_store_dword v2, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GFX908-NEXT:    buffer_load_dword v2, off, s[0:3], s33 offset:164
-; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    s_mov_b64 exec, s[16:17]
-; GFX908-NEXT:    s_mov_b32 s21, s15
-; GFX908-NEXT:    ; implicit-def: $vgpr2
-; GFX908-NEXT:    s_mov_b32 s22, s14
-; GFX908-NEXT:    v_writelane_b32 v2, s21, 0
-; GFX908-NEXT:    v_writelane_b32 v2, s22, 1
-; GFX908-NEXT:    s_mov_b32 s23, s13
-; GFX908-NEXT:    v_writelane_b32 v2, s23, 2
-; GFX908-NEXT:    s_mov_b32 s24, s12
-; GFX908-NEXT:    v_writelane_b32 v2, s24, 3
-; GFX908-NEXT:    s_mov_b64 s[26:27], s[10:11]
-; GFX908-NEXT:    v_writelane_b32 v2, s26, 4
-; GFX908-NEXT:    v_writelane_b32 v2, s27, 5
-; GFX908-NEXT:    v_writelane_b32 v2, s8, 6
-; GFX908-NEXT:    v_writelane_b32 v2, s9, 7
-; GFX908-NEXT:    v_writelane_b32 v2, s6, 8
-; GFX908-NEXT:    v_writelane_b32 v2, s7, 9
-; GFX908-NEXT:    v_writelane_b32 v2, s4, 10
-; GFX908-NEXT:    v_mov_b32_e32 v32, v31
-; GFX908-NEXT:    v_writelane_b32 v2, s5, 11
-; GFX908-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GFX908-NEXT:    v_mov_b32_e32 v33, v2
-; GFX908-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX908-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
-; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    buffer_store_dword v1, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
-; GFX908-NEXT:    ;;#ASMSTART
-; GFX908-NEXT:    ; def v[0:31]
-; GFX908-NEXT:    ;;#ASMEND
-; GFX908-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
-; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    buffer_store_dword v1, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
-; GFX908-NEXT:    buffer_store_dword v2, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
-; GFX908-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
-; GFX908-NEXT:    buffer_store_dword v4, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
-; GFX908-NEXT:    buffer_store_dword v5, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
-; GFX908-NEXT:    buffer_store_dword v6, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill
-; GFX908-NEXT:    buffer_store_dword v7, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill
-; GFX908-NEXT:    buffer_store_dword v8, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill
-; GFX908-NEXT:    buffer_store_dword v9, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill
-; GFX908-NEXT:    buffer_store_dword v10, off, s[0:3], s33 offset:60 ; 4-byte Folded Spill
-; GFX908-NEXT:    buffer_store_dword v11, off, s[0:3], s33 offset:64 ; 4-byte Folded Spill
-; GFX908-NEXT:    buffer_store_dword v12, off, s[0:3], s33 offset:68 ; 4-byte Folded Spill
-; GFX908-NEXT:    buffer_store_dword v13, off, s[0:3], s33 offset:72 ; 4-byte Folded Spill
-; GFX908-NEXT:    buffer_store_dword v14, off, s[0:3], s33 offset:76 ; 4-byte Folded Spill
-; GFX908-NEXT:    buffer_store_dword v15, off, s[0:3], s33 offset:80 ; 4-byte Folded Spill
-; GFX908-NEXT:    buffer_store_dword v16, off, s[0:3], s33 offset:84 ; 4-byte Folded Spill
-; GFX908-NEXT:    buffer_store_dword v17, off, s[0:3], s33 offset:88 ; 4-byte Folded Spill
-; GFX908-NEXT:    buffer_store_dword v18, off, s[0:3], s33 offset:92 ; 4-byte Folded Spill
-; GFX908-NEXT:    buffer_store_dword v19, off, s[0:3], s33 offset:96 ; 4-byte Folded Spill
-; GFX908-NEXT:    buffer_store_dword v20, off, s[0:3], s33 offset:100 ; 4-byte Folded Spill
-; GFX908-NEXT:    buffer_store_dword v21, off, s[0:3], s33 offset:104 ; 4-byte Folded Spill
-; GFX908-NEXT:    buffer_store_dword v22, off, s[0:3], s33 offset:108 ; 4-byte Folded Spill
-; GFX908-NEXT:    buffer_store_dword v23, off, s[0:3], s33 offset:112 ; 4-byte Folded Spill
-; GFX908-NEXT:    buffer_store_dword v24, off, s[0:3], s33 offset:116 ; 4-byte Folded Spill
-; GFX908-NEXT:    buffer_store_dword v25, off, s[0:3], s33 offset:120 ; 4-byte Folded Spill
-; GFX908-NEXT:    buffer_store_dword v26, off, s[0:3], s33 offset:124 ; 4-byte Folded Spill
-; GFX908-NEXT:    buffer_store_dword v27, off, s[0:3], s33 offset:128 ; 4-byte Folded Spill
-; GFX908-NEXT:    buffer_store_dword v28, off, s[0:3], s33 offset:132 ; 4-byte Folded Spill
-; GFX908-NEXT:    buffer_store_dword v29, off, s[0:3], s33 offset:136 ; 4-byte Folded Spill
-; GFX908-NEXT:    buffer_store_dword v30, off, s[0:3], s33 offset:140 ; 4-byte Folded Spill
-; GFX908-NEXT:    buffer_store_dword v31, off, s[0:3], s33 offset:144 ; 4-byte Folded Spill
-; GFX908-NEXT:    ;;#ASMSTART
-; GFX908-NEXT:    ; def v40
-; GFX908-NEXT:    ;;#ASMEND
-; GFX908-NEXT:    ;;#ASMSTART
-; GFX908-NEXT:    ; def s11
-; GFX908-NEXT:    ;;#ASMEND
-; GFX908-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GFX908-NEXT:    v_mov_b32_e32 v40, v33
-; GFX908-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX908-NEXT:    v_writelane_b32 v40, s11, 12
-; GFX908-NEXT:    ;;#ASMSTART
-; GFX908-NEXT:    ; def s12
-; GFX908-NEXT:    ;;#ASMEND
-; GFX908-NEXT:    v_writelane_b32 v40, s12, 13
-; GFX908-NEXT:    ;;#ASMSTART
-; GFX908-NEXT:    ; def s13
-; GFX908-NEXT:    ;;#ASMEND
-; GFX908-NEXT:    v_writelane_b32 v40, s13, 14
-; GFX908-NEXT:    ;;#ASMSTART
-; GFX908-NEXT:    ; def s14
-; GFX908-NEXT:    ;;#ASMEND
-; GFX908-NEXT:    v_writelane_b32 v40, s14, 15
-; GFX908-NEXT:    ;;#ASMSTART
-; GFX908-NEXT:    ; def s15
-; GFX908-NEXT:    ;;#ASMEND
-; GFX908-NEXT:    v_writelane_b32 v40, s15, 16
-; GFX908-NEXT:    s_getpc_b64 s[10:11]
-; GFX908-NEXT:    s_add_u32 s10, s10, foo at gotpcrel32@lo+4
-; GFX908-NEXT:    s_addc_u32 s11, s11, foo at gotpcrel32@hi+12
-; GFX908-NEXT:    ;;#ASMSTART
-; GFX908-NEXT:    ; def s16
-; GFX908-NEXT:    ;;#ASMEND
-; GFX908-NEXT:    v_writelane_b32 v40, s16, 17
-; GFX908-NEXT:    s_load_dwordx2 s[10:11], s[10:11], 0x0
-; GFX908-NEXT:    ;;#ASMSTART
-; GFX908-NEXT:    ; def s17
-; GFX908-NEXT:    ;;#ASMEND
-; GFX908-NEXT:    v_writelane_b32 v40, s17, 18
-; GFX908-NEXT:    ;;#ASMSTART
-; GFX908-NEXT:    ; def s18
-; GFX908-NEXT:    ;;#ASMEND
-; GFX908-NEXT:    v_writelane_b32 v40, s18, 19
-; GFX908-NEXT:    ;;#ASMSTART
-; GFX908-NEXT:    ; def s19
-; GFX908-NEXT:    ;;#ASMEND
-; GFX908-NEXT:    v_writelane_b32 v40, s19, 20
-; GFX908-NEXT:    ;;#ASMSTART
-; GFX908-NEXT:    ; def s20
-; GFX908-NEXT:    ;;#ASMEND
-; GFX908-NEXT:    v_writelane_b32 v40, s20, 21
-; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX908-NEXT:    v_writelane_b32 v40, s10, 22
-; GFX908-NEXT:    v_writelane_b32 v40, s11, 23
-; GFX908-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GFX908-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX908-NEXT:    v_readlane_b32 s16, v40, 22
-; GFX908-NEXT:    s_mov_b32 s12, s24
-; GFX908-NEXT:    s_mov_b32 s13, s23
-; GFX908-NEXT:    s_mov_b32 s14, s22
-; GFX908-NEXT:    v_mov_b32_e32 v31, v32
-; GFX908-NEXT:    s_mov_b32 s15, s21
-; GFX908-NEXT:    s_mov_b64 s[10:11], s[26:27]
-; GFX908-NEXT:    v_readlane_b32 s17, v40, 23
-; GFX908-NEXT:    buffer_store_dword v32, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; GFX908-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX908-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GFX908-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX908-NEXT:    v_readlane_b32 s11, v40, 12
-; GFX908-NEXT:    ;;#ASMSTART
-; GFX908-NEXT:    ; use s11
-; GFX908-NEXT:    ;;#ASMEND
-; GFX908-NEXT:    v_readlane_b32 s12, v40, 13
-; GFX908-NEXT:    ;;#ASMSTART
-; GFX908-NEXT:    ; use s12
-; GFX908-NEXT:    ;;#ASMEND
-; GFX908-NEXT:    v_readlane_b32 s13, v40, 14
-; GFX908-NEXT:    ;;#ASMSTART
-; GFX908-NEXT:    ; use s13
-; GFX908-NEXT:    ;;#ASMEND
-; GFX908-NEXT:    v_readlane_b32 s14, v40, 15
-; GFX908-NEXT:    ;;#ASMSTART
-; GFX908-NEXT:    ; use s14
-; GFX908-NEXT:    ;;#ASMEND
-; GFX908-NEXT:    v_readlane_b32 s15, v40, 16
-; GFX908-NEXT:    ;;#ASMSTART
-; GFX908-NEXT:    ; use s15
-; GFX908-NEXT:    ;;#ASMEND
-; GFX908-NEXT:    v_readlane_b32 s16, v40, 17
-; GFX908-NEXT:    ;;#ASMSTART
-; GFX908-NEXT:    ; use s16
-; GFX908-NEXT:    ;;#ASMEND
-; GFX908-NEXT:    v_readlane_b32 s17, v40, 18
-; GFX908-NEXT:    ;;#ASMSTART
-; GFX908-NEXT:    ; use s17
-; GFX908-NEXT:    ;;#ASMEND
-; GFX908-NEXT:    v_readlane_b32 s18, v40, 19
-; GFX908-NEXT:    ;;#ASMSTART
-; GFX908-NEXT:    ; use s18
-; GFX908-NEXT:    ;;#ASMEND
-; GFX908-NEXT:    v_readlane_b32 s19, v40, 20
-; GFX908-NEXT:    ;;#ASMSTART
-; GFX908-NEXT:    ; use s19
-; GFX908-NEXT:    ;;#ASMEND
-; GFX908-NEXT:    v_readlane_b32 s20, v40, 21
-; GFX908-NEXT:    ;;#ASMSTART
-; GFX908-NEXT:    ; use s20
-; GFX908-NEXT:    ;;#ASMEND
-; GFX908-NEXT:    ;;#ASMSTART
-; GFX908-NEXT:    ; def s21
-; GFX908-NEXT:    ;;#ASMEND
-; GFX908-NEXT:    ;;#ASMSTART
-; GFX908-NEXT:    ; def s22
-; GFX908-NEXT:    ;;#ASMEND
-; GFX908-NEXT:    ;;#ASMSTART
-; GFX908-NEXT:    ; def s23
-; GFX908-NEXT:    ;;#ASMEND
-; GFX908-NEXT:    ;;#ASMSTART
-; GFX908-NEXT:    ; def s24
-; GFX908-NEXT:    ;;#ASMEND
-; GFX908-NEXT:    ;;#ASMSTART
-; GFX908-NEXT:    ; def s25
-; GFX908-NEXT:    ;;#ASMEND
-; GFX908-NEXT:    ;;#ASMSTART
-; GFX908-NEXT:    ; def s26
-; GFX908-NEXT:    ;;#ASMEND
-; GFX908-NEXT:    ;;#ASMSTART
-; GFX908-NEXT:    ; def s27
-; GFX908-NEXT:    ;;#ASMEND
-; GFX908-NEXT:    ;;#ASMSTART
-; GFX908-NEXT:    ; def s28
-; GFX908-NEXT:    ;;#ASMEND
-; GFX908-NEXT:    ;;#ASMSTART
-; GFX908-NEXT:    ; def s29
-; GFX908-NEXT:    ;;#ASMEND
-; GFX908-NEXT:    buffer_load_dword v31, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; GFX908-NEXT:    v_writelane_b32 v40, s21, 24
-; GFX908-NEXT:    v_writelane_b32 v40, s22, 25
-; GFX908-NEXT:    v_writelane_b32 v40, s23, 26
-; GFX908-NEXT:    v_writelane_b32 v40, s24, 27
-; GFX908-NEXT:    v_writelane_b32 v40, s25, 28
-; GFX908-NEXT:    v_writelane_b32 v40, s26, 29
-; GFX908-NEXT:    v_writelane_b32 v40, s27, 30
-; GFX908-NEXT:    v_writelane_b32 v40, s28, 31
-; GFX908-NEXT:    v_writelane_b32 v40, s29, 32
-; GFX908-NEXT:    v_readlane_b32 s4, v40, 10
-; GFX908-NEXT:    v_readlane_b32 s6, v40, 8
-; GFX908-NEXT:    v_readlane_b32 s8, v40, 6
-; GFX908-NEXT:    v_readlane_b32 s10, v40, 4
-; GFX908-NEXT:    v_readlane_b32 s16, v40, 22
-; GFX908-NEXT:    v_readlane_b32 s12, v40, 3
-; GFX908-NEXT:    v_readlane_b32 s13, v40, 2
-; GFX908-NEXT:    v_readlane_b32 s14, v40, 1
-; GFX908-NEXT:    v_readlane_b32 s15, v40, 0
-; GFX908-NEXT:    v_readlane_b32 s5, v40, 11
-; GFX908-NEXT:    v_readlane_b32 s7, v40, 9
-; GFX908-NEXT:    v_readlane_b32 s9, v40, 7
-; GFX908-NEXT:    v_readlane_b32 s11, v40, 5
-; GFX908-NEXT:    v_readlane_b32 s17, v40, 23
-; GFX908-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GFX908-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX908-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX908-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GFX908-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX908-NEXT:    v_readlane_b32 s21, v40, 24
-; GFX908-NEXT:    ;;#ASMSTART
-; GFX908-NEXT:    ; use s21
-; GFX908-NEXT:    ;;#ASMEND
-; GFX908-NEXT:    v_readlane_b32 s22, v40, 25
-; GFX908-NEXT:    ;;#ASMSTART
-; GFX908-NEXT:    ; use s22
-; GFX908-NEXT:    ;;#ASMEND
-; GFX908-NEXT:    v_readlane_b32 s23, v40, 26
-; GFX908-NEXT:    ;;#ASMSTART
-; GFX908-NEXT:    ; use s23
-; GFX908-NEXT:    ;;#ASMEND
-; GFX908-NEXT:    v_readlane_b32 s24, v40, 27
-; GFX908-NEXT:    ;;#ASMSTART
-; GFX908-NEXT:    ; use s24
-; GFX908-NEXT:    ;;#ASMEND
-; GFX908-NEXT:    v_readlane_b32 s25, v40, 28
-; GFX908-NEXT:    ;;#ASMSTART
-; GFX908-NEXT:    ; use s25
-; GFX908-NEXT:    ;;#ASMEND
-; GFX908-NEXT:    v_readlane_b32 s26, v40, 29
-; GFX908-NEXT:    ;;#ASMSTART
-; GFX908-NEXT:    ; use s26
-; GFX908-NEXT:    ;;#ASMEND
-; GFX908-NEXT:    v_readlane_b32 s27, v40, 30
-; GFX908-NEXT:    ;;#ASMSTART
-; GFX908-NEXT:    ; use s27
-; GFX908-NEXT:    ;;#ASMEND
-; GFX908-NEXT:    v_readlane_b32 s28, v40, 31
-; GFX908-NEXT:    ;;#ASMSTART
-; GFX908-NEXT:    ; use s28
-; GFX908-NEXT:    ;;#ASMEND
-; GFX908-NEXT:    v_readlane_b32 s29, v40, 32
-; GFX908-NEXT:    ;;#ASMSTART
-; GFX908-NEXT:    ; use s29
-; GFX908-NEXT:    ;;#ASMEND
-; GFX908-NEXT:    buffer_load_dword v31, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; GFX908-NEXT:    v_readlane_b32 s4, v40, 10
-; GFX908-NEXT:    v_readlane_b32 s6, v40, 8
-; GFX908-NEXT:    v_readlane_b32 s8, v40, 6
-; GFX908-NEXT:    v_readlane_b32 s10, v40, 4
-; GFX908-NEXT:    v_readlane_b32 s16, v40, 22
-; GFX908-NEXT:    v_readlane_b32 s5, v40, 11
-; GFX908-NEXT:    v_readlane_b32 s7, v40, 9
-; GFX908-NEXT:    v_readlane_b32 s9, v40, 7
-; GFX908-NEXT:    v_readlane_b32 s11, v40, 5
-; GFX908-NEXT:    v_readlane_b32 s12, v40, 3
-; GFX908-NEXT:    v_readlane_b32 s13, v40, 2
-; GFX908-NEXT:    v_readlane_b32 s14, v40, 1
-; GFX908-NEXT:    v_readlane_b32 s15, v40, 0
-; GFX908-NEXT:    v_readlane_b32 s17, v40, 23
-; GFX908-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GFX908-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX908-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX908-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
-; GFX908-NEXT:    buffer_load_dword v1, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
-; GFX908-NEXT:    buffer_load_dword v2, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
-; GFX908-NEXT:    buffer_load_dword v3, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
-; GFX908-NEXT:    buffer_load_dword v4, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload
-; GFX908-NEXT:    buffer_load_dword v5, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload
-; GFX908-NEXT:    buffer_load_dword v6, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload
-; GFX908-NEXT:    buffer_load_dword v7, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload
-; GFX908-NEXT:    buffer_load_dword v8, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload
-; GFX908-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload
-; GFX908-NEXT:    buffer_load_dword v10, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload
-; GFX908-NEXT:    buffer_load_dword v11, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload
-; GFX908-NEXT:    buffer_load_dword v12, off, s[0:3], s33 offset:60 ; 4-byte Folded Reload
-; GFX908-NEXT:    buffer_load_dword v13, off, s[0:3], s33 offset:64 ; 4-byte Folded Reload
-; GFX908-NEXT:    buffer_load_dword v14, off, s[0:3], s33 offset:68 ; 4-byte Folded Reload
-; GFX908-NEXT:    buffer_load_dword v15, off, s[0:3], s33 offset:72 ; 4-byte Folded Reload
-; GFX908-NEXT:    buffer_load_dword v16, off, s[0:3], s33 offset:76 ; 4-byte Folded Reload
-; GFX908-NEXT:    buffer_load_dword v17, off, s[0:3], s33 offset:80 ; 4-byte Folded Reload
-; GFX908-NEXT:    buffer_load_dword v18, off, s[0:3], s33 offset:84 ; 4-byte Folded Reload
-; GFX908-NEXT:    buffer_load_dword v19, off, s[0:3], s33 offset:88 ; 4-byte Folded Reload
-; GFX908-NEXT:    buffer_load_dword v20, off, s[0:3], s33 offset:92 ; 4-byte Folded Reload
-; GFX908-NEXT:    buffer_load_dword v21, off, s[0:3], s33 offset:96 ; 4-byte Folded Reload
-; GFX908-NEXT:    buffer_load_dword v22, off, s[0:3], s33 offset:100 ; 4-byte Folded Reload
-; GFX908-NEXT:    buffer_load_dword v23, off, s[0:3], s33 offset:104 ; 4-byte Folded Reload
-; GFX908-NEXT:    buffer_load_dword v24, off, s[0:3], s33 offset:108 ; 4-byte Folded Reload
-; GFX908-NEXT:    buffer_load_dword v25, off, s[0:3], s33 offset:112 ; 4-byte Folded Reload
-; GFX908-NEXT:    buffer_load_dword v26, off, s[0:3], s33 offset:116 ; 4-byte Folded Reload
-; GFX908-NEXT:    buffer_load_dword v27, off, s[0:3], s33 offset:120 ; 4-byte Folded Reload
-; GFX908-NEXT:    buffer_load_dword v28, off, s[0:3], s33 offset:124 ; 4-byte Folded Reload
-; GFX908-NEXT:    buffer_load_dword v29, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload
-; GFX908-NEXT:    buffer_load_dword v30, off, s[0:3], s33 offset:132 ; 4-byte Folded Reload
-; GFX908-NEXT:    buffer_load_dword v31, off, s[0:3], s33 offset:136 ; 4-byte Folded Reload
-; GFX908-NEXT:    buffer_load_dword v32, off, s[0:3], s33 offset:140 ; 4-byte Folded Reload
-; GFX908-NEXT:    buffer_load_dword v33, off, s[0:3], s33 offset:144 ; 4-byte Folded Reload
-; GFX908-NEXT:    s_mov_b64 s[4:5], exec
-; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    flat_store_dwordx4 v[0:1], v[30:33] offset:112
-; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    flat_store_dwordx4 v[0:1], v[26:29] offset:96
-; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    flat_store_dwordx4 v[0:1], v[22:25] offset:80
-; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    flat_store_dwordx4 v[0:1], v[18:21] offset:64
-; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    flat_store_dwordx4 v[0:1], v[14:17] offset:48
-; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    flat_store_dwordx4 v[0:1], v[10:13] offset:32
-; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    flat_store_dwordx4 v[0:1], v[6:9] offset:16
-; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
-; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    s_mov_b64 exec, 1
-; GFX908-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:164
-; GFX908-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    v_readlane_b32 s31, v0, 0
-; GFX908-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:164
-; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX908-NEXT:    s_mov_b64 s[4:5], exec
-; GFX908-NEXT:    s_mov_b64 exec, 1
-; GFX908-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:164
-; GFX908-NEXT:    buffer_load_dword v0, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    v_readlane_b32 s30, v0, 0
-; GFX908-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:164
-; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX908-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:160 ; 4-byte Folded Reload
-; GFX908-NEXT:    ; kill: killed $vgpr40
-; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX908-NEXT:    s_xor_saveexec_b64 s[6:7], -1
-; GFX908-NEXT:    buffer_load_dword v33, off, s[0:3], s33 offset:148 ; 4-byte Folded Reload
-; GFX908-NEXT:    buffer_load_dword v2, off, s[0:3], s33 offset:156 ; 4-byte Folded Reload
-; GFX908-NEXT:    s_mov_b64 exec, -1
-; GFX908-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:152 ; 4-byte Folded Reload
-; GFX908-NEXT:    s_mov_b64 exec, s[6:7]
-; GFX908-NEXT:    s_addk_i32 s32, 0xd400
-; GFX908-NEXT:    s_mov_b32 s33, s4
-; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    s_setpc_b64 s[30:31]
-  %vreg0 = call <32 x float> asm sideeffect "; def $0", "=v"()
-  %v40 = call i32 asm sideeffect "; def $0","=${v40}"()
-
-  %s11 = call i32 asm sideeffect "; def $0","=${s11}"()
-  %s12 = call i32 asm sideeffect "; def $0","=${s12}"()
-  %s13 = call i32 asm sideeffect "; def $0","=${s13}"()
-  %s14 = call i32 asm sideeffect "; def $0","=${s14}"()
-  %s15 = call i32 asm sideeffect "; def $0","=${s15}"()
-  %s16 = call i32 asm sideeffect "; def $0","=${s16}"()
-  %s17 = call i32 asm sideeffect "; def $0","=${s17}"()
-  %s18 = call i32 asm sideeffect "; def $0","=${s18}"()
-  %s19 = call i32 asm sideeffect "; def $0","=${s19}"()
-  %s20 = call i32 asm sideeffect "; def $0","=${s20}"()
-  call void @foo()
-  call void asm sideeffect "; use $0","${s11}"(i32 %s11)
-  call void asm sideeffect "; use $0","${s12}"(i32 %s12)
-  call void asm sideeffect "; use $0","${s13}"(i32 %s13)
-  call void asm sideeffect "; use $0","${s14}"(i32 %s14)
-  call void asm sideeffect "; use $0","${s15}"(i32 %s15)
-  call void asm sideeffect "; use $0","${s16}"(i32 %s16)
-  call void asm sideeffect "; use $0","${s17}"(i32 %s17)
-  call void asm sideeffect "; use $0","${s18}"(i32 %s18)
-  call void asm sideeffect "; use $0","${s19}"(i32 %s19)
-  call void asm sideeffect "; use $0","${s20}"(i32 %s20)
-
-  %s21 = call i32 asm sideeffect "; def $0","=${s21}"()
-  %s22 = call i32 asm sideeffect "; def $0","=${s22}"()
-  %s23 = call i32 asm sideeffect "; def $0","=${s23}"()
-  %s24 = call i32 asm sideeffect "; def $0","=${s24}"()
-  %s25 = call i32 asm sideeffect "; def $0","=${s25}"()
-  %s26 = call i32 asm sideeffect "; def $0","=${s26}"()
-  %s27 = call i32 asm sideeffect "; def $0","=${s27}"()
-  %s28 = call i32 asm sideeffect "; def $0","=${s28}"()
-  %s29 = call i32 asm sideeffect "; def $0","=${s29}"()
-  call void @foo()
-  call void asm sideeffect "; use $0","${s21}"(i32 %s21)
-  call void asm sideeffect "; use $0","${s22}"(i32 %s22)
-  call void asm sideeffect "; use $0","${s23}"(i32 %s23)
-  call void asm sideeffect "; use $0","${s24}"(i32 %s24)
-  call void asm sideeffect "; use $0","${s25}"(i32 %s25)
-  call void asm sideeffect "; use $0","${s26}"(i32 %s26)
-  call void asm sideeffect "; use $0","${s27}"(i32 %s27)
-  call void asm sideeffect "; use $0","${s28}"(i32 %s28)
-  call void asm sideeffect "; use $0","${s29}"(i32 %s29)
-
-  call void @foo()
-
-  store volatile <32 x float> %vreg0, ptr %parg0
-
-  ret void
-}
-
-declare void @foo()
-
-attributes #0 = { "amdgpu-num-vgpr"="42" "amdgpu-num-sgpr"="40"}

diff  --git a/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll b/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll
index 09cd01af5bccb4..9d7797990287ab 100644
--- a/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll
@@ -1,381 +1,22 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s
+; RUN: not --crash llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs -o /dev/null %s 2>&1 | FileCheck %s
 
-; This was a negative test to catch an extreme case when all options are exhausted
-; while trying to spill SGPRs to memory. After we enabled SGPR spills into virtual VGPRs
-; the edge case won't arise and the test would always compile.
+; This ends up needing to spill SGPRs to memory, and also does not
+; have any free SGPRs available to save the exec mask when doing so.
+; The register scavenger also needs to use the emergency stack slot,
+; which tries to place the scavenged register restore instruction as
+; far the block as possible, near the terminator. This places a
+; restore instruction between the condition and the conditional
+; branch, which gets expanded into a sequence involving s_not_b64 on
+; the exec mask, clobbering SCC value before the branch. We probably
+; have to stop relying on being able to flip and restore the exec
+; mask, and always require a free SGPR for saving exec.
 
+; CHECK: *** Bad machine code: Using an undefined physical register ***
+; CHECK-NEXT: - function:    kernel0
+; CHECK-NEXT: - basic block: %bb.0
+; CHECK-NEXT: - instruction: S_CBRANCH_SCC1 %bb.2, implicit killed $scc
+; CHECK-NEXT: - operand 1:   implicit killed $scc
 define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 {
-; CHECK-LABEL: kernel0:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    ; implicit-def: $vgpr23
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; def s[2:3]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_writelane_b32 v23, s2, 0
-; CHECK-NEXT:    s_load_dword s0, s[4:5], 0x8
-; CHECK-NEXT:    v_writelane_b32 v23, s3, 1
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; def s[4:7]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_writelane_b32 v23, s4, 2
-; CHECK-NEXT:    v_writelane_b32 v23, s5, 3
-; CHECK-NEXT:    v_writelane_b32 v23, s6, 4
-; CHECK-NEXT:    v_writelane_b32 v23, s7, 5
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; def s[4:11]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_writelane_b32 v23, s4, 6
-; CHECK-NEXT:    v_writelane_b32 v23, s5, 7
-; CHECK-NEXT:    v_writelane_b32 v23, s6, 8
-; CHECK-NEXT:    v_writelane_b32 v23, s7, 9
-; CHECK-NEXT:    v_writelane_b32 v23, s8, 10
-; CHECK-NEXT:    v_writelane_b32 v23, s9, 11
-; CHECK-NEXT:    v_writelane_b32 v23, s10, 12
-; CHECK-NEXT:    v_writelane_b32 v23, s11, 13
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; def s[4:19]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_writelane_b32 v23, s4, 14
-; CHECK-NEXT:    v_writelane_b32 v23, s5, 15
-; CHECK-NEXT:    v_writelane_b32 v23, s6, 16
-; CHECK-NEXT:    v_writelane_b32 v23, s7, 17
-; CHECK-NEXT:    v_writelane_b32 v23, s8, 18
-; CHECK-NEXT:    v_writelane_b32 v23, s9, 19
-; CHECK-NEXT:    v_writelane_b32 v23, s10, 20
-; CHECK-NEXT:    v_writelane_b32 v23, s11, 21
-; CHECK-NEXT:    v_writelane_b32 v23, s12, 22
-; CHECK-NEXT:    v_writelane_b32 v23, s13, 23
-; CHECK-NEXT:    v_writelane_b32 v23, s14, 24
-; CHECK-NEXT:    v_writelane_b32 v23, s15, 25
-; CHECK-NEXT:    v_writelane_b32 v23, s16, 26
-; CHECK-NEXT:    v_writelane_b32 v23, s17, 27
-; CHECK-NEXT:    v_writelane_b32 v23, s18, 28
-; CHECK-NEXT:    v_writelane_b32 v23, s19, 29
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; def s[2:3]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_writelane_b32 v23, s2, 30
-; CHECK-NEXT:    v_writelane_b32 v23, s3, 31
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; def s[4:7]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_writelane_b32 v23, s4, 32
-; CHECK-NEXT:    v_writelane_b32 v23, s5, 33
-; CHECK-NEXT:    v_writelane_b32 v23, s6, 34
-; CHECK-NEXT:    v_writelane_b32 v23, s7, 35
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; def s[4:11]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_writelane_b32 v23, s4, 36
-; CHECK-NEXT:    v_writelane_b32 v23, s5, 37
-; CHECK-NEXT:    v_writelane_b32 v23, s6, 38
-; CHECK-NEXT:    v_writelane_b32 v23, s7, 39
-; CHECK-NEXT:    v_writelane_b32 v23, s8, 40
-; CHECK-NEXT:    v_writelane_b32 v23, s9, 41
-; CHECK-NEXT:    v_writelane_b32 v23, s10, 42
-; CHECK-NEXT:    v_writelane_b32 v23, s11, 43
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    s_cmp_lg_u32 s0, 0
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; def s[16:31]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; def s[52:53]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; def s[48:51]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; def s[36:43]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; def s[0:15]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_writelane_b32 v23, s0, 44
-; CHECK-NEXT:    v_writelane_b32 v23, s1, 45
-; CHECK-NEXT:    v_writelane_b32 v23, s2, 46
-; CHECK-NEXT:    v_writelane_b32 v23, s3, 47
-; CHECK-NEXT:    v_writelane_b32 v23, s4, 48
-; CHECK-NEXT:    v_writelane_b32 v23, s5, 49
-; CHECK-NEXT:    v_writelane_b32 v23, s6, 50
-; CHECK-NEXT:    v_writelane_b32 v23, s7, 51
-; CHECK-NEXT:    v_writelane_b32 v23, s8, 52
-; CHECK-NEXT:    v_writelane_b32 v23, s9, 53
-; CHECK-NEXT:    v_writelane_b32 v23, s10, 54
-; CHECK-NEXT:    v_writelane_b32 v23, s11, 55
-; CHECK-NEXT:    v_writelane_b32 v23, s12, 56
-; CHECK-NEXT:    v_writelane_b32 v23, s13, 57
-; CHECK-NEXT:    v_writelane_b32 v23, s14, 58
-; CHECK-NEXT:    ; implicit-def: $vgpr0
-; CHECK-NEXT:    v_writelane_b32 v23, s15, 59
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; def s[34:35]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; def s[44:47]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; def s[0:7]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_writelane_b32 v23, s0, 60
-; CHECK-NEXT:    v_writelane_b32 v0, s4, 0
-; CHECK-NEXT:    v_writelane_b32 v23, s1, 61
-; CHECK-NEXT:    v_writelane_b32 v0, s5, 1
-; CHECK-NEXT:    v_writelane_b32 v23, s2, 62
-; CHECK-NEXT:    v_writelane_b32 v0, s6, 2
-; CHECK-NEXT:    v_writelane_b32 v23, s3, 63
-; CHECK-NEXT:    v_writelane_b32 v0, s7, 3
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; def s[0:15]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_writelane_b32 v0, s0, 4
-; CHECK-NEXT:    v_writelane_b32 v0, s1, 5
-; CHECK-NEXT:    v_writelane_b32 v0, s2, 6
-; CHECK-NEXT:    v_writelane_b32 v0, s3, 7
-; CHECK-NEXT:    v_writelane_b32 v0, s4, 8
-; CHECK-NEXT:    v_writelane_b32 v0, s5, 9
-; CHECK-NEXT:    v_writelane_b32 v0, s6, 10
-; CHECK-NEXT:    v_writelane_b32 v0, s7, 11
-; CHECK-NEXT:    v_writelane_b32 v0, s8, 12
-; CHECK-NEXT:    v_writelane_b32 v0, s9, 13
-; CHECK-NEXT:    v_writelane_b32 v0, s10, 14
-; CHECK-NEXT:    v_writelane_b32 v0, s11, 15
-; CHECK-NEXT:    v_writelane_b32 v0, s12, 16
-; CHECK-NEXT:    v_writelane_b32 v0, s13, 17
-; CHECK-NEXT:    v_writelane_b32 v0, s14, 18
-; CHECK-NEXT:    v_writelane_b32 v0, s15, 19
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; def s[54:55]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; def s[0:3]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_writelane_b32 v0, s0, 20
-; CHECK-NEXT:    v_writelane_b32 v0, s1, 21
-; CHECK-NEXT:    v_writelane_b32 v0, s2, 22
-; CHECK-NEXT:    v_writelane_b32 v0, s3, 23
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; def s[0:7]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_writelane_b32 v0, s0, 24
-; CHECK-NEXT:    v_writelane_b32 v0, s1, 25
-; CHECK-NEXT:    v_writelane_b32 v0, s2, 26
-; CHECK-NEXT:    v_writelane_b32 v0, s3, 27
-; CHECK-NEXT:    v_writelane_b32 v0, s4, 28
-; CHECK-NEXT:    v_writelane_b32 v0, s5, 29
-; CHECK-NEXT:    v_writelane_b32 v0, s6, 30
-; CHECK-NEXT:    v_writelane_b32 v0, s7, 31
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; def s[0:15]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_writelane_b32 v0, s0, 32
-; CHECK-NEXT:    v_writelane_b32 v0, s1, 33
-; CHECK-NEXT:    v_writelane_b32 v0, s2, 34
-; CHECK-NEXT:    v_writelane_b32 v0, s3, 35
-; CHECK-NEXT:    v_writelane_b32 v0, s4, 36
-; CHECK-NEXT:    v_writelane_b32 v0, s5, 37
-; CHECK-NEXT:    v_writelane_b32 v0, s6, 38
-; CHECK-NEXT:    v_writelane_b32 v0, s7, 39
-; CHECK-NEXT:    v_writelane_b32 v0, s8, 40
-; CHECK-NEXT:    v_writelane_b32 v0, s9, 41
-; CHECK-NEXT:    v_writelane_b32 v0, s10, 42
-; CHECK-NEXT:    v_writelane_b32 v0, s11, 43
-; CHECK-NEXT:    v_writelane_b32 v0, s12, 44
-; CHECK-NEXT:    v_writelane_b32 v0, s13, 45
-; CHECK-NEXT:    v_writelane_b32 v0, s14, 46
-; CHECK-NEXT:    v_writelane_b32 v0, s15, 47
-; CHECK-NEXT:    s_cbranch_scc0 .LBB0_2
-; CHECK-NEXT:  ; %bb.1: ; %ret
-; CHECK-NEXT:    ; kill: killed $vgpr23
-; CHECK-NEXT:    ; kill: killed $vgpr0
-; CHECK-NEXT:    s_endpgm
-; CHECK-NEXT:  .LBB0_2: ; %bb0
-; CHECK-NEXT:    v_readlane_b32 s0, v23, 0
-; CHECK-NEXT:    v_readlane_b32 s1, v23, 1
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use s[0:1]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_readlane_b32 s0, v23, 2
-; CHECK-NEXT:    v_readlane_b32 s1, v23, 3
-; CHECK-NEXT:    v_readlane_b32 s2, v23, 4
-; CHECK-NEXT:    v_readlane_b32 s3, v23, 5
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use s[0:3]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_readlane_b32 s0, v23, 6
-; CHECK-NEXT:    v_readlane_b32 s1, v23, 7
-; CHECK-NEXT:    v_readlane_b32 s2, v23, 8
-; CHECK-NEXT:    v_readlane_b32 s3, v23, 9
-; CHECK-NEXT:    v_readlane_b32 s4, v23, 10
-; CHECK-NEXT:    v_readlane_b32 s5, v23, 11
-; CHECK-NEXT:    v_readlane_b32 s6, v23, 12
-; CHECK-NEXT:    v_readlane_b32 s7, v23, 13
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use s[0:7]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_readlane_b32 s0, v23, 14
-; CHECK-NEXT:    v_readlane_b32 s1, v23, 15
-; CHECK-NEXT:    v_readlane_b32 s2, v23, 16
-; CHECK-NEXT:    v_readlane_b32 s3, v23, 17
-; CHECK-NEXT:    v_readlane_b32 s4, v23, 18
-; CHECK-NEXT:    v_readlane_b32 s5, v23, 19
-; CHECK-NEXT:    v_readlane_b32 s6, v23, 20
-; CHECK-NEXT:    v_readlane_b32 s7, v23, 21
-; CHECK-NEXT:    v_readlane_b32 s8, v23, 22
-; CHECK-NEXT:    v_readlane_b32 s9, v23, 23
-; CHECK-NEXT:    v_readlane_b32 s10, v23, 24
-; CHECK-NEXT:    v_readlane_b32 s11, v23, 25
-; CHECK-NEXT:    v_readlane_b32 s12, v23, 26
-; CHECK-NEXT:    v_readlane_b32 s13, v23, 27
-; CHECK-NEXT:    v_readlane_b32 s14, v23, 28
-; CHECK-NEXT:    v_readlane_b32 s15, v23, 29
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use s[0:15]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_readlane_b32 s0, v23, 30
-; CHECK-NEXT:    v_readlane_b32 s1, v23, 31
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use s[0:1]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_readlane_b32 s0, v23, 32
-; CHECK-NEXT:    v_readlane_b32 s1, v23, 33
-; CHECK-NEXT:    v_readlane_b32 s2, v23, 34
-; CHECK-NEXT:    v_readlane_b32 s3, v23, 35
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use s[0:3]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_readlane_b32 s0, v23, 36
-; CHECK-NEXT:    v_readlane_b32 s1, v23, 37
-; CHECK-NEXT:    v_readlane_b32 s2, v23, 38
-; CHECK-NEXT:    v_readlane_b32 s3, v23, 39
-; CHECK-NEXT:    v_readlane_b32 s4, v23, 40
-; CHECK-NEXT:    v_readlane_b32 s5, v23, 41
-; CHECK-NEXT:    v_readlane_b32 s6, v23, 42
-; CHECK-NEXT:    v_readlane_b32 s7, v23, 43
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use s[0:7]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_readlane_b32 s0, v23, 44
-; CHECK-NEXT:    v_readlane_b32 s1, v23, 45
-; CHECK-NEXT:    v_readlane_b32 s2, v23, 46
-; CHECK-NEXT:    v_readlane_b32 s3, v23, 47
-; CHECK-NEXT:    v_readlane_b32 s4, v23, 48
-; CHECK-NEXT:    v_readlane_b32 s5, v23, 49
-; CHECK-NEXT:    v_readlane_b32 s6, v23, 50
-; CHECK-NEXT:    v_readlane_b32 s7, v23, 51
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use s[16:31]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use s[52:53]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use s[48:51]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use s[36:43]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_readlane_b32 s8, v23, 52
-; CHECK-NEXT:    v_readlane_b32 s9, v23, 53
-; CHECK-NEXT:    v_readlane_b32 s10, v23, 54
-; CHECK-NEXT:    v_readlane_b32 s11, v23, 55
-; CHECK-NEXT:    v_readlane_b32 s12, v23, 56
-; CHECK-NEXT:    v_readlane_b32 s13, v23, 57
-; CHECK-NEXT:    v_readlane_b32 s14, v23, 58
-; CHECK-NEXT:    v_readlane_b32 s15, v23, 59
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use s[0:15]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_readlane_b32 s0, v23, 60
-; CHECK-NEXT:    v_readlane_b32 s1, v23, 61
-; CHECK-NEXT:    v_readlane_b32 s2, v23, 62
-; CHECK-NEXT:    v_readlane_b32 s3, v23, 63
-; CHECK-NEXT:    v_readlane_b32 s4, v0, 0
-; CHECK-NEXT:    v_readlane_b32 s5, v0, 1
-; CHECK-NEXT:    v_readlane_b32 s6, v0, 2
-; CHECK-NEXT:    v_readlane_b32 s7, v0, 3
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use s[34:35]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use s[44:47]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use s[0:7]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_readlane_b32 s0, v0, 4
-; CHECK-NEXT:    v_readlane_b32 s1, v0, 5
-; CHECK-NEXT:    v_readlane_b32 s2, v0, 6
-; CHECK-NEXT:    v_readlane_b32 s3, v0, 7
-; CHECK-NEXT:    v_readlane_b32 s4, v0, 8
-; CHECK-NEXT:    v_readlane_b32 s5, v0, 9
-; CHECK-NEXT:    v_readlane_b32 s6, v0, 10
-; CHECK-NEXT:    v_readlane_b32 s7, v0, 11
-; CHECK-NEXT:    v_readlane_b32 s8, v0, 12
-; CHECK-NEXT:    v_readlane_b32 s9, v0, 13
-; CHECK-NEXT:    v_readlane_b32 s10, v0, 14
-; CHECK-NEXT:    v_readlane_b32 s11, v0, 15
-; CHECK-NEXT:    v_readlane_b32 s12, v0, 16
-; CHECK-NEXT:    v_readlane_b32 s13, v0, 17
-; CHECK-NEXT:    v_readlane_b32 s14, v0, 18
-; CHECK-NEXT:    v_readlane_b32 s15, v0, 19
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use s[0:15]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_readlane_b32 s0, v0, 20
-; CHECK-NEXT:    v_readlane_b32 s1, v0, 21
-; CHECK-NEXT:    v_readlane_b32 s2, v0, 22
-; CHECK-NEXT:    v_readlane_b32 s3, v0, 23
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use s[54:55]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use s[0:3]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_readlane_b32 s0, v0, 24
-; CHECK-NEXT:    v_readlane_b32 s1, v0, 25
-; CHECK-NEXT:    v_readlane_b32 s2, v0, 26
-; CHECK-NEXT:    v_readlane_b32 s3, v0, 27
-; CHECK-NEXT:    v_readlane_b32 s4, v0, 28
-; CHECK-NEXT:    v_readlane_b32 s5, v0, 29
-; CHECK-NEXT:    v_readlane_b32 s6, v0, 30
-; CHECK-NEXT:    v_readlane_b32 s7, v0, 31
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use s[0:7]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_readlane_b32 s0, v0, 32
-; CHECK-NEXT:    v_readlane_b32 s1, v0, 33
-; CHECK-NEXT:    v_readlane_b32 s2, v0, 34
-; CHECK-NEXT:    v_readlane_b32 s3, v0, 35
-; CHECK-NEXT:    v_readlane_b32 s4, v0, 36
-; CHECK-NEXT:    v_readlane_b32 s5, v0, 37
-; CHECK-NEXT:    v_readlane_b32 s6, v0, 38
-; CHECK-NEXT:    v_readlane_b32 s7, v0, 39
-; CHECK-NEXT:    v_readlane_b32 s8, v0, 40
-; CHECK-NEXT:    v_readlane_b32 s9, v0, 41
-; CHECK-NEXT:    v_readlane_b32 s10, v0, 42
-; CHECK-NEXT:    v_readlane_b32 s11, v0, 43
-; CHECK-NEXT:    v_readlane_b32 s12, v0, 44
-; CHECK-NEXT:    v_readlane_b32 s13, v0, 45
-; CHECK-NEXT:    v_readlane_b32 s14, v0, 46
-; CHECK-NEXT:    v_readlane_b32 s15, v0, 47
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use s[0:15]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    ; kill: killed $vgpr23
-; CHECK-NEXT:    ; kill: killed $vgpr0
-; CHECK-NEXT:    s_endpgm
   call void asm sideeffect "", "~{v[0:7]}" () #0
   call void asm sideeffect "", "~{v[8:15]}" () #0
   call void asm sideeffect "", "~{v[16:19]}"() #0

diff  --git a/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll b/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll
index ba506a88e1665f..61d4c7d271e3a3 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll
@@ -21,7 +21,6 @@
 ; DEFAULT-NEXT: Virtual Register Map
 ; DEFAULT-NEXT: Live Register Matrix
 ; DEFAULT-NEXT: Greedy Register Allocator
-; DEFAULT-NEXT: SI Lower WWM Copies
 ; DEFAULT-NEXT: GCN NSA Reassign
 ; DEFAULT-NEXT: Virtual Register Rewriter
 ; DEFAULT-NEXT: Stack Slot Coloring
@@ -29,7 +28,6 @@
 ; O0: Fast Register Allocator
 ; O0-NEXT: SI lower SGPR spill instructions
 ; O0-NEXT: Fast Register Allocator
-; O0-NEXT: SI Lower WWM Copies
 ; O0-NEXT: SI Fix VGPR copies
 
 
@@ -51,7 +49,6 @@
 ; BASIC-DEFAULT-NEXT: Lazy Machine Block Frequency Analysis
 ; BASIC-DEFAULT-NEXT: Machine Optimization Remark Emitter
 ; BASIC-DEFAULT-NEXT: Greedy Register Allocator
-; BASIC-DEFAULT-NEXT: SI Lower WWM Copies
 ; BASIC-DEFAULT-NEXT: GCN NSA Reassign
 ; BASIC-DEFAULT-NEXT: Virtual Register Rewriter
 ; BASIC-DEFAULT-NEXT: Stack Slot Coloring
@@ -64,7 +61,6 @@
 ; DEFAULT-BASIC-NEXT: Virtual Register Map
 ; DEFAULT-BASIC-NEXT: Live Register Matrix
 ; DEFAULT-BASIC-NEXT: Basic Register Allocator
-; DEFAULT-BASIC-NEXT: SI Lower WWM Copies
 ; DEFAULT-BASIC-NEXT: GCN NSA Reassign
 ; DEFAULT-BASIC-NEXT: Virtual Register Rewriter
 ; DEFAULT-BASIC-NEXT: Stack Slot Coloring
@@ -83,7 +79,6 @@
 ; BASIC-BASIC-NEXT: Virtual Register Map
 ; BASIC-BASIC-NEXT: Live Register Matrix
 ; BASIC-BASIC-NEXT: Basic Register Allocator
-; BASIC-BASIC-NEXT: SI Lower WWM Copies
 ; BASIC-BASIC-NEXT: GCN NSA Reassign
 ; BASIC-BASIC-NEXT: Virtual Register Rewriter
 ; BASIC-BASIC-NEXT: Stack Slot Coloring

diff  --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-dead-frame-in-dbg-value.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill-dead-frame-in-dbg-value.mir
index 341fe2d9b453fa..687adc69bd1486 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-dead-frame-in-dbg-value.mir
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-dead-frame-in-dbg-value.mir
@@ -1,5 +1,4 @@
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -amdgpu-spill-sgpr-to-vgpr=true -verify-machineinstrs -run-pass=si-lower-sgpr-spills -o - %s | FileCheck -check-prefix=SGPR_SPILL %s
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -amdgpu-spill-sgpr-to-vgpr=true -verify-machineinstrs --start-before=si-lower-sgpr-spills --stop-after=prologepilog -o - %s | FileCheck -check-prefix=PEI %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -amdgpu-spill-sgpr-to-vgpr=true -verify-machineinstrs -run-pass=si-lower-sgpr-spills,prologepilog -o - %s | FileCheck %s
 
 # After handling the SGPR spill to VGPR in SILowerSGPRSpills pass, replace the dead frame index in the DBG_VALUE instruction with reg 0.
 # Otherwise, the test would crash during PEI while trying to replace the dead frame index.
@@ -40,33 +39,13 @@ machineFunctionInfo:
     workGroupIDX:    { reg: '$sgpr8' }
     privateSegmentWaveByteOffset: { reg: '$sgpr9' }
 body:             |
-  ; SGPR_SPILL-LABEL: name: test
-  ; SGPR_SPILL: bb.0:
-  ; SGPR_SPILL-NEXT:   successors: %bb.1(0x80000000)
-  ; SGPR_SPILL-NEXT: {{  $}}
-  ; SGPR_SPILL-NEXT:   [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; SGPR_SPILL-NEXT:   renamable $sgpr10 = IMPLICIT_DEF
-  ; SGPR_SPILL-NEXT:   [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr10, 0, [[V_WRITELANE_B32_]]
-  ; SGPR_SPILL-NEXT:   DBG_VALUE $noreg, 0
-  ; SGPR_SPILL-NEXT: {{  $}}
-  ; SGPR_SPILL-NEXT: bb.1:
-  ; SGPR_SPILL-NEXT:   $sgpr10 = V_READLANE_B32 [[V_WRITELANE_B32_]], 0
-  ; SGPR_SPILL-NEXT:   KILL [[V_WRITELANE_B32_]]
-  ; SGPR_SPILL-NEXT:   S_ENDPGM 0
-  ; PEI-LABEL: name: test
-  ; PEI: bb.0:
-  ; PEI-NEXT:   successors: %bb.1(0x80000000)
-  ; PEI-NEXT: {{  $}}
-  ; PEI-NEXT:   renamable $vgpr0 = IMPLICIT_DEF
-  ; PEI-NEXT:   renamable $sgpr10 = IMPLICIT_DEF
-  ; PEI-NEXT:   renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr10, 0, killed $vgpr0
-  ; PEI-NEXT: {{  $}}
-  ; PEI-NEXT: bb.1:
-  ; PEI-NEXT:   liveins: $vgpr0
-  ; PEI-NEXT: {{  $}}
-  ; PEI-NEXT:   $sgpr10 = V_READLANE_B32 $vgpr0, 0
-  ; PEI-NEXT:   KILL killed renamable $vgpr0
-  ; PEI-NEXT:   S_ENDPGM 0
+  ; CHECK-LABEL: name: test
+  ; CHECK: bb.0:
+  ; CHECK:   $vgpr0 = V_WRITELANE_B32 killed $sgpr10, 0, $vgpr0
+  ; CHECK:   DBG_VALUE $noreg, 0
+  ; CHECK: bb.1:
+  ; CHECK:   $sgpr10 = V_READLANE_B32 $vgpr0, 0
+  ; CHECK:   S_ENDPGM 0
   bb.0:
     renamable $sgpr10 = IMPLICIT_DEF
     SI_SPILL_S32_SAVE killed $sgpr10, %stack.0, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32

diff  --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-fi-skip-processing-stack-arg-dbg-value.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill-fi-skip-processing-stack-arg-dbg-value.mir
index a6cb7d4af7641c..4694810379fe05 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-fi-skip-processing-stack-arg-dbg-value.mir
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-fi-skip-processing-stack-arg-dbg-value.mir
@@ -1,4 +1,4 @@
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -amdgpu-spill-sgpr-to-vgpr=true -verify-machineinstrs -run-pass=si-lower-sgpr-spills -o - %s | FileCheck %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -amdgpu-spill-sgpr-to-vgpr=true -verify-machineinstrs -run-pass=si-lower-sgpr-spills,prologepilog -o - %s | FileCheck %s
 
 # After handling the SGPR spill to VGPR in SILowerSGPRSpills pass, we replace the dead frame index in the DBG_VALUE instruction with reg 0.
 # Skip looking for frame indices in the debug value instruction for incoming arguments passed via stack. The test would crash otherwise.
@@ -45,7 +45,7 @@ machineFunctionInfo:
 body:             |
   ; CHECK-LABEL: name: test
   ; CHECK: bb.0:
-  ; CHECK:   DBG_VALUE
+  ; CHECK:   DBG_VALUE $noreg, 0
   bb.0:
     renamable $sgpr10 = IMPLICIT_DEF
     SI_SPILL_S32_SAVE killed $sgpr10, %stack.0, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32

diff  --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll b/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll
index 90307f1c7905b9..96f0287fb7779d 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll
@@ -9,17 +9,9 @@ define amdgpu_kernel void @partial_no_vgprs_last_sgpr_spill(ptr addrspace(1) %ou
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_add_u32 s0, s0, s15
 ; GCN-NEXT:    s_addc_u32 s1, s1, 0
-; GCN-NEXT:    ; implicit-def: $vgpr0
-; GCN-NEXT:    ; implicit-def: $vgpr0
 ; GCN-NEXT:    s_load_dword s4, s[8:9], 0x2
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ;;#ASMEND
-; GCN-NEXT:    s_or_saveexec_b64 s[24:25], -1
-; GCN-NEXT:    buffer_load_dword v1, off, s[0:3], 0 offset:8 ; 4-byte Folded Reload
-; GCN-NEXT:    s_mov_b64 exec, s[24:25]
-; GCN-NEXT:    s_or_saveexec_b64 s[24:25], -1
-; GCN-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload
-; GCN-NEXT:    s_mov_b64 exec, s[24:25]
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ;;#ASMEND
 ; GCN-NEXT:    ;;#ASMSTART
@@ -31,179 +23,179 @@ define amdgpu_kernel void @partial_no_vgprs_last_sgpr_spill(ptr addrspace(1) %ou
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ; def s[8:23]
 ; GCN-NEXT:    ;;#ASMEND
-; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_writelane_b32 v1, s8, 0
-; GCN-NEXT:    v_writelane_b32 v1, s9, 1
-; GCN-NEXT:    v_writelane_b32 v1, s10, 2
-; GCN-NEXT:    v_writelane_b32 v1, s11, 3
-; GCN-NEXT:    v_writelane_b32 v1, s12, 4
-; GCN-NEXT:    v_writelane_b32 v1, s13, 5
-; GCN-NEXT:    v_writelane_b32 v1, s14, 6
-; GCN-NEXT:    v_writelane_b32 v1, s15, 7
-; GCN-NEXT:    v_writelane_b32 v1, s16, 8
-; GCN-NEXT:    v_writelane_b32 v1, s17, 9
-; GCN-NEXT:    v_writelane_b32 v1, s18, 10
-; GCN-NEXT:    v_writelane_b32 v1, s19, 11
-; GCN-NEXT:    v_writelane_b32 v1, s20, 12
-; GCN-NEXT:    v_writelane_b32 v1, s21, 13
-; GCN-NEXT:    v_writelane_b32 v1, s22, 14
-; GCN-NEXT:    v_writelane_b32 v1, s23, 15
+; GCN-NEXT:    v_writelane_b32 v23, s8, 0
+; GCN-NEXT:    v_writelane_b32 v23, s9, 1
+; GCN-NEXT:    v_writelane_b32 v23, s10, 2
+; GCN-NEXT:    v_writelane_b32 v23, s11, 3
+; GCN-NEXT:    v_writelane_b32 v23, s12, 4
+; GCN-NEXT:    v_writelane_b32 v23, s13, 5
+; GCN-NEXT:    v_writelane_b32 v23, s14, 6
+; GCN-NEXT:    v_writelane_b32 v23, s15, 7
+; GCN-NEXT:    v_writelane_b32 v23, s16, 8
+; GCN-NEXT:    v_writelane_b32 v23, s17, 9
+; GCN-NEXT:    v_writelane_b32 v23, s18, 10
+; GCN-NEXT:    v_writelane_b32 v23, s19, 11
+; GCN-NEXT:    v_writelane_b32 v23, s20, 12
+; GCN-NEXT:    v_writelane_b32 v23, s21, 13
+; GCN-NEXT:    v_writelane_b32 v23, s22, 14
+; GCN-NEXT:    v_writelane_b32 v23, s23, 15
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ; def s[8:23]
 ; GCN-NEXT:    ;;#ASMEND
-; GCN-NEXT:    v_writelane_b32 v1, s8, 16
-; GCN-NEXT:    v_writelane_b32 v1, s9, 17
-; GCN-NEXT:    v_writelane_b32 v1, s10, 18
-; GCN-NEXT:    v_writelane_b32 v1, s11, 19
-; GCN-NEXT:    v_writelane_b32 v1, s12, 20
-; GCN-NEXT:    v_writelane_b32 v1, s13, 21
-; GCN-NEXT:    v_writelane_b32 v1, s14, 22
-; GCN-NEXT:    v_writelane_b32 v1, s15, 23
-; GCN-NEXT:    v_writelane_b32 v1, s16, 24
-; GCN-NEXT:    v_writelane_b32 v1, s17, 25
-; GCN-NEXT:    v_writelane_b32 v1, s18, 26
-; GCN-NEXT:    v_writelane_b32 v1, s19, 27
-; GCN-NEXT:    v_writelane_b32 v1, s20, 28
-; GCN-NEXT:    v_writelane_b32 v1, s21, 29
-; GCN-NEXT:    v_writelane_b32 v1, s22, 30
-; GCN-NEXT:    v_writelane_b32 v1, s23, 31
+; GCN-NEXT:    v_writelane_b32 v23, s8, 16
+; GCN-NEXT:    v_writelane_b32 v23, s9, 17
+; GCN-NEXT:    v_writelane_b32 v23, s10, 18
+; GCN-NEXT:    v_writelane_b32 v23, s11, 19
+; GCN-NEXT:    v_writelane_b32 v23, s12, 20
+; GCN-NEXT:    v_writelane_b32 v23, s13, 21
+; GCN-NEXT:    v_writelane_b32 v23, s14, 22
+; GCN-NEXT:    v_writelane_b32 v23, s15, 23
+; GCN-NEXT:    v_writelane_b32 v23, s16, 24
+; GCN-NEXT:    v_writelane_b32 v23, s17, 25
+; GCN-NEXT:    v_writelane_b32 v23, s18, 26
+; GCN-NEXT:    v_writelane_b32 v23, s19, 27
+; GCN-NEXT:    v_writelane_b32 v23, s20, 28
+; GCN-NEXT:    v_writelane_b32 v23, s21, 29
+; GCN-NEXT:    v_writelane_b32 v23, s22, 30
+; GCN-NEXT:    v_writelane_b32 v23, s23, 31
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ; def s[8:23]
 ; GCN-NEXT:    ;;#ASMEND
-; GCN-NEXT:    v_writelane_b32 v1, s8, 32
-; GCN-NEXT:    v_writelane_b32 v1, s9, 33
-; GCN-NEXT:    v_writelane_b32 v1, s10, 34
-; GCN-NEXT:    v_writelane_b32 v1, s11, 35
-; GCN-NEXT:    v_writelane_b32 v1, s12, 36
-; GCN-NEXT:    v_writelane_b32 v1, s13, 37
-; GCN-NEXT:    v_writelane_b32 v1, s14, 38
-; GCN-NEXT:    v_writelane_b32 v1, s15, 39
-; GCN-NEXT:    v_writelane_b32 v1, s16, 40
-; GCN-NEXT:    v_writelane_b32 v1, s17, 41
-; GCN-NEXT:    v_writelane_b32 v1, s18, 42
-; GCN-NEXT:    v_writelane_b32 v1, s19, 43
-; GCN-NEXT:    v_writelane_b32 v1, s20, 44
-; GCN-NEXT:    v_writelane_b32 v1, s21, 45
-; GCN-NEXT:    v_writelane_b32 v1, s22, 46
-; GCN-NEXT:    v_writelane_b32 v1, s23, 47
+; GCN-NEXT:    v_writelane_b32 v23, s8, 32
+; GCN-NEXT:    v_writelane_b32 v23, s9, 33
+; GCN-NEXT:    v_writelane_b32 v23, s10, 34
+; GCN-NEXT:    v_writelane_b32 v23, s11, 35
+; GCN-NEXT:    v_writelane_b32 v23, s12, 36
+; GCN-NEXT:    v_writelane_b32 v23, s13, 37
+; GCN-NEXT:    v_writelane_b32 v23, s14, 38
+; GCN-NEXT:    v_writelane_b32 v23, s15, 39
+; GCN-NEXT:    v_writelane_b32 v23, s16, 40
+; GCN-NEXT:    v_writelane_b32 v23, s17, 41
+; GCN-NEXT:    v_writelane_b32 v23, s18, 42
+; GCN-NEXT:    v_writelane_b32 v23, s19, 43
+; GCN-NEXT:    v_writelane_b32 v23, s20, 44
+; GCN-NEXT:    v_writelane_b32 v23, s21, 45
+; GCN-NEXT:    v_writelane_b32 v23, s22, 46
+; GCN-NEXT:    v_writelane_b32 v23, s23, 47
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ; def s[8:23]
 ; GCN-NEXT:    ;;#ASMEND
-; GCN-NEXT:    v_writelane_b32 v1, s8, 48
-; GCN-NEXT:    v_writelane_b32 v1, s9, 49
-; GCN-NEXT:    v_writelane_b32 v1, s10, 50
-; GCN-NEXT:    v_writelane_b32 v1, s11, 51
-; GCN-NEXT:    v_writelane_b32 v1, s12, 52
-; GCN-NEXT:    v_writelane_b32 v1, s13, 53
-; GCN-NEXT:    v_writelane_b32 v1, s14, 54
-; GCN-NEXT:    v_writelane_b32 v1, s15, 55
-; GCN-NEXT:    v_writelane_b32 v1, s16, 56
-; GCN-NEXT:    v_writelane_b32 v1, s17, 57
-; GCN-NEXT:    v_writelane_b32 v1, s18, 58
-; GCN-NEXT:    v_writelane_b32 v1, s19, 59
-; GCN-NEXT:    v_writelane_b32 v1, s20, 60
-; GCN-NEXT:    v_writelane_b32 v1, s21, 61
-; GCN-NEXT:    v_writelane_b32 v1, s22, 62
-; GCN-NEXT:    v_writelane_b32 v1, s23, 63
-; GCN-NEXT:    s_or_saveexec_b64 s[24:25], -1
-; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], 0 offset:8 ; 4-byte Folded Spill
-; GCN-NEXT:    s_mov_b64 exec, s[24:25]
+; GCN-NEXT:    v_writelane_b32 v23, s8, 48
+; GCN-NEXT:    v_writelane_b32 v23, s9, 49
+; GCN-NEXT:    v_writelane_b32 v23, s10, 50
+; GCN-NEXT:    v_writelane_b32 v23, s11, 51
+; GCN-NEXT:    v_writelane_b32 v23, s12, 52
+; GCN-NEXT:    v_writelane_b32 v23, s13, 53
+; GCN-NEXT:    v_writelane_b32 v23, s14, 54
+; GCN-NEXT:    v_writelane_b32 v23, s15, 55
+; GCN-NEXT:    v_writelane_b32 v23, s16, 56
+; GCN-NEXT:    v_writelane_b32 v23, s17, 57
+; GCN-NEXT:    v_writelane_b32 v23, s18, 58
+; GCN-NEXT:    v_writelane_b32 v23, s19, 59
+; GCN-NEXT:    v_writelane_b32 v23, s20, 60
+; GCN-NEXT:    v_writelane_b32 v23, s21, 61
+; GCN-NEXT:    v_writelane_b32 v23, s22, 62
+; GCN-NEXT:    v_writelane_b32 v23, s23, 63
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ; def s[6:7]
 ; GCN-NEXT:    ;;#ASMEND
-; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    s_mov_b64 s[8:9], exec
+; GCN-NEXT:    s_mov_b64 exec, 3
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GCN-NEXT:    v_writelane_b32 v0, s6, 0
 ; GCN-NEXT:    v_writelane_b32 v0, s7, 1
-; GCN-NEXT:    s_or_saveexec_b64 s[24:25], -1
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Spill
-; GCN-NEXT:    s_mov_b64 exec, s[24:25]
+; GCN-NEXT:    buffer_load_dword v0, off, s[0:3], 0
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_mov_b64 exec, s[8:9]
 ; GCN-NEXT:    s_mov_b32 s5, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_cmp_lg_u32 s4, s5
 ; GCN-NEXT:    s_cbranch_scc1 .LBB0_2
 ; GCN-NEXT:  ; %bb.1: ; %bb0
-; GCN-NEXT:    s_or_saveexec_b64 s[24:25], -1
-; GCN-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload
-; GCN-NEXT:    s_mov_b64 exec, s[24:25]
-; GCN-NEXT:    s_or_saveexec_b64 s[24:25], -1
-; GCN-NEXT:    buffer_load_dword v1, off, s[0:3], 0 offset:8 ; 4-byte Folded Reload
-; GCN-NEXT:    s_mov_b64 exec, s[24:25]
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_readlane_b32 s4, v1, 0
-; GCN-NEXT:    v_readlane_b32 s5, v1, 1
-; GCN-NEXT:    v_readlane_b32 s6, v1, 2
-; GCN-NEXT:    v_readlane_b32 s7, v1, 3
-; GCN-NEXT:    v_readlane_b32 s8, v1, 4
-; GCN-NEXT:    v_readlane_b32 s9, v1, 5
-; GCN-NEXT:    v_readlane_b32 s10, v1, 6
-; GCN-NEXT:    v_readlane_b32 s11, v1, 7
-; GCN-NEXT:    v_readlane_b32 s12, v1, 8
-; GCN-NEXT:    v_readlane_b32 s13, v1, 9
-; GCN-NEXT:    v_readlane_b32 s14, v1, 10
-; GCN-NEXT:    v_readlane_b32 s15, v1, 11
-; GCN-NEXT:    v_readlane_b32 s16, v1, 12
-; GCN-NEXT:    v_readlane_b32 s17, v1, 13
-; GCN-NEXT:    v_readlane_b32 s18, v1, 14
-; GCN-NEXT:    v_readlane_b32 s19, v1, 15
+; GCN-NEXT:    v_readlane_b32 s4, v23, 0
+; GCN-NEXT:    v_readlane_b32 s5, v23, 1
+; GCN-NEXT:    v_readlane_b32 s6, v23, 2
+; GCN-NEXT:    v_readlane_b32 s7, v23, 3
+; GCN-NEXT:    v_readlane_b32 s8, v23, 4
+; GCN-NEXT:    v_readlane_b32 s9, v23, 5
+; GCN-NEXT:    v_readlane_b32 s10, v23, 6
+; GCN-NEXT:    v_readlane_b32 s11, v23, 7
+; GCN-NEXT:    v_readlane_b32 s12, v23, 8
+; GCN-NEXT:    v_readlane_b32 s13, v23, 9
+; GCN-NEXT:    v_readlane_b32 s14, v23, 10
+; GCN-NEXT:    v_readlane_b32 s15, v23, 11
+; GCN-NEXT:    v_readlane_b32 s16, v23, 12
+; GCN-NEXT:    v_readlane_b32 s17, v23, 13
+; GCN-NEXT:    v_readlane_b32 s18, v23, 14
+; GCN-NEXT:    v_readlane_b32 s19, v23, 15
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ; use s[4:19]
 ; GCN-NEXT:    ;;#ASMEND
-; GCN-NEXT:    v_readlane_b32 s4, v1, 16
-; GCN-NEXT:    v_readlane_b32 s5, v1, 17
-; GCN-NEXT:    v_readlane_b32 s6, v1, 18
-; GCN-NEXT:    v_readlane_b32 s7, v1, 19
-; GCN-NEXT:    v_readlane_b32 s8, v1, 20
-; GCN-NEXT:    v_readlane_b32 s9, v1, 21
-; GCN-NEXT:    v_readlane_b32 s10, v1, 22
-; GCN-NEXT:    v_readlane_b32 s11, v1, 23
-; GCN-NEXT:    v_readlane_b32 s12, v1, 24
-; GCN-NEXT:    v_readlane_b32 s13, v1, 25
-; GCN-NEXT:    v_readlane_b32 s14, v1, 26
-; GCN-NEXT:    v_readlane_b32 s15, v1, 27
-; GCN-NEXT:    v_readlane_b32 s16, v1, 28
-; GCN-NEXT:    v_readlane_b32 s17, v1, 29
-; GCN-NEXT:    v_readlane_b32 s18, v1, 30
-; GCN-NEXT:    v_readlane_b32 s19, v1, 31
+; GCN-NEXT:    v_readlane_b32 s4, v23, 16
+; GCN-NEXT:    v_readlane_b32 s5, v23, 17
+; GCN-NEXT:    v_readlane_b32 s6, v23, 18
+; GCN-NEXT:    v_readlane_b32 s7, v23, 19
+; GCN-NEXT:    v_readlane_b32 s8, v23, 20
+; GCN-NEXT:    v_readlane_b32 s9, v23, 21
+; GCN-NEXT:    v_readlane_b32 s10, v23, 22
+; GCN-NEXT:    v_readlane_b32 s11, v23, 23
+; GCN-NEXT:    v_readlane_b32 s12, v23, 24
+; GCN-NEXT:    v_readlane_b32 s13, v23, 25
+; GCN-NEXT:    v_readlane_b32 s14, v23, 26
+; GCN-NEXT:    v_readlane_b32 s15, v23, 27
+; GCN-NEXT:    v_readlane_b32 s16, v23, 28
+; GCN-NEXT:    v_readlane_b32 s17, v23, 29
+; GCN-NEXT:    v_readlane_b32 s18, v23, 30
+; GCN-NEXT:    v_readlane_b32 s19, v23, 31
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ; use s[4:19]
 ; GCN-NEXT:    ;;#ASMEND
-; GCN-NEXT:    v_readlane_b32 s4, v1, 32
-; GCN-NEXT:    v_readlane_b32 s5, v1, 33
-; GCN-NEXT:    v_readlane_b32 s6, v1, 34
-; GCN-NEXT:    v_readlane_b32 s7, v1, 35
-; GCN-NEXT:    v_readlane_b32 s8, v1, 36
-; GCN-NEXT:    v_readlane_b32 s9, v1, 37
-; GCN-NEXT:    v_readlane_b32 s10, v1, 38
-; GCN-NEXT:    v_readlane_b32 s11, v1, 39
-; GCN-NEXT:    v_readlane_b32 s12, v1, 40
-; GCN-NEXT:    v_readlane_b32 s13, v1, 41
-; GCN-NEXT:    v_readlane_b32 s14, v1, 42
-; GCN-NEXT:    v_readlane_b32 s15, v1, 43
-; GCN-NEXT:    v_readlane_b32 s16, v1, 44
-; GCN-NEXT:    v_readlane_b32 s17, v1, 45
-; GCN-NEXT:    v_readlane_b32 s18, v1, 46
-; GCN-NEXT:    v_readlane_b32 s19, v1, 47
+; GCN-NEXT:    v_readlane_b32 s4, v23, 32
+; GCN-NEXT:    v_readlane_b32 s5, v23, 33
+; GCN-NEXT:    v_readlane_b32 s6, v23, 34
+; GCN-NEXT:    v_readlane_b32 s7, v23, 35
+; GCN-NEXT:    v_readlane_b32 s8, v23, 36
+; GCN-NEXT:    v_readlane_b32 s9, v23, 37
+; GCN-NEXT:    v_readlane_b32 s10, v23, 38
+; GCN-NEXT:    v_readlane_b32 s11, v23, 39
+; GCN-NEXT:    v_readlane_b32 s12, v23, 40
+; GCN-NEXT:    v_readlane_b32 s13, v23, 41
+; GCN-NEXT:    v_readlane_b32 s14, v23, 42
+; GCN-NEXT:    v_readlane_b32 s15, v23, 43
+; GCN-NEXT:    v_readlane_b32 s16, v23, 44
+; GCN-NEXT:    v_readlane_b32 s17, v23, 45
+; GCN-NEXT:    v_readlane_b32 s18, v23, 46
+; GCN-NEXT:    v_readlane_b32 s19, v23, 47
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ; use s[4:19]
 ; GCN-NEXT:    ;;#ASMEND
-; GCN-NEXT:    v_readlane_b32 s8, v1, 48
-; GCN-NEXT:    v_readlane_b32 s9, v1, 49
-; GCN-NEXT:    v_readlane_b32 s10, v1, 50
-; GCN-NEXT:    v_readlane_b32 s11, v1, 51
-; GCN-NEXT:    v_readlane_b32 s12, v1, 52
-; GCN-NEXT:    v_readlane_b32 s13, v1, 53
-; GCN-NEXT:    v_readlane_b32 s14, v1, 54
-; GCN-NEXT:    v_readlane_b32 s15, v1, 55
-; GCN-NEXT:    v_readlane_b32 s16, v1, 56
-; GCN-NEXT:    v_readlane_b32 s17, v1, 57
-; GCN-NEXT:    v_readlane_b32 s18, v1, 58
-; GCN-NEXT:    v_readlane_b32 s19, v1, 59
-; GCN-NEXT:    v_readlane_b32 s20, v1, 60
-; GCN-NEXT:    v_readlane_b32 s21, v1, 61
-; GCN-NEXT:    v_readlane_b32 s22, v1, 62
-; GCN-NEXT:    v_readlane_b32 s23, v1, 63
+; GCN-NEXT:    v_readlane_b32 s8, v23, 48
+; GCN-NEXT:    v_readlane_b32 s9, v23, 49
+; GCN-NEXT:    v_readlane_b32 s10, v23, 50
+; GCN-NEXT:    v_readlane_b32 s11, v23, 51
+; GCN-NEXT:    v_readlane_b32 s12, v23, 52
+; GCN-NEXT:    v_readlane_b32 s13, v23, 53
+; GCN-NEXT:    v_readlane_b32 s14, v23, 54
+; GCN-NEXT:    v_readlane_b32 s15, v23, 55
+; GCN-NEXT:    v_readlane_b32 s16, v23, 56
+; GCN-NEXT:    v_readlane_b32 s17, v23, 57
+; GCN-NEXT:    v_readlane_b32 s18, v23, 58
+; GCN-NEXT:    v_readlane_b32 s19, v23, 59
+; GCN-NEXT:    v_readlane_b32 s20, v23, 60
+; GCN-NEXT:    v_readlane_b32 s21, v23, 61
+; GCN-NEXT:    v_readlane_b32 s22, v23, 62
+; GCN-NEXT:    v_readlane_b32 s23, v23, 63
+; GCN-NEXT:    s_mov_b64 s[6:7], exec
+; GCN-NEXT:    s_mov_b64 exec, 3
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_readlane_b32 s4, v0, 0
 ; GCN-NEXT:    v_readlane_b32 s5, v0, 1
+; GCN-NEXT:    buffer_load_dword v0, off, s[0:3], 0
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_mov_b64 exec, s[6:7]
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ; use s[8:23]
 ; GCN-NEXT:    ;;#ASMEND
@@ -211,14 +203,6 @@ define amdgpu_kernel void @partial_no_vgprs_last_sgpr_spill(ptr addrspace(1) %ou
 ; GCN-NEXT:    ; use s[4:5]
 ; GCN-NEXT:    ;;#ASMEND
 ; GCN-NEXT:  .LBB0_2: ; %ret
-; GCN-NEXT:    s_or_saveexec_b64 s[24:25], -1
-; GCN-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload
-; GCN-NEXT:    s_mov_b64 exec, s[24:25]
-; GCN-NEXT:    s_or_saveexec_b64 s[24:25], -1
-; GCN-NEXT:    buffer_load_dword v1, off, s[0:3], 0 offset:8 ; 4-byte Folded Reload
-; GCN-NEXT:    s_mov_b64 exec, s[24:25]
-; GCN-NEXT:    ; kill: killed $vgpr1
-; GCN-NEXT:    ; kill: killed $vgpr0
 ; GCN-NEXT:    s_endpgm
   call void asm sideeffect "", "~{v[0:7]}" () #0
   call void asm sideeffect "", "~{v[8:15]}" () #0

diff  --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-partially-undef.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill-partially-undef.mir
index 26a5eedc3eca3d..9596d3b7f63595 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-partially-undef.mir
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-partially-undef.mir
@@ -20,11 +20,10 @@ body:             |
     liveins: $sgpr4
 
     ; CHECK-LABEL: name: sgpr_spill_s64_undef_high32
-    ; CHECK: liveins: $sgpr4
+    ; CHECK: liveins: $sgpr4, $vgpr0
     ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-    ; CHECK-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr4, 0, [[V_WRITELANE_B32_]], implicit-def $sgpr4_sgpr5, implicit $sgpr4_sgpr5
-    ; CHECK-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr5, 1, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5
+    ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr4, 0, $vgpr0, implicit-def $sgpr4_sgpr5, implicit $sgpr4_sgpr5
+    ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr5, 1, $vgpr0, implicit $sgpr4_sgpr5
     SI_SPILL_S64_SAVE renamable $sgpr4_sgpr5, %stack.0, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 :: (store (s64) into %stack.0, align 4, addrspace 5)
 
 ...
@@ -46,11 +45,10 @@ body:             |
     liveins: $sgpr5
 
     ; CHECK-LABEL: name: sgpr_spill_s64_undef_low32
-    ; CHECK: liveins: $sgpr5
+    ; CHECK: liveins: $sgpr5, $vgpr0
     ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-    ; CHECK-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr4, 0, [[V_WRITELANE_B32_]], implicit-def $sgpr4_sgpr5, implicit $sgpr4_sgpr5
-    ; CHECK-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr5, 1, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5
+    ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr4, 0, $vgpr0, implicit-def $sgpr4_sgpr5, implicit $sgpr4_sgpr5
+    ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr5, 1, $vgpr0, implicit $sgpr4_sgpr5
     SI_SPILL_S64_SAVE renamable $sgpr4_sgpr5, %stack.0, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 :: (store (s64) into %stack.0, align 4, addrspace 5)
 
 ...

diff  --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-update-only-slot-indexes.ll b/llvm/test/CodeGen/AMDGPU/sgpr-spill-update-only-slot-indexes.ll
index 813e702288bb19..b20f540cf24720 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-update-only-slot-indexes.ll
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-update-only-slot-indexes.ll
@@ -13,40 +13,32 @@ define amdgpu_kernel void @kernel() {
 ; GCN-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GCN-NEXT:    s_mov_b32 s38, -1
 ; GCN-NEXT:    s_mov_b32 s39, 0xe00000
-; GCN-NEXT:    ; implicit-def: $vgpr3
+; GCN-NEXT:    v_writelane_b32 v40, s4, 0
 ; GCN-NEXT:    s_add_u32 s36, s36, s11
-; GCN-NEXT:    v_writelane_b32 v3, s4, 0
-; GCN-NEXT:    s_movk_i32 s32, 0x400
+; GCN-NEXT:    v_writelane_b32 v40, s5, 1
 ; GCN-NEXT:    s_addc_u32 s37, s37, 0
-; GCN-NEXT:    s_mov_b32 s14, s10
+; GCN-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GCN-NEXT:    v_readlane_b32 s0, v40, 0
 ; GCN-NEXT:    s_mov_b32 s13, s9
 ; GCN-NEXT:    s_mov_b32 s12, s8
-; GCN-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GCN-NEXT:    v_writelane_b32 v3, s5, 1
-; GCN-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GCN-NEXT:    buffer_store_dword v3, off, s[36:39], 0 offset:4 ; 4-byte Folded Spill
-; GCN-NEXT:    s_mov_b64 exec, s[34:35]
-; GCN-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GCN-NEXT:    v_readlane_b32 s0, v3, 0
-; GCN-NEXT:    v_readlane_b32 s1, v3, 1
+; GCN-NEXT:    v_readlane_b32 s1, v40, 1
 ; GCN-NEXT:    s_add_u32 s8, s0, 36
 ; GCN-NEXT:    s_addc_u32 s9, s1, 0
 ; GCN-NEXT:    s_getpc_b64 s[0:1]
 ; GCN-NEXT:    s_add_u32 s0, s0, foo at gotpcrel32@lo+4
 ; GCN-NEXT:    s_addc_u32 s1, s1, foo at gotpcrel32@hi+12
 ; GCN-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
+; GCN-NEXT:    s_mov_b32 s14, s10
+; GCN-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GCN-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GCN-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GCN-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GCN-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GCN-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GCN-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GCN-NEXT:    s_mov_b32 s32, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GCN-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GCN-NEXT:    buffer_load_dword v0, off, s[36:39], 0 offset:4 ; 4-byte Folded Reload
-; GCN-NEXT:    s_mov_b64 exec, s[34:35]
-; GCN-NEXT:    ; kill: killed $vgpr0
 ; GCN-NEXT:    s_endpgm
   call void @foo()
   ret void

diff  --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-vmem-large-frame.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill-vmem-large-frame.mir
index c44892baca366e..bbbe87848ae4e3 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-vmem-large-frame.mir
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-vmem-large-frame.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -amdgpu-spill-sgpr-to-vgpr=false -verify-machineinstrs -start-before=si-lower-sgpr-spills -stop-after=prologepilog -o - %s | FileCheck %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -amdgpu-spill-sgpr-to-vgpr=false -verify-machineinstrs -run-pass=si-lower-sgpr-spills,prologepilog -o - %s | FileCheck %s
 
 # Check that we allocate 2 emergency stack slots if we're spilling
 # SGPRs to memory and potentially have an offset larger than fits in
@@ -29,7 +29,7 @@ body:             |
     ; CHECK-NEXT: $sgpr4_sgpr5 = S_MOV_B64 $exec
     ; CHECK-NEXT: $exec = S_MOV_B64 1, implicit-def $vgpr1
     ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5)
-    ; CHECK-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr10, 0, undef $vgpr1
+    ; CHECK-NEXT: $vgpr1 = V_WRITELANE_B32 killed $sgpr10, 0, undef $vgpr1
     ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5)
     ; CHECK-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5)
     ; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5, implicit killed $vgpr1

diff  --git a/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll b/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll
index f4ae5877d1063d..a41dbdfecdd675 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll
@@ -16,10 +16,10 @@ define void @spill_sgpr_with_no_lower_vgpr_available() #0 {
 ; GCN-LABEL: spill_sgpr_with_no_lower_vgpr_available:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s18, s33
+; GCN-NEXT:    s_mov_b32 s24, s33
 ; GCN-NEXT:    s_mov_b32 s33, s32
 ; GCN-NEXT:    s_or_saveexec_b64 s[16:17], -1
-; GCN-NEXT:    buffer_store_dword v255, off, s[0:3], s33 offset:448 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v255, off, s[0:3], s33 offset:452 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 exec, s[16:17]
 ; GCN-NEXT:    s_add_i32 s32, s32, 0x7400
 ; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:440 ; 4-byte Folded Spill
@@ -135,13 +135,13 @@ define void @spill_sgpr_with_no_lower_vgpr_available() #0 {
 ; GCN-NEXT:    buffer_store_dword v254, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GCN-NEXT:    v_writelane_b32 v255, s30, 0
 ; GCN-NEXT:    v_writelane_b32 v255, s31, 1
-; GCN-NEXT:    buffer_store_dword v31, off, s[0:3], s33 offset:452 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v31, off, s[0:3], s33 offset:448 ; 4-byte Folded Spill
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:444
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ;;#ASMEND
-; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s33 offset:452 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s33 offset:448 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_getpc_b64 s[16:17]
 ; GCN-NEXT:    s_add_u32 s16, s16, child_function at gotpcrel32@lo+4
 ; GCN-NEXT:    s_addc_u32 s17, s17, child_function at gotpcrel32@hi+12
@@ -266,10 +266,10 @@ define void @spill_sgpr_with_no_lower_vgpr_available() #0 {
 ; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:436 ; 4-byte Folded Reload
 ; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:440 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
-; GCN-NEXT:    buffer_load_dword v255, off, s[0:3], s33 offset:448 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v255, off, s[0:3], s33 offset:452 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    s_add_i32 s32, s32, 0xffff8c00
-; GCN-NEXT:    s_mov_b32 s33, s18
+; GCN-NEXT:    s_mov_b32 s33, s24
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %alloca = alloca i32, align 4, addrspace(5)
@@ -310,10 +310,10 @@ define void @spill_to_lowest_available_vgpr() #0 {
 ; GCN-LABEL: spill_to_lowest_available_vgpr:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s18, s33
+; GCN-NEXT:    s_mov_b32 s24, s33
 ; GCN-NEXT:    s_mov_b32 s33, s32
 ; GCN-NEXT:    s_or_saveexec_b64 s[16:17], -1
-; GCN-NEXT:    buffer_store_dword v254, off, s[0:3], s33 offset:444 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v254, off, s[0:3], s33 offset:448 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 exec, s[16:17]
 ; GCN-NEXT:    s_add_i32 s32, s32, 0x7400
 ; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:436 ; 4-byte Folded Spill
@@ -428,13 +428,13 @@ define void @spill_to_lowest_available_vgpr() #0 {
 ; GCN-NEXT:    buffer_store_dword v253, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GCN-NEXT:    v_writelane_b32 v254, s30, 0
 ; GCN-NEXT:    v_writelane_b32 v254, s31, 1
-; GCN-NEXT:    buffer_store_dword v31, off, s[0:3], s33 offset:448 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v31, off, s[0:3], s33 offset:444 ; 4-byte Folded Spill
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:440
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ;;#ASMEND
-; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s33 offset:448 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s33 offset:444 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_getpc_b64 s[16:17]
 ; GCN-NEXT:    s_add_u32 s16, s16, child_function at gotpcrel32@lo+4
 ; GCN-NEXT:    s_addc_u32 s17, s17, child_function at gotpcrel32@hi+12
@@ -558,10 +558,10 @@ define void @spill_to_lowest_available_vgpr() #0 {
 ; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:432 ; 4-byte Folded Reload
 ; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:436 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
-; GCN-NEXT:    buffer_load_dword v254, off, s[0:3], s33 offset:444 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v254, off, s[0:3], s33 offset:448 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    s_add_i32 s32, s32, 0xffff8c00
-; GCN-NEXT:    s_mov_b32 s33, s18
+; GCN-NEXT:    s_mov_b32 s33, s24
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %alloca = alloca i32, align 4, addrspace(5)
@@ -602,8 +602,8 @@ define void @spill_sgpr_with_sgpr_uses() #0 {
 ; GCN-LABEL: spill_sgpr_with_sgpr_uses:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
+; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT:    buffer_store_dword v254, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
 ; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
@@ -715,38 +715,22 @@ define void @spill_sgpr_with_sgpr_uses() #0 {
 ; GCN-NEXT:    buffer_store_dword v251, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; GCN-NEXT:    buffer_store_dword v252, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; GCN-NEXT:    buffer_store_dword v253, off, s[0:3], s32 ; 4-byte Folded Spill
-; GCN-NEXT:    ; implicit-def: $vgpr0
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:440
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ;;#ASMEND
-; GCN-NEXT:    s_or_saveexec_b64 s[8:9], -1
-; GCN-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
-; GCN-NEXT:    s_mov_b64 exec, s[8:9]
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ; def s4
 ; GCN-NEXT:    ;;#ASMEND
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_writelane_b32 v0, s4, 0
-; GCN-NEXT:    s_or_saveexec_b64 s[8:9], -1
-; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
-; GCN-NEXT:    s_mov_b64 exec, s[8:9]
+; GCN-NEXT:    v_writelane_b32 v254, s4, 0
 ; GCN-NEXT:    s_cbranch_scc1 .LBB3_2
 ; GCN-NEXT:  ; %bb.1: ; %bb0
-; GCN-NEXT:    s_or_saveexec_b64 s[8:9], -1
-; GCN-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
-; GCN-NEXT:    s_mov_b64 exec, s[8:9]
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_readlane_b32 s4, v0, 0
+; GCN-NEXT:    v_readlane_b32 s4, v254, 0
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ; use s4
 ; GCN-NEXT:    ;;#ASMEND
 ; GCN-NEXT:  .LBB3_2: ; %ret
-; GCN-NEXT:    s_or_saveexec_b64 s[8:9], -1
-; GCN-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
-; GCN-NEXT:    s_mov_b64 exec, s[8:9]
-; GCN-NEXT:    ; kill: killed $vgpr0
 ; GCN-NEXT:    buffer_load_dword v253, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GCN-NEXT:    buffer_load_dword v252, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
 ; GCN-NEXT:    buffer_load_dword v251, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
@@ -857,8 +841,8 @@ define void @spill_sgpr_with_sgpr_uses() #0 {
 ; GCN-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
 ; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
 ; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
-; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GCN-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
+; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT:    buffer_load_dword v254, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
@@ -1183,7 +1167,7 @@ define void @spill_sgpr_no_free_vgpr(ptr addrspace(1) %out, ptr addrspace(1) %in
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GCN-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
 ; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
@@ -1315,17 +1299,17 @@ define void @spill_sgpr_no_free_vgpr(ptr addrspace(1) %out, ptr addrspace(1) %in
 ; GCN-NEXT:    ; implicit-def: $sgpr4_sgpr5
 ; GCN-NEXT:    flat_load_dwordx4 v[5:8], v[2:3]
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ;;#ASMEND
-; GCN-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ;;#ASMEND
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
@@ -1447,7 +1431,7 @@ define void @spill_sgpr_no_free_vgpr(ptr addrspace(1) %out, ptr addrspace(1) %in
 ; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
 ; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GCN-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
@@ -1528,7 +1512,7 @@ define void @spill_sgpr_no_free_vgpr_ipra() #0 {
 ; GCN-LABEL: spill_sgpr_no_free_vgpr_ipra:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s18, s33
+; GCN-NEXT:    s_mov_b32 s24, s33
 ; GCN-NEXT:    s_mov_b32 s33, s32
 ; GCN-NEXT:    s_add_i32 s32, s32, 0x7400
 ; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:444 ; 4-byte Folded Spill
@@ -1798,7 +1782,7 @@ define void @spill_sgpr_no_free_vgpr_ipra() #0 {
 ; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:440 ; 4-byte Folded Reload
 ; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:444 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_add_i32 s32, s32, 0xffff8c00
-; GCN-NEXT:    s_mov_b32 s33, s18
+; GCN-NEXT:    s_mov_b32 s33, s24
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   call void @child_function_ipra()

diff  --git a/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll b/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll
index 3a5638344d751c..f82b9e4637bac9 100644
--- a/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll
@@ -6,17 +6,17 @@
 ; ALL: s_mov_b32 s[[HI:[0-9]+]], 0xe80000
 
 ; Make sure we are handling hazards correctly.
-; SGPR: v_mov_b32_e32 v0, vcc_lo
-; SGPR-NEXT: s_or_saveexec_b64 [[EXEC_COPY:s\[[0-9]+:[0-9]+\]]], -1
-; SGPR-NEXT: buffer_load_dword [[VHI:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:4 ; 4-byte Folded Reload
-; SGPR-NEXT: s_mov_b64 exec, [[EXEC_COPY]]
+; SGPR: buffer_load_dword [[VHI:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:4
 ; SGPR-NEXT: s_waitcnt vmcnt(0)
 ; SGPR-NEXT: v_readlane_b32 s{{[0-9]+}}, [[VHI]], 0
 ; SGPR-NEXT: v_readlane_b32 s{{[0-9]+}}, [[VHI]], 1
 ; SGPR-NEXT: v_readlane_b32 s{{[0-9]+}}, [[VHI]], 2
 ; SGPR-NEXT: v_readlane_b32 s[[HI:[0-9]+]], [[VHI]], 3
-; SGPR-NEXT: ; kill: killed $vgpr1
-; SGPR-NEXT: s_nop 4
+; SGPR-NEXT: buffer_load_dword [[VHI]], off, s[96:99], 0
+; SGPR-NEXT: s_waitcnt vmcnt(0)
+; SGPR-NEXT: s_mov_b64 exec, s[4:5]
+; SGPR-NEXT: s_nop 1
+; SGPR-NEXT: buffer_store_dword v0, off, s[0:3], 0
 
 ; ALL: s_endpgm
 define amdgpu_kernel void @test(ptr addrspace(1) %out, i32 %in) {

diff  --git a/llvm/test/CodeGen/AMDGPU/sibling-call.ll b/llvm/test/CodeGen/AMDGPU/sibling-call.ll
index a45f6ed9ef0452..1c426e281df31d 100644
--- a/llvm/test/CodeGen/AMDGPU/sibling-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/sibling-call.ll
@@ -202,9 +202,10 @@ entry:
 ; GCN-NEXT: s_mov_b32 s33, s32
 ; GCN-NEXT: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1
 ; GCN-NEXT: buffer_store_dword [[CSRV:v[0-9]+]], off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword [[CSRV_1:v[0-9]+]], off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
 ; GCN-NEXT: s_mov_b64 exec
-; GCN-DAG: s_addk_i32 s32, 0x400
-; GCN: v_writelane_b32 [[CSRV]], [[FP_SCRATCH_COPY]], 2
+; GCN-DAG: s_addk_i32 s32, 0x800
+; GCN: v_writelane_b32 [[CSRV_1]], [[FP_SCRATCH_COPY]], 0
 
 ; GCN-DAG: s_getpc_b64 s[4:5]
 ; GCN-DAG: s_add_u32 s4, s4, i32_fastcc_i32_i32 at gotpcrel32@lo+4
@@ -226,11 +227,12 @@ entry:
 ; GCN-NEXT: s_addc_u32 s5, s5, sibling_call_i32_fastcc_i32_i32 at rel32@hi+12
 ; GCN-NEXT: v_readlane_b32 s31, [[CSRV]], 1
 ; GCN-NEXT: v_readlane_b32 s30, [[CSRV]], 0
-; GCN-NEXT: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], [[CSRV]], 2
+; GCN-NEXT: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], [[CSRV_1]], 0
 ; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1
 ; GCN-NEXT: buffer_load_dword [[CSRV]], off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword [[CSRV_1]], off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
 ; GCN-NEXT: s_mov_b64 exec, s[8:9]
-; GCN-NEXT: s_addk_i32 s32, 0xfc00
+; GCN-NEXT: s_addk_i32 s32, 0xf800
 ; GCN-NEXT: s_mov_b32 s33, [[FP_SCRATCH_COPY]]
 ; GCN-NEXT: s_setpc_b64 s[4:5]
 define fastcc i32 @sibling_call_i32_fastcc_i32_i32_other_call(i32 %a, i32 %b, i32 %c) #1 {

diff  --git a/llvm/test/CodeGen/AMDGPU/snippet-copy-bundle-regression.mir b/llvm/test/CodeGen/AMDGPU/snippet-copy-bundle-regression.mir
index 55639a27fd5c74..4a2996dcd2c6a7 100644
--- a/llvm/test/CodeGen/AMDGPU/snippet-copy-bundle-regression.mir
+++ b/llvm/test/CodeGen/AMDGPU/snippet-copy-bundle-regression.mir
@@ -33,7 +33,6 @@ body:             |
   ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
   ; CHECK-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   renamable $vgpr1 = IMPLICIT_DEF
   ; CHECK-NEXT:   renamable $sgpr34_sgpr35 = IMPLICIT_DEF
   ; CHECK-NEXT:   dead renamable $vgpr0 = IMPLICIT_DEF
   ; CHECK-NEXT:   renamable $sgpr41 = IMPLICIT_DEF
@@ -42,15 +41,7 @@ body:             |
   ; CHECK-NEXT:   renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX8_IMM renamable $sgpr38_sgpr39, 0, 0 :: (dereferenceable invariant load (s256), align 16, addrspace 4)
   ; CHECK-NEXT:   dead renamable $sgpr4 = S_LOAD_DWORD_IMM renamable $sgpr38_sgpr39, 48, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4)
   ; CHECK-NEXT:   renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 = S_LOAD_DWORDX8_IMM renamable $sgpr38_sgpr39, 56, 0 :: (dereferenceable invariant load (s256), align 8, addrspace 4)
-  ; CHECK-NEXT:   renamable $vgpr1 = V_WRITELANE_B32 $sgpr4, 0, killed $vgpr1, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11
-  ; CHECK-NEXT:   renamable $vgpr1 = V_WRITELANE_B32 $sgpr5, 1, killed $vgpr1
-  ; CHECK-NEXT:   renamable $vgpr1 = V_WRITELANE_B32 $sgpr6, 2, killed $vgpr1
-  ; CHECK-NEXT:   renamable $vgpr1 = V_WRITELANE_B32 $sgpr7, 3, killed $vgpr1
-  ; CHECK-NEXT:   renamable $vgpr1 = V_WRITELANE_B32 $sgpr8, 4, killed $vgpr1
-  ; CHECK-NEXT:   renamable $vgpr1 = V_WRITELANE_B32 $sgpr9, 5, killed $vgpr1
-  ; CHECK-NEXT:   renamable $vgpr1 = V_WRITELANE_B32 $sgpr10, 6, killed $vgpr1
-  ; CHECK-NEXT:   renamable $vgpr1 = V_WRITELANE_B32 killed $sgpr11, 7, killed $vgpr1, implicit killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11
-  ; CHECK-NEXT:   SI_SPILL_WWM_V32_SAVE killed $vgpr1, %stack.1, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5)
+  ; CHECK-NEXT:   SI_SPILL_S256_SAVE killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, %stack.0, implicit $exec, implicit $sgpr32 :: (store (s256) into %stack.0, align 4, addrspace 5)
   ; CHECK-NEXT:   dead renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM renamable $sgpr44_sgpr45, 0, 0 :: (invariant load (s64), align 16, addrspace 4)
   ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
   ; CHECK-NEXT:   $vgpr1 = COPY renamable $sgpr51
@@ -63,50 +54,30 @@ body:             |
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT:   liveins: $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000FC00
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   renamable $vgpr1 = SI_SPILL_WWM_V32_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5)
-  ; CHECK-NEXT:   $sgpr4 = V_READLANE_B32 $vgpr1, 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11
-  ; CHECK-NEXT:   $sgpr5 = V_READLANE_B32 $vgpr1, 1
-  ; CHECK-NEXT:   $sgpr6 = V_READLANE_B32 $vgpr1, 2
-  ; CHECK-NEXT:   $sgpr7 = V_READLANE_B32 $vgpr1, 3
-  ; CHECK-NEXT:   $sgpr8 = V_READLANE_B32 $vgpr1, 4
-  ; CHECK-NEXT:   $sgpr9 = V_READLANE_B32 $vgpr1, 5
-  ; CHECK-NEXT:   $sgpr10 = V_READLANE_B32 $vgpr1, 6
-  ; CHECK-NEXT:   $sgpr11 = V_READLANE_B32 $vgpr1, 7
-  ; CHECK-NEXT:   $noreg = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
-  ; CHECK-NEXT:   $exec = S_MOV_B64 killed $noreg
+  ; CHECK-NEXT:   renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 = SI_SPILL_S256_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s256) from %stack.0, align 4, addrspace 5)
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT:   liveins: $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000FC00
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   renamable $vgpr1 = SI_SPILL_WWM_V32_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5)
-  ; CHECK-NEXT:   $sgpr4 = V_READLANE_B32 $vgpr1, 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11
-  ; CHECK-NEXT:   $sgpr5 = V_READLANE_B32 $vgpr1, 1
-  ; CHECK-NEXT:   $sgpr6 = V_READLANE_B32 $vgpr1, 2
-  ; CHECK-NEXT:   $sgpr7 = V_READLANE_B32 $vgpr1, 3
-  ; CHECK-NEXT:   $sgpr8 = V_READLANE_B32 $vgpr1, 4
-  ; CHECK-NEXT:   $sgpr9 = V_READLANE_B32 $vgpr1, 5
-  ; CHECK-NEXT:   $sgpr10 = V_READLANE_B32 $vgpr1, 6
-  ; CHECK-NEXT:   $sgpr11 = V_READLANE_B32 $vgpr1, 7
+  ; CHECK-NEXT:   renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 = SI_SPILL_S256_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s256) from %stack.0, align 4, addrspace 5)
   ; CHECK-NEXT:   S_CMP_LG_U64 renamable $sgpr4_sgpr5, 0, implicit-def $scc
-  ; CHECK-NEXT:   $noreg = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
-  ; CHECK-NEXT:   $exec = S_MOV_B64 killed $noreg
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
   ; CHECK-NEXT:   successors: %bb.5(0x40000000), %bb.4(0x40000000)
-  ; CHECK-NEXT:   liveins: $vgpr1, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11:0x00000000000003F0, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000FC00
+  ; CHECK-NEXT:   liveins: $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11:0x00000000000003F0, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000FC00
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   S_CBRANCH_VCCZ %bb.5, implicit undef $vcc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.5(0x80000000)
-  ; CHECK-NEXT:   liveins: $vgpr1, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11:0x00000000000003F0, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000FC00
+  ; CHECK-NEXT:   liveins: $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11:0x00000000000003F0, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000FC00
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   S_CMP_EQ_U32 renamable $sgpr8, 0, implicit-def $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.5:
-  ; CHECK-NEXT:   liveins: $vgpr1, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11:0x00000000000000F0, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000FC00
+  ; CHECK-NEXT:   liveins: $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11:0x00000000000000F0, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000FC00
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   dead renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr38_sgpr39, 40, 0 :: (dereferenceable invariant load (s64), addrspace 4)
   ; CHECK-NEXT:   GLOBAL_STORE_DWORD_SADDR undef renamable $vgpr0, undef renamable $vgpr0, killed renamable $sgpr6_sgpr7, 0, 0, implicit $exec :: (store (s32), addrspace 1)
@@ -116,7 +87,6 @@ body:             |
   ; CHECK-NEXT:   $sgpr6_sgpr7 = COPY killed renamable $sgpr36_sgpr37
   ; CHECK-NEXT:   $sgpr10_sgpr11 = COPY killed renamable $sgpr34_sgpr35
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
-  ; CHECK-NEXT:   KILL killed renamable $vgpr1
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
     liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr14, $sgpr15, $sgpr16

diff  --git a/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll b/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll
index b045dd559aac26..8d49223215ca57 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll
@@ -2,22 +2,20 @@
 
 ; GCN-LABEL: {{^}}spill_csr_s5_copy:
 ; GCN: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33
-; GCN: s_xor_saveexec_b64
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
-; GCN-NEXT: s_mov_b64 exec, -1
+; GCN: s_or_saveexec_b64
 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; GCN-NEXT: s_mov_b64 exec
-; GCN: v_writelane_b32 v40, [[FP_SCRATCH_COPY]], 4
+; GCN: v_writelane_b32 v41, [[FP_SCRATCH_COPY]], 0
 ; GCN: s_swappc_b64
 
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 9
 ; GCN: buffer_store_dword [[K]], off, s[0:3], s33{{$}}
 
-; GCN: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], v40, 4
-; GCN: s_xor_saveexec_b64
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
-; GCN-NEXT: s_mov_b64 exec, -1
+; GCN: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], v41, 0
+; GCN: s_or_saveexec_b64
 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
 ; GCN: s_mov_b64 exec
 ; GCN: s_mov_b32 s33, [[FP_SCRATCH_COPY]]
 ; GCN: s_setpc_b64

diff  --git a/llvm/test/CodeGen/AMDGPU/spill-reg-tuple-super-reg-use.mir b/llvm/test/CodeGen/AMDGPU/spill-reg-tuple-super-reg-use.mir
index 95000a143de808..6c18983672d7dd 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-reg-tuple-super-reg-use.mir
+++ b/llvm/test/CodeGen/AMDGPU/spill-reg-tuple-super-reg-use.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -march=amdgcn -mcpu=gfx900 -start-before=si-lower-sgpr-spills -stop-after=prologepilog -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s
+# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=si-lower-sgpr-spills,prologepilog,machine-cp -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s
 
 # Make sure the initial first $sgpr1 = COPY $sgpr2 copy is not deleted
 # by the copy propagation after lowering the spill.
@@ -25,14 +25,12 @@ body:             |
     ; GCN-NEXT: $sgpr8_sgpr9 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
     ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5)
     ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr8_sgpr9
-    ; GCN-NEXT: renamable $vgpr0 = IMPLICIT_DEF
     ; GCN-NEXT: renamable $sgpr1 = COPY $sgpr2
-    ; GCN-NEXT: renamable $vgpr0 = V_WRITELANE_B32 $sgpr0, 0, killed $vgpr0, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3
-    ; GCN-NEXT: renamable $vgpr0 = V_WRITELANE_B32 $sgpr1, 1, killed $vgpr0
-    ; GCN-NEXT: renamable $vgpr0 = V_WRITELANE_B32 $sgpr2, 2, killed $vgpr0
-    ; GCN-NEXT: renamable $vgpr0 = V_WRITELANE_B32 $sgpr3, 3, killed $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3
-    ; GCN-NEXT: renamable $sgpr8 = COPY renamable $sgpr1
-    ; GCN-NEXT: KILL killed renamable $vgpr0
+    ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr0, 0, $vgpr0, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr1, 1, $vgpr0
+    ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr2, 2, $vgpr0
+    ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr3, 3, $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GCN-NEXT: renamable $sgpr8 = COPY killed renamable $sgpr1
     ; GCN-NEXT: $sgpr0_sgpr1 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
     ; GCN-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 0, 0, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5)
     ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1
@@ -64,13 +62,11 @@ body:             |
     ; GCN-NEXT: $sgpr8_sgpr9 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
     ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5)
     ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr8_sgpr9
-    ; GCN-NEXT: renamable $vgpr0 = IMPLICIT_DEF
     ; GCN-NEXT: renamable $sgpr1 = COPY $sgpr2
-    ; GCN-NEXT: renamable $vgpr0 = V_WRITELANE_B32 $sgpr0, 0, killed $vgpr0, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3
-    ; GCN-NEXT: renamable $vgpr0 = V_WRITELANE_B32 $sgpr1, 1, killed $vgpr0
-    ; GCN-NEXT: renamable $vgpr0 = V_WRITELANE_B32 $sgpr2, 2, killed $vgpr0
-    ; GCN-NEXT: renamable $vgpr0 = V_WRITELANE_B32 $sgpr3, 3, killed $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3
-    ; GCN-NEXT: KILL killed renamable $vgpr0
+    ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr0, 0, $vgpr0, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr1, 1, $vgpr0
+    ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr2, 2, $vgpr0
+    ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr3, 3, $vgpr0, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3
     ; GCN-NEXT: $sgpr0_sgpr1 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
     ; GCN-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 0, 0, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5)
     ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1
@@ -97,12 +93,12 @@ body:             |
     ; GCN-LABEL: name: spill_vgpr128_use_subreg
     ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7
     ; GCN-NEXT: {{  $}}
-    ; GCN-NEXT: renamable $vgpr1 = COPY $vgpr2, implicit $exec
+    ; GCN-NEXT: renamable $vgpr1 = COPY $vgpr2
     ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store (s32) into %stack.0, addrspace 5)
     ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr1, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 4, 0, 0, implicit $exec :: (store (s32) into %stack.0 + 4, addrspace 5)
     ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr2, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 8, 0, 0, implicit $exec :: (store (s32) into %stack.0 + 8, addrspace 5)
     ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr3, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 12, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store (s32) into %stack.0 + 12, addrspace 5)
-    ; GCN-NEXT: renamable $vgpr8 = COPY $vgpr2, implicit $exec
+    ; GCN-NEXT: renamable $vgpr8 = COPY killed renamable $vgpr1
     ; GCN-NEXT: S_ENDPGM 0, implicit $vgpr8
     renamable $vgpr1 = COPY $vgpr2
     SI_SPILL_V128_SAVE renamable $vgpr0_vgpr1_vgpr2_vgpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.0, align 4, addrspace 5)
@@ -127,11 +123,11 @@ body:             |
     ; GCN-LABEL: name: spill_vgpr128_use_kill
     ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7
     ; GCN-NEXT: {{  $}}
-    ; GCN-NEXT: renamable $vgpr1 = COPY $vgpr2, implicit $exec
-    ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store (s32) into %stack.0, addrspace 5)
-    ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr1, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 4, 0, 0, implicit $exec :: (store (s32) into %stack.0 + 4, addrspace 5)
-    ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr2, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 8, 0, 0, implicit $exec :: (store (s32) into %stack.0 + 8, addrspace 5)
-    ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr3, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 12, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store (s32) into %stack.0 + 12, addrspace 5)
+    ; GCN-NEXT: renamable $vgpr1 = COPY $vgpr2
+    ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store (s32) into %stack.0, addrspace 5)
+    ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 4, 0, 0, implicit $exec :: (store (s32) into %stack.0 + 4, addrspace 5)
+    ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 8, 0, 0, implicit $exec :: (store (s32) into %stack.0 + 8, addrspace 5)
+    ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 12, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3 :: (store (s32) into %stack.0 + 12, addrspace 5)
     ; GCN-NEXT: S_ENDPGM 0
     renamable $vgpr1 = COPY $vgpr2
     SI_SPILL_V128_SAVE renamable killed $vgpr0_vgpr1_vgpr2_vgpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.0, align 4, addrspace 5)

diff  --git a/llvm/test/CodeGen/AMDGPU/spill-sgpr-to-virtual-vgpr.mir b/llvm/test/CodeGen/AMDGPU/spill-sgpr-to-virtual-vgpr.mir
deleted file mode 100644
index a77e315dc54a47..00000000000000
--- a/llvm/test/CodeGen/AMDGPU/spill-sgpr-to-virtual-vgpr.mir
+++ /dev/null
@@ -1,319 +0,0 @@
-# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -run-pass=si-lower-sgpr-spills -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s
-
-# A simple SGPR spill. Implicit def for lane VGPR should be inserted just before the spill instruction.
----
-name:            sgpr32_spill
-tracksRegLiveness: true
-frameInfo:
-  maxAlignment:    4
-stack:
-  - { id: 0, type: spill-slot, size: 4, alignment: 4, stack-id: sgpr-spill }
-machineFunctionInfo:
-  isEntryFunction: false
-  scratchRSrcReg:  '$sgpr0_sgpr1_sgpr2_sgpr3'
-  stackPtrOffsetReg: '$sgpr32'
-  frameOffsetReg: '$sgpr33'
-  hasSpilledSGPRs: true
-body:             |
-  bb.0:
-    liveins: $sgpr30_sgpr31, $sgpr10
-    ; GCN-LABEL: name: sgpr32_spill
-    ; GCN: liveins: $sgpr30_sgpr31, $sgpr10
-    ; GCN-NEXT: {{  $}}
-    ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-    ; GCN-NEXT: S_NOP 0
-    ; GCN-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr10, 0, [[V_WRITELANE_B32_]]
-    ; GCN-NEXT: $sgpr10 = V_READLANE_B32 [[V_WRITELANE_B32_]], 0
-    ; GCN-NEXT: S_SETPC_B64 $sgpr30_sgpr31
-    S_NOP 0
-    SI_SPILL_S32_SAVE killed $sgpr10, %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32
-    renamable $sgpr10 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32
-    S_SETPC_B64 $sgpr30_sgpr31
-...
-
-# Needed an additional virtual lane register as the lanes of current register are fully occupied while spilling a wide SGPR tuple.
-# There must be two implicit def for the two lane VGPRs.
-
----
-name:            sgpr_spill_lane_crossover
-tracksRegLiveness: true
-frameInfo:
-  maxAlignment:    4
-stack:
-  - { id: 0, type: spill-slot, size: 4, alignment: 4, stack-id: sgpr-spill }
-  - { id: 1, type: spill-slot, size: 128, alignment: 4, stack-id: sgpr-spill }
-machineFunctionInfo:
-  isEntryFunction: false
-  scratchRSrcReg:  '$sgpr0_sgpr1_sgpr2_sgpr3'
-  stackPtrOffsetReg: '$sgpr32'
-  frameOffsetReg: '$sgpr33'
-  hasSpilledSGPRs: true
-body:             |
-  bb.0:
-    liveins: $sgpr30_sgpr31, $sgpr10, $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71, $sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79, $sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87, $sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN-LABEL: name: sgpr_spill_lane_crossover
-    ; GCN: liveins: $sgpr10, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $sgpr68, $sgpr69, $sgpr70, $sgpr71, $sgpr72, $sgpr73, $sgpr74, $sgpr75, $sgpr76, $sgpr77, $sgpr78, $sgpr79, $sgpr80, $sgpr81, $sgpr82, $sgpr83, $sgpr84, $sgpr85, $sgpr86, $sgpr87, $sgpr88, $sgpr89, $sgpr90, $sgpr91, $sgpr92, $sgpr93, $sgpr94, $sgpr95, $vgpr0, $sgpr30_sgpr31, $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71, $sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79, $sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87, $sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN-NEXT: {{  $}}
-    ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-    ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr64, 0, $vgpr0
-    ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr65, 1, $vgpr0
-    ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr66, 2, $vgpr0
-    ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr67, 3, $vgpr0
-    ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr68, 4, $vgpr0
-    ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr69, 5, $vgpr0
-    ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr70, 6, $vgpr0
-    ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr71, 7, $vgpr0
-    ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr72, 8, $vgpr0
-    ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr73, 9, $vgpr0
-    ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr74, 10, $vgpr0
-    ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr75, 11, $vgpr0
-    ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr76, 12, $vgpr0
-    ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr77, 13, $vgpr0
-    ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr78, 14, $vgpr0
-    ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr79, 15, $vgpr0
-    ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr80, 16, $vgpr0
-    ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr81, 17, $vgpr0
-    ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr82, 18, $vgpr0
-    ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr83, 19, $vgpr0
-    ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr84, 20, $vgpr0
-    ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr85, 21, $vgpr0
-    ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr86, 22, $vgpr0
-    ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr87, 23, $vgpr0
-    ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr88, 24, $vgpr0
-    ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr89, 25, $vgpr0
-    ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr90, 26, $vgpr0
-    ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr91, 27, $vgpr0
-    ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr92, 28, $vgpr0
-    ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr93, 29, $vgpr0
-    ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr94, 30, $vgpr0
-    ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr95, 31, $vgpr0
-    ; GCN-NEXT: S_NOP 0
-    ; GCN-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr10, 0, [[V_WRITELANE_B32_]]
-    ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr64, 1, [[V_WRITELANE_B32_1]], implicit-def $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr65, 2, [[V_WRITELANE_B32_1]]
-    ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr66, 3, [[V_WRITELANE_B32_1]]
-    ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr67, 4, [[V_WRITELANE_B32_1]]
-    ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr68, 5, [[V_WRITELANE_B32_1]]
-    ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr69, 6, [[V_WRITELANE_B32_1]]
-    ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr70, 7, [[V_WRITELANE_B32_1]]
-    ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr71, 8, [[V_WRITELANE_B32_1]]
-    ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr72, 9, [[V_WRITELANE_B32_1]]
-    ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr73, 10, [[V_WRITELANE_B32_1]]
-    ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr74, 11, [[V_WRITELANE_B32_1]]
-    ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr75, 12, [[V_WRITELANE_B32_1]]
-    ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr76, 13, [[V_WRITELANE_B32_1]]
-    ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr77, 14, [[V_WRITELANE_B32_1]]
-    ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr78, 15, [[V_WRITELANE_B32_1]]
-    ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr79, 16, [[V_WRITELANE_B32_1]]
-    ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr80, 17, [[V_WRITELANE_B32_1]]
-    ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr81, 18, [[V_WRITELANE_B32_1]]
-    ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr82, 19, [[V_WRITELANE_B32_1]]
-    ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr83, 20, [[V_WRITELANE_B32_1]]
-    ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr84, 21, [[V_WRITELANE_B32_1]]
-    ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr85, 22, [[V_WRITELANE_B32_1]]
-    ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr86, 23, [[V_WRITELANE_B32_1]]
-    ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr87, 24, [[V_WRITELANE_B32_1]]
-    ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr88, 25, [[V_WRITELANE_B32_1]]
-    ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr89, 26, [[V_WRITELANE_B32_1]]
-    ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr90, 27, [[V_WRITELANE_B32_1]]
-    ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr91, 28, [[V_WRITELANE_B32_1]]
-    ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr92, 29, [[V_WRITELANE_B32_1]]
-    ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr93, 30, [[V_WRITELANE_B32_1]]
-    ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr94, 31, [[V_WRITELANE_B32_1]]
-    ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr95, 32, [[V_WRITELANE_B32_1]], implicit killed $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN-NEXT: S_NOP 0
-    ; GCN-NEXT: $sgpr64 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 1, implicit-def $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN-NEXT: $sgpr65 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 2
-    ; GCN-NEXT: $sgpr66 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 3
-    ; GCN-NEXT: $sgpr67 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 4
-    ; GCN-NEXT: $sgpr68 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 5
-    ; GCN-NEXT: $sgpr69 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 6
-    ; GCN-NEXT: $sgpr70 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 7
-    ; GCN-NEXT: $sgpr71 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 8
-    ; GCN-NEXT: $sgpr72 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 9
-    ; GCN-NEXT: $sgpr73 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 10
-    ; GCN-NEXT: $sgpr74 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 11
-    ; GCN-NEXT: $sgpr75 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 12
-    ; GCN-NEXT: $sgpr76 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 13
-    ; GCN-NEXT: $sgpr77 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 14
-    ; GCN-NEXT: $sgpr78 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 15
-    ; GCN-NEXT: $sgpr79 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 16
-    ; GCN-NEXT: $sgpr80 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 17
-    ; GCN-NEXT: $sgpr81 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 18
-    ; GCN-NEXT: $sgpr82 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 19
-    ; GCN-NEXT: $sgpr83 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 20
-    ; GCN-NEXT: $sgpr84 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 21
-    ; GCN-NEXT: $sgpr85 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 22
-    ; GCN-NEXT: $sgpr86 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 23
-    ; GCN-NEXT: $sgpr87 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 24
-    ; GCN-NEXT: $sgpr88 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 25
-    ; GCN-NEXT: $sgpr89 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 26
-    ; GCN-NEXT: $sgpr90 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 27
-    ; GCN-NEXT: $sgpr91 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 28
-    ; GCN-NEXT: $sgpr92 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 29
-    ; GCN-NEXT: $sgpr93 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 30
-    ; GCN-NEXT: $sgpr94 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 31
-    ; GCN-NEXT: $sgpr95 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 32
-    ; GCN-NEXT: $sgpr10 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 0
-    ; GCN-NEXT: S_SETPC_B64 $sgpr30_sgpr31
-    S_NOP 0
-    SI_SPILL_S32_SAVE killed $sgpr10, %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32
-    SI_SPILL_S1024_SAVE killed $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95, %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32
-    S_NOP 0
-    renamable $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32
-    renamable $sgpr10 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32
-    S_SETPC_B64 $sgpr30_sgpr31
-...
-
-# The implicit def for the lane VGPR should be inserted at the common dominator block (the entry block here).
-
----
-name:            lane_vgpr_implicit_def_at_common_dominator_block
-tracksRegLiveness: true
-frameInfo:
-  maxAlignment:    4
-stack:
-  - { id: 0, type: spill-slot, size: 4, alignment: 4, stack-id: sgpr-spill }
-machineFunctionInfo:
-  isEntryFunction: false
-  scratchRSrcReg:  '$sgpr0_sgpr1_sgpr2_sgpr3'
-  stackPtrOffsetReg: '$sgpr32'
-  frameOffsetReg: '$sgpr33'
-  hasSpilledSGPRs: true
-body:             |
-  ; GCN-LABEL: name: lane_vgpr_implicit_def_at_common_dominator_block
-  ; GCN: bb.0:
-  ; GCN-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
-  ; GCN-NEXT:   liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31
-  ; GCN-NEXT: {{  $}}
-  ; GCN-NEXT:   [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GCN-NEXT:   S_NOP 0
-  ; GCN-NEXT:   S_CMP_EQ_U32 $sgpr11, 0, implicit-def $scc
-  ; GCN-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
-  ; GCN-NEXT: {{  $}}
-  ; GCN-NEXT: bb.1:
-  ; GCN-NEXT:   successors: %bb.3(0x80000000)
-  ; GCN-NEXT:   liveins: $sgpr10, $sgpr30_sgpr31
-  ; GCN-NEXT: {{  $}}
-  ; GCN-NEXT:   $sgpr10 = S_MOV_B32 10
-  ; GCN-NEXT:   [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr10, 0, [[V_WRITELANE_B32_]]
-  ; GCN-NEXT:   S_BRANCH %bb.3
-  ; GCN-NEXT: {{  $}}
-  ; GCN-NEXT: bb.2:
-  ; GCN-NEXT:   successors: %bb.3(0x80000000)
-  ; GCN-NEXT:   liveins: $sgpr10, $sgpr30_sgpr31
-  ; GCN-NEXT: {{  $}}
-  ; GCN-NEXT:   $sgpr10 = S_MOV_B32 20
-  ; GCN-NEXT:   [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr10, 0, [[V_WRITELANE_B32_1]]
-  ; GCN-NEXT:   S_BRANCH %bb.3
-  ; GCN-NEXT: {{  $}}
-  ; GCN-NEXT: bb.3:
-  ; GCN-NEXT:   liveins: $sgpr10, $sgpr30_sgpr31
-  ; GCN-NEXT: {{  $}}
-  ; GCN-NEXT:   $sgpr10 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 0
-  ; GCN-NEXT:   S_SETPC_B64 $sgpr30_sgpr31, implicit $sgpr10
-  bb.0:
-    liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31
-    S_NOP 0
-    S_CMP_EQ_U32 $sgpr11, 0, implicit-def $scc
-    S_CBRANCH_SCC1 %bb.2, implicit killed $scc
-  bb.1:
-    liveins: $sgpr10, $sgpr30_sgpr31
-    $sgpr10 = S_MOV_B32 10
-    SI_SPILL_S32_SAVE killed $sgpr10, %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32
-    S_BRANCH %bb.3
-  bb.2:
-    liveins: $sgpr10, $sgpr30_sgpr31
-    $sgpr10 = S_MOV_B32 20
-    SI_SPILL_S32_SAVE killed $sgpr10, %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32
-    S_BRANCH %bb.3
-  bb.3:
-    liveins: $sgpr10, $sgpr30_sgpr31
-    renamable $sgpr10 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32
-    S_SETPC_B64 $sgpr30_sgpr31, implicit $sgpr10
-...
-
-# The common dominator block is visited only at the end. The insertion point was initially identified to the
-# terminator instruction in the dominator block which later becomes the point where a spill get inserted in the same block.
-
----
-name:            dominator_block_follows_the_successors_bbs
-tracksRegLiveness: true
-frameInfo:
-  maxAlignment:    4
-stack:
-  - { id: 0, type: spill-slot, size: 4, alignment: 4, stack-id: sgpr-spill }
-machineFunctionInfo:
-  isEntryFunction: false
-  scratchRSrcReg:  '$sgpr0_sgpr1_sgpr2_sgpr3'
-  stackPtrOffsetReg: '$sgpr32'
-  frameOffsetReg: '$sgpr33'
-  hasSpilledSGPRs: true
-body:             |
-  ; GCN-LABEL: name: dominator_block_follows_the_successors_bbs
-  ; GCN: bb.0:
-  ; GCN-NEXT:   successors: %bb.3(0x80000000)
-  ; GCN-NEXT:   liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31
-  ; GCN-NEXT: {{  $}}
-  ; GCN-NEXT:   [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GCN-NEXT:   S_NOP 0
-  ; GCN-NEXT:   S_BRANCH %bb.3
-  ; GCN-NEXT: {{  $}}
-  ; GCN-NEXT: bb.1:
-  ; GCN-NEXT:   successors: %bb.2(0x80000000)
-  ; GCN-NEXT:   liveins: $sgpr10, $sgpr30_sgpr31
-  ; GCN-NEXT: {{  $}}
-  ; GCN-NEXT:   $sgpr10 = V_READLANE_B32 [[DEF]], 0
-  ; GCN-NEXT:   $sgpr10 = S_ADD_I32 $sgpr10, 15, implicit-def dead $scc
-  ; GCN-NEXT:   S_BRANCH %bb.2
-  ; GCN-NEXT: {{  $}}
-  ; GCN-NEXT: bb.2:
-  ; GCN-NEXT:   successors: %bb.3(0x80000000)
-  ; GCN-NEXT:   liveins: $sgpr10, $sgpr30_sgpr31
-  ; GCN-NEXT: {{  $}}
-  ; GCN-NEXT:   $sgpr10 = V_READLANE_B32 [[DEF]], 0
-  ; GCN-NEXT:   $sgpr10 = S_ADD_I32 $sgpr10, 20, implicit-def dead $scc
-  ; GCN-NEXT:   S_BRANCH %bb.3
-  ; GCN-NEXT: {{  $}}
-  ; GCN-NEXT: bb.3:
-  ; GCN-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
-  ; GCN-NEXT:   liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31
-  ; GCN-NEXT: {{  $}}
-  ; GCN-NEXT:   $sgpr10 = S_MOV_B32 10
-  ; GCN-NEXT:   [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr10, 0, [[V_WRITELANE_B32_]]
-  ; GCN-NEXT:   S_CMP_EQ_U32 $sgpr11, 0, implicit-def $scc
-  ; GCN-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
-  ; GCN-NEXT:   S_BRANCH %bb.1
-  ; GCN-NEXT: {{  $}}
-  ; GCN-NEXT: bb.4:
-  ; GCN-NEXT:   liveins: $sgpr10, $sgpr30_sgpr31
-  ; GCN-NEXT: {{  $}}
-  ; GCN-NEXT:   S_NOP 0
-  ; GCN-NEXT:   S_SETPC_B64 $sgpr30_sgpr31, implicit $sgpr10
-  bb.0:
-    liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31
-    S_NOP 0
-    S_BRANCH %bb.3
-  bb.1:
-    liveins: $sgpr10, $sgpr30_sgpr31
-    renamable $sgpr10 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32
-    $sgpr10 = S_ADD_I32 $sgpr10, 15, implicit-def dead $scc
-    S_BRANCH %bb.2
-  bb.2:
-    liveins: $sgpr10, $sgpr30_sgpr31
-    renamable $sgpr10 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32
-    $sgpr10 = S_ADD_I32 $sgpr10, 20, implicit-def dead $scc
-    S_BRANCH %bb.3
-  bb.3:
-    liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31
-    $sgpr10 = S_MOV_B32 10
-    SI_SPILL_S32_SAVE killed $sgpr10, %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32
-    S_CMP_EQ_U32 $sgpr11, 0, implicit-def $scc
-    S_CBRANCH_SCC1 %bb.2, implicit killed $scc
-    S_BRANCH %bb.1
-  bb.4:
-    liveins: $sgpr10, $sgpr30_sgpr31
-    S_NOP 0
-    S_SETPC_B64 $sgpr30_sgpr31, implicit $sgpr10
-...

diff  --git a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll b/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll
deleted file mode 100644
index 4639f28d68061d..00000000000000
--- a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll
+++ /dev/null
@@ -1,92 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -O0 -verify-machineinstrs -o - %s | FileCheck %s
-
-; Regression test for `processFunctionBeforeFrameFinalized`:
-; Check that it correctly updates RegisterScavenger so we
-; don't end up with bad machine code due to using undefined
-; physical registers.
-
-define void @test() {
-; CHECK-LABEL: test:
-; CHECK:       ; %bb.0: ; %bb.0
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; CHECK-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
-; CHECK-NEXT:    s_mov_b64 exec, s[4:5]
-; CHECK-NEXT:    ; implicit-def: $vgpr0
-; CHECK-NEXT:  .LBB0_1: ; %bb.1
-; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    s_cbranch_scc1 .LBB0_3
-; CHECK-NEXT:  ; %bb.2: ; %bb.2
-; CHECK-NEXT:    ; in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT:  .LBB0_3: ; %bb.3
-; CHECK-NEXT:    ; in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT:    s_or_saveexec_b64 s[10:11], -1
-; CHECK-NEXT:    v_accvgpr_read_b32 v0, a0 ; Reload Reuse
-; CHECK-NEXT:    s_mov_b64 exec, s[10:11]
-; CHECK-NEXT:    ; implicit-def: $sgpr4
-; CHECK-NEXT:    v_mov_b32_e32 v1, s4
-; CHECK-NEXT:    v_readfirstlane_b32 s6, v1
-; CHECK-NEXT:    s_mov_b64 s[4:5], -1
-; CHECK-NEXT:    s_mov_b32 s7, 0
-; CHECK-NEXT:    s_cmp_eq_u32 s6, s7
-; CHECK-NEXT:    v_writelane_b32 v0, s4, 0
-; CHECK-NEXT:    v_writelane_b32 v0, s5, 1
-; CHECK-NEXT:    s_mov_b64 s[10:11], exec
-; CHECK-NEXT:    s_mov_b64 exec, -1
-; CHECK-NEXT:    v_accvgpr_write_b32 a0, v0 ; Reload Reuse
-; CHECK-NEXT:    s_mov_b64 exec, s[10:11]
-; CHECK-NEXT:    s_cbranch_scc1 .LBB0_5
-; CHECK-NEXT:  ; %bb.4: ; %bb.4
-; CHECK-NEXT:    ; in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT:    s_or_saveexec_b64 s[10:11], -1
-; CHECK-NEXT:    v_accvgpr_read_b32 v0, a0 ; Reload Reuse
-; CHECK-NEXT:    s_mov_b64 exec, s[10:11]
-; CHECK-NEXT:    s_mov_b64 s[4:5], 0
-; CHECK-NEXT:    v_writelane_b32 v0, s4, 0
-; CHECK-NEXT:    v_writelane_b32 v0, s5, 1
-; CHECK-NEXT:    s_or_saveexec_b64 s[10:11], -1
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    v_accvgpr_write_b32 a0, v0 ; Reload Reuse
-; CHECK-NEXT:    s_mov_b64 exec, s[10:11]
-; CHECK-NEXT:  .LBB0_5: ; %Flow
-; CHECK-NEXT:    ; in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT:    s_or_saveexec_b64 s[10:11], -1
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    v_accvgpr_read_b32 v0, a0 ; Reload Reuse
-; CHECK-NEXT:    s_mov_b64 exec, s[10:11]
-; CHECK-NEXT:    v_readlane_b32 s4, v0, 0
-; CHECK-NEXT:    v_readlane_b32 s5, v0, 1
-; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; CHECK-NEXT:    s_mov_b32 s4, 1
-; CHECK-NEXT:    ; implicit-def: $sgpr5
-; CHECK-NEXT:    v_cmp_ne_u32_e64 s[4:5], v0, s4
-; CHECK-NEXT:    s_and_b64 vcc, exec, s[4:5]
-; CHECK-NEXT:    s_cbranch_vccnz .LBB0_1
-; CHECK-NEXT:  ; %bb.6: ; %bb.5
-; CHECK-NEXT:    s_or_saveexec_b64 s[10:11], -1
-; CHECK-NEXT:    v_accvgpr_read_b32 v0, a0 ; Reload Reuse
-; CHECK-NEXT:    s_mov_b64 exec, s[10:11]
-; CHECK-NEXT:    ; kill: killed $vgpr0
-; CHECK-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; CHECK-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
-; CHECK-NEXT:    s_mov_b64 exec, s[4:5]
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    s_setpc_b64 s[30:31]
-bb.0:
-  br label %bb.1
-bb.1:                                                ; preds = %bb.4, %bb.0
-  br i1 poison, label %bb.2, label %bb.3
-bb.2:                                                ; preds = %bb.1
-  br label %bb.3
-bb.3:                                                ; preds = %bb.2, %bb.1
-  %call = tail call i32 @llvm.amdgcn.readfirstlane(i32 poison)
-  %cmp = icmp eq i32 %call, 0
-  br i1 %cmp, label %bb.5, label %bb.4
-bb.4:                                                ; preds = %bb.3
-  br label %bb.1
-bb.5:                                                ; preds = %bb.3
-  ret void
-}
-
-declare i32 @llvm.amdgcn.readfirstlane(i32)

diff  --git a/llvm/test/CodeGen/AMDGPU/spill-writelane-vgprs.ll b/llvm/test/CodeGen/AMDGPU/spill-writelane-vgprs.ll
index c1fc297d456435..b1cc01ec9c1a06 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-writelane-vgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-writelane-vgprs.ll
@@ -10,16 +10,16 @@ define void @sgpr_spill_writelane() {
 ; GCN-LABEL: sgpr_spill_writelane:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GCN-NEXT:    s_xor_saveexec_b64 s[6:7], -1
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
-; GCN-NEXT:    s_mov_b64 exec, s[4:5]
+; GCN-NEXT:    s_mov_b64 exec, s[6:7]
 ; GCN-NEXT:    v_writelane_b32 v0, s35, 0
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ;;#ASMEND
 ; GCN-NEXT:    v_readlane_b32 s35, v0, 0
-; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GCN-NEXT:    s_xor_saveexec_b64 s[6:7], -1
 ; GCN-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
-; GCN-NEXT:    s_mov_b64 exec, s[4:5]
+; GCN-NEXT:    s_mov_b64 exec, s[6:7]
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   call void asm sideeffect "", "~{s35}"()

diff  --git a/llvm/test/CodeGen/AMDGPU/spill192.mir b/llvm/test/CodeGen/AMDGPU/spill192.mir
index ff9aeecfaa6739..1834b889502a88 100644
--- a/llvm/test/CodeGen/AMDGPU/spill192.mir
+++ b/llvm/test/CodeGen/AMDGPU/spill192.mir
@@ -32,29 +32,32 @@ body: |
   ; EXPANDED-LABEL: name: spill_restore_sgpr192
   ; EXPANDED: bb.0:
   ; EXPANDED-NEXT:   successors: %bb.1(0x80000000)
+  ; EXPANDED-NEXT:   liveins: $vgpr0
   ; EXPANDED-NEXT: {{  $}}
-  ; EXPANDED-NEXT:   [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; EXPANDED-NEXT:   S_NOP 0, implicit-def renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9
-  ; EXPANDED-NEXT:   [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr4, 0, [[V_WRITELANE_B32_]], implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9
-  ; EXPANDED-NEXT:   [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr5, 1, [[V_WRITELANE_B32_1]]
-  ; EXPANDED-NEXT:   [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr6, 2, [[V_WRITELANE_B32_1]]
-  ; EXPANDED-NEXT:   [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr7, 3, [[V_WRITELANE_B32_1]]
-  ; EXPANDED-NEXT:   [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr8, 4, [[V_WRITELANE_B32_1]]
-  ; EXPANDED-NEXT:   [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr9, 5, [[V_WRITELANE_B32_1]], implicit killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9
+  ; EXPANDED-NEXT:   $vgpr0 = V_WRITELANE_B32 $sgpr4, 0, $vgpr0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9
+  ; EXPANDED-NEXT:   $vgpr0 = V_WRITELANE_B32 $sgpr5, 1, $vgpr0
+  ; EXPANDED-NEXT:   $vgpr0 = V_WRITELANE_B32 $sgpr6, 2, $vgpr0
+  ; EXPANDED-NEXT:   $vgpr0 = V_WRITELANE_B32 $sgpr7, 3, $vgpr0
+  ; EXPANDED-NEXT:   $vgpr0 = V_WRITELANE_B32 $sgpr8, 4, $vgpr0
+  ; EXPANDED-NEXT:   $vgpr0 = V_WRITELANE_B32 killed $sgpr9, 5, $vgpr0, implicit killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9
   ; EXPANDED-NEXT:   S_CBRANCH_SCC1 %bb.1, implicit undef $scc
   ; EXPANDED-NEXT: {{  $}}
   ; EXPANDED-NEXT: bb.1:
   ; EXPANDED-NEXT:   successors: %bb.2(0x80000000)
+  ; EXPANDED-NEXT:   liveins: $vgpr0
   ; EXPANDED-NEXT: {{  $}}
   ; EXPANDED-NEXT:   S_NOP 1
   ; EXPANDED-NEXT: {{  $}}
   ; EXPANDED-NEXT: bb.2:
-  ; EXPANDED-NEXT:   $sgpr4 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9
-  ; EXPANDED-NEXT:   $sgpr5 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 1
-  ; EXPANDED-NEXT:   $sgpr6 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 2
-  ; EXPANDED-NEXT:   $sgpr7 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 3
-  ; EXPANDED-NEXT:   $sgpr8 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 4
-  ; EXPANDED-NEXT:   $sgpr9 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 5
+  ; EXPANDED-NEXT:   liveins: $vgpr0
+  ; EXPANDED-NEXT: {{  $}}
+  ; EXPANDED-NEXT:   $sgpr4 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9
+  ; EXPANDED-NEXT:   $sgpr5 = V_READLANE_B32 $vgpr0, 1
+  ; EXPANDED-NEXT:   $sgpr6 = V_READLANE_B32 $vgpr0, 2
+  ; EXPANDED-NEXT:   $sgpr7 = V_READLANE_B32 $vgpr0, 3
+  ; EXPANDED-NEXT:   $sgpr8 = V_READLANE_B32 $vgpr0, 4
+  ; EXPANDED-NEXT:   $sgpr9 = V_READLANE_B32 $vgpr0, 5
   ; EXPANDED-NEXT:   S_NOP 0, implicit killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9
   bb.0:
     S_NOP 0, implicit-def %0:sgpr_192

diff  --git a/llvm/test/CodeGen/AMDGPU/spill224.mir b/llvm/test/CodeGen/AMDGPU/spill224.mir
index 0f84821687d716..8a5ffd32d01a71 100644
--- a/llvm/test/CodeGen/AMDGPU/spill224.mir
+++ b/llvm/test/CodeGen/AMDGPU/spill224.mir
@@ -30,31 +30,34 @@ body: |
   ; EXPANDED-LABEL: name: spill_restore_sgpr224
   ; EXPANDED: bb.0:
   ; EXPANDED-NEXT:   successors: %bb.1(0x80000000)
+  ; EXPANDED-NEXT:   liveins: $vgpr0
   ; EXPANDED-NEXT: {{  $}}
-  ; EXPANDED-NEXT:   [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; EXPANDED-NEXT:   S_NOP 0, implicit-def renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10
-  ; EXPANDED-NEXT:   [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr4, 0, [[V_WRITELANE_B32_]], implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10
-  ; EXPANDED-NEXT:   [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr5, 1, [[V_WRITELANE_B32_1]]
-  ; EXPANDED-NEXT:   [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr6, 2, [[V_WRITELANE_B32_1]]
-  ; EXPANDED-NEXT:   [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr7, 3, [[V_WRITELANE_B32_1]]
-  ; EXPANDED-NEXT:   [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr8, 4, [[V_WRITELANE_B32_1]]
-  ; EXPANDED-NEXT:   [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr9, 5, [[V_WRITELANE_B32_1]]
-  ; EXPANDED-NEXT:   [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr10, 6, [[V_WRITELANE_B32_1]], implicit killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10
+  ; EXPANDED-NEXT:   $vgpr0 = V_WRITELANE_B32 $sgpr4, 0, $vgpr0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10
+  ; EXPANDED-NEXT:   $vgpr0 = V_WRITELANE_B32 $sgpr5, 1, $vgpr0
+  ; EXPANDED-NEXT:   $vgpr0 = V_WRITELANE_B32 $sgpr6, 2, $vgpr0
+  ; EXPANDED-NEXT:   $vgpr0 = V_WRITELANE_B32 $sgpr7, 3, $vgpr0
+  ; EXPANDED-NEXT:   $vgpr0 = V_WRITELANE_B32 $sgpr8, 4, $vgpr0
+  ; EXPANDED-NEXT:   $vgpr0 = V_WRITELANE_B32 $sgpr9, 5, $vgpr0
+  ; EXPANDED-NEXT:   $vgpr0 = V_WRITELANE_B32 killed $sgpr10, 6, $vgpr0, implicit killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10
   ; EXPANDED-NEXT:   S_CBRANCH_SCC1 %bb.1, implicit undef $scc
   ; EXPANDED-NEXT: {{  $}}
   ; EXPANDED-NEXT: bb.1:
   ; EXPANDED-NEXT:   successors: %bb.2(0x80000000)
+  ; EXPANDED-NEXT:   liveins: $vgpr0
   ; EXPANDED-NEXT: {{  $}}
   ; EXPANDED-NEXT:   S_NOP 1
   ; EXPANDED-NEXT: {{  $}}
   ; EXPANDED-NEXT: bb.2:
-  ; EXPANDED-NEXT:   $sgpr4 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10
-  ; EXPANDED-NEXT:   $sgpr5 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 1
-  ; EXPANDED-NEXT:   $sgpr6 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 2
-  ; EXPANDED-NEXT:   $sgpr7 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 3
-  ; EXPANDED-NEXT:   $sgpr8 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 4
-  ; EXPANDED-NEXT:   $sgpr9 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 5
-  ; EXPANDED-NEXT:   $sgpr10 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 6
+  ; EXPANDED-NEXT:   liveins: $vgpr0
+  ; EXPANDED-NEXT: {{  $}}
+  ; EXPANDED-NEXT:   $sgpr4 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10
+  ; EXPANDED-NEXT:   $sgpr5 = V_READLANE_B32 $vgpr0, 1
+  ; EXPANDED-NEXT:   $sgpr6 = V_READLANE_B32 $vgpr0, 2
+  ; EXPANDED-NEXT:   $sgpr7 = V_READLANE_B32 $vgpr0, 3
+  ; EXPANDED-NEXT:   $sgpr8 = V_READLANE_B32 $vgpr0, 4
+  ; EXPANDED-NEXT:   $sgpr9 = V_READLANE_B32 $vgpr0, 5
+  ; EXPANDED-NEXT:   $sgpr10 = V_READLANE_B32 $vgpr0, 6
   ; EXPANDED-NEXT:   S_NOP 0, implicit killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10
   bb.0:
     S_NOP 0, implicit-def %0:sgpr_224

diff  --git a/llvm/test/CodeGen/AMDGPU/spill288.mir b/llvm/test/CodeGen/AMDGPU/spill288.mir
index 219d69fa2d46a1..c5d56dd2cb04ff 100644
--- a/llvm/test/CodeGen/AMDGPU/spill288.mir
+++ b/llvm/test/CodeGen/AMDGPU/spill288.mir
@@ -30,35 +30,38 @@ body: |
   ; EXPANDED-LABEL: name: spill_restore_sgpr288
   ; EXPANDED: bb.0:
   ; EXPANDED-NEXT:   successors: %bb.1(0x80000000)
+  ; EXPANDED-NEXT:   liveins: $vgpr0
   ; EXPANDED-NEXT: {{  $}}
-  ; EXPANDED-NEXT:   [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; EXPANDED-NEXT:   S_NOP 0, implicit-def renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12
-  ; EXPANDED-NEXT:   [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr4, 0, [[V_WRITELANE_B32_]], implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12
-  ; EXPANDED-NEXT:   [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr5, 1, [[V_WRITELANE_B32_1]]
-  ; EXPANDED-NEXT:   [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr6, 2, [[V_WRITELANE_B32_1]]
-  ; EXPANDED-NEXT:   [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr7, 3, [[V_WRITELANE_B32_1]]
-  ; EXPANDED-NEXT:   [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr8, 4, [[V_WRITELANE_B32_1]]
-  ; EXPANDED-NEXT:   [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr9, 5, [[V_WRITELANE_B32_1]]
-  ; EXPANDED-NEXT:   [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr10, 6, [[V_WRITELANE_B32_1]]
-  ; EXPANDED-NEXT:   [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr11, 7, [[V_WRITELANE_B32_1]]
-  ; EXPANDED-NEXT:   [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr12, 8, [[V_WRITELANE_B32_1]], implicit killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12
+  ; EXPANDED-NEXT:   $vgpr0 = V_WRITELANE_B32 $sgpr4, 0, $vgpr0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12
+  ; EXPANDED-NEXT:   $vgpr0 = V_WRITELANE_B32 $sgpr5, 1, $vgpr0
+  ; EXPANDED-NEXT:   $vgpr0 = V_WRITELANE_B32 $sgpr6, 2, $vgpr0
+  ; EXPANDED-NEXT:   $vgpr0 = V_WRITELANE_B32 $sgpr7, 3, $vgpr0
+  ; EXPANDED-NEXT:   $vgpr0 = V_WRITELANE_B32 $sgpr8, 4, $vgpr0
+  ; EXPANDED-NEXT:   $vgpr0 = V_WRITELANE_B32 $sgpr9, 5, $vgpr0
+  ; EXPANDED-NEXT:   $vgpr0 = V_WRITELANE_B32 $sgpr10, 6, $vgpr0
+  ; EXPANDED-NEXT:   $vgpr0 = V_WRITELANE_B32 $sgpr11, 7, $vgpr0
+  ; EXPANDED-NEXT:   $vgpr0 = V_WRITELANE_B32 killed $sgpr12, 8, $vgpr0, implicit killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12
   ; EXPANDED-NEXT:   S_CBRANCH_SCC1 %bb.1, implicit undef $scc
   ; EXPANDED-NEXT: {{  $}}
   ; EXPANDED-NEXT: bb.1:
   ; EXPANDED-NEXT:   successors: %bb.2(0x80000000)
+  ; EXPANDED-NEXT:   liveins: $vgpr0
   ; EXPANDED-NEXT: {{  $}}
   ; EXPANDED-NEXT:   S_NOP 1
   ; EXPANDED-NEXT: {{  $}}
   ; EXPANDED-NEXT: bb.2:
-  ; EXPANDED-NEXT:   $sgpr4 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12
-  ; EXPANDED-NEXT:   $sgpr5 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 1
-  ; EXPANDED-NEXT:   $sgpr6 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 2
-  ; EXPANDED-NEXT:   $sgpr7 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 3
-  ; EXPANDED-NEXT:   $sgpr8 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 4
-  ; EXPANDED-NEXT:   $sgpr9 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 5
-  ; EXPANDED-NEXT:   $sgpr10 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 6
-  ; EXPANDED-NEXT:   $sgpr11 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 7
-  ; EXPANDED-NEXT:   $sgpr12 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 8
+  ; EXPANDED-NEXT:   liveins: $vgpr0
+  ; EXPANDED-NEXT: {{  $}}
+  ; EXPANDED-NEXT:   $sgpr4 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12
+  ; EXPANDED-NEXT:   $sgpr5 = V_READLANE_B32 $vgpr0, 1
+  ; EXPANDED-NEXT:   $sgpr6 = V_READLANE_B32 $vgpr0, 2
+  ; EXPANDED-NEXT:   $sgpr7 = V_READLANE_B32 $vgpr0, 3
+  ; EXPANDED-NEXT:   $sgpr8 = V_READLANE_B32 $vgpr0, 4
+  ; EXPANDED-NEXT:   $sgpr9 = V_READLANE_B32 $vgpr0, 5
+  ; EXPANDED-NEXT:   $sgpr10 = V_READLANE_B32 $vgpr0, 6
+  ; EXPANDED-NEXT:   $sgpr11 = V_READLANE_B32 $vgpr0, 7
+  ; EXPANDED-NEXT:   $sgpr12 = V_READLANE_B32 $vgpr0, 8
   ; EXPANDED-NEXT:   S_NOP 0, implicit killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12
   bb.0:
     S_NOP 0, implicit-def %0:sgpr_288

diff  --git a/llvm/test/CodeGen/AMDGPU/spill320.mir b/llvm/test/CodeGen/AMDGPU/spill320.mir
index dcb4b9dda8783c..b54d43cae3acdc 100644
--- a/llvm/test/CodeGen/AMDGPU/spill320.mir
+++ b/llvm/test/CodeGen/AMDGPU/spill320.mir
@@ -30,37 +30,40 @@ body: |
   ; EXPANDED-LABEL: name: spill_restore_sgpr320
   ; EXPANDED: bb.0:
   ; EXPANDED-NEXT:   successors: %bb.1(0x80000000)
+  ; EXPANDED-NEXT:   liveins: $vgpr0
   ; EXPANDED-NEXT: {{  $}}
-  ; EXPANDED-NEXT:   [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; EXPANDED-NEXT:   S_NOP 0, implicit-def renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13
-  ; EXPANDED-NEXT:   [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr4, 0, [[V_WRITELANE_B32_]], implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13
-  ; EXPANDED-NEXT:   [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr5, 1, [[V_WRITELANE_B32_1]]
-  ; EXPANDED-NEXT:   [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr6, 2, [[V_WRITELANE_B32_1]]
-  ; EXPANDED-NEXT:   [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr7, 3, [[V_WRITELANE_B32_1]]
-  ; EXPANDED-NEXT:   [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr8, 4, [[V_WRITELANE_B32_1]]
-  ; EXPANDED-NEXT:   [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr9, 5, [[V_WRITELANE_B32_1]]
-  ; EXPANDED-NEXT:   [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr10, 6, [[V_WRITELANE_B32_1]]
-  ; EXPANDED-NEXT:   [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr11, 7, [[V_WRITELANE_B32_1]]
-  ; EXPANDED-NEXT:   [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr12, 8, [[V_WRITELANE_B32_1]]
-  ; EXPANDED-NEXT:   [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr13, 9, [[V_WRITELANE_B32_1]], implicit killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13
+  ; EXPANDED-NEXT:   $vgpr0 = V_WRITELANE_B32 $sgpr4, 0, $vgpr0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13
+  ; EXPANDED-NEXT:   $vgpr0 = V_WRITELANE_B32 $sgpr5, 1, $vgpr0
+  ; EXPANDED-NEXT:   $vgpr0 = V_WRITELANE_B32 $sgpr6, 2, $vgpr0
+  ; EXPANDED-NEXT:   $vgpr0 = V_WRITELANE_B32 $sgpr7, 3, $vgpr0
+  ; EXPANDED-NEXT:   $vgpr0 = V_WRITELANE_B32 $sgpr8, 4, $vgpr0
+  ; EXPANDED-NEXT:   $vgpr0 = V_WRITELANE_B32 $sgpr9, 5, $vgpr0
+  ; EXPANDED-NEXT:   $vgpr0 = V_WRITELANE_B32 $sgpr10, 6, $vgpr0
+  ; EXPANDED-NEXT:   $vgpr0 = V_WRITELANE_B32 $sgpr11, 7, $vgpr0
+  ; EXPANDED-NEXT:   $vgpr0 = V_WRITELANE_B32 $sgpr12, 8, $vgpr0
+  ; EXPANDED-NEXT:   $vgpr0 = V_WRITELANE_B32 killed $sgpr13, 9, $vgpr0, implicit killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13
   ; EXPANDED-NEXT:   S_CBRANCH_SCC1 %bb.1, implicit undef $scc
   ; EXPANDED-NEXT: {{  $}}
   ; EXPANDED-NEXT: bb.1:
   ; EXPANDED-NEXT:   successors: %bb.2(0x80000000)
+  ; EXPANDED-NEXT:   liveins: $vgpr0
   ; EXPANDED-NEXT: {{  $}}
   ; EXPANDED-NEXT:   S_NOP 1
   ; EXPANDED-NEXT: {{  $}}
   ; EXPANDED-NEXT: bb.2:
-  ; EXPANDED-NEXT:   $sgpr4 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13
-  ; EXPANDED-NEXT:   $sgpr5 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 1
-  ; EXPANDED-NEXT:   $sgpr6 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 2
-  ; EXPANDED-NEXT:   $sgpr7 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 3
-  ; EXPANDED-NEXT:   $sgpr8 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 4
-  ; EXPANDED-NEXT:   $sgpr9 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 5
-  ; EXPANDED-NEXT:   $sgpr10 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 6
-  ; EXPANDED-NEXT:   $sgpr11 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 7
-  ; EXPANDED-NEXT:   $sgpr12 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 8
-  ; EXPANDED-NEXT:   $sgpr13 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 9
+  ; EXPANDED-NEXT:   liveins: $vgpr0
+  ; EXPANDED-NEXT: {{  $}}
+  ; EXPANDED-NEXT:   $sgpr4 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13
+  ; EXPANDED-NEXT:   $sgpr5 = V_READLANE_B32 $vgpr0, 1
+  ; EXPANDED-NEXT:   $sgpr6 = V_READLANE_B32 $vgpr0, 2
+  ; EXPANDED-NEXT:   $sgpr7 = V_READLANE_B32 $vgpr0, 3
+  ; EXPANDED-NEXT:   $sgpr8 = V_READLANE_B32 $vgpr0, 4
+  ; EXPANDED-NEXT:   $sgpr9 = V_READLANE_B32 $vgpr0, 5
+  ; EXPANDED-NEXT:   $sgpr10 = V_READLANE_B32 $vgpr0, 6
+  ; EXPANDED-NEXT:   $sgpr11 = V_READLANE_B32 $vgpr0, 7
+  ; EXPANDED-NEXT:   $sgpr12 = V_READLANE_B32 $vgpr0, 8
+  ; EXPANDED-NEXT:   $sgpr13 = V_READLANE_B32 $vgpr0, 9
   ; EXPANDED-NEXT:   S_NOP 0, implicit killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13
   bb.0:
     S_NOP 0, implicit-def %0:sgpr_320

diff  --git a/llvm/test/CodeGen/AMDGPU/spill352.mir b/llvm/test/CodeGen/AMDGPU/spill352.mir
index 2ed4437aa57cd1..0e215c707e1fbc 100644
--- a/llvm/test/CodeGen/AMDGPU/spill352.mir
+++ b/llvm/test/CodeGen/AMDGPU/spill352.mir
@@ -30,39 +30,42 @@ body: |
   ; EXPANDED-LABEL: name: spill_restore_sgpr352
   ; EXPANDED: bb.0:
   ; EXPANDED-NEXT:   successors: %bb.1(0x80000000)
+  ; EXPANDED-NEXT:   liveins: $vgpr0
   ; EXPANDED-NEXT: {{  $}}
-  ; EXPANDED-NEXT:   [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; EXPANDED-NEXT:   S_NOP 0, implicit-def renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14
-  ; EXPANDED-NEXT:   [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr4, 0, [[V_WRITELANE_B32_]], implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14
-  ; EXPANDED-NEXT:   [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr5, 1, [[V_WRITELANE_B32_1]]
-  ; EXPANDED-NEXT:   [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr6, 2, [[V_WRITELANE_B32_1]]
-  ; EXPANDED-NEXT:   [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr7, 3, [[V_WRITELANE_B32_1]]
-  ; EXPANDED-NEXT:   [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr8, 4, [[V_WRITELANE_B32_1]]
-  ; EXPANDED-NEXT:   [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr9, 5, [[V_WRITELANE_B32_1]]
-  ; EXPANDED-NEXT:   [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr10, 6, [[V_WRITELANE_B32_1]]
-  ; EXPANDED-NEXT:   [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr11, 7, [[V_WRITELANE_B32_1]]
-  ; EXPANDED-NEXT:   [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr12, 8, [[V_WRITELANE_B32_1]]
-  ; EXPANDED-NEXT:   [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr13, 9, [[V_WRITELANE_B32_1]]
-  ; EXPANDED-NEXT:   [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr14, 10, [[V_WRITELANE_B32_1]], implicit killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14
+  ; EXPANDED-NEXT:   $vgpr0 = V_WRITELANE_B32 $sgpr4, 0, $vgpr0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14
+  ; EXPANDED-NEXT:   $vgpr0 = V_WRITELANE_B32 $sgpr5, 1, $vgpr0
+  ; EXPANDED-NEXT:   $vgpr0 = V_WRITELANE_B32 $sgpr6, 2, $vgpr0
+  ; EXPANDED-NEXT:   $vgpr0 = V_WRITELANE_B32 $sgpr7, 3, $vgpr0
+  ; EXPANDED-NEXT:   $vgpr0 = V_WRITELANE_B32 $sgpr8, 4, $vgpr0
+  ; EXPANDED-NEXT:   $vgpr0 = V_WRITELANE_B32 $sgpr9, 5, $vgpr0
+  ; EXPANDED-NEXT:   $vgpr0 = V_WRITELANE_B32 $sgpr10, 6, $vgpr0
+  ; EXPANDED-NEXT:   $vgpr0 = V_WRITELANE_B32 $sgpr11, 7, $vgpr0
+  ; EXPANDED-NEXT:   $vgpr0 = V_WRITELANE_B32 $sgpr12, 8, $vgpr0
+  ; EXPANDED-NEXT:   $vgpr0 = V_WRITELANE_B32 $sgpr13, 9, $vgpr0
+  ; EXPANDED-NEXT:   $vgpr0 = V_WRITELANE_B32 killed $sgpr14, 10, $vgpr0, implicit killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14
   ; EXPANDED-NEXT:   S_CBRANCH_SCC1 %bb.1, implicit undef $scc
   ; EXPANDED-NEXT: {{  $}}
   ; EXPANDED-NEXT: bb.1:
   ; EXPANDED-NEXT:   successors: %bb.2(0x80000000)
+  ; EXPANDED-NEXT:   liveins: $vgpr0
   ; EXPANDED-NEXT: {{  $}}
   ; EXPANDED-NEXT:   S_NOP 1
   ; EXPANDED-NEXT: {{  $}}
   ; EXPANDED-NEXT: bb.2:
-  ; EXPANDED-NEXT:   $sgpr4 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14
-  ; EXPANDED-NEXT:   $sgpr5 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 1
-  ; EXPANDED-NEXT:   $sgpr6 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 2
-  ; EXPANDED-NEXT:   $sgpr7 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 3
-  ; EXPANDED-NEXT:   $sgpr8 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 4
-  ; EXPANDED-NEXT:   $sgpr9 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 5
-  ; EXPANDED-NEXT:   $sgpr10 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 6
-  ; EXPANDED-NEXT:   $sgpr11 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 7
-  ; EXPANDED-NEXT:   $sgpr12 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 8
-  ; EXPANDED-NEXT:   $sgpr13 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 9
-  ; EXPANDED-NEXT:   $sgpr14 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 10
+  ; EXPANDED-NEXT:   liveins: $vgpr0
+  ; EXPANDED-NEXT: {{  $}}
+  ; EXPANDED-NEXT:   $sgpr4 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14
+  ; EXPANDED-NEXT:   $sgpr5 = V_READLANE_B32 $vgpr0, 1
+  ; EXPANDED-NEXT:   $sgpr6 = V_READLANE_B32 $vgpr0, 2
+  ; EXPANDED-NEXT:   $sgpr7 = V_READLANE_B32 $vgpr0, 3
+  ; EXPANDED-NEXT:   $sgpr8 = V_READLANE_B32 $vgpr0, 4
+  ; EXPANDED-NEXT:   $sgpr9 = V_READLANE_B32 $vgpr0, 5
+  ; EXPANDED-NEXT:   $sgpr10 = V_READLANE_B32 $vgpr0, 6
+  ; EXPANDED-NEXT:   $sgpr11 = V_READLANE_B32 $vgpr0, 7
+  ; EXPANDED-NEXT:   $sgpr12 = V_READLANE_B32 $vgpr0, 8
+  ; EXPANDED-NEXT:   $sgpr13 = V_READLANE_B32 $vgpr0, 9
+  ; EXPANDED-NEXT:   $sgpr14 = V_READLANE_B32 $vgpr0, 10
   ; EXPANDED-NEXT:   S_NOP 0, implicit killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14
   bb.0:
     S_NOP 0, implicit-def %0:sgpr_352

diff  --git a/llvm/test/CodeGen/AMDGPU/spill384.mir b/llvm/test/CodeGen/AMDGPU/spill384.mir
index c88bfe20a75fe3..f0cee62a3baeb0 100644
--- a/llvm/test/CodeGen/AMDGPU/spill384.mir
+++ b/llvm/test/CodeGen/AMDGPU/spill384.mir
@@ -30,41 +30,44 @@ body: |
   ; EXPANDED-LABEL: name: spill_restore_sgpr384
   ; EXPANDED: bb.0:
   ; EXPANDED-NEXT:   successors: %bb.1(0x80000000)
+  ; EXPANDED-NEXT:   liveins: $vgpr0
   ; EXPANDED-NEXT: {{  $}}
-  ; EXPANDED-NEXT:   [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; EXPANDED-NEXT:   S_NOP 0, implicit-def renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
-  ; EXPANDED-NEXT:   [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr4, 0, [[V_WRITELANE_B32_]], implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
-  ; EXPANDED-NEXT:   [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr5, 1, [[V_WRITELANE_B32_1]]
-  ; EXPANDED-NEXT:   [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr6, 2, [[V_WRITELANE_B32_1]]
-  ; EXPANDED-NEXT:   [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr7, 3, [[V_WRITELANE_B32_1]]
-  ; EXPANDED-NEXT:   [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr8, 4, [[V_WRITELANE_B32_1]]
-  ; EXPANDED-NEXT:   [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr9, 5, [[V_WRITELANE_B32_1]]
-  ; EXPANDED-NEXT:   [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr10, 6, [[V_WRITELANE_B32_1]]
-  ; EXPANDED-NEXT:   [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr11, 7, [[V_WRITELANE_B32_1]]
-  ; EXPANDED-NEXT:   [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr12, 8, [[V_WRITELANE_B32_1]]
-  ; EXPANDED-NEXT:   [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr13, 9, [[V_WRITELANE_B32_1]]
-  ; EXPANDED-NEXT:   [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr14, 10, [[V_WRITELANE_B32_1]]
-  ; EXPANDED-NEXT:   [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr15, 11, [[V_WRITELANE_B32_1]], implicit killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+  ; EXPANDED-NEXT:   $vgpr0 = V_WRITELANE_B32 $sgpr4, 0, $vgpr0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+  ; EXPANDED-NEXT:   $vgpr0 = V_WRITELANE_B32 $sgpr5, 1, $vgpr0
+  ; EXPANDED-NEXT:   $vgpr0 = V_WRITELANE_B32 $sgpr6, 2, $vgpr0
+  ; EXPANDED-NEXT:   $vgpr0 = V_WRITELANE_B32 $sgpr7, 3, $vgpr0
+  ; EXPANDED-NEXT:   $vgpr0 = V_WRITELANE_B32 $sgpr8, 4, $vgpr0
+  ; EXPANDED-NEXT:   $vgpr0 = V_WRITELANE_B32 $sgpr9, 5, $vgpr0
+  ; EXPANDED-NEXT:   $vgpr0 = V_WRITELANE_B32 $sgpr10, 6, $vgpr0
+  ; EXPANDED-NEXT:   $vgpr0 = V_WRITELANE_B32 $sgpr11, 7, $vgpr0
+  ; EXPANDED-NEXT:   $vgpr0 = V_WRITELANE_B32 $sgpr12, 8, $vgpr0
+  ; EXPANDED-NEXT:   $vgpr0 = V_WRITELANE_B32 $sgpr13, 9, $vgpr0
+  ; EXPANDED-NEXT:   $vgpr0 = V_WRITELANE_B32 $sgpr14, 10, $vgpr0
+  ; EXPANDED-NEXT:   $vgpr0 = V_WRITELANE_B32 killed $sgpr15, 11, $vgpr0, implicit killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
   ; EXPANDED-NEXT:   S_CBRANCH_SCC1 %bb.1, implicit undef $scc
   ; EXPANDED-NEXT: {{  $}}
   ; EXPANDED-NEXT: bb.1:
   ; EXPANDED-NEXT:   successors: %bb.2(0x80000000)
+  ; EXPANDED-NEXT:   liveins: $vgpr0
   ; EXPANDED-NEXT: {{  $}}
   ; EXPANDED-NEXT:   S_NOP 1
   ; EXPANDED-NEXT: {{  $}}
   ; EXPANDED-NEXT: bb.2:
-  ; EXPANDED-NEXT:   $sgpr4 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
-  ; EXPANDED-NEXT:   $sgpr5 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 1
-  ; EXPANDED-NEXT:   $sgpr6 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 2
-  ; EXPANDED-NEXT:   $sgpr7 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 3
-  ; EXPANDED-NEXT:   $sgpr8 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 4
-  ; EXPANDED-NEXT:   $sgpr9 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 5
-  ; EXPANDED-NEXT:   $sgpr10 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 6
-  ; EXPANDED-NEXT:   $sgpr11 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 7
-  ; EXPANDED-NEXT:   $sgpr12 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 8
-  ; EXPANDED-NEXT:   $sgpr13 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 9
-  ; EXPANDED-NEXT:   $sgpr14 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 10
-  ; EXPANDED-NEXT:   $sgpr15 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 11
+  ; EXPANDED-NEXT:   liveins: $vgpr0
+  ; EXPANDED-NEXT: {{  $}}
+  ; EXPANDED-NEXT:   $sgpr4 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+  ; EXPANDED-NEXT:   $sgpr5 = V_READLANE_B32 $vgpr0, 1
+  ; EXPANDED-NEXT:   $sgpr6 = V_READLANE_B32 $vgpr0, 2
+  ; EXPANDED-NEXT:   $sgpr7 = V_READLANE_B32 $vgpr0, 3
+  ; EXPANDED-NEXT:   $sgpr8 = V_READLANE_B32 $vgpr0, 4
+  ; EXPANDED-NEXT:   $sgpr9 = V_READLANE_B32 $vgpr0, 5
+  ; EXPANDED-NEXT:   $sgpr10 = V_READLANE_B32 $vgpr0, 6
+  ; EXPANDED-NEXT:   $sgpr11 = V_READLANE_B32 $vgpr0, 7
+  ; EXPANDED-NEXT:   $sgpr12 = V_READLANE_B32 $vgpr0, 8
+  ; EXPANDED-NEXT:   $sgpr13 = V_READLANE_B32 $vgpr0, 9
+  ; EXPANDED-NEXT:   $sgpr14 = V_READLANE_B32 $vgpr0, 10
+  ; EXPANDED-NEXT:   $sgpr15 = V_READLANE_B32 $vgpr0, 11
   ; EXPANDED-NEXT:   S_NOP 0, implicit killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
   bb.0:
     S_NOP 0, implicit-def %0:sgpr_384

diff  --git a/llvm/test/CodeGen/AMDGPU/stack-realign.ll b/llvm/test/CodeGen/AMDGPU/stack-realign.ll
index 4aeff3f23993aa..510849d83690f5 100644
--- a/llvm/test/CodeGen/AMDGPU/stack-realign.ll
+++ b/llvm/test/CodeGen/AMDGPU/stack-realign.ll
@@ -161,24 +161,26 @@ define void @func_call_align1024_bp_gets_vgpr_spill(<32 x i32> %a, i32 %b) #0 {
 ; GCN-NEXT: s_and_b32 s33, [[SCRATCH_REG]], 0xffff0000
 ; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1
 ; GCN-NEXT: buffer_store_dword [[VGPR_REG:v[0-9]+]], off, s[0:3], s33 offset:1028 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword [[VGPR_REG_1:v[0-9]+]], off, s[0:3], s33 offset:1032 ; 4-byte Folded Spill
 ; GCN-NEXT: s_mov_b64 exec, s[18:19]
 ; GCN-NEXT: v_mov_b32_e32 v32, 0
-; GCN-DAG: v_writelane_b32 [[VGPR_REG]], s34, 3
+; GCN-DAG: v_writelane_b32 [[VGPR_REG_1]], s34, 1
 ; GCN: s_mov_b32 s34, s32
 ; GCN: buffer_store_dword v32, off, s[0:3], s33 offset:1024
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s34
 ; GCN-DAG: s_add_i32 s32, s32, 0x30000
-; GCN: v_writelane_b32 [[VGPR_REG]], [[FP_SCRATCH_COPY]], 2
+; GCN: v_writelane_b32 [[VGPR_REG_1]], [[FP_SCRATCH_COPY]], 0
 ; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32
 ; GCN: s_swappc_b64 s[30:31],
 
 ; GCN: v_readlane_b32 s31, [[VGPR_REG]], 1
 ; GCN: v_readlane_b32 s30, [[VGPR_REG]], 0
-; GCN-NEXT: v_readlane_b32 s34, [[VGPR_REG]], 3
-; GCN-NEXT: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], [[VGPR_REG]], 2
+; GCN-NEXT: v_readlane_b32 s34, [[VGPR_REG_1]], 1
+; GCN-NEXT: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], [[VGPR_REG_1]], 0
 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
 ; GCN-NEXT: buffer_load_dword [[VGPR_REG]], off, s[0:3], s33 offset:1028 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword [[VGPR_REG_1]], off, s[0:3], s33 offset:1032 ; 4-byte Folded Reload
 ; GCN-NEXT: s_mov_b64 exec, s[6:7]
 ; GCN-NEXT: s_add_i32 s32, s32, 0xfffd0000
 ; GCN-NEXT: s_mov_b32 s33, [[FP_SCRATCH_COPY]]

diff  --git a/llvm/test/CodeGen/AMDGPU/swdev380865.ll b/llvm/test/CodeGen/AMDGPU/swdev380865.ll
index b7d3a96042569d..0f5ee3f8abffce 100644
--- a/llvm/test/CodeGen/AMDGPU/swdev380865.ll
+++ b/llvm/test/CodeGen/AMDGPU/swdev380865.ll
@@ -16,96 +16,94 @@ define amdgpu_kernel void @_Z6kernelILi4000ELi1EEvPd(ptr addrspace(1) %x.coerce)
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_mov_b64 s[0:1], 0
 ; CHECK-NEXT:    s_load_dword s2, s[0:1], 0x0
-; CHECK-NEXT:    ; implicit-def: $vgpr2
 ; CHECK-NEXT:    ; kill: killed $sgpr0_sgpr1
 ; CHECK-NEXT:    s_mov_b32 s7, 0x401c0000
 ; CHECK-NEXT:    s_mov_b32 s5, 0x40280000
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    v_writelane_b32 v2, s2, 0
+; CHECK-NEXT:    v_writelane_b32 v0, s2, 0
 ; CHECK-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; CHECK-NEXT:    s_mov_b32 s0, 0
 ; CHECK-NEXT:    s_mov_b32 s1, 0x40140000
 ; CHECK-NEXT:    s_mov_b32 s1, 0x40180000
-; CHECK-NEXT:    v_writelane_b32 v2, s0, 1
-; CHECK-NEXT:    v_writelane_b32 v2, s1, 2
+; CHECK-NEXT:    v_writelane_b32 v0, s0, 1
+; CHECK-NEXT:    v_writelane_b32 v0, s1, 2
 ; CHECK-NEXT:    s_mov_b32 s1, 0x40220000
-; CHECK-NEXT:    v_writelane_b32 v2, s0, 3
-; CHECK-NEXT:    v_writelane_b32 v2, s1, 4
+; CHECK-NEXT:    v_writelane_b32 v0, s0, 3
+; CHECK-NEXT:    v_writelane_b32 v0, s1, 4
 ; CHECK-NEXT:    s_mov_b32 s1, 0x40240000
-; CHECK-NEXT:    v_writelane_b32 v2, s0, 5
-; CHECK-NEXT:    v_writelane_b32 v2, s1, 6
+; CHECK-NEXT:    v_writelane_b32 v0, s0, 5
+; CHECK-NEXT:    v_writelane_b32 v0, s1, 6
 ; CHECK-NEXT:    s_mov_b32 s1, 0x40260000
-; CHECK-NEXT:    v_writelane_b32 v2, s0, 7
+; CHECK-NEXT:    v_writelane_b32 v0, s0, 7
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    v_mov_b32_e32 v0, s2
-; CHECK-NEXT:    v_writelane_b32 v2, s1, 8
-; CHECK-NEXT:    v_mov_b32_e32 v1, s3
+; CHECK-NEXT:    v_mov_b32_e32 v1, s2
+; CHECK-NEXT:    v_writelane_b32 v0, s1, 8
+; CHECK-NEXT:    v_mov_b32_e32 v2, s3
 ; CHECK-NEXT:  .LBB0_1: ; %for.cond4.preheader
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    v_add_f64 v[0:1], v[0:1], 0
+; CHECK-NEXT:    v_add_f64 v[1:2], v[1:2], 0
 ; CHECK-NEXT:    s_mov_b32 s2, 0
 ; CHECK-NEXT:    s_mov_b32 s3, 0x40140000
-; CHECK-NEXT:    v_writelane_b32 v2, s6, 9
-; CHECK-NEXT:    v_writelane_b32 v2, s7, 10
-; CHECK-NEXT:    v_writelane_b32 v2, s0, 11
-; CHECK-NEXT:    v_readlane_b32 s6, v2, 1
-; CHECK-NEXT:    v_readlane_b32 s7, v2, 2
-; CHECK-NEXT:    v_add_f64 v[0:1], v[0:1], s[2:3]
+; CHECK-NEXT:    v_writelane_b32 v0, s6, 9
+; CHECK-NEXT:    v_writelane_b32 v0, s7, 10
+; CHECK-NEXT:    v_writelane_b32 v0, s0, 11
+; CHECK-NEXT:    v_readlane_b32 s6, v0, 1
+; CHECK-NEXT:    v_readlane_b32 s7, v0, 2
+; CHECK-NEXT:    v_add_f64 v[1:2], v[1:2], s[2:3]
 ; CHECK-NEXT:    s_mov_b32 s1, s7
 ; CHECK-NEXT:    s_mov_b32 s0, s2
-; CHECK-NEXT:    v_writelane_b32 v2, s6, 1
-; CHECK-NEXT:    v_writelane_b32 v2, s7, 2
-; CHECK-NEXT:    v_readlane_b32 s6, v2, 9
-; CHECK-NEXT:    v_readlane_b32 s7, v2, 10
+; CHECK-NEXT:    v_writelane_b32 v0, s6, 1
+; CHECK-NEXT:    v_writelane_b32 v0, s7, 2
+; CHECK-NEXT:    v_readlane_b32 s6, v0, 9
+; CHECK-NEXT:    v_readlane_b32 s7, v0, 10
 ; CHECK-NEXT:    s_mov_b32 s6, s2
-; CHECK-NEXT:    v_add_f64 v[0:1], v[0:1], s[0:1]
-; CHECK-NEXT:    v_readlane_b32 s0, v2, 3
-; CHECK-NEXT:    v_readlane_b32 s1, v2, 4
+; CHECK-NEXT:    v_add_f64 v[1:2], v[1:2], s[0:1]
+; CHECK-NEXT:    v_readlane_b32 s0, v0, 3
+; CHECK-NEXT:    v_readlane_b32 s1, v0, 4
 ; CHECK-NEXT:    s_mov_b32 s3, s1
 ; CHECK-NEXT:    s_mov_b32 s0, 0
 ; CHECK-NEXT:    s_mov_b32 s1, 0x40140000
 ; CHECK-NEXT:    s_mov_b32 s2, s0
 ; CHECK-NEXT:    s_mov_b32 s1, s3
-; CHECK-NEXT:    v_add_f64 v[0:1], v[0:1], s[6:7]
-; CHECK-NEXT:    v_writelane_b32 v2, s0, 3
-; CHECK-NEXT:    v_writelane_b32 v2, s1, 4
-; CHECK-NEXT:    v_readlane_b32 s0, v2, 5
-; CHECK-NEXT:    v_readlane_b32 s1, v2, 6
-; CHECK-NEXT:    v_add_f64 v[0:1], v[0:1], s[2:3]
+; CHECK-NEXT:    v_add_f64 v[1:2], v[1:2], s[6:7]
+; CHECK-NEXT:    v_writelane_b32 v0, s0, 3
+; CHECK-NEXT:    v_writelane_b32 v0, s1, 4
+; CHECK-NEXT:    v_readlane_b32 s0, v0, 5
+; CHECK-NEXT:    v_readlane_b32 s1, v0, 6
+; CHECK-NEXT:    v_add_f64 v[1:2], v[1:2], s[2:3]
 ; CHECK-NEXT:    s_mov_b32 s3, s1
 ; CHECK-NEXT:    s_mov_b32 s0, 0
 ; CHECK-NEXT:    s_mov_b32 s1, 0x40140000
 ; CHECK-NEXT:    s_mov_b32 s2, s0
 ; CHECK-NEXT:    s_mov_b32 s1, s3
-; CHECK-NEXT:    v_writelane_b32 v2, s0, 5
-; CHECK-NEXT:    v_writelane_b32 v2, s1, 6
-; CHECK-NEXT:    v_add_f64 v[0:1], v[0:1], s[2:3]
-; CHECK-NEXT:    v_readlane_b32 s0, v2, 7
-; CHECK-NEXT:    v_readlane_b32 s1, v2, 8
+; CHECK-NEXT:    v_writelane_b32 v0, s0, 5
+; CHECK-NEXT:    v_writelane_b32 v0, s1, 6
+; CHECK-NEXT:    v_add_f64 v[1:2], v[1:2], s[2:3]
+; CHECK-NEXT:    v_readlane_b32 s0, v0, 7
+; CHECK-NEXT:    v_readlane_b32 s1, v0, 8
 ; CHECK-NEXT:    s_mov_b32 s3, s1
 ; CHECK-NEXT:    s_mov_b32 s0, 0
 ; CHECK-NEXT:    s_mov_b32 s1, 0x40140000
 ; CHECK-NEXT:    s_mov_b32 s2, s0
 ; CHECK-NEXT:    s_mov_b32 s1, s3
-; CHECK-NEXT:    v_add_f64 v[0:1], v[0:1], s[2:3]
-; CHECK-NEXT:    v_writelane_b32 v2, s0, 7
-; CHECK-NEXT:    v_writelane_b32 v2, s1, 8
+; CHECK-NEXT:    v_add_f64 v[1:2], v[1:2], s[2:3]
+; CHECK-NEXT:    v_writelane_b32 v0, s0, 7
+; CHECK-NEXT:    v_writelane_b32 v0, s1, 8
 ; CHECK-NEXT:    s_mov_b32 s0, 0
 ; CHECK-NEXT:    s_mov_b32 s1, 0x40140000
 ; CHECK-NEXT:    s_mov_b32 s4, s0
-; CHECK-NEXT:    v_readlane_b32 s0, v2, 0
-; CHECK-NEXT:    v_readlane_b32 s2, v2, 11
-; CHECK-NEXT:    v_add_f64 v[0:1], v[0:1], s[4:5]
+; CHECK-NEXT:    v_readlane_b32 s0, v0, 0
+; CHECK-NEXT:    v_readlane_b32 s2, v0, 11
+; CHECK-NEXT:    v_add_f64 v[1:2], v[1:2], s[4:5]
 ; CHECK-NEXT:    s_add_i32 s2, s2, s0
-; CHECK-NEXT:    v_writelane_b32 v2, s2, 11
-; CHECK-NEXT:    v_readlane_b32 s0, v2, 11
+; CHECK-NEXT:    v_writelane_b32 v0, s2, 11
+; CHECK-NEXT:    v_readlane_b32 s0, v0, 11
 ; CHECK-NEXT:    s_cmpk_lt_i32 s0, 0xa00
 ; CHECK-NEXT:    s_cbranch_scc1 .LBB0_1
 ; CHECK-NEXT:  ; %bb.2: ; %for.cond.cleanup.loopexit
 ; CHECK-NEXT:    v_mov_b32_e32 v3, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v4, 0
-; CHECK-NEXT:    global_store_dwordx2 v[3:4], v[0:1], off
-; CHECK-NEXT:    ; kill: killed $vgpr2
+; CHECK-NEXT:    global_store_dwordx2 v[3:4], v[1:2], off
 ; CHECK-NEXT:    s_endpgm
 entry:
   %0 = load i32, ptr addrspace(4) null, align 4

diff  --git a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
index d4941235680bf9..b178cf9ac52948 100644
--- a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
+++ b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
@@ -12,9 +12,10 @@ define internal fastcc void @widget() {
 ; GFX90A-NEXT:    s_mov_b32 s33, s32
 ; GFX90A-NEXT:    s_or_saveexec_b64 s[18:19], -1
 ; GFX90A-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX90A-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX90A-NEXT:    s_mov_b64 exec, s[18:19]
 ; GFX90A-NEXT:    s_addk_i32 s32, 0x400
-; GFX90A-NEXT:    v_writelane_b32 v40, s16, 2
+; GFX90A-NEXT:    v_writelane_b32 v41, s16, 0
 ; GFX90A-NEXT:    s_getpc_b64 s[16:17]
 ; GFX90A-NEXT:    s_add_u32 s16, s16, wobble at gotpcrel32@lo+4
 ; GFX90A-NEXT:    s_addc_u32 s17, s17, wobble at gotpcrel32@hi+12
@@ -34,12 +35,12 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[36:37], s[6:7]
 ; GLOBALNESS1-NEXT:    s_load_dwordx4 s[76:79], s[8:9], 0x0
 ; GLOBALNESS1-NEXT:    s_load_dword s6, s[8:9], 0x14
-; GLOBALNESS1-NEXT:    v_mov_b32_e32 v41, v0
-; GLOBALNESS1-NEXT:    v_mov_b32_e32 v42, 0
+; GLOBALNESS1-NEXT:    v_mov_b32_e32 v42, v0
+; GLOBALNESS1-NEXT:    v_mov_b32_e32 v40, 0
 ; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[0:1], 0, 0
-; GLOBALNESS1-NEXT:    global_store_dword v[0:1], v42, off
+; GLOBALNESS1-NEXT:    global_store_dword v[0:1], v40, off
 ; GLOBALNESS1-NEXT:    s_waitcnt lgkmcnt(0)
-; GLOBALNESS1-NEXT:    global_load_dword v0, v42, s[76:77]
+; GLOBALNESS1-NEXT:    global_load_dword v0, v40, s[76:77]
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[40:41], s[4:5]
 ; GLOBALNESS1-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x18
 ; GLOBALNESS1-NEXT:    s_load_dword s7, s[8:9], 0x20
@@ -47,10 +48,10 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
 ; GLOBALNESS1-NEXT:    s_add_u32 s0, s0, s17
 ; GLOBALNESS1-NEXT:    s_addc_u32 s1, s1, 0
-; GLOBALNESS1-NEXT:    v_mov_b32_e32 v43, 0x40994400
+; GLOBALNESS1-NEXT:    v_mov_b32_e32 v41, 0x40994400
 ; GLOBALNESS1-NEXT:    s_bitcmp1_b32 s78, 0
 ; GLOBALNESS1-NEXT:    s_waitcnt lgkmcnt(0)
-; GLOBALNESS1-NEXT:    v_cmp_ngt_f64_e32 vcc, s[4:5], v[42:43]
+; GLOBALNESS1-NEXT:    v_cmp_ngt_f64_e32 vcc, s[4:5], v[40:41]
 ; GLOBALNESS1-NEXT:    v_cmp_ngt_f64_e64 s[4:5], s[4:5], 0
 ; GLOBALNESS1-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[4:5]
 ; GLOBALNESS1-NEXT:    s_cselect_b64 s[4:5], -1, 0
@@ -116,9 +117,9 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    ; =>This Loop Header: Depth=1
 ; GLOBALNESS1-NEXT:    ; Child Loop BB1_15 Depth 2
 ; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[0:1], s[74:75], s[74:75] op_sel:[0,1]
-; GLOBALNESS1-NEXT:    flat_load_dword v40, v[0:1]
+; GLOBALNESS1-NEXT:    flat_load_dword v43, v[0:1]
 ; GLOBALNESS1-NEXT:    s_add_u32 s8, s38, 40
-; GLOBALNESS1-NEXT:    buffer_store_dword v42, off, s[0:3], 0
+; GLOBALNESS1-NEXT:    buffer_store_dword v40, off, s[0:3], 0
 ; GLOBALNESS1-NEXT:    flat_load_dword v46, v[0:1]
 ; GLOBALNESS1-NEXT:    s_addc_u32 s9, s39, 0
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[4:5], s[40:41]
@@ -127,7 +128,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    s_mov_b32 s12, s72
 ; GLOBALNESS1-NEXT:    s_mov_b32 s13, s71
 ; GLOBALNESS1-NEXT:    s_mov_b32 s14, s70
-; GLOBALNESS1-NEXT:    v_mov_b32_e32 v31, v41
+; GLOBALNESS1-NEXT:    v_mov_b32_e32 v31, v42
 ; GLOBALNESS1-NEXT:    s_waitcnt lgkmcnt(0)
 ; GLOBALNESS1-NEXT:    s_swappc_b64 s[30:31], s[76:77]
 ; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[46:47]
@@ -170,13 +171,13 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_12
 ; GLOBALNESS1-NEXT:  ; %bb.11: ; %bb39.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT:    v_mov_b32_e32 v43, v42
+; GLOBALNESS1-NEXT:    v_mov_b32_e32 v41, v40
 ; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[2:3], 0, 0
-; GLOBALNESS1-NEXT:    global_store_dwordx2 v[2:3], v[42:43], off
+; GLOBALNESS1-NEXT:    global_store_dwordx2 v[2:3], v[40:41], off
 ; GLOBALNESS1-NEXT:  .LBB1_12: ; %bb44.lr.ph.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v46
-; GLOBALNESS1-NEXT:    v_cndmask_b32_e32 v2, 0, v40, vcc
+; GLOBALNESS1-NEXT:    v_cndmask_b32_e32 v2, 0, v43, vcc
 ; GLOBALNESS1-NEXT:    s_waitcnt vmcnt(0)
 ; GLOBALNESS1-NEXT:    v_cmp_nlt_f64_e64 s[64:65], 0, v[0:1]
 ; GLOBALNESS1-NEXT:    v_cmp_eq_u32_e64 s[66:67], 0, v2
@@ -223,7 +224,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    s_mov_b32 s12, s72
 ; GLOBALNESS1-NEXT:    s_mov_b32 s13, s71
 ; GLOBALNESS1-NEXT:    s_mov_b32 s14, s70
-; GLOBALNESS1-NEXT:    v_mov_b32_e32 v31, v41
+; GLOBALNESS1-NEXT:    v_mov_b32_e32 v31, v42
 ; GLOBALNESS1-NEXT:    s_swappc_b64 s[30:31], s[76:77]
 ; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[46:47], 0, 0
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[4:5], s[40:41]
@@ -233,15 +234,15 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    s_mov_b32 s12, s72
 ; GLOBALNESS1-NEXT:    s_mov_b32 s13, s71
 ; GLOBALNESS1-NEXT:    s_mov_b32 s14, s70
-; GLOBALNESS1-NEXT:    v_mov_b32_e32 v31, v41
+; GLOBALNESS1-NEXT:    v_mov_b32_e32 v31, v42
 ; GLOBALNESS1-NEXT:    global_store_dwordx2 v[46:47], v[44:45], off
 ; GLOBALNESS1-NEXT:    s_swappc_b64 s[30:31], s[76:77]
 ; GLOBALNESS1-NEXT:    s_and_saveexec_b64 s[4:5], s[66:67]
 ; GLOBALNESS1-NEXT:    s_cbranch_execz .LBB1_13
 ; GLOBALNESS1-NEXT:  ; %bb.22: ; %bb62.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_15 Depth=2
-; GLOBALNESS1-NEXT:    v_mov_b32_e32 v43, v42
-; GLOBALNESS1-NEXT:    global_store_dwordx2 v[46:47], v[42:43], off
+; GLOBALNESS1-NEXT:    v_mov_b32_e32 v41, v40
+; GLOBALNESS1-NEXT:    global_store_dwordx2 v[46:47], v[40:41], off
 ; GLOBALNESS1-NEXT:    s_branch .LBB1_13
 ; GLOBALNESS1-NEXT:  .LBB1_23: ; %LeafBlock
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
@@ -268,15 +269,15 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_1
 ; GLOBALNESS1-NEXT:  ; %bb.28: ; %bb69.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT:    v_mov_b32_e32 v43, v42
+; GLOBALNESS1-NEXT:    v_mov_b32_e32 v41, v40
 ; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[2:3], 0, 0
-; GLOBALNESS1-NEXT:    global_store_dwordx2 v[2:3], v[42:43], off
+; GLOBALNESS1-NEXT:    global_store_dwordx2 v[2:3], v[40:41], off
 ; GLOBALNESS1-NEXT:    s_branch .LBB1_1
 ; GLOBALNESS1-NEXT:  .LBB1_29: ; %bb73.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT:    v_mov_b32_e32 v43, v42
+; GLOBALNESS1-NEXT:    v_mov_b32_e32 v41, v40
 ; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[2:3], 0, 0
-; GLOBALNESS1-NEXT:    global_store_dwordx2 v[2:3], v[42:43], off
+; GLOBALNESS1-NEXT:    global_store_dwordx2 v[2:3], v[40:41], off
 ; GLOBALNESS1-NEXT:    s_branch .LBB1_2
 ; GLOBALNESS1-NEXT:  .LBB1_30: ; %loop.exit.guard
 ; GLOBALNESS1-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
@@ -291,7 +292,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    s_mov_b32 s12, s72
 ; GLOBALNESS1-NEXT:    s_mov_b32 s13, s71
 ; GLOBALNESS1-NEXT:    s_mov_b32 s14, s70
-; GLOBALNESS1-NEXT:    v_mov_b32_e32 v31, v41
+; GLOBALNESS1-NEXT:    v_mov_b32_e32 v31, v42
 ; GLOBALNESS1-NEXT:    s_getpc_b64 s[16:17]
 ; GLOBALNESS1-NEXT:    s_add_u32 s16, s16, widget at rel32@lo+4
 ; GLOBALNESS1-NEXT:    s_addc_u32 s17, s17, widget at rel32@hi+12
@@ -309,7 +310,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    s_mov_b32 s12, s72
 ; GLOBALNESS1-NEXT:    s_mov_b32 s13, s71
 ; GLOBALNESS1-NEXT:    s_mov_b32 s14, s70
-; GLOBALNESS1-NEXT:    v_mov_b32_e32 v31, v41
+; GLOBALNESS1-NEXT:    v_mov_b32_e32 v31, v42
 ; GLOBALNESS1-NEXT:    s_getpc_b64 s[16:17]
 ; GLOBALNESS1-NEXT:    s_add_u32 s16, s16, widget at rel32@lo+4
 ; GLOBALNESS1-NEXT:    s_addc_u32 s17, s17, widget at rel32@hi+12
@@ -321,12 +322,12 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[36:37], s[6:7]
 ; GLOBALNESS0-NEXT:    s_load_dwordx4 s[72:75], s[8:9], 0x0
 ; GLOBALNESS0-NEXT:    s_load_dword s6, s[8:9], 0x14
-; GLOBALNESS0-NEXT:    v_mov_b32_e32 v41, v0
-; GLOBALNESS0-NEXT:    v_mov_b32_e32 v42, 0
+; GLOBALNESS0-NEXT:    v_mov_b32_e32 v42, v0
+; GLOBALNESS0-NEXT:    v_mov_b32_e32 v40, 0
 ; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[0:1], 0, 0
-; GLOBALNESS0-NEXT:    global_store_dword v[0:1], v42, off
+; GLOBALNESS0-NEXT:    global_store_dword v[0:1], v40, off
 ; GLOBALNESS0-NEXT:    s_waitcnt lgkmcnt(0)
-; GLOBALNESS0-NEXT:    global_load_dword v0, v42, s[72:73]
+; GLOBALNESS0-NEXT:    global_load_dword v0, v40, s[72:73]
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[40:41], s[4:5]
 ; GLOBALNESS0-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x18
 ; GLOBALNESS0-NEXT:    s_load_dword s7, s[8:9], 0x20
@@ -334,10 +335,10 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
 ; GLOBALNESS0-NEXT:    s_add_u32 s0, s0, s17
 ; GLOBALNESS0-NEXT:    s_addc_u32 s1, s1, 0
-; GLOBALNESS0-NEXT:    v_mov_b32_e32 v43, 0x40994400
+; GLOBALNESS0-NEXT:    v_mov_b32_e32 v41, 0x40994400
 ; GLOBALNESS0-NEXT:    s_bitcmp1_b32 s74, 0
 ; GLOBALNESS0-NEXT:    s_waitcnt lgkmcnt(0)
-; GLOBALNESS0-NEXT:    v_cmp_ngt_f64_e32 vcc, s[4:5], v[42:43]
+; GLOBALNESS0-NEXT:    v_cmp_ngt_f64_e32 vcc, s[4:5], v[40:41]
 ; GLOBALNESS0-NEXT:    v_cmp_ngt_f64_e64 s[4:5], s[4:5], 0
 ; GLOBALNESS0-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[4:5]
 ; GLOBALNESS0-NEXT:    s_cselect_b64 s[4:5], -1, 0
@@ -403,9 +404,9 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    ; =>This Loop Header: Depth=1
 ; GLOBALNESS0-NEXT:    ; Child Loop BB1_15 Depth 2
 ; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[0:1], s[76:77], s[76:77] op_sel:[0,1]
-; GLOBALNESS0-NEXT:    flat_load_dword v40, v[0:1]
+; GLOBALNESS0-NEXT:    flat_load_dword v43, v[0:1]
 ; GLOBALNESS0-NEXT:    s_add_u32 s8, s38, 40
-; GLOBALNESS0-NEXT:    buffer_store_dword v42, off, s[0:3], 0
+; GLOBALNESS0-NEXT:    buffer_store_dword v40, off, s[0:3], 0
 ; GLOBALNESS0-NEXT:    flat_load_dword v46, v[0:1]
 ; GLOBALNESS0-NEXT:    s_addc_u32 s9, s39, 0
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[4:5], s[40:41]
@@ -414,7 +415,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    s_mov_b32 s12, s70
 ; GLOBALNESS0-NEXT:    s_mov_b32 s13, s69
 ; GLOBALNESS0-NEXT:    s_mov_b32 s14, s68
-; GLOBALNESS0-NEXT:    v_mov_b32_e32 v31, v41
+; GLOBALNESS0-NEXT:    v_mov_b32_e32 v31, v42
 ; GLOBALNESS0-NEXT:    s_waitcnt lgkmcnt(0)
 ; GLOBALNESS0-NEXT:    s_swappc_b64 s[30:31], s[78:79]
 ; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[46:47]
@@ -457,13 +458,13 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_12
 ; GLOBALNESS0-NEXT:  ; %bb.11: ; %bb39.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT:    v_mov_b32_e32 v43, v42
+; GLOBALNESS0-NEXT:    v_mov_b32_e32 v41, v40
 ; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[2:3], 0, 0
-; GLOBALNESS0-NEXT:    global_store_dwordx2 v[2:3], v[42:43], off
+; GLOBALNESS0-NEXT:    global_store_dwordx2 v[2:3], v[40:41], off
 ; GLOBALNESS0-NEXT:  .LBB1_12: ; %bb44.lr.ph.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v46
-; GLOBALNESS0-NEXT:    v_cndmask_b32_e32 v2, 0, v40, vcc
+; GLOBALNESS0-NEXT:    v_cndmask_b32_e32 v2, 0, v43, vcc
 ; GLOBALNESS0-NEXT:    s_waitcnt vmcnt(0)
 ; GLOBALNESS0-NEXT:    v_cmp_nlt_f64_e64 s[64:65], 0, v[0:1]
 ; GLOBALNESS0-NEXT:    v_cmp_eq_u32_e64 s[66:67], 0, v2
@@ -510,7 +511,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    s_mov_b32 s12, s70
 ; GLOBALNESS0-NEXT:    s_mov_b32 s13, s69
 ; GLOBALNESS0-NEXT:    s_mov_b32 s14, s68
-; GLOBALNESS0-NEXT:    v_mov_b32_e32 v31, v41
+; GLOBALNESS0-NEXT:    v_mov_b32_e32 v31, v42
 ; GLOBALNESS0-NEXT:    s_swappc_b64 s[30:31], s[78:79]
 ; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[46:47], 0, 0
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[4:5], s[40:41]
@@ -520,15 +521,15 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    s_mov_b32 s12, s70
 ; GLOBALNESS0-NEXT:    s_mov_b32 s13, s69
 ; GLOBALNESS0-NEXT:    s_mov_b32 s14, s68
-; GLOBALNESS0-NEXT:    v_mov_b32_e32 v31, v41
+; GLOBALNESS0-NEXT:    v_mov_b32_e32 v31, v42
 ; GLOBALNESS0-NEXT:    global_store_dwordx2 v[46:47], v[44:45], off
 ; GLOBALNESS0-NEXT:    s_swappc_b64 s[30:31], s[78:79]
 ; GLOBALNESS0-NEXT:    s_and_saveexec_b64 s[4:5], s[66:67]
 ; GLOBALNESS0-NEXT:    s_cbranch_execz .LBB1_13
 ; GLOBALNESS0-NEXT:  ; %bb.22: ; %bb62.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_15 Depth=2
-; GLOBALNESS0-NEXT:    v_mov_b32_e32 v43, v42
-; GLOBALNESS0-NEXT:    global_store_dwordx2 v[46:47], v[42:43], off
+; GLOBALNESS0-NEXT:    v_mov_b32_e32 v41, v40
+; GLOBALNESS0-NEXT:    global_store_dwordx2 v[46:47], v[40:41], off
 ; GLOBALNESS0-NEXT:    s_branch .LBB1_13
 ; GLOBALNESS0-NEXT:  .LBB1_23: ; %LeafBlock
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
@@ -555,15 +556,15 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_1
 ; GLOBALNESS0-NEXT:  ; %bb.28: ; %bb69.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT:    v_mov_b32_e32 v43, v42
+; GLOBALNESS0-NEXT:    v_mov_b32_e32 v41, v40
 ; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[2:3], 0, 0
-; GLOBALNESS0-NEXT:    global_store_dwordx2 v[2:3], v[42:43], off
+; GLOBALNESS0-NEXT:    global_store_dwordx2 v[2:3], v[40:41], off
 ; GLOBALNESS0-NEXT:    s_branch .LBB1_1
 ; GLOBALNESS0-NEXT:  .LBB1_29: ; %bb73.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT:    v_mov_b32_e32 v43, v42
+; GLOBALNESS0-NEXT:    v_mov_b32_e32 v41, v40
 ; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[2:3], 0, 0
-; GLOBALNESS0-NEXT:    global_store_dwordx2 v[2:3], v[42:43], off
+; GLOBALNESS0-NEXT:    global_store_dwordx2 v[2:3], v[40:41], off
 ; GLOBALNESS0-NEXT:    s_branch .LBB1_2
 ; GLOBALNESS0-NEXT:  .LBB1_30: ; %loop.exit.guard
 ; GLOBALNESS0-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
@@ -578,7 +579,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    s_mov_b32 s12, s70
 ; GLOBALNESS0-NEXT:    s_mov_b32 s13, s69
 ; GLOBALNESS0-NEXT:    s_mov_b32 s14, s68
-; GLOBALNESS0-NEXT:    v_mov_b32_e32 v31, v41
+; GLOBALNESS0-NEXT:    v_mov_b32_e32 v31, v42
 ; GLOBALNESS0-NEXT:    s_getpc_b64 s[16:17]
 ; GLOBALNESS0-NEXT:    s_add_u32 s16, s16, widget at rel32@lo+4
 ; GLOBALNESS0-NEXT:    s_addc_u32 s17, s17, widget at rel32@hi+12
@@ -596,7 +597,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    s_mov_b32 s12, s70
 ; GLOBALNESS0-NEXT:    s_mov_b32 s13, s69
 ; GLOBALNESS0-NEXT:    s_mov_b32 s14, s68
-; GLOBALNESS0-NEXT:    v_mov_b32_e32 v31, v41
+; GLOBALNESS0-NEXT:    v_mov_b32_e32 v31, v42
 ; GLOBALNESS0-NEXT:    s_getpc_b64 s[16:17]
 ; GLOBALNESS0-NEXT:    s_add_u32 s16, s16, widget at rel32@lo+4
 ; GLOBALNESS0-NEXT:    s_addc_u32 s17, s17, widget at rel32@hi+12

diff  --git a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll
index 9ca20fab201215..64fc82df8c99ca 100644
--- a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll
+++ b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll
@@ -10,8 +10,9 @@ define hidden void @widget() {
 ; GCN-NEXT:    s_mov_b32 s33, s32
 ; GCN-NEXT:    s_or_saveexec_b64 s[18:19], -1
 ; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 exec, s[18:19]
-; GCN-NEXT:    v_writelane_b32 v40, s16, 16
+; GCN-NEXT:    v_writelane_b32 v42, s16, 0
 ; GCN-NEXT:    s_addk_i32 s32, 0x400
 ; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GCN-NEXT:    v_writelane_b32 v40, s30, 0
@@ -119,9 +120,10 @@ define hidden void @widget() {
 ; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v40, 0
 ; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
-; GCN-NEXT:    v_readlane_b32 s4, v40, 16
+; GCN-NEXT:    v_readlane_b32 s4, v42, 0
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_mov_b64 exec, s[6:7]
 ; GCN-NEXT:    s_addk_i32 s32, 0xfc00
 ; GCN-NEXT:    s_mov_b32 s33, s4
@@ -267,8 +269,9 @@ define hidden void @blam() {
 ; GCN-NEXT:    s_mov_b32 s33, s32
 ; GCN-NEXT:    s_or_saveexec_b64 s[18:19], -1
 ; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 exec, s[18:19]
-; GCN-NEXT:    v_writelane_b32 v40, s16, 28
+; GCN-NEXT:    v_writelane_b32 v46, s16, 0
 ; GCN-NEXT:    s_addk_i32 s32, 0x800
 ; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
 ; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
@@ -471,9 +474,10 @@ define hidden void @blam() {
 ; GCN-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
 ; GCN-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
 ; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
-; GCN-NEXT:    v_readlane_b32 s4, v40, 28
+; GCN-NEXT:    v_readlane_b32 s4, v46, 0
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v46, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_mov_b64 exec, s[6:7]
 ; GCN-NEXT:    s_addk_i32 s32, 0xf800
 ; GCN-NEXT:    s_mov_b32 s33, s4

diff  --git a/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll b/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll
index 2e1b86f69a521f..240d6a17f62884 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll
@@ -13,61 +13,47 @@ define amdgpu_kernel void @__omp_offloading_16_dd2df_main_l9()  {
 ; CHECK:       ; %bb.0: ; %bb
 ; CHECK-NEXT:    s_add_u32 s0, s0, s15
 ; CHECK-NEXT:    s_addc_u32 s1, s1, 0
-; CHECK-NEXT:    ; implicit-def: $vgpr1
 ; CHECK-NEXT:    v_mov_b32_e32 v2, v0
-; CHECK-NEXT:    s_or_saveexec_b64 s[8:9], -1
-; CHECK-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload
-; CHECK-NEXT:    s_mov_b64 exec, s[8:9]
-; CHECK-NEXT:    v_mov_b32_e32 v1, 0
-; CHECK-NEXT:    global_load_ushort v3, v1, s[4:5] offset:4
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-NEXT:    global_load_ushort v3, v0, s[4:5] offset:4
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_dword v3, off, s[0:3], 0 offset:8 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword v3, off, s[0:3], 0 offset:4 ; 4-byte Folded Spill
 ; CHECK-NEXT:    ; implicit-def: $sgpr4
 ; CHECK-NEXT:    s_mov_b32 s4, 0
 ; CHECK-NEXT:    v_cmp_eq_u32_e64 s[6:7], v2, s4
 ; CHECK-NEXT:    s_mov_b32 s4, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v2, s4
-; CHECK-NEXT:    ds_write_b8 v1, v2
+; CHECK-NEXT:    ds_write_b8 v0, v2
 ; CHECK-NEXT:    s_mov_b64 s[4:5], exec
-; CHECK-NEXT:    v_writelane_b32 v0, s4, 0
-; CHECK-NEXT:    v_writelane_b32 v0, s5, 1
-; CHECK-NEXT:    s_or_saveexec_b64 s[8:9], -1
-; CHECK-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Spill
-; CHECK-NEXT:    s_mov_b64 exec, s[8:9]
+; CHECK-NEXT:    v_writelane_b32 v1, s4, 0
+; CHECK-NEXT:    v_writelane_b32 v1, s5, 1
 ; CHECK-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
 ; CHECK-NEXT:    s_mov_b64 exec, s[4:5]
 ; CHECK-NEXT:    s_cbranch_execz .LBB0_2
 ; CHECK-NEXT:  ; %bb.1: ; %bb193
 ; CHECK-NEXT:  .LBB0_2: ; %bb194
-; CHECK-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:8 ; 4-byte Folded Reload
-; CHECK-NEXT:    s_or_saveexec_b64 s[8:9], -1
-; CHECK-NEXT:    buffer_load_dword v1, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload
-; CHECK-NEXT:    s_mov_b64 exec, s[8:9]
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload
 ; CHECK-NEXT:    v_readlane_b32 s4, v1, 0
 ; CHECK-NEXT:    v_readlane_b32 s5, v1, 1
 ; CHECK-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; CHECK-NEXT:    s_mov_b32 s4, 0
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    v_cmp_ne_u16_e64 s[4:5], v0, s4
 ; CHECK-NEXT:    s_and_b64 vcc, exec, s[4:5]
 ; CHECK-NEXT:    s_cbranch_vccnz .LBB0_4
 ; CHECK-NEXT:  ; %bb.3: ; %bb201
-; CHECK-NEXT:    buffer_load_dword v1, off, s[0:3], 0 offset:8 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v2, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload
 ; CHECK-NEXT:    s_getpc_b64 s[4:5]
 ; CHECK-NEXT:    s_add_u32 s4, s4, V2 at rel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s5, s5, V2 at rel32@hi+12
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_short v0, v1, s[4:5]
+; CHECK-NEXT:    global_store_short v0, v2, s[4:5]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_barrier
 ; CHECK-NEXT:    s_trap 2
 ; CHECK-NEXT:    ; divergent unreachable
 ; CHECK-NEXT:  .LBB0_4: ; %UnifiedReturnBlock
-; CHECK-NEXT:    s_or_saveexec_b64 s[8:9], -1
-; CHECK-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload
-; CHECK-NEXT:    s_mov_b64 exec, s[8:9]
-; CHECK-NEXT:    ; kill: killed $vgpr0
 ; CHECK-NEXT:    s_endpgm
 bb:
   %i10 = tail call i32 @llvm.amdgcn.workitem.id.x()

diff  --git a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll
index 92efbe5a718263..9448b287d17174 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll
@@ -15,6 +15,7 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX9-NEXT:    v_mov_b32_e32 v36, v16
 ; GFX9-NEXT:    v_mov_b32_e32 v35, v15
@@ -35,7 +36,7 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
 ; GFX9-NEXT:    ;;#ASMEND
 ; GFX9-NEXT:    image_gather4_c_b_cl v[41:44], v[32:36], s[4:11], s[4:7] dmask:0x1
 ; GFX9-NEXT:    s_addk_i32 s32, 0x800
-; GFX9-NEXT:    v_writelane_b32 v40, s4, 2
+; GFX9-NEXT:    v_writelane_b32 v45, s4, 0
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, extern_func at gotpcrel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s5, s5, extern_func at gotpcrel32@hi+12
@@ -54,9 +55,10 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
 ; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s4, v45, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v45, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xf800
 ; GFX9-NEXT:    s_mov_b32 s33, s4
@@ -70,6 +72,7 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s5, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s5
 ; GFX10-NEXT:    v_mov_b32_e32 v36, v16
@@ -91,12 +94,12 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
 ; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    image_gather4_c_b_cl v[41:44], v[32:36], s[4:11], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D
 ; GFX10-NEXT:    s_addk_i32 s32, 0x400
-; GFX10-NEXT:    v_writelane_b32 v40, s4, 2
+; GFX10-NEXT:    v_writelane_b32 v45, s4, 0
 ; GFX10-NEXT:    s_getpc_b64 s[4:5]
 ; GFX10-NEXT:    s_add_u32 s4, s4, extern_func at gotpcrel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s5, s5, extern_func at gotpcrel32@hi+12
-; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
@@ -111,9 +114,11 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
 ; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:12
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s4, v45, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s5, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:16
+; GFX10-NEXT:    buffer_load_dword v45, off, s[0:3], s33 offset:20
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s5
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfc00
@@ -127,7 +132,9 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:16 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:16
+; GFX11-NEXT:    scratch_store_b32 off, v45, s33 offset:20
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    v_dual_mov_b32 v36, v16 :: v_dual_mov_b32 v35, v15
 ; GFX11-NEXT:    v_dual_mov_b32 v34, v14 :: v_dual_mov_b32 v33, v13
@@ -147,12 +154,12 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
 ; GFX11-NEXT:    ;;#ASMEND
 ; GFX11-NEXT:    image_gather4_c_b_cl v[41:44], v[32:36], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D
 ; GFX11-NEXT:    s_add_i32 s32, s32, 32
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    v_writelane_b32 v45, s0, 0
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, extern_func at gotpcrel32@lo+4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, extern_func at gotpcrel32@hi+12
-; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
@@ -165,9 +172,11 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
 ; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:12
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s0, v45, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:16 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:16
+; GFX11-NEXT:    scratch_load_b32 v45, off, s33 offset:20
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_addk_i32 s32, 0xffe0
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -205,6 +214,7 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
@@ -218,7 +228,7 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
 ; GFX9-NEXT:    v_mov_b32_e32 v41, v12
 ; GFX9-NEXT:    image_gather4_c_b_cl v[0:3], v[41:45], s[4:11], s[4:7] dmask:0x1
 ; GFX9-NEXT:    s_addk_i32 s32, 0x800
-; GFX9-NEXT:    v_writelane_b32 v40, s4, 2
+; GFX9-NEXT:    v_writelane_b32 v46, s4, 0
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, extern_func at gotpcrel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s5, s5, extern_func at gotpcrel32@hi+12
@@ -238,9 +248,10 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
 ; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s4, v46, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v46, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xf800
 ; GFX9-NEXT:    s_mov_b32 s33, s4
@@ -254,6 +265,7 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s5, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s5
 ; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
@@ -263,18 +275,18 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
 ; GFX10-NEXT:    buffer_store_dword v45, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    image_gather4_c_b_cl v[0:3], v[12:16], s[4:11], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D
 ; GFX10-NEXT:    s_addk_i32 s32, 0x400
-; GFX10-NEXT:    v_writelane_b32 v40, s4, 2
+; GFX10-NEXT:    v_writelane_b32 v46, s4, 0
 ; GFX10-NEXT:    s_getpc_b64 s[4:5]
 ; GFX10-NEXT:    s_add_u32 s4, s4, extern_func at gotpcrel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s5, s5, extern_func at gotpcrel32@hi+12
-; GFX10-NEXT:    v_mov_b32_e32 v41, v16
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX10-NEXT:    v_mov_b32_e32 v41, v16
 ; GFX10-NEXT:    v_mov_b32_e32 v42, v15
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v43, v14
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    v_mov_b32_e32 v44, v13
 ; GFX10-NEXT:    v_mov_b32_e32 v45, v12
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
@@ -288,9 +300,11 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
 ; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:16
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s4, v46, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s5, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:20
+; GFX10-NEXT:    buffer_load_dword v46, off, s[0:3], s33 offset:24
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s5
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfc00
@@ -304,7 +318,9 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:20 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:20
+; GFX11-NEXT:    scratch_store_b32 off, v46, s33 offset:24
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_clause 0x4
 ; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:16
@@ -314,16 +330,16 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
 ; GFX11-NEXT:    scratch_store_b32 off, v45, s33
 ; GFX11-NEXT:    image_gather4_c_b_cl v[0:3], v[12:16], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D
 ; GFX11-NEXT:    s_add_i32 s32, s32, 32
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    v_writelane_b32 v46, s0, 0
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, extern_func at gotpcrel32@lo+4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, extern_func at gotpcrel32@hi+12
-; GFX11-NEXT:    v_dual_mov_b32 v41, v16 :: v_dual_mov_b32 v42, v15
-; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT:    v_dual_mov_b32 v41, v16 :: v_dual_mov_b32 v42, v15
 ; GFX11-NEXT:    v_dual_mov_b32 v43, v14 :: v_dual_mov_b32 v44, v13
-; GFX11-NEXT:    v_mov_b32_e32 v45, v12
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-NEXT:    v_mov_b32_e32 v45, v12
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b128 v[0:1], v[0:3], off
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
@@ -337,9 +353,11 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
 ; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:16
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s0, v46, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:20 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:20
+; GFX11-NEXT:    scratch_load_b32 v46, off, s33 offset:24
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_addk_i32 s32, 0xffe0
 ; GFX11-NEXT:    s_mov_b32 s33, s0

diff  --git a/llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll b/llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll
index c2b86a5135022a..09106f0da591a6 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll
@@ -9,21 +9,17 @@ declare void @unknown_call()
 define protected amdgpu_kernel void @kern(ptr %addr) !llvm.amdgcn.lds.kernel.id !0 {
 ; CHECK-LABEL: kern:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_mov_b32 s32, 0x200
+; CHECK-NEXT:    s_mov_b32 s32, 0
 ; CHECK-NEXT:    s_add_u32 s12, s12, s17
 ; CHECK-NEXT:    s_addc_u32 s13, s13, 0
 ; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
 ; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
 ; CHECK-NEXT:    s_add_u32 s0, s0, s17
 ; CHECK-NEXT:    s_addc_u32 s1, s1, 0
-; CHECK-NEXT:    ; implicit-def: $vgpr3
-; CHECK-NEXT:    v_writelane_b32 v3, s16, 0
-; CHECK-NEXT:    s_or_saveexec_b32 s33, -1
-; CHECK-NEXT:    buffer_store_dword v3, off, s[0:3], 0 offset:4 ; 4-byte Folded Spill
-; CHECK-NEXT:    s_mov_b32 exec_lo, s33
+; CHECK-NEXT:    v_writelane_b32 v40, s16, 0
 ; CHECK-NEXT:    s_mov_b32 s13, s15
 ; CHECK-NEXT:    s_mov_b32 s12, s14
-; CHECK-NEXT:    v_readlane_b32 s14, v3, 0
+; CHECK-NEXT:    v_readlane_b32 s14, v40, 0
 ; CHECK-NEXT:    s_mov_b64 s[16:17], s[8:9]
 ; CHECK-NEXT:    s_load_dwordx2 s[8:9], s[16:17], 0x0
 ; CHECK-NEXT:    v_mov_b32_e32 v5, 42
@@ -56,10 +52,6 @@ define protected amdgpu_kernel void @kern(ptr %addr) !llvm.amdgcn.lds.kernel.id
 ; CHECK-NEXT:    s_mov_b64 s[2:3], s[22:23]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; CHECK-NEXT:    s_or_saveexec_b32 s33, -1
-; CHECK-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload
-; CHECK-NEXT:    s_mov_b32 exec_lo, s33
-; CHECK-NEXT:    ; kill: killed $vgpr0
 ; CHECK-NEXT:    s_endpgm
   store i32 42, ptr %addr
   call fastcc void @unknown_call()

diff  --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll
index 2ddde62cdb6fc6..56d54a4c90fe17 100644
--- a/llvm/test/CodeGen/AMDGPU/wave32.ll
+++ b/llvm/test/CodeGen/AMDGPU/wave32.ll
@@ -2856,23 +2856,26 @@ define void @callee_no_stack_with_call() #1 {
 ; GFX1032-NEXT:    s_mov_b32 s33, s32
 ; GFX1032-NEXT:    s_or_saveexec_b32 s17, -1
 ; GFX1032-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX1032-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032-NEXT:    s_mov_b32 exec_lo, s17
 ; GFX1032-NEXT:    s_addk_i32 s32, 0x200
-; GFX1032-NEXT:    v_writelane_b32 v40, s16, 2
+; GFX1032-NEXT:    v_writelane_b32 v41, s16, 0
 ; GFX1032-NEXT:    s_getpc_b64 s[16:17]
 ; GFX1032-NEXT:    s_add_u32 s16, s16, external_void_func_void at gotpcrel32@lo+4
 ; GFX1032-NEXT:    s_addc_u32 s17, s17, external_void_func_void at gotpcrel32@hi+12
-; GFX1032-NEXT:    s_load_dwordx2 s[16:17], s[16:17], 0x0
 ; GFX1032-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX1032-NEXT:    s_load_dwordx2 s[16:17], s[16:17], 0x0
 ; GFX1032-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1032-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX1032-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX1032-NEXT:    v_readlane_b32 s4, v40, 2
+; GFX1032-NEXT:    v_readlane_b32 s4, v41, 0
 ; GFX1032-NEXT:    s_or_saveexec_b32 s5, -1
-; GFX1032-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX1032-NEXT:    s_clause 0x1
+; GFX1032-NEXT:    buffer_load_dword v40, off, s[0:3], s33
+; GFX1032-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032-NEXT:    s_mov_b32 exec_lo, s5
 ; GFX1032-NEXT:    s_addk_i32 s32, 0xfe00
@@ -2887,23 +2890,26 @@ define void @callee_no_stack_with_call() #1 {
 ; GFX1064-NEXT:    s_mov_b32 s33, s32
 ; GFX1064-NEXT:    s_or_saveexec_b64 s[18:19], -1
 ; GFX1064-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX1064-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064-NEXT:    s_mov_b64 exec, s[18:19]
 ; GFX1064-NEXT:    s_addk_i32 s32, 0x400
-; GFX1064-NEXT:    v_writelane_b32 v40, s16, 2
+; GFX1064-NEXT:    v_writelane_b32 v41, s16, 0
 ; GFX1064-NEXT:    s_getpc_b64 s[16:17]
 ; GFX1064-NEXT:    s_add_u32 s16, s16, external_void_func_void at gotpcrel32@lo+4
 ; GFX1064-NEXT:    s_addc_u32 s17, s17, external_void_func_void at gotpcrel32@hi+12
-; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[16:17], 0x0
 ; GFX1064-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[16:17], 0x0
 ; GFX1064-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1064-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX1064-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX1064-NEXT:    v_readlane_b32 s4, v40, 2
+; GFX1064-NEXT:    v_readlane_b32 s4, v41, 0
 ; GFX1064-NEXT:    s_or_saveexec_b64 s[6:7], -1
-; GFX1064-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX1064-NEXT:    s_clause 0x1
+; GFX1064-NEXT:    buffer_load_dword v40, off, s[0:3], s33
+; GFX1064-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX1064-NEXT:    s_addk_i32 s32, 0xfc00

diff  --git a/llvm/test/CodeGen/AMDGPU/whole-wave-register-copy.ll b/llvm/test/CodeGen/AMDGPU/whole-wave-register-copy.ll
deleted file mode 100644
index cb4e2e2bf5a472..00000000000000
--- a/llvm/test/CodeGen/AMDGPU/whole-wave-register-copy.ll
+++ /dev/null
@@ -1,68 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefix=GFX90A %s
-
-; The test forces a high vector register pressure and there won't be sufficient VGPRs to be allocated
-; for writelane/readlane SGPR spill instructions. Regalloc would split the vector register liverange
-; by introducing a copy to AGPR register. The VGPR store to AGPR (v_accvgpr_write_b32) and later the
-; restore from AGPR (v_accvgpr_read_b32) should be whole-wave operations and hence exec mask should be
-; manipulated to ensure all lanes are active when these instructions are executed.
-define void @vector_reg_liverange_split() #0 {
-; GFX90A-LABEL: vector_reg_liverange_split:
-; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    s_mov_b32 s16, s33
-; GFX90A-NEXT:    s_mov_b32 s33, s32
-; GFX90A-NEXT:    s_xor_saveexec_b64 s[18:19], -1
-; GFX90A-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; GFX90A-NEXT:    s_mov_b64 exec, -1
-; GFX90A-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX90A-NEXT:    buffer_store_dword a32, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GFX90A-NEXT:    s_mov_b64 exec, s[18:19]
-; GFX90A-NEXT:    v_writelane_b32 v40, s16, 2
-; GFX90A-NEXT:    ; implicit-def: $vgpr0
-; GFX90A-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX90A-NEXT:    s_addk_i32 s32, 0x400
-; GFX90A-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s20
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    v_writelane_b32 v0, s20, 0
-; GFX90A-NEXT:    s_or_saveexec_b64 s[28:29], -1
-; GFX90A-NEXT:    v_accvgpr_write_b32 a32, v0
-; GFX90A-NEXT:    s_mov_b64 exec, s[28:29]
-; GFX90A-NEXT:    s_getpc_b64 s[16:17]
-; GFX90A-NEXT:    s_add_u32 s16, s16, foo at gotpcrel32@lo+4
-; GFX90A-NEXT:    s_addc_u32 s17, s17, foo at gotpcrel32@hi+12
-; GFX90A-NEXT:    s_load_dwordx2 s[16:17], s[16:17], 0x0
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX90A-NEXT:    s_or_saveexec_b64 s[28:29], -1
-; GFX90A-NEXT:    v_accvgpr_read_b32 v0, a32
-; GFX90A-NEXT:    s_mov_b64 exec, s[28:29]
-; GFX90A-NEXT:    v_readlane_b32 s20, v0, 0
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; use s20
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX90A-NEXT:    ; kill: killed $vgpr0
-; GFX90A-NEXT:    v_readlane_b32 s4, v40, 2
-; GFX90A-NEXT:    s_xor_saveexec_b64 s[6:7], -1
-; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; GFX90A-NEXT:    s_mov_b64 exec, -1
-; GFX90A-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX90A-NEXT:    buffer_load_dword a32, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX90A-NEXT:    s_mov_b64 exec, s[6:7]
-; GFX90A-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX90A-NEXT:    s_mov_b32 s33, s4
-; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    s_setpc_b64 s[30:31]
-  %s20 = call i32 asm sideeffect "; def $0","=${s20}"()
-  call void @foo()
-  call void asm sideeffect "; use $0","${s20}"(i32 %s20)
-  ret void
-}
-
-declare void @foo()
-
-attributes #0 = { "amdgpu-num-vgpr"="41" "amdgpu-num-sgpr"="34"}

diff  --git a/llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll b/llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll
deleted file mode 100644
index f92ab5740fac1b..00000000000000
--- a/llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll
+++ /dev/null
@@ -1,140 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 --verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -O0 --verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN-O0 %s
-
-; Test whole-wave register spilling.
-
-; In this testcase, the return address registers, PC value (SGPR30_SGPR31) and the scratch SGPR used in
-; the inline asm statements should be preserved across the call. Since the test limits the VGPR numbers,
-; the PC will be spilled to the only available CSR VGPR (VGPR40) as we spill CSR SGPRs including the PC
-; directly to the physical VGPR lane to correctly generate the CFIs. The SGPR20 will get spilled to the
-; virtual VGPR lane and that would be allocated by regalloc. Since there is no free VGPR to allocate, RA
-; must spill a scratch VGPR. The writelane/readlane instructions that spill/restore SGPRs into/from VGPR
-; are whole-wave operations and hence the VGPRs involved in such operations require whole-wave spilling.
-
-define void @test() #0 {
-; GCN-LABEL: test:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s16, s33
-; GCN-NEXT:    s_mov_b32 s33, s32
-; GCN-NEXT:    s_xor_saveexec_b64 s[18:19], -1
-; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
-; GCN-NEXT:    s_mov_b64 exec, -1
-; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
-; GCN-NEXT:    s_mov_b64 exec, s[18:19]
-; GCN-NEXT:    v_writelane_b32 v40, s28, 2
-; GCN-NEXT:    v_writelane_b32 v40, s29, 3
-; GCN-NEXT:    v_writelane_b32 v40, s16, 4
-; GCN-NEXT:    ; implicit-def: $vgpr0
-; GCN-NEXT:    v_writelane_b32 v40, s30, 0
-; GCN-NEXT:    s_addk_i32 s32, 0x800
-; GCN-NEXT:    v_writelane_b32 v40, s31, 1
-; GCN-NEXT:    ;;#ASMSTART
-; GCN-NEXT:    ; def s16
-; GCN-NEXT:    ;;#ASMEND
-; GCN-NEXT:    v_writelane_b32 v0, s16, 0
-; GCN-NEXT:    s_or_saveexec_b64 s[28:29], -1
-; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GCN-NEXT:    s_mov_b64 exec, s[28:29]
-; GCN-NEXT:    s_getpc_b64 s[16:17]
-; GCN-NEXT:    s_add_u32 s16, s16, ext_func at gotpcrel32@lo+4
-; GCN-NEXT:    s_addc_u32 s17, s17, ext_func at gotpcrel32@hi+12
-; GCN-NEXT:    s_load_dwordx2 s[16:17], s[16:17], 0x0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GCN-NEXT:    s_or_saveexec_b64 s[28:29], -1
-; GCN-NEXT:    buffer_load_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GCN-NEXT:    s_mov_b64 exec, s[28:29]
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_readlane_b32 s4, v1, 0
-; GCN-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-NEXT:    global_store_dword v[0:1], v0, off
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
-; GCN-NEXT:    v_readlane_b32 s30, v40, 0
-; GCN-NEXT:    ; kill: killed $vgpr1
-; GCN-NEXT:    v_readlane_b32 s28, v40, 2
-; GCN-NEXT:    v_readlane_b32 s29, v40, 3
-; GCN-NEXT:    v_readlane_b32 s4, v40, 4
-; GCN-NEXT:    s_xor_saveexec_b64 s[6:7], -1
-; GCN-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v1, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
-; GCN-NEXT:    s_mov_b64 exec, -1
-; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
-; GCN-NEXT:    s_mov_b64 exec, s[6:7]
-; GCN-NEXT:    s_addk_i32 s32, 0xf800
-; GCN-NEXT:    s_mov_b32 s33, s4
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GCN-O0-LABEL: test:
-; GCN-O0:       ; %bb.0:
-; GCN-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-O0-NEXT:    s_mov_b32 s16, s33
-; GCN-O0-NEXT:    s_mov_b32 s33, s32
-; GCN-O0-NEXT:    s_xor_saveexec_b64 s[18:19], -1
-; GCN-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; GCN-O0-NEXT:    s_mov_b64 exec, -1
-; GCN-O0-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
-; GCN-O0-NEXT:    s_mov_b64 exec, s[18:19]
-; GCN-O0-NEXT:    v_writelane_b32 v40, s28, 2
-; GCN-O0-NEXT:    v_writelane_b32 v40, s29, 3
-; GCN-O0-NEXT:    v_writelane_b32 v40, s16, 4
-; GCN-O0-NEXT:    s_add_i32 s32, s32, 0x400
-; GCN-O0-NEXT:    ; implicit-def: $vgpr0
-; GCN-O0-NEXT:    v_writelane_b32 v40, s30, 0
-; GCN-O0-NEXT:    v_writelane_b32 v40, s31, 1
-; GCN-O0-NEXT:    ;;#ASMSTART
-; GCN-O0-NEXT:    ; def s16
-; GCN-O0-NEXT:    ;;#ASMEND
-; GCN-O0-NEXT:    v_writelane_b32 v0, s16, 0
-; GCN-O0-NEXT:    s_or_saveexec_b64 s[28:29], -1
-; GCN-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GCN-O0-NEXT:    s_mov_b64 exec, s[28:29]
-; GCN-O0-NEXT:    s_getpc_b64 s[16:17]
-; GCN-O0-NEXT:    s_add_u32 s16, s16, ext_func at gotpcrel32@lo+4
-; GCN-O0-NEXT:    s_addc_u32 s17, s17, ext_func at gotpcrel32@hi+12
-; GCN-O0-NEXT:    s_load_dwordx2 s[16:17], s[16:17], 0x0
-; GCN-O0-NEXT:    s_mov_b64 s[22:23], s[2:3]
-; GCN-O0-NEXT:    s_mov_b64 s[20:21], s[0:1]
-; GCN-O0-NEXT:    s_mov_b64 s[0:1], s[20:21]
-; GCN-O0-NEXT:    s_mov_b64 s[2:3], s[22:23]
-; GCN-O0-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-O0-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GCN-O0-NEXT:    s_or_saveexec_b64 s[28:29], -1
-; GCN-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    s_mov_b64 exec, s[28:29]
-; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
-; GCN-O0-NEXT:    v_readlane_b32 s4, v0, 0
-; GCN-O0-NEXT:    ; implicit-def: $sgpr6_sgpr7
-; GCN-O0-NEXT:    v_mov_b32_e32 v1, s6
-; GCN-O0-NEXT:    v_mov_b32_e32 v2, s7
-; GCN-O0-NEXT:    v_mov_b32_e32 v3, s4
-; GCN-O0-NEXT:    global_store_dword v[1:2], v3, off
-; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
-; GCN-O0-NEXT:    v_readlane_b32 s31, v40, 1
-; GCN-O0-NEXT:    v_readlane_b32 s30, v40, 0
-; GCN-O0-NEXT:    ; kill: killed $vgpr0
-; GCN-O0-NEXT:    v_readlane_b32 s28, v40, 2
-; GCN-O0-NEXT:    v_readlane_b32 s29, v40, 3
-; GCN-O0-NEXT:    v_readlane_b32 s4, v40, 4
-; GCN-O0-NEXT:    s_xor_saveexec_b64 s[6:7], -1
-; GCN-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    s_mov_b64 exec, -1
-; GCN-O0-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    s_mov_b64 exec, s[6:7]
-; GCN-O0-NEXT:    s_add_i32 s32, s32, 0xfffffc00
-; GCN-O0-NEXT:    s_mov_b32 s33, s4
-; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
-; GCN-O0-NEXT:    s_setpc_b64 s[30:31]
-  %sgpr = call i32 asm sideeffect "; def $0", "=s" () #0
-  call void @ext_func()
-  store volatile i32 %sgpr, ptr addrspace(1) undef
-  ret void
-}
-
-declare void @ext_func();
-
-attributes #0 = { nounwind "amdgpu-num-vgpr"="41" "amdgpu-num-sgpr"="34"}

diff  --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
index 7ce326359181b9..20098ce8bf6be2 100644
--- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
@@ -141,15 +141,10 @@ define amdgpu_gfx void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg)
 ; GFX9-O0:       ; %bb.0: ; %entry
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-O0-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX9-O0-NEXT:    ; implicit-def: $vgpr3
-; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v0
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[46:47], -1
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[46:47]
 ; GFX9-O0-NEXT:    s_mov_b32 s40, s6
 ; GFX9-O0-NEXT:    s_mov_b32 s34, s4
 ; GFX9-O0-NEXT:    ; kill: def $sgpr40 killed $sgpr40 def $sgpr40_sgpr41
@@ -164,18 +159,17 @@ define amdgpu_gfx void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg)
 ; GFX9-O0-NEXT:    s_mov_b32 s37, s44
 ; GFX9-O0-NEXT:    s_mov_b32 s38, s43
 ; GFX9-O0-NEXT:    s_mov_b32 s39, s42
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s40, 0
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s41, 1
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s34, 2
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s35, 3
+; GFX9-O0-NEXT:    v_writelane_b32 v3, s40, 0
+; GFX9-O0-NEXT:    v_writelane_b32 v3, s41, 1
+; GFX9-O0-NEXT:    v_writelane_b32 v3, s34, 2
+; GFX9-O0-NEXT:    v_writelane_b32 v3, s35, 3
 ; GFX9-O0-NEXT:    s_mov_b32 s34, 0
 ; GFX9-O0-NEXT:    s_nop 2
 ; GFX9-O0-NEXT:    buffer_load_dwordx2 v[4:5], off, s[36:39], s34
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr36_sgpr37
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
@@ -188,24 +182,21 @@ define amdgpu_gfx void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg)
 ; GFX9-O0-NEXT:    v_add_u32_e64 v1, v1, v2
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v1
-; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_cmp_eq_u32_e64 s[36:37], v3, s34
-; GFX9-O0-NEXT:    v_mov_b32_e32 v3, s34
-; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_mov_b64 s[34:35], exec
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s34, 4
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s35, 5
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[46:47], -1
+; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_cmp_eq_u32_e64 s[36:37], v0, s34
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, s34
 ; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[46:47]
+; GFX9-O0-NEXT:    s_mov_b64 s[34:35], exec
+; GFX9-O0-NEXT:    v_writelane_b32 v3, s34, 4
+; GFX9-O0-NEXT:    v_writelane_b32 v3, s35, 5
 ; GFX9-O0-NEXT:    s_and_b64 s[34:35], s[34:35], s[36:37]
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O0-NEXT:    s_cbranch_execz .LBB1_2
 ; GFX9-O0-NEXT:  ; %bb.1: ; %if
-; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v4
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v5
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[34:35]
@@ -218,28 +209,25 @@ define amdgpu_gfx void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg)
 ; GFX9-O0-NEXT:    v_add_u32_e64 v1, v2, v1
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v1
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:  .LBB1_2: ; %merge
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[46:47], -1
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[46:47]
-; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-O0-NEXT:    v_readlane_b32 s36, v0, 4
-; GFX9-O0-NEXT:    v_readlane_b32 s37, v0, 5
+; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_readlane_b32 s36, v3, 4
+; GFX9-O0-NEXT:    v_readlane_b32 s37, v3, 5
 ; GFX9-O0-NEXT:    s_or_b64 exec, exec, s[36:37]
-; GFX9-O0-NEXT:    v_readlane_b32 s38, v0, 0
-; GFX9-O0-NEXT:    v_readlane_b32 s39, v0, 1
-; GFX9-O0-NEXT:    v_readlane_b32 s34, v0, 2
-; GFX9-O0-NEXT:    v_readlane_b32 s35, v0, 3
+; GFX9-O0-NEXT:    v_readlane_b32 s38, v3, 0
+; GFX9-O0-NEXT:    v_readlane_b32 s39, v3, 1
+; GFX9-O0-NEXT:    v_readlane_b32 s34, v3, 2
+; GFX9-O0-NEXT:    v_readlane_b32 s35, v3, 3
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_cmp_eq_u32_e64 s[36:37], v3, v4
-; GFX9-O0-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[36:37]
+; GFX9-O0-NEXT:    v_cmp_eq_u32_e64 s[36:37], v0, v4
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[36:37]
 ; GFX9-O0-NEXT:    s_mov_b32 s36, 1
-; GFX9-O0-NEXT:    v_lshlrev_b32_e64 v3, s36, v3
+; GFX9-O0-NEXT:    v_lshlrev_b32_e64 v0, s36, v0
 ; GFX9-O0-NEXT:    s_mov_b32 s36, 2
-; GFX9-O0-NEXT:    v_and_b32_e64 v3, v3, s36
+; GFX9-O0-NEXT:    v_and_b32_e64 v0, v0, s36
 ; GFX9-O0-NEXT:    s_mov_b32 s40, s35
 ; GFX9-O0-NEXT:    s_mov_b32 s36, s34
 ; GFX9-O0-NEXT:    s_mov_b32 s34, s39
@@ -249,12 +237,11 @@ define amdgpu_gfx void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg)
 ; GFX9-O0-NEXT:    s_mov_b32 s38, s35
 ; GFX9-O0-NEXT:    s_mov_b32 s39, s34
 ; GFX9-O0-NEXT:    s_mov_b32 s34, 0
-; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[36:39], s34 offset:4
-; GFX9-O0-NEXT:    ; kill: killed $vgpr0
+; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[36:39], s34 offset:4
 ; GFX9-O0-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    s_setpc_b64 s[30:31]
@@ -559,28 +546,25 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
 ; GFX9-O0-LABEL: strict_wwm_call_i64:
 ; GFX9-O0:       ; %bb.0:
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-O0-NEXT:    s_mov_b32 s48, s33
+; GFX9-O0-NEXT:    s_mov_b32 s46, s33
 ; GFX9-O0-NEXT:    s_mov_b32 s33, s32
 ; GFX9-O0-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX9-O0-NEXT:    s_add_i32 s32, s32, 0x1000
-; GFX9-O0-NEXT:    ; implicit-def: $vgpr0
+; GFX9-O0-NEXT:    s_add_i32 s32, s32, 0xc00
 ; GFX9-O0-NEXT:    v_writelane_b32 v10, s30, 0
 ; GFX9-O0-NEXT:    v_writelane_b32 v10, s31, 1
 ; GFX9-O0-NEXT:    s_mov_b32 s34, s8
@@ -598,10 +582,10 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
 ; GFX9-O0-NEXT:    s_mov_b32 s41, s45
 ; GFX9-O0-NEXT:    s_mov_b32 s42, s44
 ; GFX9-O0-NEXT:    s_mov_b32 s43, s35
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s40, 0
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s41, 1
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s42, 2
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s43, 3
+; GFX9-O0-NEXT:    v_writelane_b32 v10, s40, 2
+; GFX9-O0-NEXT:    v_writelane_b32 v10, s41, 3
+; GFX9-O0-NEXT:    v_writelane_b32 v10, s42, 4
+; GFX9-O0-NEXT:    v_writelane_b32 v10, s43, 5
 ; GFX9-O0-NEXT:    ; kill: def $sgpr34 killed $sgpr34 def $sgpr34_sgpr35
 ; GFX9-O0-NEXT:    s_mov_b32 s35, s9
 ; GFX9-O0-NEXT:    ; kill: def $sgpr36_sgpr37 killed $sgpr34_sgpr35
@@ -613,11 +597,8 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v9, s37
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s34, 4
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s35, 5
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[46:47], -1
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[46:47]
+; GFX9-O0-NEXT:    v_writelane_b32 v10, s34, 6
+; GFX9-O0-NEXT:    v_writelane_b32 v10, s35, 7
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v8
 ; GFX9-O0-NEXT:    s_mov_b32 s34, 32
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr36_sgpr37
@@ -634,20 +615,13 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX9-O0-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-O0-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[46:47], -1
-; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[46:47]
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_readlane_b32 s34, v6, 4
-; GFX9-O0-NEXT:    v_readlane_b32 s35, v6, 5
-; GFX9-O0-NEXT:    v_readlane_b32 s36, v6, 0
-; GFX9-O0-NEXT:    v_readlane_b32 s37, v6, 1
-; GFX9-O0-NEXT:    v_readlane_b32 s38, v6, 2
-; GFX9-O0-NEXT:    v_readlane_b32 s39, v6, 3
+; GFX9-O0-NEXT:    v_readlane_b32 s34, v10, 6
+; GFX9-O0-NEXT:    v_readlane_b32 s35, v10, 7
+; GFX9-O0-NEXT:    v_readlane_b32 s36, v10, 2
+; GFX9-O0-NEXT:    v_readlane_b32 s37, v10, 3
+; GFX9-O0-NEXT:    v_readlane_b32 s38, v10, 4
+; GFX9-O0-NEXT:    v_readlane_b32 s39, v10, 5
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v0
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[46:47], -1
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[46:47]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr40
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr40
@@ -656,30 +630,27 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
 ; GFX9-O0-NEXT:    v_add_co_u32_e64 v2, s[40:41], v2, v4
 ; GFX9-O0-NEXT:    v_addc_co_u32_e64 v3, s[40:41], v3, v5, s[40:41]
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v2
-; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v3
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX9-O0-NEXT:    s_mov_b32 s34, 0
-; GFX9-O0-NEXT:    buffer_store_dwordx2 v[6:7], off, s[36:39], s34 offset:4
+; GFX9-O0-NEXT:    buffer_store_dwordx2 v[0:1], off, s[36:39], s34 offset:4
 ; GFX9-O0-NEXT:    v_readlane_b32 s31, v10, 1
 ; GFX9-O0-NEXT:    v_readlane_b32 s30, v10, 0
-; GFX9-O0-NEXT:    ; kill: killed $vgpr0
 ; GFX9-O0-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX9-O0-NEXT:    s_add_i32 s32, s32, 0xfffff000
-; GFX9-O0-NEXT:    s_mov_b32 s33, s48
+; GFX9-O0-NEXT:    s_add_i32 s32, s32, 0xfffff400
+; GFX9-O0-NEXT:    s_mov_b32 s33, s46
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -938,7 +909,7 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_mov_b64 exec, -1
-; GFX9-O0-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O0-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
@@ -951,35 +922,35 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt
 ; GFX9-O0-NEXT:    v_writelane_b32 v42, s65, 1
 ; GFX9-O0-NEXT:    v_writelane_b32 v42, s66, 2
 ; GFX9-O0-NEXT:    v_writelane_b32 v42, s67, 3
-; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:20
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:16
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:12
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:8
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:4
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr34
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr34
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr34
@@ -989,145 +960,145 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v43, s5
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s6
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s8
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s8
 ; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s10
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s9
 ; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s11
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s10
 ; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s12
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s11
 ; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s13
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s12
 ; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s14
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s13
 ; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s15
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s14
 ; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s16
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s15
 ; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s17
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s16
 ; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s18
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s17
 ; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s19
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s18
 ; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s20
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s19
 ; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s21
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s20
 ; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s22
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s21
 ; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s23
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s22
 ; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s24
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s23
 ; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s24
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v47, s25
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v46, s26
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v45, s27
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v44, s28
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s29
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v43
-; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v43
-; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v43
-; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v43
-; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v43
-; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v43
-; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v43
-; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v43
-; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v43
-; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v43
-; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v43
-; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v12, v43
-; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v13, v43
-; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v14, v43
-; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v15, v43
-; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v16, v43
-; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v17, v43
-; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v18, v43
-; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v19, v43
-; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v20, v43
-; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v21, v47
-; GFX9-O0-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v22, v46
-; GFX9-O0-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v23, v45
-; GFX9-O0-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v24, v44
-; GFX9-O0-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v25, v43
-; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v26, v43
-; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v27, v47
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v28, v46
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v29, v45
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v30, v44
 ; GFX9-O0-NEXT:    ; kill: def $vgpr31 killed $vgpr43 killed $exec
-; GFX9-O0-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v30, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v30, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr34
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr34
 ; GFX9-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
@@ -1256,7 +1227,7 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt
 ; GFX9-O0-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 exec, -1
-; GFX9-O0-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    s_setpc_b64 s[30:31]

diff  --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
index 5bcaee3e0bf512..667002509c04b8 100644
--- a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
+++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
@@ -125,8 +125,7 @@ define amdgpu_kernel void @call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) {
 ; GFX9-O0: v_add_u32_e64 v3, v3, v6
   %tmp136 = add i32 %tmp134, %tmp107
   %tmp137 = tail call i32 @llvm.amdgcn.wwm.i32(i32 %tmp136)
-; GFX9-O0: buffer_store_dword v1
-; GFX9-O3: buffer_store_dword v0
+; GFX9: buffer_store_dword v0
   call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 %tmp137, ptr addrspace(8) %tmp14, i32 4, i32 0, i32 0)
   ret void
 }
@@ -321,8 +320,7 @@ define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 in
 ; GFX9-O0: v_add_u32_e64 v3, v3, v6
   %tmp136 = add i32 %tmp134, %tmp107
   %tmp137 = tail call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp136)
-; GFX9-O0: buffer_store_dword v1
-; GFX9-O3: buffer_store_dword v0
+; GFX9: buffer_store_dword v0
   call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 %tmp137, ptr addrspace(8) %tmp14, i32 4, i32 0, i32 0)
   ret void
 }
@@ -408,4 +406,4 @@ declare void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32>, ptr addrspace(8)
 declare void @llvm.amdgcn.raw.ptr.buffer.store.v2f32(<2 x float>, ptr addrspace(8), i32, i32, i32)
 declare void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float>, ptr addrspace(8), i32, i32, i32)
 declare <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32>, i32, i32)
-declare <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32>, i32, i32)
\ No newline at end of file
+declare <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32>, i32, i32)

diff  --git a/llvm/test/CodeGen/MIR/AMDGPU/stack-id-assert.mir b/llvm/test/CodeGen/MIR/AMDGPU/stack-id-assert.mir
index 240c60e72db21b..156891fef3625c 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/stack-id-assert.mir
+++ b/llvm/test/CodeGen/MIR/AMDGPU/stack-id-assert.mir
@@ -3,7 +3,7 @@
 # contains not dead objects only. So using objects IDs as offset in the storage
 # caused out of bounds access.
 
-# RUN: llc -march=amdgcn -start-before=si-lower-sgpr-spills -stop-after=prologepilog -verify-machineinstrs -o - %s | FileCheck %s
+# RUN: llc -march=amdgcn -run-pass=si-lower-sgpr-spills,prologepilog -verify-machineinstrs -o - %s | FileCheck %s
 
 # CHECK-LABEL: name:            foo
 # CHECK: {{^}}fixedStack:      []

diff  --git a/llvm/test/CodeGen/Mips/madd-msub.ll b/llvm/test/CodeGen/Mips/madd-msub.ll
index 9f7145685ed311..3a6fd68e9df2e2 100644
--- a/llvm/test/CodeGen/Mips/madd-msub.ll
+++ b/llvm/test/CodeGen/Mips/madd-msub.ll
@@ -42,22 +42,22 @@ define i64 @madd1(i32 %a, i32 %b, i32 %c) nounwind readnone {
 ;
 ; 64-LABEL: madd1:
 ; 64:       # %bb.0: # %entry
-; 64-NEXT:    sll $4, $4, 0
-; 64-NEXT:    sll $5, $5, 0
-; 64-NEXT:    dmult $5, $4
+; 64-NEXT:    sll $1, $4, 0
+; 64-NEXT:    sll $2, $5, 0
+; 64-NEXT:    dmult $2, $1
 ; 64-NEXT:    mflo $1
-; 64-NEXT:    sll $6, $6, 0
+; 64-NEXT:    sll $2, $6, 0
 ; 64-NEXT:    jr $ra
-; 64-NEXT:    daddu $2, $1, $6
+; 64-NEXT:    daddu $2, $1, $2
 ;
 ; 64R6-LABEL: madd1:
 ; 64R6:       # %bb.0: # %entry
-; 64R6-NEXT:    sll $4, $4, 0
-; 64R6-NEXT:    sll $5, $5, 0
-; 64R6-NEXT:    dmul $1, $5, $4
-; 64R6-NEXT:    sll $6, $6, 0
+; 64R6-NEXT:    sll $1, $4, 0
+; 64R6-NEXT:    sll $2, $5, 0
+; 64R6-NEXT:    dmul $1, $2, $1
+; 64R6-NEXT:    sll $2, $6, 0
 ; 64R6-NEXT:    jr $ra
-; 64R6-NEXT:    daddu $2, $1, $6
+; 64R6-NEXT:    daddu $2, $1, $2
 ;
 ; 16-LABEL: madd1:
 ; 16:       # %bb.0: # %entry
@@ -173,18 +173,18 @@ define i64 @madd3(i32 %a, i32 %b, i64 %c) nounwind readnone {
 ;
 ; 64-LABEL: madd3:
 ; 64:       # %bb.0: # %entry
-; 64-NEXT:    sll $4, $4, 0
-; 64-NEXT:    sll $5, $5, 0
-; 64-NEXT:    dmult $5, $4
+; 64-NEXT:    sll $1, $4, 0
+; 64-NEXT:    sll $2, $5, 0
+; 64-NEXT:    dmult $2, $1
 ; 64-NEXT:    mflo $1
 ; 64-NEXT:    jr $ra
 ; 64-NEXT:    daddu $2, $1, $6
 ;
 ; 64R6-LABEL: madd3:
 ; 64R6:       # %bb.0: # %entry
-; 64R6-NEXT:    sll $4, $4, 0
-; 64R6-NEXT:    sll $5, $5, 0
-; 64R6-NEXT:    dmul $1, $5, $4
+; 64R6-NEXT:    sll $1, $4, 0
+; 64R6-NEXT:    sll $2, $5, 0
+; 64R6-NEXT:    dmul $1, $2, $1
 ; 64R6-NEXT:    jr $ra
 ; 64R6-NEXT:    daddu $2, $1, $6
 ;
@@ -291,22 +291,22 @@ define i64 @msub1(i32 %a, i32 %b, i32 %c) nounwind readnone {
 ;
 ; 64-LABEL: msub1:
 ; 64:       # %bb.0: # %entry
-; 64-NEXT:    sll $4, $4, 0
-; 64-NEXT:    sll $5, $5, 0
-; 64-NEXT:    dmult $5, $4
+; 64-NEXT:    sll $1, $4, 0
+; 64-NEXT:    sll $2, $5, 0
+; 64-NEXT:    dmult $2, $1
 ; 64-NEXT:    mflo $1
-; 64-NEXT:    sll $6, $6, 0
+; 64-NEXT:    sll $2, $6, 0
 ; 64-NEXT:    jr $ra
-; 64-NEXT:    dsubu $2, $6, $1
+; 64-NEXT:    dsubu $2, $2, $1
 ;
 ; 64R6-LABEL: msub1:
 ; 64R6:       # %bb.0: # %entry
-; 64R6-NEXT:    sll $4, $4, 0
-; 64R6-NEXT:    sll $5, $5, 0
-; 64R6-NEXT:    dmul $1, $5, $4
-; 64R6-NEXT:    sll $6, $6, 0
+; 64R6-NEXT:    sll $1, $4, 0
+; 64R6-NEXT:    sll $2, $5, 0
+; 64R6-NEXT:    dmul $1, $2, $1
+; 64R6-NEXT:    sll $2, $6, 0
 ; 64R6-NEXT:    jr $ra
-; 64R6-NEXT:    dsubu $2, $6, $1
+; 64R6-NEXT:    dsubu $2, $2, $1
 ;
 ; 16-LABEL: msub1:
 ; 16:       # %bb.0: # %entry
@@ -424,18 +424,18 @@ define i64 @msub3(i32 %a, i32 %b, i64 %c) nounwind readnone {
 ;
 ; 64-LABEL: msub3:
 ; 64:       # %bb.0: # %entry
-; 64-NEXT:    sll $4, $4, 0
-; 64-NEXT:    sll $5, $5, 0
-; 64-NEXT:    dmult $5, $4
+; 64-NEXT:    sll $1, $4, 0
+; 64-NEXT:    sll $2, $5, 0
+; 64-NEXT:    dmult $2, $1
 ; 64-NEXT:    mflo $1
 ; 64-NEXT:    jr $ra
 ; 64-NEXT:    dsubu $2, $6, $1
 ;
 ; 64R6-LABEL: msub3:
 ; 64R6:       # %bb.0: # %entry
-; 64R6-NEXT:    sll $4, $4, 0
-; 64R6-NEXT:    sll $5, $5, 0
-; 64R6-NEXT:    dmul $1, $5, $4
+; 64R6-NEXT:    sll $1, $4, 0
+; 64R6-NEXT:    sll $2, $5, 0
+; 64R6-NEXT:    dmul $1, $2, $1
 ; 64R6-NEXT:    jr $ra
 ; 64R6-NEXT:    dsubu $2, $6, $1
 ;
@@ -546,22 +546,22 @@ define i64 @msub5(i32 %a, i32 %b, i32 %c) {
 ;
 ; 64-LABEL: msub5:
 ; 64:       # %bb.0: # %entry
-; 64-NEXT:    sll $4, $4, 0
-; 64-NEXT:    sll $5, $5, 0
-; 64-NEXT:    dmult $5, $4
+; 64-NEXT:    sll $1, $4, 0
+; 64-NEXT:    sll $2, $5, 0
+; 64-NEXT:    dmult $2, $1
 ; 64-NEXT:    mflo $1
-; 64-NEXT:    sll $6, $6, 0
+; 64-NEXT:    sll $2, $6, 0
 ; 64-NEXT:    jr $ra
-; 64-NEXT:    dsubu $2, $1, $6
+; 64-NEXT:    dsubu $2, $1, $2
 ;
 ; 64R6-LABEL: msub5:
 ; 64R6:       # %bb.0: # %entry
-; 64R6-NEXT:    sll $4, $4, 0
-; 64R6-NEXT:    sll $5, $5, 0
-; 64R6-NEXT:    dmul $1, $5, $4
-; 64R6-NEXT:    sll $6, $6, 0
+; 64R6-NEXT:    sll $1, $4, 0
+; 64R6-NEXT:    sll $2, $5, 0
+; 64R6-NEXT:    dmul $1, $2, $1
+; 64R6-NEXT:    sll $2, $6, 0
 ; 64R6-NEXT:    jr $ra
-; 64R6-NEXT:    dsubu $2, $1, $6
+; 64R6-NEXT:    dsubu $2, $1, $2
 ;
 ; 16-LABEL: msub5:
 ; 16:       # %bb.0: # %entry

diff  --git a/llvm/test/CodeGen/X86/GlobalISel/add-ext.ll b/llvm/test/CodeGen/X86/GlobalISel/add-ext.ll
index 0b8811f96bc804..9beff7b7c2dfa6 100644
--- a/llvm/test/CodeGen/X86/GlobalISel/add-ext.ll
+++ b/llvm/test/CodeGen/X86/GlobalISel/add-ext.ll
@@ -205,8 +205,8 @@ define void @PR20134_zext(ptr %a, i32 %i) {
 ; CHECK-NEXT:    addq %rdi, %rcx
 ; CHECK-NEXT:    movl (%rcx), %ecx
 ; CHECK-NEXT:    addl (%rax), %ecx
-; CHECK-NEXT:    movl %esi, %esi
-; CHECK-NEXT:    imulq $4, %rsi, %rax
+; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    imulq $4, %rax, %rax
 ; CHECK-NEXT:    addq %rdi, %rax
 ; CHECK-NEXT:    movl %ecx, (%rax)
 ; CHECK-NEXT:    retq

diff  --git a/llvm/test/CodeGen/X86/dagcombine-cse.ll b/llvm/test/CodeGen/X86/dagcombine-cse.ll
index 6d817d89874d1f..222a4d78668b6e 100644
--- a/llvm/test/CodeGen/X86/dagcombine-cse.ll
+++ b/llvm/test/CodeGen/X86/dagcombine-cse.ll
@@ -106,24 +106,24 @@ define i96 @square_high(i96 %x) nounwind {
 ;
 ; X64-LABEL: square_high:
 ; X64:       ## %bb.0: ## %entry
-; X64-NEXT:    movl %esi, %esi
-; X64-NEXT:    movq %rsi, %rax
+; X64-NEXT:    movl %esi, %ecx
+; X64-NEXT:    movq %rcx, %rax
 ; X64-NEXT:    mulq %rdi
-; X64-NEXT:    movq %rdx, %rcx
+; X64-NEXT:    movq %rdx, %rsi
 ; X64-NEXT:    movq %rax, %r8
 ; X64-NEXT:    movq %rdi, %rax
 ; X64-NEXT:    mulq %rdi
 ; X64-NEXT:    addq %r8, %rdx
-; X64-NEXT:    movq %rcx, %rax
+; X64-NEXT:    movq %rsi, %rax
 ; X64-NEXT:    adcq $0, %rax
 ; X64-NEXT:    addq %rdx, %r8
-; X64-NEXT:    adcq %rcx, %rax
-; X64-NEXT:    imulq %rsi, %rsi
-; X64-NEXT:    addq %rax, %rsi
-; X64-NEXT:    shrdq $32, %rsi, %r8
-; X64-NEXT:    shrq $32, %rsi
+; X64-NEXT:    adcq %rsi, %rax
+; X64-NEXT:    imulq %rcx, %rcx
+; X64-NEXT:    addq %rax, %rcx
+; X64-NEXT:    shrdq $32, %rcx, %r8
+; X64-NEXT:    shrq $32, %rcx
 ; X64-NEXT:    movq %r8, %rax
-; X64-NEXT:    movq %rsi, %rdx
+; X64-NEXT:    movq %rcx, %rdx
 ; X64-NEXT:    retq
 entry:
   %conv = zext i96 %x to i192

diff  --git a/llvm/test/CodeGen/X86/fold-and-shift-x86_64.ll b/llvm/test/CodeGen/X86/fold-and-shift-x86_64.ll
index 9a842a1e58d272..62abc4e035f4ac 100644
--- a/llvm/test/CodeGen/X86/fold-and-shift-x86_64.ll
+++ b/llvm/test/CodeGen/X86/fold-and-shift-x86_64.ll
@@ -34,8 +34,8 @@ entry:
 define i8 @t3(ptr %X, i64 %i) {
 ; CHECK-LABEL: t3:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    movl %esi, %esi
-; CHECK-NEXT:    movzbl (%rdi,%rsi,4), %eax
+; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    movzbl (%rdi,%rax,4), %eax
 ; CHECK-NEXT:    retq
 
 entry:

diff  --git a/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-lowhigh.ll b/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-lowhigh.ll
index d41be734aa2d2b..eb6accd3e623b0 100644
--- a/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-lowhigh.ll
+++ b/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-lowhigh.ll
@@ -78,18 +78,18 @@ define i32 @out32_constmask(i32 %x, i32 %y) {
 define i64 @out64_constmask(i64 %x, i64 %y) {
 ; CHECK-NOBMI-LABEL: out64_constmask:
 ; CHECK-NOBMI:       # %bb.0:
-; CHECK-NOBMI-NEXT:    movl %edi, %edi
+; CHECK-NOBMI-NEXT:    movl %edi, %ecx
 ; CHECK-NOBMI-NEXT:    movabsq $-4294967296, %rax # imm = 0xFFFFFFFF00000000
 ; CHECK-NOBMI-NEXT:    andq %rsi, %rax
-; CHECK-NOBMI-NEXT:    orq %rdi, %rax
+; CHECK-NOBMI-NEXT:    orq %rcx, %rax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: out64_constmask:
 ; CHECK-BMI:       # %bb.0:
-; CHECK-BMI-NEXT:    movl %edi, %edi
+; CHECK-BMI-NEXT:    movl %edi, %ecx
 ; CHECK-BMI-NEXT:    movabsq $-4294967296, %rax # imm = 0xFFFFFFFF00000000
 ; CHECK-BMI-NEXT:    andq %rsi, %rax
-; CHECK-BMI-NEXT:    orq %rdi, %rax
+; CHECK-BMI-NEXT:    orq %rcx, %rax
 ; CHECK-BMI-NEXT:    retq
   %mx = and i64 %x, 4294967295
   %my = and i64 %y, -4294967296


        


More information about the llvm-branch-commits mailing list