[llvm] ad2c66e - [AMDGPU] Optimize VGPR LiveRange in waterfall loops

Sebastian Neubauer via llvm-commits llvm-commits at lists.llvm.org
Tue Jul 13 03:15:27 PDT 2021


Author: Sebastian Neubauer
Date: 2021-07-13T12:15:08+02:00
New Revision: ad2c66ec5d4bb0425625155bba966732ef85e6e5

URL: https://github.com/llvm/llvm-project/commit/ad2c66ec5d4bb0425625155bba966732ef85e6e5
DIFF: https://github.com/llvm/llvm-project/commit/ad2c66ec5d4bb0425625155bba966732ef85e6e5.diff

LOG: [AMDGPU] Optimize VGPR LiveRange in waterfall loops

Waterfall loops are executed exactly once per active lane, so VGPRs used
inside them do not need to be preserved across iterations. Extend the
SIOptimizeVGPRLiveRange pass to insert phi nodes that take undef on the
incoming edge from the loop.
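
As a rough LLVM IR sketch of the idea (a hypothetical function; the pass
really operates on Machine IR, where the waterfall loop only exists after
call lowering and structurization):

  define void @waterfall_sketch(i32 %a, void(i32)* %fptr, i1 %done) {
  entry:
    br label %loop
  loop:
    ; The back-edge input is undef, so the register allocator does not
    ; keep %a alive from one iteration to the next.
    %a.cur = phi i32 [ %a, %entry ], [ undef, %loop ]
    call void %fptr(i32 %a.cur)
    br i1 %done, label %exit, label %loop
  exit:
    ret void
  }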

One shortcoming remains: return values of a function call inside the
loop are copied, because their live range conflicts with the live range
of the arguments, even though the arguments are only IMPLICIT_DEF after
the phi insertion.
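
The new test test_indirect_call_vgpr_ptr_arg_and_return in
llvm/test/CodeGen/AMDGPU/indirect-call.ll (part of this diff) pins the
shortcoming down:

  define i32 @test_indirect_call_vgpr_ptr_arg_and_return(i32 %i, i32(i32)* %fptr) {
    %ret = call amdgpu_gfx i32 %fptr(i32 %i)
    ret i32 %ret
  }

The return value's live range clashes with the IMPLICIT_DEF of the
argument in v0, so the generated loop still copies the result
(v_mov_b32_e32 v3, v0) on every iteration.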

Differential Revision: https://reviews.llvm.org/D105192

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp
    llvm/test/CodeGen/AMDGPU/image-sample-waterfall.ll
    llvm/test/CodeGen/AMDGPU/indirect-call.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll
    llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp
index acfee63d203a..e357411d74b6 100644
--- a/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp
+++ b/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp
@@ -7,10 +7,10 @@
 //===----------------------------------------------------------------------===//
 //
 /// \file
-/// This pass tries to remove unnecessary VGPR live range in divergent if-else
-/// structure.
+/// This pass tries to remove unnecessary VGPR live ranges in divergent if-else
+/// structures and waterfall loops.
 ///
-/// When we do structurization, we usually transform a if-else into two
+/// When we do structurization, we usually transform an if-else into two
 /// sucessive if-then (with a flow block to do predicate inversion). Consider a
 /// simple case after structurization: A divergent value %a was defined before
 /// if-else and used in both THEN (use in THEN is optional) and ELSE part:
@@ -29,10 +29,10 @@
 ///
 ///  As register allocator has no idea of the thread-control-flow, it will just
 ///  assume %a would be alive in the whole range of bb.then because of a later
-///  use in bb.else. On AMDGPU architecture, the VGPR was accessed with respect
+///  use in bb.else. On AMDGPU architecture, the VGPR is accessed with respect
 ///  to exec mask. For this if-else case, the lanes active in bb.then will be
-///  inactive in bb.else, and vice-verse. So we are safe to say that %a was dead
-///  after the last use in bb.then untill the end of the block. The reason is
+///  inactive in bb.else, and vice-versa. So we are safe to say that %a was dead
+///  after the last use in bb.then until the end of the block. The reason is
 ///  the instructions in bb.then will only overwrite lanes that will never be
 ///  accessed in bb.else.
 ///
@@ -46,6 +46,28 @@
 ///      sure the second loop iteration still get correct data.
 ///  2.) There should be no further uses after the IF-ELSE region.
 ///
+///
+/// Waterfall loops get inserted around instructions that use divergent values
+/// but can only be executed with a uniform value. For example an indirect call
+/// to a divergent address:
+///    bb.start:
+///      %a = ...
+///      %fun = ...
+///      ...
+///    bb.loop:
+///      call %fun (%a)
+///      ... // %a can be dead here
+///      loop %bb.loop
+///
+///  The loop block is executed multiple times, but it is run exactly once for
+///  each active lane. Similar to the if-else case, the register allocator
+///  assumes that %a is live throughout the loop as it is used again in the next
+///  iteration. If %a is a VGPR that is unused after the loop, it does not need
+///  to be live after its last use in the loop block. By inserting a phi-node at
+///  the start of bb.loop that is undef when coming from bb.loop, the register
+///  allocation knows that the value of %a does not need to be preserved through
+///  iterations of the loop.
+///
 //
 //===----------------------------------------------------------------------===//
 
@@ -89,6 +111,10 @@ class SIOptimizeVGPRLiveRange : public MachineFunctionPass {
                             SmallSetVector<MachineBasicBlock *, 16> &ElseBlocks,
                             SmallVectorImpl<Register> &CandidateRegs) const;
 
+  void collectWaterfallCandidateRegisters(
+      MachineBasicBlock *Loop,
+      SmallSetVector<Register, 16> &CandidateRegs) const;
+
   void findNonPHIUsesInBlock(Register Reg, MachineBasicBlock *MBB,
                              SmallVectorImpl<MachineInstr *> &Uses) const;
 
@@ -105,6 +131,8 @@ class SIOptimizeVGPRLiveRange : public MachineFunctionPass {
                     MachineBasicBlock *Flow, MachineBasicBlock *Endif,
                     SmallSetVector<MachineBasicBlock *, 16> &ElseBlocks) const;
 
+  void optimizeWaterfallLiveRange(Register Reg, MachineBasicBlock *If) const;
+
   SIOptimizeVGPRLiveRange() : MachineFunctionPass(ID) {}
 
   bool runOnMachineFunction(MachineFunction &MF) override;
@@ -278,6 +306,54 @@ void SIOptimizeVGPRLiveRange::collectCandidateRegisters(
   }
 }
 
+/// Collect the registers used in the waterfall loop block that are defined
+/// before.
+void SIOptimizeVGPRLiveRange::collectWaterfallCandidateRegisters(
+    MachineBasicBlock *Loop,
+    SmallSetVector<Register, 16> &CandidateRegs) const {
+
+  for (auto &MI : Loop->instrs()) {
+    if (MI.isDebugInstr())
+      continue;
+
+    for (auto &MO : MI.operands()) {
+      if (!MO.isReg() || !MO.getReg() || MO.isDef())
+        continue;
+
+      Register MOReg = MO.getReg();
+      // We can only optimize AGPR/VGPR virtual register
+      if (MOReg.isPhysical() || !TRI->isVectorRegister(*MRI, MOReg))
+        continue;
+
+      if (MO.readsReg()) {
+        const MachineBasicBlock *DefMBB = MRI->getVRegDef(MOReg)->getParent();
+        // Make sure the value is defined before the LOOP block
+        if (DefMBB != Loop && !CandidateRegs.contains(MOReg)) {
+          // If the variable is used after the loop, the register coalescer will
+          // merge the newly created register and remove the phi node again.
+          // Just do nothing in that case.
+          LiveVariables::VarInfo &OldVarInfo = LV->getVarInfo(MOReg);
+          bool IsUsed = false;
+          for (auto *Succ : Loop->successors()) {
+            if (Succ != Loop && OldVarInfo.isLiveIn(*Succ, MOReg, *MRI)) {
+              IsUsed = true;
+              break;
+            }
+          }
+          if (!IsUsed) {
+            LLVM_DEBUG(dbgs() << "Found candidate reg: "
+                              << printReg(MOReg, TRI, 0, MRI) << '\n');
+            CandidateRegs.insert(MOReg);
+          } else {
+            LLVM_DEBUG(dbgs() << "Reg is used after loop, ignoring: "
+                              << printReg(MOReg, TRI, 0, MRI) << '\n');
+          }
+        }
+      }
+    }
+  }
+}
+
 // Re-calculate the liveness of \p Reg in the THEN-region
 void SIOptimizeVGPRLiveRange::updateLiveRangeInThenRegion(
     Register Reg, MachineBasicBlock *If, MachineBasicBlock *Flow) const {
@@ -403,12 +479,8 @@ void SIOptimizeVGPRLiveRange::optimizeLiveRange(
   }
 
   // Replace all uses in the ELSE region or the PHIs in ENDIF block
-  for (auto I = MRI->use_begin(Reg), E = MRI->use_end(); I != E;) {
-    MachineOperand &O = *I;
-    // This is a little bit tricky, the setReg() will update the linked list,
-    // so we have to increment the iterator before setReg() to avoid skipping
-    // some uses.
-    ++I;
+  // Use early increment range because setReg() will update the linked list.
+  for (auto &O : make_early_inc_range(MRI->use_operands(Reg))) {
     auto *UseMI = O.getParent();
     auto *UseBlock = UseMI->getParent();
     // Replace uses in Endif block
@@ -431,6 +503,53 @@ void SIOptimizeVGPRLiveRange::optimizeLiveRange(
   updateLiveRangeInThenRegion(Reg, If, Flow);
 }
 
+void SIOptimizeVGPRLiveRange::optimizeWaterfallLiveRange(
+    Register Reg, MachineBasicBlock *Loop) const {
+  // Insert a new PHI, marking the value from the last loop iteration undef.
+  LLVM_DEBUG(dbgs() << "Optimizing " << printReg(Reg, TRI) << '\n');
+  const auto *RC = MRI->getRegClass(Reg);
+  Register NewReg = MRI->createVirtualRegister(RC);
+  Register UndefReg = MRI->createVirtualRegister(RC);
+
+  // Replace all uses in the LOOP region
+  // Use early increment range because setReg() will update the linked list.
+  for (auto &O : make_early_inc_range(MRI->use_operands(Reg))) {
+    auto *UseMI = O.getParent();
+    auto *UseBlock = UseMI->getParent();
+    // Replace uses in Loop block
+    if (UseBlock == Loop)
+      O.setReg(NewReg);
+  }
+
+  MachineInstrBuilder PHI = BuildMI(*Loop, Loop->getFirstNonPHI(), DebugLoc(),
+                                    TII->get(TargetOpcode::PHI), NewReg);
+  for (auto *Pred : Loop->predecessors()) {
+    if (Pred == Loop)
+      PHI.addReg(UndefReg, RegState::Undef).addMBB(Pred);
+    else
+      PHI.addReg(Reg).addMBB(Pred);
+  }
+
+  LiveVariables::VarInfo &NewVarInfo = LV->getVarInfo(NewReg);
+  LiveVariables::VarInfo &OldVarInfo = LV->getVarInfo(Reg);
+
+  // collectWaterfallCandidateRegisters only collects registers that are dead
+  // after the loop. So we know that the old reg is not live throughout the
+  // whole block anymore.
+  OldVarInfo.AliveBlocks.reset(Loop->getNumber());
+
+  // Mark the last use as kill
+  for (auto &MI : reverse(Loop->instrs())) {
+    if (MI.readsRegister(NewReg, TRI)) {
+      MI.addRegisterKilled(NewReg, TRI);
+      NewVarInfo.Kills.push_back(&MI);
+      break;
+    }
+  }
+  assert(!NewVarInfo.Kills.empty() &&
+         "Failed to find last usage of register in loop");
+}
+
 char SIOptimizeVGPRLiveRange::ID = 0;
 
 INITIALIZE_PASS_BEGIN(SIOptimizeVGPRLiveRange, DEBUG_TYPE,
@@ -491,6 +610,16 @@ bool SIOptimizeVGPRLiveRange::runOnMachineFunction(MachineFunction &MF) {
         // Now we are safe to optimize.
         for (auto Reg : CandidateRegs)
           optimizeLiveRange(Reg, &MBB, IfTarget, Endif, ElseBlocks);
+      } else if (MI.getOpcode() == AMDGPU::SI_WATERFALL_LOOP) {
+        LLVM_DEBUG(dbgs() << "Checking Waterfall loop: "
+                          << printMBBReference(MBB) << '\n');
+
+        SmallSetVector<Register, 16> CandidateRegs;
+        collectWaterfallCandidateRegisters(&MBB, CandidateRegs);
+        MadeChange |= !CandidateRegs.empty();
+        // Now we are safe to optimize.
+        for (auto Reg : CandidateRegs)
+          optimizeWaterfallLiveRange(Reg, &MBB);
       }
     }
   }
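
For the used-after-the-loop case that collectWaterfallCandidateRegisters
rejects, a minimal example is the new test
test_indirect_call_vgpr_ptr_arg_and_reuse further down in this diff
(comments mine):

  define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, void(i32)* %fptr) {
    ; %i stays live past the waterfall loop around the call, so no phi is
    ; inserted for it -- the register coalescer would only merge the new
    ; register and delete the phi again.
    call amdgpu_gfx void %fptr(i32 %i)
    ret i32 %i
  }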

diff --git a/llvm/test/CodeGen/AMDGPU/image-sample-waterfall.ll b/llvm/test/CodeGen/AMDGPU/image-sample-waterfall.ll
index 0c727e64b2e6..f76b79b22e18 100644
--- a/llvm/test/CodeGen/AMDGPU/image-sample-waterfall.ll
+++ b/llvm/test/CodeGen/AMDGPU/image-sample-waterfall.ll
@@ -24,6 +24,8 @@ declare <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f32(i32, float, float, <
 ; GCN-NEXT: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[AND]]
 ; GCN-NEXT: s_nop 0
 ; GCN-NEXT: image_gather4 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, s{{\[}}[[SREG0]]:[[SREG7]]{{\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1
+; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+; GCN-NEXT: ; implicit-def: $vgpr8_vgpr9
 ; GCN-NEXT: s_xor_b64 exec, exec, [[SAVE]]
 ; GCN-NEXT: s_cbranch_execnz [[RSRC_LOOP]]
 define amdgpu_ps <4 x float> @water_loop_rsrc(<8 x i32> %rsrc, <4 x i32> inreg %samp, float %s, float %t) {
@@ -48,6 +50,8 @@ main_body:
 ; GCN-NEXT: s_nop 0
 
 ; GCN-NEXT: image_gather4 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, s{{\[}}[[SREG0]]:[[SREG3]]{{\]}} dmask:0x1
+; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GCN-NEXT: ; implicit-def: $vgpr4_vgpr5
 ; GCN-NEXT: s_xor_b64 exec, exec, [[SAVE]]
 ; GCN-NEXT: s_cbranch_execnz [[SAMP_LOOP]]
 define amdgpu_ps <4 x float> @water_loop_samp(<8 x i32> inreg %rsrc, <4 x i32> %samp, float %s, float %t) {

diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call.ll b/llvm/test/CodeGen/AMDGPU/indirect-call.ll
index 707557b52398..7a735b48a696 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-call.ll
@@ -202,32 +202,28 @@ define void @test_indirect_call_vgpr_ptr(void()* %fptr) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_or_saveexec_b64 s[16:17], -1
-; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 exec, s[16:17]
-; GCN-NEXT:    v_writelane_b32 v43, s33, 17
+; GCN-NEXT:    v_writelane_b32 v40, s33, 17
 ; GCN-NEXT:    s_mov_b32 s33, s32
-; GCN-NEXT:    s_addk_i32 s32, 0x800
-; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
-; GCN-NEXT:    v_writelane_b32 v43, s34, 0
-; GCN-NEXT:    v_writelane_b32 v43, s35, 1
-; GCN-NEXT:    v_writelane_b32 v43, s36, 2
-; GCN-NEXT:    v_writelane_b32 v43, s38, 3
-; GCN-NEXT:    v_writelane_b32 v43, s39, 4
-; GCN-NEXT:    v_writelane_b32 v43, s40, 5
-; GCN-NEXT:    v_writelane_b32 v43, s41, 6
-; GCN-NEXT:    v_writelane_b32 v43, s42, 7
-; GCN-NEXT:    v_writelane_b32 v43, s43, 8
-; GCN-NEXT:    v_writelane_b32 v43, s44, 9
-; GCN-NEXT:    v_writelane_b32 v43, s45, 10
-; GCN-NEXT:    v_writelane_b32 v43, s46, 11
-; GCN-NEXT:    v_writelane_b32 v43, s47, 12
-; GCN-NEXT:    v_writelane_b32 v43, s48, 13
-; GCN-NEXT:    v_writelane_b32 v43, s49, 14
-; GCN-NEXT:    v_writelane_b32 v43, s30, 15
-; GCN-NEXT:    v_writelane_b32 v43, s31, 16
-; GCN-NEXT:    v_mov_b32_e32 v40, v31
+; GCN-NEXT:    s_addk_i32 s32, 0x400
+; GCN-NEXT:    v_writelane_b32 v40, s34, 0
+; GCN-NEXT:    v_writelane_b32 v40, s35, 1
+; GCN-NEXT:    v_writelane_b32 v40, s36, 2
+; GCN-NEXT:    v_writelane_b32 v40, s38, 3
+; GCN-NEXT:    v_writelane_b32 v40, s39, 4
+; GCN-NEXT:    v_writelane_b32 v40, s40, 5
+; GCN-NEXT:    v_writelane_b32 v40, s41, 6
+; GCN-NEXT:    v_writelane_b32 v40, s42, 7
+; GCN-NEXT:    v_writelane_b32 v40, s43, 8
+; GCN-NEXT:    v_writelane_b32 v40, s44, 9
+; GCN-NEXT:    v_writelane_b32 v40, s45, 10
+; GCN-NEXT:    v_writelane_b32 v40, s46, 11
+; GCN-NEXT:    v_writelane_b32 v40, s47, 12
+; GCN-NEXT:    v_writelane_b32 v40, s48, 13
+; GCN-NEXT:    v_writelane_b32 v40, s49, 14
+; GCN-NEXT:    v_writelane_b32 v40, s30, 15
+; GCN-NEXT:    v_writelane_b32 v40, s31, 16
 ; GCN-NEXT:    s_mov_b32 s34, s14
 ; GCN-NEXT:    s_mov_b32 s35, s13
 ; GCN-NEXT:    s_mov_b32 s36, s12
@@ -235,13 +231,11 @@ define void @test_indirect_call_vgpr_ptr(void()* %fptr) {
 ; GCN-NEXT:    s_mov_b64 s[40:41], s[8:9]
 ; GCN-NEXT:    s_mov_b64 s[42:43], s[6:7]
 ; GCN-NEXT:    s_mov_b64 s[44:45], s[4:5]
-; GCN-NEXT:    v_mov_b32_e32 v42, v1
-; GCN-NEXT:    v_mov_b32_e32 v41, v0
 ; GCN-NEXT:    s_mov_b64 s[46:47], exec
 ; GCN-NEXT:  BB2_1: ; =>This Inner Loop Header: Depth=1
-; GCN-NEXT:    v_readfirstlane_b32 s16, v41
-; GCN-NEXT:    v_readfirstlane_b32 s17, v42
-; GCN-NEXT:    v_cmp_eq_u64_e32 vcc, s[16:17], v[41:42]
+; GCN-NEXT:    v_readfirstlane_b32 s16, v0
+; GCN-NEXT:    v_readfirstlane_b32 s17, v1
+; GCN-NEXT:    v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1]
 ; GCN-NEXT:    s_and_saveexec_b64 s[48:49], vcc
 ; GCN-NEXT:    s_mov_b64 s[4:5], s[44:45]
 ; GCN-NEXT:    s_mov_b64 s[6:7], s[42:43]
@@ -250,36 +244,34 @@ define void @test_indirect_call_vgpr_ptr(void()* %fptr) {
 ; GCN-NEXT:    s_mov_b32 s12, s36
 ; GCN-NEXT:    s_mov_b32 s13, s35
 ; GCN-NEXT:    s_mov_b32 s14, s34
-; GCN-NEXT:    v_mov_b32_e32 v31, v40
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GCN-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GCN-NEXT:    ; implicit-def: $vgpr31
 ; GCN-NEXT:    s_xor_b64 exec, exec, s[48:49]
 ; GCN-NEXT:    s_cbranch_execnz BB2_1
 ; GCN-NEXT:  ; %bb.2:
 ; GCN-NEXT:    s_mov_b64 exec, s[46:47]
-; GCN-NEXT:    v_readlane_b32 s4, v43, 15
-; GCN-NEXT:    v_readlane_b32 s5, v43, 16
-; GCN-NEXT:    v_readlane_b32 s49, v43, 14
-; GCN-NEXT:    v_readlane_b32 s48, v43, 13
-; GCN-NEXT:    v_readlane_b32 s47, v43, 12
-; GCN-NEXT:    v_readlane_b32 s46, v43, 11
-; GCN-NEXT:    v_readlane_b32 s45, v43, 10
-; GCN-NEXT:    v_readlane_b32 s44, v43, 9
-; GCN-NEXT:    v_readlane_b32 s43, v43, 8
-; GCN-NEXT:    v_readlane_b32 s42, v43, 7
-; GCN-NEXT:    v_readlane_b32 s41, v43, 6
-; GCN-NEXT:    v_readlane_b32 s40, v43, 5
-; GCN-NEXT:    v_readlane_b32 s39, v43, 4
-; GCN-NEXT:    v_readlane_b32 s38, v43, 3
-; GCN-NEXT:    v_readlane_b32 s36, v43, 2
-; GCN-NEXT:    v_readlane_b32 s35, v43, 1
-; GCN-NEXT:    v_readlane_b32 s34, v43, 0
-; GCN-NEXT:    buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; GCN-NEXT:    s_addk_i32 s32, 0xf800
-; GCN-NEXT:    v_readlane_b32 s33, v43, 17
+; GCN-NEXT:    v_readlane_b32 s4, v40, 15
+; GCN-NEXT:    v_readlane_b32 s5, v40, 16
+; GCN-NEXT:    v_readlane_b32 s49, v40, 14
+; GCN-NEXT:    v_readlane_b32 s48, v40, 13
+; GCN-NEXT:    v_readlane_b32 s47, v40, 12
+; GCN-NEXT:    v_readlane_b32 s46, v40, 11
+; GCN-NEXT:    v_readlane_b32 s45, v40, 10
+; GCN-NEXT:    v_readlane_b32 s44, v40, 9
+; GCN-NEXT:    v_readlane_b32 s43, v40, 8
+; GCN-NEXT:    v_readlane_b32 s42, v40, 7
+; GCN-NEXT:    v_readlane_b32 s41, v40, 6
+; GCN-NEXT:    v_readlane_b32 s40, v40, 5
+; GCN-NEXT:    v_readlane_b32 s39, v40, 4
+; GCN-NEXT:    v_readlane_b32 s38, v40, 3
+; GCN-NEXT:    v_readlane_b32 s36, v40, 2
+; GCN-NEXT:    v_readlane_b32 s35, v40, 1
+; GCN-NEXT:    v_readlane_b32 s34, v40, 0
+; GCN-NEXT:    s_addk_i32 s32, 0xfc00
+; GCN-NEXT:    v_readlane_b32 s33, v40, 17
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
-; GCN-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_mov_b64 exec, s[6:7]
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[4:5]
@@ -292,32 +284,28 @@ define void @test_indirect_call_vgpr_ptr_arg(void(i32)* %fptr) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_or_saveexec_b64 s[16:17], -1
-; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 exec, s[16:17]
-; GCN-NEXT:    v_writelane_b32 v43, s33, 17
+; GCN-NEXT:    v_writelane_b32 v40, s33, 17
 ; GCN-NEXT:    s_mov_b32 s33, s32
-; GCN-NEXT:    s_addk_i32 s32, 0x800
-; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
-; GCN-NEXT:    v_writelane_b32 v43, s34, 0
-; GCN-NEXT:    v_writelane_b32 v43, s35, 1
-; GCN-NEXT:    v_writelane_b32 v43, s36, 2
-; GCN-NEXT:    v_writelane_b32 v43, s38, 3
-; GCN-NEXT:    v_writelane_b32 v43, s39, 4
-; GCN-NEXT:    v_writelane_b32 v43, s40, 5
-; GCN-NEXT:    v_writelane_b32 v43, s41, 6
-; GCN-NEXT:    v_writelane_b32 v43, s42, 7
-; GCN-NEXT:    v_writelane_b32 v43, s43, 8
-; GCN-NEXT:    v_writelane_b32 v43, s44, 9
-; GCN-NEXT:    v_writelane_b32 v43, s45, 10
-; GCN-NEXT:    v_writelane_b32 v43, s46, 11
-; GCN-NEXT:    v_writelane_b32 v43, s47, 12
-; GCN-NEXT:    v_writelane_b32 v43, s48, 13
-; GCN-NEXT:    v_writelane_b32 v43, s49, 14
-; GCN-NEXT:    v_writelane_b32 v43, s30, 15
-; GCN-NEXT:    v_writelane_b32 v43, s31, 16
-; GCN-NEXT:    v_mov_b32_e32 v40, v31
+; GCN-NEXT:    s_addk_i32 s32, 0x400
+; GCN-NEXT:    v_writelane_b32 v40, s34, 0
+; GCN-NEXT:    v_writelane_b32 v40, s35, 1
+; GCN-NEXT:    v_writelane_b32 v40, s36, 2
+; GCN-NEXT:    v_writelane_b32 v40, s38, 3
+; GCN-NEXT:    v_writelane_b32 v40, s39, 4
+; GCN-NEXT:    v_writelane_b32 v40, s40, 5
+; GCN-NEXT:    v_writelane_b32 v40, s41, 6
+; GCN-NEXT:    v_writelane_b32 v40, s42, 7
+; GCN-NEXT:    v_writelane_b32 v40, s43, 8
+; GCN-NEXT:    v_writelane_b32 v40, s44, 9
+; GCN-NEXT:    v_writelane_b32 v40, s45, 10
+; GCN-NEXT:    v_writelane_b32 v40, s46, 11
+; GCN-NEXT:    v_writelane_b32 v40, s47, 12
+; GCN-NEXT:    v_writelane_b32 v40, s48, 13
+; GCN-NEXT:    v_writelane_b32 v40, s49, 14
+; GCN-NEXT:    v_writelane_b32 v40, s30, 15
+; GCN-NEXT:    v_writelane_b32 v40, s31, 16
 ; GCN-NEXT:    s_mov_b32 s34, s14
 ; GCN-NEXT:    s_mov_b32 s35, s13
 ; GCN-NEXT:    s_mov_b32 s36, s12
@@ -325,13 +313,11 @@ define void @test_indirect_call_vgpr_ptr_arg(void(i32)* %fptr) {
 ; GCN-NEXT:    s_mov_b64 s[40:41], s[8:9]
 ; GCN-NEXT:    s_mov_b64 s[42:43], s[6:7]
 ; GCN-NEXT:    s_mov_b64 s[44:45], s[4:5]
-; GCN-NEXT:    v_mov_b32_e32 v42, v1
-; GCN-NEXT:    v_mov_b32_e32 v41, v0
 ; GCN-NEXT:    s_mov_b64 s[46:47], exec
 ; GCN-NEXT:  BB3_1: ; =>This Inner Loop Header: Depth=1
-; GCN-NEXT:    v_readfirstlane_b32 s16, v41
-; GCN-NEXT:    v_readfirstlane_b32 s17, v42
-; GCN-NEXT:    v_cmp_eq_u64_e32 vcc, s[16:17], v[41:42]
+; GCN-NEXT:    v_readfirstlane_b32 s16, v0
+; GCN-NEXT:    v_readfirstlane_b32 s17, v1
+; GCN-NEXT:    v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1]
 ; GCN-NEXT:    s_and_saveexec_b64 s[48:49], vcc
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0x7b
 ; GCN-NEXT:    s_mov_b64 s[4:5], s[44:45]
@@ -341,36 +327,34 @@ define void @test_indirect_call_vgpr_ptr_arg(void(i32)* %fptr) {
 ; GCN-NEXT:    s_mov_b32 s12, s36
 ; GCN-NEXT:    s_mov_b32 s13, s35
 ; GCN-NEXT:    s_mov_b32 s14, s34
-; GCN-NEXT:    v_mov_b32_e32 v31, v40
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GCN-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GCN-NEXT:    ; implicit-def: $vgpr31
 ; GCN-NEXT:    s_xor_b64 exec, exec, s[48:49]
 ; GCN-NEXT:    s_cbranch_execnz BB3_1
 ; GCN-NEXT:  ; %bb.2:
 ; GCN-NEXT:    s_mov_b64 exec, s[46:47]
-; GCN-NEXT:    v_readlane_b32 s4, v43, 15
-; GCN-NEXT:    v_readlane_b32 s5, v43, 16
-; GCN-NEXT:    v_readlane_b32 s49, v43, 14
-; GCN-NEXT:    v_readlane_b32 s48, v43, 13
-; GCN-NEXT:    v_readlane_b32 s47, v43, 12
-; GCN-NEXT:    v_readlane_b32 s46, v43, 11
-; GCN-NEXT:    v_readlane_b32 s45, v43, 10
-; GCN-NEXT:    v_readlane_b32 s44, v43, 9
-; GCN-NEXT:    v_readlane_b32 s43, v43, 8
-; GCN-NEXT:    v_readlane_b32 s42, v43, 7
-; GCN-NEXT:    v_readlane_b32 s41, v43, 6
-; GCN-NEXT:    v_readlane_b32 s40, v43, 5
-; GCN-NEXT:    v_readlane_b32 s39, v43, 4
-; GCN-NEXT:    v_readlane_b32 s38, v43, 3
-; GCN-NEXT:    v_readlane_b32 s36, v43, 2
-; GCN-NEXT:    v_readlane_b32 s35, v43, 1
-; GCN-NEXT:    v_readlane_b32 s34, v43, 0
-; GCN-NEXT:    buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; GCN-NEXT:    s_addk_i32 s32, 0xf800
-; GCN-NEXT:    v_readlane_b32 s33, v43, 17
+; GCN-NEXT:    v_readlane_b32 s4, v40, 15
+; GCN-NEXT:    v_readlane_b32 s5, v40, 16
+; GCN-NEXT:    v_readlane_b32 s49, v40, 14
+; GCN-NEXT:    v_readlane_b32 s48, v40, 13
+; GCN-NEXT:    v_readlane_b32 s47, v40, 12
+; GCN-NEXT:    v_readlane_b32 s46, v40, 11
+; GCN-NEXT:    v_readlane_b32 s45, v40, 10
+; GCN-NEXT:    v_readlane_b32 s44, v40, 9
+; GCN-NEXT:    v_readlane_b32 s43, v40, 8
+; GCN-NEXT:    v_readlane_b32 s42, v40, 7
+; GCN-NEXT:    v_readlane_b32 s41, v40, 6
+; GCN-NEXT:    v_readlane_b32 s40, v40, 5
+; GCN-NEXT:    v_readlane_b32 s39, v40, 4
+; GCN-NEXT:    v_readlane_b32 s38, v40, 3
+; GCN-NEXT:    v_readlane_b32 s36, v40, 2
+; GCN-NEXT:    v_readlane_b32 s35, v40, 1
+; GCN-NEXT:    v_readlane_b32 s34, v40, 0
+; GCN-NEXT:    s_addk_i32 s32, 0xfc00
+; GCN-NEXT:    v_readlane_b32 s33, v40, 17
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
-; GCN-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_mov_b64 exec, s[6:7]
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[4:5]
@@ -383,32 +367,28 @@ define i32 @test_indirect_call_vgpr_ptr_ret(i32()* %fptr) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_or_saveexec_b64 s[16:17], -1
-; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 exec, s[16:17]
-; GCN-NEXT:    v_writelane_b32 v43, s33, 17
+; GCN-NEXT:    v_writelane_b32 v40, s33, 17
 ; GCN-NEXT:    s_mov_b32 s33, s32
-; GCN-NEXT:    s_addk_i32 s32, 0x800
-; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
-; GCN-NEXT:    v_writelane_b32 v43, s34, 0
-; GCN-NEXT:    v_writelane_b32 v43, s35, 1
-; GCN-NEXT:    v_writelane_b32 v43, s36, 2
-; GCN-NEXT:    v_writelane_b32 v43, s38, 3
-; GCN-NEXT:    v_writelane_b32 v43, s39, 4
-; GCN-NEXT:    v_writelane_b32 v43, s40, 5
-; GCN-NEXT:    v_writelane_b32 v43, s41, 6
-; GCN-NEXT:    v_writelane_b32 v43, s42, 7
-; GCN-NEXT:    v_writelane_b32 v43, s43, 8
-; GCN-NEXT:    v_writelane_b32 v43, s44, 9
-; GCN-NEXT:    v_writelane_b32 v43, s45, 10
-; GCN-NEXT:    v_writelane_b32 v43, s46, 11
-; GCN-NEXT:    v_writelane_b32 v43, s47, 12
-; GCN-NEXT:    v_writelane_b32 v43, s48, 13
-; GCN-NEXT:    v_writelane_b32 v43, s49, 14
-; GCN-NEXT:    v_writelane_b32 v43, s30, 15
-; GCN-NEXT:    v_writelane_b32 v43, s31, 16
-; GCN-NEXT:    v_mov_b32_e32 v40, v31
+; GCN-NEXT:    s_addk_i32 s32, 0x400
+; GCN-NEXT:    v_writelane_b32 v40, s34, 0
+; GCN-NEXT:    v_writelane_b32 v40, s35, 1
+; GCN-NEXT:    v_writelane_b32 v40, s36, 2
+; GCN-NEXT:    v_writelane_b32 v40, s38, 3
+; GCN-NEXT:    v_writelane_b32 v40, s39, 4
+; GCN-NEXT:    v_writelane_b32 v40, s40, 5
+; GCN-NEXT:    v_writelane_b32 v40, s41, 6
+; GCN-NEXT:    v_writelane_b32 v40, s42, 7
+; GCN-NEXT:    v_writelane_b32 v40, s43, 8
+; GCN-NEXT:    v_writelane_b32 v40, s44, 9
+; GCN-NEXT:    v_writelane_b32 v40, s45, 10
+; GCN-NEXT:    v_writelane_b32 v40, s46, 11
+; GCN-NEXT:    v_writelane_b32 v40, s47, 12
+; GCN-NEXT:    v_writelane_b32 v40, s48, 13
+; GCN-NEXT:    v_writelane_b32 v40, s49, 14
+; GCN-NEXT:    v_writelane_b32 v40, s30, 15
+; GCN-NEXT:    v_writelane_b32 v40, s31, 16
 ; GCN-NEXT:    s_mov_b32 s34, s14
 ; GCN-NEXT:    s_mov_b32 s35, s13
 ; GCN-NEXT:    s_mov_b32 s36, s12
@@ -416,13 +396,11 @@ define i32 @test_indirect_call_vgpr_ptr_ret(i32()* %fptr) {
 ; GCN-NEXT:    s_mov_b64 s[40:41], s[8:9]
 ; GCN-NEXT:    s_mov_b64 s[42:43], s[6:7]
 ; GCN-NEXT:    s_mov_b64 s[44:45], s[4:5]
-; GCN-NEXT:    v_mov_b32_e32 v42, v1
-; GCN-NEXT:    v_mov_b32_e32 v41, v0
 ; GCN-NEXT:    s_mov_b64 s[46:47], exec
 ; GCN-NEXT:  BB4_1: ; =>This Inner Loop Header: Depth=1
-; GCN-NEXT:    v_readfirstlane_b32 s16, v41
-; GCN-NEXT:    v_readfirstlane_b32 s17, v42
-; GCN-NEXT:    v_cmp_eq_u64_e32 vcc, s[16:17], v[41:42]
+; GCN-NEXT:    v_readfirstlane_b32 s16, v0
+; GCN-NEXT:    v_readfirstlane_b32 s17, v1
+; GCN-NEXT:    v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1]
 ; GCN-NEXT:    s_and_saveexec_b64 s[48:49], vcc
 ; GCN-NEXT:    s_mov_b64 s[4:5], s[44:45]
 ; GCN-NEXT:    s_mov_b64 s[6:7], s[42:43]
@@ -431,37 +409,36 @@ define i32 @test_indirect_call_vgpr_ptr_ret(i32()* %fptr) {
 ; GCN-NEXT:    s_mov_b32 s12, s36
 ; GCN-NEXT:    s_mov_b32 s13, s35
 ; GCN-NEXT:    s_mov_b32 s14, s34
-; GCN-NEXT:    v_mov_b32_e32 v31, v40
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GCN-NEXT:    v_mov_b32_e32 v2, v0
+; GCN-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GCN-NEXT:    ; implicit-def: $vgpr31
 ; GCN-NEXT:    s_xor_b64 exec, exec, s[48:49]
 ; GCN-NEXT:    s_cbranch_execnz BB4_1
 ; GCN-NEXT:  ; %bb.2:
 ; GCN-NEXT:    s_mov_b64 exec, s[46:47]
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
-; GCN-NEXT:    v_readlane_b32 s4, v43, 15
-; GCN-NEXT:    v_readlane_b32 s5, v43, 16
-; GCN-NEXT:    v_readlane_b32 s49, v43, 14
-; GCN-NEXT:    v_readlane_b32 s48, v43, 13
-; GCN-NEXT:    v_readlane_b32 s47, v43, 12
-; GCN-NEXT:    v_readlane_b32 s46, v43, 11
-; GCN-NEXT:    v_readlane_b32 s45, v43, 10
-; GCN-NEXT:    v_readlane_b32 s44, v43, 9
-; GCN-NEXT:    v_readlane_b32 s43, v43, 8
-; GCN-NEXT:    v_readlane_b32 s42, v43, 7
-; GCN-NEXT:    v_readlane_b32 s41, v43, 6
-; GCN-NEXT:    v_readlane_b32 s40, v43, 5
-; GCN-NEXT:    v_readlane_b32 s39, v43, 4
-; GCN-NEXT:    v_readlane_b32 s38, v43, 3
-; GCN-NEXT:    v_readlane_b32 s36, v43, 2
-; GCN-NEXT:    v_readlane_b32 s35, v43, 1
-; GCN-NEXT:    v_readlane_b32 s34, v43, 0
-; GCN-NEXT:    buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; GCN-NEXT:    s_addk_i32 s32, 0xf800
-; GCN-NEXT:    v_readlane_b32 s33, v43, 17
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, 1, v2
+; GCN-NEXT:    v_readlane_b32 s4, v40, 15
+; GCN-NEXT:    v_readlane_b32 s5, v40, 16
+; GCN-NEXT:    v_readlane_b32 s49, v40, 14
+; GCN-NEXT:    v_readlane_b32 s48, v40, 13
+; GCN-NEXT:    v_readlane_b32 s47, v40, 12
+; GCN-NEXT:    v_readlane_b32 s46, v40, 11
+; GCN-NEXT:    v_readlane_b32 s45, v40, 10
+; GCN-NEXT:    v_readlane_b32 s44, v40, 9
+; GCN-NEXT:    v_readlane_b32 s43, v40, 8
+; GCN-NEXT:    v_readlane_b32 s42, v40, 7
+; GCN-NEXT:    v_readlane_b32 s41, v40, 6
+; GCN-NEXT:    v_readlane_b32 s40, v40, 5
+; GCN-NEXT:    v_readlane_b32 s39, v40, 4
+; GCN-NEXT:    v_readlane_b32 s38, v40, 3
+; GCN-NEXT:    v_readlane_b32 s36, v40, 2
+; GCN-NEXT:    v_readlane_b32 s35, v40, 1
+; GCN-NEXT:    v_readlane_b32 s34, v40, 0
+; GCN-NEXT:    s_addk_i32 s32, 0xfc00
+; GCN-NEXT:    v_readlane_b32 s33, v40, 17
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
-; GCN-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_mov_b64 exec, s[6:7]
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[4:5]
@@ -475,32 +452,28 @@ define void @test_indirect_call_vgpr_ptr_in_branch(void()* %fptr, i1 %cond) {
 ; GCN:       ; %bb.0: ; %bb0
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_or_saveexec_b64 s[16:17], -1
-; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 exec, s[16:17]
-; GCN-NEXT:    v_writelane_b32 v43, s33, 19
+; GCN-NEXT:    v_writelane_b32 v40, s33, 19
 ; GCN-NEXT:    s_mov_b32 s33, s32
-; GCN-NEXT:    s_addk_i32 s32, 0x800
-; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
-; GCN-NEXT:    v_writelane_b32 v43, s34, 0
-; GCN-NEXT:    v_writelane_b32 v43, s35, 1
-; GCN-NEXT:    v_writelane_b32 v43, s36, 2
-; GCN-NEXT:    v_writelane_b32 v43, s38, 3
-; GCN-NEXT:    v_writelane_b32 v43, s39, 4
-; GCN-NEXT:    v_writelane_b32 v43, s40, 5
-; GCN-NEXT:    v_writelane_b32 v43, s41, 6
-; GCN-NEXT:    v_writelane_b32 v43, s42, 7
-; GCN-NEXT:    v_writelane_b32 v43, s43, 8
-; GCN-NEXT:    v_writelane_b32 v43, s44, 9
-; GCN-NEXT:    v_writelane_b32 v43, s45, 10
-; GCN-NEXT:    v_writelane_b32 v43, s46, 11
-; GCN-NEXT:    v_writelane_b32 v43, s47, 12
-; GCN-NEXT:    v_writelane_b32 v43, s48, 13
-; GCN-NEXT:    v_writelane_b32 v43, s49, 14
-; GCN-NEXT:    v_writelane_b32 v43, s50, 15
-; GCN-NEXT:    v_writelane_b32 v43, s51, 16
-; GCN-NEXT:    v_mov_b32_e32 v40, v31
+; GCN-NEXT:    s_addk_i32 s32, 0x400
+; GCN-NEXT:    v_writelane_b32 v40, s34, 0
+; GCN-NEXT:    v_writelane_b32 v40, s35, 1
+; GCN-NEXT:    v_writelane_b32 v40, s36, 2
+; GCN-NEXT:    v_writelane_b32 v40, s38, 3
+; GCN-NEXT:    v_writelane_b32 v40, s39, 4
+; GCN-NEXT:    v_writelane_b32 v40, s40, 5
+; GCN-NEXT:    v_writelane_b32 v40, s41, 6
+; GCN-NEXT:    v_writelane_b32 v40, s42, 7
+; GCN-NEXT:    v_writelane_b32 v40, s43, 8
+; GCN-NEXT:    v_writelane_b32 v40, s44, 9
+; GCN-NEXT:    v_writelane_b32 v40, s45, 10
+; GCN-NEXT:    v_writelane_b32 v40, s46, 11
+; GCN-NEXT:    v_writelane_b32 v40, s47, 12
+; GCN-NEXT:    v_writelane_b32 v40, s48, 13
+; GCN-NEXT:    v_writelane_b32 v40, s49, 14
+; GCN-NEXT:    v_writelane_b32 v40, s50, 15
+; GCN-NEXT:    v_writelane_b32 v40, s51, 16
 ; GCN-NEXT:    s_mov_b32 s34, s14
 ; GCN-NEXT:    s_mov_b32 s35, s13
 ; GCN-NEXT:    s_mov_b32 s36, s12
@@ -508,20 +481,18 @@ define void @test_indirect_call_vgpr_ptr_in_branch(void()* %fptr, i1 %cond) {
 ; GCN-NEXT:    s_mov_b64 s[40:41], s[8:9]
 ; GCN-NEXT:    s_mov_b64 s[42:43], s[6:7]
 ; GCN-NEXT:    s_mov_b64 s[44:45], s[4:5]
-; GCN-NEXT:    v_mov_b32_e32 v42, v1
-; GCN-NEXT:    v_mov_b32_e32 v41, v0
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v2
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; GCN-NEXT:    v_and_b32_e32 v2, 1, v2
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
 ; GCN-NEXT:    s_and_saveexec_b64 s[46:47], vcc
 ; GCN-NEXT:    s_cbranch_execz BB5_4
 ; GCN-NEXT:  ; %bb.1: ; %bb1
-; GCN-NEXT:    v_writelane_b32 v43, s30, 17
-; GCN-NEXT:    v_writelane_b32 v43, s31, 18
+; GCN-NEXT:    v_writelane_b32 v40, s30, 17
+; GCN-NEXT:    v_writelane_b32 v40, s31, 18
 ; GCN-NEXT:    s_mov_b64 s[48:49], exec
 ; GCN-NEXT:  BB5_2: ; =>This Inner Loop Header: Depth=1
-; GCN-NEXT:    v_readfirstlane_b32 s16, v41
-; GCN-NEXT:    v_readfirstlane_b32 s17, v42
-; GCN-NEXT:    v_cmp_eq_u64_e32 vcc, s[16:17], v[41:42]
+; GCN-NEXT:    v_readfirstlane_b32 s16, v0
+; GCN-NEXT:    v_readfirstlane_b32 s17, v1
+; GCN-NEXT:    v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1]
 ; GCN-NEXT:    s_and_saveexec_b64 s[50:51], vcc
 ; GCN-NEXT:    s_mov_b64 s[4:5], s[44:45]
 ; GCN-NEXT:    s_mov_b64 s[6:7], s[42:43]
@@ -530,40 +501,38 @@ define void @test_indirect_call_vgpr_ptr_in_branch(void()* %fptr, i1 %cond) {
 ; GCN-NEXT:    s_mov_b32 s12, s36
 ; GCN-NEXT:    s_mov_b32 s13, s35
 ; GCN-NEXT:    s_mov_b32 s14, s34
-; GCN-NEXT:    v_mov_b32_e32 v31, v40
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GCN-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GCN-NEXT:    ; implicit-def: $vgpr31
 ; GCN-NEXT:    s_xor_b64 exec, exec, s[50:51]
 ; GCN-NEXT:    s_cbranch_execnz BB5_2
 ; GCN-NEXT:  ; %bb.3:
 ; GCN-NEXT:    s_mov_b64 exec, s[48:49]
-; GCN-NEXT:    v_readlane_b32 s30, v43, 17
-; GCN-NEXT:    v_readlane_b32 s31, v43, 18
+; GCN-NEXT:    v_readlane_b32 s30, v40, 17
+; GCN-NEXT:    v_readlane_b32 s31, v40, 18
 ; GCN-NEXT:  BB5_4: ; %bb2
 ; GCN-NEXT:    s_or_b64 exec, exec, s[46:47]
-; GCN-NEXT:    v_readlane_b32 s51, v43, 16
-; GCN-NEXT:    v_readlane_b32 s50, v43, 15
-; GCN-NEXT:    v_readlane_b32 s49, v43, 14
-; GCN-NEXT:    v_readlane_b32 s48, v43, 13
-; GCN-NEXT:    v_readlane_b32 s47, v43, 12
-; GCN-NEXT:    v_readlane_b32 s46, v43, 11
-; GCN-NEXT:    v_readlane_b32 s45, v43, 10
-; GCN-NEXT:    v_readlane_b32 s44, v43, 9
-; GCN-NEXT:    v_readlane_b32 s43, v43, 8
-; GCN-NEXT:    v_readlane_b32 s42, v43, 7
-; GCN-NEXT:    v_readlane_b32 s41, v43, 6
-; GCN-NEXT:    v_readlane_b32 s40, v43, 5
-; GCN-NEXT:    v_readlane_b32 s39, v43, 4
-; GCN-NEXT:    v_readlane_b32 s38, v43, 3
-; GCN-NEXT:    v_readlane_b32 s36, v43, 2
-; GCN-NEXT:    v_readlane_b32 s35, v43, 1
-; GCN-NEXT:    v_readlane_b32 s34, v43, 0
-; GCN-NEXT:    buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; GCN-NEXT:    s_addk_i32 s32, 0xf800
-; GCN-NEXT:    v_readlane_b32 s33, v43, 19
+; GCN-NEXT:    v_readlane_b32 s51, v40, 16
+; GCN-NEXT:    v_readlane_b32 s50, v40, 15
+; GCN-NEXT:    v_readlane_b32 s49, v40, 14
+; GCN-NEXT:    v_readlane_b32 s48, v40, 13
+; GCN-NEXT:    v_readlane_b32 s47, v40, 12
+; GCN-NEXT:    v_readlane_b32 s46, v40, 11
+; GCN-NEXT:    v_readlane_b32 s45, v40, 10
+; GCN-NEXT:    v_readlane_b32 s44, v40, 9
+; GCN-NEXT:    v_readlane_b32 s43, v40, 8
+; GCN-NEXT:    v_readlane_b32 s42, v40, 7
+; GCN-NEXT:    v_readlane_b32 s41, v40, 6
+; GCN-NEXT:    v_readlane_b32 s40, v40, 5
+; GCN-NEXT:    v_readlane_b32 s39, v40, 4
+; GCN-NEXT:    v_readlane_b32 s38, v40, 3
+; GCN-NEXT:    v_readlane_b32 s36, v40, 2
+; GCN-NEXT:    v_readlane_b32 s35, v40, 1
+; GCN-NEXT:    v_readlane_b32 s34, v40, 0
+; GCN-NEXT:    s_addk_i32 s32, 0xfc00
+; GCN-NEXT:    v_readlane_b32 s33, v40, 19
 ; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
-; GCN-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
@@ -583,48 +552,145 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(void(i32)* %fptr) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
-; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
-; GCN-NEXT:    v_writelane_b32 v42, s33, 6
+; GCN-NEXT:    v_writelane_b32 v40, s33, 6
 ; GCN-NEXT:    s_mov_b32 s33, s32
 ; GCN-NEXT:    s_addk_i32 s32, 0x400
-; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
-; GCN-NEXT:    v_writelane_b32 v42, s34, 0
-; GCN-NEXT:    v_writelane_b32 v42, s35, 1
-; GCN-NEXT:    v_writelane_b32 v42, s36, 2
-; GCN-NEXT:    v_writelane_b32 v42, s37, 3
-; GCN-NEXT:    v_writelane_b32 v42, s30, 4
-; GCN-NEXT:    v_writelane_b32 v42, s31, 5
-; GCN-NEXT:    v_mov_b32_e32 v41, v1
-; GCN-NEXT:    v_mov_b32_e32 v40, v0
+; GCN-NEXT:    v_writelane_b32 v40, s34, 0
+; GCN-NEXT:    v_writelane_b32 v40, s35, 1
+; GCN-NEXT:    v_writelane_b32 v40, s36, 2
+; GCN-NEXT:    v_writelane_b32 v40, s37, 3
+; GCN-NEXT:    v_writelane_b32 v40, s30, 4
+; GCN-NEXT:    v_writelane_b32 v40, s31, 5
 ; GCN-NEXT:    s_mov_b64 s[34:35], exec
 ; GCN-NEXT:  BB6_1: ; =>This Inner Loop Header: Depth=1
-; GCN-NEXT:    v_readfirstlane_b32 s6, v40
-; GCN-NEXT:    v_readfirstlane_b32 s7, v41
-; GCN-NEXT:    v_cmp_eq_u64_e32 vcc, s[6:7], v[40:41]
+; GCN-NEXT:    v_readfirstlane_b32 s6, v0
+; GCN-NEXT:    v_readfirstlane_b32 s7, v1
+; GCN-NEXT:    v_cmp_eq_u64_e32 vcc, s[6:7], v[0:1]
 ; GCN-NEXT:    s_and_saveexec_b64 s[36:37], vcc
 ; GCN-NEXT:    s_movk_i32 s4, 0x7b
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GCN-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GCN-NEXT:    s_xor_b64 exec, exec, s[36:37]
 ; GCN-NEXT:    s_cbranch_execnz BB6_1
 ; GCN-NEXT:  ; %bb.2:
 ; GCN-NEXT:    s_mov_b64 exec, s[34:35]
-; GCN-NEXT:    v_readlane_b32 s4, v42, 4
-; GCN-NEXT:    v_readlane_b32 s5, v42, 5
-; GCN-NEXT:    v_readlane_b32 s37, v42, 3
-; GCN-NEXT:    v_readlane_b32 s36, v42, 2
-; GCN-NEXT:    v_readlane_b32 s35, v42, 1
-; GCN-NEXT:    v_readlane_b32 s34, v42, 0
-; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT:    v_readlane_b32 s4, v40, 4
+; GCN-NEXT:    v_readlane_b32 s5, v40, 5
+; GCN-NEXT:    v_readlane_b32 s37, v40, 3
+; GCN-NEXT:    v_readlane_b32 s36, v40, 2
+; GCN-NEXT:    v_readlane_b32 s35, v40, 1
+; GCN-NEXT:    v_readlane_b32 s34, v40, 0
 ; GCN-NEXT:    s_addk_i32 s32, 0xfc00
-; GCN-NEXT:    v_readlane_b32 s33, v42, 6
+; GCN-NEXT:    v_readlane_b32 s33, v40, 6
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
-; GCN-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_mov_b64 exec, s[6:7]
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[4:5]
   call amdgpu_gfx void %fptr(i32 inreg 123)
   ret void
 }
+
+define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, void(i32)* %fptr) {
+; GCN-LABEL: test_indirect_call_vgpr_ptr_arg_and_reuse:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT:    s_mov_b64 exec, s[4:5]
+; GCN-NEXT:    v_writelane_b32 v41, s33, 6
+; GCN-NEXT:    s_mov_b32 s33, s32
+; GCN-NEXT:    s_addk_i32 s32, 0x400
+; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GCN-NEXT:    v_writelane_b32 v41, s34, 0
+; GCN-NEXT:    v_writelane_b32 v41, s35, 1
+; GCN-NEXT:    v_writelane_b32 v41, s36, 2
+; GCN-NEXT:    v_writelane_b32 v41, s37, 3
+; GCN-NEXT:    v_writelane_b32 v41, s30, 4
+; GCN-NEXT:    v_writelane_b32 v41, s31, 5
+; GCN-NEXT:    v_mov_b32_e32 v40, v0
+; GCN-NEXT:    s_mov_b64 s[34:35], exec
+; GCN-NEXT:  BB7_1: ; =>This Inner Loop Header: Depth=1
+; GCN-NEXT:    v_readfirstlane_b32 s4, v1
+; GCN-NEXT:    v_readfirstlane_b32 s5, v2
+; GCN-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[1:2]
+; GCN-NEXT:    s_and_saveexec_b64 s[36:37], vcc
+; GCN-NEXT:    v_mov_b32_e32 v0, v40
+; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GCN-NEXT:    ; implicit-def: $vgpr1_vgpr2
+; GCN-NEXT:    s_xor_b64 exec, exec, s[36:37]
+; GCN-NEXT:    s_cbranch_execnz BB7_1
+; GCN-NEXT:  ; %bb.2:
+; GCN-NEXT:    s_mov_b64 exec, s[34:35]
+; GCN-NEXT:    v_mov_b32_e32 v0, v40
+; GCN-NEXT:    v_readlane_b32 s4, v41, 4
+; GCN-NEXT:    v_readlane_b32 s5, v41, 5
+; GCN-NEXT:    v_readlane_b32 s37, v41, 3
+; GCN-NEXT:    v_readlane_b32 s36, v41, 2
+; GCN-NEXT:    v_readlane_b32 s35, v41, 1
+; GCN-NEXT:    v_readlane_b32 s34, v41, 0
+; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GCN-NEXT:    s_addk_i32 s32, 0xfc00
+; GCN-NEXT:    v_readlane_b32 s33, v41, 6
+; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT:    s_mov_b64 exec, s[6:7]
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[4:5]
+  call amdgpu_gfx void %fptr(i32 %i)
+  ret i32 %i
+}
+
+; Use a variable inside a waterfall loop and use the return variable after the loop.
+; TODO The argument and return variable could be in the same physical register, but the register
+; allocator is not able to do that because the return value clashes with the liverange of an
+; IMPLICIT_DEF of the argument.
+define i32 @test_indirect_call_vgpr_ptr_arg_and_return(i32 %i, i32(i32)* %fptr) {
+; GCN-LABEL: test_indirect_call_vgpr_ptr_arg_and_return:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GCN-NEXT:    s_mov_b64 exec, s[4:5]
+; GCN-NEXT:    v_writelane_b32 v40, s33, 6
+; GCN-NEXT:    s_mov_b32 s33, s32
+; GCN-NEXT:    s_addk_i32 s32, 0x400
+; GCN-NEXT:    v_writelane_b32 v40, s34, 0
+; GCN-NEXT:    v_writelane_b32 v40, s35, 1
+; GCN-NEXT:    v_writelane_b32 v40, s36, 2
+; GCN-NEXT:    v_writelane_b32 v40, s37, 3
+; GCN-NEXT:    v_writelane_b32 v40, s30, 4
+; GCN-NEXT:    v_writelane_b32 v40, s31, 5
+; GCN-NEXT:    s_mov_b64 s[34:35], exec
+; GCN-NEXT:  BB8_1: ; =>This Inner Loop Header: Depth=1
+; GCN-NEXT:    v_readfirstlane_b32 s4, v1
+; GCN-NEXT:    v_readfirstlane_b32 s5, v2
+; GCN-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[1:2]
+; GCN-NEXT:    s_and_saveexec_b64 s[36:37], vcc
+; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GCN-NEXT:    v_mov_b32_e32 v3, v0
+; GCN-NEXT:    ; implicit-def: $vgpr1_vgpr2
+; GCN-NEXT:    ; implicit-def: $vgpr0
+; GCN-NEXT:    s_xor_b64 exec, exec, s[36:37]
+; GCN-NEXT:    s_cbranch_execnz BB8_1
+; GCN-NEXT:  ; %bb.2:
+; GCN-NEXT:    s_mov_b64 exec, s[34:35]
+; GCN-NEXT:    v_mov_b32_e32 v0, v3
+; GCN-NEXT:    v_readlane_b32 s4, v40, 4
+; GCN-NEXT:    v_readlane_b32 s5, v40, 5
+; GCN-NEXT:    v_readlane_b32 s37, v40, 3
+; GCN-NEXT:    v_readlane_b32 s36, v40, 2
+; GCN-NEXT:    v_readlane_b32 s35, v40, 1
+; GCN-NEXT:    v_readlane_b32 s34, v40, 0
+; GCN-NEXT:    s_addk_i32 s32, 0xfc00
+; GCN-NEXT:    v_readlane_b32 s33, v40, 6
+; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GCN-NEXT:    s_mov_b64 exec, s[6:7]
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[4:5]
+  %ret = call amdgpu_gfx i32 %fptr(i32 %i)
+  ret i32 %ret
+}

diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll
index 18d22e39710e..d9963b90adc5 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll
@@ -18,6 +18,8 @@ define amdgpu_gs void @main(<4 x i32> %arg, i32 %arg1) {
 ; GFX10-NEXT:    s_and_b32 s0, vcc_lo, s0
 ; GFX10-NEXT:    s_and_saveexec_b32 s0, s0
 ; GFX10-NEXT:    buffer_load_format_d16_xyz v[5:6], v4, s[4:7], 0 idxen
+; GFX10-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX10-NEXT:    ; implicit-def: $vgpr4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
 ; GFX10-NEXT:    s_cbranch_execnz BB0_1
@@ -44,6 +46,8 @@ define amdgpu_gs void @main(<4 x i32> %arg, i32 %arg1) {
 ; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_load_format_d16_xyz v[5:6], v4, s[4:7], 0 idxen
+; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX9-NEXT:    ; implicit-def: $vgpr4
 ; GFX9-NEXT:    s_xor_b64 exec, exec, s[0:1]
 ; GFX9-NEXT:    s_cbranch_execnz BB0_1
 ; GFX9-NEXT:  ; %bb.2:
@@ -68,6 +72,8 @@ define amdgpu_gs void @main(<4 x i32> %arg, i32 %arg1) {
 ; GFX8-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
 ; GFX8-NEXT:    s_nop 0
 ; GFX8-NEXT:    buffer_load_format_d16_xyz v[5:6], v4, s[4:7], 0 idxen
+; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX8-NEXT:    ; implicit-def: $vgpr4
 ; GFX8-NEXT:    s_xor_b64 exec, exec, s[0:1]
 ; GFX8-NEXT:    s_cbranch_execnz BB0_1
 ; GFX8-NEXT:  ; %bb.2:

diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll b/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll
index e62aa9771198..2f3007d06ac0 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll
@@ -13,19 +13,20 @@ define void @vgpr_descriptor_waterfall_loop_idom_update(<4 x i32>* %arg) #0 {
 ; GCN-NEXT:    s_mov_b32 s5, exec_lo
 ; GCN-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo
 ; GCN-NEXT:    s_clause 0x1
-; GCN-NEXT:    flat_load_dwordx2 v[2:3], v[6:7]
-; GCN-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
+; GCN-NEXT:    flat_load_dwordx2 v[4:5], v[6:7]
+; GCN-NEXT:    flat_load_dwordx2 v[2:3], v[0:1]
 ; GCN-NEXT:  BB0_2: ; Parent Loop BB0_1 Depth=1
 ; GCN-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GCN-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_readfirstlane_b32 s8, v4
-; GCN-NEXT:    v_readfirstlane_b32 s9, v5
-; GCN-NEXT:    v_readfirstlane_b32 s10, v2
-; GCN-NEXT:    v_readfirstlane_b32 s11, v3
-; GCN-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[4:5]
-; GCN-NEXT:    v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
+; GCN-NEXT:    v_readfirstlane_b32 s8, v2
+; GCN-NEXT:    v_readfirstlane_b32 s9, v3
+; GCN-NEXT:    v_readfirstlane_b32 s10, v4
+; GCN-NEXT:    v_readfirstlane_b32 s11, v5
+; GCN-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[2:3]
+; GCN-NEXT:    v_cmp_eq_u64_e64 s4, s[10:11], v[4:5]
 ; GCN-NEXT:    s_and_b32 s4, vcc_lo, s4
 ; GCN-NEXT:    s_and_saveexec_b32 s4, s4
+; GCN-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
 ; GCN-NEXT:    buffer_store_dword v0, v0, s[8:11], 0 offen
 ; GCN-NEXT:    s_waitcnt_depctr 0xffe3
 ; GCN-NEXT:    s_xor_b32 exec_lo, exec_lo, s4


        

