[llvm] 1d286ad - [AMDGPU] Add mark last scratch load pass (#75512)

via llvm-commits llvm-commits at lists.llvm.org
Thu Jan 18 00:36:48 PST 2024


Author: Mirko Brkušanin
Date: 2024-01-18T09:36:44+01:00
New Revision: 1d286ad59b9080d9503502a35f9bdb35e40f1f33

URL: https://github.com/llvm/llvm-project/commit/1d286ad59b9080d9503502a35f9bdb35e40f1f33
DIFF: https://github.com/llvm/llvm-project/commit/1d286ad59b9080d9503502a35f9bdb35e40f1f33.diff

LOG: [AMDGPU] Add mark last scratch load pass (#75512)

Added: 
    llvm/lib/Target/AMDGPU/AMDGPUMarkLastScratchLoad.cpp
    llvm/test/CodeGen/AMDGPU/vgpr-mark-last-scratch-load.ll
    llvm/test/CodeGen/AMDGPU/vgpr-mark-last-scratch-load.mir

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPU.h
    llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
    llvm/lib/Target/AMDGPU/CMakeLists.txt
    llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
    llvm/lib/Target/AMDGPU/SIInstrInfo.h
    llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
    llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
    llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll
    llvm/test/CodeGen/MIR/AMDGPU/target-memoperands.mir

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 35d33cb60bc47c..36af767a70b0a8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -166,6 +166,9 @@ extern char &SILowerI1CopiesID;
 void initializeAMDGPUGlobalISelDivergenceLoweringPass(PassRegistry &);
 extern char &AMDGPUGlobalISelDivergenceLoweringID;
 
+void initializeAMDGPUMarkLastScratchLoadPass(PassRegistry &);
+extern char &AMDGPUMarkLastScratchLoadID;
+
 void initializeSILowerSGPRSpillsPass(PassRegistry &);
 extern char &SILowerSGPRSpillsID;
 

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMarkLastScratchLoad.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMarkLastScratchLoad.cpp
new file mode 100644
index 00000000000000..0692a12a406112
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMarkLastScratchLoad.cpp
@@ -0,0 +1,142 @@
+//===-- AMDGPUMarkLastScratchLoad.cpp -------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Mark scratch load/spill instructions which are guaranteed to be the last time
+// this scratch slot is used so it can be evicted from caches.
+//
+// TODO: Handle general stack accesses not just spilling.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/LiveStacks.h"
+#include "llvm/CodeGen/MachineOperand.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-mark-last-scratch-load"
+
+namespace {
+
+class AMDGPUMarkLastScratchLoad : public MachineFunctionPass {
+private:
+  LiveStacks *LS = nullptr;
+  LiveIntervals *LIS = nullptr;
+  SlotIndexes *SI = nullptr;
+  const SIInstrInfo *SII = nullptr;
+
+public:
+  static char ID;
+
+  AMDGPUMarkLastScratchLoad() : MachineFunctionPass(ID) {
+    initializeAMDGPUMarkLastScratchLoadPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<SlotIndexes>();
+    AU.addRequired<LiveIntervals>();
+    AU.addRequired<LiveStacks>();
+    AU.setPreservesAll();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  StringRef getPassName() const override {
+    return "AMDGPU Mark Last Scratch Load";
+  }
+};
+
+} // end anonymous namespace
+
+bool AMDGPUMarkLastScratchLoad::runOnMachineFunction(MachineFunction &MF) {
+  if (skipFunction(MF.getFunction()))
+    return false;
+
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  if (ST.getGeneration() < AMDGPUSubtarget::GFX12)
+    return false;
+
+  LS = &getAnalysis<LiveStacks>();
+  LIS = &getAnalysis<LiveIntervals>();
+  SI = &getAnalysis<SlotIndexes>();
+  SII = ST.getInstrInfo();
+  SlotIndexes &Slots = *LIS->getSlotIndexes();
+
+  const unsigned NumSlots = LS->getNumIntervals();
+  if (NumSlots == 0) {
+    LLVM_DEBUG(dbgs() << "No live slots, skipping\n");
+    return false;
+  }
+
+  LLVM_DEBUG(dbgs() << LS->getNumIntervals() << " intervals\n");
+
+  bool Changed = false;
+
+  for (auto &[SS, LI] : *LS) {
+    for (const LiveRange::Segment &Segment : LI.segments) {
+
+      // Ignore segments that run to the end of basic block because in this case
+      // slot is still live at the end of it.
+      if (Segment.end.isBlock())
+        continue;
+
+      const int FrameIndex = Register::stackSlot2Index(LI.reg());
+      MachineInstr *LastLoad = nullptr;
+
+      MachineInstr *MISegmentEnd = SI->getInstructionFromIndex(Segment.end);
+
+      // If there is no instruction at this slot because it was deleted take the
+      // instruction from the next slot.
+      if (!MISegmentEnd) {
+        SlotIndex NextSlot = Slots.getNextNonNullIndex(Segment.end);
+        MISegmentEnd = SI->getInstructionFromIndex(NextSlot);
+      }
+
+      MachineInstr *MISegmentStart = SI->getInstructionFromIndex(Segment.start);
+      MachineBasicBlock *BB = MISegmentEnd->getParent();
+
+      // Start iteration backwards from segment end until the start of basic
+      // block or start of segment if it is in the same basic block.
+      auto End = BB->rend();
+      if (MISegmentStart && MISegmentStart->getParent() == BB)
+        End = MISegmentStart->getReverseIterator();
+
+      for (auto MI = MISegmentEnd->getReverseIterator(); MI != End; ++MI) {
+        int LoadFI = 0;
+
+        if (SII->isLoadFromStackSlot(*MI, LoadFI) && LoadFI == FrameIndex) {
+          LastLoad = &*MI;
+          break;
+        }
+      }
+
+      if (LastLoad && !LastLoad->memoperands_empty()) {
+        MachineMemOperand *MMO = *LastLoad->memoperands_begin();
+        MMO->setFlags(MOLastUse);
+        Changed = true;
+        LLVM_DEBUG(dbgs() << "  Found last load: " << *LastLoad);
+      }
+    }
+  }
+
+  return Changed;
+}
+
+char AMDGPUMarkLastScratchLoad::ID = 0;
+
+char &llvm::AMDGPUMarkLastScratchLoadID = AMDGPUMarkLastScratchLoad::ID;
+
+INITIALIZE_PASS_BEGIN(AMDGPUMarkLastScratchLoad, DEBUG_TYPE,
+                      "AMDGPU Mark last scratch load", false, false)
+INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
+INITIALIZE_PASS_DEPENDENCY(LiveStacks)
+INITIALIZE_PASS_END(AMDGPUMarkLastScratchLoad, DEBUG_TYPE,
+                    "AMDGPU Mark last scratch load", false, false)
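
Note that the pass above only tags the memory operand of the last reload from
each slot, and only runs for GFX12 or newer per the subtarget check in
runOnMachineFunction; nothing is rewritten at this point. The tag is consumed
when the reload is lowered to a real scratch instruction in
SIRegisterInfo::buildSpillLoadStore further down, using the MOLastUse flag
declared in SIInstrInfo.h below. A minimal sketch of that consumer side,
assuming "using namespace llvm" and the AMDGPU headers used above
(selectCachePolicy is a hypothetical name, not a function in this patch):

    // Sketch only: choose the cache-policy immediate for a scratch reload
    // whose memory operand may have been tagged by AMDGPUMarkLastScratchLoad.
    static unsigned selectCachePolicy(const MachineMemOperand &MMO) {
      bool LastUse = MMO.getFlags() & MOLastUse; // MOLastUse == MOTargetFlag2
      return LastUse ? AMDGPU::CPol::TH_LU : 0;  // TH_LU: "last use" temporal hint
    }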

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 0f3bb3e7b0d8d0..b8a7a5e2080213 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -382,6 +382,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   initializeSILowerI1CopiesPass(*PR);
   initializeAMDGPUGlobalISelDivergenceLoweringPass(*PR);
   initializeSILowerWWMCopiesPass(*PR);
+  initializeAMDGPUMarkLastScratchLoadPass(*PR);
   initializeSILowerSGPRSpillsPass(*PR);
   initializeSIFixSGPRCopiesPass(*PR);
   initializeSIFixVGPRCopiesPass(*PR);
@@ -1424,6 +1425,8 @@ bool GCNPassConfig::addRegAssignAndRewriteOptimized() {
   addPreRewrite();
   addPass(&VirtRegRewriterID);
 
+  addPass(&AMDGPUMarkLastScratchLoadID);
+
   return true;
 }
 

diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 2c92e7a0738855..9a974eaf50d235 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -79,6 +79,7 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPUMCInstLower.cpp
   AMDGPUIGroupLP.cpp
   AMDGPUInsertSingleUseVDST.cpp
+  AMDGPUMarkLastScratchLoad.cpp
   AMDGPUMIRFormatter.cpp
   AMDGPUOpenCLEnqueuedBlockLowering.cpp
   AMDGPUPerfHintAnalysis.cpp

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index aa98a4b860dda9..93fdcefc041368 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -8756,6 +8756,7 @@ SIInstrInfo::getSerializableMachineMemOperandTargetFlags() const {
   static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
       {
           {MONoClobber, "amdgpu-noclobber"},
+          {MOLastUse, "amdgpu-last-use"},
       };
 
   return ArrayRef(TargetFlags);
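
The new table entry is what lets the flag round-trip through MIR: the MIR
printer and parser use these (flag, name) pairs for target-specific memory
operand flags, which is why reload memory operands in the MIR tests below are
printed as ("amdgpu-last-use" load ...). A minimal sketch of the lookup,
assuming "using namespace llvm" (flagName is a hypothetical helper, not part
of this patch):

    // Sketch only: recover the serialized name for a target MMO flag from the
    // table returned by getSerializableMachineMemOperandTargetFlags().
    static StringRef flagName(const SIInstrInfo &TII, MachineMemOperand::Flags F) {
      for (const auto &[Flag, Name] : TII.getSerializableMachineMemOperandTargetFlags())
        if (Flag == F)
          return Name;
      return StringRef();
    }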

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index daef6031dd0748..f3fe0a85e63f9a 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -41,6 +41,10 @@ class ScheduleHazardRecognizer;
 static const MachineMemOperand::Flags MONoClobber =
     MachineMemOperand::MOTargetFlag1;
 
+/// Mark the MMO of a load as the last use.
+static const MachineMemOperand::Flags MOLastUse =
+    MachineMemOperand::MOTargetFlag2;
+
 /// Utility to store machine instructions worklist.
 struct SIInstrWorklist {
   SIInstrWorklist() = default;
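
MOLastUse occupies the second target-specific MachineMemOperand flag bit,
alongside the existing MONoClobber (MOTargetFlag1), so a single memory operand
can carry either or both flags independently. Tagging an existing operand is a
single call, which is exactly what the new pass does; a hypothetical
stand-alone helper (markLastUse is not part of this patch, and "using
namespace llvm" plus the headers above are assumed) would look like:

    // Sketch only: tag a reload's first memory operand as a last use.
    // MachineMemOperand::setFlags ORs the bit in; existing flags are kept.
    static void markLastUse(MachineInstr &MI) {
      if (!MI.memoperands_empty())
        (*MI.memoperands_begin())->setFlags(MOLastUse);
    }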

diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index a93cf5cad411f3..a2cacb5cbaa393 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -1657,8 +1657,12 @@ void SIRegisterInfo::buildSpillLoadStore(
     } else {
       MIB.addReg(SOffset, SOffsetRegState);
     }
-    MIB.addImm(Offset + RegOffset)
-       .addImm(0); // cpol
+
+    MIB.addImm(Offset + RegOffset);
+
+    bool LastUse = MMO->getFlags() & MOLastUse;
+    MIB.addImm(LastUse ? AMDGPU::CPol::TH_LU : 0); // cpol
+
     if (!IsFlat)
       MIB.addImm(0); // swz
     MIB.addMemOperand(NewMMO);
@@ -2241,6 +2245,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
         TII->insertScratchExecCopy(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy(),
                                   RS->isRegUsed(AMDGPU::SCC));
       }
+
       buildSpillLoadStore(
           *MBB, MI, DL, Opc, Index, VData->getReg(), VData->isKill(), FrameReg,
           TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),

diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index 8b0b6263832243..48f00a82e3e1c6 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -359,6 +359,7 @@
 ; GCN-O1-NEXT:        SI Lower WWM Copies
 ; GCN-O1-NEXT:        GCN NSA Reassign
 ; GCN-O1-NEXT:        Virtual Register Rewriter
+; GCN-O1-NEXT:        AMDGPU Mark Last Scratch Load
 ; GCN-O1-NEXT:        Stack Slot Coloring
 ; GCN-O1-NEXT:        Machine Copy Propagation Pass
 ; GCN-O1-NEXT:        Machine Loop Invariant Code Motion
@@ -655,6 +656,7 @@
 ; GCN-O1-OPTS-NEXT:        SI Lower WWM Copies
 ; GCN-O1-OPTS-NEXT:        GCN NSA Reassign
 ; GCN-O1-OPTS-NEXT:        Virtual Register Rewriter
+; GCN-O1-OPTS-NEXT:        AMDGPU Mark Last Scratch Load
 ; GCN-O1-OPTS-NEXT:        Stack Slot Coloring
 ; GCN-O1-OPTS-NEXT:        Machine Copy Propagation Pass
 ; GCN-O1-OPTS-NEXT:        Machine Loop Invariant Code Motion
@@ -957,6 +959,7 @@
 ; GCN-O2-NEXT:        SI Lower WWM Copies
 ; GCN-O2-NEXT:        GCN NSA Reassign
 ; GCN-O2-NEXT:        Virtual Register Rewriter
+; GCN-O2-NEXT:        AMDGPU Mark Last Scratch Load
 ; GCN-O2-NEXT:        Stack Slot Coloring
 ; GCN-O2-NEXT:        Machine Copy Propagation Pass
 ; GCN-O2-NEXT:        Machine Loop Invariant Code Motion
@@ -1271,6 +1274,7 @@
 ; GCN-O3-NEXT:        SI Lower WWM Copies
 ; GCN-O3-NEXT:        GCN NSA Reassign
 ; GCN-O3-NEXT:        Virtual Register Rewriter
+; GCN-O3-NEXT:        AMDGPU Mark Last Scratch Load
 ; GCN-O3-NEXT:        Stack Slot Coloring
 ; GCN-O3-NEXT:        Machine Copy Propagation Pass
 ; GCN-O3-NEXT:        Machine Loop Invariant Code Motion

diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll b/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll
index 7c8507fe4559fb..17a19116735e4e 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll
@@ -25,6 +25,7 @@
 ; DEFAULT-NEXT: SI Lower WWM Copies
 ; DEFAULT-NEXT: GCN NSA Reassign
 ; DEFAULT-NEXT: Virtual Register Rewriter
+; DEFAULT-NEXT: AMDGPU Mark Last Scratch Load
 ; DEFAULT-NEXT: Stack Slot Coloring
 
 ; O0: Fast Register Allocator
@@ -61,6 +62,7 @@
 ; BASIC-DEFAULT-NEXT: SI Lower WWM Copies
 ; BASIC-DEFAULT-NEXT: GCN NSA Reassign
 ; BASIC-DEFAULT-NEXT: Virtual Register Rewriter
+; BASIC-DEFAULT-NEXT: AMDGPU Mark Last Scratch Load
 ; BASIC-DEFAULT-NEXT: Stack Slot Coloring
 
 
@@ -75,6 +77,7 @@
 ; DEFAULT-BASIC-NEXT: SI Lower WWM Copies
 ; DEFAULT-BASIC-NEXT: GCN NSA Reassign
 ; DEFAULT-BASIC-NEXT: Virtual Register Rewriter
+; DEFAULT-BASIC-NEXT: AMDGPU Mark Last Scratch Load
 ; DEFAULT-BASIC-NEXT: Stack Slot Coloring
 
 
@@ -95,6 +98,7 @@
 ; BASIC-BASIC-NEXT: SI Lower WWM Copies
 ; BASIC-BASIC-NEXT: GCN NSA Reassign
 ; BASIC-BASIC-NEXT: Virtual Register Rewriter
+; BASIC-BASIC-NEXT: AMDGPU Mark Last Scratch Load
 ; BASIC-BASIC-NEXT: Stack Slot Coloring
 
 

diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-mark-last-scratch-load.ll b/llvm/test/CodeGen/AMDGPU/vgpr-mark-last-scratch-load.ll
new file mode 100644
index 00000000000000..ab112e606c0a82
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-mark-last-scratch-load.ll
@@ -0,0 +1,281 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -o - %s | FileCheck -check-prefix=CHECK %s
+
+define amdgpu_cs void @max_6_vgprs(ptr addrspace(1) %p) "amdgpu-num-vgpr"="6" {
+; CHECK-LABEL: max_6_vgprs:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    global_load_b32 v2, v[0:1], off th:TH_LOAD_RT_NT
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; CHECK-NEXT:    v_lshlrev_b64_e32 v[2:3], 2, v[2:3]
+; CHECK-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
+; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; CHECK-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; CHECK-NEXT:    global_load_b32 v5, v[0:1], off th:TH_LOAD_RT_NT
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_load_b32 v2, v[0:1], off offset:16 th:TH_LOAD_RT_NT
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    scratch_store_b32 off, v2, off offset:4 ; 4-byte Folded Spill
+; CHECK-NEXT:    global_load_b32 v2, v[0:1], off offset:48 th:TH_LOAD_RT_NT
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    scratch_store_b32 off, v2, off offset:8 ; 4-byte Folded Spill
+; CHECK-NEXT:    global_load_b32 v2, v[0:1], off offset:96 th:TH_LOAD_RT_NT
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    scratch_store_b32 off, v2, off offset:12 ; 4-byte Folded Spill
+; CHECK-NEXT:    global_load_b32 v0, v[0:1], off offset:160 th:TH_LOAD_RT_NT
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    scratch_store_b32 off, v0, off offset:16 ; 4-byte Folded Spill
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    global_store_b32 v[0:1], v5, off th:TH_STORE_NT_RT
+; CHECK-NEXT:    s_waitcnt_vscnt null, 0x0
+; CHECK-NEXT:    scratch_load_b32 v0, off, off offset:4 th:TH_LOAD_LU ; 4-byte Folded Reload
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_b32 v[0:1], v0, off th:TH_STORE_NT_RT
+; CHECK-NEXT:    s_waitcnt_vscnt null, 0x0
+; CHECK-NEXT:    scratch_load_b32 v0, off, off offset:8 th:TH_LOAD_LU ; 4-byte Folded Reload
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_b32 v[0:1], v0, off th:TH_STORE_NT_RT
+; CHECK-NEXT:    s_waitcnt_vscnt null, 0x0
+; CHECK-NEXT:    scratch_load_b32 v0, off, off offset:12 th:TH_LOAD_LU ; 4-byte Folded Reload
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_b32 v[0:1], v0, off th:TH_STORE_NT_RT
+; CHECK-NEXT:    s_waitcnt_vscnt null, 0x0
+; CHECK-NEXT:    scratch_load_b32 v0, off, off offset:16 th:TH_LOAD_LU ; 4-byte Folded Reload
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_b32 v[0:1], v0, off th:TH_STORE_NT_RT
+; CHECK-NEXT:    s_waitcnt_vscnt null, 0x0
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; CHECK-NEXT:    s_endpgm
+  %tid = load volatile i32, ptr addrspace(1) undef
+  %p1 = getelementptr inbounds i32, ptr addrspace(1) %p, i32 %tid
+  %p2 = getelementptr inbounds i32, ptr addrspace(1) %p1, i32 4
+  %p3 = getelementptr inbounds i32, ptr addrspace(1) %p2, i32 8
+  %p4 = getelementptr inbounds i32, ptr addrspace(1) %p3, i32 12
+  %p5 = getelementptr inbounds i32, ptr addrspace(1) %p4, i32 16
+  %v1 = load volatile i32, ptr addrspace(1) %p1
+  %v2 = load volatile i32, ptr addrspace(1) %p2
+  %v3 = load volatile i32, ptr addrspace(1) %p3
+  %v4 = load volatile i32, ptr addrspace(1) %p4
+  %v5 = load volatile i32, ptr addrspace(1) %p5
+  call void asm sideeffect "", "~{v[0:4]}" ()
+  store volatile i32 %v1, ptr addrspace(1) undef
+  store volatile i32 %v2, ptr addrspace(1) undef
+  store volatile i32 %v3, ptr addrspace(1) undef
+  store volatile i32 %v4, ptr addrspace(1) undef
+  store volatile i32 %v5, ptr addrspace(1) undef
+  ret void
+}
+
+define amdgpu_cs void @max_11_vgprs_branch(ptr addrspace(1) %p, i32 %tmp) "amdgpu-num-vgpr"="11" {
+; CHECK-LABEL: max_11_vgprs_branch:
+; CHECK:       ; %bb.0: ; %.entry
+; CHECK-NEXT:    global_load_b32 v3, v[0:1], off th:TH_LOAD_RT_NT
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    s_mov_b32 s0, exec_lo
+; CHECK-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
+; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; CHECK-NEXT:    v_lshlrev_b64_e32 v[3:4], 2, v[3:4]
+; CHECK-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v3
+; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; CHECK-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo
+; CHECK-NEXT:    global_load_b32 v3, v[0:1], off offset:336 th:TH_LOAD_RT_NT
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    scratch_store_b32 off, v3, off offset:12 ; 4-byte Folded Spill
+; CHECK-NEXT:    global_load_b32 v3, v[0:1], off offset:448 th:TH_LOAD_RT_NT
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    scratch_store_b32 off, v3, off offset:16 ; 4-byte Folded Spill
+; CHECK-NEXT:    global_load_b32 v3, v[0:1], off offset:576 th:TH_LOAD_RT_NT
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    scratch_store_b32 off, v3, off offset:4 ; 4-byte Folded Spill
+; CHECK-NEXT:    global_load_b32 v3, v[0:1], off offset:720 th:TH_LOAD_RT_NT
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    scratch_store_b32 off, v3, off offset:8 ; 4-byte Folded Spill
+; CHECK-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; CHECK-NEXT:    s_xor_b32 s0, exec_lo, s0
+; CHECK-NEXT:    s_cbranch_execz .LBB1_2
+; CHECK-NEXT:  ; %bb.1: ; %.false
+; CHECK-NEXT:    global_load_b32 v10, v[0:1], off th:TH_LOAD_RT_NT
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_load_b32 v2, v[0:1], off offset:16 th:TH_LOAD_RT_NT
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    scratch_store_b32 off, v2, off offset:20 ; 4-byte Folded Spill
+; CHECK-NEXT:    global_load_b32 v2, v[0:1], off offset:48 th:TH_LOAD_RT_NT
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    scratch_store_b32 off, v2, off offset:24 ; 4-byte Folded Spill
+; CHECK-NEXT:    global_load_b32 v2, v[0:1], off offset:96 th:TH_LOAD_RT_NT
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    scratch_store_b32 off, v2, off offset:28 ; 4-byte Folded Spill
+; CHECK-NEXT:    global_load_b32 v2, v[0:1], off offset:160 th:TH_LOAD_RT_NT
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    scratch_store_b32 off, v2, off offset:32 ; 4-byte Folded Spill
+; CHECK-NEXT:    global_load_b32 v0, v[0:1], off offset:240 th:TH_LOAD_RT_NT
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    scratch_store_b32 off, v0, off offset:36 ; 4-byte Folded Spill
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    global_store_b32 v[0:1], v10, off th:TH_STORE_NT_RT
+; CHECK-NEXT:    s_waitcnt_vscnt null, 0x0
+; CHECK-NEXT:    scratch_load_b32 v0, off, off offset:20 th:TH_LOAD_LU ; 4-byte Folded Reload
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_b32 v[0:1], v0, off th:TH_STORE_NT_RT
+; CHECK-NEXT:    s_waitcnt_vscnt null, 0x0
+; CHECK-NEXT:    scratch_load_b32 v0, off, off offset:24 th:TH_LOAD_LU ; 4-byte Folded Reload
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_b32 v[0:1], v0, off th:TH_STORE_NT_RT
+; CHECK-NEXT:    s_waitcnt_vscnt null, 0x0
+; CHECK-NEXT:    scratch_load_b32 v0, off, off offset:28 th:TH_LOAD_LU ; 4-byte Folded Reload
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_b32 v[0:1], v0, off th:TH_STORE_NT_RT
+; CHECK-NEXT:    s_waitcnt_vscnt null, 0x0
+; CHECK-NEXT:    scratch_load_b32 v0, off, off offset:32 th:TH_LOAD_LU ; 4-byte Folded Reload
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_b32 v[0:1], v0, off th:TH_STORE_NT_RT
+; CHECK-NEXT:    s_waitcnt_vscnt null, 0x0
+; CHECK-NEXT:    scratch_load_b32 v0, off, off offset:36 th:TH_LOAD_LU ; 4-byte Folded Reload
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_b32 v[0:1], v0, off th:TH_STORE_NT_RT
+; CHECK-NEXT:    s_waitcnt_vscnt null, 0x0
+; CHECK-NEXT:    scratch_load_b32 v0, off, off offset:12 th:TH_LOAD_LU ; 4-byte Folded Reload
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_b32 v[0:1], v0, off th:TH_STORE_NT_RT
+; CHECK-NEXT:    s_waitcnt_vscnt null, 0x0
+; CHECK-NEXT:    scratch_load_b32 v0, off, off offset:16 th:TH_LOAD_LU ; 4-byte Folded Reload
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_b32 v[0:1], v0, off th:TH_STORE_NT_RT
+; CHECK-NEXT:    s_waitcnt_vscnt null, 0x0
+; CHECK-NEXT:    ; implicit-def: $vgpr0
+; CHECK-NEXT:    ; kill: killed $vgpr0
+; CHECK-NEXT:    ; implicit-def: $vgpr0
+; CHECK-NEXT:    ; kill: killed $vgpr0
+; CHECK-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; CHECK-NEXT:  .LBB1_2: ; %Flow
+; CHECK-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; CHECK-NEXT:    s_cbranch_execz .LBB1_4
+; CHECK-NEXT:  ; %bb.3: ; %.true
+; CHECK-NEXT:    global_load_b32 v10, v[0:1], off th:TH_LOAD_RT_NT
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_load_b32 v2, v[0:1], off offset:16 th:TH_LOAD_RT_NT
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    scratch_store_b32 off, v2, off offset:20 ; 4-byte Folded Spill
+; CHECK-NEXT:    global_load_b32 v2, v[0:1], off offset:48 th:TH_LOAD_RT_NT
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    scratch_store_b32 off, v2, off offset:24 ; 4-byte Folded Spill
+; CHECK-NEXT:    global_load_b32 v2, v[0:1], off offset:96 th:TH_LOAD_RT_NT
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    scratch_store_b32 off, v2, off offset:28 ; 4-byte Folded Spill
+; CHECK-NEXT:    global_load_b32 v2, v[0:1], off offset:160 th:TH_LOAD_RT_NT
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    scratch_store_b32 off, v2, off offset:32 ; 4-byte Folded Spill
+; CHECK-NEXT:    global_load_b32 v0, v[0:1], off offset:240 th:TH_LOAD_RT_NT
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    scratch_store_b32 off, v0, off offset:36 ; 4-byte Folded Spill
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    global_store_b32 v[0:1], v10, off th:TH_STORE_NT_RT
+; CHECK-NEXT:    s_waitcnt_vscnt null, 0x0
+; CHECK-NEXT:    scratch_load_b32 v0, off, off offset:20 th:TH_LOAD_LU ; 4-byte Folded Reload
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_b32 v[0:1], v0, off th:TH_STORE_NT_RT
+; CHECK-NEXT:    s_waitcnt_vscnt null, 0x0
+; CHECK-NEXT:    scratch_load_b32 v0, off, off offset:24 th:TH_LOAD_LU ; 4-byte Folded Reload
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_b32 v[0:1], v0, off th:TH_STORE_NT_RT
+; CHECK-NEXT:    s_waitcnt_vscnt null, 0x0
+; CHECK-NEXT:    scratch_load_b32 v0, off, off offset:28 th:TH_LOAD_LU ; 4-byte Folded Reload
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_b32 v[0:1], v0, off th:TH_STORE_NT_RT
+; CHECK-NEXT:    s_waitcnt_vscnt null, 0x0
+; CHECK-NEXT:    scratch_load_b32 v0, off, off offset:32 th:TH_LOAD_LU ; 4-byte Folded Reload
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_b32 v[0:1], v0, off th:TH_STORE_NT_RT
+; CHECK-NEXT:    s_waitcnt_vscnt null, 0x0
+; CHECK-NEXT:    scratch_load_b32 v0, off, off offset:36 th:TH_LOAD_LU ; 4-byte Folded Reload
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_b32 v[0:1], v0, off th:TH_STORE_NT_RT
+; CHECK-NEXT:    s_waitcnt_vscnt null, 0x0
+; CHECK-NEXT:    scratch_load_b32 v0, off, off offset:12 th:TH_LOAD_LU ; 4-byte Folded Reload
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_b32 v[0:1], v0, off th:TH_STORE_NT_RT
+; CHECK-NEXT:    s_waitcnt_vscnt null, 0x0
+; CHECK-NEXT:    scratch_load_b32 v0, off, off offset:16 th:TH_LOAD_LU ; 4-byte Folded Reload
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_b32 v[0:1], v0, off th:TH_STORE_NT_RT
+; CHECK-NEXT:    s_waitcnt_vscnt null, 0x0
+; CHECK-NEXT:  .LBB1_4: ; %.exit
+; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; CHECK-NEXT:    scratch_load_b32 v0, off, off offset:4 th:TH_LOAD_LU ; 4-byte Folded Reload
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_b32 v[0:1], v0, off th:TH_STORE_NT_RT
+; CHECK-NEXT:    s_waitcnt_vscnt null, 0x0
+; CHECK-NEXT:    scratch_load_b32 v0, off, off offset:8 th:TH_LOAD_LU ; 4-byte Folded Reload
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_b32 v[0:1], v0, off th:TH_STORE_NT_RT
+; CHECK-NEXT:    s_waitcnt_vscnt null, 0x0
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; CHECK-NEXT:    s_endpgm
+.entry:
+  %tid = load volatile i32, ptr addrspace(1) undef
+  %p1 = getelementptr inbounds i32, ptr addrspace(1) %p, i32 %tid
+  %p2 = getelementptr inbounds i32, ptr addrspace(1) %p1, i32 4
+  %p3 = getelementptr inbounds i32, ptr addrspace(1) %p2, i32 8
+  %p4 = getelementptr inbounds i32, ptr addrspace(1) %p3, i32 12
+  %p5 = getelementptr inbounds i32, ptr addrspace(1) %p4, i32 16
+  %p6 = getelementptr inbounds i32, ptr addrspace(1) %p5, i32 20
+  %p7 = getelementptr inbounds i32, ptr addrspace(1) %p6, i32 24
+  %p8 = getelementptr inbounds i32, ptr addrspace(1) %p7, i32 28
+  %p9 = getelementptr inbounds i32, ptr addrspace(1) %p8, i32 32
+  %p10 = getelementptr inbounds i32, ptr addrspace(1) %p9, i32 36
+  %v7 = load volatile i32, ptr addrspace(1) %p7
+  %v8 = load volatile i32, ptr addrspace(1) %p8
+  %v9 = load volatile i32, ptr addrspace(1) %p9
+  %v10 = load volatile i32, ptr addrspace(1) %p10
+  %cmp = icmp ne i32 %tmp, 0
+  br i1 %cmp, label %.true, label %.false
+
+.true:
+  %v1_t = load volatile i32, ptr addrspace(1) %p1
+  %v2_t = load volatile i32, ptr addrspace(1) %p2
+  %v3_t = load volatile i32, ptr addrspace(1) %p3
+  %v4_t = load volatile i32, ptr addrspace(1) %p4
+  %v5_t = load volatile i32, ptr addrspace(1) %p5
+  %v6_t = load volatile i32, ptr addrspace(1) %p6
+  call void asm sideeffect "", "~{v[0:9]}" ()
+  store volatile i32 %v1_t, ptr addrspace(1) undef
+  store volatile i32 %v2_t, ptr addrspace(1) undef
+  store volatile i32 %v3_t, ptr addrspace(1) undef
+  store volatile i32 %v4_t, ptr addrspace(1) undef
+  store volatile i32 %v5_t, ptr addrspace(1) undef
+  store volatile i32 %v6_t, ptr addrspace(1) undef
+  store volatile i32 %v7, ptr addrspace(1) undef
+  store volatile i32 %v8, ptr addrspace(1) undef
+
+  br label %.exit
+
+.false:
+  %v1_f = load volatile i32, ptr addrspace(1) %p1
+  %v2_f = load volatile i32, ptr addrspace(1) %p2
+  %v3_f = load volatile i32, ptr addrspace(1) %p3
+  %v4_f = load volatile i32, ptr addrspace(1) %p4
+  %v5_f = load volatile i32, ptr addrspace(1) %p5
+  %v6_f = load volatile i32, ptr addrspace(1) %p6
+  call void asm sideeffect "", "~{v[0:9]}" ()
+  store volatile i32 %v1_f, ptr addrspace(1) undef
+  store volatile i32 %v2_f, ptr addrspace(1) undef
+  store volatile i32 %v3_f, ptr addrspace(1) undef
+  store volatile i32 %v4_f, ptr addrspace(1) undef
+  store volatile i32 %v5_f, ptr addrspace(1) undef
+  store volatile i32 %v6_f, ptr addrspace(1) undef
+  store volatile i32 %v7, ptr addrspace(1) undef
+  store volatile i32 %v8, ptr addrspace(1) undef
+
+  br label %.exit
+
+.exit:
+  store volatile i32 %v9, ptr addrspace(1) undef
+  store volatile i32 %v10, ptr addrspace(1) undef
+  ret void
+}

diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-mark-last-scratch-load.mir b/llvm/test/CodeGen/AMDGPU/vgpr-mark-last-scratch-load.mir
new file mode 100644
index 00000000000000..ab80b7d20984be
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-mark-last-scratch-load.mir
@@ -0,0 +1,303 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -o - %s -run-pass=greedy -run-pass=amdgpu-mark-last-scratch-load | FileCheck -check-prefix=CHECK %s
+
+--- |
+  define amdgpu_cs void @test_spill_12x32() "amdgpu-num-vgpr"="12" {
+    ret void
+  }
+  define amdgpu_cs void @test_spill_384() "amdgpu-num-vgpr"="12" {
+    ret void
+  }
+  define amdgpu_ps void @test_loop_12() "amdgpu-num-vgpr"="12" {
+    ret void
+  }
+...
+---
+name: test_spill_12x32
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11
+
+    ; CHECK-LABEL: name: test_spill_12x32
+    ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: SI_SPILL_V32_SAVE $vgpr0, %stack.0, $sp_reg, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5)
+    ; CHECK-NEXT: SI_SPILL_V32_SAVE $vgpr1, %stack.1, $sp_reg, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5)
+    ; CHECK-NEXT: SI_SPILL_V32_SAVE $vgpr2, %stack.2, $sp_reg, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5)
+    ; CHECK-NEXT: SI_SPILL_V32_SAVE $vgpr3, %stack.3, $sp_reg, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5)
+    ; CHECK-NEXT: SI_SPILL_V32_SAVE $vgpr4, %stack.4, $sp_reg, 0, implicit $exec :: (store (s32) into %stack.4, addrspace 5)
+    ; CHECK-NEXT: SI_SPILL_V32_SAVE $vgpr5, %stack.5, $sp_reg, 0, implicit $exec :: (store (s32) into %stack.5, addrspace 5)
+    ; CHECK-NEXT: SI_SPILL_V32_SAVE $vgpr6, %stack.6, $sp_reg, 0, implicit $exec :: (store (s32) into %stack.6, addrspace 5)
+    ; CHECK-NEXT: SI_SPILL_V32_SAVE $vgpr7, %stack.7, $sp_reg, 0, implicit $exec :: (store (s32) into %stack.7, addrspace 5)
+    ; CHECK-NEXT: SI_SPILL_V32_SAVE $vgpr8, %stack.8, $sp_reg, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5)
+    ; CHECK-NEXT: SI_SPILL_V32_SAVE $vgpr9, %stack.9, $sp_reg, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
+    ; CHECK-NEXT: SI_SPILL_V32_SAVE $vgpr10, %stack.10, $sp_reg, 0, implicit $exec :: (store (s32) into %stack.10, addrspace 5)
+    ; CHECK-NEXT: SI_SPILL_V32_SAVE $vgpr11, %stack.11, $sp_reg, 0, implicit $exec :: (store (s32) into %stack.11, addrspace 5)
+    ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 12 /* clobber */, implicit-def early-clobber $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11
+    ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.0, $sp_reg, 0, implicit $exec :: ("amdgpu-last-use" load (s32) from %stack.0, addrspace 5)
+    ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE1:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.1, $sp_reg, 0, implicit $exec :: ("amdgpu-last-use" load (s32) from %stack.1, addrspace 5)
+    ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE2:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.2, $sp_reg, 0, implicit $exec :: ("amdgpu-last-use" load (s32) from %stack.2, addrspace 5)
+    ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE3:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.3, $sp_reg, 0, implicit $exec :: ("amdgpu-last-use" load (s32) from %stack.3, addrspace 5)
+    ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE4:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.4, $sp_reg, 0, implicit $exec :: ("amdgpu-last-use" load (s32) from %stack.4, addrspace 5)
+    ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE5:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.5, $sp_reg, 0, implicit $exec :: ("amdgpu-last-use" load (s32) from %stack.5, addrspace 5)
+    ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE6:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.6, $sp_reg, 0, implicit $exec :: ("amdgpu-last-use" load (s32) from %stack.6, addrspace 5)
+    ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE7:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.7, $sp_reg, 0, implicit $exec :: ("amdgpu-last-use" load (s32) from %stack.7, addrspace 5)
+    ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE8:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.8, $sp_reg, 0, implicit $exec :: ("amdgpu-last-use" load (s32) from %stack.8, addrspace 5)
+    ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE9:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.9, $sp_reg, 0, implicit $exec :: ("amdgpu-last-use" load (s32) from %stack.9, addrspace 5)
+    ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE10:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.10, $sp_reg, 0, implicit $exec :: ("amdgpu-last-use" load (s32) from %stack.10, addrspace 5)
+    ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE11:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.11, $sp_reg, 0, implicit $exec :: ("amdgpu-last-use" load (s32) from %stack.11, addrspace 5)
+    ; CHECK-NEXT: S_ENDPGM 0, implicit [[SI_SPILL_V32_RESTORE]], implicit [[SI_SPILL_V32_RESTORE1]], implicit [[SI_SPILL_V32_RESTORE2]], implicit [[SI_SPILL_V32_RESTORE3]], implicit [[SI_SPILL_V32_RESTORE4]], implicit [[SI_SPILL_V32_RESTORE5]], implicit [[SI_SPILL_V32_RESTORE6]], implicit [[SI_SPILL_V32_RESTORE7]], implicit [[SI_SPILL_V32_RESTORE8]], implicit [[SI_SPILL_V32_RESTORE9]], implicit [[SI_SPILL_V32_RESTORE10]], implicit [[SI_SPILL_V32_RESTORE11]]
+    %0:vgpr_32 = COPY $vgpr0
+    %1:vgpr_32 = COPY $vgpr1
+    %2:vgpr_32 = COPY $vgpr2
+    %3:vgpr_32 = COPY $vgpr3
+    %4:vgpr_32 = COPY $vgpr4
+    %5:vgpr_32 = COPY $vgpr5
+    %6:vgpr_32 = COPY $vgpr6
+    %7:vgpr_32 = COPY $vgpr7
+    %8:vgpr_32 = COPY $vgpr8
+    %9:vgpr_32 = COPY $vgpr9
+    %10:vgpr_32 = COPY $vgpr10
+    %11:vgpr_32 = COPY $vgpr11
+    INLINEASM &"", 1 /* sideeffect attdialect */, 12 /* clobber */, implicit-def early-clobber $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11
+    S_ENDPGM 0, implicit %0, implicit %1, implicit %2, implicit %3, implicit %4, implicit %5, implicit %6, implicit %7, implicit %8, implicit %9, implicit %10, implicit %11
+...
+
+---
+name: test_spill_384
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11
+
+    ; CHECK-LABEL: name: test_spill_384
+    ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: SI_SPILL_V384_SAVE $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, %stack.0, $sp_reg, 0, implicit $exec :: (store (s384) into %stack.0, align 4, addrspace 5)
+    ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 12 /* clobber */, implicit-def early-clobber $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11
+    ; CHECK-NEXT: [[SI_SPILL_V384_RESTORE:%[0-9]+]]:vreg_384 = SI_SPILL_V384_RESTORE %stack.0, $sp_reg, 0, implicit $exec :: ("amdgpu-last-use" load (s384) from %stack.0, align 4, addrspace 5)
+    ; CHECK-NEXT: S_ENDPGM 0, implicit [[SI_SPILL_V384_RESTORE]]
+    %0:vreg_384 = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11
+    INLINEASM &"", 1 /* sideeffect attdialect */, 12 /* clobber */, implicit-def early-clobber $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11
+    S_ENDPGM 0, implicit %0
+...
+
+---
+name: test_loop_12
+tracksRegLiveness: true
+body:             |
+  ; CHECK-LABEL: name: test_loop_12
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   SI_SPILL_V32_SAVE $vgpr12, %stack.0, $sp_reg, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5)
+  ; CHECK-NEXT:   SI_SPILL_V32_SAVE $vgpr11, %stack.1, $sp_reg, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5)
+  ; CHECK-NEXT:   SI_SPILL_V32_SAVE $vgpr10, %stack.2, $sp_reg, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5)
+  ; CHECK-NEXT:   SI_SPILL_V32_SAVE $vgpr9, %stack.3, $sp_reg, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5)
+  ; CHECK-NEXT:   SI_SPILL_V32_SAVE $vgpr8, %stack.4, $sp_reg, 0, implicit $exec :: (store (s32) into %stack.4, addrspace 5)
+  ; CHECK-NEXT:   SI_SPILL_V32_SAVE $vgpr7, %stack.5, $sp_reg, 0, implicit $exec :: (store (s32) into %stack.5, addrspace 5)
+  ; CHECK-NEXT:   SI_SPILL_V32_SAVE $vgpr6, %stack.6, $sp_reg, 0, implicit $exec :: (store (s32) into %stack.6, addrspace 5)
+  ; CHECK-NEXT:   SI_SPILL_V32_SAVE $vgpr5, %stack.7, $sp_reg, 0, implicit $exec :: (store (s32) into %stack.7, addrspace 5)
+  ; CHECK-NEXT:   SI_SPILL_V32_SAVE $vgpr4, %stack.8, $sp_reg, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5)
+  ; CHECK-NEXT:   SI_SPILL_V32_SAVE $vgpr3, %stack.9, $sp_reg, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
+  ; CHECK-NEXT:   SI_SPILL_V32_SAVE $vgpr2, %stack.10, $sp_reg, 0, implicit $exec :: (store (s32) into %stack.10, addrspace 5)
+  ; CHECK-NEXT:   SI_SPILL_V32_SAVE $vgpr1, %stack.11, $sp_reg, 0, implicit $exec :: (store (s32) into %stack.11, addrspace 5)
+  ; CHECK-NEXT:   SI_SPILL_V32_SAVE $vgpr0, %stack.12, $sp_reg, 0, implicit $exec :: (store (s32) into %stack.12, addrspace 5)
+  ; CHECK-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; CHECK-NEXT:   [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; CHECK-NEXT:   [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; CHECK-NEXT:   [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; CHECK-NEXT:   SI_SPILL_V32_SAVE [[V_MOV_B32_e32_3]], %stack.16, $sp_reg, 0, implicit $exec :: (store (s32) into %stack.16, addrspace 5)
+  ; CHECK-NEXT:   %res5:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; CHECK-NEXT:   %res6:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; CHECK-NEXT:   %res7:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; CHECK-NEXT:   %res8:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; CHECK-NEXT:   %res9:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; CHECK-NEXT:   %res10:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; CHECK-NEXT:   %res11:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; CHECK-NEXT:   %res12:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; CHECK-NEXT:   [[SI_SPILL_V32_RESTORE:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.12, $sp_reg, 0, implicit $exec :: (load (s32) from %stack.12, addrspace 5)
+  ; CHECK-NEXT:   %vcmp:sreg_32 = V_CMP_LT_I32_e64 0, [[SI_SPILL_V32_RESTORE]], implicit $exec
+  ; CHECK-NEXT:   %mask:sreg_32 = COPY $exec_lo, implicit-def $exec_lo
+  ; CHECK-NEXT:   %sand:sreg_32 = S_AND_B32 %mask, %vcmp, implicit-def dead $scc
+  ; CHECK-NEXT:   $exec_lo = S_MOV_B32_term %sand
+  ; CHECK-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   %mask2:sgpr_32 = S_MOV_B32 0
+  ; CHECK-NEXT:   %count:sgpr_32 = S_MOV_B32 0
+  ; CHECK-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; CHECK-NEXT:   [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; CHECK-NEXT:   [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; CHECK-NEXT:   [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; CHECK-NEXT:   %res5:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; CHECK-NEXT:   %res6:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; CHECK-NEXT:   %res7:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; CHECK-NEXT:   %res8:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; CHECK-NEXT:   %res9:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; CHECK-NEXT:   %res10:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; CHECK-NEXT:   %res11:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; CHECK-NEXT:   %res12:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.5(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $exec_lo = S_OR_B32 $exec_lo, %mask, implicit-def $scc
+  ; CHECK-NEXT:   S_BRANCH %bb.5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   successors: %bb.4(0x04000000), %bb.3(0x7c000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   SI_SPILL_V32_SAVE [[V_MOV_B32_e32_1]], %stack.14, $sp_reg, 0, implicit $exec :: (store (s32) into %stack.14, addrspace 5)
+  ; CHECK-NEXT:   [[SI_SPILL_V32_RESTORE1:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.11, $sp_reg, 0, implicit $exec :: (load (s32) from %stack.11, addrspace 5)
+  ; CHECK-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_F32_e64 0, [[V_MOV_B32_e32_]], 0, [[SI_SPILL_V32_RESTORE1]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.14, $sp_reg, 0, implicit $exec :: ("amdgpu-last-use" load (s32) from %stack.14, addrspace 5)
+  ; CHECK-NEXT:   SI_SPILL_V32_SAVE [[V_MOV_B32_e32_]], %stack.13, $sp_reg, 0, implicit $exec :: (store (s32) into %stack.13, addrspace 5)
+  ; CHECK-NEXT:   [[SI_SPILL_V32_RESTORE2:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.10, $sp_reg, 0, implicit $exec :: (load (s32) from %stack.10, addrspace 5)
+  ; CHECK-NEXT:   [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_F32_e64 0, [[V_MOV_B32_e32_1]], 0, [[SI_SPILL_V32_RESTORE2]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[SI_SPILL_V32_RESTORE3:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.9, $sp_reg, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
+  ; CHECK-NEXT:   [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_F32_e64 0, [[V_MOV_B32_e32_2]], 0, [[SI_SPILL_V32_RESTORE3]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   SI_SPILL_V32_SAVE [[V_MOV_B32_e32_2]], %stack.15, $sp_reg, 0, implicit $exec :: (store (s32) into %stack.15, addrspace 5)
+  ; CHECK-NEXT:   [[SI_SPILL_V32_RESTORE4:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.8, $sp_reg, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5)
+  ; CHECK-NEXT:   [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_ADD_F32_e64 0, [[V_MOV_B32_e32_4]], 0, [[SI_SPILL_V32_RESTORE4]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[SI_SPILL_V32_RESTORE5:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.7, $sp_reg, 0, implicit $exec :: (load (s32) from %stack.7, addrspace 5)
+  ; CHECK-NEXT:   %res5:vgpr_32 = V_ADD_F32_e64 0, %res5, 0, [[SI_SPILL_V32_RESTORE5]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[SI_SPILL_V32_RESTORE6:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.6, $sp_reg, 0, implicit $exec :: (load (s32) from %stack.6, addrspace 5)
+  ; CHECK-NEXT:   %res6:vgpr_32 = V_ADD_F32_e64 0, %res6, 0, [[SI_SPILL_V32_RESTORE6]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[SI_SPILL_V32_RESTORE7:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.5, $sp_reg, 0, implicit $exec :: (load (s32) from %stack.5, addrspace 5)
+  ; CHECK-NEXT:   %res7:vgpr_32 = V_ADD_F32_e64 0, %res7, 0, [[SI_SPILL_V32_RESTORE7]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[SI_SPILL_V32_RESTORE8:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.4, $sp_reg, 0, implicit $exec :: (load (s32) from %stack.4, addrspace 5)
+  ; CHECK-NEXT:   %res8:vgpr_32 = V_ADD_F32_e64 0, %res8, 0, [[SI_SPILL_V32_RESTORE8]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[SI_SPILL_V32_RESTORE9:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.3, $sp_reg, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5)
+  ; CHECK-NEXT:   %res9:vgpr_32 = V_ADD_F32_e64 0, %res9, 0, [[SI_SPILL_V32_RESTORE9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[SI_SPILL_V32_RESTORE10:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.2, $sp_reg, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5)
+  ; CHECK-NEXT:   %res10:vgpr_32 = V_ADD_F32_e64 0, %res10, 0, [[SI_SPILL_V32_RESTORE10]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[SI_SPILL_V32_RESTORE11:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.1, $sp_reg, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5)
+  ; CHECK-NEXT:   %res11:vgpr_32 = V_ADD_F32_e64 0, %res11, 0, [[SI_SPILL_V32_RESTORE11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[SI_SPILL_V32_RESTORE12:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.0, $sp_reg, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5)
+  ; CHECK-NEXT:   %res12:vgpr_32 = V_ADD_F32_e64 0, %res12, 0, [[SI_SPILL_V32_RESTORE12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.13, $sp_reg, 0, implicit $exec :: (load (s32) from %stack.13, addrspace 5)
+  ; CHECK-NEXT:   %count:sgpr_32 = nuw nsw S_ADD_I32 %count, 1, implicit-def dead $scc
+  ; CHECK-NEXT:   [[SI_SPILL_V32_RESTORE13:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.12, $sp_reg, 0, implicit $exec :: (load (s32) from %stack.12, addrspace 5)
+  ; CHECK-NEXT:   %vcmp2:sreg_32 = V_CMP_GE_I32_e64 %count, [[SI_SPILL_V32_RESTORE13]], implicit $exec
+  ; CHECK-NEXT:   [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.15, $sp_reg, 0, implicit $exec :: (load (s32) from %stack.15, addrspace 5)
+  ; CHECK-NEXT:   %mask2:sgpr_32 = S_OR_B32 %vcmp2, %mask2, implicit-def $scc
+  ; CHECK-NEXT:   $exec_lo = S_ANDN2_B32_term $exec_lo, %mask2, implicit-def $scc
+  ; CHECK-NEXT:   S_CBRANCH_EXECNZ %bb.3, implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.4
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $exec_lo = S_OR_B32 $exec_lo, %mask2, implicit-def $scc
+  ; CHECK-NEXT:   SI_SPILL_V32_SAVE [[V_MOV_B32_e32_4]], %stack.16, $sp_reg, 0, implicit $exec :: (store (s32) into %stack.16, addrspace 5)
+  ; CHECK-NEXT:   S_BRANCH %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.5:
+  ; CHECK-NEXT:   [[SI_SPILL_V32_RESTORE14:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.16, $sp_reg, 0, implicit $exec :: ("amdgpu-last-use" load (s32) from %stack.16, addrspace 5)
+  ; CHECK-NEXT:   EXP_DONE 0, [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], [[V_MOV_B32_e32_2]], [[SI_SPILL_V32_RESTORE14]], -1, 0, 15, implicit $exec
+  ; CHECK-NEXT:   EXP_DONE 0, %res5, %res6, %res7, %res8, -1, 0, 15, implicit $exec
+  ; CHECK-NEXT:   EXP_DONE 0, %res9, %res10, %res11, %res12, -1, 0, 15, implicit $exec
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0: ; entry
+    successors: %bb.1(0x40000000), %bb.2(0x40000000)
+    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12
+
+    %12:vgpr_32 = COPY $vgpr12
+    %11:vgpr_32 = COPY $vgpr11
+    %10:vgpr_32 = COPY $vgpr10
+    %9:vgpr_32 = COPY $vgpr9
+    %8:vgpr_32 = COPY $vgpr8
+    %7:vgpr_32 = COPY $vgpr7
+    %6:vgpr_32 = COPY $vgpr6
+    %5:vgpr_32 = COPY $vgpr5
+    %4:vgpr_32 = COPY $vgpr4
+    %3:vgpr_32 = COPY $vgpr3
+    %2:vgpr_32 = COPY $vgpr2
+    %1:vgpr_32 = COPY $vgpr1
+    %loop_end:vgpr_32 = COPY $vgpr0
+    %res1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %res2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %res3:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %res4:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %res5:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %res6:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %res7:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %res8:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %res9:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %res10:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %res11:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %res12:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %vcmp:sreg_32 = V_CMP_LT_I32_e64 0, %loop_end, implicit $exec
+    %mask:sreg_32 = COPY $exec_lo, implicit-def $exec_lo
+    %sand:sreg_32 = S_AND_B32 %mask, %vcmp, implicit-def dead $scc
+    $exec_lo = S_MOV_B32_term %sand
+    S_CBRANCH_EXECZ %bb.2, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.1: ; loop preheader
+    successors: %bb.3(0x80000000)
+
+    %mask2:sgpr_32 = S_MOV_B32 0
+    %count:sgpr_32 = S_MOV_B32 0
+    %res1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %res2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %res3:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %res4:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %res5:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %res6:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %res7:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %res8:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %res9:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %res10:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %res11:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %res12:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    S_BRANCH %bb.3
+
+  bb.2: ; flow
+    successors: %bb.5(0x80000000)
+
+    $exec_lo = S_OR_B32 $exec_lo, %mask, implicit-def $scc
+    S_BRANCH %bb.5
+
+  bb.3: ; loop
+    successors: %bb.4(0x04000000), %bb.3(0x7c000000)
+
+    %res1:vgpr_32 = V_ADD_F32_e64 0, %res1, 0, %1, 0, 0, implicit $mode, implicit $exec
+    %res2:vgpr_32 = V_ADD_F32_e64 0, %res2, 0, %2, 0, 0, implicit $mode, implicit $exec
+    %res3:vgpr_32 = V_ADD_F32_e64 0, %res3, 0, %3, 0, 0, implicit $mode, implicit $exec
+    %res4:vgpr_32 = V_ADD_F32_e64 0, %res4, 0, %4, 0, 0, implicit $mode, implicit $exec
+    %res5:vgpr_32 = V_ADD_F32_e64 0, %res5, 0, %5, 0, 0, implicit $mode, implicit $exec
+    %res6:vgpr_32 = V_ADD_F32_e64 0, %res6, 0, %6, 0, 0, implicit $mode, implicit $exec
+    %res7:vgpr_32 = V_ADD_F32_e64 0, %res7, 0, %7, 0, 0, implicit $mode, implicit $exec
+    %res8:vgpr_32 = V_ADD_F32_e64 0, %res8, 0, %8, 0, 0, implicit $mode, implicit $exec
+    %res9:vgpr_32 = V_ADD_F32_e64 0, %res9, 0, %9, 0, 0, implicit $mode, implicit $exec
+    %res10:vgpr_32 = V_ADD_F32_e64 0, %res10, 0, %10, 0, 0, implicit $mode, implicit $exec
+    %res11:vgpr_32 = V_ADD_F32_e64 0, %res11, 0, %11, 0, 0, implicit $mode, implicit $exec
+    %res12:vgpr_32 = V_ADD_F32_e64 0, %res12, 0, %12, 0, 0, implicit $mode, implicit $exec
+    %count:sgpr_32 = nuw nsw S_ADD_I32 %count, 1, implicit-def dead $scc
+    %vcmp2:sreg_32 = V_CMP_GE_I32_e64 %count, %loop_end, implicit $exec
+    %mask2:sgpr_32 = S_OR_B32 %vcmp2, %mask2, implicit-def $scc
+    $exec_lo = S_ANDN2_B32_term $exec_lo, %mask2, implicit-def $scc
+    S_CBRANCH_EXECNZ %bb.3, implicit $exec
+    S_BRANCH %bb.4
+
+  bb.4: ; flow
+    successors: %bb.2(0x80000000)
+
+    $exec_lo = S_OR_B32 $exec_lo, %mask2, implicit-def $scc
+    S_BRANCH %bb.2
+
+  bb.5: ; exit
+    EXP_DONE 0, %res1, %res2, %res3, %res4, -1, 0, 15, implicit $exec
+    EXP_DONE 0, %res5, %res6, %res7, %res8, -1, 0, 15, implicit $exec
+    EXP_DONE 0, %res9, %res10, %res11, %res12, -1, 0, 15, implicit $exec
+    S_ENDPGM 0
+
+...

diff --git a/llvm/test/CodeGen/MIR/AMDGPU/target-memoperands.mir b/llvm/test/CodeGen/MIR/AMDGPU/target-memoperands.mir
index b705506af04546..82140c81568486 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/target-memoperands.mir
+++ b/llvm/test/CodeGen/MIR/AMDGPU/target-memoperands.mir
@@ -2,7 +2,7 @@
 # RUN: llc -mtriple=amdgcn -run-pass none -o - %s | FileCheck %s
 
 ---
-name: target_memoperands
+name: target_memoperands_noclobber
 body: |
   bb.0:
     liveins: $sgpr0_sgpr1
@@ -12,3 +12,15 @@ body: |
     %0:_(p4) = COPY $sgpr0_sgpr1
     %1:_(s32) = G_LOAD %0 :: ("amdgpu-noclobber" load (s32))
 ...
+
+---
+name: target_memoperands_last_use
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1
+    ; CHECK-LABEL: name: target_memoperands
+    ; CHECK: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1
+    ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p4) :: ("amdgpu-last-use" load (s32))
+    %0:_(p4) = COPY $sgpr0_sgpr1
+    %1:_(s32) = G_LOAD %0 :: ("amdgpu-last-use" load (s32))
+...


        

