[llvm] [AMDGPU][WIP] Optimize SGPR spills (PR #93668)

Vikash Gupta via llvm-commits llvm-commits at lists.llvm.org
Mon Jun 17 13:32:18 PDT 2024


https://github.com/vg0204 updated https://github.com/llvm/llvm-project/pull/93668

>From a3b22f97f22f0d80de8d7b81d04561ad4f82d7de Mon Sep 17 00:00:00 2001
From: vg0204 <Vikash.Gupta at amd.com>
Date: Mon, 27 May 2024 17:44:27 +0530
Subject: [PATCH 1/9] [WIP] Implemented a patch to optimize SGPR spills.

Introduced the StackSlotColoring pass after SGPR RegAlloc and Spill to optimize stack slots reusage.
---
 .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp |   3 +
 llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp  |  20 ++-
 llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp     |   6 +-
 llvm/test/CodeGen/AMDGPU/llc-pipeline.ll      |   8 ++
 .../AMDGPU/preserve-wwm-copy-dst-reg.ll       |  72 +++++-----
 .../CodeGen/AMDGPU/sgpr-regalloc-flags.ll     |   8 ++
 .../CodeGen/AMDGPU/spill-scavenge-offset.ll   |  84 ++++++------
 ...er-sgpr-alloc-equal-size-stack-objects.mir | 127 ++++++++++++++++++
 ...gpr-alloc-unequal-size-stack-objects-2.mir | 122 +++++++++++++++++
 ...-sgpr-alloc-unequal-size-stack-objects.mir | 123 +++++++++++++++++
 10 files changed, 490 insertions(+), 83 deletions(-)
 create mode 100755 llvm/test/CodeGen/AMDGPU/stack-slot-color-after-sgpr-alloc-equal-size-stack-objects.mir
 create mode 100755 llvm/test/CodeGen/AMDGPU/stack-slot-color-after-sgpr-alloc-unequal-size-stack-objects-2.mir
 create mode 100755 llvm/test/CodeGen/AMDGPU/stack-slot-color-after-sgpr-alloc-unequal-size-stack-objects.mir

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index ce997c659094a..666b5452fa15c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -1421,6 +1421,9 @@ bool GCNPassConfig::addRegAssignAndRewriteOptimized() {
   // since FastRegAlloc does the replacements itself.
   addPass(createVirtRegRewriter(false));
 
+  // Optimizes SGPR spills into VGPR lanes for non-interferring spill-ranges.
+  addPass(&StackSlotColoringID);
+
   // Equivalent of PEI for SGPRs.
   addPass(&SILowerSGPRSpillsID);
   addPass(&SIPreAllocateWWMRegsID);
diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
index b6a0152f6fa83..9e121b47ad3fb 100644
--- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
@@ -52,7 +52,8 @@ class SILowerSGPRSpills : public MachineFunctionPass {
   void calculateSaveRestoreBlocks(MachineFunction &MF);
   bool spillCalleeSavedRegs(MachineFunction &MF,
                             SmallVectorImpl<int> &CalleeSavedFIs);
-  void extendWWMVirtRegLiveness(MachineFunction &MF, LiveIntervals *LIS);
+  void extendWWMVirtRegLiveness(MachineFunction &MF, SlotIndexes *Indexes,
+                                LiveIntervals *LIS);
 
   bool runOnMachineFunction(MachineFunction &MF) override;
 
@@ -260,6 +261,7 @@ bool SILowerSGPRSpills::spillCalleeSavedRegs(
 }
 
 void SILowerSGPRSpills::extendWWMVirtRegLiveness(MachineFunction &MF,
+                                                 SlotIndexes *Indexes,
                                                  LiveIntervals *LIS) {
   // TODO: This is a workaround to avoid the unmodelled liveness computed with
   // whole-wave virtual registers when allocated together with the regular VGPR
@@ -278,14 +280,21 @@ void SILowerSGPRSpills::extendWWMVirtRegLiveness(MachineFunction &MF,
   for (auto Reg : MFI->getSGPRSpillVGPRs()) {
     for (MachineBasicBlock *SaveBlock : SaveBlocks) {
       MachineBasicBlock::iterator InsertBefore = SaveBlock->begin();
+      MachineInstrSpan MIS(InsertBefore, SaveBlock);
+
       DebugLoc DL = SaveBlock->findDebugLoc(InsertBefore);
       auto MIB = BuildMI(*SaveBlock, InsertBefore, DL,
                          TII->get(AMDGPU::IMPLICIT_DEF), Reg);
       MFI->setFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG);
       // Set SGPR_SPILL asm printer flag
       MIB->setAsmPrinterFlag(AMDGPU::SGPR_SPILL);
+
       if (LIS) {
         LIS->InsertMachineInstrInMaps(*MIB);
+      } else if (Indexes) {
+        assert(std::distance(MIS.begin(), InsertBefore) == 1);
+        MachineInstr &Inst = *std::prev(InsertBefore);
+        Indexes->insertMachineInstrInMaps(Inst);
       }
     }
   }
@@ -300,8 +309,13 @@ void SILowerSGPRSpills::extendWWMVirtRegLiveness(MachineFunction &MF,
       auto MIB = BuildMI(*RestoreBlock, InsertBefore, DL,
                          TII->get(TargetOpcode::KILL));
       MIB.addReg(Reg);
-      if (LIS)
+
+      if (LIS) {
         LIS->InsertMachineInstrInMaps(*MIB);
+      } else if (Indexes) {
+        MachineInstr &Inst = *std::prev(InsertBefore);
+        Indexes->insertMachineInstrInMaps(Inst);
+      }
     }
   }
 }
@@ -392,7 +406,7 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
     }
 
     if (SpilledToVirtVGPRLanes) {
-      extendWWMVirtRegLiveness(MF, LIS);
+      extendWWMVirtRegLiveness(MF, Indexes, LIS);
       if (LIS) {
         // Compute the LiveInterval for the newly created virtual registers.
         for (auto Reg : FuncInfo->getSGPRSpillVGPRs())
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 4c5e60c873bb9..0442402bfa6b6 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -1775,8 +1775,10 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, int Index,
 
   if (SpillToVGPR) {
 
-    assert(SB.NumSubRegs == VGPRSpills.size() &&
-           "Num of VGPR lanes should be equal to num of SGPRs spilled");
+    assert(SB.NumSubRegs <= VGPRSpills.size() &&
+           "Num of VGPR lanes should be greater or equal to num of SGPRs "
+           "spilled, as Stack Slot Coloring pass assigns different SGPR spills "
+           "into same stack slots");
 
     for (unsigned i = 0, e = SB.NumSubRegs; i < e; ++i) {
       Register SubReg =
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index 08cf83fd2bd0f..85d20d1c0410c 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -366,10 +366,12 @@
 ; GCN-O1-NEXT:        Machine Optimization Remark Emitter
 ; GCN-O1-NEXT:        Greedy Register Allocator
 ; GCN-O1-NEXT:        Virtual Register Rewriter
+; GCN-O1-NEXT:        Stack Slot Coloring
 ; GCN-O1-NEXT:        SI lower SGPR spill instructions
 ; GCN-O1-NEXT:        Virtual Register Map
 ; GCN-O1-NEXT:        Live Register Matrix
 ; GCN-O1-NEXT:        SI Pre-allocate WWM Registers
+; GCN-O1-NEXT:        Live Stack Slot Analysis
 ; GCN-O1-NEXT:        Greedy Register Allocator
 ; GCN-O1-NEXT:        SI Lower WWM Copies
 ; GCN-O1-NEXT:        GCN NSA Reassign
@@ -671,10 +673,12 @@
 ; GCN-O1-OPTS-NEXT:        Machine Optimization Remark Emitter
 ; GCN-O1-OPTS-NEXT:        Greedy Register Allocator
 ; GCN-O1-OPTS-NEXT:        Virtual Register Rewriter
+; GCN-O1-OPTS-NEXT:        Stack Slot Coloring
 ; GCN-O1-OPTS-NEXT:        SI lower SGPR spill instructions
 ; GCN-O1-OPTS-NEXT:        Virtual Register Map
 ; GCN-O1-OPTS-NEXT:        Live Register Matrix
 ; GCN-O1-OPTS-NEXT:        SI Pre-allocate WWM Registers
+; GCN-O1-OPTS-NEXT:        Live Stack Slot Analysis
 ; GCN-O1-OPTS-NEXT:        Greedy Register Allocator
 ; GCN-O1-OPTS-NEXT:        SI Lower WWM Copies
 ; GCN-O1-OPTS-NEXT:        GCN NSA Reassign
@@ -982,10 +986,12 @@
 ; GCN-O2-NEXT:        Machine Optimization Remark Emitter
 ; GCN-O2-NEXT:        Greedy Register Allocator
 ; GCN-O2-NEXT:        Virtual Register Rewriter
+; GCN-O2-NEXT:        Stack Slot Coloring
 ; GCN-O2-NEXT:        SI lower SGPR spill instructions
 ; GCN-O2-NEXT:        Virtual Register Map
 ; GCN-O2-NEXT:        Live Register Matrix
 ; GCN-O2-NEXT:        SI Pre-allocate WWM Registers
+; GCN-O2-NEXT:        Live Stack Slot Analysis
 ; GCN-O2-NEXT:        Greedy Register Allocator
 ; GCN-O2-NEXT:        SI Lower WWM Copies
 ; GCN-O2-NEXT:        GCN NSA Reassign
@@ -1305,10 +1311,12 @@
 ; GCN-O3-NEXT:        Machine Optimization Remark Emitter
 ; GCN-O3-NEXT:        Greedy Register Allocator
 ; GCN-O3-NEXT:        Virtual Register Rewriter
+; GCN-O3-NEXT:        Stack Slot Coloring
 ; GCN-O3-NEXT:        SI lower SGPR spill instructions
 ; GCN-O3-NEXT:        Virtual Register Map
 ; GCN-O3-NEXT:        Live Register Matrix
 ; GCN-O3-NEXT:        SI Pre-allocate WWM Registers
+; GCN-O3-NEXT:        Live Stack Slot Analysis
 ; GCN-O3-NEXT:        Greedy Register Allocator
 ; GCN-O3-NEXT:        SI Lower WWM Copies
 ; GCN-O3-NEXT:        GCN NSA Reassign
diff --git a/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll b/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll
index fbe34a3a3970b..25e9e09748c81 100644
--- a/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll
+++ b/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll
@@ -221,15 +221,15 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 {
 ; GFX906-NEXT:    ; def s29
 ; GFX906-NEXT:    ;;#ASMEND
 ; GFX906-NEXT:    buffer_load_dword v31, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_writelane_b32 v40, s21, 24
-; GFX906-NEXT:    v_writelane_b32 v40, s22, 25
-; GFX906-NEXT:    v_writelane_b32 v40, s23, 26
-; GFX906-NEXT:    v_writelane_b32 v40, s24, 27
-; GFX906-NEXT:    v_writelane_b32 v40, s25, 28
-; GFX906-NEXT:    v_writelane_b32 v40, s26, 29
-; GFX906-NEXT:    v_writelane_b32 v40, s27, 30
-; GFX906-NEXT:    v_writelane_b32 v40, s28, 31
-; GFX906-NEXT:    v_writelane_b32 v40, s29, 32
+; GFX906-NEXT:    v_writelane_b32 v40, s21, 12
+; GFX906-NEXT:    v_writelane_b32 v40, s22, 13
+; GFX906-NEXT:    v_writelane_b32 v40, s23, 14
+; GFX906-NEXT:    v_writelane_b32 v40, s24, 15
+; GFX906-NEXT:    v_writelane_b32 v40, s25, 16
+; GFX906-NEXT:    v_writelane_b32 v40, s26, 17
+; GFX906-NEXT:    v_writelane_b32 v40, s27, 18
+; GFX906-NEXT:    v_writelane_b32 v40, s28, 19
+; GFX906-NEXT:    v_writelane_b32 v40, s29, 20
 ; GFX906-NEXT:    v_readlane_b32 s4, v40, 10
 ; GFX906-NEXT:    v_readlane_b32 s6, v40, 0
 ; GFX906-NEXT:    v_readlane_b32 s8, v40, 8
@@ -249,39 +249,39 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 {
 ; GFX906-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX906-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX906-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX906-NEXT:    v_readlane_b32 s21, v40, 24
+; GFX906-NEXT:    v_readlane_b32 s21, v40, 12
 ; GFX906-NEXT:    ;;#ASMSTART
 ; GFX906-NEXT:    ; use s21
 ; GFX906-NEXT:    ;;#ASMEND
-; GFX906-NEXT:    v_readlane_b32 s22, v40, 25
+; GFX906-NEXT:    v_readlane_b32 s22, v40, 13
 ; GFX906-NEXT:    ;;#ASMSTART
 ; GFX906-NEXT:    ; use s22
 ; GFX906-NEXT:    ;;#ASMEND
-; GFX906-NEXT:    v_readlane_b32 s23, v40, 26
+; GFX906-NEXT:    v_readlane_b32 s23, v40, 14
 ; GFX906-NEXT:    ;;#ASMSTART
 ; GFX906-NEXT:    ; use s23
 ; GFX906-NEXT:    ;;#ASMEND
-; GFX906-NEXT:    v_readlane_b32 s24, v40, 27
+; GFX906-NEXT:    v_readlane_b32 s24, v40, 15
 ; GFX906-NEXT:    ;;#ASMSTART
 ; GFX906-NEXT:    ; use s24
 ; GFX906-NEXT:    ;;#ASMEND
-; GFX906-NEXT:    v_readlane_b32 s25, v40, 28
+; GFX906-NEXT:    v_readlane_b32 s25, v40, 16
 ; GFX906-NEXT:    ;;#ASMSTART
 ; GFX906-NEXT:    ; use s25
 ; GFX906-NEXT:    ;;#ASMEND
-; GFX906-NEXT:    v_readlane_b32 s26, v40, 29
+; GFX906-NEXT:    v_readlane_b32 s26, v40, 17
 ; GFX906-NEXT:    ;;#ASMSTART
 ; GFX906-NEXT:    ; use s26
 ; GFX906-NEXT:    ;;#ASMEND
-; GFX906-NEXT:    v_readlane_b32 s27, v40, 30
+; GFX906-NEXT:    v_readlane_b32 s27, v40, 18
 ; GFX906-NEXT:    ;;#ASMSTART
 ; GFX906-NEXT:    ; use s27
 ; GFX906-NEXT:    ;;#ASMEND
-; GFX906-NEXT:    v_readlane_b32 s28, v40, 31
+; GFX906-NEXT:    v_readlane_b32 s28, v40, 19
 ; GFX906-NEXT:    ;;#ASMSTART
 ; GFX906-NEXT:    ; use s28
 ; GFX906-NEXT:    ;;#ASMEND
-; GFX906-NEXT:    v_readlane_b32 s29, v40, 32
+; GFX906-NEXT:    v_readlane_b32 s29, v40, 20
 ; GFX906-NEXT:    ;;#ASMSTART
 ; GFX906-NEXT:    ; use s29
 ; GFX906-NEXT:    ;;#ASMEND
@@ -602,15 +602,15 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 {
 ; GFX908-NEXT:    ; def s29
 ; GFX908-NEXT:    ;;#ASMEND
 ; GFX908-NEXT:    buffer_load_dword v31, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; GFX908-NEXT:    v_writelane_b32 v40, s21, 24
-; GFX908-NEXT:    v_writelane_b32 v40, s22, 25
-; GFX908-NEXT:    v_writelane_b32 v40, s23, 26
-; GFX908-NEXT:    v_writelane_b32 v40, s24, 27
-; GFX908-NEXT:    v_writelane_b32 v40, s25, 28
-; GFX908-NEXT:    v_writelane_b32 v40, s26, 29
-; GFX908-NEXT:    v_writelane_b32 v40, s27, 30
-; GFX908-NEXT:    v_writelane_b32 v40, s28, 31
-; GFX908-NEXT:    v_writelane_b32 v40, s29, 32
+; GFX908-NEXT:    v_writelane_b32 v40, s21, 12
+; GFX908-NEXT:    v_writelane_b32 v40, s22, 13
+; GFX908-NEXT:    v_writelane_b32 v40, s23, 14
+; GFX908-NEXT:    v_writelane_b32 v40, s24, 15
+; GFX908-NEXT:    v_writelane_b32 v40, s25, 16
+; GFX908-NEXT:    v_writelane_b32 v40, s26, 17
+; GFX908-NEXT:    v_writelane_b32 v40, s27, 18
+; GFX908-NEXT:    v_writelane_b32 v40, s28, 19
+; GFX908-NEXT:    v_writelane_b32 v40, s29, 20
 ; GFX908-NEXT:    v_readlane_b32 s4, v40, 10
 ; GFX908-NEXT:    v_readlane_b32 s6, v40, 0
 ; GFX908-NEXT:    v_readlane_b32 s8, v40, 8
@@ -630,39 +630,39 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 {
 ; GFX908-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX908-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX908-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX908-NEXT:    v_readlane_b32 s21, v40, 24
+; GFX908-NEXT:    v_readlane_b32 s21, v40, 12
 ; GFX908-NEXT:    ;;#ASMSTART
 ; GFX908-NEXT:    ; use s21
 ; GFX908-NEXT:    ;;#ASMEND
-; GFX908-NEXT:    v_readlane_b32 s22, v40, 25
+; GFX908-NEXT:    v_readlane_b32 s22, v40, 13
 ; GFX908-NEXT:    ;;#ASMSTART
 ; GFX908-NEXT:    ; use s22
 ; GFX908-NEXT:    ;;#ASMEND
-; GFX908-NEXT:    v_readlane_b32 s23, v40, 26
+; GFX908-NEXT:    v_readlane_b32 s23, v40, 14
 ; GFX908-NEXT:    ;;#ASMSTART
 ; GFX908-NEXT:    ; use s23
 ; GFX908-NEXT:    ;;#ASMEND
-; GFX908-NEXT:    v_readlane_b32 s24, v40, 27
+; GFX908-NEXT:    v_readlane_b32 s24, v40, 15
 ; GFX908-NEXT:    ;;#ASMSTART
 ; GFX908-NEXT:    ; use s24
 ; GFX908-NEXT:    ;;#ASMEND
-; GFX908-NEXT:    v_readlane_b32 s25, v40, 28
+; GFX908-NEXT:    v_readlane_b32 s25, v40, 16
 ; GFX908-NEXT:    ;;#ASMSTART
 ; GFX908-NEXT:    ; use s25
 ; GFX908-NEXT:    ;;#ASMEND
-; GFX908-NEXT:    v_readlane_b32 s26, v40, 29
+; GFX908-NEXT:    v_readlane_b32 s26, v40, 17
 ; GFX908-NEXT:    ;;#ASMSTART
 ; GFX908-NEXT:    ; use s26
 ; GFX908-NEXT:    ;;#ASMEND
-; GFX908-NEXT:    v_readlane_b32 s27, v40, 30
+; GFX908-NEXT:    v_readlane_b32 s27, v40, 18
 ; GFX908-NEXT:    ;;#ASMSTART
 ; GFX908-NEXT:    ; use s27
 ; GFX908-NEXT:    ;;#ASMEND
-; GFX908-NEXT:    v_readlane_b32 s28, v40, 31
+; GFX908-NEXT:    v_readlane_b32 s28, v40, 19
 ; GFX908-NEXT:    ;;#ASMSTART
 ; GFX908-NEXT:    ; use s28
 ; GFX908-NEXT:    ;;#ASMEND
-; GFX908-NEXT:    v_readlane_b32 s29, v40, 32
+; GFX908-NEXT:    v_readlane_b32 s29, v40, 20
 ; GFX908-NEXT:    ;;#ASMSTART
 ; GFX908-NEXT:    ; use s29
 ; GFX908-NEXT:    ;;#ASMEND
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll b/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll
index 17a19116735e4..04a9f3cb2d332 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll
@@ -17,10 +17,12 @@
 
 ; DEFAULT: Greedy Register Allocator
 ; DEFAULT-NEXT: Virtual Register Rewriter
+; DEFAULT-NEXT: Stack Slot Coloring
 ; DEFAULT-NEXT: SI lower SGPR spill instructions
 ; DEFAULT-NEXT: Virtual Register Map
 ; DEFAULT-NEXT: Live Register Matrix
 ; DEFAULT-NEXT: SI Pre-allocate WWM Registers
+; DEFAULT-NEXT: Live Stack Slot Analysis
 ; DEFAULT-NEXT: Greedy Register Allocator
 ; DEFAULT-NEXT: SI Lower WWM Copies
 ; DEFAULT-NEXT: GCN NSA Reassign
@@ -50,10 +52,12 @@
 ; BASIC-DEFAULT-NEXT: Live Register Matrix
 ; BASIC-DEFAULT-NEXT: Basic Register Allocator
 ; BASIC-DEFAULT-NEXT: Virtual Register Rewriter
+; BASIC-DEFAULT-NEXT: Stack Slot Coloring
 ; BASIC-DEFAULT-NEXT: SI lower SGPR spill instructions
 ; BASIC-DEFAULT-NEXT: Virtual Register Map
 ; BASIC-DEFAULT-NEXT: Live Register Matrix
 ; BASIC-DEFAULT-NEXT: SI Pre-allocate WWM Registers
+; BASIC-DEFAULT-NEXT: Live Stack Slot Analysis
 ; BASIC-DEFAULT-NEXT: Bundle Machine CFG Edges
 ; BASIC-DEFAULT-NEXT: Spill Code Placement Analysis
 ; BASIC-DEFAULT-NEXT: Lazy Machine Block Frequency Analysis
@@ -69,10 +73,12 @@
 
 ; DEFAULT-BASIC: Greedy Register Allocator
 ; DEFAULT-BASIC-NEXT: Virtual Register Rewriter
+; DEFAULT-BASIC-NEXT: Stack Slot Coloring
 ; DEFAULT-BASIC-NEXT: SI lower SGPR spill instructions
 ; DEFAULT-BASIC-NEXT: Virtual Register Map
 ; DEFAULT-BASIC-NEXT: Live Register Matrix
 ; DEFAULT-BASIC-NEXT: SI Pre-allocate WWM Registers
+; DEFAULT-BASIC-NEXT: Live Stack Slot Analysis
 ; DEFAULT-BASIC-NEXT: Basic Register Allocator
 ; DEFAULT-BASIC-NEXT: SI Lower WWM Copies
 ; DEFAULT-BASIC-NEXT: GCN NSA Reassign
@@ -90,10 +96,12 @@
 ; BASIC-BASIC-NEXT: Live Register Matrix
 ; BASIC-BASIC-NEXT: Basic Register Allocator
 ; BASIC-BASIC-NEXT: Virtual Register Rewriter
+; BASIC-BASIC-NEXT: Stack Slot Coloring
 ; BASIC-BASIC-NEXT: SI lower SGPR spill instructions
 ; BASIC-BASIC-NEXT: Virtual Register Map
 ; BASIC-BASIC-NEXT: Live Register Matrix
 ; BASIC-BASIC-NEXT: SI Pre-allocate WWM Registers
+; BASIC-BASIC-NEXT: Live Stack Slot Analysis
 ; BASIC-BASIC-NEXT: Basic Register Allocator
 ; BASIC-BASIC-NEXT: SI Lower WWM Copies
 ; BASIC-BASIC-NEXT: GCN NSA Reassign
diff --git a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
index bea2e6d4b45a3..e1bd1523d78a4 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
@@ -10098,7 +10098,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v5, 8, v0
 ; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:240
-; GFX6-NEXT:    s_mov_b32 s2, 0x84400
+; GFX6-NEXT:    s_mov_b32 s2, 0x86a00
 ; GFX6-NEXT:    s_mov_b64 s[8:9], exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
@@ -10108,7 +10108,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:224
-; GFX6-NEXT:    s_mov_b32 s2, 0x84000
+; GFX6-NEXT:    s_mov_b32 s2, 0x86600
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
@@ -10117,7 +10117,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:208
-; GFX6-NEXT:    s_mov_b32 s2, 0x83c00
+; GFX6-NEXT:    s_mov_b32 s2, 0x86200
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
@@ -10126,7 +10126,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:192
-; GFX6-NEXT:    s_mov_b32 s2, 0x83800
+; GFX6-NEXT:    s_mov_b32 s2, 0x85e00
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
@@ -10135,7 +10135,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:176
-; GFX6-NEXT:    s_mov_b32 s2, 0x83400
+; GFX6-NEXT:    s_mov_b32 s2, 0x85a00
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
@@ -10144,7 +10144,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:160
-; GFX6-NEXT:    s_mov_b32 s2, 0x83000
+; GFX6-NEXT:    s_mov_b32 s2, 0x85600
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
@@ -10153,7 +10153,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:144
-; GFX6-NEXT:    s_mov_b32 s2, 0x82c00
+; GFX6-NEXT:    s_mov_b32 s2, 0x85200
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
@@ -10162,7 +10162,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:128
-; GFX6-NEXT:    s_mov_b32 s2, 0x82800
+; GFX6-NEXT:    s_mov_b32 s2, 0x84e00
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
@@ -10171,7 +10171,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:112
-; GFX6-NEXT:    s_mov_b32 s2, 0x82400
+; GFX6-NEXT:    s_mov_b32 s2, 0x84a00
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
@@ -10180,7 +10180,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:96
-; GFX6-NEXT:    s_mov_b32 s2, 0x82000
+; GFX6-NEXT:    s_mov_b32 s2, 0x84600
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
@@ -10189,7 +10189,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:80
-; GFX6-NEXT:    s_mov_b32 s2, 0x81c00
+; GFX6-NEXT:    s_mov_b32 s2, 0x84200
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
@@ -10198,7 +10198,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:64
-; GFX6-NEXT:    s_mov_b32 s2, 0x81400
+; GFX6-NEXT:    s_mov_b32 s2, 0x83a00
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
@@ -10208,7 +10208,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64
 ; GFX6-NEXT:    buffer_load_dwordx4 v[7:10], v[5:6], s[4:7], 0 addr64 offset:16
-; GFX6-NEXT:    s_mov_b32 s2, 0x80c00
+; GFX6-NEXT:    s_mov_b32 s2, 0x83200
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v7, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
@@ -10217,7 +10217,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_store_dword v10, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dwordx4 v[7:10], v[5:6], s[4:7], 0 addr64 offset:32
-; GFX6-NEXT:    s_mov_b32 s2, 0x81000
+; GFX6-NEXT:    s_mov_b32 s2, 0x83600
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v7, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
@@ -10239,7 +10239,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    s_mov_b64 exec, s[8:9]
 ; GFX6-NEXT:    buffer_load_dwordx4 v[7:10], v[5:6], s[4:7], 0 addr64 offset:48
-; GFX6-NEXT:    s_mov_b32 s0, 0x81800
+; GFX6-NEXT:    s_mov_b32 s0, 0x83e00
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 13, v0
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 16, v4
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
@@ -10266,7 +10266,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    v_writelane_b32 v4, s9, 5
 ; GFX6-NEXT:    v_writelane_b32 v4, s10, 6
 ; GFX6-NEXT:    v_writelane_b32 v4, s11, 7
-; GFX6-NEXT:    s_mov_b32 s2, 0x84800
+; GFX6-NEXT:    s_mov_b32 s2, 0x80c00
 ; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
@@ -10307,7 +10307,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    v_writelane_b32 v4, s13, 5
 ; GFX6-NEXT:    v_writelane_b32 v4, s14, 6
 ; GFX6-NEXT:    v_writelane_b32 v4, s15, 7
-; GFX6-NEXT:    s_mov_b32 s34, 0x85000
+; GFX6-NEXT:    s_mov_b32 s34, 0x81400
 ; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], s34 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
@@ -10316,7 +10316,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 0xff
 ; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
-; GFX6-NEXT:    s_mov_b32 s34, 0x84800
+; GFX6-NEXT:    s_mov_b32 s34, 0x80c00
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], s34 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
@@ -10343,7 +10343,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    v_writelane_b32 v4, s21, 5
 ; GFX6-NEXT:    v_writelane_b32 v4, s22, 6
 ; GFX6-NEXT:    v_writelane_b32 v4, s23, 7
-; GFX6-NEXT:    s_mov_b32 s34, 0x85800
+; GFX6-NEXT:    s_mov_b32 s34, 0x81c00
 ; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], s34 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
@@ -10352,7 +10352,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 0xff
 ; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
-; GFX6-NEXT:    s_mov_b32 s34, 0x85000
+; GFX6-NEXT:    s_mov_b32 s34, 0x81400
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], s34 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
@@ -10379,7 +10379,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    v_writelane_b32 v4, s29, 5
 ; GFX6-NEXT:    v_writelane_b32 v4, s30, 6
 ; GFX6-NEXT:    v_writelane_b32 v4, s31, 7
-; GFX6-NEXT:    s_mov_b32 s34, 0x86000
+; GFX6-NEXT:    s_mov_b32 s34, 0x82400
 ; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], s34 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
@@ -10388,7 +10388,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 0xff
 ; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
-; GFX6-NEXT:    s_mov_b32 s34, 0x85800
+; GFX6-NEXT:    s_mov_b32 s34, 0x81c00
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], s34 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
@@ -10411,7 +10411,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    v_writelane_b32 v4, s1, 1
 ; GFX6-NEXT:    v_writelane_b32 v4, s2, 2
 ; GFX6-NEXT:    v_writelane_b32 v4, s3, 3
-; GFX6-NEXT:    s_mov_b32 s34, 0x86800
+; GFX6-NEXT:    s_mov_b32 s34, 0x82c00
 ; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], s34 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
@@ -10423,7 +10423,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    v_writelane_b32 v4, s4, 0
 ; GFX6-NEXT:    v_writelane_b32 v4, s5, 1
-; GFX6-NEXT:    s_mov_b32 s2, 0x86c00
+; GFX6-NEXT:    s_mov_b32 s2, 0x83000
 ; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
@@ -10432,7 +10432,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    s_mov_b64 s[34:35], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 0xff
 ; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
-; GFX6-NEXT:    s_mov_b32 s36, 0x86000
+; GFX6-NEXT:    s_mov_b32 s36, 0x82400
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], s36 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
@@ -10450,7 +10450,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    s_mov_b64 s[34:35], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 15
 ; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
-; GFX6-NEXT:    s_mov_b32 s44, 0x86800
+; GFX6-NEXT:    s_mov_b32 s44, 0x82c00
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], s44 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
@@ -10464,7 +10464,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    s_mov_b64 s[44:45], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 3
 ; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
-; GFX6-NEXT:    v_mov_b32_e32 v7, 0x21b0
+; GFX6-NEXT:    v_mov_b32_e32 v7, 0x20c0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dword v4, v7, s[40:43], 0 offen ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
@@ -10521,13 +10521,13 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX6-NEXT:    s_mov_b32 s0, 0x84400
+; GFX6-NEXT:    s_mov_b32 s0, 0x86a00
 ; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_mov_b64 s[38:39], s[2:3]
-; GFX6-NEXT:    s_mov_b32 s0, 0x84000
+; GFX6-NEXT:    s_mov_b32 s0, 0x86600
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:240
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
@@ -10535,7 +10535,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s0, 0x83c00
+; GFX6-NEXT:    s_mov_b32 s0, 0x86200
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:224
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
@@ -10543,7 +10543,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s0, 0x83800
+; GFX6-NEXT:    s_mov_b32 s0, 0x85e00
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:208
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
@@ -10551,7 +10551,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s0, 0x83400
+; GFX6-NEXT:    s_mov_b32 s0, 0x85a00
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:192
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
@@ -10559,7 +10559,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s0, 0x83000
+; GFX6-NEXT:    s_mov_b32 s0, 0x85600
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:176
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
@@ -10567,7 +10567,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s0, 0x82c00
+; GFX6-NEXT:    s_mov_b32 s0, 0x85200
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:160
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
@@ -10575,7 +10575,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s0, 0x82800
+; GFX6-NEXT:    s_mov_b32 s0, 0x84e00
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:144
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
@@ -10583,7 +10583,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s0, 0x82400
+; GFX6-NEXT:    s_mov_b32 s0, 0x84a00
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:128
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
@@ -10591,7 +10591,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s0, 0x82000
+; GFX6-NEXT:    s_mov_b32 s0, 0x84600
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:112
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
@@ -10599,7 +10599,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s0, 0x81c00
+; GFX6-NEXT:    s_mov_b32 s0, 0x84200
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:96
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
@@ -10607,7 +10607,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s0, 0x81400
+; GFX6-NEXT:    s_mov_b32 s0, 0x83a00
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:80
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
@@ -10615,7 +10615,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s0, 0x81800
+; GFX6-NEXT:    s_mov_b32 s0, 0x83e00
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:64
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
@@ -10623,7 +10623,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s0, 0x81000
+; GFX6-NEXT:    s_mov_b32 s0, 0x83600
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:48
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
@@ -10631,7 +10631,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s0, 0x80c00
+; GFX6-NEXT:    s_mov_b32 s0, 0x83200
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:32
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/stack-slot-color-after-sgpr-alloc-equal-size-stack-objects.mir b/llvm/test/CodeGen/AMDGPU/stack-slot-color-after-sgpr-alloc-equal-size-stack-objects.mir
new file mode 100755
index 0000000000000..e8651cd6944d1
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/stack-slot-color-after-sgpr-alloc-equal-size-stack-objects.mir
@@ -0,0 +1,127 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -stress-regalloc=3 -start-before=greedy -stop-before=prologepilog -o - %s | FileCheck -check-prefix=SHARE %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -stress-regalloc=3 -start-before=greedy -stop-before=prologepilog -no-stack-slot-sharing -o - %s | FileCheck -check-prefix=NOSHARE %s
+
+--- |
+
+  define void @stack-slot-color-after-sgpr-alloc(ptr addrspace(1) nocapture readnone %arg, ptr addrspace(1) noalias %arg1) {
+  bb:
+    %tmp = load i32, ptr addrspace(1) null, align 4
+    call void @func(i32 undef)
+    call void @func(i32 %tmp)
+    unreachable
+  }
+
+  declare void @func(i32)
+...
+
+
+---
+name:            stack-slot-color-after-sgpr-alloc
+tracksRegLiveness: true
+frameInfo:
+  adjustsStack:    true
+  hasCalls:        true
+machineFunctionInfo:
+  scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
+  frameOffsetReg: $sgpr32
+  stackPtrOffsetReg: $sgpr32
+body:             |
+  bb.0:
+    ; SHARE-LABEL: name: stack-slot-color-after-sgpr-alloc
+    ; SHARE: liveins: $sgpr30, $sgpr31, $vgpr63
+    ; SHARE-NEXT: {{  $}}
+    ; SHARE-NEXT: renamable $vgpr2 = IMPLICIT_DEF
+    ; SHARE-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr30, 0, $vgpr63
+    ; SHARE-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr31, 1, $vgpr63
+    ; SHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr32, 0, killed $vgpr2
+    ; SHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr0, 1, killed $vgpr2, implicit-def $sgpr0_sgpr1, implicit $sgpr0_sgpr1
+    ; SHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr1, 2, killed $vgpr2, implicit $sgpr0_sgpr1
+    ; SHARE-NEXT: renamable $vgpr0_vgpr1 = IMPLICIT_DEF
+    ; SHARE-NEXT: renamable $vgpr0 = FLAT_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+    ; SHARE-NEXT: SI_SPILL_V32_SAVE killed $vgpr0, %stack.9, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
+    ; SHARE-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @func + 4, target-flags(amdgpu-rel32-hi) @func + 4, implicit-def dead $scc
+    ; SHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr4, 3, killed $vgpr2, implicit-def $sgpr4_sgpr5, implicit $sgpr4_sgpr5
+    ; SHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR killed $sgpr5, 4, killed $vgpr2, implicit killed $sgpr4_sgpr5
+    ; SHARE-NEXT: SI_SPILL_WWM_V32_SAVE $vgpr2, %stack.8, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5)
+    ; SHARE-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    ; SHARE-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 3, implicit-def $sgpr4_sgpr5
+    ; SHARE-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR killed $vgpr2, 4
+    ; SHARE-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit undef $vgpr0
+    ; SHARE-NEXT: renamable $vgpr1 = SI_SPILL_WWM_V32_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5)
+    ; SHARE-NEXT: $sgpr32 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 0
+    ; SHARE-NEXT: $sgpr0 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 1, implicit-def $sgpr0_sgpr1
+    ; SHARE-NEXT: $sgpr1 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 2
+    ; SHARE-NEXT: renamable $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr2, 1, killed $vgpr1, implicit-def $sgpr2_sgpr3, implicit $sgpr2_sgpr3
+    ; SHARE-NEXT: renamable $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr3, 2, killed $vgpr1, implicit $sgpr2_sgpr3
+    ; SHARE-NEXT: SI_SPILL_WWM_V32_SAVE $vgpr1, %stack.8, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5)
+    ; SHARE-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    ; SHARE-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    ; SHARE-NEXT: $vgpr0 = SI_SPILL_V32_RESTORE %stack.9, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
+    ; SHARE-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 3, implicit-def $sgpr4_sgpr5
+    ; SHARE-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR killed $vgpr1, 4
+    ; SHARE-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit $vgpr0
+    ; SHARE-NEXT: renamable $vgpr0 = SI_SPILL_WWM_V32_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5)
+    ; SHARE-NEXT: $sgpr32 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0
+    ; SHARE-NEXT: $sgpr2 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 1, implicit-def $sgpr2_sgpr3
+    ; SHARE-NEXT: $sgpr3 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 2
+    ; SHARE-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    ;
+    ; NOSHARE-LABEL: name: stack-slot-color-after-sgpr-alloc
+    ; NOSHARE: liveins: $sgpr30, $sgpr31, $vgpr63
+    ; NOSHARE-NEXT: {{  $}}
+    ; NOSHARE-NEXT: renamable $vgpr2 = IMPLICIT_DEF
+    ; NOSHARE-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr30, 0, $vgpr63
+    ; NOSHARE-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr31, 1, $vgpr63
+    ; NOSHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr32, 0, killed $vgpr2
+    ; NOSHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr0, 1, killed $vgpr2, implicit-def $sgpr0_sgpr1, implicit $sgpr0_sgpr1
+    ; NOSHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr1, 2, killed $vgpr2, implicit $sgpr0_sgpr1
+    ; NOSHARE-NEXT: renamable $vgpr0_vgpr1 = IMPLICIT_DEF
+    ; NOSHARE-NEXT: renamable $vgpr0 = FLAT_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+    ; NOSHARE-NEXT: SI_SPILL_V32_SAVE killed $vgpr0, %stack.9, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
+    ; NOSHARE-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @func + 4, target-flags(amdgpu-rel32-hi) @func + 4, implicit-def dead $scc
+    ; NOSHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr4, 3, killed $vgpr2, implicit-def $sgpr4_sgpr5, implicit $sgpr4_sgpr5
+    ; NOSHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR killed $sgpr5, 4, killed $vgpr2, implicit killed $sgpr4_sgpr5
+    ; NOSHARE-NEXT: SI_SPILL_WWM_V32_SAVE $vgpr2, %stack.8, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5)
+    ; NOSHARE-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    ; NOSHARE-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 3, implicit-def $sgpr4_sgpr5
+    ; NOSHARE-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR killed $vgpr2, 4
+    ; NOSHARE-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit undef $vgpr0
+    ; NOSHARE-NEXT: renamable $vgpr1 = SI_SPILL_WWM_V32_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5)
+    ; NOSHARE-NEXT: $sgpr32 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 0
+    ; NOSHARE-NEXT: renamable $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr32, 5, killed $vgpr1
+    ; NOSHARE-NEXT: $sgpr0 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 1, implicit-def $sgpr0_sgpr1
+    ; NOSHARE-NEXT: $sgpr1 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 2
+    ; NOSHARE-NEXT: renamable $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr2, 6, killed $vgpr1, implicit-def $sgpr2_sgpr3, implicit $sgpr2_sgpr3
+    ; NOSHARE-NEXT: renamable $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr3, 7, killed $vgpr1, implicit $sgpr2_sgpr3
+    ; NOSHARE-NEXT: SI_SPILL_WWM_V32_SAVE $vgpr1, %stack.8, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5)
+    ; NOSHARE-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    ; NOSHARE-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    ; NOSHARE-NEXT: $vgpr0 = SI_SPILL_V32_RESTORE %stack.9, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
+    ; NOSHARE-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 3, implicit-def $sgpr4_sgpr5
+    ; NOSHARE-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR killed $vgpr1, 4
+    ; NOSHARE-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit $vgpr0
+    ; NOSHARE-NEXT: renamable $vgpr0 = SI_SPILL_WWM_V32_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5)
+    ; NOSHARE-NEXT: $sgpr32 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 5
+    ; NOSHARE-NEXT: $sgpr2 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 6, implicit-def $sgpr2_sgpr3
+    ; NOSHARE-NEXT: $sgpr3 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 7
+    ; NOSHARE-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    %0:sreg_32_xm0 = COPY $sgpr32
+    %5:sreg_64 = COPY $sgpr0_sgpr1
+    %1:vreg_64 = IMPLICIT_DEF
+    %2:vgpr_32 = FLAT_LOAD_DWORD %1, 0, 0, implicit $exec, implicit $flat_scr
+    %3:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @func + 4, target-flags(amdgpu-rel32-hi) @func + 4, implicit-def dead $scc
+    ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    dead $sgpr30_sgpr31 = SI_CALL %3, @func, csr_amdgpu, implicit undef $vgpr0
+    $sgpr32 = COPY %0
+    %4:sreg_32_xm0 = COPY $sgpr32
+    $sgpr0_sgpr1 = COPY %5
+    %6:sreg_64 = COPY $sgpr2_sgpr3
+    ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    $vgpr0 = COPY %2
+    dead $sgpr30_sgpr31 = SI_CALL %3, @func, csr_amdgpu, implicit killed $vgpr0
+    $sgpr32 = COPY %4
+    $sgpr2_sgpr3 = COPY %6
+    ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+...
diff --git a/llvm/test/CodeGen/AMDGPU/stack-slot-color-after-sgpr-alloc-unequal-size-stack-objects-2.mir b/llvm/test/CodeGen/AMDGPU/stack-slot-color-after-sgpr-alloc-unequal-size-stack-objects-2.mir
new file mode 100755
index 0000000000000..f20dee490a83c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/stack-slot-color-after-sgpr-alloc-unequal-size-stack-objects-2.mir
@@ -0,0 +1,122 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -stress-regalloc=3 -start-before=greedy -stop-before=prologepilog -o - %s | FileCheck -check-prefix=SHARE %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -stress-regalloc=3 -start-before=greedy -stop-before=prologepilog -no-stack-slot-sharing -o - %s | FileCheck -check-prefix=NOSHARE %s
+
+--- |
+
+  define void @stack-slot-color-after-sgpr-alloc(ptr addrspace(1) nocapture readnone %arg, ptr addrspace(1) noalias %arg1) {
+  bb:
+    %tmp = load i32, ptr addrspace(1) null, align 4
+    call void @func(i32 undef)
+    call void @func(i32 %tmp)
+    unreachable
+  }
+
+  declare void @func(i32)
+...
+
+---
+name:            stack-slot-color-after-sgpr-alloc
+tracksRegLiveness: true
+frameInfo:
+  adjustsStack:    true
+  hasCalls:        true
+machineFunctionInfo:
+  scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
+  frameOffsetReg: $sgpr32
+  stackPtrOffsetReg: $sgpr32
+body:             |
+  bb.0:
+    ; SHARE-LABEL: name: stack-slot-color-after-sgpr-alloc
+    ; SHARE: liveins: $sgpr30, $sgpr31, $vgpr63
+    ; SHARE-NEXT: {{  $}}
+    ; SHARE-NEXT: renamable $vgpr2 = IMPLICIT_DEF
+    ; SHARE-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr30, 0, $vgpr63
+    ; SHARE-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr31, 1, $vgpr63
+    ; SHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr32, 0, killed $vgpr2
+    ; SHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr0, 1, killed $vgpr2
+    ; SHARE-NEXT: renamable $vgpr0_vgpr1 = IMPLICIT_DEF
+    ; SHARE-NEXT: renamable $vgpr0 = FLAT_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+    ; SHARE-NEXT: SI_SPILL_V32_SAVE killed $vgpr0, %stack.9, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
+    ; SHARE-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @func + 4, target-flags(amdgpu-rel32-hi) @func + 4, implicit-def dead $scc
+    ; SHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr4, 3, killed $vgpr2, implicit-def $sgpr4_sgpr5, implicit $sgpr4_sgpr5
+    ; SHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR killed $sgpr5, 4, killed $vgpr2, implicit killed $sgpr4_sgpr5
+    ; SHARE-NEXT: SI_SPILL_WWM_V32_SAVE $vgpr2, %stack.8, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5)
+    ; SHARE-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    ; SHARE-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 3, implicit-def $sgpr4_sgpr5
+    ; SHARE-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR killed $vgpr2, 4
+    ; SHARE-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit undef $vgpr0
+    ; SHARE-NEXT: renamable $vgpr1 = SI_SPILL_WWM_V32_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5)
+    ; SHARE-NEXT: $sgpr32 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 0
+    ; SHARE-NEXT: $sgpr0 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 1
+    ; SHARE-NEXT: renamable $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr2, 1, killed $vgpr1, implicit-def $sgpr2_sgpr3, implicit $sgpr2_sgpr3
+    ; SHARE-NEXT: renamable $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr3, 2, killed $vgpr1, implicit $sgpr2_sgpr3
+    ; SHARE-NEXT: SI_SPILL_WWM_V32_SAVE $vgpr1, %stack.8, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5)
+    ; SHARE-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    ; SHARE-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    ; SHARE-NEXT: $vgpr0 = SI_SPILL_V32_RESTORE %stack.9, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
+    ; SHARE-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 3, implicit-def $sgpr4_sgpr5
+    ; SHARE-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR killed $vgpr1, 4
+    ; SHARE-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit $vgpr0
+    ; SHARE-NEXT: renamable $vgpr0 = SI_SPILL_WWM_V32_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5)
+    ; SHARE-NEXT: $sgpr32 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0
+    ; SHARE-NEXT: $sgpr2 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 1, implicit-def $sgpr2_sgpr3
+    ; SHARE-NEXT: $sgpr3 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 2
+    ; SHARE-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    ;
+    ; NOSHARE-LABEL: name: stack-slot-color-after-sgpr-alloc
+    ; NOSHARE: liveins: $sgpr30, $sgpr31, $vgpr63
+    ; NOSHARE-NEXT: {{  $}}
+    ; NOSHARE-NEXT: renamable $vgpr2 = IMPLICIT_DEF
+    ; NOSHARE-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr30, 0, $vgpr63
+    ; NOSHARE-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr31, 1, $vgpr63
+    ; NOSHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr32, 0, killed $vgpr2
+    ; NOSHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr0, 1, killed $vgpr2
+    ; NOSHARE-NEXT: renamable $vgpr0_vgpr1 = IMPLICIT_DEF
+    ; NOSHARE-NEXT: renamable $vgpr0 = FLAT_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+    ; NOSHARE-NEXT: SI_SPILL_V32_SAVE killed $vgpr0, %stack.9, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
+    ; NOSHARE-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @func + 4, target-flags(amdgpu-rel32-hi) @func + 4, implicit-def dead $scc
+    ; NOSHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr4, 2, killed $vgpr2, implicit-def $sgpr4_sgpr5, implicit $sgpr4_sgpr5
+    ; NOSHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR killed $sgpr5, 3, killed $vgpr2, implicit killed $sgpr4_sgpr5
+    ; NOSHARE-NEXT: SI_SPILL_WWM_V32_SAVE $vgpr2, %stack.8, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5)
+    ; NOSHARE-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    ; NOSHARE-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 2, implicit-def $sgpr4_sgpr5
+    ; NOSHARE-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR killed $vgpr2, 3
+    ; NOSHARE-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit undef $vgpr0
+    ; NOSHARE-NEXT: renamable $vgpr1 = SI_SPILL_WWM_V32_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5)
+    ; NOSHARE-NEXT: $sgpr32 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 0
+    ; NOSHARE-NEXT: renamable $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr32, 4, killed $vgpr1
+    ; NOSHARE-NEXT: $sgpr0 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 1
+    ; NOSHARE-NEXT: renamable $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr2, 5, killed $vgpr1, implicit-def $sgpr2_sgpr3, implicit $sgpr2_sgpr3
+    ; NOSHARE-NEXT: renamable $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr3, 6, killed $vgpr1, implicit $sgpr2_sgpr3
+    ; NOSHARE-NEXT: SI_SPILL_WWM_V32_SAVE $vgpr1, %stack.8, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5)
+    ; NOSHARE-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    ; NOSHARE-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    ; NOSHARE-NEXT: $vgpr0 = SI_SPILL_V32_RESTORE %stack.9, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
+    ; NOSHARE-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 2, implicit-def $sgpr4_sgpr5
+    ; NOSHARE-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR killed $vgpr1, 3
+    ; NOSHARE-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit $vgpr0
+    ; NOSHARE-NEXT: renamable $vgpr0 = SI_SPILL_WWM_V32_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5)
+    ; NOSHARE-NEXT: $sgpr32 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 4
+    ; NOSHARE-NEXT: $sgpr2 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 5, implicit-def $sgpr2_sgpr3
+    ; NOSHARE-NEXT: $sgpr3 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 6
+    ; NOSHARE-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    %0:sreg_32_xm0 = COPY $sgpr32
+    %5:sreg_32 = COPY $sgpr0
+    %1:vreg_64 = IMPLICIT_DEF
+    %2:vgpr_32 = FLAT_LOAD_DWORD %1, 0, 0, implicit $exec, implicit $flat_scr
+    %3:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @func + 4, target-flags(amdgpu-rel32-hi) @func + 4, implicit-def dead $scc
+    ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    dead $sgpr30_sgpr31 = SI_CALL %3, @func, csr_amdgpu, implicit undef $vgpr0
+    $sgpr32 = COPY %0
+    %4:sreg_32_xm0 = COPY $sgpr32
+    $sgpr0 = COPY %5
+    %6:sreg_64 = COPY $sgpr2_sgpr3
+    ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    $vgpr0 = COPY %2
+    dead $sgpr30_sgpr31 = SI_CALL %3, @func, csr_amdgpu, implicit killed $vgpr0
+    $sgpr32 = COPY %4
+    $sgpr2_sgpr3 = COPY %6
+    ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+...
diff --git a/llvm/test/CodeGen/AMDGPU/stack-slot-color-after-sgpr-alloc-unequal-size-stack-objects.mir b/llvm/test/CodeGen/AMDGPU/stack-slot-color-after-sgpr-alloc-unequal-size-stack-objects.mir
new file mode 100755
index 0000000000000..e2f1d3fd0a0af
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/stack-slot-color-after-sgpr-alloc-unequal-size-stack-objects.mir
@@ -0,0 +1,123 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -stress-regalloc=3 -start-before=greedy -stop-before=prologepilog -o - %s | FileCheck -check-prefix=SHARE %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -stress-regalloc=3 -start-before=greedy -stop-before=prologepilog -no-stack-slot-sharing -o - %s | FileCheck -check-prefix=NOSHARE %s
+
+--- |
+
+  define void @stack-slot-color-after-sgpr-alloc(ptr addrspace(1) nocapture readnone %arg, ptr addrspace(1) noalias %arg1) {
+  bb:
+    %tmp = load i32, ptr addrspace(1) null, align 4
+    call void @func(i32 undef)
+    call void @func(i32 %tmp)
+    unreachable
+  }
+
+  declare void @func(i32)
+...
+
+
+---
+name:            stack-slot-color-after-sgpr-alloc
+tracksRegLiveness: true
+frameInfo:
+  adjustsStack:    true
+  hasCalls:        true
+machineFunctionInfo:
+  scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
+  frameOffsetReg: $sgpr32
+  stackPtrOffsetReg: $sgpr32
+body:             |
+  bb.0:
+    ; SHARE-LABEL: name: stack-slot-color-after-sgpr-alloc
+    ; SHARE: liveins: $sgpr30, $sgpr31, $vgpr63
+    ; SHARE-NEXT: {{  $}}
+    ; SHARE-NEXT: renamable $vgpr2 = IMPLICIT_DEF
+    ; SHARE-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr30, 0, $vgpr63
+    ; SHARE-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr31, 1, $vgpr63
+    ; SHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr32, 0, killed $vgpr2
+    ; SHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr2, 1, killed $vgpr2, implicit-def $sgpr2_sgpr3, implicit $sgpr2_sgpr3
+    ; SHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr3, 2, killed $vgpr2, implicit $sgpr2_sgpr3
+    ; SHARE-NEXT: renamable $vgpr0_vgpr1 = IMPLICIT_DEF
+    ; SHARE-NEXT: renamable $vgpr0 = FLAT_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+    ; SHARE-NEXT: SI_SPILL_V32_SAVE killed $vgpr0, %stack.9, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
+    ; SHARE-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @func + 4, target-flags(amdgpu-rel32-hi) @func + 4, implicit-def dead $scc
+    ; SHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr4, 3, killed $vgpr2, implicit-def $sgpr4_sgpr5, implicit $sgpr4_sgpr5
+    ; SHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR killed $sgpr5, 4, killed $vgpr2, implicit killed $sgpr4_sgpr5
+    ; SHARE-NEXT: SI_SPILL_WWM_V32_SAVE $vgpr2, %stack.8, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5)
+    ; SHARE-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    ; SHARE-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 3, implicit-def $sgpr4_sgpr5
+    ; SHARE-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR killed $vgpr2, 4
+    ; SHARE-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit undef $vgpr0
+    ; SHARE-NEXT: renamable $vgpr1 = SI_SPILL_WWM_V32_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5)
+    ; SHARE-NEXT: $sgpr32 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 0
+    ; SHARE-NEXT: $sgpr2 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 1, implicit-def $sgpr2_sgpr3
+    ; SHARE-NEXT: $sgpr3 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 2
+    ; SHARE-NEXT: renamable $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr2, 1, killed $vgpr1
+    ; SHARE-NEXT: SI_SPILL_WWM_V32_SAVE $vgpr1, %stack.8, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5)
+    ; SHARE-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    ; SHARE-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    ; SHARE-NEXT: $vgpr0 = SI_SPILL_V32_RESTORE %stack.9, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
+    ; SHARE-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 3, implicit-def $sgpr4_sgpr5
+    ; SHARE-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR killed $vgpr1, 4
+    ; SHARE-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit $vgpr0
+    ; SHARE-NEXT: renamable $vgpr0 = SI_SPILL_WWM_V32_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5)
+    ; SHARE-NEXT: $sgpr32 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0
+    ; SHARE-NEXT: $sgpr2 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 1
+    ; SHARE-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    ;
+    ; NOSHARE-LABEL: name: stack-slot-color-after-sgpr-alloc
+    ; NOSHARE: liveins: $sgpr30, $sgpr31, $vgpr63
+    ; NOSHARE-NEXT: {{  $}}
+    ; NOSHARE-NEXT: renamable $vgpr2 = IMPLICIT_DEF
+    ; NOSHARE-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr30, 0, $vgpr63
+    ; NOSHARE-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr31, 1, $vgpr63
+    ; NOSHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr32, 0, killed $vgpr2
+    ; NOSHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr2, 1, killed $vgpr2, implicit-def $sgpr2_sgpr3, implicit $sgpr2_sgpr3
+    ; NOSHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr3, 2, killed $vgpr2, implicit $sgpr2_sgpr3
+    ; NOSHARE-NEXT: renamable $vgpr0_vgpr1 = IMPLICIT_DEF
+    ; NOSHARE-NEXT: renamable $vgpr0 = FLAT_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+    ; NOSHARE-NEXT: SI_SPILL_V32_SAVE killed $vgpr0, %stack.9, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
+    ; NOSHARE-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @func + 4, target-flags(amdgpu-rel32-hi) @func + 4, implicit-def dead $scc
+    ; NOSHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr4, 3, killed $vgpr2, implicit-def $sgpr4_sgpr5, implicit $sgpr4_sgpr5
+    ; NOSHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR killed $sgpr5, 4, killed $vgpr2, implicit killed $sgpr4_sgpr5
+    ; NOSHARE-NEXT: SI_SPILL_WWM_V32_SAVE $vgpr2, %stack.8, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5)
+    ; NOSHARE-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    ; NOSHARE-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 3, implicit-def $sgpr4_sgpr5
+    ; NOSHARE-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR killed $vgpr2, 4
+    ; NOSHARE-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit undef $vgpr0
+    ; NOSHARE-NEXT: renamable $vgpr1 = SI_SPILL_WWM_V32_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5)
+    ; NOSHARE-NEXT: $sgpr32 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 0
+    ; NOSHARE-NEXT: renamable $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr32, 5, killed $vgpr1
+    ; NOSHARE-NEXT: $sgpr2 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 1, implicit-def $sgpr2_sgpr3
+    ; NOSHARE-NEXT: $sgpr3 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 2
+    ; NOSHARE-NEXT: renamable $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr2, 6, killed $vgpr1
+    ; NOSHARE-NEXT: SI_SPILL_WWM_V32_SAVE $vgpr1, %stack.8, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5)
+    ; NOSHARE-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    ; NOSHARE-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    ; NOSHARE-NEXT: $vgpr0 = SI_SPILL_V32_RESTORE %stack.9, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
+    ; NOSHARE-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 3, implicit-def $sgpr4_sgpr5
+    ; NOSHARE-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR killed $vgpr1, 4
+    ; NOSHARE-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit $vgpr0
+    ; NOSHARE-NEXT: renamable $vgpr0 = SI_SPILL_WWM_V32_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5)
+    ; NOSHARE-NEXT: $sgpr32 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 5
+    ; NOSHARE-NEXT: $sgpr2 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 6
+    ; NOSHARE-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    %0:sreg_32_xm0 = COPY $sgpr32
+    %5:sreg_64 = COPY $sgpr2_sgpr3
+    %1:vreg_64 = IMPLICIT_DEF
+    %2:vgpr_32 = FLAT_LOAD_DWORD %1, 0, 0, implicit $exec, implicit $flat_scr
+    %3:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @func + 4, target-flags(amdgpu-rel32-hi) @func + 4, implicit-def dead $scc
+    ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    dead $sgpr30_sgpr31 = SI_CALL %3, @func, csr_amdgpu, implicit undef $vgpr0
+    $sgpr32 = COPY %0
+    %4:sreg_32_xm0 = COPY $sgpr32
+    $sgpr2_sgpr3 = COPY %5
+    %6:sreg_32 = COPY $sgpr2
+    ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    $vgpr0 = COPY %2
+    dead $sgpr30_sgpr31 = SI_CALL %3, @func, csr_amdgpu, implicit killed $vgpr0
+    $sgpr32 = COPY %4
+    $sgpr2 = COPY %6
+    ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+...

>From 80639e5e9127b9edbc0bbb6c9f630b7a9834ed2a Mon Sep 17 00:00:00 2001
From: vg0204 <Vikash.Gupta at amd.com>
Date: Wed, 29 May 2024 16:17:02 +0530
Subject: [PATCH 2/9] Added configuration flag for StackSlotColoring &
 refactored comments.

---
 llvm/include/llvm/CodeGen/Passes.h            |  2 ++
 llvm/lib/CodeGen/StackSlotColoring.cpp        | 12 ++++++++++-
 .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp |  7 +++++--
 llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp  | 20 ++++---------------
 4 files changed, 22 insertions(+), 19 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/Passes.h b/llvm/include/llvm/CodeGen/Passes.h
index f850767270a4f..172583bc76fa8 100644
--- a/llvm/include/llvm/CodeGen/Passes.h
+++ b/llvm/include/llvm/CodeGen/Passes.h
@@ -371,6 +371,8 @@ namespace llvm {
 
   /// StackSlotColoring - This pass performs stack slot coloring.
   extern char &StackSlotColoringID;
+  FunctionPass *
+  createStackSlotColoring(bool preserveRegAllocNeededAnalysis = false);
 
   /// This pass lays out funclets contiguously.
   extern char &FuncletLayoutID;
diff --git a/llvm/lib/CodeGen/StackSlotColoring.cpp b/llvm/lib/CodeGen/StackSlotColoring.cpp
index eb7a113b575f7..4f9b3515bc733 100644
--- a/llvm/lib/CodeGen/StackSlotColoring.cpp
+++ b/llvm/lib/CodeGen/StackSlotColoring.cpp
@@ -67,6 +67,9 @@ namespace {
     const MachineBlockFrequencyInfo *MBFI = nullptr;
     SlotIndexes *Indexes = nullptr;
 
+    // - preserves Analysis passes in case RA may be called afterwards.
+    bool preserveRegAllocNeededAnalysis = false;
+
     // SSIntervals - Spill slot intervals.
     std::vector<LiveInterval*> SSIntervals;
 
@@ -142,7 +145,9 @@ namespace {
   public:
     static char ID; // Pass identification
 
-    StackSlotColoring() : MachineFunctionPass(ID) {
+    StackSlotColoring(bool preserveRegAllocNeededAnalysis_ = false)
+        : MachineFunctionPass(ID),
+          preserveRegAllocNeededAnalysis(preserveRegAllocNeededAnalysis_) {
       initializeStackSlotColoringPass(*PassRegistry::getPassRegistry());
     }
 
@@ -563,3 +568,8 @@ bool StackSlotColoring::runOnMachineFunction(MachineFunction &MF) {
 
   return Changed;
 }
+
+FunctionPass *
+llvm::createStackSlotColoring(bool preserveRegAllocNeededAnalysis) {
+  return new StackSlotColoring(preserveRegAllocNeededAnalysis);
+}
\ No newline at end of file
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 666b5452fa15c..deabd7a4175ae 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -1421,8 +1421,11 @@ bool GCNPassConfig::addRegAssignAndRewriteOptimized() {
   // since FastRegAlloc does the replacements itself.
   addPass(createVirtRegRewriter(false));
 
-  // Optimizes SGPR spills into VGPR lanes for non-interferring spill-ranges.
-  addPass(&StackSlotColoringID);
+  // As by this point SGPR's RA is done introducing SGPR spills to stack frame
+  // through SGPRAllocPass. So, invoking StackSlotColoring here, may allow these
+  // SGPR spills to re-use stack slots, before these spills is further lowered
+  // down via SILowerSGPRSpills(i.e. equivalent of PEI for SGPRs).
+  addPass(createStackSlotColoring(/*preserveRegAllocNeededAnalysis*/ true));
 
   // Equivalent of PEI for SGPRs.
   addPass(&SILowerSGPRSpillsID);
diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
index 9e121b47ad3fb..fa3aa9d7091b7 100644
--- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
@@ -52,8 +52,7 @@ class SILowerSGPRSpills : public MachineFunctionPass {
   void calculateSaveRestoreBlocks(MachineFunction &MF);
   bool spillCalleeSavedRegs(MachineFunction &MF,
                             SmallVectorImpl<int> &CalleeSavedFIs);
-  void extendWWMVirtRegLiveness(MachineFunction &MF, SlotIndexes *Indexes,
-                                LiveIntervals *LIS);
+  void extendWWMVirtRegLiveness(MachineFunction &MF, LiveIntervals *LIS);
 
   bool runOnMachineFunction(MachineFunction &MF) override;
 
@@ -261,7 +260,6 @@ bool SILowerSGPRSpills::spillCalleeSavedRegs(
 }
 
 void SILowerSGPRSpills::extendWWMVirtRegLiveness(MachineFunction &MF,
-                                                 SlotIndexes *Indexes,
                                                  LiveIntervals *LIS) {
   // TODO: This is a workaround to avoid the unmodelled liveness computed with
   // whole-wave virtual registers when allocated together with the regular VGPR
@@ -280,7 +278,6 @@ void SILowerSGPRSpills::extendWWMVirtRegLiveness(MachineFunction &MF,
   for (auto Reg : MFI->getSGPRSpillVGPRs()) {
     for (MachineBasicBlock *SaveBlock : SaveBlocks) {
       MachineBasicBlock::iterator InsertBefore = SaveBlock->begin();
-      MachineInstrSpan MIS(InsertBefore, SaveBlock);
 
       DebugLoc DL = SaveBlock->findDebugLoc(InsertBefore);
       auto MIB = BuildMI(*SaveBlock, InsertBefore, DL,
@@ -289,13 +286,8 @@ void SILowerSGPRSpills::extendWWMVirtRegLiveness(MachineFunction &MF,
       // Set SGPR_SPILL asm printer flag
       MIB->setAsmPrinterFlag(AMDGPU::SGPR_SPILL);
 
-      if (LIS) {
+      if (LIS)
         LIS->InsertMachineInstrInMaps(*MIB);
-      } else if (Indexes) {
-        assert(std::distance(MIS.begin(), InsertBefore) == 1);
-        MachineInstr &Inst = *std::prev(InsertBefore);
-        Indexes->insertMachineInstrInMaps(Inst);
-      }
     }
   }
 
@@ -310,12 +302,8 @@ void SILowerSGPRSpills::extendWWMVirtRegLiveness(MachineFunction &MF,
                          TII->get(TargetOpcode::KILL));
       MIB.addReg(Reg);
 
-      if (LIS) {
+      if (LIS)
         LIS->InsertMachineInstrInMaps(*MIB);
-      } else if (Indexes) {
-        MachineInstr &Inst = *std::prev(InsertBefore);
-        Indexes->insertMachineInstrInMaps(Inst);
-      }
     }
   }
 }
@@ -406,7 +394,7 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
     }
 
     if (SpilledToVirtVGPRLanes) {
-      extendWWMVirtRegLiveness(MF, Indexes, LIS);
+      extendWWMVirtRegLiveness(MF, LIS);
       if (LIS) {
         // Compute the LiveInterval for the newly created virtual registers.
         for (auto Reg : FuncInfo->getSGPRSpillVGPRs())

>From 83c7ac812e253c061edb3c0cace26ca631d79846 Mon Sep 17 00:00:00 2001
From: vg0204 <Vikash.Gupta at amd.com>
Date: Thu, 30 May 2024 13:52:15 +0530
Subject: [PATCH 3/9] Made the following suggested changes.

Raised another PR itself to deal with generic changes in
StackSlotColoring pass needed to resolve current ticket.

Merged multiple mir test cases into one to be concise.
---
 llvm/include/llvm/CodeGen/Passes.h            |   4 +-
 llvm/lib/CodeGen/StackSlotColoring.cpp        |  14 +-
 .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp |   2 +-
 ...er-sgpr-alloc-equal-size-stack-objects.mir | 127 -------
 ...gpr-alloc-unequal-size-stack-objects-2.mir | 122 ------
 ...-sgpr-alloc-unequal-size-stack-objects.mir | 123 ------
 .../stack-slot-color-after-sgpr-regalloc.mir  | 353 ++++++++++++++++++
 7 files changed, 356 insertions(+), 389 deletions(-)
 delete mode 100755 llvm/test/CodeGen/AMDGPU/stack-slot-color-after-sgpr-alloc-equal-size-stack-objects.mir
 delete mode 100755 llvm/test/CodeGen/AMDGPU/stack-slot-color-after-sgpr-alloc-unequal-size-stack-objects-2.mir
 delete mode 100755 llvm/test/CodeGen/AMDGPU/stack-slot-color-after-sgpr-alloc-unequal-size-stack-objects.mir
 create mode 100644 llvm/test/CodeGen/AMDGPU/stack-slot-color-after-sgpr-regalloc.mir

diff --git a/llvm/include/llvm/CodeGen/Passes.h b/llvm/include/llvm/CodeGen/Passes.h
index 172583bc76fa8..0e0aa8a7f926d 100644
--- a/llvm/include/llvm/CodeGen/Passes.h
+++ b/llvm/include/llvm/CodeGen/Passes.h
@@ -283,7 +283,7 @@ namespace llvm {
   /// critical-path and resource depth.
   extern char &MachineCombinerID;
 
-  /// StackSlotColoring - This pass performs stack coloring and merging.
+  /// StackColoring - This pass performs stack coloring and merging.
   /// It merges disjoint allocas to reduce the stack size.
   extern char &StackColoringID;
 
@@ -371,8 +371,6 @@ namespace llvm {
 
   /// StackSlotColoring - This pass performs stack slot coloring.
   extern char &StackSlotColoringID;
-  FunctionPass *
-  createStackSlotColoring(bool preserveRegAllocNeededAnalysis = false);
 
   /// This pass lays out funclets contiguously.
   extern char &FuncletLayoutID;
diff --git a/llvm/lib/CodeGen/StackSlotColoring.cpp b/llvm/lib/CodeGen/StackSlotColoring.cpp
index 4f9b3515bc733..2624d82f93fba 100644
--- a/llvm/lib/CodeGen/StackSlotColoring.cpp
+++ b/llvm/lib/CodeGen/StackSlotColoring.cpp
@@ -65,10 +65,6 @@ namespace {
     MachineFrameInfo *MFI = nullptr;
     const TargetInstrInfo *TII = nullptr;
     const MachineBlockFrequencyInfo *MBFI = nullptr;
-    SlotIndexes *Indexes = nullptr;
-
-    // - preserves Analysis passes in case RA may be called afterwards.
-    bool preserveRegAllocNeededAnalysis = false;
 
     // SSIntervals - Spill slot intervals.
     std::vector<LiveInterval*> SSIntervals;
@@ -145,9 +141,7 @@ namespace {
   public:
     static char ID; // Pass identification
 
-    StackSlotColoring(bool preserveRegAllocNeededAnalysis_ = false)
-        : MachineFunctionPass(ID),
-          preserveRegAllocNeededAnalysis(preserveRegAllocNeededAnalysis_) {
+    StackSlotColoring() : MachineFunctionPass(ID) {
       initializeStackSlotColoringPass(*PassRegistry::getPassRegistry());
     }
 
@@ -533,7 +527,6 @@ bool StackSlotColoring::runOnMachineFunction(MachineFunction &MF) {
   TII = MF.getSubtarget().getInstrInfo();
   LS = &getAnalysis<LiveStacks>();
   MBFI = &getAnalysis<MachineBlockFrequencyInfo>();
-  Indexes = &getAnalysis<SlotIndexes>();
 
   bool Changed = false;
 
@@ -567,9 +560,4 @@ bool StackSlotColoring::runOnMachineFunction(MachineFunction &MF) {
   Assignments.clear();
 
   return Changed;
-}
-
-FunctionPass *
-llvm::createStackSlotColoring(bool preserveRegAllocNeededAnalysis) {
-  return new StackSlotColoring(preserveRegAllocNeededAnalysis);
 }
\ No newline at end of file
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index deabd7a4175ae..eac6f639a6cf3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -1425,7 +1425,7 @@ bool GCNPassConfig::addRegAssignAndRewriteOptimized() {
   // through SGPRAllocPass. So, invoking StackSlotColoring here, may allow these
   // SGPR spills to re-use stack slots, before these spills is further lowered
   // down via SILowerSGPRSpills(i.e. equivalent of PEI for SGPRs).
-  addPass(createStackSlotColoring(/*preserveRegAllocNeededAnalysis*/ true));
+  addPass(&StackSlotColoringID);
 
   // Equivalent of PEI for SGPRs.
   addPass(&SILowerSGPRSpillsID);
diff --git a/llvm/test/CodeGen/AMDGPU/stack-slot-color-after-sgpr-alloc-equal-size-stack-objects.mir b/llvm/test/CodeGen/AMDGPU/stack-slot-color-after-sgpr-alloc-equal-size-stack-objects.mir
deleted file mode 100755
index e8651cd6944d1..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/stack-slot-color-after-sgpr-alloc-equal-size-stack-objects.mir
+++ /dev/null
@@ -1,127 +0,0 @@
-# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -stress-regalloc=3 -start-before=greedy -stop-before=prologepilog -o - %s | FileCheck -check-prefix=SHARE %s
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -stress-regalloc=3 -start-before=greedy -stop-before=prologepilog -no-stack-slot-sharing -o - %s | FileCheck -check-prefix=NOSHARE %s
-
---- |
-
-  define void @stack-slot-color-after-sgpr-alloc(ptr addrspace(1) nocapture readnone %arg, ptr addrspace(1) noalias %arg1) {
-  bb:
-    %tmp = load i32, ptr addrspace(1) null, align 4
-    call void @func(i32 undef)
-    call void @func(i32 %tmp)
-    unreachable
-  }
-
-  declare void @func(i32)
-...
-
-
----
-name:            stack-slot-color-after-sgpr-alloc
-tracksRegLiveness: true
-frameInfo:
-  adjustsStack:    true
-  hasCalls:        true
-machineFunctionInfo:
-  scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  frameOffsetReg: $sgpr32
-  stackPtrOffsetReg: $sgpr32
-body:             |
-  bb.0:
-    ; SHARE-LABEL: name: stack-slot-color-after-sgpr-alloc
-    ; SHARE: liveins: $sgpr30, $sgpr31, $vgpr63
-    ; SHARE-NEXT: {{  $}}
-    ; SHARE-NEXT: renamable $vgpr2 = IMPLICIT_DEF
-    ; SHARE-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr30, 0, $vgpr63
-    ; SHARE-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr31, 1, $vgpr63
-    ; SHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr32, 0, killed $vgpr2
-    ; SHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr0, 1, killed $vgpr2, implicit-def $sgpr0_sgpr1, implicit $sgpr0_sgpr1
-    ; SHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr1, 2, killed $vgpr2, implicit $sgpr0_sgpr1
-    ; SHARE-NEXT: renamable $vgpr0_vgpr1 = IMPLICIT_DEF
-    ; SHARE-NEXT: renamable $vgpr0 = FLAT_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
-    ; SHARE-NEXT: SI_SPILL_V32_SAVE killed $vgpr0, %stack.9, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
-    ; SHARE-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @func + 4, target-flags(amdgpu-rel32-hi) @func + 4, implicit-def dead $scc
-    ; SHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr4, 3, killed $vgpr2, implicit-def $sgpr4_sgpr5, implicit $sgpr4_sgpr5
-    ; SHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR killed $sgpr5, 4, killed $vgpr2, implicit killed $sgpr4_sgpr5
-    ; SHARE-NEXT: SI_SPILL_WWM_V32_SAVE $vgpr2, %stack.8, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5)
-    ; SHARE-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    ; SHARE-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 3, implicit-def $sgpr4_sgpr5
-    ; SHARE-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR killed $vgpr2, 4
-    ; SHARE-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit undef $vgpr0
-    ; SHARE-NEXT: renamable $vgpr1 = SI_SPILL_WWM_V32_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5)
-    ; SHARE-NEXT: $sgpr32 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 0
-    ; SHARE-NEXT: $sgpr0 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 1, implicit-def $sgpr0_sgpr1
-    ; SHARE-NEXT: $sgpr1 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 2
-    ; SHARE-NEXT: renamable $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr2, 1, killed $vgpr1, implicit-def $sgpr2_sgpr3, implicit $sgpr2_sgpr3
-    ; SHARE-NEXT: renamable $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr3, 2, killed $vgpr1, implicit $sgpr2_sgpr3
-    ; SHARE-NEXT: SI_SPILL_WWM_V32_SAVE $vgpr1, %stack.8, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5)
-    ; SHARE-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    ; SHARE-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    ; SHARE-NEXT: $vgpr0 = SI_SPILL_V32_RESTORE %stack.9, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
-    ; SHARE-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 3, implicit-def $sgpr4_sgpr5
-    ; SHARE-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR killed $vgpr1, 4
-    ; SHARE-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit $vgpr0
-    ; SHARE-NEXT: renamable $vgpr0 = SI_SPILL_WWM_V32_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5)
-    ; SHARE-NEXT: $sgpr32 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0
-    ; SHARE-NEXT: $sgpr2 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 1, implicit-def $sgpr2_sgpr3
-    ; SHARE-NEXT: $sgpr3 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 2
-    ; SHARE-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    ;
-    ; NOSHARE-LABEL: name: stack-slot-color-after-sgpr-alloc
-    ; NOSHARE: liveins: $sgpr30, $sgpr31, $vgpr63
-    ; NOSHARE-NEXT: {{  $}}
-    ; NOSHARE-NEXT: renamable $vgpr2 = IMPLICIT_DEF
-    ; NOSHARE-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr30, 0, $vgpr63
-    ; NOSHARE-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr31, 1, $vgpr63
-    ; NOSHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr32, 0, killed $vgpr2
-    ; NOSHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr0, 1, killed $vgpr2, implicit-def $sgpr0_sgpr1, implicit $sgpr0_sgpr1
-    ; NOSHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr1, 2, killed $vgpr2, implicit $sgpr0_sgpr1
-    ; NOSHARE-NEXT: renamable $vgpr0_vgpr1 = IMPLICIT_DEF
-    ; NOSHARE-NEXT: renamable $vgpr0 = FLAT_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
-    ; NOSHARE-NEXT: SI_SPILL_V32_SAVE killed $vgpr0, %stack.9, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
-    ; NOSHARE-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @func + 4, target-flags(amdgpu-rel32-hi) @func + 4, implicit-def dead $scc
-    ; NOSHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr4, 3, killed $vgpr2, implicit-def $sgpr4_sgpr5, implicit $sgpr4_sgpr5
-    ; NOSHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR killed $sgpr5, 4, killed $vgpr2, implicit killed $sgpr4_sgpr5
-    ; NOSHARE-NEXT: SI_SPILL_WWM_V32_SAVE $vgpr2, %stack.8, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5)
-    ; NOSHARE-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    ; NOSHARE-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 3, implicit-def $sgpr4_sgpr5
-    ; NOSHARE-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR killed $vgpr2, 4
-    ; NOSHARE-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit undef $vgpr0
-    ; NOSHARE-NEXT: renamable $vgpr1 = SI_SPILL_WWM_V32_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5)
-    ; NOSHARE-NEXT: $sgpr32 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 0
-    ; NOSHARE-NEXT: renamable $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr32, 5, killed $vgpr1
-    ; NOSHARE-NEXT: $sgpr0 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 1, implicit-def $sgpr0_sgpr1
-    ; NOSHARE-NEXT: $sgpr1 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 2
-    ; NOSHARE-NEXT: renamable $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr2, 6, killed $vgpr1, implicit-def $sgpr2_sgpr3, implicit $sgpr2_sgpr3
-    ; NOSHARE-NEXT: renamable $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr3, 7, killed $vgpr1, implicit $sgpr2_sgpr3
-    ; NOSHARE-NEXT: SI_SPILL_WWM_V32_SAVE $vgpr1, %stack.8, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5)
-    ; NOSHARE-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    ; NOSHARE-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    ; NOSHARE-NEXT: $vgpr0 = SI_SPILL_V32_RESTORE %stack.9, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
-    ; NOSHARE-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 3, implicit-def $sgpr4_sgpr5
-    ; NOSHARE-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR killed $vgpr1, 4
-    ; NOSHARE-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit $vgpr0
-    ; NOSHARE-NEXT: renamable $vgpr0 = SI_SPILL_WWM_V32_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5)
-    ; NOSHARE-NEXT: $sgpr32 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 5
-    ; NOSHARE-NEXT: $sgpr2 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 6, implicit-def $sgpr2_sgpr3
-    ; NOSHARE-NEXT: $sgpr3 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 7
-    ; NOSHARE-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    %0:sreg_32_xm0 = COPY $sgpr32
-    %5:sreg_64 = COPY $sgpr0_sgpr1
-    %1:vreg_64 = IMPLICIT_DEF
-    %2:vgpr_32 = FLAT_LOAD_DWORD %1, 0, 0, implicit $exec, implicit $flat_scr
-    %3:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @func + 4, target-flags(amdgpu-rel32-hi) @func + 4, implicit-def dead $scc
-    ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    dead $sgpr30_sgpr31 = SI_CALL %3, @func, csr_amdgpu, implicit undef $vgpr0
-    $sgpr32 = COPY %0
-    %4:sreg_32_xm0 = COPY $sgpr32
-    $sgpr0_sgpr1 = COPY %5
-    %6:sreg_64 = COPY $sgpr2_sgpr3
-    ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    $vgpr0 = COPY %2
-    dead $sgpr30_sgpr31 = SI_CALL %3, @func, csr_amdgpu, implicit killed $vgpr0
-    $sgpr32 = COPY %4
-    $sgpr2_sgpr3 = COPY %6
-    ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-...
diff --git a/llvm/test/CodeGen/AMDGPU/stack-slot-color-after-sgpr-alloc-unequal-size-stack-objects-2.mir b/llvm/test/CodeGen/AMDGPU/stack-slot-color-after-sgpr-alloc-unequal-size-stack-objects-2.mir
deleted file mode 100755
index f20dee490a83c..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/stack-slot-color-after-sgpr-alloc-unequal-size-stack-objects-2.mir
+++ /dev/null
@@ -1,122 +0,0 @@
-# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -stress-regalloc=3 -start-before=greedy -stop-before=prologepilog -o - %s | FileCheck -check-prefix=SHARE %s
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -stress-regalloc=3 -start-before=greedy -stop-before=prologepilog -no-stack-slot-sharing -o - %s | FileCheck -check-prefix=NOSHARE %s
-
---- |
-
-  define void @stack-slot-color-after-sgpr-alloc(ptr addrspace(1) nocapture readnone %arg, ptr addrspace(1) noalias %arg1) {
-  bb:
-    %tmp = load i32, ptr addrspace(1) null, align 4
-    call void @func(i32 undef)
-    call void @func(i32 %tmp)
-    unreachable
-  }
-
-  declare void @func(i32)
-...
-
----
-name:            stack-slot-color-after-sgpr-alloc
-tracksRegLiveness: true
-frameInfo:
-  adjustsStack:    true
-  hasCalls:        true
-machineFunctionInfo:
-  scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  frameOffsetReg: $sgpr32
-  stackPtrOffsetReg: $sgpr32
-body:             |
-  bb.0:
-    ; SHARE-LABEL: name: stack-slot-color-after-sgpr-alloc
-    ; SHARE: liveins: $sgpr30, $sgpr31, $vgpr63
-    ; SHARE-NEXT: {{  $}}
-    ; SHARE-NEXT: renamable $vgpr2 = IMPLICIT_DEF
-    ; SHARE-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr30, 0, $vgpr63
-    ; SHARE-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr31, 1, $vgpr63
-    ; SHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr32, 0, killed $vgpr2
-    ; SHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr0, 1, killed $vgpr2
-    ; SHARE-NEXT: renamable $vgpr0_vgpr1 = IMPLICIT_DEF
-    ; SHARE-NEXT: renamable $vgpr0 = FLAT_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
-    ; SHARE-NEXT: SI_SPILL_V32_SAVE killed $vgpr0, %stack.9, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
-    ; SHARE-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @func + 4, target-flags(amdgpu-rel32-hi) @func + 4, implicit-def dead $scc
-    ; SHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr4, 3, killed $vgpr2, implicit-def $sgpr4_sgpr5, implicit $sgpr4_sgpr5
-    ; SHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR killed $sgpr5, 4, killed $vgpr2, implicit killed $sgpr4_sgpr5
-    ; SHARE-NEXT: SI_SPILL_WWM_V32_SAVE $vgpr2, %stack.8, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5)
-    ; SHARE-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    ; SHARE-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 3, implicit-def $sgpr4_sgpr5
-    ; SHARE-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR killed $vgpr2, 4
-    ; SHARE-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit undef $vgpr0
-    ; SHARE-NEXT: renamable $vgpr1 = SI_SPILL_WWM_V32_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5)
-    ; SHARE-NEXT: $sgpr32 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 0
-    ; SHARE-NEXT: $sgpr0 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 1
-    ; SHARE-NEXT: renamable $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr2, 1, killed $vgpr1, implicit-def $sgpr2_sgpr3, implicit $sgpr2_sgpr3
-    ; SHARE-NEXT: renamable $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr3, 2, killed $vgpr1, implicit $sgpr2_sgpr3
-    ; SHARE-NEXT: SI_SPILL_WWM_V32_SAVE $vgpr1, %stack.8, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5)
-    ; SHARE-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    ; SHARE-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    ; SHARE-NEXT: $vgpr0 = SI_SPILL_V32_RESTORE %stack.9, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
-    ; SHARE-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 3, implicit-def $sgpr4_sgpr5
-    ; SHARE-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR killed $vgpr1, 4
-    ; SHARE-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit $vgpr0
-    ; SHARE-NEXT: renamable $vgpr0 = SI_SPILL_WWM_V32_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5)
-    ; SHARE-NEXT: $sgpr32 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0
-    ; SHARE-NEXT: $sgpr2 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 1, implicit-def $sgpr2_sgpr3
-    ; SHARE-NEXT: $sgpr3 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 2
-    ; SHARE-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    ;
-    ; NOSHARE-LABEL: name: stack-slot-color-after-sgpr-alloc
-    ; NOSHARE: liveins: $sgpr30, $sgpr31, $vgpr63
-    ; NOSHARE-NEXT: {{  $}}
-    ; NOSHARE-NEXT: renamable $vgpr2 = IMPLICIT_DEF
-    ; NOSHARE-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr30, 0, $vgpr63
-    ; NOSHARE-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr31, 1, $vgpr63
-    ; NOSHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr32, 0, killed $vgpr2
-    ; NOSHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr0, 1, killed $vgpr2
-    ; NOSHARE-NEXT: renamable $vgpr0_vgpr1 = IMPLICIT_DEF
-    ; NOSHARE-NEXT: renamable $vgpr0 = FLAT_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
-    ; NOSHARE-NEXT: SI_SPILL_V32_SAVE killed $vgpr0, %stack.9, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
-    ; NOSHARE-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @func + 4, target-flags(amdgpu-rel32-hi) @func + 4, implicit-def dead $scc
-    ; NOSHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr4, 2, killed $vgpr2, implicit-def $sgpr4_sgpr5, implicit $sgpr4_sgpr5
-    ; NOSHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR killed $sgpr5, 3, killed $vgpr2, implicit killed $sgpr4_sgpr5
-    ; NOSHARE-NEXT: SI_SPILL_WWM_V32_SAVE $vgpr2, %stack.8, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5)
-    ; NOSHARE-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    ; NOSHARE-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 2, implicit-def $sgpr4_sgpr5
-    ; NOSHARE-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR killed $vgpr2, 3
-    ; NOSHARE-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit undef $vgpr0
-    ; NOSHARE-NEXT: renamable $vgpr1 = SI_SPILL_WWM_V32_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5)
-    ; NOSHARE-NEXT: $sgpr32 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 0
-    ; NOSHARE-NEXT: renamable $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr32, 4, killed $vgpr1
-    ; NOSHARE-NEXT: $sgpr0 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 1
-    ; NOSHARE-NEXT: renamable $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr2, 5, killed $vgpr1, implicit-def $sgpr2_sgpr3, implicit $sgpr2_sgpr3
-    ; NOSHARE-NEXT: renamable $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr3, 6, killed $vgpr1, implicit $sgpr2_sgpr3
-    ; NOSHARE-NEXT: SI_SPILL_WWM_V32_SAVE $vgpr1, %stack.8, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5)
-    ; NOSHARE-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    ; NOSHARE-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    ; NOSHARE-NEXT: $vgpr0 = SI_SPILL_V32_RESTORE %stack.9, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
-    ; NOSHARE-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 2, implicit-def $sgpr4_sgpr5
-    ; NOSHARE-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR killed $vgpr1, 3
-    ; NOSHARE-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit $vgpr0
-    ; NOSHARE-NEXT: renamable $vgpr0 = SI_SPILL_WWM_V32_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5)
-    ; NOSHARE-NEXT: $sgpr32 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 4
-    ; NOSHARE-NEXT: $sgpr2 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 5, implicit-def $sgpr2_sgpr3
-    ; NOSHARE-NEXT: $sgpr3 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 6
-    ; NOSHARE-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    %0:sreg_32_xm0 = COPY $sgpr32
-    %5:sreg_32 = COPY $sgpr0
-    %1:vreg_64 = IMPLICIT_DEF
-    %2:vgpr_32 = FLAT_LOAD_DWORD %1, 0, 0, implicit $exec, implicit $flat_scr
-    %3:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @func + 4, target-flags(amdgpu-rel32-hi) @func + 4, implicit-def dead $scc
-    ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    dead $sgpr30_sgpr31 = SI_CALL %3, @func, csr_amdgpu, implicit undef $vgpr0
-    $sgpr32 = COPY %0
-    %4:sreg_32_xm0 = COPY $sgpr32
-    $sgpr0 = COPY %5
-    %6:sreg_64 = COPY $sgpr2_sgpr3
-    ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    $vgpr0 = COPY %2
-    dead $sgpr30_sgpr31 = SI_CALL %3, @func, csr_amdgpu, implicit killed $vgpr0
-    $sgpr32 = COPY %4
-    $sgpr2_sgpr3 = COPY %6
-    ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-...
diff --git a/llvm/test/CodeGen/AMDGPU/stack-slot-color-after-sgpr-alloc-unequal-size-stack-objects.mir b/llvm/test/CodeGen/AMDGPU/stack-slot-color-after-sgpr-alloc-unequal-size-stack-objects.mir
deleted file mode 100755
index e2f1d3fd0a0af..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/stack-slot-color-after-sgpr-alloc-unequal-size-stack-objects.mir
+++ /dev/null
@@ -1,123 +0,0 @@
-# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -stress-regalloc=3 -start-before=greedy -stop-before=prologepilog -o - %s | FileCheck -check-prefix=SHARE %s
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -stress-regalloc=3 -start-before=greedy -stop-before=prologepilog -no-stack-slot-sharing -o - %s | FileCheck -check-prefix=NOSHARE %s
-
---- |
-
-  define void @stack-slot-color-after-sgpr-alloc(ptr addrspace(1) nocapture readnone %arg, ptr addrspace(1) noalias %arg1) {
-  bb:
-    %tmp = load i32, ptr addrspace(1) null, align 4
-    call void @func(i32 undef)
-    call void @func(i32 %tmp)
-    unreachable
-  }
-
-  declare void @func(i32)
-...
-
-
----
-name:            stack-slot-color-after-sgpr-alloc
-tracksRegLiveness: true
-frameInfo:
-  adjustsStack:    true
-  hasCalls:        true
-machineFunctionInfo:
-  scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  frameOffsetReg: $sgpr32
-  stackPtrOffsetReg: $sgpr32
-body:             |
-  bb.0:
-    ; SHARE-LABEL: name: stack-slot-color-after-sgpr-alloc
-    ; SHARE: liveins: $sgpr30, $sgpr31, $vgpr63
-    ; SHARE-NEXT: {{  $}}
-    ; SHARE-NEXT: renamable $vgpr2 = IMPLICIT_DEF
-    ; SHARE-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr30, 0, $vgpr63
-    ; SHARE-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr31, 1, $vgpr63
-    ; SHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr32, 0, killed $vgpr2
-    ; SHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr2, 1, killed $vgpr2, implicit-def $sgpr2_sgpr3, implicit $sgpr2_sgpr3
-    ; SHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr3, 2, killed $vgpr2, implicit $sgpr2_sgpr3
-    ; SHARE-NEXT: renamable $vgpr0_vgpr1 = IMPLICIT_DEF
-    ; SHARE-NEXT: renamable $vgpr0 = FLAT_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
-    ; SHARE-NEXT: SI_SPILL_V32_SAVE killed $vgpr0, %stack.9, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
-    ; SHARE-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @func + 4, target-flags(amdgpu-rel32-hi) @func + 4, implicit-def dead $scc
-    ; SHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr4, 3, killed $vgpr2, implicit-def $sgpr4_sgpr5, implicit $sgpr4_sgpr5
-    ; SHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR killed $sgpr5, 4, killed $vgpr2, implicit killed $sgpr4_sgpr5
-    ; SHARE-NEXT: SI_SPILL_WWM_V32_SAVE $vgpr2, %stack.8, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5)
-    ; SHARE-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    ; SHARE-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 3, implicit-def $sgpr4_sgpr5
-    ; SHARE-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR killed $vgpr2, 4
-    ; SHARE-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit undef $vgpr0
-    ; SHARE-NEXT: renamable $vgpr1 = SI_SPILL_WWM_V32_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5)
-    ; SHARE-NEXT: $sgpr32 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 0
-    ; SHARE-NEXT: $sgpr2 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 1, implicit-def $sgpr2_sgpr3
-    ; SHARE-NEXT: $sgpr3 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 2
-    ; SHARE-NEXT: renamable $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr2, 1, killed $vgpr1
-    ; SHARE-NEXT: SI_SPILL_WWM_V32_SAVE $vgpr1, %stack.8, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5)
-    ; SHARE-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    ; SHARE-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    ; SHARE-NEXT: $vgpr0 = SI_SPILL_V32_RESTORE %stack.9, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
-    ; SHARE-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 3, implicit-def $sgpr4_sgpr5
-    ; SHARE-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR killed $vgpr1, 4
-    ; SHARE-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit $vgpr0
-    ; SHARE-NEXT: renamable $vgpr0 = SI_SPILL_WWM_V32_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5)
-    ; SHARE-NEXT: $sgpr32 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0
-    ; SHARE-NEXT: $sgpr2 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 1
-    ; SHARE-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    ;
-    ; NOSHARE-LABEL: name: stack-slot-color-after-sgpr-alloc
-    ; NOSHARE: liveins: $sgpr30, $sgpr31, $vgpr63
-    ; NOSHARE-NEXT: {{  $}}
-    ; NOSHARE-NEXT: renamable $vgpr2 = IMPLICIT_DEF
-    ; NOSHARE-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr30, 0, $vgpr63
-    ; NOSHARE-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr31, 1, $vgpr63
-    ; NOSHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr32, 0, killed $vgpr2
-    ; NOSHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr2, 1, killed $vgpr2, implicit-def $sgpr2_sgpr3, implicit $sgpr2_sgpr3
-    ; NOSHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr3, 2, killed $vgpr2, implicit $sgpr2_sgpr3
-    ; NOSHARE-NEXT: renamable $vgpr0_vgpr1 = IMPLICIT_DEF
-    ; NOSHARE-NEXT: renamable $vgpr0 = FLAT_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
-    ; NOSHARE-NEXT: SI_SPILL_V32_SAVE killed $vgpr0, %stack.9, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
-    ; NOSHARE-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @func + 4, target-flags(amdgpu-rel32-hi) @func + 4, implicit-def dead $scc
-    ; NOSHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr4, 3, killed $vgpr2, implicit-def $sgpr4_sgpr5, implicit $sgpr4_sgpr5
-    ; NOSHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR killed $sgpr5, 4, killed $vgpr2, implicit killed $sgpr4_sgpr5
-    ; NOSHARE-NEXT: SI_SPILL_WWM_V32_SAVE $vgpr2, %stack.8, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5)
-    ; NOSHARE-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    ; NOSHARE-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 3, implicit-def $sgpr4_sgpr5
-    ; NOSHARE-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR killed $vgpr2, 4
-    ; NOSHARE-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit undef $vgpr0
-    ; NOSHARE-NEXT: renamable $vgpr1 = SI_SPILL_WWM_V32_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5)
-    ; NOSHARE-NEXT: $sgpr32 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 0
-    ; NOSHARE-NEXT: renamable $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr32, 5, killed $vgpr1
-    ; NOSHARE-NEXT: $sgpr2 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 1, implicit-def $sgpr2_sgpr3
-    ; NOSHARE-NEXT: $sgpr3 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 2
-    ; NOSHARE-NEXT: renamable $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr2, 6, killed $vgpr1
-    ; NOSHARE-NEXT: SI_SPILL_WWM_V32_SAVE $vgpr1, %stack.8, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5)
-    ; NOSHARE-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    ; NOSHARE-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    ; NOSHARE-NEXT: $vgpr0 = SI_SPILL_V32_RESTORE %stack.9, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
-    ; NOSHARE-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 3, implicit-def $sgpr4_sgpr5
-    ; NOSHARE-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR killed $vgpr1, 4
-    ; NOSHARE-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit $vgpr0
-    ; NOSHARE-NEXT: renamable $vgpr0 = SI_SPILL_WWM_V32_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5)
-    ; NOSHARE-NEXT: $sgpr32 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 5
-    ; NOSHARE-NEXT: $sgpr2 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 6
-    ; NOSHARE-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    %0:sreg_32_xm0 = COPY $sgpr32
-    %5:sreg_64 = COPY $sgpr2_sgpr3
-    %1:vreg_64 = IMPLICIT_DEF
-    %2:vgpr_32 = FLAT_LOAD_DWORD %1, 0, 0, implicit $exec, implicit $flat_scr
-    %3:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @func + 4, target-flags(amdgpu-rel32-hi) @func + 4, implicit-def dead $scc
-    ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    dead $sgpr30_sgpr31 = SI_CALL %3, @func, csr_amdgpu, implicit undef $vgpr0
-    $sgpr32 = COPY %0
-    %4:sreg_32_xm0 = COPY $sgpr32
-    $sgpr2_sgpr3 = COPY %5
-    %6:sreg_32 = COPY $sgpr2
-    ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    $vgpr0 = COPY %2
-    dead $sgpr30_sgpr31 = SI_CALL %3, @func, csr_amdgpu, implicit killed $vgpr0
-    $sgpr32 = COPY %4
-    $sgpr2 = COPY %6
-    ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-...
diff --git a/llvm/test/CodeGen/AMDGPU/stack-slot-color-after-sgpr-regalloc.mir b/llvm/test/CodeGen/AMDGPU/stack-slot-color-after-sgpr-regalloc.mir
new file mode 100644
index 0000000000000..cba97b0572fdf
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/stack-slot-color-after-sgpr-regalloc.mir
@@ -0,0 +1,353 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -verify-machineinstrs -stress-regalloc=3 -start-before=greedy -stop-before=prologepilog -o - %s | FileCheck -check-prefix=SHARE %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -verify-machineinstrs -stress-regalloc=3 -start-before=greedy -stop-before=prologepilog -no-stack-slot-sharing -o - %s | FileCheck -check-prefix=NOSHARE %s
+
+--- |
+  define void @stack-slot-color-after-sgpr-alloc-1(ptr addrspace(1) nocapture readnone %arg, ptr addrspace(1) noalias %arg1) {
+  bb:
+    %tmp = load i32, ptr addrspace(1) null, align 4
+    call void @func(i32 undef)
+    call void @func(i32 %tmp)
+    unreachable
+  }
+
+  define void @stack-slot-color-after-sgpr-alloc-2(ptr addrspace(1) nocapture readnone %arg, ptr addrspace(1) noalias %arg1) {
+  bb:
+    %tmp = load i32, ptr addrspace(1) null, align 4
+    call void @func(i32 undef)
+    call void @func(i32 %tmp)
+    unreachable
+  }
+
+  define void @stack-slot-color-after-sgpr-alloc-3(ptr addrspace(1) nocapture readnone %arg, ptr addrspace(1) noalias %arg1) {
+  bb:
+    %tmp = load i32, ptr addrspace(1) null, align 4
+    call void @func(i32 undef)
+    call void @func(i32 %tmp)
+    unreachable
+  }
+
+  declare void @func(i32)
+...
+
+---
+name:            stack-slot-color-after-sgpr-alloc-1
+tracksRegLiveness: true
+frameInfo:
+  adjustsStack:    true
+  hasCalls:        true
+machineFunctionInfo:
+  scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
+  frameOffsetReg: $sgpr32
+  stackPtrOffsetReg: $sgpr32
+body:             |
+  bb.0:
+    ; SHARE-LABEL: name: stack-slot-color-after-sgpr-alloc-1
+    ; SHARE: liveins: $sgpr30, $sgpr31, $vgpr62
+    ; SHARE-NEXT: {{  $}}
+    ; SHARE-NEXT: renamable $vgpr2 = IMPLICIT_DEF
+    ; SHARE-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR killed $sgpr30, 0, $vgpr62
+    ; SHARE-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR killed $sgpr31, 1, $vgpr62
+    ; SHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr32, 0, killed $vgpr2
+    ; SHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr0, 1, killed $vgpr2, implicit-def $sgpr0_sgpr1, implicit $sgpr0_sgpr1
+    ; SHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr1, 2, killed $vgpr2, implicit $sgpr0_sgpr1
+    ; SHARE-NEXT: renamable $vgpr0_vgpr1 = IMPLICIT_DEF
+    ; SHARE-NEXT: renamable $vgpr0 = FLAT_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+    ; SHARE-NEXT: SI_SPILL_V32_SAVE killed $vgpr0, %stack.9, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
+    ; SHARE-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @func + 4, target-flags(amdgpu-rel32-hi) @func + 4, implicit-def dead $scc
+    ; SHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr4, 3, killed $vgpr2, implicit-def $sgpr4_sgpr5, implicit $sgpr4_sgpr5
+    ; SHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR killed $sgpr5, 4, killed $vgpr2, implicit killed $sgpr4_sgpr5
+    ; SHARE-NEXT: SI_SPILL_WWM_V32_SAVE $vgpr2, %stack.8, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5)
+    ; SHARE-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    ; SHARE-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 3, implicit-def $sgpr4_sgpr5
+    ; SHARE-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR killed $vgpr2, 4
+    ; SHARE-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit undef $vgpr0
+    ; SHARE-NEXT: renamable $vgpr1 = SI_SPILL_WWM_AV32_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5)
+    ; SHARE-NEXT: $sgpr32 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 0
+    ; SHARE-NEXT: $sgpr0 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 1, implicit-def $sgpr0_sgpr1
+    ; SHARE-NEXT: $sgpr1 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 2
+    ; SHARE-NEXT: renamable $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr2, 1, killed $vgpr1, implicit-def $sgpr2_sgpr3, implicit $sgpr2_sgpr3
+    ; SHARE-NEXT: renamable $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr3, 2, killed $vgpr1, implicit $sgpr2_sgpr3
+    ; SHARE-NEXT: SI_SPILL_WWM_V32_SAVE $vgpr1, %stack.8, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5)
+    ; SHARE-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    ; SHARE-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    ; SHARE-NEXT: $vgpr0 = SI_SPILL_V32_RESTORE %stack.9, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
+    ; SHARE-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 3, implicit-def $sgpr4_sgpr5
+    ; SHARE-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR killed $vgpr1, 4
+    ; SHARE-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit $vgpr0
+    ; SHARE-NEXT: renamable $vgpr0 = SI_SPILL_WWM_AV32_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5)
+    ; SHARE-NEXT: $sgpr32 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0
+    ; SHARE-NEXT: $sgpr2 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 1, implicit-def $sgpr2_sgpr3
+    ; SHARE-NEXT: $sgpr3 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 2
+    ; SHARE-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    ;
+    ; NOSHARE-LABEL: name: stack-slot-color-after-sgpr-alloc-1
+    ; NOSHARE: liveins: $sgpr30, $sgpr31, $vgpr63
+    ; NOSHARE-NEXT: {{  $}}
+    ; NOSHARE-NEXT: renamable $vgpr2 = IMPLICIT_DEF
+    ; NOSHARE-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr30, 0, $vgpr63
+    ; NOSHARE-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr31, 1, $vgpr63
+    ; NOSHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr32, 0, killed $vgpr2
+    ; NOSHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr0, 1, killed $vgpr2, implicit-def $sgpr0_sgpr1, implicit $sgpr0_sgpr1
+    ; NOSHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr1, 2, killed $vgpr2, implicit $sgpr0_sgpr1
+    ; NOSHARE-NEXT: renamable $vgpr0_vgpr1 = IMPLICIT_DEF
+    ; NOSHARE-NEXT: renamable $vgpr0 = FLAT_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+    ; NOSHARE-NEXT: SI_SPILL_V32_SAVE killed $vgpr0, %stack.9, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
+    ; NOSHARE-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @func + 4, target-flags(amdgpu-rel32-hi) @func + 4, implicit-def dead $scc
+    ; NOSHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr4, 3, killed $vgpr2, implicit-def $sgpr4_sgpr5, implicit $sgpr4_sgpr5
+    ; NOSHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR killed $sgpr5, 4, killed $vgpr2, implicit killed $sgpr4_sgpr5
+    ; NOSHARE-NEXT: SI_SPILL_WWM_V32_SAVE $vgpr2, %stack.8, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5)
+    ; NOSHARE-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    ; NOSHARE-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 3, implicit-def $sgpr4_sgpr5
+    ; NOSHARE-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR killed $vgpr2, 4
+    ; NOSHARE-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit undef $vgpr0
+    ; NOSHARE-NEXT: renamable $vgpr1 = SI_SPILL_WWM_V32_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5)
+    ; NOSHARE-NEXT: $sgpr32 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 0
+    ; NOSHARE-NEXT: renamable $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr32, 5, killed $vgpr1
+    ; NOSHARE-NEXT: $sgpr0 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 1, implicit-def $sgpr0_sgpr1
+    ; NOSHARE-NEXT: $sgpr1 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 2
+    ; NOSHARE-NEXT: renamable $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr2, 6, killed $vgpr1, implicit-def $sgpr2_sgpr3, implicit $sgpr2_sgpr3
+    ; NOSHARE-NEXT: renamable $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr3, 7, killed $vgpr1, implicit $sgpr2_sgpr3
+    ; NOSHARE-NEXT: SI_SPILL_WWM_V32_SAVE $vgpr1, %stack.8, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5)
+    ; NOSHARE-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    ; NOSHARE-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    ; NOSHARE-NEXT: $vgpr0 = SI_SPILL_V32_RESTORE %stack.9, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
+    ; NOSHARE-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 3, implicit-def $sgpr4_sgpr5
+    ; NOSHARE-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR killed $vgpr1, 4
+    ; NOSHARE-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit $vgpr0
+    ; NOSHARE-NEXT: renamable $vgpr0 = SI_SPILL_WWM_V32_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5)
+    ; NOSHARE-NEXT: $sgpr32 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 5
+    ; NOSHARE-NEXT: $sgpr2 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 6, implicit-def $sgpr2_sgpr3
+    ; NOSHARE-NEXT: $sgpr3 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 7
+    ; NOSHARE-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    %0:sreg_32_xm0 = COPY $sgpr32
+    %5:sreg_64 = COPY $sgpr0_sgpr1
+    %1:vreg_64 = IMPLICIT_DEF
+    %2:vgpr_32 = FLAT_LOAD_DWORD %1, 0, 0, implicit $exec, implicit $flat_scr
+    %3:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @func + 4, target-flags(amdgpu-rel32-hi) @func + 4, implicit-def dead $scc
+    ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    dead $sgpr30_sgpr31 = SI_CALL %3, @func, csr_amdgpu, implicit undef $vgpr0
+    $sgpr32 = COPY %0
+    %4:sreg_32_xm0 = COPY $sgpr32
+    $sgpr0_sgpr1 = COPY %5
+    %6:sreg_64 = COPY $sgpr2_sgpr3
+    ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    $vgpr0 = COPY %2
+    dead $sgpr30_sgpr31 = SI_CALL %3, @func, csr_amdgpu, implicit killed $vgpr0
+    $sgpr32 = COPY %4
+    $sgpr2_sgpr3 = COPY %6
+    ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+...
+
+---
+name:            stack-slot-color-after-sgpr-alloc-2
+tracksRegLiveness: true
+frameInfo:
+  adjustsStack:    true
+  hasCalls:        true
+machineFunctionInfo:
+  scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
+  frameOffsetReg: $sgpr32
+  stackPtrOffsetReg: $sgpr32
+body:             |
+  bb.0:
+    ; SHARE-LABEL: name: stack-slot-color-after-sgpr-alloc-2
+    ; SHARE: liveins: $sgpr30, $sgpr31, $vgpr62
+    ; SHARE-NEXT: {{  $}}
+    ; SHARE-NEXT: renamable $vgpr2 = IMPLICIT_DEF
+    ; SHARE-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR killed $sgpr30, 0, $vgpr62
+    ; SHARE-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR killed $sgpr31, 1, $vgpr62
+    ; SHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr32, 0, killed $vgpr2
+    ; SHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr2, 1, killed $vgpr2, implicit-def $sgpr2_sgpr3, implicit $sgpr2_sgpr3
+    ; SHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr3, 2, killed $vgpr2, implicit $sgpr2_sgpr3
+    ; SHARE-NEXT: renamable $vgpr0_vgpr1 = IMPLICIT_DEF
+    ; SHARE-NEXT: renamable $vgpr0 = FLAT_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+    ; SHARE-NEXT: SI_SPILL_V32_SAVE killed $vgpr0, %stack.9, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
+    ; SHARE-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @func + 4, target-flags(amdgpu-rel32-hi) @func + 4, implicit-def dead $scc
+    ; SHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr4, 3, killed $vgpr2, implicit-def $sgpr4_sgpr5, implicit $sgpr4_sgpr5
+    ; SHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR killed $sgpr5, 4, killed $vgpr2, implicit killed $sgpr4_sgpr5
+    ; SHARE-NEXT: SI_SPILL_WWM_V32_SAVE $vgpr2, %stack.8, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5)
+    ; SHARE-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    ; SHARE-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 3, implicit-def $sgpr4_sgpr5
+    ; SHARE-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR killed $vgpr2, 4
+    ; SHARE-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit undef $vgpr0
+    ; SHARE-NEXT: renamable $vgpr1 = SI_SPILL_WWM_AV32_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5)
+    ; SHARE-NEXT: $sgpr32 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 0
+    ; SHARE-NEXT: $sgpr2 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 1, implicit-def $sgpr2_sgpr3
+    ; SHARE-NEXT: $sgpr3 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 2
+    ; SHARE-NEXT: renamable $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr2, 1, killed $vgpr1
+    ; SHARE-NEXT: SI_SPILL_WWM_V32_SAVE $vgpr1, %stack.8, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5)
+    ; SHARE-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    ; SHARE-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    ; SHARE-NEXT: $vgpr0 = SI_SPILL_V32_RESTORE %stack.9, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
+    ; SHARE-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 3, implicit-def $sgpr4_sgpr5
+    ; SHARE-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR killed $vgpr1, 4
+    ; SHARE-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit $vgpr0
+    ; SHARE-NEXT: renamable $vgpr0 = SI_SPILL_WWM_AV32_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5)
+    ; SHARE-NEXT: $sgpr32 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0
+    ; SHARE-NEXT: $sgpr2 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 1
+    ; SHARE-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    ;
+    ; NOSHARE-LABEL: name: stack-slot-color-after-sgpr-alloc-2
+    ; NOSHARE: liveins: $sgpr30, $sgpr31, $vgpr63
+    ; NOSHARE-NEXT: {{  $}}
+    ; NOSHARE-NEXT: renamable $vgpr2 = IMPLICIT_DEF
+    ; NOSHARE-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr30, 0, $vgpr63
+    ; NOSHARE-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr31, 1, $vgpr63
+    ; NOSHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr32, 0, killed $vgpr2
+    ; NOSHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr2, 1, killed $vgpr2, implicit-def $sgpr2_sgpr3, implicit $sgpr2_sgpr3
+    ; NOSHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr3, 2, killed $vgpr2, implicit $sgpr2_sgpr3
+    ; NOSHARE-NEXT: renamable $vgpr0_vgpr1 = IMPLICIT_DEF
+    ; NOSHARE-NEXT: renamable $vgpr0 = FLAT_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+    ; NOSHARE-NEXT: SI_SPILL_V32_SAVE killed $vgpr0, %stack.9, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
+    ; NOSHARE-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @func + 4, target-flags(amdgpu-rel32-hi) @func + 4, implicit-def dead $scc
+    ; NOSHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr4, 3, killed $vgpr2, implicit-def $sgpr4_sgpr5, implicit $sgpr4_sgpr5
+    ; NOSHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR killed $sgpr5, 4, killed $vgpr2, implicit killed $sgpr4_sgpr5
+    ; NOSHARE-NEXT: SI_SPILL_WWM_V32_SAVE $vgpr2, %stack.8, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5)
+    ; NOSHARE-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    ; NOSHARE-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 3, implicit-def $sgpr4_sgpr5
+    ; NOSHARE-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR killed $vgpr2, 4
+    ; NOSHARE-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit undef $vgpr0
+    ; NOSHARE-NEXT: renamable $vgpr1 = SI_SPILL_WWM_V32_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5)
+    ; NOSHARE-NEXT: $sgpr32 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 0
+    ; NOSHARE-NEXT: renamable $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr32, 5, killed $vgpr1
+    ; NOSHARE-NEXT: $sgpr2 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 1, implicit-def $sgpr2_sgpr3
+    ; NOSHARE-NEXT: $sgpr3 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 2
+    ; NOSHARE-NEXT: renamable $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr2, 6, killed $vgpr1
+    ; NOSHARE-NEXT: SI_SPILL_WWM_V32_SAVE $vgpr1, %stack.8, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5)
+    ; NOSHARE-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    ; NOSHARE-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    ; NOSHARE-NEXT: $vgpr0 = SI_SPILL_V32_RESTORE %stack.9, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
+    ; NOSHARE-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 3, implicit-def $sgpr4_sgpr5
+    ; NOSHARE-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR killed $vgpr1, 4
+    ; NOSHARE-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit $vgpr0
+    ; NOSHARE-NEXT: renamable $vgpr0 = SI_SPILL_WWM_V32_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5)
+    ; NOSHARE-NEXT: $sgpr32 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 5
+    ; NOSHARE-NEXT: $sgpr2 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 6
+    ; NOSHARE-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    %0:sreg_32_xm0 = COPY $sgpr32
+    %5:sreg_64 = COPY $sgpr2_sgpr3
+    %1:vreg_64 = IMPLICIT_DEF
+    %2:vgpr_32 = FLAT_LOAD_DWORD %1, 0, 0, implicit $exec, implicit $flat_scr
+    %3:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @func + 4, target-flags(amdgpu-rel32-hi) @func + 4, implicit-def dead $scc
+    ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    dead $sgpr30_sgpr31 = SI_CALL %3, @func, csr_amdgpu, implicit undef $vgpr0
+    $sgpr32 = COPY %0
+    %4:sreg_32_xm0 = COPY $sgpr32
+    $sgpr2_sgpr3 = COPY %5
+    %6:sreg_32 = COPY $sgpr2
+    ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    $vgpr0 = COPY %2
+    dead $sgpr30_sgpr31 = SI_CALL %3, @func, csr_amdgpu, implicit killed $vgpr0
+    $sgpr32 = COPY %4
+    $sgpr2 = COPY %6
+    ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+...
+
+---
+name:            stack-slot-color-after-sgpr-alloc-3
+tracksRegLiveness: true
+frameInfo:
+  adjustsStack:    true
+  hasCalls:        true
+machineFunctionInfo:
+  scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
+  frameOffsetReg: $sgpr32
+  stackPtrOffsetReg: $sgpr32
+body:             |
+  bb.0:
+    ; SHARE-LABEL: name: stack-slot-color-after-sgpr-alloc-3
+    ; SHARE: liveins: $sgpr30, $sgpr31, $vgpr62
+    ; SHARE-NEXT: {{  $}}
+    ; SHARE-NEXT: renamable $vgpr2 = IMPLICIT_DEF
+    ; SHARE-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR killed $sgpr30, 0, $vgpr62
+    ; SHARE-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR killed $sgpr31, 1, $vgpr62
+    ; SHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr32, 0, killed $vgpr2
+    ; SHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr0, 1, killed $vgpr2
+    ; SHARE-NEXT: renamable $vgpr0_vgpr1 = IMPLICIT_DEF
+    ; SHARE-NEXT: renamable $vgpr0 = FLAT_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+    ; SHARE-NEXT: SI_SPILL_V32_SAVE killed $vgpr0, %stack.9, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
+    ; SHARE-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @func + 4, target-flags(amdgpu-rel32-hi) @func + 4, implicit-def dead $scc
+    ; SHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr4, 3, killed $vgpr2, implicit-def $sgpr4_sgpr5, implicit $sgpr4_sgpr5
+    ; SHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR killed $sgpr5, 4, killed $vgpr2, implicit killed $sgpr4_sgpr5
+    ; SHARE-NEXT: SI_SPILL_WWM_V32_SAVE $vgpr2, %stack.8, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5)
+    ; SHARE-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    ; SHARE-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 3, implicit-def $sgpr4_sgpr5
+    ; SHARE-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR killed $vgpr2, 4
+    ; SHARE-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit undef $vgpr0
+    ; SHARE-NEXT: renamable $vgpr1 = SI_SPILL_WWM_AV32_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5)
+    ; SHARE-NEXT: $sgpr32 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 0
+    ; SHARE-NEXT: $sgpr0 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 1
+    ; SHARE-NEXT: renamable $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr2, 1, killed $vgpr1, implicit-def $sgpr2_sgpr3, implicit $sgpr2_sgpr3
+    ; SHARE-NEXT: renamable $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr3, 2, killed $vgpr1, implicit $sgpr2_sgpr3
+    ; SHARE-NEXT: SI_SPILL_WWM_V32_SAVE $vgpr1, %stack.8, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5)
+    ; SHARE-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    ; SHARE-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    ; SHARE-NEXT: $vgpr0 = SI_SPILL_V32_RESTORE %stack.9, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
+    ; SHARE-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 3, implicit-def $sgpr4_sgpr5
+    ; SHARE-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR killed $vgpr1, 4
+    ; SHARE-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit $vgpr0
+    ; SHARE-NEXT: renamable $vgpr0 = SI_SPILL_WWM_AV32_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5)
+    ; SHARE-NEXT: $sgpr32 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0
+    ; SHARE-NEXT: $sgpr2 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 1, implicit-def $sgpr2_sgpr3
+    ; SHARE-NEXT: $sgpr3 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 2
+    ; SHARE-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    ;
+    ; NOSHARE-LABEL: name: stack-slot-color-after-sgpr-alloc-3
+    ; NOSHARE: liveins: $sgpr30, $sgpr31, $vgpr63
+    ; NOSHARE-NEXT: {{  $}}
+    ; NOSHARE-NEXT: renamable $vgpr2 = IMPLICIT_DEF
+    ; NOSHARE-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr30, 0, $vgpr63
+    ; NOSHARE-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr31, 1, $vgpr63
+    ; NOSHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr32, 0, killed $vgpr2
+    ; NOSHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr0, 1, killed $vgpr2
+    ; NOSHARE-NEXT: renamable $vgpr0_vgpr1 = IMPLICIT_DEF
+    ; NOSHARE-NEXT: renamable $vgpr0 = FLAT_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+    ; NOSHARE-NEXT: SI_SPILL_V32_SAVE killed $vgpr0, %stack.9, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
+    ; NOSHARE-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @func + 4, target-flags(amdgpu-rel32-hi) @func + 4, implicit-def dead $scc
+    ; NOSHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr4, 2, killed $vgpr2, implicit-def $sgpr4_sgpr5, implicit $sgpr4_sgpr5
+    ; NOSHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR killed $sgpr5, 3, killed $vgpr2, implicit killed $sgpr4_sgpr5
+    ; NOSHARE-NEXT: SI_SPILL_WWM_V32_SAVE $vgpr2, %stack.8, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5)
+    ; NOSHARE-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    ; NOSHARE-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 2, implicit-def $sgpr4_sgpr5
+    ; NOSHARE-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR killed $vgpr2, 3
+    ; NOSHARE-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit undef $vgpr0
+    ; NOSHARE-NEXT: renamable $vgpr1 = SI_SPILL_WWM_V32_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5)
+    ; NOSHARE-NEXT: $sgpr32 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 0
+    ; NOSHARE-NEXT: renamable $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr32, 4, killed $vgpr1
+    ; NOSHARE-NEXT: $sgpr0 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 1
+    ; NOSHARE-NEXT: renamable $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr2, 5, killed $vgpr1, implicit-def $sgpr2_sgpr3, implicit $sgpr2_sgpr3
+    ; NOSHARE-NEXT: renamable $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr3, 6, killed $vgpr1, implicit $sgpr2_sgpr3
+    ; NOSHARE-NEXT: SI_SPILL_WWM_V32_SAVE $vgpr1, %stack.8, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5)
+    ; NOSHARE-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    ; NOSHARE-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    ; NOSHARE-NEXT: $vgpr0 = SI_SPILL_V32_RESTORE %stack.9, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
+    ; NOSHARE-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 2, implicit-def $sgpr4_sgpr5
+    ; NOSHARE-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR killed $vgpr1, 3
+    ; NOSHARE-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit $vgpr0
+    ; NOSHARE-NEXT: renamable $vgpr0 = SI_SPILL_WWM_V32_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5)
+    ; NOSHARE-NEXT: $sgpr32 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 4
+    ; NOSHARE-NEXT: $sgpr2 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 5, implicit-def $sgpr2_sgpr3
+    ; NOSHARE-NEXT: $sgpr3 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 6
+    ; NOSHARE-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    %0:sreg_32_xm0 = COPY $sgpr32
+    %5:sreg_32 = COPY $sgpr0
+    %1:vreg_64 = IMPLICIT_DEF
+    %2:vgpr_32 = FLAT_LOAD_DWORD %1, 0, 0, implicit $exec, implicit $flat_scr
+    %3:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @func + 4, target-flags(amdgpu-rel32-hi) @func + 4, implicit-def dead $scc
+    ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    dead $sgpr30_sgpr31 = SI_CALL %3, @func, csr_amdgpu, implicit undef $vgpr0
+    $sgpr32 = COPY %0
+    %4:sreg_32_xm0 = COPY $sgpr32
+    $sgpr0 = COPY %5
+    %6:sreg_64 = COPY $sgpr2_sgpr3
+    ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    $vgpr0 = COPY %2
+    dead $sgpr30_sgpr31 = SI_CALL %3, @func, csr_amdgpu, implicit killed $vgpr0
+    $sgpr32 = COPY %4
+    $sgpr2_sgpr3 = COPY %6
+    ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+...

>From 55b5c0fa2c465094e8e4fb1b63e1956fa07f6ad7 Mon Sep 17 00:00:00 2001
From: vg0204 <Vikash.Gupta at amd.com>
Date: Mon, 3 Jun 2024 16:54:23 +0530
Subject: [PATCH 4/9] Made changes to get rid of generic changes & addressed
 the required changes in comments and test files.

---
 llvm/include/llvm/CodeGen/Passes.h            |   2 +-
 llvm/lib/CodeGen/StackSlotColoring.cpp        |   1 -
 .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp |   7 +-
 llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp  |   3 -
 .../stack-slot-color-after-sgpr-regalloc.mir  | 476 +++++++++---------
 5 files changed, 255 insertions(+), 234 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/Passes.h b/llvm/include/llvm/CodeGen/Passes.h
index 0e0aa8a7f926d..f850767270a4f 100644
--- a/llvm/include/llvm/CodeGen/Passes.h
+++ b/llvm/include/llvm/CodeGen/Passes.h
@@ -283,7 +283,7 @@ namespace llvm {
   /// critical-path and resource depth.
   extern char &MachineCombinerID;
 
-  /// StackColoring - This pass performs stack coloring and merging.
+  /// StackSlotColoring - This pass performs stack coloring and merging.
   /// It merges disjoint allocas to reduce the stack size.
   extern char &StackColoringID;
 
diff --git a/llvm/lib/CodeGen/StackSlotColoring.cpp b/llvm/lib/CodeGen/StackSlotColoring.cpp
index 2624d82f93fba..fbfc3ae45321c 100644
--- a/llvm/lib/CodeGen/StackSlotColoring.cpp
+++ b/llvm/lib/CodeGen/StackSlotColoring.cpp
@@ -13,7 +13,6 @@
 #include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
-#include "llvm/CodeGen/LiveDebugVariables.h"
 #include "llvm/CodeGen/LiveInterval.h"
 #include "llvm/CodeGen/LiveIntervalUnion.h"
 #include "llvm/CodeGen/LiveIntervals.h"
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index eac6f639a6cf3..a738ba4da046b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -1421,10 +1421,9 @@ bool GCNPassConfig::addRegAssignAndRewriteOptimized() {
   // since FastRegAlloc does the replacements itself.
   addPass(createVirtRegRewriter(false));
 
-  // As by this point SGPR's RA is done introducing SGPR spills to stack frame
-  // through SGPRAllocPass. So, invoking StackSlotColoring here, may allow these
-  // SGPR spills to re-use stack slots, before these spills is further lowered
-  // down via SILowerSGPRSpills(i.e. equivalent of PEI for SGPRs).
+  // At this point, the sgpr-regalloc has been done and it is good to have the
+  // stack slot coloring to try to optimize the SGPR spill stack indices before
+  // attempting the custom SGPR spill lowering.
   addPass(&StackSlotColoringID);
 
   // Equivalent of PEI for SGPRs.
diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
index fa3aa9d7091b7..e379d24d48a21 100644
--- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
@@ -278,14 +278,12 @@ void SILowerSGPRSpills::extendWWMVirtRegLiveness(MachineFunction &MF,
   for (auto Reg : MFI->getSGPRSpillVGPRs()) {
     for (MachineBasicBlock *SaveBlock : SaveBlocks) {
       MachineBasicBlock::iterator InsertBefore = SaveBlock->begin();
-
       DebugLoc DL = SaveBlock->findDebugLoc(InsertBefore);
       auto MIB = BuildMI(*SaveBlock, InsertBefore, DL,
                          TII->get(AMDGPU::IMPLICIT_DEF), Reg);
       MFI->setFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG);
       // Set SGPR_SPILL asm printer flag
       MIB->setAsmPrinterFlag(AMDGPU::SGPR_SPILL);
-
       if (LIS)
         LIS->InsertMachineInstrInMaps(*MIB);
     }
@@ -301,7 +299,6 @@ void SILowerSGPRSpills::extendWWMVirtRegLiveness(MachineFunction &MF,
       auto MIB = BuildMI(*RestoreBlock, InsertBefore, DL,
                          TII->get(TargetOpcode::KILL));
       MIB.addReg(Reg);
-
       if (LIS)
         LIS->InsertMachineInstrInMaps(*MIB);
     }
diff --git a/llvm/test/CodeGen/AMDGPU/stack-slot-color-after-sgpr-regalloc.mir b/llvm/test/CodeGen/AMDGPU/stack-slot-color-after-sgpr-regalloc.mir
index cba97b0572fdf..7cd893efe0b79 100644
--- a/llvm/test/CodeGen/AMDGPU/stack-slot-color-after-sgpr-regalloc.mir
+++ b/llvm/test/CodeGen/AMDGPU/stack-slot-color-after-sgpr-regalloc.mir
@@ -1,6 +1,13 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -verify-machineinstrs -stress-regalloc=3 -start-before=greedy -stop-before=prologepilog -o - %s | FileCheck -check-prefix=SHARE %s
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -verify-machineinstrs -stress-regalloc=3 -start-before=greedy -stop-before=prologepilog -no-stack-slot-sharing -o - %s | FileCheck -check-prefix=NOSHARE %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -verify-machineinstrs -stress-regalloc=3 -start-before=greedy -stop-before=stack-slot-coloring -o - %s | FileCheck -check-prefix=BEFORE %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -verify-machineinstrs -stress-regalloc=3 -start-before=greedy -stop-after=stack-slot-coloring -o - %s | FileCheck -check-prefix=AFTER %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -verify-machineinstrs -stress-regalloc=3 -start-before=greedy -stop-before=prologepilog -o - %s | FileCheck -check-prefix=SGPR_SPILLED %s
+
+# This file contains 3 test cases to observe the optimal stack slot usage for SGPR spills utilizing Stack Slot Coloring pass.
+# @stack-slot-color-after-sgpr-alloc-1 : In this, the stack slot indices is shared among the spill stack objects of equal size.
+# @stack-slot-color-after-sgpr-alloc-2 AND @stack-slot-color-after-sgpr-alloc-3 :
+# In the remaining 2 test cases mentioned in above line, the stack slot indices is shared among the spill stack objects of 
+# unequal size, with spill slot having the size of the largest of the stack objects sharing the common stack indices.
 
 --- |
   define void @stack-slot-color-after-sgpr-alloc-1(ptr addrspace(1) nocapture readnone %arg, ptr addrspace(1) noalias %arg1) {
@@ -42,84 +49,89 @@ machineFunctionInfo:
   stackPtrOffsetReg: $sgpr32
 body:             |
   bb.0:
-    ; SHARE-LABEL: name: stack-slot-color-after-sgpr-alloc-1
-    ; SHARE: liveins: $sgpr30, $sgpr31, $vgpr62
-    ; SHARE-NEXT: {{  $}}
-    ; SHARE-NEXT: renamable $vgpr2 = IMPLICIT_DEF
-    ; SHARE-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR killed $sgpr30, 0, $vgpr62
-    ; SHARE-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR killed $sgpr31, 1, $vgpr62
-    ; SHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr32, 0, killed $vgpr2
-    ; SHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr0, 1, killed $vgpr2, implicit-def $sgpr0_sgpr1, implicit $sgpr0_sgpr1
-    ; SHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr1, 2, killed $vgpr2, implicit $sgpr0_sgpr1
-    ; SHARE-NEXT: renamable $vgpr0_vgpr1 = IMPLICIT_DEF
-    ; SHARE-NEXT: renamable $vgpr0 = FLAT_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
-    ; SHARE-NEXT: SI_SPILL_V32_SAVE killed $vgpr0, %stack.9, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
-    ; SHARE-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @func + 4, target-flags(amdgpu-rel32-hi) @func + 4, implicit-def dead $scc
-    ; SHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr4, 3, killed $vgpr2, implicit-def $sgpr4_sgpr5, implicit $sgpr4_sgpr5
-    ; SHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR killed $sgpr5, 4, killed $vgpr2, implicit killed $sgpr4_sgpr5
-    ; SHARE-NEXT: SI_SPILL_WWM_V32_SAVE $vgpr2, %stack.8, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5)
-    ; SHARE-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    ; SHARE-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 3, implicit-def $sgpr4_sgpr5
-    ; SHARE-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR killed $vgpr2, 4
-    ; SHARE-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit undef $vgpr0
-    ; SHARE-NEXT: renamable $vgpr1 = SI_SPILL_WWM_AV32_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5)
-    ; SHARE-NEXT: $sgpr32 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 0
-    ; SHARE-NEXT: $sgpr0 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 1, implicit-def $sgpr0_sgpr1
-    ; SHARE-NEXT: $sgpr1 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 2
-    ; SHARE-NEXT: renamable $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr2, 1, killed $vgpr1, implicit-def $sgpr2_sgpr3, implicit $sgpr2_sgpr3
-    ; SHARE-NEXT: renamable $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr3, 2, killed $vgpr1, implicit $sgpr2_sgpr3
-    ; SHARE-NEXT: SI_SPILL_WWM_V32_SAVE $vgpr1, %stack.8, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5)
-    ; SHARE-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    ; SHARE-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    ; SHARE-NEXT: $vgpr0 = SI_SPILL_V32_RESTORE %stack.9, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
-    ; SHARE-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 3, implicit-def $sgpr4_sgpr5
-    ; SHARE-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR killed $vgpr1, 4
-    ; SHARE-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit $vgpr0
-    ; SHARE-NEXT: renamable $vgpr0 = SI_SPILL_WWM_AV32_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5)
-    ; SHARE-NEXT: $sgpr32 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0
-    ; SHARE-NEXT: $sgpr2 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 1, implicit-def $sgpr2_sgpr3
-    ; SHARE-NEXT: $sgpr3 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 2
-    ; SHARE-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    ; BEFORE-LABEL: name: stack-slot-color-after-sgpr-alloc-1
+    ; BEFORE: SI_SPILL_S32_SAVE $sgpr32, %stack.2, implicit $exec, implicit $sgpr32 :: (store (s32) into %stack.2, addrspace 5)
+    ; BEFORE-NEXT: SI_SPILL_S64_SAVE $sgpr0_sgpr1, %stack.1, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.1, align 4, addrspace 5)
+    ; BEFORE-NEXT: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+    ; BEFORE-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[DEF]], 0, 0, implicit $exec, implicit $flat_scr
+    ; BEFORE-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @func + 4, target-flags(amdgpu-rel32-hi) @func + 4, implicit-def dead $scc
+    ; BEFORE-NEXT: SI_SPILL_S64_SAVE killed renamable $sgpr4_sgpr5, %stack.0, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.0, align 4, addrspace 5)
+    ; BEFORE-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    ; BEFORE-NEXT: renamable $sgpr4_sgpr5 = SI_SPILL_S64_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.0, align 4, addrspace 5)
+    ; BEFORE-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit undef $vgpr0
+    ; BEFORE-NEXT: $sgpr32 = SI_SPILL_S32_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.2, addrspace 5)
+    ; BEFORE-NEXT: SI_SPILL_S32_SAVE $sgpr32, %stack.3, implicit $exec, implicit $sgpr32 :: (store (s32) into %stack.3, addrspace 5)
+    ; BEFORE-NEXT: $sgpr0_sgpr1 = SI_SPILL_S64_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.1, align 4, addrspace 5)
+    ; BEFORE-NEXT: SI_SPILL_S64_SAVE $sgpr2_sgpr3, %stack.4, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.4, align 4, addrspace 5)
+    ; BEFORE-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    ; BEFORE-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    ; BEFORE-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
+    ; BEFORE-NEXT: renamable $sgpr4_sgpr5 = SI_SPILL_S64_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.0, align 4, addrspace 5)
+    ; BEFORE-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit killed $vgpr0
+    ; BEFORE-NEXT: $sgpr32 = SI_SPILL_S32_RESTORE %stack.3, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.3, addrspace 5)
+    ; BEFORE-NEXT: $sgpr2_sgpr3 = SI_SPILL_S64_RESTORE %stack.4, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.4, align 4, addrspace 5)
+    ; BEFORE-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    ;
+    ; AFTER-LABEL: name: stack-slot-color-after-sgpr-alloc-1
+    ; AFTER: SI_SPILL_S32_SAVE $sgpr32, %stack.2, implicit $exec, implicit $sgpr32 :: (store (s32) into %stack.2, addrspace 5)
+    ; AFTER-NEXT: SI_SPILL_S64_SAVE $sgpr0_sgpr1, %stack.1, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.1, align 4, addrspace 5)
+    ; AFTER-NEXT: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+    ; AFTER-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[DEF]], 0, 0, implicit $exec, implicit $flat_scr
+    ; AFTER-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @func + 4, target-flags(amdgpu-rel32-hi) @func + 4, implicit-def dead $scc
+    ; AFTER-NEXT: SI_SPILL_S64_SAVE killed renamable $sgpr4_sgpr5, %stack.0, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.0, align 4, addrspace 5)
+    ; AFTER-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    ; AFTER-NEXT: renamable $sgpr4_sgpr5 = SI_SPILL_S64_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.0, align 4, addrspace 5)
+    ; AFTER-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit undef $vgpr0
+    ; AFTER-NEXT: $sgpr32 = SI_SPILL_S32_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.2, addrspace 5)
+    ; AFTER-NEXT: $sgpr0_sgpr1 = SI_SPILL_S64_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.1, align 4, addrspace 5)
+    ; AFTER-NEXT: SI_SPILL_S64_SAVE $sgpr2_sgpr3, %stack.1, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.1, align 4, addrspace 5)
+    ; AFTER-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    ; AFTER-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    ; AFTER-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
+    ; AFTER-NEXT: renamable $sgpr4_sgpr5 = SI_SPILL_S64_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.0, align 4, addrspace 5)
+    ; AFTER-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit killed $vgpr0
+    ; AFTER-NEXT: $sgpr32 = SI_SPILL_S32_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.2, addrspace 5)
+    ; AFTER-NEXT: $sgpr2_sgpr3 = SI_SPILL_S64_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.1, align 4, addrspace 5)
+    ; AFTER-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
     ;
-    ; NOSHARE-LABEL: name: stack-slot-color-after-sgpr-alloc-1
-    ; NOSHARE: liveins: $sgpr30, $sgpr31, $vgpr63
-    ; NOSHARE-NEXT: {{  $}}
-    ; NOSHARE-NEXT: renamable $vgpr2 = IMPLICIT_DEF
-    ; NOSHARE-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr30, 0, $vgpr63
-    ; NOSHARE-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr31, 1, $vgpr63
-    ; NOSHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr32, 0, killed $vgpr2
-    ; NOSHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr0, 1, killed $vgpr2, implicit-def $sgpr0_sgpr1, implicit $sgpr0_sgpr1
-    ; NOSHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr1, 2, killed $vgpr2, implicit $sgpr0_sgpr1
-    ; NOSHARE-NEXT: renamable $vgpr0_vgpr1 = IMPLICIT_DEF
-    ; NOSHARE-NEXT: renamable $vgpr0 = FLAT_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
-    ; NOSHARE-NEXT: SI_SPILL_V32_SAVE killed $vgpr0, %stack.9, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
-    ; NOSHARE-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @func + 4, target-flags(amdgpu-rel32-hi) @func + 4, implicit-def dead $scc
-    ; NOSHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr4, 3, killed $vgpr2, implicit-def $sgpr4_sgpr5, implicit $sgpr4_sgpr5
-    ; NOSHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR killed $sgpr5, 4, killed $vgpr2, implicit killed $sgpr4_sgpr5
-    ; NOSHARE-NEXT: SI_SPILL_WWM_V32_SAVE $vgpr2, %stack.8, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5)
-    ; NOSHARE-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    ; NOSHARE-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 3, implicit-def $sgpr4_sgpr5
-    ; NOSHARE-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR killed $vgpr2, 4
-    ; NOSHARE-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit undef $vgpr0
-    ; NOSHARE-NEXT: renamable $vgpr1 = SI_SPILL_WWM_V32_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5)
-    ; NOSHARE-NEXT: $sgpr32 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 0
-    ; NOSHARE-NEXT: renamable $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr32, 5, killed $vgpr1
-    ; NOSHARE-NEXT: $sgpr0 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 1, implicit-def $sgpr0_sgpr1
-    ; NOSHARE-NEXT: $sgpr1 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 2
-    ; NOSHARE-NEXT: renamable $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr2, 6, killed $vgpr1, implicit-def $sgpr2_sgpr3, implicit $sgpr2_sgpr3
-    ; NOSHARE-NEXT: renamable $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr3, 7, killed $vgpr1, implicit $sgpr2_sgpr3
-    ; NOSHARE-NEXT: SI_SPILL_WWM_V32_SAVE $vgpr1, %stack.8, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5)
-    ; NOSHARE-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    ; NOSHARE-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    ; NOSHARE-NEXT: $vgpr0 = SI_SPILL_V32_RESTORE %stack.9, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
-    ; NOSHARE-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 3, implicit-def $sgpr4_sgpr5
-    ; NOSHARE-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR killed $vgpr1, 4
-    ; NOSHARE-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit $vgpr0
-    ; NOSHARE-NEXT: renamable $vgpr0 = SI_SPILL_WWM_V32_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5)
-    ; NOSHARE-NEXT: $sgpr32 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 5
-    ; NOSHARE-NEXT: $sgpr2 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 6, implicit-def $sgpr2_sgpr3
-    ; NOSHARE-NEXT: $sgpr3 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 7
-    ; NOSHARE-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    ; SGPR_SPILLED-LABEL: name: stack-slot-color-after-sgpr-alloc-1
+    ; SGPR_SPILLED: liveins: $sgpr30, $sgpr31, $vgpr62
+    ; SGPR_SPILLED-NEXT: {{  $}}
+    ; SGPR_SPILLED-NEXT: renamable $vgpr2 = IMPLICIT_DEF
+    ; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR killed $sgpr30, 0, $vgpr62
+    ; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR killed $sgpr31, 1, $vgpr62
+    ; SGPR_SPILLED-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr32, 0, killed $vgpr2
+    ; SGPR_SPILLED-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr0, 1, killed $vgpr2, implicit-def $sgpr0_sgpr1, implicit $sgpr0_sgpr1
+    ; SGPR_SPILLED-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr1, 2, killed $vgpr2, implicit $sgpr0_sgpr1
+    ; SGPR_SPILLED-NEXT: renamable $vgpr0_vgpr1 = IMPLICIT_DEF
+    ; SGPR_SPILLED-NEXT: renamable $vgpr0 = FLAT_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+    ; SGPR_SPILLED-NEXT: SI_SPILL_V32_SAVE killed $vgpr0, %stack.9, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
+    ; SGPR_SPILLED-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @func + 4, target-flags(amdgpu-rel32-hi) @func + 4, implicit-def dead $scc
+    ; SGPR_SPILLED-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr4, 3, killed $vgpr2, implicit-def $sgpr4_sgpr5, implicit $sgpr4_sgpr5
+    ; SGPR_SPILLED-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR killed $sgpr5, 4, killed $vgpr2, implicit killed $sgpr4_sgpr5
+    ; SGPR_SPILLED-NEXT: SI_SPILL_WWM_V32_SAVE $vgpr2, %stack.8, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5)
+    ; SGPR_SPILLED-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    ; SGPR_SPILLED-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 3, implicit-def $sgpr4_sgpr5
+    ; SGPR_SPILLED-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR killed $vgpr2, 4
+    ; SGPR_SPILLED-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit undef $vgpr0
+    ; SGPR_SPILLED-NEXT: renamable $vgpr1 = SI_SPILL_WWM_AV32_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5)
+    ; SGPR_SPILLED-NEXT: $sgpr32 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 0
+    ; SGPR_SPILLED-NEXT: $sgpr0 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 1, implicit-def $sgpr0_sgpr1
+    ; SGPR_SPILLED-NEXT: $sgpr1 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 2
+    ; SGPR_SPILLED-NEXT: renamable $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr2, 1, killed $vgpr1, implicit-def $sgpr2_sgpr3, implicit $sgpr2_sgpr3
+    ; SGPR_SPILLED-NEXT: renamable $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr3, 2, killed $vgpr1, implicit $sgpr2_sgpr3
+    ; SGPR_SPILLED-NEXT: SI_SPILL_WWM_V32_SAVE $vgpr1, %stack.8, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5)
+    ; SGPR_SPILLED-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    ; SGPR_SPILLED-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    ; SGPR_SPILLED-NEXT: $vgpr0 = SI_SPILL_V32_RESTORE %stack.9, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
+    ; SGPR_SPILLED-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 3, implicit-def $sgpr4_sgpr5
+    ; SGPR_SPILLED-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR killed $vgpr1, 4
+    ; SGPR_SPILLED-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit $vgpr0
+    ; SGPR_SPILLED-NEXT: renamable $vgpr0 = SI_SPILL_WWM_AV32_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5)
+    ; SGPR_SPILLED-NEXT: $sgpr32 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0
+    ; SGPR_SPILLED-NEXT: $sgpr2 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 1, implicit-def $sgpr2_sgpr3
+    ; SGPR_SPILLED-NEXT: $sgpr3 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 2
+    ; SGPR_SPILLED-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
     %0:sreg_32_xm0 = COPY $sgpr32
     %5:sreg_64 = COPY $sgpr0_sgpr1
     %1:vreg_64 = IMPLICIT_DEF
@@ -152,80 +164,87 @@ machineFunctionInfo:
   stackPtrOffsetReg: $sgpr32
 body:             |
   bb.0:
-    ; SHARE-LABEL: name: stack-slot-color-after-sgpr-alloc-2
-    ; SHARE: liveins: $sgpr30, $sgpr31, $vgpr62
-    ; SHARE-NEXT: {{  $}}
-    ; SHARE-NEXT: renamable $vgpr2 = IMPLICIT_DEF
-    ; SHARE-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR killed $sgpr30, 0, $vgpr62
-    ; SHARE-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR killed $sgpr31, 1, $vgpr62
-    ; SHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr32, 0, killed $vgpr2
-    ; SHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr2, 1, killed $vgpr2, implicit-def $sgpr2_sgpr3, implicit $sgpr2_sgpr3
-    ; SHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr3, 2, killed $vgpr2, implicit $sgpr2_sgpr3
-    ; SHARE-NEXT: renamable $vgpr0_vgpr1 = IMPLICIT_DEF
-    ; SHARE-NEXT: renamable $vgpr0 = FLAT_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
-    ; SHARE-NEXT: SI_SPILL_V32_SAVE killed $vgpr0, %stack.9, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
-    ; SHARE-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @func + 4, target-flags(amdgpu-rel32-hi) @func + 4, implicit-def dead $scc
-    ; SHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr4, 3, killed $vgpr2, implicit-def $sgpr4_sgpr5, implicit $sgpr4_sgpr5
-    ; SHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR killed $sgpr5, 4, killed $vgpr2, implicit killed $sgpr4_sgpr5
-    ; SHARE-NEXT: SI_SPILL_WWM_V32_SAVE $vgpr2, %stack.8, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5)
-    ; SHARE-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    ; SHARE-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 3, implicit-def $sgpr4_sgpr5
-    ; SHARE-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR killed $vgpr2, 4
-    ; SHARE-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit undef $vgpr0
-    ; SHARE-NEXT: renamable $vgpr1 = SI_SPILL_WWM_AV32_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5)
-    ; SHARE-NEXT: $sgpr32 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 0
-    ; SHARE-NEXT: $sgpr2 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 1, implicit-def $sgpr2_sgpr3
-    ; SHARE-NEXT: $sgpr3 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 2
-    ; SHARE-NEXT: renamable $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr2, 1, killed $vgpr1
-    ; SHARE-NEXT: SI_SPILL_WWM_V32_SAVE $vgpr1, %stack.8, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5)
-    ; SHARE-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    ; SHARE-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    ; SHARE-NEXT: $vgpr0 = SI_SPILL_V32_RESTORE %stack.9, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
-    ; SHARE-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 3, implicit-def $sgpr4_sgpr5
-    ; SHARE-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR killed $vgpr1, 4
-    ; SHARE-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit $vgpr0
-    ; SHARE-NEXT: renamable $vgpr0 = SI_SPILL_WWM_AV32_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5)
-    ; SHARE-NEXT: $sgpr32 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0
-    ; SHARE-NEXT: $sgpr2 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 1
-    ; SHARE-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    ; BEFORE-LABEL: name: stack-slot-color-after-sgpr-alloc-2
+    ; BEFORE: SI_SPILL_S32_SAVE $sgpr32, %stack.2, implicit $exec, implicit $sgpr32 :: (store (s32) into %stack.2, addrspace 5)
+    ; BEFORE-NEXT: SI_SPILL_S64_SAVE $sgpr2_sgpr3, %stack.1, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.1, align 4, addrspace 5)
+    ; BEFORE-NEXT: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+    ; BEFORE-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[DEF]], 0, 0, implicit $exec, implicit $flat_scr
+    ; BEFORE-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @func + 4, target-flags(amdgpu-rel32-hi) @func + 4, implicit-def dead $scc
+    ; BEFORE-NEXT: SI_SPILL_S64_SAVE killed renamable $sgpr4_sgpr5, %stack.0, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.0, align 4, addrspace 5)
+    ; BEFORE-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    ; BEFORE-NEXT: renamable $sgpr4_sgpr5 = SI_SPILL_S64_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.0, align 4, addrspace 5)
+    ; BEFORE-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit undef $vgpr0
+    ; BEFORE-NEXT: $sgpr32 = SI_SPILL_S32_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.2, addrspace 5)
+    ; BEFORE-NEXT: SI_SPILL_S32_SAVE $sgpr32, %stack.3, implicit $exec, implicit $sgpr32 :: (store (s32) into %stack.3, addrspace 5)
+    ; BEFORE-NEXT: $sgpr2_sgpr3 = SI_SPILL_S64_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.1, align 4, addrspace 5)
+    ; BEFORE-NEXT: SI_SPILL_S32_SAVE $sgpr2, %stack.4, implicit $exec, implicit $sgpr32 :: (store (s32) into %stack.4, addrspace 5)
+    ; BEFORE-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    ; BEFORE-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    ; BEFORE-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
+    ; BEFORE-NEXT: renamable $sgpr4_sgpr5 = SI_SPILL_S64_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.0, align 4, addrspace 5)
+    ; BEFORE-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit killed $vgpr0
+    ; BEFORE-NEXT: $sgpr32 = SI_SPILL_S32_RESTORE %stack.3, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.3, addrspace 5)
+    ; BEFORE-NEXT: $sgpr2 = SI_SPILL_S32_RESTORE %stack.4, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.4, addrspace 5)
+    ; BEFORE-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
     ;
-    ; NOSHARE-LABEL: name: stack-slot-color-after-sgpr-alloc-2
-    ; NOSHARE: liveins: $sgpr30, $sgpr31, $vgpr63
-    ; NOSHARE-NEXT: {{  $}}
-    ; NOSHARE-NEXT: renamable $vgpr2 = IMPLICIT_DEF
-    ; NOSHARE-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr30, 0, $vgpr63
-    ; NOSHARE-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr31, 1, $vgpr63
-    ; NOSHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr32, 0, killed $vgpr2
-    ; NOSHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr2, 1, killed $vgpr2, implicit-def $sgpr2_sgpr3, implicit $sgpr2_sgpr3
-    ; NOSHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr3, 2, killed $vgpr2, implicit $sgpr2_sgpr3
-    ; NOSHARE-NEXT: renamable $vgpr0_vgpr1 = IMPLICIT_DEF
-    ; NOSHARE-NEXT: renamable $vgpr0 = FLAT_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
-    ; NOSHARE-NEXT: SI_SPILL_V32_SAVE killed $vgpr0, %stack.9, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
-    ; NOSHARE-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @func + 4, target-flags(amdgpu-rel32-hi) @func + 4, implicit-def dead $scc
-    ; NOSHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr4, 3, killed $vgpr2, implicit-def $sgpr4_sgpr5, implicit $sgpr4_sgpr5
-    ; NOSHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR killed $sgpr5, 4, killed $vgpr2, implicit killed $sgpr4_sgpr5
-    ; NOSHARE-NEXT: SI_SPILL_WWM_V32_SAVE $vgpr2, %stack.8, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5)
-    ; NOSHARE-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    ; NOSHARE-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 3, implicit-def $sgpr4_sgpr5
-    ; NOSHARE-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR killed $vgpr2, 4
-    ; NOSHARE-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit undef $vgpr0
-    ; NOSHARE-NEXT: renamable $vgpr1 = SI_SPILL_WWM_V32_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5)
-    ; NOSHARE-NEXT: $sgpr32 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 0
-    ; NOSHARE-NEXT: renamable $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr32, 5, killed $vgpr1
-    ; NOSHARE-NEXT: $sgpr2 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 1, implicit-def $sgpr2_sgpr3
-    ; NOSHARE-NEXT: $sgpr3 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 2
-    ; NOSHARE-NEXT: renamable $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr2, 6, killed $vgpr1
-    ; NOSHARE-NEXT: SI_SPILL_WWM_V32_SAVE $vgpr1, %stack.8, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5)
-    ; NOSHARE-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    ; NOSHARE-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    ; NOSHARE-NEXT: $vgpr0 = SI_SPILL_V32_RESTORE %stack.9, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
-    ; NOSHARE-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 3, implicit-def $sgpr4_sgpr5
-    ; NOSHARE-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR killed $vgpr1, 4
-    ; NOSHARE-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit $vgpr0
-    ; NOSHARE-NEXT: renamable $vgpr0 = SI_SPILL_WWM_V32_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5)
-    ; NOSHARE-NEXT: $sgpr32 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 5
-    ; NOSHARE-NEXT: $sgpr2 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 6
-    ; NOSHARE-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    ; AFTER-LABEL: name: stack-slot-color-after-sgpr-alloc-2
+    ; AFTER: SI_SPILL_S32_SAVE $sgpr32, %stack.2, implicit $exec, implicit $sgpr32 :: (store (s32) into %stack.2, addrspace 5)
+    ; AFTER-NEXT: SI_SPILL_S64_SAVE $sgpr2_sgpr3, %stack.1, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.1, align 4, addrspace 5)
+    ; AFTER-NEXT: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+    ; AFTER-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[DEF]], 0, 0, implicit $exec, implicit $flat_scr
+    ; AFTER-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @func + 4, target-flags(amdgpu-rel32-hi) @func + 4, implicit-def dead $scc
+    ; AFTER-NEXT: SI_SPILL_S64_SAVE killed renamable $sgpr4_sgpr5, %stack.0, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.0, align 4, addrspace 5)
+    ; AFTER-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    ; AFTER-NEXT: renamable $sgpr4_sgpr5 = SI_SPILL_S64_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.0, align 4, addrspace 5)
+    ; AFTER-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit undef $vgpr0
+    ; AFTER-NEXT: $sgpr32 = SI_SPILL_S32_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.2, addrspace 5)
+    ; AFTER-NEXT: $sgpr2_sgpr3 = SI_SPILL_S64_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.1, align 4, addrspace 5)
+    ; AFTER-NEXT: SI_SPILL_S32_SAVE $sgpr2, %stack.1, implicit $exec, implicit $sgpr32 :: (store (s32) into %stack.1, addrspace 5)
+    ; AFTER-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    ; AFTER-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    ; AFTER-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
+    ; AFTER-NEXT: renamable $sgpr4_sgpr5 = SI_SPILL_S64_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.0, align 4, addrspace 5)
+    ; AFTER-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit killed $vgpr0
+    ; AFTER-NEXT: $sgpr32 = SI_SPILL_S32_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.2, addrspace 5)
+    ; AFTER-NEXT: $sgpr2 = SI_SPILL_S32_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.1, addrspace 5)
+    ; AFTER-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    ;
+    ; SGPR_SPILLED-LABEL: name: stack-slot-color-after-sgpr-alloc-2
+    ; SGPR_SPILLED: liveins: $sgpr30, $sgpr31, $vgpr62
+    ; SGPR_SPILLED-NEXT: {{  $}}
+    ; SGPR_SPILLED-NEXT: renamable $vgpr2 = IMPLICIT_DEF
+    ; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR killed $sgpr30, 0, $vgpr62
+    ; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR killed $sgpr31, 1, $vgpr62
+    ; SGPR_SPILLED-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr32, 0, killed $vgpr2
+    ; SGPR_SPILLED-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr2, 1, killed $vgpr2, implicit-def $sgpr2_sgpr3, implicit $sgpr2_sgpr3
+    ; SGPR_SPILLED-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr3, 2, killed $vgpr2, implicit $sgpr2_sgpr3
+    ; SGPR_SPILLED-NEXT: renamable $vgpr0_vgpr1 = IMPLICIT_DEF
+    ; SGPR_SPILLED-NEXT: renamable $vgpr0 = FLAT_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+    ; SGPR_SPILLED-NEXT: SI_SPILL_V32_SAVE killed $vgpr0, %stack.9, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
+    ; SGPR_SPILLED-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @func + 4, target-flags(amdgpu-rel32-hi) @func + 4, implicit-def dead $scc
+    ; SGPR_SPILLED-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr4, 3, killed $vgpr2, implicit-def $sgpr4_sgpr5, implicit $sgpr4_sgpr5
+    ; SGPR_SPILLED-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR killed $sgpr5, 4, killed $vgpr2, implicit killed $sgpr4_sgpr5
+    ; SGPR_SPILLED-NEXT: SI_SPILL_WWM_V32_SAVE $vgpr2, %stack.8, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5)
+    ; SGPR_SPILLED-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    ; SGPR_SPILLED-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 3, implicit-def $sgpr4_sgpr5
+    ; SGPR_SPILLED-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR killed $vgpr2, 4
+    ; SGPR_SPILLED-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit undef $vgpr0
+    ; SGPR_SPILLED-NEXT: renamable $vgpr1 = SI_SPILL_WWM_AV32_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5)
+    ; SGPR_SPILLED-NEXT: $sgpr32 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 0
+    ; SGPR_SPILLED-NEXT: $sgpr2 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 1, implicit-def $sgpr2_sgpr3
+    ; SGPR_SPILLED-NEXT: $sgpr3 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 2
+    ; SGPR_SPILLED-NEXT: renamable $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr2, 1, killed $vgpr1
+    ; SGPR_SPILLED-NEXT: SI_SPILL_WWM_V32_SAVE $vgpr1, %stack.8, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5)
+    ; SGPR_SPILLED-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    ; SGPR_SPILLED-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    ; SGPR_SPILLED-NEXT: $vgpr0 = SI_SPILL_V32_RESTORE %stack.9, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
+    ; SGPR_SPILLED-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 3, implicit-def $sgpr4_sgpr5
+    ; SGPR_SPILLED-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR killed $vgpr1, 4
+    ; SGPR_SPILLED-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit $vgpr0
+    ; SGPR_SPILLED-NEXT: renamable $vgpr0 = SI_SPILL_WWM_AV32_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5)
+    ; SGPR_SPILLED-NEXT: $sgpr32 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0
+    ; SGPR_SPILLED-NEXT: $sgpr2 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 1
+    ; SGPR_SPILLED-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
     %0:sreg_32_xm0 = COPY $sgpr32
     %5:sreg_64 = COPY $sgpr2_sgpr3
     %1:vreg_64 = IMPLICIT_DEF
@@ -258,80 +277,87 @@ machineFunctionInfo:
   stackPtrOffsetReg: $sgpr32
 body:             |
   bb.0:
-    ; SHARE-LABEL: name: stack-slot-color-after-sgpr-alloc-3
-    ; SHARE: liveins: $sgpr30, $sgpr31, $vgpr62
-    ; SHARE-NEXT: {{  $}}
-    ; SHARE-NEXT: renamable $vgpr2 = IMPLICIT_DEF
-    ; SHARE-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR killed $sgpr30, 0, $vgpr62
-    ; SHARE-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR killed $sgpr31, 1, $vgpr62
-    ; SHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr32, 0, killed $vgpr2
-    ; SHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr0, 1, killed $vgpr2
-    ; SHARE-NEXT: renamable $vgpr0_vgpr1 = IMPLICIT_DEF
-    ; SHARE-NEXT: renamable $vgpr0 = FLAT_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
-    ; SHARE-NEXT: SI_SPILL_V32_SAVE killed $vgpr0, %stack.9, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
-    ; SHARE-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @func + 4, target-flags(amdgpu-rel32-hi) @func + 4, implicit-def dead $scc
-    ; SHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr4, 3, killed $vgpr2, implicit-def $sgpr4_sgpr5, implicit $sgpr4_sgpr5
-    ; SHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR killed $sgpr5, 4, killed $vgpr2, implicit killed $sgpr4_sgpr5
-    ; SHARE-NEXT: SI_SPILL_WWM_V32_SAVE $vgpr2, %stack.8, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5)
-    ; SHARE-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    ; SHARE-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 3, implicit-def $sgpr4_sgpr5
-    ; SHARE-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR killed $vgpr2, 4
-    ; SHARE-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit undef $vgpr0
-    ; SHARE-NEXT: renamable $vgpr1 = SI_SPILL_WWM_AV32_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5)
-    ; SHARE-NEXT: $sgpr32 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 0
-    ; SHARE-NEXT: $sgpr0 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 1
-    ; SHARE-NEXT: renamable $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr2, 1, killed $vgpr1, implicit-def $sgpr2_sgpr3, implicit $sgpr2_sgpr3
-    ; SHARE-NEXT: renamable $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr3, 2, killed $vgpr1, implicit $sgpr2_sgpr3
-    ; SHARE-NEXT: SI_SPILL_WWM_V32_SAVE $vgpr1, %stack.8, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5)
-    ; SHARE-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    ; SHARE-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    ; SHARE-NEXT: $vgpr0 = SI_SPILL_V32_RESTORE %stack.9, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
-    ; SHARE-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 3, implicit-def $sgpr4_sgpr5
-    ; SHARE-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR killed $vgpr1, 4
-    ; SHARE-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit $vgpr0
-    ; SHARE-NEXT: renamable $vgpr0 = SI_SPILL_WWM_AV32_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5)
-    ; SHARE-NEXT: $sgpr32 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0
-    ; SHARE-NEXT: $sgpr2 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 1, implicit-def $sgpr2_sgpr3
-    ; SHARE-NEXT: $sgpr3 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 2
-    ; SHARE-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    ; BEFORE-LABEL: name: stack-slot-color-after-sgpr-alloc-3
+    ; BEFORE: SI_SPILL_S32_SAVE $sgpr32, %stack.2, implicit $exec, implicit $sgpr32 :: (store (s32) into %stack.2, addrspace 5)
+    ; BEFORE-NEXT: SI_SPILL_S32_SAVE $sgpr0, %stack.1, implicit $exec, implicit $sgpr32 :: (store (s32) into %stack.1, addrspace 5)
+    ; BEFORE-NEXT: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+    ; BEFORE-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[DEF]], 0, 0, implicit $exec, implicit $flat_scr
+    ; BEFORE-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @func + 4, target-flags(amdgpu-rel32-hi) @func + 4, implicit-def dead $scc
+    ; BEFORE-NEXT: SI_SPILL_S64_SAVE killed renamable $sgpr4_sgpr5, %stack.0, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.0, align 4, addrspace 5)
+    ; BEFORE-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    ; BEFORE-NEXT: renamable $sgpr4_sgpr5 = SI_SPILL_S64_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.0, align 4, addrspace 5)
+    ; BEFORE-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit undef $vgpr0
+    ; BEFORE-NEXT: $sgpr32 = SI_SPILL_S32_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.2, addrspace 5)
+    ; BEFORE-NEXT: SI_SPILL_S32_SAVE $sgpr32, %stack.3, implicit $exec, implicit $sgpr32 :: (store (s32) into %stack.3, addrspace 5)
+    ; BEFORE-NEXT: $sgpr0 = SI_SPILL_S32_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.1, addrspace 5)
+    ; BEFORE-NEXT: SI_SPILL_S64_SAVE $sgpr2_sgpr3, %stack.4, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.4, align 4, addrspace 5)
+    ; BEFORE-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    ; BEFORE-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    ; BEFORE-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
+    ; BEFORE-NEXT: renamable $sgpr4_sgpr5 = SI_SPILL_S64_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.0, align 4, addrspace 5)
+    ; BEFORE-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit killed $vgpr0
+    ; BEFORE-NEXT: $sgpr32 = SI_SPILL_S32_RESTORE %stack.3, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.3, addrspace 5)
+    ; BEFORE-NEXT: $sgpr2_sgpr3 = SI_SPILL_S64_RESTORE %stack.4, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.4, align 4, addrspace 5)
+    ; BEFORE-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    ;
+    ; AFTER-LABEL: name: stack-slot-color-after-sgpr-alloc-3
+    ; AFTER: SI_SPILL_S32_SAVE $sgpr32, %stack.2, implicit $exec, implicit $sgpr32 :: (store (s32) into %stack.2, addrspace 5)
+    ; AFTER-NEXT: SI_SPILL_S32_SAVE $sgpr0, %stack.1, implicit $exec, implicit $sgpr32 :: (store (s32) into %stack.1, addrspace 5)
+    ; AFTER-NEXT: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+    ; AFTER-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[DEF]], 0, 0, implicit $exec, implicit $flat_scr
+    ; AFTER-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @func + 4, target-flags(amdgpu-rel32-hi) @func + 4, implicit-def dead $scc
+    ; AFTER-NEXT: SI_SPILL_S64_SAVE killed renamable $sgpr4_sgpr5, %stack.0, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.0, align 4, addrspace 5)
+    ; AFTER-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    ; AFTER-NEXT: renamable $sgpr4_sgpr5 = SI_SPILL_S64_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.0, align 4, addrspace 5)
+    ; AFTER-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit undef $vgpr0
+    ; AFTER-NEXT: $sgpr32 = SI_SPILL_S32_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.2, addrspace 5)
+    ; AFTER-NEXT: $sgpr0 = SI_SPILL_S32_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.1, addrspace 5)
+    ; AFTER-NEXT: SI_SPILL_S64_SAVE $sgpr2_sgpr3, %stack.1, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.1, align 4, addrspace 5)
+    ; AFTER-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    ; AFTER-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    ; AFTER-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
+    ; AFTER-NEXT: renamable $sgpr4_sgpr5 = SI_SPILL_S64_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.0, align 4, addrspace 5)
+    ; AFTER-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit killed $vgpr0
+    ; AFTER-NEXT: $sgpr32 = SI_SPILL_S32_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.2, addrspace 5)
+    ; AFTER-NEXT: $sgpr2_sgpr3 = SI_SPILL_S64_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.1, align 4, addrspace 5)
+    ; AFTER-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
     ;
-    ; NOSHARE-LABEL: name: stack-slot-color-after-sgpr-alloc-3
-    ; NOSHARE: liveins: $sgpr30, $sgpr31, $vgpr63
-    ; NOSHARE-NEXT: {{  $}}
-    ; NOSHARE-NEXT: renamable $vgpr2 = IMPLICIT_DEF
-    ; NOSHARE-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr30, 0, $vgpr63
-    ; NOSHARE-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr31, 1, $vgpr63
-    ; NOSHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr32, 0, killed $vgpr2
-    ; NOSHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr0, 1, killed $vgpr2
-    ; NOSHARE-NEXT: renamable $vgpr0_vgpr1 = IMPLICIT_DEF
-    ; NOSHARE-NEXT: renamable $vgpr0 = FLAT_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
-    ; NOSHARE-NEXT: SI_SPILL_V32_SAVE killed $vgpr0, %stack.9, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
-    ; NOSHARE-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @func + 4, target-flags(amdgpu-rel32-hi) @func + 4, implicit-def dead $scc
-    ; NOSHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr4, 2, killed $vgpr2, implicit-def $sgpr4_sgpr5, implicit $sgpr4_sgpr5
-    ; NOSHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR killed $sgpr5, 3, killed $vgpr2, implicit killed $sgpr4_sgpr5
-    ; NOSHARE-NEXT: SI_SPILL_WWM_V32_SAVE $vgpr2, %stack.8, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5)
-    ; NOSHARE-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    ; NOSHARE-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 2, implicit-def $sgpr4_sgpr5
-    ; NOSHARE-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR killed $vgpr2, 3
-    ; NOSHARE-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit undef $vgpr0
-    ; NOSHARE-NEXT: renamable $vgpr1 = SI_SPILL_WWM_V32_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5)
-    ; NOSHARE-NEXT: $sgpr32 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 0
-    ; NOSHARE-NEXT: renamable $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr32, 4, killed $vgpr1
-    ; NOSHARE-NEXT: $sgpr0 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 1
-    ; NOSHARE-NEXT: renamable $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr2, 5, killed $vgpr1, implicit-def $sgpr2_sgpr3, implicit $sgpr2_sgpr3
-    ; NOSHARE-NEXT: renamable $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr3, 6, killed $vgpr1, implicit $sgpr2_sgpr3
-    ; NOSHARE-NEXT: SI_SPILL_WWM_V32_SAVE $vgpr1, %stack.8, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5)
-    ; NOSHARE-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    ; NOSHARE-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    ; NOSHARE-NEXT: $vgpr0 = SI_SPILL_V32_RESTORE %stack.9, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
-    ; NOSHARE-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 2, implicit-def $sgpr4_sgpr5
-    ; NOSHARE-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR killed $vgpr1, 3
-    ; NOSHARE-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit $vgpr0
-    ; NOSHARE-NEXT: renamable $vgpr0 = SI_SPILL_WWM_V32_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5)
-    ; NOSHARE-NEXT: $sgpr32 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 4
-    ; NOSHARE-NEXT: $sgpr2 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 5, implicit-def $sgpr2_sgpr3
-    ; NOSHARE-NEXT: $sgpr3 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 6
-    ; NOSHARE-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    ; SGPR_SPILLED-LABEL: name: stack-slot-color-after-sgpr-alloc-3
+    ; SGPR_SPILLED: liveins: $sgpr30, $sgpr31, $vgpr62
+    ; SGPR_SPILLED-NEXT: {{  $}}
+    ; SGPR_SPILLED-NEXT: renamable $vgpr2 = IMPLICIT_DEF
+    ; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR killed $sgpr30, 0, $vgpr62
+    ; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR killed $sgpr31, 1, $vgpr62
+    ; SGPR_SPILLED-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr32, 0, killed $vgpr2
+    ; SGPR_SPILLED-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr0, 1, killed $vgpr2
+    ; SGPR_SPILLED-NEXT: renamable $vgpr0_vgpr1 = IMPLICIT_DEF
+    ; SGPR_SPILLED-NEXT: renamable $vgpr0 = FLAT_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+    ; SGPR_SPILLED-NEXT: SI_SPILL_V32_SAVE killed $vgpr0, %stack.9, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
+    ; SGPR_SPILLED-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @func + 4, target-flags(amdgpu-rel32-hi) @func + 4, implicit-def dead $scc
+    ; SGPR_SPILLED-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr4, 3, killed $vgpr2, implicit-def $sgpr4_sgpr5, implicit $sgpr4_sgpr5
+    ; SGPR_SPILLED-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR killed $sgpr5, 4, killed $vgpr2, implicit killed $sgpr4_sgpr5
+    ; SGPR_SPILLED-NEXT: SI_SPILL_WWM_V32_SAVE $vgpr2, %stack.8, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5)
+    ; SGPR_SPILLED-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    ; SGPR_SPILLED-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 3, implicit-def $sgpr4_sgpr5
+    ; SGPR_SPILLED-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR killed $vgpr2, 4
+    ; SGPR_SPILLED-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit undef $vgpr0
+    ; SGPR_SPILLED-NEXT: renamable $vgpr1 = SI_SPILL_WWM_AV32_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5)
+    ; SGPR_SPILLED-NEXT: $sgpr32 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 0
+    ; SGPR_SPILLED-NEXT: $sgpr0 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 1
+    ; SGPR_SPILLED-NEXT: renamable $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr2, 1, killed $vgpr1, implicit-def $sgpr2_sgpr3, implicit $sgpr2_sgpr3
+    ; SGPR_SPILLED-NEXT: renamable $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr3, 2, killed $vgpr1, implicit $sgpr2_sgpr3
+    ; SGPR_SPILLED-NEXT: SI_SPILL_WWM_V32_SAVE $vgpr1, %stack.8, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5)
+    ; SGPR_SPILLED-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    ; SGPR_SPILLED-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+    ; SGPR_SPILLED-NEXT: $vgpr0 = SI_SPILL_V32_RESTORE %stack.9, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
+    ; SGPR_SPILLED-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 3, implicit-def $sgpr4_sgpr5
+    ; SGPR_SPILLED-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR killed $vgpr1, 4
+    ; SGPR_SPILLED-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit $vgpr0
+    ; SGPR_SPILLED-NEXT: renamable $vgpr0 = SI_SPILL_WWM_AV32_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5)
+    ; SGPR_SPILLED-NEXT: $sgpr32 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0
+    ; SGPR_SPILLED-NEXT: $sgpr2 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 1, implicit-def $sgpr2_sgpr3
+    ; SGPR_SPILLED-NEXT: $sgpr3 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 2
+    ; SGPR_SPILLED-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
     %0:sreg_32_xm0 = COPY $sgpr32
     %5:sreg_32 = COPY $sgpr0
     %1:vreg_64 = IMPLICIT_DEF

>From 6f2ada68047677a1378390be6e729d2ca6c9fce5 Mon Sep 17 00:00:00 2001
From: vg0204 <Vikash.Gupta at amd.com>
Date: Wed, 5 Jun 2024 11:18:45 +0530
Subject: [PATCH 5/9] Remove unwanted changes not needed in this PR, and
 refactored some comments.

---
 llvm/lib/CodeGen/StackSlotColoring.cpp       | 2 +-
 llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp | 3 ++-
 llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp    | 9 ++++++---
 3 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/CodeGen/StackSlotColoring.cpp b/llvm/lib/CodeGen/StackSlotColoring.cpp
index fbfc3ae45321c..946d12414a98f 100644
--- a/llvm/lib/CodeGen/StackSlotColoring.cpp
+++ b/llvm/lib/CodeGen/StackSlotColoring.cpp
@@ -559,4 +559,4 @@ bool StackSlotColoring::runOnMachineFunction(MachineFunction &MF) {
   Assignments.clear();
 
   return Changed;
-}
\ No newline at end of file
+}
diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
index e379d24d48a21..b6a0152f6fa83 100644
--- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
@@ -284,8 +284,9 @@ void SILowerSGPRSpills::extendWWMVirtRegLiveness(MachineFunction &MF,
       MFI->setFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG);
       // Set SGPR_SPILL asm printer flag
       MIB->setAsmPrinterFlag(AMDGPU::SGPR_SPILL);
-      if (LIS)
+      if (LIS) {
         LIS->InsertMachineInstrInMaps(*MIB);
+      }
     }
   }
 
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 0442402bfa6b6..5650bf01bb93e 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -1775,10 +1775,13 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, int Index,
 
   if (SpillToVGPR) {
 
+    // Since stack slot coloring pass is trying to optimize SGPR spills,
+    // VGPR lanes (mapped from spill stack slot) may be shared for unequal SGPR
+    // spills. This accounts for number of VGPR lanes alloted equal to the
+    // largest SGPR being spilled in them.
     assert(SB.NumSubRegs <= VGPRSpills.size() &&
-           "Num of VGPR lanes should be greater or equal to num of SGPRs "
-           "spilled, as Stack Slot Coloring pass assigns different SGPR spills "
-           "into same stack slots");
+           "Num of VGPR lanes mapped should be greater or equal to num of "
+           "SGPRs spilled");
 
     for (unsigned i = 0, e = SB.NumSubRegs; i < e; ++i) {
       Register SubReg =

>From 5eb4cb83c06717a5478eb88c37062ef18e29c319 Mon Sep 17 00:00:00 2001
From: vg0204 <Vikash.Gupta at amd.com>
Date: Wed, 5 Jun 2024 15:23:23 +0530
Subject: [PATCH 6/9] Made comment changes in SIRegisterInfo.cpp.

---
 llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 5650bf01bb93e..b4f1aaaa1705d 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -1776,12 +1776,12 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, int Index,
   if (SpillToVGPR) {
 
     // Since stack slot coloring pass is trying to optimize SGPR spills,
-    // VGPR lanes (mapped from spill stack slot) may be shared for unequal SGPR
-    // spills. This accounts for number of VGPR lanes alloted equal to the
-    // largest SGPR being spilled in them.
+    // VGPR lanes (mapped from spill stack slot) may be shared for SGPR
+    // spills of different sizes. This accounts for number of VGPR lanes alloted
+    // equal to the largest SGPR being spilled in them.
     assert(SB.NumSubRegs <= VGPRSpills.size() &&
-           "Num of VGPR lanes mapped should be greater or equal to num of "
-           "SGPRs spilled");
+           "Num of SGPRs spilled should be less than or equal to num of "
+           "the VGPR lanes.");
 
     for (unsigned i = 0, e = SB.NumSubRegs; i < e; ++i) {
       Register SubReg =

>From 24721cbcd6b4224e1ea7c5c2a172307da99080a9 Mon Sep 17 00:00:00 2001
From: vg0204 <Vikash.Gupta at amd.com>
Date: Thu, 30 May 2024 13:00:08 +0530
Subject: [PATCH 7/9] [CodeGen] Preserved additional analyses in
 StackSlotColoring pass.

The pass pipeline of some architecture splits register allocation
phase based on different register classes. As some analyses need to
be computed at the beginning of the register allocation and kept
alive till all values are assigned to some physical registers.

This poses challenge with objective of introducing StackSlotColoring
after partial virtual registers are assigned to physical registers,
in order to optimize stack slots usage.As this pass doesn't preserve
few analysis yet to be needed by the register allocation of the
remaining virtual registers, necessiating them to be kept preserved.
---
 llvm/lib/CodeGen/StackSlotColoring.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/llvm/lib/CodeGen/StackSlotColoring.cpp b/llvm/lib/CodeGen/StackSlotColoring.cpp
index 946d12414a98f..eb7a113b575f7 100644
--- a/llvm/lib/CodeGen/StackSlotColoring.cpp
+++ b/llvm/lib/CodeGen/StackSlotColoring.cpp
@@ -13,6 +13,7 @@
 #include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LiveDebugVariables.h"
 #include "llvm/CodeGen/LiveInterval.h"
 #include "llvm/CodeGen/LiveIntervalUnion.h"
 #include "llvm/CodeGen/LiveIntervals.h"
@@ -64,6 +65,7 @@ namespace {
     MachineFrameInfo *MFI = nullptr;
     const TargetInstrInfo *TII = nullptr;
     const MachineBlockFrequencyInfo *MBFI = nullptr;
+    SlotIndexes *Indexes = nullptr;
 
     // SSIntervals - Spill slot intervals.
     std::vector<LiveInterval*> SSIntervals;
@@ -526,6 +528,7 @@ bool StackSlotColoring::runOnMachineFunction(MachineFunction &MF) {
   TII = MF.getSubtarget().getInstrInfo();
   LS = &getAnalysis<LiveStacks>();
   MBFI = &getAnalysis<MachineBlockFrequencyInfo>();
+  Indexes = &getAnalysis<SlotIndexes>();
 
   bool Changed = false;
 

>From d3edd1e2ca6aedab510ce6aa6389dbe1c4f44914 Mon Sep 17 00:00:00 2001
From: vg0204 <Vikash.Gupta at amd.com>
Date: Thu, 6 Jun 2024 12:36:24 +0530
Subject: [PATCH 8/9] Updated the  MIR LIT showing the VGPR lane usage during
 SGPR spills.

---
 .../stack-slot-color-after-sgpr-regalloc.mir  | 379 ------------------
 1 file changed, 379 deletions(-)
 delete mode 100644 llvm/test/CodeGen/AMDGPU/stack-slot-color-after-sgpr-regalloc.mir

diff --git a/llvm/test/CodeGen/AMDGPU/stack-slot-color-after-sgpr-regalloc.mir b/llvm/test/CodeGen/AMDGPU/stack-slot-color-after-sgpr-regalloc.mir
deleted file mode 100644
index 7cd893efe0b79..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/stack-slot-color-after-sgpr-regalloc.mir
+++ /dev/null
@@ -1,379 +0,0 @@
-# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -verify-machineinstrs -stress-regalloc=3 -start-before=greedy -stop-before=stack-slot-coloring -o - %s | FileCheck -check-prefix=BEFORE %s
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -verify-machineinstrs -stress-regalloc=3 -start-before=greedy -stop-after=stack-slot-coloring -o - %s | FileCheck -check-prefix=AFTER %s
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -verify-machineinstrs -stress-regalloc=3 -start-before=greedy -stop-before=prologepilog -o - %s | FileCheck -check-prefix=SGPR_SPILLED %s
-
-# This file contains 3 test cases to observe the optimal stack slot usage for SGPR spills utilizing Stack Slot Coloring pass.
-# @stack-slot-color-after-sgpr-alloc-1 : In this, the stack slot indices is shared among the spill stack objects of equal size.
-# @stack-slot-color-after-sgpr-alloc-2 AND @stack-slot-color-after-sgpr-alloc-3 :
-# In the remaining 2 test cases mentioned in above line, the stack slot indices is shared among the spill stack objects of 
-# unequal size, with spill slot having the size of the largest of the stack objects sharing the common stack indices.
-
---- |
-  define void @stack-slot-color-after-sgpr-alloc-1(ptr addrspace(1) nocapture readnone %arg, ptr addrspace(1) noalias %arg1) {
-  bb:
-    %tmp = load i32, ptr addrspace(1) null, align 4
-    call void @func(i32 undef)
-    call void @func(i32 %tmp)
-    unreachable
-  }
-
-  define void @stack-slot-color-after-sgpr-alloc-2(ptr addrspace(1) nocapture readnone %arg, ptr addrspace(1) noalias %arg1) {
-  bb:
-    %tmp = load i32, ptr addrspace(1) null, align 4
-    call void @func(i32 undef)
-    call void @func(i32 %tmp)
-    unreachable
-  }
-
-  define void @stack-slot-color-after-sgpr-alloc-3(ptr addrspace(1) nocapture readnone %arg, ptr addrspace(1) noalias %arg1) {
-  bb:
-    %tmp = load i32, ptr addrspace(1) null, align 4
-    call void @func(i32 undef)
-    call void @func(i32 %tmp)
-    unreachable
-  }
-
-  declare void @func(i32)
-...
-
----
-name:            stack-slot-color-after-sgpr-alloc-1
-tracksRegLiveness: true
-frameInfo:
-  adjustsStack:    true
-  hasCalls:        true
-machineFunctionInfo:
-  scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  frameOffsetReg: $sgpr32
-  stackPtrOffsetReg: $sgpr32
-body:             |
-  bb.0:
-    ; BEFORE-LABEL: name: stack-slot-color-after-sgpr-alloc-1
-    ; BEFORE: SI_SPILL_S32_SAVE $sgpr32, %stack.2, implicit $exec, implicit $sgpr32 :: (store (s32) into %stack.2, addrspace 5)
-    ; BEFORE-NEXT: SI_SPILL_S64_SAVE $sgpr0_sgpr1, %stack.1, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.1, align 4, addrspace 5)
-    ; BEFORE-NEXT: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-    ; BEFORE-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[DEF]], 0, 0, implicit $exec, implicit $flat_scr
-    ; BEFORE-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @func + 4, target-flags(amdgpu-rel32-hi) @func + 4, implicit-def dead $scc
-    ; BEFORE-NEXT: SI_SPILL_S64_SAVE killed renamable $sgpr4_sgpr5, %stack.0, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.0, align 4, addrspace 5)
-    ; BEFORE-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    ; BEFORE-NEXT: renamable $sgpr4_sgpr5 = SI_SPILL_S64_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.0, align 4, addrspace 5)
-    ; BEFORE-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit undef $vgpr0
-    ; BEFORE-NEXT: $sgpr32 = SI_SPILL_S32_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.2, addrspace 5)
-    ; BEFORE-NEXT: SI_SPILL_S32_SAVE $sgpr32, %stack.3, implicit $exec, implicit $sgpr32 :: (store (s32) into %stack.3, addrspace 5)
-    ; BEFORE-NEXT: $sgpr0_sgpr1 = SI_SPILL_S64_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.1, align 4, addrspace 5)
-    ; BEFORE-NEXT: SI_SPILL_S64_SAVE $sgpr2_sgpr3, %stack.4, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.4, align 4, addrspace 5)
-    ; BEFORE-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    ; BEFORE-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    ; BEFORE-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
-    ; BEFORE-NEXT: renamable $sgpr4_sgpr5 = SI_SPILL_S64_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.0, align 4, addrspace 5)
-    ; BEFORE-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit killed $vgpr0
-    ; BEFORE-NEXT: $sgpr32 = SI_SPILL_S32_RESTORE %stack.3, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.3, addrspace 5)
-    ; BEFORE-NEXT: $sgpr2_sgpr3 = SI_SPILL_S64_RESTORE %stack.4, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.4, align 4, addrspace 5)
-    ; BEFORE-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    ;
-    ; AFTER-LABEL: name: stack-slot-color-after-sgpr-alloc-1
-    ; AFTER: SI_SPILL_S32_SAVE $sgpr32, %stack.2, implicit $exec, implicit $sgpr32 :: (store (s32) into %stack.2, addrspace 5)
-    ; AFTER-NEXT: SI_SPILL_S64_SAVE $sgpr0_sgpr1, %stack.1, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.1, align 4, addrspace 5)
-    ; AFTER-NEXT: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-    ; AFTER-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[DEF]], 0, 0, implicit $exec, implicit $flat_scr
-    ; AFTER-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @func + 4, target-flags(amdgpu-rel32-hi) @func + 4, implicit-def dead $scc
-    ; AFTER-NEXT: SI_SPILL_S64_SAVE killed renamable $sgpr4_sgpr5, %stack.0, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.0, align 4, addrspace 5)
-    ; AFTER-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    ; AFTER-NEXT: renamable $sgpr4_sgpr5 = SI_SPILL_S64_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.0, align 4, addrspace 5)
-    ; AFTER-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit undef $vgpr0
-    ; AFTER-NEXT: $sgpr32 = SI_SPILL_S32_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.2, addrspace 5)
-    ; AFTER-NEXT: $sgpr0_sgpr1 = SI_SPILL_S64_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.1, align 4, addrspace 5)
-    ; AFTER-NEXT: SI_SPILL_S64_SAVE $sgpr2_sgpr3, %stack.1, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.1, align 4, addrspace 5)
-    ; AFTER-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    ; AFTER-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    ; AFTER-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
-    ; AFTER-NEXT: renamable $sgpr4_sgpr5 = SI_SPILL_S64_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.0, align 4, addrspace 5)
-    ; AFTER-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit killed $vgpr0
-    ; AFTER-NEXT: $sgpr32 = SI_SPILL_S32_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.2, addrspace 5)
-    ; AFTER-NEXT: $sgpr2_sgpr3 = SI_SPILL_S64_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.1, align 4, addrspace 5)
-    ; AFTER-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    ;
-    ; SGPR_SPILLED-LABEL: name: stack-slot-color-after-sgpr-alloc-1
-    ; SGPR_SPILLED: liveins: $sgpr30, $sgpr31, $vgpr62
-    ; SGPR_SPILLED-NEXT: {{  $}}
-    ; SGPR_SPILLED-NEXT: renamable $vgpr2 = IMPLICIT_DEF
-    ; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR killed $sgpr30, 0, $vgpr62
-    ; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR killed $sgpr31, 1, $vgpr62
-    ; SGPR_SPILLED-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr32, 0, killed $vgpr2
-    ; SGPR_SPILLED-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr0, 1, killed $vgpr2, implicit-def $sgpr0_sgpr1, implicit $sgpr0_sgpr1
-    ; SGPR_SPILLED-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr1, 2, killed $vgpr2, implicit $sgpr0_sgpr1
-    ; SGPR_SPILLED-NEXT: renamable $vgpr0_vgpr1 = IMPLICIT_DEF
-    ; SGPR_SPILLED-NEXT: renamable $vgpr0 = FLAT_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
-    ; SGPR_SPILLED-NEXT: SI_SPILL_V32_SAVE killed $vgpr0, %stack.9, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
-    ; SGPR_SPILLED-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @func + 4, target-flags(amdgpu-rel32-hi) @func + 4, implicit-def dead $scc
-    ; SGPR_SPILLED-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr4, 3, killed $vgpr2, implicit-def $sgpr4_sgpr5, implicit $sgpr4_sgpr5
-    ; SGPR_SPILLED-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR killed $sgpr5, 4, killed $vgpr2, implicit killed $sgpr4_sgpr5
-    ; SGPR_SPILLED-NEXT: SI_SPILL_WWM_V32_SAVE $vgpr2, %stack.8, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5)
-    ; SGPR_SPILLED-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    ; SGPR_SPILLED-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 3, implicit-def $sgpr4_sgpr5
-    ; SGPR_SPILLED-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR killed $vgpr2, 4
-    ; SGPR_SPILLED-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit undef $vgpr0
-    ; SGPR_SPILLED-NEXT: renamable $vgpr1 = SI_SPILL_WWM_AV32_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5)
-    ; SGPR_SPILLED-NEXT: $sgpr32 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 0
-    ; SGPR_SPILLED-NEXT: $sgpr0 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 1, implicit-def $sgpr0_sgpr1
-    ; SGPR_SPILLED-NEXT: $sgpr1 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 2
-    ; SGPR_SPILLED-NEXT: renamable $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr2, 1, killed $vgpr1, implicit-def $sgpr2_sgpr3, implicit $sgpr2_sgpr3
-    ; SGPR_SPILLED-NEXT: renamable $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr3, 2, killed $vgpr1, implicit $sgpr2_sgpr3
-    ; SGPR_SPILLED-NEXT: SI_SPILL_WWM_V32_SAVE $vgpr1, %stack.8, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5)
-    ; SGPR_SPILLED-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    ; SGPR_SPILLED-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    ; SGPR_SPILLED-NEXT: $vgpr0 = SI_SPILL_V32_RESTORE %stack.9, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
-    ; SGPR_SPILLED-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 3, implicit-def $sgpr4_sgpr5
-    ; SGPR_SPILLED-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR killed $vgpr1, 4
-    ; SGPR_SPILLED-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit $vgpr0
-    ; SGPR_SPILLED-NEXT: renamable $vgpr0 = SI_SPILL_WWM_AV32_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5)
-    ; SGPR_SPILLED-NEXT: $sgpr32 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0
-    ; SGPR_SPILLED-NEXT: $sgpr2 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 1, implicit-def $sgpr2_sgpr3
-    ; SGPR_SPILLED-NEXT: $sgpr3 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 2
-    ; SGPR_SPILLED-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    %0:sreg_32_xm0 = COPY $sgpr32
-    %5:sreg_64 = COPY $sgpr0_sgpr1
-    %1:vreg_64 = IMPLICIT_DEF
-    %2:vgpr_32 = FLAT_LOAD_DWORD %1, 0, 0, implicit $exec, implicit $flat_scr
-    %3:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @func + 4, target-flags(amdgpu-rel32-hi) @func + 4, implicit-def dead $scc
-    ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    dead $sgpr30_sgpr31 = SI_CALL %3, @func, csr_amdgpu, implicit undef $vgpr0
-    $sgpr32 = COPY %0
-    %4:sreg_32_xm0 = COPY $sgpr32
-    $sgpr0_sgpr1 = COPY %5
-    %6:sreg_64 = COPY $sgpr2_sgpr3
-    ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    $vgpr0 = COPY %2
-    dead $sgpr30_sgpr31 = SI_CALL %3, @func, csr_amdgpu, implicit killed $vgpr0
-    $sgpr32 = COPY %4
-    $sgpr2_sgpr3 = COPY %6
-    ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-...
-
----
-name:            stack-slot-color-after-sgpr-alloc-2
-tracksRegLiveness: true
-frameInfo:
-  adjustsStack:    true
-  hasCalls:        true
-machineFunctionInfo:
-  scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  frameOffsetReg: $sgpr32
-  stackPtrOffsetReg: $sgpr32
-body:             |
-  bb.0:
-    ; BEFORE-LABEL: name: stack-slot-color-after-sgpr-alloc-2
-    ; BEFORE: SI_SPILL_S32_SAVE $sgpr32, %stack.2, implicit $exec, implicit $sgpr32 :: (store (s32) into %stack.2, addrspace 5)
-    ; BEFORE-NEXT: SI_SPILL_S64_SAVE $sgpr2_sgpr3, %stack.1, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.1, align 4, addrspace 5)
-    ; BEFORE-NEXT: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-    ; BEFORE-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[DEF]], 0, 0, implicit $exec, implicit $flat_scr
-    ; BEFORE-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @func + 4, target-flags(amdgpu-rel32-hi) @func + 4, implicit-def dead $scc
-    ; BEFORE-NEXT: SI_SPILL_S64_SAVE killed renamable $sgpr4_sgpr5, %stack.0, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.0, align 4, addrspace 5)
-    ; BEFORE-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    ; BEFORE-NEXT: renamable $sgpr4_sgpr5 = SI_SPILL_S64_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.0, align 4, addrspace 5)
-    ; BEFORE-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit undef $vgpr0
-    ; BEFORE-NEXT: $sgpr32 = SI_SPILL_S32_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.2, addrspace 5)
-    ; BEFORE-NEXT: SI_SPILL_S32_SAVE $sgpr32, %stack.3, implicit $exec, implicit $sgpr32 :: (store (s32) into %stack.3, addrspace 5)
-    ; BEFORE-NEXT: $sgpr2_sgpr3 = SI_SPILL_S64_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.1, align 4, addrspace 5)
-    ; BEFORE-NEXT: SI_SPILL_S32_SAVE $sgpr2, %stack.4, implicit $exec, implicit $sgpr32 :: (store (s32) into %stack.4, addrspace 5)
-    ; BEFORE-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    ; BEFORE-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    ; BEFORE-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
-    ; BEFORE-NEXT: renamable $sgpr4_sgpr5 = SI_SPILL_S64_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.0, align 4, addrspace 5)
-    ; BEFORE-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit killed $vgpr0
-    ; BEFORE-NEXT: $sgpr32 = SI_SPILL_S32_RESTORE %stack.3, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.3, addrspace 5)
-    ; BEFORE-NEXT: $sgpr2 = SI_SPILL_S32_RESTORE %stack.4, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.4, addrspace 5)
-    ; BEFORE-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    ;
-    ; AFTER-LABEL: name: stack-slot-color-after-sgpr-alloc-2
-    ; AFTER: SI_SPILL_S32_SAVE $sgpr32, %stack.2, implicit $exec, implicit $sgpr32 :: (store (s32) into %stack.2, addrspace 5)
-    ; AFTER-NEXT: SI_SPILL_S64_SAVE $sgpr2_sgpr3, %stack.1, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.1, align 4, addrspace 5)
-    ; AFTER-NEXT: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-    ; AFTER-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[DEF]], 0, 0, implicit $exec, implicit $flat_scr
-    ; AFTER-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @func + 4, target-flags(amdgpu-rel32-hi) @func + 4, implicit-def dead $scc
-    ; AFTER-NEXT: SI_SPILL_S64_SAVE killed renamable $sgpr4_sgpr5, %stack.0, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.0, align 4, addrspace 5)
-    ; AFTER-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    ; AFTER-NEXT: renamable $sgpr4_sgpr5 = SI_SPILL_S64_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.0, align 4, addrspace 5)
-    ; AFTER-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit undef $vgpr0
-    ; AFTER-NEXT: $sgpr32 = SI_SPILL_S32_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.2, addrspace 5)
-    ; AFTER-NEXT: $sgpr2_sgpr3 = SI_SPILL_S64_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.1, align 4, addrspace 5)
-    ; AFTER-NEXT: SI_SPILL_S32_SAVE $sgpr2, %stack.1, implicit $exec, implicit $sgpr32 :: (store (s32) into %stack.1, addrspace 5)
-    ; AFTER-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    ; AFTER-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    ; AFTER-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
-    ; AFTER-NEXT: renamable $sgpr4_sgpr5 = SI_SPILL_S64_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.0, align 4, addrspace 5)
-    ; AFTER-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit killed $vgpr0
-    ; AFTER-NEXT: $sgpr32 = SI_SPILL_S32_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.2, addrspace 5)
-    ; AFTER-NEXT: $sgpr2 = SI_SPILL_S32_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.1, addrspace 5)
-    ; AFTER-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    ;
-    ; SGPR_SPILLED-LABEL: name: stack-slot-color-after-sgpr-alloc-2
-    ; SGPR_SPILLED: liveins: $sgpr30, $sgpr31, $vgpr62
-    ; SGPR_SPILLED-NEXT: {{  $}}
-    ; SGPR_SPILLED-NEXT: renamable $vgpr2 = IMPLICIT_DEF
-    ; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR killed $sgpr30, 0, $vgpr62
-    ; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR killed $sgpr31, 1, $vgpr62
-    ; SGPR_SPILLED-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr32, 0, killed $vgpr2
-    ; SGPR_SPILLED-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr2, 1, killed $vgpr2, implicit-def $sgpr2_sgpr3, implicit $sgpr2_sgpr3
-    ; SGPR_SPILLED-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr3, 2, killed $vgpr2, implicit $sgpr2_sgpr3
-    ; SGPR_SPILLED-NEXT: renamable $vgpr0_vgpr1 = IMPLICIT_DEF
-    ; SGPR_SPILLED-NEXT: renamable $vgpr0 = FLAT_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
-    ; SGPR_SPILLED-NEXT: SI_SPILL_V32_SAVE killed $vgpr0, %stack.9, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
-    ; SGPR_SPILLED-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @func + 4, target-flags(amdgpu-rel32-hi) @func + 4, implicit-def dead $scc
-    ; SGPR_SPILLED-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr4, 3, killed $vgpr2, implicit-def $sgpr4_sgpr5, implicit $sgpr4_sgpr5
-    ; SGPR_SPILLED-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR killed $sgpr5, 4, killed $vgpr2, implicit killed $sgpr4_sgpr5
-    ; SGPR_SPILLED-NEXT: SI_SPILL_WWM_V32_SAVE $vgpr2, %stack.8, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5)
-    ; SGPR_SPILLED-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    ; SGPR_SPILLED-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 3, implicit-def $sgpr4_sgpr5
-    ; SGPR_SPILLED-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR killed $vgpr2, 4
-    ; SGPR_SPILLED-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit undef $vgpr0
-    ; SGPR_SPILLED-NEXT: renamable $vgpr1 = SI_SPILL_WWM_AV32_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5)
-    ; SGPR_SPILLED-NEXT: $sgpr32 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 0
-    ; SGPR_SPILLED-NEXT: $sgpr2 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 1, implicit-def $sgpr2_sgpr3
-    ; SGPR_SPILLED-NEXT: $sgpr3 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 2
-    ; SGPR_SPILLED-NEXT: renamable $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr2, 1, killed $vgpr1
-    ; SGPR_SPILLED-NEXT: SI_SPILL_WWM_V32_SAVE $vgpr1, %stack.8, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5)
-    ; SGPR_SPILLED-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    ; SGPR_SPILLED-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    ; SGPR_SPILLED-NEXT: $vgpr0 = SI_SPILL_V32_RESTORE %stack.9, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
-    ; SGPR_SPILLED-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 3, implicit-def $sgpr4_sgpr5
-    ; SGPR_SPILLED-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR killed $vgpr1, 4
-    ; SGPR_SPILLED-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit $vgpr0
-    ; SGPR_SPILLED-NEXT: renamable $vgpr0 = SI_SPILL_WWM_AV32_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5)
-    ; SGPR_SPILLED-NEXT: $sgpr32 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0
-    ; SGPR_SPILLED-NEXT: $sgpr2 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 1
-    ; SGPR_SPILLED-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    %0:sreg_32_xm0 = COPY $sgpr32
-    %5:sreg_64 = COPY $sgpr2_sgpr3
-    %1:vreg_64 = IMPLICIT_DEF
-    %2:vgpr_32 = FLAT_LOAD_DWORD %1, 0, 0, implicit $exec, implicit $flat_scr
-    %3:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @func + 4, target-flags(amdgpu-rel32-hi) @func + 4, implicit-def dead $scc
-    ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    dead $sgpr30_sgpr31 = SI_CALL %3, @func, csr_amdgpu, implicit undef $vgpr0
-    $sgpr32 = COPY %0
-    %4:sreg_32_xm0 = COPY $sgpr32
-    $sgpr2_sgpr3 = COPY %5
-    %6:sreg_32 = COPY $sgpr2
-    ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    $vgpr0 = COPY %2
-    dead $sgpr30_sgpr31 = SI_CALL %3, @func, csr_amdgpu, implicit killed $vgpr0
-    $sgpr32 = COPY %4
-    $sgpr2 = COPY %6
-    ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-...
-
----
-name:            stack-slot-color-after-sgpr-alloc-3
-tracksRegLiveness: true
-frameInfo:
-  adjustsStack:    true
-  hasCalls:        true
-machineFunctionInfo:
-  scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  frameOffsetReg: $sgpr32
-  stackPtrOffsetReg: $sgpr32
-body:             |
-  bb.0:
-    ; BEFORE-LABEL: name: stack-slot-color-after-sgpr-alloc-3
-    ; BEFORE: SI_SPILL_S32_SAVE $sgpr32, %stack.2, implicit $exec, implicit $sgpr32 :: (store (s32) into %stack.2, addrspace 5)
-    ; BEFORE-NEXT: SI_SPILL_S32_SAVE $sgpr0, %stack.1, implicit $exec, implicit $sgpr32 :: (store (s32) into %stack.1, addrspace 5)
-    ; BEFORE-NEXT: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-    ; BEFORE-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[DEF]], 0, 0, implicit $exec, implicit $flat_scr
-    ; BEFORE-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @func + 4, target-flags(amdgpu-rel32-hi) @func + 4, implicit-def dead $scc
-    ; BEFORE-NEXT: SI_SPILL_S64_SAVE killed renamable $sgpr4_sgpr5, %stack.0, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.0, align 4, addrspace 5)
-    ; BEFORE-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    ; BEFORE-NEXT: renamable $sgpr4_sgpr5 = SI_SPILL_S64_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.0, align 4, addrspace 5)
-    ; BEFORE-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit undef $vgpr0
-    ; BEFORE-NEXT: $sgpr32 = SI_SPILL_S32_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.2, addrspace 5)
-    ; BEFORE-NEXT: SI_SPILL_S32_SAVE $sgpr32, %stack.3, implicit $exec, implicit $sgpr32 :: (store (s32) into %stack.3, addrspace 5)
-    ; BEFORE-NEXT: $sgpr0 = SI_SPILL_S32_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.1, addrspace 5)
-    ; BEFORE-NEXT: SI_SPILL_S64_SAVE $sgpr2_sgpr3, %stack.4, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.4, align 4, addrspace 5)
-    ; BEFORE-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    ; BEFORE-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    ; BEFORE-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
-    ; BEFORE-NEXT: renamable $sgpr4_sgpr5 = SI_SPILL_S64_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.0, align 4, addrspace 5)
-    ; BEFORE-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit killed $vgpr0
-    ; BEFORE-NEXT: $sgpr32 = SI_SPILL_S32_RESTORE %stack.3, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.3, addrspace 5)
-    ; BEFORE-NEXT: $sgpr2_sgpr3 = SI_SPILL_S64_RESTORE %stack.4, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.4, align 4, addrspace 5)
-    ; BEFORE-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    ;
-    ; AFTER-LABEL: name: stack-slot-color-after-sgpr-alloc-3
-    ; AFTER: SI_SPILL_S32_SAVE $sgpr32, %stack.2, implicit $exec, implicit $sgpr32 :: (store (s32) into %stack.2, addrspace 5)
-    ; AFTER-NEXT: SI_SPILL_S32_SAVE $sgpr0, %stack.1, implicit $exec, implicit $sgpr32 :: (store (s32) into %stack.1, addrspace 5)
-    ; AFTER-NEXT: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-    ; AFTER-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[DEF]], 0, 0, implicit $exec, implicit $flat_scr
-    ; AFTER-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @func + 4, target-flags(amdgpu-rel32-hi) @func + 4, implicit-def dead $scc
-    ; AFTER-NEXT: SI_SPILL_S64_SAVE killed renamable $sgpr4_sgpr5, %stack.0, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.0, align 4, addrspace 5)
-    ; AFTER-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    ; AFTER-NEXT: renamable $sgpr4_sgpr5 = SI_SPILL_S64_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.0, align 4, addrspace 5)
-    ; AFTER-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit undef $vgpr0
-    ; AFTER-NEXT: $sgpr32 = SI_SPILL_S32_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.2, addrspace 5)
-    ; AFTER-NEXT: $sgpr0 = SI_SPILL_S32_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.1, addrspace 5)
-    ; AFTER-NEXT: SI_SPILL_S64_SAVE $sgpr2_sgpr3, %stack.1, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.1, align 4, addrspace 5)
-    ; AFTER-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    ; AFTER-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    ; AFTER-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
-    ; AFTER-NEXT: renamable $sgpr4_sgpr5 = SI_SPILL_S64_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.0, align 4, addrspace 5)
-    ; AFTER-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit killed $vgpr0
-    ; AFTER-NEXT: $sgpr32 = SI_SPILL_S32_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.2, addrspace 5)
-    ; AFTER-NEXT: $sgpr2_sgpr3 = SI_SPILL_S64_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.1, align 4, addrspace 5)
-    ; AFTER-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    ;
-    ; SGPR_SPILLED-LABEL: name: stack-slot-color-after-sgpr-alloc-3
-    ; SGPR_SPILLED: liveins: $sgpr30, $sgpr31, $vgpr62
-    ; SGPR_SPILLED-NEXT: {{  $}}
-    ; SGPR_SPILLED-NEXT: renamable $vgpr2 = IMPLICIT_DEF
-    ; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR killed $sgpr30, 0, $vgpr62
-    ; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR killed $sgpr31, 1, $vgpr62
-    ; SGPR_SPILLED-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr32, 0, killed $vgpr2
-    ; SGPR_SPILLED-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr0, 1, killed $vgpr2
-    ; SGPR_SPILLED-NEXT: renamable $vgpr0_vgpr1 = IMPLICIT_DEF
-    ; SGPR_SPILLED-NEXT: renamable $vgpr0 = FLAT_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
-    ; SGPR_SPILLED-NEXT: SI_SPILL_V32_SAVE killed $vgpr0, %stack.9, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
-    ; SGPR_SPILLED-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @func + 4, target-flags(amdgpu-rel32-hi) @func + 4, implicit-def dead $scc
-    ; SGPR_SPILLED-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr4, 3, killed $vgpr2, implicit-def $sgpr4_sgpr5, implicit $sgpr4_sgpr5
-    ; SGPR_SPILLED-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR killed $sgpr5, 4, killed $vgpr2, implicit killed $sgpr4_sgpr5
-    ; SGPR_SPILLED-NEXT: SI_SPILL_WWM_V32_SAVE $vgpr2, %stack.8, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5)
-    ; SGPR_SPILLED-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    ; SGPR_SPILLED-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 3, implicit-def $sgpr4_sgpr5
-    ; SGPR_SPILLED-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR killed $vgpr2, 4
-    ; SGPR_SPILLED-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit undef $vgpr0
-    ; SGPR_SPILLED-NEXT: renamable $vgpr1 = SI_SPILL_WWM_AV32_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5)
-    ; SGPR_SPILLED-NEXT: $sgpr32 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 0
-    ; SGPR_SPILLED-NEXT: $sgpr0 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 1
-    ; SGPR_SPILLED-NEXT: renamable $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr2, 1, killed $vgpr1, implicit-def $sgpr2_sgpr3, implicit $sgpr2_sgpr3
-    ; SGPR_SPILLED-NEXT: renamable $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr3, 2, killed $vgpr1, implicit $sgpr2_sgpr3
-    ; SGPR_SPILLED-NEXT: SI_SPILL_WWM_V32_SAVE $vgpr1, %stack.8, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5)
-    ; SGPR_SPILLED-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    ; SGPR_SPILLED-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    ; SGPR_SPILLED-NEXT: $vgpr0 = SI_SPILL_V32_RESTORE %stack.9, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
-    ; SGPR_SPILLED-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 3, implicit-def $sgpr4_sgpr5
-    ; SGPR_SPILLED-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR killed $vgpr1, 4
-    ; SGPR_SPILLED-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit $vgpr0
-    ; SGPR_SPILLED-NEXT: renamable $vgpr0 = SI_SPILL_WWM_AV32_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5)
-    ; SGPR_SPILLED-NEXT: $sgpr32 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0
-    ; SGPR_SPILLED-NEXT: $sgpr2 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 1, implicit-def $sgpr2_sgpr3
-    ; SGPR_SPILLED-NEXT: $sgpr3 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 2
-    ; SGPR_SPILLED-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    %0:sreg_32_xm0 = COPY $sgpr32
-    %5:sreg_32 = COPY $sgpr0
-    %1:vreg_64 = IMPLICIT_DEF
-    %2:vgpr_32 = FLAT_LOAD_DWORD %1, 0, 0, implicit $exec, implicit $flat_scr
-    %3:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @func + 4, target-flags(amdgpu-rel32-hi) @func + 4, implicit-def dead $scc
-    ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    dead $sgpr30_sgpr31 = SI_CALL %3, @func, csr_amdgpu, implicit undef $vgpr0
-    $sgpr32 = COPY %0
-    %4:sreg_32_xm0 = COPY $sgpr32
-    $sgpr0 = COPY %5
-    %6:sreg_64 = COPY $sgpr2_sgpr3
-    ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    $vgpr0 = COPY %2
-    dead $sgpr30_sgpr31 = SI_CALL %3, @func, csr_amdgpu, implicit killed $vgpr0
-    $sgpr32 = COPY %4
-    $sgpr2_sgpr3 = COPY %6
-    ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-...

>From 3775a494d5106028e7b55b19deaa8d236088d684 Mon Sep 17 00:00:00 2001
From: vg0204 <Vikash.Gupta at amd.com>
Date: Mon, 17 Jun 2024 17:47:29 +0530
Subject: [PATCH 9/9] Update LIT mir test case with optimized SGPR spill.

---
 .../si-lower-sgpr-spills-vgpr-lanes-usage.mir | 42 +++++++++----------
 1 file changed, 20 insertions(+), 22 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-vgpr-lanes-usage.mir b/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-vgpr-lanes-usage.mir
index 887e9c4b5dc5e..0527036e67498 100644
--- a/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-vgpr-lanes-usage.mir
+++ b/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-vgpr-lanes-usage.mir
@@ -2,6 +2,7 @@
 # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -verify-machineinstrs -stress-regalloc=3 -start-before=greedy,0 -stop-after=si-lower-sgpr-spills -o - %s | FileCheck -check-prefix=SGPR_SPILLED %s
 
 # INFO : The test starts from the sgpr-regalloc pipeline.
+# INFO : Now, StackSlotColoring pass comes just after sgpr-regalloc pipeline.
 
 # This file contains 3 test cases to observe the optimal stack slot usage for SGPR spills utilizing Stack Slot Coloring pass.
 # @stack-slot-share-equal-sized-spills : In this, the stack slot indices is shared among the spill stack objects of equal size.
@@ -41,20 +42,19 @@ body:             |
     ; SGPR_SPILLED-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 4
     ; SGPR_SPILLED-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, 0, csr_amdgpu, implicit undef $vgpr0
     ; SGPR_SPILLED-NEXT: $sgpr32 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 0
-    ; SGPR_SPILLED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr32, 5, [[DEF]]
     ; SGPR_SPILLED-NEXT: $sgpr0 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 1, implicit-def $sgpr0_sgpr1
     ; SGPR_SPILLED-NEXT: $sgpr1 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 2
-    ; SGPR_SPILLED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr2, 6, [[DEF]], implicit-def $sgpr2_sgpr3, implicit $sgpr2_sgpr3
-    ; SGPR_SPILLED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr3, 7, [[DEF]], implicit $sgpr2_sgpr3
+    ; SGPR_SPILLED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr2, 1, [[DEF]], implicit-def $sgpr2_sgpr3, implicit $sgpr2_sgpr3
+    ; SGPR_SPILLED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr3, 2, [[DEF]], implicit $sgpr2_sgpr3
     ; SGPR_SPILLED-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
     ; SGPR_SPILLED-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
     ; SGPR_SPILLED-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
     ; SGPR_SPILLED-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 3, implicit-def $sgpr4_sgpr5
     ; SGPR_SPILLED-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 4
     ; SGPR_SPILLED-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, 0, csr_amdgpu, implicit killed $vgpr0
-    ; SGPR_SPILLED-NEXT: $sgpr32 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 5
-    ; SGPR_SPILLED-NEXT: $sgpr2 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 6, implicit-def $sgpr2_sgpr3
-    ; SGPR_SPILLED-NEXT: $sgpr3 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 7
+    ; SGPR_SPILLED-NEXT: $sgpr32 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 0
+    ; SGPR_SPILLED-NEXT: $sgpr2 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 1, implicit-def $sgpr2_sgpr3
+    ; SGPR_SPILLED-NEXT: $sgpr3 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 2
     ; SGPR_SPILLED-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
     %0:sreg_32_xm0 = COPY $sgpr32
     %5:sreg_64 = COPY $sgpr0_sgpr1
@@ -107,18 +107,17 @@ body:             |
     ; SGPR_SPILLED-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 4
     ; SGPR_SPILLED-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, 0, csr_amdgpu, implicit undef $vgpr0
     ; SGPR_SPILLED-NEXT: $sgpr32 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 0
-    ; SGPR_SPILLED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr32, 5, [[DEF]]
     ; SGPR_SPILLED-NEXT: $sgpr2 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 1, implicit-def $sgpr2_sgpr3
     ; SGPR_SPILLED-NEXT: $sgpr3 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 2
-    ; SGPR_SPILLED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr2, 6, [[DEF]]
+    ; SGPR_SPILLED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr2, 1, [[DEF]]
     ; SGPR_SPILLED-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
     ; SGPR_SPILLED-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
     ; SGPR_SPILLED-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
     ; SGPR_SPILLED-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 3, implicit-def $sgpr4_sgpr5
     ; SGPR_SPILLED-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 4
     ; SGPR_SPILLED-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, 0, csr_amdgpu, implicit killed $vgpr0
-    ; SGPR_SPILLED-NEXT: $sgpr32 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 5
-    ; SGPR_SPILLED-NEXT: $sgpr2 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 6
+    ; SGPR_SPILLED-NEXT: $sgpr32 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 0
+    ; SGPR_SPILLED-NEXT: $sgpr2 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 1
     ; SGPR_SPILLED-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
     %0:sreg_32_xm0 = COPY $sgpr32
     %5:sreg_64 = COPY $sgpr2_sgpr3
@@ -163,26 +162,25 @@ body:             |
     ; SGPR_SPILLED-NEXT: [[DEF1:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
     ; SGPR_SPILLED-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[DEF1]], 0, 0, implicit $exec, implicit $flat_scr
     ; SGPR_SPILLED-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) 4, target-flags(amdgpu-rel32-hi) 4, implicit-def dead $scc
-    ; SGPR_SPILLED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr4, 2, [[DEF]], implicit-def $sgpr4_sgpr5, implicit $sgpr4_sgpr5
-    ; SGPR_SPILLED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR killed $sgpr5, 3, [[DEF]], implicit killed $sgpr4_sgpr5
+    ; SGPR_SPILLED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr4, 3, [[DEF]], implicit-def $sgpr4_sgpr5, implicit $sgpr4_sgpr5
+    ; SGPR_SPILLED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR killed $sgpr5, 4, [[DEF]], implicit killed $sgpr4_sgpr5
     ; SGPR_SPILLED-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    ; SGPR_SPILLED-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 2, implicit-def $sgpr4_sgpr5
-    ; SGPR_SPILLED-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 3
+    ; SGPR_SPILLED-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 3, implicit-def $sgpr4_sgpr5
+    ; SGPR_SPILLED-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 4
     ; SGPR_SPILLED-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, 0, csr_amdgpu, implicit undef $vgpr0
     ; SGPR_SPILLED-NEXT: $sgpr32 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 0
-    ; SGPR_SPILLED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr32, 4, [[DEF]]
     ; SGPR_SPILLED-NEXT: $sgpr0 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 1
-    ; SGPR_SPILLED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr2, 5, [[DEF]], implicit-def $sgpr2_sgpr3, implicit $sgpr2_sgpr3
-    ; SGPR_SPILLED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr3, 6, [[DEF]], implicit $sgpr2_sgpr3
+    ; SGPR_SPILLED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr2, 1, [[DEF]], implicit-def $sgpr2_sgpr3, implicit $sgpr2_sgpr3
+    ; SGPR_SPILLED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr3, 2, [[DEF]], implicit $sgpr2_sgpr3
     ; SGPR_SPILLED-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
     ; SGPR_SPILLED-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
     ; SGPR_SPILLED-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
-    ; SGPR_SPILLED-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 2, implicit-def $sgpr4_sgpr5
-    ; SGPR_SPILLED-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 3
+    ; SGPR_SPILLED-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 3, implicit-def $sgpr4_sgpr5
+    ; SGPR_SPILLED-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 4
     ; SGPR_SPILLED-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, 0, csr_amdgpu, implicit killed $vgpr0
-    ; SGPR_SPILLED-NEXT: $sgpr32 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 4
-    ; SGPR_SPILLED-NEXT: $sgpr2 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 5, implicit-def $sgpr2_sgpr3
-    ; SGPR_SPILLED-NEXT: $sgpr3 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 6
+    ; SGPR_SPILLED-NEXT: $sgpr32 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 0
+    ; SGPR_SPILLED-NEXT: $sgpr2 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 1, implicit-def $sgpr2_sgpr3
+    ; SGPR_SPILLED-NEXT: $sgpr3 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 2
     ; SGPR_SPILLED-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
     %0:sreg_32_xm0 = COPY $sgpr32
     %5:sreg_32 = COPY $sgpr0



More information about the llvm-commits mailing list