[llvm] [AMDGPU] Implemented a patch to optimize SGPR spills (PR #93668)
Vikash Gupta via llvm-commits
llvm-commits at lists.llvm.org
Wed May 29 05:47:42 PDT 2024
https://github.com/vg0204 updated https://github.com/llvm/llvm-project/pull/93668
>From 7e610af395db533f1d54bbd6081d3c05c52ebada Mon Sep 17 00:00:00 2001
From: vg0204 <Vikash.Gupta at amd.com>
Date: Mon, 27 May 2024 17:44:27 +0530
Subject: [PATCH 1/2] [WIP] Implemented a patch to optimize SGPR spills.
Introduced the StackSlotColoring pass after SGPR RegAlloc and Spill to optimize stack slot reuse.
---
llvm/lib/CodeGen/StackSlotColoring.cpp | 18 ++-
.../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 3 +
llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp | 20 ++-
llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 6 +-
llvm/test/CodeGen/AMDGPU/llc-pipeline.ll | 8 ++
.../AMDGPU/preserve-wwm-copy-dst-reg.ll | 72 +++++-----
.../CodeGen/AMDGPU/sgpr-regalloc-flags.ll | 8 ++
.../CodeGen/AMDGPU/spill-scavenge-offset.ll | 84 ++++++------
...er-sgpr-alloc-equal-size-stack-objects.mir | 127 ++++++++++++++++++
...gpr-alloc-unequal-size-stack-objects-2.mir | 122 +++++++++++++++++
...-sgpr-alloc-unequal-size-stack-objects.mir | 123 +++++++++++++++++
11 files changed, 507 insertions(+), 84 deletions(-)
create mode 100755 llvm/test/CodeGen/AMDGPU/stack-slot-color-after-sgpr-alloc-equal-size-stack-objects.mir
create mode 100755 llvm/test/CodeGen/AMDGPU/stack-slot-color-after-sgpr-alloc-unequal-size-stack-objects-2.mir
create mode 100755 llvm/test/CodeGen/AMDGPU/stack-slot-color-after-sgpr-alloc-unequal-size-stack-objects.mir
diff --git a/llvm/lib/CodeGen/StackSlotColoring.cpp b/llvm/lib/CodeGen/StackSlotColoring.cpp
index 9fdc8a338b52a..00cc6e60a1123 100644
--- a/llvm/lib/CodeGen/StackSlotColoring.cpp
+++ b/llvm/lib/CodeGen/StackSlotColoring.cpp
@@ -13,6 +13,7 @@
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LiveDebugVariables.h"
#include "llvm/CodeGen/LiveInterval.h"
#include "llvm/CodeGen/LiveIntervalUnion.h"
#include "llvm/CodeGen/LiveIntervals.h"
@@ -64,6 +65,7 @@ namespace {
MachineFrameInfo *MFI = nullptr;
const TargetInstrInfo *TII = nullptr;
const MachineBlockFrequencyInfo *MBFI = nullptr;
+ SlotIndexes *Indexes = nullptr;
// SSIntervals - Spill slot intervals.
std::vector<LiveInterval*> SSIntervals;
@@ -152,6 +154,14 @@ namespace {
AU.addRequired<MachineBlockFrequencyInfo>();
AU.addPreserved<MachineBlockFrequencyInfo>();
AU.addPreservedID(MachineDominatorsID);
+
+ /// NOTE: As the AMDGPU pass pipeline splits reg alloc into 2 phases
+ /// and StackSlotColoring is invoked after each phase, it becomes
+ /// important to preserve the additional analysis results used by the
+ /// VGPR regAlloc that runs after SGPR regAlloc and its related passes.
+ AU.addPreserved<LiveIntervals>();
+ AU.addPreserved<LiveDebugVariables>();
+
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -496,8 +506,13 @@ bool StackSlotColoring::RemoveDeadStores(MachineBasicBlock* MBB) {
++I;
}
- for (MachineInstr *MI : toErase)
+ /// FIXED: As this pass preserves the SlotIndexes analysis result, any
+ /// addition/removal of an MI needs a corresponding SlotIndexes update.
+ /// NOTE(review): call removeMachineInstrFromMaps *before* eraseFromParent.
+ for (MachineInstr *MI : toErase) {
MI->eraseFromParent();
+ Indexes->removeMachineInstrFromMaps(*MI);
+ }
return changed;
}
@@ -515,6 +530,7 @@ bool StackSlotColoring::runOnMachineFunction(MachineFunction &MF) {
TII = MF.getSubtarget().getInstrInfo();
LS = &getAnalysis<LiveStacks>();
MBFI = &getAnalysis<MachineBlockFrequencyInfo>();
+ Indexes = &getAnalysis<SlotIndexes>();
bool Changed = false;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index dbbfe34a63863..728cf4fe0281a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -1406,6 +1406,9 @@ bool GCNPassConfig::addRegAssignAndRewriteOptimized() {
// since FastRegAlloc does the replacements itself.
addPass(createVirtRegRewriter(false));
+ // Optimizes SGPR spills into VGPR lanes for non-interfering spill ranges.
+ addPass(&StackSlotColoringID);
+
// Equivalent of PEI for SGPRs.
addPass(&SILowerSGPRSpillsID);
addPass(&SIPreAllocateWWMRegsID);
diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
index b6a0152f6fa83..9e121b47ad3fb 100644
--- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
@@ -52,7 +52,8 @@ class SILowerSGPRSpills : public MachineFunctionPass {
void calculateSaveRestoreBlocks(MachineFunction &MF);
bool spillCalleeSavedRegs(MachineFunction &MF,
SmallVectorImpl<int> &CalleeSavedFIs);
- void extendWWMVirtRegLiveness(MachineFunction &MF, LiveIntervals *LIS);
+ void extendWWMVirtRegLiveness(MachineFunction &MF, SlotIndexes *Indexes,
+ LiveIntervals *LIS);
bool runOnMachineFunction(MachineFunction &MF) override;
@@ -260,6 +261,7 @@ bool SILowerSGPRSpills::spillCalleeSavedRegs(
}
void SILowerSGPRSpills::extendWWMVirtRegLiveness(MachineFunction &MF,
+ SlotIndexes *Indexes,
LiveIntervals *LIS) {
// TODO: This is a workaround to avoid the unmodelled liveness computed with
// whole-wave virtual registers when allocated together with the regular VGPR
@@ -278,14 +280,21 @@ void SILowerSGPRSpills::extendWWMVirtRegLiveness(MachineFunction &MF,
for (auto Reg : MFI->getSGPRSpillVGPRs()) {
for (MachineBasicBlock *SaveBlock : SaveBlocks) {
MachineBasicBlock::iterator InsertBefore = SaveBlock->begin();
+ MachineInstrSpan MIS(InsertBefore, SaveBlock);
+
DebugLoc DL = SaveBlock->findDebugLoc(InsertBefore);
auto MIB = BuildMI(*SaveBlock, InsertBefore, DL,
TII->get(AMDGPU::IMPLICIT_DEF), Reg);
MFI->setFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG);
// Set SGPR_SPILL asm printer flag
MIB->setAsmPrinterFlag(AMDGPU::SGPR_SPILL);
+
if (LIS) {
LIS->InsertMachineInstrInMaps(*MIB);
+ } else if (Indexes) {
+ assert(std::distance(MIS.begin(), InsertBefore) == 1);
+ MachineInstr &Inst = *std::prev(InsertBefore);
+ Indexes->insertMachineInstrInMaps(Inst);
}
}
}
@@ -300,8 +309,13 @@ void SILowerSGPRSpills::extendWWMVirtRegLiveness(MachineFunction &MF,
auto MIB = BuildMI(*RestoreBlock, InsertBefore, DL,
TII->get(TargetOpcode::KILL));
MIB.addReg(Reg);
- if (LIS)
+
+ if (LIS) {
LIS->InsertMachineInstrInMaps(*MIB);
+ } else if (Indexes) {
+ MachineInstr &Inst = *std::prev(InsertBefore);
+ Indexes->insertMachineInstrInMaps(Inst);
+ }
}
}
}
@@ -392,7 +406,7 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
}
if (SpilledToVirtVGPRLanes) {
- extendWWMVirtRegLiveness(MF, LIS);
+ extendWWMVirtRegLiveness(MF, Indexes, LIS);
if (LIS) {
// Compute the LiveInterval for the newly created virtual registers.
for (auto Reg : FuncInfo->getSGPRSpillVGPRs())
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 4b5f9bdd82b8d..e5829e8193218 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -1775,8 +1775,10 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, int Index,
if (SpillToVGPR) {
- assert(SB.NumSubRegs == VGPRSpills.size() &&
- "Num of VGPR lanes should be equal to num of SGPRs spilled");
+ assert(SB.NumSubRegs <= VGPRSpills.size() &&
+ "Num of VGPR lanes should be greater or equal to num of SGPRs "
+ "spilled, as Stack Slot Coloring pass assigns different SGPR spills "
+ "into same stack slots");
for (unsigned i = 0, e = SB.NumSubRegs; i < e; ++i) {
Register SubReg =
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index 0ff5dd3680dfa..882eab9ba761f 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -362,10 +362,12 @@
; GCN-O1-NEXT: Machine Optimization Remark Emitter
; GCN-O1-NEXT: Greedy Register Allocator
; GCN-O1-NEXT: Virtual Register Rewriter
+; GCN-O1-NEXT: Stack Slot Coloring
; GCN-O1-NEXT: SI lower SGPR spill instructions
; GCN-O1-NEXT: Virtual Register Map
; GCN-O1-NEXT: Live Register Matrix
; GCN-O1-NEXT: SI Pre-allocate WWM Registers
+; GCN-O1-NEXT: Live Stack Slot Analysis
; GCN-O1-NEXT: Greedy Register Allocator
; GCN-O1-NEXT: SI Lower WWM Copies
; GCN-O1-NEXT: GCN NSA Reassign
@@ -665,10 +667,12 @@
; GCN-O1-OPTS-NEXT: Machine Optimization Remark Emitter
; GCN-O1-OPTS-NEXT: Greedy Register Allocator
; GCN-O1-OPTS-NEXT: Virtual Register Rewriter
+; GCN-O1-OPTS-NEXT: Stack Slot Coloring
; GCN-O1-OPTS-NEXT: SI lower SGPR spill instructions
; GCN-O1-OPTS-NEXT: Virtual Register Map
; GCN-O1-OPTS-NEXT: Live Register Matrix
; GCN-O1-OPTS-NEXT: SI Pre-allocate WWM Registers
+; GCN-O1-OPTS-NEXT: Live Stack Slot Analysis
; GCN-O1-OPTS-NEXT: Greedy Register Allocator
; GCN-O1-OPTS-NEXT: SI Lower WWM Copies
; GCN-O1-OPTS-NEXT: GCN NSA Reassign
@@ -974,10 +978,12 @@
; GCN-O2-NEXT: Machine Optimization Remark Emitter
; GCN-O2-NEXT: Greedy Register Allocator
; GCN-O2-NEXT: Virtual Register Rewriter
+; GCN-O2-NEXT: Stack Slot Coloring
; GCN-O2-NEXT: SI lower SGPR spill instructions
; GCN-O2-NEXT: Virtual Register Map
; GCN-O2-NEXT: Live Register Matrix
; GCN-O2-NEXT: SI Pre-allocate WWM Registers
+; GCN-O2-NEXT: Live Stack Slot Analysis
; GCN-O2-NEXT: Greedy Register Allocator
; GCN-O2-NEXT: SI Lower WWM Copies
; GCN-O2-NEXT: GCN NSA Reassign
@@ -1295,10 +1301,12 @@
; GCN-O3-NEXT: Machine Optimization Remark Emitter
; GCN-O3-NEXT: Greedy Register Allocator
; GCN-O3-NEXT: Virtual Register Rewriter
+; GCN-O3-NEXT: Stack Slot Coloring
; GCN-O3-NEXT: SI lower SGPR spill instructions
; GCN-O3-NEXT: Virtual Register Map
; GCN-O3-NEXT: Live Register Matrix
; GCN-O3-NEXT: SI Pre-allocate WWM Registers
+; GCN-O3-NEXT: Live Stack Slot Analysis
; GCN-O3-NEXT: Greedy Register Allocator
; GCN-O3-NEXT: SI Lower WWM Copies
; GCN-O3-NEXT: GCN NSA Reassign
diff --git a/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll b/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll
index fbe34a3a3970b..25e9e09748c81 100644
--- a/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll
+++ b/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll
@@ -221,15 +221,15 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 {
; GFX906-NEXT: ; def s29
; GFX906-NEXT: ;;#ASMEND
; GFX906-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX906-NEXT: v_writelane_b32 v40, s21, 24
-; GFX906-NEXT: v_writelane_b32 v40, s22, 25
-; GFX906-NEXT: v_writelane_b32 v40, s23, 26
-; GFX906-NEXT: v_writelane_b32 v40, s24, 27
-; GFX906-NEXT: v_writelane_b32 v40, s25, 28
-; GFX906-NEXT: v_writelane_b32 v40, s26, 29
-; GFX906-NEXT: v_writelane_b32 v40, s27, 30
-; GFX906-NEXT: v_writelane_b32 v40, s28, 31
-; GFX906-NEXT: v_writelane_b32 v40, s29, 32
+; GFX906-NEXT: v_writelane_b32 v40, s21, 12
+; GFX906-NEXT: v_writelane_b32 v40, s22, 13
+; GFX906-NEXT: v_writelane_b32 v40, s23, 14
+; GFX906-NEXT: v_writelane_b32 v40, s24, 15
+; GFX906-NEXT: v_writelane_b32 v40, s25, 16
+; GFX906-NEXT: v_writelane_b32 v40, s26, 17
+; GFX906-NEXT: v_writelane_b32 v40, s27, 18
+; GFX906-NEXT: v_writelane_b32 v40, s28, 19
+; GFX906-NEXT: v_writelane_b32 v40, s29, 20
; GFX906-NEXT: v_readlane_b32 s4, v40, 10
; GFX906-NEXT: v_readlane_b32 s6, v40, 0
; GFX906-NEXT: v_readlane_b32 s8, v40, 8
@@ -249,39 +249,39 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 {
; GFX906-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX906-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX906-NEXT: s_mov_b64 exec, s[34:35]
-; GFX906-NEXT: v_readlane_b32 s21, v40, 24
+; GFX906-NEXT: v_readlane_b32 s21, v40, 12
; GFX906-NEXT: ;;#ASMSTART
; GFX906-NEXT: ; use s21
; GFX906-NEXT: ;;#ASMEND
-; GFX906-NEXT: v_readlane_b32 s22, v40, 25
+; GFX906-NEXT: v_readlane_b32 s22, v40, 13
; GFX906-NEXT: ;;#ASMSTART
; GFX906-NEXT: ; use s22
; GFX906-NEXT: ;;#ASMEND
-; GFX906-NEXT: v_readlane_b32 s23, v40, 26
+; GFX906-NEXT: v_readlane_b32 s23, v40, 14
; GFX906-NEXT: ;;#ASMSTART
; GFX906-NEXT: ; use s23
; GFX906-NEXT: ;;#ASMEND
-; GFX906-NEXT: v_readlane_b32 s24, v40, 27
+; GFX906-NEXT: v_readlane_b32 s24, v40, 15
; GFX906-NEXT: ;;#ASMSTART
; GFX906-NEXT: ; use s24
; GFX906-NEXT: ;;#ASMEND
-; GFX906-NEXT: v_readlane_b32 s25, v40, 28
+; GFX906-NEXT: v_readlane_b32 s25, v40, 16
; GFX906-NEXT: ;;#ASMSTART
; GFX906-NEXT: ; use s25
; GFX906-NEXT: ;;#ASMEND
-; GFX906-NEXT: v_readlane_b32 s26, v40, 29
+; GFX906-NEXT: v_readlane_b32 s26, v40, 17
; GFX906-NEXT: ;;#ASMSTART
; GFX906-NEXT: ; use s26
; GFX906-NEXT: ;;#ASMEND
-; GFX906-NEXT: v_readlane_b32 s27, v40, 30
+; GFX906-NEXT: v_readlane_b32 s27, v40, 18
; GFX906-NEXT: ;;#ASMSTART
; GFX906-NEXT: ; use s27
; GFX906-NEXT: ;;#ASMEND
-; GFX906-NEXT: v_readlane_b32 s28, v40, 31
+; GFX906-NEXT: v_readlane_b32 s28, v40, 19
; GFX906-NEXT: ;;#ASMSTART
; GFX906-NEXT: ; use s28
; GFX906-NEXT: ;;#ASMEND
-; GFX906-NEXT: v_readlane_b32 s29, v40, 32
+; GFX906-NEXT: v_readlane_b32 s29, v40, 20
; GFX906-NEXT: ;;#ASMSTART
; GFX906-NEXT: ; use s29
; GFX906-NEXT: ;;#ASMEND
@@ -602,15 +602,15 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 {
; GFX908-NEXT: ; def s29
; GFX908-NEXT: ;;#ASMEND
; GFX908-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; GFX908-NEXT: v_writelane_b32 v40, s21, 24
-; GFX908-NEXT: v_writelane_b32 v40, s22, 25
-; GFX908-NEXT: v_writelane_b32 v40, s23, 26
-; GFX908-NEXT: v_writelane_b32 v40, s24, 27
-; GFX908-NEXT: v_writelane_b32 v40, s25, 28
-; GFX908-NEXT: v_writelane_b32 v40, s26, 29
-; GFX908-NEXT: v_writelane_b32 v40, s27, 30
-; GFX908-NEXT: v_writelane_b32 v40, s28, 31
-; GFX908-NEXT: v_writelane_b32 v40, s29, 32
+; GFX908-NEXT: v_writelane_b32 v40, s21, 12
+; GFX908-NEXT: v_writelane_b32 v40, s22, 13
+; GFX908-NEXT: v_writelane_b32 v40, s23, 14
+; GFX908-NEXT: v_writelane_b32 v40, s24, 15
+; GFX908-NEXT: v_writelane_b32 v40, s25, 16
+; GFX908-NEXT: v_writelane_b32 v40, s26, 17
+; GFX908-NEXT: v_writelane_b32 v40, s27, 18
+; GFX908-NEXT: v_writelane_b32 v40, s28, 19
+; GFX908-NEXT: v_writelane_b32 v40, s29, 20
; GFX908-NEXT: v_readlane_b32 s4, v40, 10
; GFX908-NEXT: v_readlane_b32 s6, v40, 0
; GFX908-NEXT: v_readlane_b32 s8, v40, 8
@@ -630,39 +630,39 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 {
; GFX908-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX908-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX908-NEXT: s_mov_b64 exec, s[34:35]
-; GFX908-NEXT: v_readlane_b32 s21, v40, 24
+; GFX908-NEXT: v_readlane_b32 s21, v40, 12
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ; use s21
; GFX908-NEXT: ;;#ASMEND
-; GFX908-NEXT: v_readlane_b32 s22, v40, 25
+; GFX908-NEXT: v_readlane_b32 s22, v40, 13
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ; use s22
; GFX908-NEXT: ;;#ASMEND
-; GFX908-NEXT: v_readlane_b32 s23, v40, 26
+; GFX908-NEXT: v_readlane_b32 s23, v40, 14
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ; use s23
; GFX908-NEXT: ;;#ASMEND
-; GFX908-NEXT: v_readlane_b32 s24, v40, 27
+; GFX908-NEXT: v_readlane_b32 s24, v40, 15
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ; use s24
; GFX908-NEXT: ;;#ASMEND
-; GFX908-NEXT: v_readlane_b32 s25, v40, 28
+; GFX908-NEXT: v_readlane_b32 s25, v40, 16
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ; use s25
; GFX908-NEXT: ;;#ASMEND
-; GFX908-NEXT: v_readlane_b32 s26, v40, 29
+; GFX908-NEXT: v_readlane_b32 s26, v40, 17
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ; use s26
; GFX908-NEXT: ;;#ASMEND
-; GFX908-NEXT: v_readlane_b32 s27, v40, 30
+; GFX908-NEXT: v_readlane_b32 s27, v40, 18
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ; use s27
; GFX908-NEXT: ;;#ASMEND
-; GFX908-NEXT: v_readlane_b32 s28, v40, 31
+; GFX908-NEXT: v_readlane_b32 s28, v40, 19
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ; use s28
; GFX908-NEXT: ;;#ASMEND
-; GFX908-NEXT: v_readlane_b32 s29, v40, 32
+; GFX908-NEXT: v_readlane_b32 s29, v40, 20
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ; use s29
; GFX908-NEXT: ;;#ASMEND
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll b/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll
index 17a19116735e4..04a9f3cb2d332 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll
@@ -17,10 +17,12 @@
; DEFAULT: Greedy Register Allocator
; DEFAULT-NEXT: Virtual Register Rewriter
+; DEFAULT-NEXT: Stack Slot Coloring
; DEFAULT-NEXT: SI lower SGPR spill instructions
; DEFAULT-NEXT: Virtual Register Map
; DEFAULT-NEXT: Live Register Matrix
; DEFAULT-NEXT: SI Pre-allocate WWM Registers
+; DEFAULT-NEXT: Live Stack Slot Analysis
; DEFAULT-NEXT: Greedy Register Allocator
; DEFAULT-NEXT: SI Lower WWM Copies
; DEFAULT-NEXT: GCN NSA Reassign
@@ -50,10 +52,12 @@
; BASIC-DEFAULT-NEXT: Live Register Matrix
; BASIC-DEFAULT-NEXT: Basic Register Allocator
; BASIC-DEFAULT-NEXT: Virtual Register Rewriter
+; BASIC-DEFAULT-NEXT: Stack Slot Coloring
; BASIC-DEFAULT-NEXT: SI lower SGPR spill instructions
; BASIC-DEFAULT-NEXT: Virtual Register Map
; BASIC-DEFAULT-NEXT: Live Register Matrix
; BASIC-DEFAULT-NEXT: SI Pre-allocate WWM Registers
+; BASIC-DEFAULT-NEXT: Live Stack Slot Analysis
; BASIC-DEFAULT-NEXT: Bundle Machine CFG Edges
; BASIC-DEFAULT-NEXT: Spill Code Placement Analysis
; BASIC-DEFAULT-NEXT: Lazy Machine Block Frequency Analysis
@@ -69,10 +73,12 @@
; DEFAULT-BASIC: Greedy Register Allocator
; DEFAULT-BASIC-NEXT: Virtual Register Rewriter
+; DEFAULT-BASIC-NEXT: Stack Slot Coloring
; DEFAULT-BASIC-NEXT: SI lower SGPR spill instructions
; DEFAULT-BASIC-NEXT: Virtual Register Map
; DEFAULT-BASIC-NEXT: Live Register Matrix
; DEFAULT-BASIC-NEXT: SI Pre-allocate WWM Registers
+; DEFAULT-BASIC-NEXT: Live Stack Slot Analysis
; DEFAULT-BASIC-NEXT: Basic Register Allocator
; DEFAULT-BASIC-NEXT: SI Lower WWM Copies
; DEFAULT-BASIC-NEXT: GCN NSA Reassign
@@ -90,10 +96,12 @@
; BASIC-BASIC-NEXT: Live Register Matrix
; BASIC-BASIC-NEXT: Basic Register Allocator
; BASIC-BASIC-NEXT: Virtual Register Rewriter
+; BASIC-BASIC-NEXT: Stack Slot Coloring
; BASIC-BASIC-NEXT: SI lower SGPR spill instructions
; BASIC-BASIC-NEXT: Virtual Register Map
; BASIC-BASIC-NEXT: Live Register Matrix
; BASIC-BASIC-NEXT: SI Pre-allocate WWM Registers
+; BASIC-BASIC-NEXT: Live Stack Slot Analysis
; BASIC-BASIC-NEXT: Basic Register Allocator
; BASIC-BASIC-NEXT: SI Lower WWM Copies
; BASIC-BASIC-NEXT: GCN NSA Reassign
diff --git a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
index bea2e6d4b45a3..e1bd1523d78a4 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
@@ -10098,7 +10098,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX6-NEXT: v_lshlrev_b32_e32 v5, 8, v0
; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:240
-; GFX6-NEXT: s_mov_b32 s2, 0x84400
+; GFX6-NEXT: s_mov_b32 s2, 0x86a00
; GFX6-NEXT: s_mov_b64 s[8:9], exec
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
@@ -10108,7 +10108,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:224
-; GFX6-NEXT: s_mov_b32 s2, 0x84000
+; GFX6-NEXT: s_mov_b32 s2, 0x86600
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt vmcnt(0)
@@ -10117,7 +10117,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:208
-; GFX6-NEXT: s_mov_b32 s2, 0x83c00
+; GFX6-NEXT: s_mov_b32 s2, 0x86200
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt vmcnt(0)
@@ -10126,7 +10126,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:192
-; GFX6-NEXT: s_mov_b32 s2, 0x83800
+; GFX6-NEXT: s_mov_b32 s2, 0x85e00
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt vmcnt(0)
@@ -10135,7 +10135,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:176
-; GFX6-NEXT: s_mov_b32 s2, 0x83400
+; GFX6-NEXT: s_mov_b32 s2, 0x85a00
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt vmcnt(0)
@@ -10144,7 +10144,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:160
-; GFX6-NEXT: s_mov_b32 s2, 0x83000
+; GFX6-NEXT: s_mov_b32 s2, 0x85600
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt vmcnt(0)
@@ -10153,7 +10153,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:144
-; GFX6-NEXT: s_mov_b32 s2, 0x82c00
+; GFX6-NEXT: s_mov_b32 s2, 0x85200
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt vmcnt(0)
@@ -10162,7 +10162,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:128
-; GFX6-NEXT: s_mov_b32 s2, 0x82800
+; GFX6-NEXT: s_mov_b32 s2, 0x84e00
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt vmcnt(0)
@@ -10171,7 +10171,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:112
-; GFX6-NEXT: s_mov_b32 s2, 0x82400
+; GFX6-NEXT: s_mov_b32 s2, 0x84a00
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt vmcnt(0)
@@ -10180,7 +10180,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:96
-; GFX6-NEXT: s_mov_b32 s2, 0x82000
+; GFX6-NEXT: s_mov_b32 s2, 0x84600
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt vmcnt(0)
@@ -10189,7 +10189,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:80
-; GFX6-NEXT: s_mov_b32 s2, 0x81c00
+; GFX6-NEXT: s_mov_b32 s2, 0x84200
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt vmcnt(0)
@@ -10198,7 +10198,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:64
-; GFX6-NEXT: s_mov_b32 s2, 0x81400
+; GFX6-NEXT: s_mov_b32 s2, 0x83a00
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt vmcnt(0)
@@ -10208,7 +10208,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64
; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[5:6], s[4:7], 0 addr64 offset:16
-; GFX6-NEXT: s_mov_b32 s2, 0x80c00
+; GFX6-NEXT: s_mov_b32 s2, 0x83200
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dword v7, off, s[40:43], s2 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt vmcnt(0)
@@ -10217,7 +10217,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: buffer_store_dword v10, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[5:6], s[4:7], 0 addr64 offset:32
-; GFX6-NEXT: s_mov_b32 s2, 0x81000
+; GFX6-NEXT: s_mov_b32 s2, 0x83600
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dword v7, off, s[40:43], s2 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt vmcnt(0)
@@ -10239,7 +10239,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: s_mov_b64 exec, s[8:9]
; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[5:6], s[4:7], 0 addr64 offset:48
-; GFX6-NEXT: s_mov_b32 s0, 0x81800
+; GFX6-NEXT: s_mov_b32 s0, 0x83e00
; GFX6-NEXT: v_lshlrev_b32_e32 v4, 13, v0
; GFX6-NEXT: v_add_i32_e32 v4, vcc, 16, v4
; GFX6-NEXT: s_waitcnt vmcnt(0)
@@ -10266,7 +10266,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: v_writelane_b32 v4, s9, 5
; GFX6-NEXT: v_writelane_b32 v4, s10, 6
; GFX6-NEXT: v_writelane_b32 v4, s11, 7
-; GFX6-NEXT: s_mov_b32 s2, 0x84800
+; GFX6-NEXT: s_mov_b32 s2, 0x80c00
; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s2 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0
@@ -10307,7 +10307,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: v_writelane_b32 v4, s13, 5
; GFX6-NEXT: v_writelane_b32 v4, s14, 6
; GFX6-NEXT: v_writelane_b32 v4, s15, 7
-; GFX6-NEXT: s_mov_b32 s34, 0x85000
+; GFX6-NEXT: s_mov_b32 s34, 0x81400
; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s34 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0
@@ -10316,7 +10316,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: s_mov_b64 s[6:7], exec
; GFX6-NEXT: s_mov_b64 exec, 0xff
; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0
-; GFX6-NEXT: s_mov_b32 s34, 0x84800
+; GFX6-NEXT: s_mov_b32 s34, 0x80c00
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s34 ; 4-byte Folded Reload
; GFX6-NEXT: s_waitcnt vmcnt(0)
@@ -10343,7 +10343,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: v_writelane_b32 v4, s21, 5
; GFX6-NEXT: v_writelane_b32 v4, s22, 6
; GFX6-NEXT: v_writelane_b32 v4, s23, 7
-; GFX6-NEXT: s_mov_b32 s34, 0x85800
+; GFX6-NEXT: s_mov_b32 s34, 0x81c00
; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s34 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0
@@ -10352,7 +10352,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: s_mov_b64 s[6:7], exec
; GFX6-NEXT: s_mov_b64 exec, 0xff
; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0
-; GFX6-NEXT: s_mov_b32 s34, 0x85000
+; GFX6-NEXT: s_mov_b32 s34, 0x81400
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s34 ; 4-byte Folded Reload
; GFX6-NEXT: s_waitcnt vmcnt(0)
@@ -10379,7 +10379,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: v_writelane_b32 v4, s29, 5
; GFX6-NEXT: v_writelane_b32 v4, s30, 6
; GFX6-NEXT: v_writelane_b32 v4, s31, 7
-; GFX6-NEXT: s_mov_b32 s34, 0x86000
+; GFX6-NEXT: s_mov_b32 s34, 0x82400
; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s34 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0
@@ -10388,7 +10388,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: s_mov_b64 s[6:7], exec
; GFX6-NEXT: s_mov_b64 exec, 0xff
; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0
-; GFX6-NEXT: s_mov_b32 s34, 0x85800
+; GFX6-NEXT: s_mov_b32 s34, 0x81c00
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s34 ; 4-byte Folded Reload
; GFX6-NEXT: s_waitcnt vmcnt(0)
@@ -10411,7 +10411,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: v_writelane_b32 v4, s1, 1
; GFX6-NEXT: v_writelane_b32 v4, s2, 2
; GFX6-NEXT: v_writelane_b32 v4, s3, 3
-; GFX6-NEXT: s_mov_b32 s34, 0x86800
+; GFX6-NEXT: s_mov_b32 s34, 0x82c00
; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s34 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0
@@ -10423,7 +10423,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: v_writelane_b32 v4, s4, 0
; GFX6-NEXT: v_writelane_b32 v4, s5, 1
-; GFX6-NEXT: s_mov_b32 s2, 0x86c00
+; GFX6-NEXT: s_mov_b32 s2, 0x83000
; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s2 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0
@@ -10432,7 +10432,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: s_mov_b64 s[34:35], exec
; GFX6-NEXT: s_mov_b64 exec, 0xff
; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0
-; GFX6-NEXT: s_mov_b32 s36, 0x86000
+; GFX6-NEXT: s_mov_b32 s36, 0x82400
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s36 ; 4-byte Folded Reload
; GFX6-NEXT: s_waitcnt vmcnt(0)
@@ -10450,7 +10450,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: s_mov_b64 s[34:35], exec
; GFX6-NEXT: s_mov_b64 exec, 15
; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0
-; GFX6-NEXT: s_mov_b32 s44, 0x86800
+; GFX6-NEXT: s_mov_b32 s44, 0x82c00
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s44 ; 4-byte Folded Reload
; GFX6-NEXT: s_waitcnt vmcnt(0)
@@ -10464,7 +10464,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: s_mov_b64 s[44:45], exec
; GFX6-NEXT: s_mov_b64 exec, 3
; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0
-; GFX6-NEXT: v_mov_b32_e32 v7, 0x21b0
+; GFX6-NEXT: v_mov_b32_e32 v7, 0x20c0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dword v4, v7, s[40:43], 0 offen ; 4-byte Folded Reload
; GFX6-NEXT: s_waitcnt vmcnt(0)
@@ -10521,13 +10521,13 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: s_mov_b64 exec, s[4:5]
-; GFX6-NEXT: s_mov_b32 s0, 0x84400
+; GFX6-NEXT: s_mov_b32 s0, 0x86a00
; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
; GFX6-NEXT: s_mov_b64 s[38:39], s[2:3]
-; GFX6-NEXT: s_mov_b32 s0, 0x84000
+; GFX6-NEXT: s_mov_b32 s0, 0x86600
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:240
; GFX6-NEXT: s_waitcnt expcnt(0)
@@ -10535,7 +10535,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT: s_mov_b32 s0, 0x83c00
+; GFX6-NEXT: s_mov_b32 s0, 0x86200
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:224
; GFX6-NEXT: s_waitcnt expcnt(0)
@@ -10543,7 +10543,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT: s_mov_b32 s0, 0x83800
+; GFX6-NEXT: s_mov_b32 s0, 0x85e00
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:208
; GFX6-NEXT: s_waitcnt expcnt(0)
@@ -10551,7 +10551,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT: s_mov_b32 s0, 0x83400
+; GFX6-NEXT: s_mov_b32 s0, 0x85a00
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:192
; GFX6-NEXT: s_waitcnt expcnt(0)
@@ -10559,7 +10559,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT: s_mov_b32 s0, 0x83000
+; GFX6-NEXT: s_mov_b32 s0, 0x85600
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:176
; GFX6-NEXT: s_waitcnt expcnt(0)
@@ -10567,7 +10567,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT: s_mov_b32 s0, 0x82c00
+; GFX6-NEXT: s_mov_b32 s0, 0x85200
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:160
; GFX6-NEXT: s_waitcnt expcnt(0)
@@ -10575,7 +10575,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT: s_mov_b32 s0, 0x82800
+; GFX6-NEXT: s_mov_b32 s0, 0x84e00
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:144
; GFX6-NEXT: s_waitcnt expcnt(0)
@@ -10583,7 +10583,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT: s_mov_b32 s0, 0x82400
+; GFX6-NEXT: s_mov_b32 s0, 0x84a00
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:128
; GFX6-NEXT: s_waitcnt expcnt(0)
@@ -10591,7 +10591,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT: s_mov_b32 s0, 0x82000
+; GFX6-NEXT: s_mov_b32 s0, 0x84600
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:112
; GFX6-NEXT: s_waitcnt expcnt(0)
@@ -10599,7 +10599,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT: s_mov_b32 s0, 0x81c00
+; GFX6-NEXT: s_mov_b32 s0, 0x84200
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:96
; GFX6-NEXT: s_waitcnt expcnt(0)
@@ -10607,7 +10607,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT: s_mov_b32 s0, 0x81400
+; GFX6-NEXT: s_mov_b32 s0, 0x83a00
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:80
; GFX6-NEXT: s_waitcnt expcnt(0)
@@ -10615,7 +10615,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT: s_mov_b32 s0, 0x81800
+; GFX6-NEXT: s_mov_b32 s0, 0x83e00
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:64
; GFX6-NEXT: s_waitcnt expcnt(0)
@@ -10623,7 +10623,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT: s_mov_b32 s0, 0x81000
+; GFX6-NEXT: s_mov_b32 s0, 0x83600
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:48
; GFX6-NEXT: s_waitcnt expcnt(0)
@@ -10631,7 +10631,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT: s_mov_b32 s0, 0x80c00
+; GFX6-NEXT: s_mov_b32 s0, 0x83200
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:32
; GFX6-NEXT: s_waitcnt expcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/stack-slot-color-after-sgpr-alloc-equal-size-stack-objects.mir b/llvm/test/CodeGen/AMDGPU/stack-slot-color-after-sgpr-alloc-equal-size-stack-objects.mir
new file mode 100644
index 0000000000000..e8651cd6944d1
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/stack-slot-color-after-sgpr-alloc-equal-size-stack-objects.mir
@@ -0,0 +1,127 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -stress-regalloc=3 -start-before=greedy -stop-before=prologepilog -o - %s | FileCheck -check-prefix=SHARE %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -stress-regalloc=3 -start-before=greedy -stop-before=prologepilog -no-stack-slot-sharing -o - %s | FileCheck -check-prefix=NOSHARE %s
+
+--- |
+
+ define void @stack-slot-color-after-sgpr-alloc(ptr addrspace(1) nocapture readnone %arg, ptr addrspace(1) noalias %arg1) {
+ bb:
+ %tmp = load i32, ptr addrspace(1) null, align 4
+ call void @func(i32 undef)
+ call void @func(i32 %tmp)
+ unreachable
+ }
+
+ declare void @func(i32)
+...
+
+
+---
+name: stack-slot-color-after-sgpr-alloc
+tracksRegLiveness: true
+frameInfo:
+ adjustsStack: true
+ hasCalls: true
+machineFunctionInfo:
+ scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
+ frameOffsetReg: $sgpr32
+ stackPtrOffsetReg: $sgpr32
+body: |
+ bb.0:
+ ; SHARE-LABEL: name: stack-slot-color-after-sgpr-alloc
+ ; SHARE: liveins: $sgpr30, $sgpr31, $vgpr63
+ ; SHARE-NEXT: {{ $}}
+ ; SHARE-NEXT: renamable $vgpr2 = IMPLICIT_DEF
+ ; SHARE-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr30, 0, $vgpr63
+ ; SHARE-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr31, 1, $vgpr63
+ ; SHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr32, 0, killed $vgpr2
+ ; SHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr0, 1, killed $vgpr2, implicit-def $sgpr0_sgpr1, implicit $sgpr0_sgpr1
+ ; SHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr1, 2, killed $vgpr2, implicit $sgpr0_sgpr1
+ ; SHARE-NEXT: renamable $vgpr0_vgpr1 = IMPLICIT_DEF
+ ; SHARE-NEXT: renamable $vgpr0 = FLAT_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+ ; SHARE-NEXT: SI_SPILL_V32_SAVE killed $vgpr0, %stack.9, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
+ ; SHARE-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @func + 4, target-flags(amdgpu-rel32-hi) @func + 4, implicit-def dead $scc
+ ; SHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr4, 3, killed $vgpr2, implicit-def $sgpr4_sgpr5, implicit $sgpr4_sgpr5
+ ; SHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR killed $sgpr5, 4, killed $vgpr2, implicit killed $sgpr4_sgpr5
+ ; SHARE-NEXT: SI_SPILL_WWM_V32_SAVE $vgpr2, %stack.8, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5)
+ ; SHARE-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+ ; SHARE-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 3, implicit-def $sgpr4_sgpr5
+ ; SHARE-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR killed $vgpr2, 4
+ ; SHARE-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit undef $vgpr0
+ ; SHARE-NEXT: renamable $vgpr1 = SI_SPILL_WWM_V32_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5)
+ ; SHARE-NEXT: $sgpr32 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 0
+ ; SHARE-NEXT: $sgpr0 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 1, implicit-def $sgpr0_sgpr1
+ ; SHARE-NEXT: $sgpr1 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 2
+ ; SHARE-NEXT: renamable $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr2, 1, killed $vgpr1, implicit-def $sgpr2_sgpr3, implicit $sgpr2_sgpr3
+ ; SHARE-NEXT: renamable $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr3, 2, killed $vgpr1, implicit $sgpr2_sgpr3
+ ; SHARE-NEXT: SI_SPILL_WWM_V32_SAVE $vgpr1, %stack.8, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5)
+ ; SHARE-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+ ; SHARE-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+ ; SHARE-NEXT: $vgpr0 = SI_SPILL_V32_RESTORE %stack.9, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
+ ; SHARE-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 3, implicit-def $sgpr4_sgpr5
+ ; SHARE-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR killed $vgpr1, 4
+ ; SHARE-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit $vgpr0
+ ; SHARE-NEXT: renamable $vgpr0 = SI_SPILL_WWM_V32_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5)
+ ; SHARE-NEXT: $sgpr32 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0
+ ; SHARE-NEXT: $sgpr2 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 1, implicit-def $sgpr2_sgpr3
+ ; SHARE-NEXT: $sgpr3 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 2
+ ; SHARE-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+ ;
+ ; NOSHARE-LABEL: name: stack-slot-color-after-sgpr-alloc
+ ; NOSHARE: liveins: $sgpr30, $sgpr31, $vgpr63
+ ; NOSHARE-NEXT: {{ $}}
+ ; NOSHARE-NEXT: renamable $vgpr2 = IMPLICIT_DEF
+ ; NOSHARE-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr30, 0, $vgpr63
+ ; NOSHARE-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr31, 1, $vgpr63
+ ; NOSHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr32, 0, killed $vgpr2
+ ; NOSHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr0, 1, killed $vgpr2, implicit-def $sgpr0_sgpr1, implicit $sgpr0_sgpr1
+ ; NOSHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr1, 2, killed $vgpr2, implicit $sgpr0_sgpr1
+ ; NOSHARE-NEXT: renamable $vgpr0_vgpr1 = IMPLICIT_DEF
+ ; NOSHARE-NEXT: renamable $vgpr0 = FLAT_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+ ; NOSHARE-NEXT: SI_SPILL_V32_SAVE killed $vgpr0, %stack.9, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
+ ; NOSHARE-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @func + 4, target-flags(amdgpu-rel32-hi) @func + 4, implicit-def dead $scc
+ ; NOSHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr4, 3, killed $vgpr2, implicit-def $sgpr4_sgpr5, implicit $sgpr4_sgpr5
+ ; NOSHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR killed $sgpr5, 4, killed $vgpr2, implicit killed $sgpr4_sgpr5
+ ; NOSHARE-NEXT: SI_SPILL_WWM_V32_SAVE $vgpr2, %stack.8, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5)
+ ; NOSHARE-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+ ; NOSHARE-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 3, implicit-def $sgpr4_sgpr5
+ ; NOSHARE-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR killed $vgpr2, 4
+ ; NOSHARE-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit undef $vgpr0
+ ; NOSHARE-NEXT: renamable $vgpr1 = SI_SPILL_WWM_V32_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5)
+ ; NOSHARE-NEXT: $sgpr32 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 0
+ ; NOSHARE-NEXT: renamable $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr32, 5, killed $vgpr1
+ ; NOSHARE-NEXT: $sgpr0 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 1, implicit-def $sgpr0_sgpr1
+ ; NOSHARE-NEXT: $sgpr1 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 2
+ ; NOSHARE-NEXT: renamable $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr2, 6, killed $vgpr1, implicit-def $sgpr2_sgpr3, implicit $sgpr2_sgpr3
+ ; NOSHARE-NEXT: renamable $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr3, 7, killed $vgpr1, implicit $sgpr2_sgpr3
+ ; NOSHARE-NEXT: SI_SPILL_WWM_V32_SAVE $vgpr1, %stack.8, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5)
+ ; NOSHARE-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+ ; NOSHARE-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+ ; NOSHARE-NEXT: $vgpr0 = SI_SPILL_V32_RESTORE %stack.9, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
+ ; NOSHARE-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 3, implicit-def $sgpr4_sgpr5
+ ; NOSHARE-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR killed $vgpr1, 4
+ ; NOSHARE-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit $vgpr0
+ ; NOSHARE-NEXT: renamable $vgpr0 = SI_SPILL_WWM_V32_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5)
+ ; NOSHARE-NEXT: $sgpr32 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 5
+ ; NOSHARE-NEXT: $sgpr2 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 6, implicit-def $sgpr2_sgpr3
+ ; NOSHARE-NEXT: $sgpr3 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 7
+ ; NOSHARE-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+ %0:sreg_32_xm0 = COPY $sgpr32
+ %5:sreg_64 = COPY $sgpr0_sgpr1
+ %1:vreg_64 = IMPLICIT_DEF
+ %2:vgpr_32 = FLAT_LOAD_DWORD %1, 0, 0, implicit $exec, implicit $flat_scr
+ %3:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @func + 4, target-flags(amdgpu-rel32-hi) @func + 4, implicit-def dead $scc
+ ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+ dead $sgpr30_sgpr31 = SI_CALL %3, @func, csr_amdgpu, implicit undef $vgpr0
+ $sgpr32 = COPY %0
+ %4:sreg_32_xm0 = COPY $sgpr32
+ $sgpr0_sgpr1 = COPY %5
+ %6:sreg_64 = COPY $sgpr2_sgpr3
+ ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+ ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+ $vgpr0 = COPY %2
+ dead $sgpr30_sgpr31 = SI_CALL %3, @func, csr_amdgpu, implicit killed $vgpr0
+ $sgpr32 = COPY %4
+ $sgpr2_sgpr3 = COPY %6
+ ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+...
diff --git a/llvm/test/CodeGen/AMDGPU/stack-slot-color-after-sgpr-alloc-unequal-size-stack-objects-2.mir b/llvm/test/CodeGen/AMDGPU/stack-slot-color-after-sgpr-alloc-unequal-size-stack-objects-2.mir
new file mode 100644
index 0000000000000..f20dee490a83c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/stack-slot-color-after-sgpr-alloc-unequal-size-stack-objects-2.mir
@@ -0,0 +1,122 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -stress-regalloc=3 -start-before=greedy -stop-before=prologepilog -o - %s | FileCheck -check-prefix=SHARE %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -stress-regalloc=3 -start-before=greedy -stop-before=prologepilog -no-stack-slot-sharing -o - %s | FileCheck -check-prefix=NOSHARE %s
+
+--- |
+
+ define void @stack-slot-color-after-sgpr-alloc(ptr addrspace(1) nocapture readnone %arg, ptr addrspace(1) noalias %arg1) {
+ bb:
+ %tmp = load i32, ptr addrspace(1) null, align 4
+ call void @func(i32 undef)
+ call void @func(i32 %tmp)
+ unreachable
+ }
+
+ declare void @func(i32)
+...
+
+---
+name: stack-slot-color-after-sgpr-alloc
+tracksRegLiveness: true
+frameInfo:
+ adjustsStack: true
+ hasCalls: true
+machineFunctionInfo:
+ scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
+ frameOffsetReg: $sgpr32
+ stackPtrOffsetReg: $sgpr32
+body: |
+ bb.0:
+ ; SHARE-LABEL: name: stack-slot-color-after-sgpr-alloc
+ ; SHARE: liveins: $sgpr30, $sgpr31, $vgpr63
+ ; SHARE-NEXT: {{ $}}
+ ; SHARE-NEXT: renamable $vgpr2 = IMPLICIT_DEF
+ ; SHARE-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr30, 0, $vgpr63
+ ; SHARE-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr31, 1, $vgpr63
+ ; SHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr32, 0, killed $vgpr2
+ ; SHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr0, 1, killed $vgpr2
+ ; SHARE-NEXT: renamable $vgpr0_vgpr1 = IMPLICIT_DEF
+ ; SHARE-NEXT: renamable $vgpr0 = FLAT_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+ ; SHARE-NEXT: SI_SPILL_V32_SAVE killed $vgpr0, %stack.9, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
+ ; SHARE-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @func + 4, target-flags(amdgpu-rel32-hi) @func + 4, implicit-def dead $scc
+ ; SHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr4, 3, killed $vgpr2, implicit-def $sgpr4_sgpr5, implicit $sgpr4_sgpr5
+ ; SHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR killed $sgpr5, 4, killed $vgpr2, implicit killed $sgpr4_sgpr5
+ ; SHARE-NEXT: SI_SPILL_WWM_V32_SAVE $vgpr2, %stack.8, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5)
+ ; SHARE-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+ ; SHARE-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 3, implicit-def $sgpr4_sgpr5
+ ; SHARE-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR killed $vgpr2, 4
+ ; SHARE-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit undef $vgpr0
+ ; SHARE-NEXT: renamable $vgpr1 = SI_SPILL_WWM_V32_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5)
+ ; SHARE-NEXT: $sgpr32 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 0
+ ; SHARE-NEXT: $sgpr0 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 1
+ ; SHARE-NEXT: renamable $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr2, 1, killed $vgpr1, implicit-def $sgpr2_sgpr3, implicit $sgpr2_sgpr3
+ ; SHARE-NEXT: renamable $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr3, 2, killed $vgpr1, implicit $sgpr2_sgpr3
+ ; SHARE-NEXT: SI_SPILL_WWM_V32_SAVE $vgpr1, %stack.8, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5)
+ ; SHARE-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+ ; SHARE-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+ ; SHARE-NEXT: $vgpr0 = SI_SPILL_V32_RESTORE %stack.9, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
+ ; SHARE-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 3, implicit-def $sgpr4_sgpr5
+ ; SHARE-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR killed $vgpr1, 4
+ ; SHARE-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit $vgpr0
+ ; SHARE-NEXT: renamable $vgpr0 = SI_SPILL_WWM_V32_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5)
+ ; SHARE-NEXT: $sgpr32 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0
+ ; SHARE-NEXT: $sgpr2 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 1, implicit-def $sgpr2_sgpr3
+ ; SHARE-NEXT: $sgpr3 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 2
+ ; SHARE-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+ ;
+ ; NOSHARE-LABEL: name: stack-slot-color-after-sgpr-alloc
+ ; NOSHARE: liveins: $sgpr30, $sgpr31, $vgpr63
+ ; NOSHARE-NEXT: {{ $}}
+ ; NOSHARE-NEXT: renamable $vgpr2 = IMPLICIT_DEF
+ ; NOSHARE-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr30, 0, $vgpr63
+ ; NOSHARE-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr31, 1, $vgpr63
+ ; NOSHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr32, 0, killed $vgpr2
+ ; NOSHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr0, 1, killed $vgpr2
+ ; NOSHARE-NEXT: renamable $vgpr0_vgpr1 = IMPLICIT_DEF
+ ; NOSHARE-NEXT: renamable $vgpr0 = FLAT_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+ ; NOSHARE-NEXT: SI_SPILL_V32_SAVE killed $vgpr0, %stack.9, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
+ ; NOSHARE-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @func + 4, target-flags(amdgpu-rel32-hi) @func + 4, implicit-def dead $scc
+ ; NOSHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr4, 2, killed $vgpr2, implicit-def $sgpr4_sgpr5, implicit $sgpr4_sgpr5
+ ; NOSHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR killed $sgpr5, 3, killed $vgpr2, implicit killed $sgpr4_sgpr5
+ ; NOSHARE-NEXT: SI_SPILL_WWM_V32_SAVE $vgpr2, %stack.8, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5)
+ ; NOSHARE-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+ ; NOSHARE-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 2, implicit-def $sgpr4_sgpr5
+ ; NOSHARE-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR killed $vgpr2, 3
+ ; NOSHARE-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit undef $vgpr0
+ ; NOSHARE-NEXT: renamable $vgpr1 = SI_SPILL_WWM_V32_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5)
+ ; NOSHARE-NEXT: $sgpr32 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 0
+ ; NOSHARE-NEXT: renamable $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr32, 4, killed $vgpr1
+ ; NOSHARE-NEXT: $sgpr0 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 1
+ ; NOSHARE-NEXT: renamable $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr2, 5, killed $vgpr1, implicit-def $sgpr2_sgpr3, implicit $sgpr2_sgpr3
+ ; NOSHARE-NEXT: renamable $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr3, 6, killed $vgpr1, implicit $sgpr2_sgpr3
+ ; NOSHARE-NEXT: SI_SPILL_WWM_V32_SAVE $vgpr1, %stack.8, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5)
+ ; NOSHARE-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+ ; NOSHARE-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+ ; NOSHARE-NEXT: $vgpr0 = SI_SPILL_V32_RESTORE %stack.9, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
+ ; NOSHARE-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 2, implicit-def $sgpr4_sgpr5
+ ; NOSHARE-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR killed $vgpr1, 3
+ ; NOSHARE-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit $vgpr0
+ ; NOSHARE-NEXT: renamable $vgpr0 = SI_SPILL_WWM_V32_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5)
+ ; NOSHARE-NEXT: $sgpr32 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 4
+ ; NOSHARE-NEXT: $sgpr2 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 5, implicit-def $sgpr2_sgpr3
+ ; NOSHARE-NEXT: $sgpr3 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 6
+ ; NOSHARE-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+ %0:sreg_32_xm0 = COPY $sgpr32
+ %5:sreg_32 = COPY $sgpr0
+ %1:vreg_64 = IMPLICIT_DEF
+ %2:vgpr_32 = FLAT_LOAD_DWORD %1, 0, 0, implicit $exec, implicit $flat_scr
+ %3:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @func + 4, target-flags(amdgpu-rel32-hi) @func + 4, implicit-def dead $scc
+ ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+ dead $sgpr30_sgpr31 = SI_CALL %3, @func, csr_amdgpu, implicit undef $vgpr0
+ $sgpr32 = COPY %0
+ %4:sreg_32_xm0 = COPY $sgpr32
+ $sgpr0 = COPY %5
+ %6:sreg_64 = COPY $sgpr2_sgpr3
+ ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+ ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+ $vgpr0 = COPY %2
+ dead $sgpr30_sgpr31 = SI_CALL %3, @func, csr_amdgpu, implicit killed $vgpr0
+ $sgpr32 = COPY %4
+ $sgpr2_sgpr3 = COPY %6
+ ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+...
diff --git a/llvm/test/CodeGen/AMDGPU/stack-slot-color-after-sgpr-alloc-unequal-size-stack-objects.mir b/llvm/test/CodeGen/AMDGPU/stack-slot-color-after-sgpr-alloc-unequal-size-stack-objects.mir
new file mode 100644
index 0000000000000..e2f1d3fd0a0af
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/stack-slot-color-after-sgpr-alloc-unequal-size-stack-objects.mir
@@ -0,0 +1,123 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -stress-regalloc=3 -start-before=greedy -stop-before=prologepilog -o - %s | FileCheck -check-prefix=SHARE %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -stress-regalloc=3 -start-before=greedy -stop-before=prologepilog -no-stack-slot-sharing -o - %s | FileCheck -check-prefix=NOSHARE %s
+
+--- |
+
+ define void @stack-slot-color-after-sgpr-alloc(ptr addrspace(1) nocapture readnone %arg, ptr addrspace(1) noalias %arg1) {
+ bb:
+ %tmp = load i32, ptr addrspace(1) null, align 4
+ call void @func(i32 undef)
+ call void @func(i32 %tmp)
+ unreachable
+ }
+
+ declare void @func(i32)
+...
+
+
+---
+name: stack-slot-color-after-sgpr-alloc
+tracksRegLiveness: true
+frameInfo:
+ adjustsStack: true
+ hasCalls: true
+machineFunctionInfo:
+ scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
+ frameOffsetReg: $sgpr32
+ stackPtrOffsetReg: $sgpr32
+body: |
+ bb.0:
+ ; SHARE-LABEL: name: stack-slot-color-after-sgpr-alloc
+ ; SHARE: liveins: $sgpr30, $sgpr31, $vgpr63
+ ; SHARE-NEXT: {{ $}}
+ ; SHARE-NEXT: renamable $vgpr2 = IMPLICIT_DEF
+ ; SHARE-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr30, 0, $vgpr63
+ ; SHARE-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr31, 1, $vgpr63
+ ; SHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr32, 0, killed $vgpr2
+ ; SHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr2, 1, killed $vgpr2, implicit-def $sgpr2_sgpr3, implicit $sgpr2_sgpr3
+ ; SHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr3, 2, killed $vgpr2, implicit $sgpr2_sgpr3
+ ; SHARE-NEXT: renamable $vgpr0_vgpr1 = IMPLICIT_DEF
+ ; SHARE-NEXT: renamable $vgpr0 = FLAT_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+ ; SHARE-NEXT: SI_SPILL_V32_SAVE killed $vgpr0, %stack.9, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
+ ; SHARE-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @func + 4, target-flags(amdgpu-rel32-hi) @func + 4, implicit-def dead $scc
+ ; SHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr4, 3, killed $vgpr2, implicit-def $sgpr4_sgpr5, implicit $sgpr4_sgpr5
+ ; SHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR killed $sgpr5, 4, killed $vgpr2, implicit killed $sgpr4_sgpr5
+ ; SHARE-NEXT: SI_SPILL_WWM_V32_SAVE $vgpr2, %stack.8, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5)
+ ; SHARE-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+ ; SHARE-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 3, implicit-def $sgpr4_sgpr5
+ ; SHARE-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR killed $vgpr2, 4
+ ; SHARE-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit undef $vgpr0
+ ; SHARE-NEXT: renamable $vgpr1 = SI_SPILL_WWM_V32_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5)
+ ; SHARE-NEXT: $sgpr32 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 0
+ ; SHARE-NEXT: $sgpr2 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 1, implicit-def $sgpr2_sgpr3
+ ; SHARE-NEXT: $sgpr3 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 2
+ ; SHARE-NEXT: renamable $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr2, 1, killed $vgpr1
+ ; SHARE-NEXT: SI_SPILL_WWM_V32_SAVE $vgpr1, %stack.8, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5)
+ ; SHARE-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+ ; SHARE-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+ ; SHARE-NEXT: $vgpr0 = SI_SPILL_V32_RESTORE %stack.9, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
+ ; SHARE-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 3, implicit-def $sgpr4_sgpr5
+ ; SHARE-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR killed $vgpr1, 4
+ ; SHARE-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit $vgpr0
+ ; SHARE-NEXT: renamable $vgpr0 = SI_SPILL_WWM_V32_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5)
+ ; SHARE-NEXT: $sgpr32 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0
+ ; SHARE-NEXT: $sgpr2 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 1
+ ; SHARE-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+ ;
+ ; NOSHARE-LABEL: name: stack-slot-color-after-sgpr-alloc
+ ; NOSHARE: liveins: $sgpr30, $sgpr31, $vgpr63
+ ; NOSHARE-NEXT: {{ $}}
+ ; NOSHARE-NEXT: renamable $vgpr2 = IMPLICIT_DEF
+ ; NOSHARE-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr30, 0, $vgpr63
+ ; NOSHARE-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr31, 1, $vgpr63
+ ; NOSHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr32, 0, killed $vgpr2
+ ; NOSHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr2, 1, killed $vgpr2, implicit-def $sgpr2_sgpr3, implicit $sgpr2_sgpr3
+ ; NOSHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr3, 2, killed $vgpr2, implicit $sgpr2_sgpr3
+ ; NOSHARE-NEXT: renamable $vgpr0_vgpr1 = IMPLICIT_DEF
+ ; NOSHARE-NEXT: renamable $vgpr0 = FLAT_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+ ; NOSHARE-NEXT: SI_SPILL_V32_SAVE killed $vgpr0, %stack.9, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
+ ; NOSHARE-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @func + 4, target-flags(amdgpu-rel32-hi) @func + 4, implicit-def dead $scc
+ ; NOSHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr4, 3, killed $vgpr2, implicit-def $sgpr4_sgpr5, implicit $sgpr4_sgpr5
+ ; NOSHARE-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR killed $sgpr5, 4, killed $vgpr2, implicit killed $sgpr4_sgpr5
+ ; NOSHARE-NEXT: SI_SPILL_WWM_V32_SAVE $vgpr2, %stack.8, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5)
+ ; NOSHARE-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+ ; NOSHARE-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 3, implicit-def $sgpr4_sgpr5
+ ; NOSHARE-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR killed $vgpr2, 4
+ ; NOSHARE-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit undef $vgpr0
+ ; NOSHARE-NEXT: renamable $vgpr1 = SI_SPILL_WWM_V32_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5)
+ ; NOSHARE-NEXT: $sgpr32 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 0
+ ; NOSHARE-NEXT: renamable $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr32, 5, killed $vgpr1
+ ; NOSHARE-NEXT: $sgpr2 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 1, implicit-def $sgpr2_sgpr3
+ ; NOSHARE-NEXT: $sgpr3 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 2
+ ; NOSHARE-NEXT: renamable $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr2, 6, killed $vgpr1
+ ; NOSHARE-NEXT: SI_SPILL_WWM_V32_SAVE $vgpr1, %stack.8, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5)
+ ; NOSHARE-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+ ; NOSHARE-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+ ; NOSHARE-NEXT: $vgpr0 = SI_SPILL_V32_RESTORE %stack.9, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
+ ; NOSHARE-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 3, implicit-def $sgpr4_sgpr5
+ ; NOSHARE-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR killed $vgpr1, 4
+ ; NOSHARE-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit $vgpr0
+ ; NOSHARE-NEXT: renamable $vgpr0 = SI_SPILL_WWM_V32_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5)
+ ; NOSHARE-NEXT: $sgpr32 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 5
+ ; NOSHARE-NEXT: $sgpr2 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 6
+ ; NOSHARE-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+ %0:sreg_32_xm0 = COPY $sgpr32
+ %5:sreg_64 = COPY $sgpr2_sgpr3
+ %1:vreg_64 = IMPLICIT_DEF
+ %2:vgpr_32 = FLAT_LOAD_DWORD %1, 0, 0, implicit $exec, implicit $flat_scr
+ %3:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @func + 4, target-flags(amdgpu-rel32-hi) @func + 4, implicit-def dead $scc
+ ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+ dead $sgpr30_sgpr31 = SI_CALL %3, @func, csr_amdgpu, implicit undef $vgpr0
+ $sgpr32 = COPY %0
+ %4:sreg_32_xm0 = COPY $sgpr32
+ $sgpr2_sgpr3 = COPY %5
+ %6:sreg_32 = COPY $sgpr2
+ ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+ ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+ $vgpr0 = COPY %2
+ dead $sgpr30_sgpr31 = SI_CALL %3, @func, csr_amdgpu, implicit killed $vgpr0
+ $sgpr32 = COPY %4
+ $sgpr2 = COPY %6
+ ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
+...
>From 6da7a6eb90a7148a4fb8a6dfb7ed36f3981e7611 Mon Sep 17 00:00:00 2001
From: vg0204 <Vikash.Gupta at amd.com>
Date: Wed, 29 May 2024 16:17:02 +0530
Subject: [PATCH 2/2] Added configuration flag for StackSlotColoring &
refactored comments.
---
llvm/include/llvm/CodeGen/Passes.h | 2 ++
llvm/lib/CodeGen/StackSlotColoring.cpp | 29 ++++++++++++-------
.../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 7 +++--
llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp | 20 +++----------
4 files changed, 30 insertions(+), 28 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/Passes.h b/llvm/include/llvm/CodeGen/Passes.h
index f850767270a4f..172583bc76fa8 100644
--- a/llvm/include/llvm/CodeGen/Passes.h
+++ b/llvm/include/llvm/CodeGen/Passes.h
@@ -371,6 +371,8 @@ namespace llvm {
/// StackSlotColoring - This pass performs stack slot coloring.
extern char &StackSlotColoringID;
+ FunctionPass *
+ createStackSlotColoring(bool preserveRegAllocNeededAnalysis = false);
/// This pass lays out funclets contiguously.
extern char &FuncletLayoutID;
diff --git a/llvm/lib/CodeGen/StackSlotColoring.cpp b/llvm/lib/CodeGen/StackSlotColoring.cpp
index 00cc6e60a1123..ab58e3dd369cf 100644
--- a/llvm/lib/CodeGen/StackSlotColoring.cpp
+++ b/llvm/lib/CodeGen/StackSlotColoring.cpp
@@ -67,6 +67,9 @@ namespace {
const MachineBlockFrequencyInfo *MBFI = nullptr;
SlotIndexes *Indexes = nullptr;
+ // Preserve analyses required by register allocation, in case RA runs
+ bool preserveRegAllocNeededAnalysis = false;
+
// SSIntervals - Spill slot intervals.
std::vector<LiveInterval*> SSIntervals;
@@ -142,7 +145,9 @@ namespace {
public:
static char ID; // Pass identification
- StackSlotColoring() : MachineFunctionPass(ID) {
+ StackSlotColoring(bool preserveRegAllocNeededAnalysis_ = false)
+ : MachineFunctionPass(ID),
+ preserveRegAllocNeededAnalysis(preserveRegAllocNeededAnalysis_) {
initializeStackSlotColoringPass(*PassRegistry::getPassRegistry());
}
@@ -155,12 +160,14 @@ namespace {
AU.addPreserved<MachineBlockFrequencyInfo>();
AU.addPreservedID(MachineDominatorsID);
- /// NOTE: As in AMDGPU pass pipeline, reg alloc is spillted into 2 phases
- /// and StackSlotColoring is invoked after each phase, it becomes
- /// important to preserve additional analyses result to be used by VGPR
- /// regAlloc, after being done with SGPR regAlloc and its related passes.
- AU.addPreserved<LiveIntervals>();
- AU.addPreserved<LiveDebugVariables>();
+ // In some targets' pipelines, register allocation (RA) may be split
+ // into multiple phases based on register class. So, this pass may be
+ // invoked multiple times, requiring it to preserve these analyses for
+ // use by RA later.
+ if (preserveRegAllocNeededAnalysis) {
+ AU.addPreserved<LiveIntervals>();
+ AU.addPreserved<LiveDebugVariables>();
+ }
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -506,9 +513,6 @@ bool StackSlotColoring::RemoveDeadStores(MachineBasicBlock* MBB) {
++I;
}
- /// FIXED: As this pass preserves SlotIndexesAnalysis result, any
- /// addition/removal of MI needs corresponding update in SlotIndexAnalysis,
- /// to avoid corruption of SlotIndexesAnalysis result.
for (MachineInstr *MI : toErase) {
MI->eraseFromParent();
Indexes->removeMachineInstrFromMaps(*MI);
@@ -565,3 +569,8 @@ bool StackSlotColoring::runOnMachineFunction(MachineFunction &MF) {
return Changed;
}
+
+FunctionPass *
+llvm::createStackSlotColoring(bool preserveRegAllocNeededAnalysis) {
+ return new StackSlotColoring(preserveRegAllocNeededAnalysis);
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 728cf4fe0281a..2a4b9c2f87e65 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -1406,8 +1406,11 @@ bool GCNPassConfig::addRegAssignAndRewriteOptimized() {
// since FastRegAlloc does the replacements itself.
addPass(createVirtRegRewriter(false));
- // Optimizes SGPR spills into VGPR lanes for non-interferring spill-ranges.
- addPass(&StackSlotColoringID);
+ // By this point SGPR register allocation is done and has introduced SGPR
+ // spills to the stack frame. Invoking StackSlotColoring here allows these
+ // SGPR spills to reuse stack slots, before the spills are further lowered
+ // via SILowerSGPRSpills (i.e. the equivalent of PEI for SGPRs).
+ addPass(createStackSlotColoring(/*preserveRegAllocNeededAnalysis*/ true));
// Equivalent of PEI for SGPRs.
addPass(&SILowerSGPRSpillsID);
diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
index 9e121b47ad3fb..fa3aa9d7091b7 100644
--- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
@@ -52,8 +52,7 @@ class SILowerSGPRSpills : public MachineFunctionPass {
void calculateSaveRestoreBlocks(MachineFunction &MF);
bool spillCalleeSavedRegs(MachineFunction &MF,
SmallVectorImpl<int> &CalleeSavedFIs);
- void extendWWMVirtRegLiveness(MachineFunction &MF, SlotIndexes *Indexes,
- LiveIntervals *LIS);
+ void extendWWMVirtRegLiveness(MachineFunction &MF, LiveIntervals *LIS);
bool runOnMachineFunction(MachineFunction &MF) override;
@@ -261,7 +260,6 @@ bool SILowerSGPRSpills::spillCalleeSavedRegs(
}
void SILowerSGPRSpills::extendWWMVirtRegLiveness(MachineFunction &MF,
- SlotIndexes *Indexes,
LiveIntervals *LIS) {
// TODO: This is a workaround to avoid the unmodelled liveness computed with
// whole-wave virtual registers when allocated together with the regular VGPR
@@ -280,7 +278,6 @@ void SILowerSGPRSpills::extendWWMVirtRegLiveness(MachineFunction &MF,
for (auto Reg : MFI->getSGPRSpillVGPRs()) {
for (MachineBasicBlock *SaveBlock : SaveBlocks) {
MachineBasicBlock::iterator InsertBefore = SaveBlock->begin();
- MachineInstrSpan MIS(InsertBefore, SaveBlock);
DebugLoc DL = SaveBlock->findDebugLoc(InsertBefore);
auto MIB = BuildMI(*SaveBlock, InsertBefore, DL,
@@ -289,13 +286,8 @@ void SILowerSGPRSpills::extendWWMVirtRegLiveness(MachineFunction &MF,
// Set SGPR_SPILL asm printer flag
MIB->setAsmPrinterFlag(AMDGPU::SGPR_SPILL);
- if (LIS) {
+ if (LIS)
LIS->InsertMachineInstrInMaps(*MIB);
- } else if (Indexes) {
- assert(std::distance(MIS.begin(), InsertBefore) == 1);
- MachineInstr &Inst = *std::prev(InsertBefore);
- Indexes->insertMachineInstrInMaps(Inst);
- }
}
}
@@ -310,12 +302,8 @@ void SILowerSGPRSpills::extendWWMVirtRegLiveness(MachineFunction &MF,
TII->get(TargetOpcode::KILL));
MIB.addReg(Reg);
- if (LIS) {
+ if (LIS)
LIS->InsertMachineInstrInMaps(*MIB);
- } else if (Indexes) {
- MachineInstr &Inst = *std::prev(InsertBefore);
- Indexes->insertMachineInstrInMaps(Inst);
- }
}
}
}
@@ -406,7 +394,7 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
}
if (SpilledToVirtVGPRLanes) {
- extendWWMVirtRegLiveness(MF, Indexes, LIS);
+ extendWWMVirtRegLiveness(MF, LIS);
if (LIS) {
// Compute the LiveInterval for the newly created virtual registers.
for (auto Reg : FuncInfo->getSGPRSpillVGPRs())
More information about the llvm-commits
mailing list