[llvm] [AMDGPU][SILowerSGPRSpills] Correct insertion of IMPLICIT_DEF in cycles (PR #186348)
Chris Jackson via llvm-commits
llvm-commits at lists.llvm.org
Tue Mar 31 09:14:54 PDT 2026
https://github.com/chrisjbris updated https://github.com/llvm/llvm-project/pull/186348
>From b1a1b7785739009288822c1fdfc59ec059fa9f0c Mon Sep 17 00:00:00 2001
From: Chris Jackson <chris.jackson at amd.com>
Date: Fri, 13 Mar 2026 04:22:48 -0500
Subject: [PATCH 01/12] [AMDGPU][RegAlloc] Correct insertion of IMPLICIT_DEF in
loop headers
si-lower-sgpr-spills was observed inserting IMPLICIT_DEF for lane VGPR
restores in the loop header. The virtual VGPR is therefore not live-in
to the header, so WWM regallocfast does not insert a restore. This
results in the VGPR being clobbered after each backedge.
Correct this by inserting the IMPLICIT_DEF in the preheader.
---
llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp | 34 ++++-
llvm/test/CodeGen/AMDGPU/llc-pipeline.ll | 1 +
.../CodeGen/AMDGPU/sgpr-regalloc-flags.ll | 1 +
.../si-lower-sgpr-spills-loop-preheader.mir | 117 ++++++++++++++++++
.../AMDGPU/spill-sgpr-to-virtual-vgpr.mir | 6 +-
5 files changed, 152 insertions(+), 7 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-loop-preheader.mir
diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
index 196e551932659..4930c49a195c0 100644
--- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
@@ -23,6 +23,7 @@
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/InitializePasses.h"
@@ -46,16 +47,19 @@ class SILowerSGPRSpills {
LiveIntervals *LIS = nullptr;
SlotIndexes *Indexes = nullptr;
MachineDominatorTree *MDT = nullptr;
+ MachineLoopInfo *MLI = nullptr;
// Save and Restore blocks of the current function. Typically there is a
// single save block, unless Windows EH funclets are involved.
MBBVector SaveBlocks;
MBBVector RestoreBlocks;
+ MachineBasicBlock *getLoopHeaderDomBB(MachineBasicBlock *MBB);
+
public:
SILowerSGPRSpills(LiveIntervals *LIS, SlotIndexes *Indexes,
- MachineDominatorTree *MDT)
- : LIS(LIS), Indexes(Indexes), MDT(MDT) {}
+ MachineDominatorTree *MDT, MachineLoopInfo *MLI)
+ : LIS(LIS), Indexes(Indexes), MDT(MDT), MLI(MLI) {}
bool run(MachineFunction &MF);
void calculateSaveRestoreBlocks(MachineFunction &MF);
bool spillCalleeSavedRegs(MachineFunction &MF,
@@ -76,6 +80,7 @@ class SILowerSGPRSpillsLegacy : public MachineFunctionPass {
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<MachineDominatorTreeWrapperPass>();
+ AU.addRequired<MachineLoopInfoWrapperPass>();
AU.setPreservesAll();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -95,6 +100,7 @@ INITIALIZE_PASS_BEGIN(SILowerSGPRSpillsLegacy, DEBUG_TYPE,
INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(SlotIndexesWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
INITIALIZE_PASS_END(SILowerSGPRSpillsLegacy, DEBUG_TYPE,
"SI lower SGPR spill instructions", false, false)
@@ -299,6 +305,18 @@ bool SILowerSGPRSpills::spillCalleeSavedRegs(
return false;
}
+MachineBasicBlock *
+SILowerSGPRSpills::getLoopHeaderDomBB(MachineBasicBlock *MBB) {
+ // Only redirect from the loop header; other blocks in the loop must keep
+ // their insertion point (e.g. latch blocks are not loop headers).
+ if (MLI->isLoopHeader(MBB))
+ if (MachineLoop *L = MLI->getLoopFor(MBB))
+ if (MachineBasicBlock *Preheader = MLI->findLoopPreheader(L))
+ return Preheader;
+
+ return MBB;
+}
+
void SILowerSGPRSpills::updateLaneVGPRDomInstr(
int FI, MachineBasicBlock *MBB, MachineBasicBlock::iterator InsertPt,
DenseMap<Register, MachineBasicBlock::iterator> &LaneVGPRDomInstr) {
@@ -340,6 +358,12 @@ void SILowerSGPRSpills::updateLaneVGPRDomInstr(
// Find the common dominator block between PrevInsertPt and the
// current spill.
DomMBB = MDT->findNearestCommonDominator(DomMBB, MBB);
+ // If the insertion point is a loop header move it to the preheader.
+ // This ensures the IMPLICIT_DEF dominates and is live-in to the header.
+ // WWM regalloc will then insert a wwm restore in the header.
+ if (MLI->isLoopHeader(DomMBB))
+ DomMBB = getLoopHeaderDomBB(DomMBB);
+
if (DomMBB == MBB)
I->second = InsertPt;
else if (DomMBB != PrevInsertPt->getParent())
@@ -393,7 +417,8 @@ bool SILowerSGPRSpillsLegacy::runOnMachineFunction(MachineFunction &MF) {
SlotIndexes *Indexes = SIWrapper ? &SIWrapper->getSI() : nullptr;
MachineDominatorTree *MDT =
&getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
- return SILowerSGPRSpills(LIS, Indexes, MDT).run(MF);
+ MachineLoopInfo *MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
+ return SILowerSGPRSpills(LIS, Indexes, MDT, MLI).run(MF);
}
bool SILowerSGPRSpills::run(MachineFunction &MF) {
@@ -581,6 +606,7 @@ SILowerSGPRSpillsPass::run(MachineFunction &MF,
auto *LIS = MFAM.getCachedResult<LiveIntervalsAnalysis>(MF);
auto *Indexes = MFAM.getCachedResult<SlotIndexesAnalysis>(MF);
MachineDominatorTree *MDT = &MFAM.getResult<MachineDominatorTreeAnalysis>(MF);
- SILowerSGPRSpills(LIS, Indexes, MDT).run(MF);
+ MachineLoopInfo *MLI = &MFAM.getResult<MachineLoopAnalysis>(MF);
+ SILowerSGPRSpills(LIS, Indexes, MDT, MLI).run(MF);
return PreservedAnalyses::all();
}
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index 81b9aae775ed8..7c81a0659acce 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -118,6 +118,7 @@
; GCN-O0-NEXT: SI Whole Quad Mode
; GCN-O0-NEXT: AMDGPU Pre-RA Long Branch Reg
; GCN-O0-NEXT: Fast Register Allocator
+; GCN-O0-NEXT: Machine Natural Loop Construction
; GCN-O0-NEXT: SI lower SGPR spill instructions
; GCN-O0-NEXT: Slot index numbering
; GCN-O0-NEXT: Live Interval Analysis
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll b/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll
index c83af33659dad..ef0d6a942c585 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll
@@ -37,6 +37,7 @@
; DEFAULT-NEXT: Stack Slot Coloring
; O0: Fast Register Allocator
+; O0-NEXT: Machine Natural Loop Construction
; O0-NEXT: SI lower SGPR spill instructions
; O0-NEXT: Slot index numbering
; O0-NEXT: Live Interval Analysis
diff --git a/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-loop-preheader.mir b/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-loop-preheader.mir
new file mode 100644
index 0000000000000..21d8c7762b0d7
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-loop-preheader.mir
@@ -0,0 +1,117 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
+
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -run-pass=si-lower-sgpr-spills -o - %s | FileCheck -check-prefix=SGPR-SPILL %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -start-before=si-lower-sgpr-spills -stop-after=regallocfast,1 -verify-machineinstrs %s -o - | FileCheck -check-prefix=WWM-REGALLOC %s
+
+
+# When SGPR spills to a virtual VGPR lane occur in both a loop header and the latch,
+# the IMPLICIT_DEF for the lane VGPR must be placed in the preheader (not the header).
+# Establish that the virtual VGPR is live-in to the header and wwm regallocfast inserts
+# a restore, preserving the latch writes.
+
+---
+name: sgpr_spill_loop_header_and_latch_implicit_def_in_preheader
+tracksRegLiveness: true
+frameInfo:
+ maxAlignment: 4
+stack:
+ - { id: 0, type: spill-slot, size: 4, alignment: 4, stack-id: sgpr-spill }
+machineFunctionInfo:
+ isEntryFunction: false
+ scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3'
+ stackPtrOffsetReg: '$sgpr32'
+ frameOffsetReg: '$sgpr33'
+ hasSpilledSGPRs: true
+body: |
+ ; SGPR-SPILL-LABEL: name: sgpr_spill_loop_header_and_latch_implicit_def_in_preheader
+ ; SGPR-SPILL: bb.0:
+ ; SGPR-SPILL-NEXT: successors: %bb.1(0x80000000)
+ ; SGPR-SPILL-NEXT: liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31
+ ; SGPR-SPILL-NEXT: {{ $}}
+ ; SGPR-SPILL-NEXT: S_NOP 0
+ ; SGPR-SPILL-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; SGPR-SPILL-NEXT: S_BRANCH %bb.1
+ ; SGPR-SPILL-NEXT: {{ $}}
+ ; SGPR-SPILL-NEXT: bb.1:
+ ; SGPR-SPILL-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
+ ; SGPR-SPILL-NEXT: liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31
+ ; SGPR-SPILL-NEXT: {{ $}}
+ ; SGPR-SPILL-NEXT: $sgpr10 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 0
+ ; SGPR-SPILL-NEXT: S_CMP_EQ_U32 $sgpr11, 0, implicit-def $scc
+ ; SGPR-SPILL-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR killed $sgpr10, 0, [[DEF]]
+ ; SGPR-SPILL-NEXT: S_CBRANCH_SCC1 %bb.2, implicit killed $scc
+ ; SGPR-SPILL-NEXT: S_BRANCH %bb.3
+ ; SGPR-SPILL-NEXT: {{ $}}
+ ; SGPR-SPILL-NEXT: bb.2:
+ ; SGPR-SPILL-NEXT: successors: %bb.1(0x80000000)
+ ; SGPR-SPILL-NEXT: liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31
+ ; SGPR-SPILL-NEXT: {{ $}}
+ ; SGPR-SPILL-NEXT: $sgpr10 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 0
+ ; SGPR-SPILL-NEXT: $sgpr10 = S_MOV_B32 1
+ ; SGPR-SPILL-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR killed $sgpr10, 0, [[DEF]]
+ ; SGPR-SPILL-NEXT: S_BRANCH %bb.1
+ ; SGPR-SPILL-NEXT: {{ $}}
+ ; SGPR-SPILL-NEXT: bb.3:
+ ; SGPR-SPILL-NEXT: liveins: $sgpr30_sgpr31
+ ; SGPR-SPILL-NEXT: {{ $}}
+ ; SGPR-SPILL-NEXT: S_SETPC_B64 $sgpr30_sgpr31
+ ;
+ ; WWM-REGALLOC-LABEL: name: sgpr_spill_loop_header_and_latch_implicit_def_in_preheader
+ ; WWM-REGALLOC: bb.0:
+ ; WWM-REGALLOC-NEXT: successors: %bb.1(0x80000000)
+ ; WWM-REGALLOC-NEXT: liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31
+ ; WWM-REGALLOC-NEXT: {{ $}}
+ ; WWM-REGALLOC-NEXT: S_NOP 0
+ ; WWM-REGALLOC-NEXT: renamable $vgpr63 = IMPLICIT_DEF
+ ; WWM-REGALLOC-NEXT: S_BRANCH %bb.1
+ ; WWM-REGALLOC-NEXT: {{ $}}
+ ; WWM-REGALLOC-NEXT: bb.1:
+ ; WWM-REGALLOC-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
+ ; WWM-REGALLOC-NEXT: liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31
+ ; WWM-REGALLOC-NEXT: {{ $}}
+ ; WWM-REGALLOC-NEXT: $vgpr63 = SI_SPILL_WWM_V32_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5)
+ ; WWM-REGALLOC-NEXT: $sgpr10 = SI_RESTORE_S32_FROM_VGPR $vgpr63, 0
+ ; WWM-REGALLOC-NEXT: S_CMP_EQ_U32 $sgpr11, 0, implicit-def $scc
+ ; WWM-REGALLOC-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR $sgpr10, 0, $vgpr63
+ ; WWM-REGALLOC-NEXT: SI_SPILL_WWM_V32_SAVE killed $vgpr63, %stack.1, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5)
+ ; WWM-REGALLOC-NEXT: S_CBRANCH_SCC1 %bb.2, implicit killed $scc
+ ; WWM-REGALLOC-NEXT: S_BRANCH %bb.3
+ ; WWM-REGALLOC-NEXT: {{ $}}
+ ; WWM-REGALLOC-NEXT: bb.2:
+ ; WWM-REGALLOC-NEXT: successors: %bb.1(0x80000000)
+ ; WWM-REGALLOC-NEXT: liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31
+ ; WWM-REGALLOC-NEXT: {{ $}}
+ ; WWM-REGALLOC-NEXT: $vgpr63 = SI_SPILL_WWM_V32_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5)
+ ; WWM-REGALLOC-NEXT: dead $sgpr10 = SI_RESTORE_S32_FROM_VGPR $vgpr63, 0
+ ; WWM-REGALLOC-NEXT: $sgpr10 = S_MOV_B32 1
+ ; WWM-REGALLOC-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR $sgpr10, 0, $vgpr63
+ ; WWM-REGALLOC-NEXT: SI_SPILL_WWM_V32_SAVE killed $vgpr63, %stack.1, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5)
+ ; WWM-REGALLOC-NEXT: S_BRANCH %bb.1
+ ; WWM-REGALLOC-NEXT: {{ $}}
+ ; WWM-REGALLOC-NEXT: bb.3:
+ ; WWM-REGALLOC-NEXT: liveins: $sgpr30_sgpr31
+ ; WWM-REGALLOC-NEXT: {{ $}}
+ ; WWM-REGALLOC-NEXT: S_SETPC_B64 killed $sgpr30_sgpr31
+
+
+ bb.0:
+ liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31
+ S_NOP 0
+ S_BRANCH %bb.1
+ bb.1:
+ liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31
+ renamable $sgpr10 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32
+ S_CMP_EQ_U32 $sgpr11, 0, implicit-def $scc
+ SI_SPILL_S32_SAVE killed $sgpr10, %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32
+ S_CBRANCH_SCC1 %bb.2, implicit killed $scc
+ S_BRANCH %bb.3
+ bb.2:
+ liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31
+ renamable $sgpr10 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32
+ $sgpr10 = S_MOV_B32 1
+ SI_SPILL_S32_SAVE killed $sgpr10, %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32
+ S_BRANCH %bb.1
+ bb.3:
+ liveins: $sgpr30_sgpr31
+ S_SETPC_B64 $sgpr30_sgpr31
+## NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
diff --git a/llvm/test/CodeGen/AMDGPU/spill-sgpr-to-virtual-vgpr.mir b/llvm/test/CodeGen/AMDGPU/spill-sgpr-to-virtual-vgpr.mir
index fa3fd3bc6da5b..bb47603647733 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-sgpr-to-virtual-vgpr.mir
+++ b/llvm/test/CodeGen/AMDGPU/spill-sgpr-to-virtual-vgpr.mir
@@ -241,13 +241,14 @@ body: |
; GCN-NEXT: liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31
; GCN-NEXT: {{ $}}
; GCN-NEXT: S_NOP 0
+ ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; GCN-NEXT: S_BRANCH %bb.3
; GCN-NEXT: {{ $}}
; GCN-NEXT: bb.1:
; GCN-NEXT: successors: %bb.2(0x80000000)
; GCN-NEXT: liveins: $sgpr10, $sgpr30_sgpr31
; GCN-NEXT: {{ $}}
- ; GCN-NEXT: $sgpr10 = SI_RESTORE_S32_FROM_VGPR %0, 0
+ ; GCN-NEXT: $sgpr10 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 0
; GCN-NEXT: $sgpr10 = S_ADD_I32 $sgpr10, 15, implicit-def dead $scc
; GCN-NEXT: S_BRANCH %bb.2
; GCN-NEXT: {{ $}}
@@ -255,7 +256,7 @@ body: |
; GCN-NEXT: successors: %bb.3(0x80000000)
; GCN-NEXT: liveins: $sgpr10, $sgpr30_sgpr31
; GCN-NEXT: {{ $}}
- ; GCN-NEXT: $sgpr10 = SI_RESTORE_S32_FROM_VGPR %0, 0
+ ; GCN-NEXT: $sgpr10 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 0
; GCN-NEXT: $sgpr10 = S_ADD_I32 $sgpr10, 20, implicit-def dead $scc
; GCN-NEXT: S_BRANCH %bb.3
; GCN-NEXT: {{ $}}
@@ -264,7 +265,6 @@ body: |
; GCN-NEXT: liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31
; GCN-NEXT: {{ $}}
; GCN-NEXT: $sgpr10 = S_MOV_B32 10
- ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR killed $sgpr10, 0, [[DEF]]
; GCN-NEXT: S_CMP_EQ_U32 $sgpr11, 0, implicit-def $scc
; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit killed $scc
>From 66587862783fff74038bc3f1c33994f2a18fcfc5 Mon Sep 17 00:00:00 2001
From: Chris Jackson <chris.jackson at amd.com>
Date: Tue, 24 Mar 2026 10:33:43 -0500
Subject: [PATCH 02/12] Remove Machine Loop Info in favour of Machine Cycle
Info.
---
llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp | 42 +++++++++----------
llvm/test/CodeGen/AMDGPU/llc-pipeline.ll | 6 ++-
.../CodeGen/AMDGPU/sgpr-regalloc-flags.ll | 6 ++-
3 files changed, 31 insertions(+), 23 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
index 4930c49a195c0..f04e20d387c58 100644
--- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
@@ -21,9 +21,9 @@
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/MachineCycleAnalysis.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/InitializePasses.h"
@@ -47,19 +47,19 @@ class SILowerSGPRSpills {
LiveIntervals *LIS = nullptr;
SlotIndexes *Indexes = nullptr;
MachineDominatorTree *MDT = nullptr;
- MachineLoopInfo *MLI = nullptr;
+ MachineCycleInfo *MCI = nullptr;
// Save and Restore blocks of the current function. Typically there is a
// single save block, unless Windows EH funclets are involved.
MBBVector SaveBlocks;
MBBVector RestoreBlocks;
- MachineBasicBlock *getLoopHeaderDomBB(MachineBasicBlock *MBB);
+ MachineBasicBlock *getCycleHeaderDomBB(MachineBasicBlock *MBB);
public:
SILowerSGPRSpills(LiveIntervals *LIS, SlotIndexes *Indexes,
- MachineDominatorTree *MDT, MachineLoopInfo *MLI)
- : LIS(LIS), Indexes(Indexes), MDT(MDT), MLI(MLI) {}
+ MachineDominatorTree *MDT, MachineCycleInfo *MCI)
+ : LIS(LIS), Indexes(Indexes), MDT(MDT), MCI(MCI) {}
bool run(MachineFunction &MF);
void calculateSaveRestoreBlocks(MachineFunction &MF);
bool spillCalleeSavedRegs(MachineFunction &MF,
@@ -80,7 +80,7 @@ class SILowerSGPRSpillsLegacy : public MachineFunctionPass {
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<MachineDominatorTreeWrapperPass>();
- AU.addRequired<MachineLoopInfoWrapperPass>();
+ AU.addRequired<MachineCycleInfoWrapperPass>();
AU.setPreservesAll();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -100,7 +100,7 @@ INITIALIZE_PASS_BEGIN(SILowerSGPRSpillsLegacy, DEBUG_TYPE,
INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(SlotIndexesWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(MachineCycleInfoWrapperPass)
INITIALIZE_PASS_END(SILowerSGPRSpillsLegacy, DEBUG_TYPE,
"SI lower SGPR spill instructions", false, false)
@@ -306,13 +306,14 @@ bool SILowerSGPRSpills::spillCalleeSavedRegs(
}
MachineBasicBlock *
-SILowerSGPRSpills::getLoopHeaderDomBB(MachineBasicBlock *MBB) {
- // Only redirect from the loop header; other blocks in the loop must keep
- // their insertion point (e.g. latch blocks are not loop headers).
- if (MLI->isLoopHeader(MBB))
- if (MachineLoop *L = MLI->getLoopFor(MBB))
- if (MachineBasicBlock *Preheader = MLI->findLoopPreheader(L))
- return Preheader;
+SILowerSGPRSpills:: getCycleHeaderDomBB(MachineBasicBlock *MBB) {
+ // Only redirect from the cycle header; other blocks in the cycle must keep
+ // their insertion point (e.g. latch blocks are not headers).
+ MachineCycle *C = MCI->getCycle(MBB);
+ if (!C || !C->isReducible() || C->getHeader() != MBB)
+ return MBB;
+ if (MachineBasicBlock *Preheader = C->getCyclePreheader())
+ return Preheader;
return MBB;
}
@@ -358,11 +359,10 @@ void SILowerSGPRSpills::updateLaneVGPRDomInstr(
// Find the common dominator block between PrevInsertPt and the
// current spill.
DomMBB = MDT->findNearestCommonDominator(DomMBB, MBB);
- // If the insertion point is a loop header move it to the preheader.
+ // If the insertion point is a cycle header move it to the preheader.
// This ensures the IMPLICIT_DEF dominates and is live-in to the header.
// WWM regalloc will then insert a wwm restore in the header.
- if (MLI->isLoopHeader(DomMBB))
- DomMBB = getLoopHeaderDomBB(DomMBB);
+ DomMBB = getCycleHeaderDomBB(DomMBB);
if (DomMBB == MBB)
I->second = InsertPt;
@@ -417,8 +417,8 @@ bool SILowerSGPRSpillsLegacy::runOnMachineFunction(MachineFunction &MF) {
SlotIndexes *Indexes = SIWrapper ? &SIWrapper->getSI() : nullptr;
MachineDominatorTree *MDT =
&getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
- MachineLoopInfo *MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
- return SILowerSGPRSpills(LIS, Indexes, MDT, MLI).run(MF);
+ MachineCycleInfo *MCI = &getAnalysis<MachineCycleInfoWrapperPass>().getCycleInfo();
+ return SILowerSGPRSpills(LIS, Indexes, MDT, MCI).run(MF);
}
bool SILowerSGPRSpills::run(MachineFunction &MF) {
@@ -606,7 +606,7 @@ SILowerSGPRSpillsPass::run(MachineFunction &MF,
auto *LIS = MFAM.getCachedResult<LiveIntervalsAnalysis>(MF);
auto *Indexes = MFAM.getCachedResult<SlotIndexesAnalysis>(MF);
MachineDominatorTree *MDT = &MFAM.getResult<MachineDominatorTreeAnalysis>(MF);
- MachineLoopInfo *MLI = &MFAM.getResult<MachineLoopAnalysis>(MF);
- SILowerSGPRSpills(LIS, Indexes, MDT, MLI).run(MF);
+ MachineCycleInfo &MCI = MFAM.getResult<MachineCycleAnalysis>(MF);
+ SILowerSGPRSpills(LIS, Indexes, MDT, &MCI).run(MF);
return PreservedAnalyses::all();
}
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index 7c81a0659acce..3f8f5f4f2cc81 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -118,7 +118,7 @@
; GCN-O0-NEXT: SI Whole Quad Mode
; GCN-O0-NEXT: AMDGPU Pre-RA Long Branch Reg
; GCN-O0-NEXT: Fast Register Allocator
-; GCN-O0-NEXT: Machine Natural Loop Construction
+; GCN-O0-NEXT: Machine Cycle Info Analysis
; GCN-O0-NEXT: SI lower SGPR spill instructions
; GCN-O0-NEXT: Slot index numbering
; GCN-O0-NEXT: Live Interval Analysis
@@ -383,6 +383,7 @@
; GCN-O1-NEXT: Greedy Register Allocator
; GCN-O1-NEXT: Virtual Register Rewriter
; GCN-O1-NEXT: Stack Slot Coloring
+; GCN-O1-NEXT: Machine Cycle Info Analysis
; GCN-O1-NEXT: SI lower SGPR spill instructions
; GCN-O1-NEXT: Virtual Register Map
; GCN-O1-NEXT: Live Register Matrix
@@ -705,6 +706,7 @@
; GCN-O1-OPTS-NEXT: Greedy Register Allocator
; GCN-O1-OPTS-NEXT: Virtual Register Rewriter
; GCN-O1-OPTS-NEXT: Stack Slot Coloring
+; GCN-O1-OPTS-NEXT: Machine Cycle Info Analysis
; GCN-O1-OPTS-NEXT: SI lower SGPR spill instructions
; GCN-O1-OPTS-NEXT: Virtual Register Map
; GCN-O1-OPTS-NEXT: Live Register Matrix
@@ -1032,6 +1034,7 @@
; GCN-O2-NEXT: Greedy Register Allocator
; GCN-O2-NEXT: Virtual Register Rewriter
; GCN-O2-NEXT: Stack Slot Coloring
+; GCN-O2-NEXT: Machine Cycle Info Analysis
; GCN-O2-NEXT: SI lower SGPR spill instructions
; GCN-O2-NEXT: Virtual Register Map
; GCN-O2-NEXT: Live Register Matrix
@@ -1372,6 +1375,7 @@
; GCN-O3-NEXT: Greedy Register Allocator
; GCN-O3-NEXT: Virtual Register Rewriter
; GCN-O3-NEXT: Stack Slot Coloring
+; GCN-O3-NEXT: Machine Cycle Info Analysis
; GCN-O3-NEXT: SI lower SGPR spill instructions
; GCN-O3-NEXT: Virtual Register Map
; GCN-O3-NEXT: Live Register Matrix
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll b/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll
index ef0d6a942c585..fc5dabc584863 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll
@@ -18,6 +18,7 @@
; DEFAULT: Greedy Register Allocator
; DEFAULT-NEXT: Virtual Register Rewriter
; DEFAULT-NEXT: Stack Slot Coloring
+; DEFAULT-NEXT: Machine Cycle Info Analysis
; DEFAULT-NEXT: SI lower SGPR spill instructions
; DEFAULT-NEXT: Virtual Register Map
; DEFAULT-NEXT: Live Register Matrix
@@ -37,7 +38,7 @@
; DEFAULT-NEXT: Stack Slot Coloring
; O0: Fast Register Allocator
-; O0-NEXT: Machine Natural Loop Construction
+; O0-NEXT: Machine Cycle Info Analysis
; O0-NEXT: SI lower SGPR spill instructions
; O0-NEXT: Slot index numbering
; O0-NEXT: Live Interval Analysis
@@ -62,6 +63,7 @@
; BASIC-DEFAULT-NEXT: Basic Register Allocator
; BASIC-DEFAULT-NEXT: Virtual Register Rewriter
; BASIC-DEFAULT-NEXT: Stack Slot Coloring
+; BASIC-DEFAULT-NEXT: Machine Cycle Info Analysis
; BASIC-DEFAULT-NEXT: SI lower SGPR spill instructions
; BASIC-DEFAULT-NEXT: Virtual Register Map
; BASIC-DEFAULT-NEXT: Live Register Matrix
@@ -89,6 +91,7 @@
; DEFAULT-BASIC: Greedy Register Allocator
; DEFAULT-BASIC-NEXT: Virtual Register Rewriter
; DEFAULT-BASIC-NEXT: Stack Slot Coloring
+; DEFAULT-BASIC-NEXT: Machine Cycle Info Analysis
; DEFAULT-BASIC-NEXT: SI lower SGPR spill instructions
; DEFAULT-BASIC-NEXT: Virtual Register Map
; DEFAULT-BASIC-NEXT: Live Register Matrix
@@ -118,6 +121,7 @@
; BASIC-BASIC-NEXT: Basic Register Allocator
; BASIC-BASIC-NEXT: Virtual Register Rewriter
; BASIC-BASIC-NEXT: Stack Slot Coloring
+; BASIC-BASIC-NEXT: Machine Cycle Info Analysis
; BASIC-BASIC-NEXT: SI lower SGPR spill instructions
; BASIC-BASIC-NEXT: Virtual Register Map
; BASIC-BASIC-NEXT: Live Register Matrix
>From cfd3f33f06a4e7b6e149fd204c4482e7b3dac61c Mon Sep 17 00:00:00 2001
From: Chris Jackson <chris.jackson at amd.com>
Date: Tue, 24 Mar 2026 11:33:07 -0500
Subject: [PATCH 03/12] Modify implementation to support cycles with multiple
entries.
---
llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp | 32 ++++++----
.../si-lower-sgpr-spills-loop-preheader.ll | 61 +++++++++++++++++++
2 files changed, 82 insertions(+), 11 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-loop-preheader.ll
diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
index f04e20d387c58..ae360d23b9b77 100644
--- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
@@ -54,7 +54,8 @@ class SILowerSGPRSpills {
MBBVector SaveBlocks;
MBBVector RestoreBlocks;
- MachineBasicBlock *getCycleHeaderDomBB(MachineBasicBlock *MBB);
+ MachineBasicBlock *getCycleHeaderDomBB(MachineBasicBlock *MBB,
+ MachineCycle *C);
public:
SILowerSGPRSpills(LiveIntervals *LIS, SlotIndexes *Indexes,
@@ -306,16 +307,22 @@ bool SILowerSGPRSpills::spillCalleeSavedRegs(
}
MachineBasicBlock *
-SILowerSGPRSpills:: getCycleHeaderDomBB(MachineBasicBlock *MBB) {
- // Only redirect from the cycle header; other blocks in the cycle must keep
- // their insertion point (e.g. latch blocks are not headers).
- MachineCycle *C = MCI->getCycle(MBB);
- if (!C || !C->isReducible() || C->getHeader() != MBB)
+SILowerSGPRSpills::getCycleHeaderDomBB(MachineBasicBlock *MBB,
+ MachineCycle *C) {
+ // If the insertion point lands on a cycle entry, move it to a block that
+ // dominates all entries.
+ if (C->isReducible()) {
+ if (MachineBasicBlock *Preheader = C->getCyclePreheader())
+ return Preheader;
return MBB;
- if (MachineBasicBlock *Preheader = C->getCyclePreheader())
- return Preheader;
+ }
- return MBB;
+ const SmallVectorImpl<MachineBasicBlock *> &Entries = C->getEntries();
+ assert(!Entries.empty() && "Expected cycle to have at least one entry.");
+ MachineBasicBlock *EntryBB = Entries[0];
+ for (unsigned i = 1; i < Entries.size(); ++i)
+ EntryBB = MDT->findNearestCommonDominator(EntryBB, Entries[i]);
+ return EntryBB;
}
void SILowerSGPRSpills::updateLaneVGPRDomInstr(
@@ -362,7 +369,9 @@ void SILowerSGPRSpills::updateLaneVGPRDomInstr(
// If the insertion point is a cycle header move it to the preheader.
// This ensures the IMPLICIT_DEF dominates and is live-in to the header.
// WWM regalloc will then insert a wwm restore in the header.
- DomMBB = getCycleHeaderDomBB(DomMBB);
+ MachineCycle *C = MCI->getCycle(DomMBB);
+ if (C && C->isEntry(DomMBB))
+ DomMBB = getCycleHeaderDomBB(DomMBB, C);
if (DomMBB == MBB)
I->second = InsertPt;
@@ -417,7 +426,8 @@ bool SILowerSGPRSpillsLegacy::runOnMachineFunction(MachineFunction &MF) {
SlotIndexes *Indexes = SIWrapper ? &SIWrapper->getSI() : nullptr;
MachineDominatorTree *MDT =
&getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
- MachineCycleInfo *MCI = &getAnalysis<MachineCycleInfoWrapperPass>().getCycleInfo();
+ MachineCycleInfo *MCI =
+ &getAnalysis<MachineCycleInfoWrapperPass>().getCycleInfo();
return SILowerSGPRSpills(LIS, Indexes, MDT, MCI).run(MF);
}
diff --git a/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-loop-preheader.ll b/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-loop-preheader.ll
new file mode 100644
index 0000000000000..396a7f1387ae5
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-loop-preheader.ll
@@ -0,0 +1,61 @@
+; REQUIRES: amdgpu-registered-target
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -O0 -amdgpu-spill-sgpr-to-vgpr=1 \
+; RUN: -verify-machineinstrs -stop-after=si-lower-sgpr-spills -o - %s | FileCheck %s
+
+;; Ensure that si-lower-sgpr-spills prevents IMPLICIT_DEF assignments from clobbering
+;; backedge writes by placing the assignment in the cycle preheader and not the header.
+
+; CHECK-LABEL: name: loop_sgpr_spill_implicit_def_in_preheader
+; CHECK: vgpr_32 = IMPLICIT_DEF
+; CHECK: SI_SPILL_S32_TO_VGPR killed $sgpr4, 11
+; CHECK-NEXT: S_BRANCH %bb.[[HDR:[0-9]+]]
+; CHECK: bb.[[HDR]].loop.header:
+; CHECK-NEXT: successors: {{.+}}
+; CHECK-NEXT: {{ $}}
+; CHECK-NEXT: $sgpr{{[0-9]+}} = SI_RESTORE_S32_FROM_VGPR
+
+define amdgpu_kernel void @loop_sgpr_spill_implicit_def_in_preheader(
+ ptr addrspace(1) %out) local_unnamed_addr #0 {
+entry:
+ %a0 = load i32, ptr addrspace(4) poison
+ %a1 = load i32, ptr addrspace(4) poison
+ %a2 = load i32, ptr addrspace(4) poison
+ %a3 = load i32, ptr addrspace(4) poison
+ %a4 = load i32, ptr addrspace(4) poison
+ %a5 = load i32, ptr addrspace(4) poison
+ %a6 = load i32, ptr addrspace(4) poison
+ %a7 = load i32, ptr addrspace(4) poison
+ br label %loop.header
+
+loop.header:
+ %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ]
+ %v0 = phi i32 [ %a0, %entry ], [ %t0, %loop.latch ]
+ %v1 = phi i32 [ %a1, %entry ], [ %t1, %loop.latch ]
+ %v2 = phi i32 [ %a2, %entry ], [ %t2, %loop.latch ]
+ %v3 = phi i32 [ %a3, %entry ], [ %t3, %loop.latch ]
+ %v4 = phi i32 [ %a4, %entry ], [ %t4, %loop.latch ]
+ %v5 = phi i32 [ %a5, %entry ], [ %t5, %loop.latch ]
+ %v6 = phi i32 [ %a6, %entry ], [ %t6, %loop.latch ]
+ %v7 = phi i32 [ %a7, %entry ], [ %t7, %loop.latch ]
+ %cmp = icmp eq i32 %iv, 0
+ br i1 %cmp, label %loop.latch, label %exit
+
+loop.latch:
+ %t0 = add i32 %v0, 1
+ %t1 = add i32 %v1, 1
+ %t2 = add i32 %v2, 1
+ %t3 = add i32 %v3, 1
+ %t4 = add i32 %v4, 1
+ %t5 = add i32 %v5, 1
+ %t6 = add i32 %v6, 1
+ %t7 = add i32 %v7, 1
+ %iv.next = add nuw nsw i32 %iv, 1
+ br label %loop.header
+
+exit:
+ %sum = add i32 %v0, %v1
+ store volatile i32 %sum, ptr addrspace(1) %out
+ ret void
+}
+
+attributes #0 = { nounwind "amdgpu-num-sgpr"="24" "amdgpu-num-vgpr"="64" }
>From 5ea51349586a29f1fc1626af5641443f1e4b7fc3 Mon Sep 17 00:00:00 2001
From: Chris Jackson <chris.jackson at amd.com>
Date: Tue, 24 Mar 2026 19:13:54 -0500
Subject: [PATCH 04/12] Add multi-entry cycle test and simplify single-entry
test.
---
.../si-lower-sgpr-spills-loop-preheader.mir | 3 -
...si-lower-sgpr-spills-multi-entry-cycle.mir | 151 ++++++++++++++++++
2 files changed, 151 insertions(+), 3 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-multi-entry-cycle.mir
diff --git a/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-loop-preheader.mir b/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-loop-preheader.mir
index 21d8c7762b0d7..b34ec250e566d 100644
--- a/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-loop-preheader.mir
+++ b/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-loop-preheader.mir
@@ -28,7 +28,6 @@ body: |
; SGPR-SPILL-NEXT: successors: %bb.1(0x80000000)
; SGPR-SPILL-NEXT: liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31
; SGPR-SPILL-NEXT: {{ $}}
- ; SGPR-SPILL-NEXT: S_NOP 0
; SGPR-SPILL-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; SGPR-SPILL-NEXT: S_BRANCH %bb.1
; SGPR-SPILL-NEXT: {{ $}}
@@ -61,7 +60,6 @@ body: |
; WWM-REGALLOC-NEXT: successors: %bb.1(0x80000000)
; WWM-REGALLOC-NEXT: liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31
; WWM-REGALLOC-NEXT: {{ $}}
- ; WWM-REGALLOC-NEXT: S_NOP 0
; WWM-REGALLOC-NEXT: renamable $vgpr63 = IMPLICIT_DEF
; WWM-REGALLOC-NEXT: S_BRANCH %bb.1
; WWM-REGALLOC-NEXT: {{ $}}
@@ -96,7 +94,6 @@ body: |
bb.0:
liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31
- S_NOP 0
S_BRANCH %bb.1
bb.1:
liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31
diff --git a/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-multi-entry-cycle.mir b/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-multi-entry-cycle.mir
new file mode 100644
index 0000000000000..bc9756efc1ab0
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-multi-entry-cycle.mir
@@ -0,0 +1,151 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -run-pass=si-lower-sgpr-spills -o - %s | FileCheck -check-prefix=SGPR-SPILL %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -start-before=si-lower-sgpr-spills -stop-after=regallocfast,1 -verify-machineinstrs %s -o - | FileCheck -check-prefix=WWM-REGALLOC %s
+
+# Ensure that for a multi-entry cycle si-lower-sgpr-spills inserts
+# IMPLICIT_DEF into the NCD of the basic blocks with entries
+# to the header.
+
+---
+name: sgpr_spill_multi_entry_implicit_def
+tracksRegLiveness: true
+frameInfo:
+ maxAlignment: 4
+stack:
+ - { id: 0, type: spill-slot, size: 4, alignment: 4, stack-id: sgpr-spill }
+machineFunctionInfo:
+ isEntryFunction: false
+ scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3'
+ stackPtrOffsetReg: '$sgpr32'
+ frameOffsetReg: '$sgpr33'
+ hasSpilledSGPRs: true
+body: |
+ ; SGPR-SPILL-LABEL: name: sgpr_spill_multi_entry_implicit_def
+ ; SGPR-SPILL: bb.0:
+ ; SGPR-SPILL-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; SGPR-SPILL-NEXT: liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31
+ ; SGPR-SPILL-NEXT: {{ $}}
+ ; SGPR-SPILL-NEXT: S_CMP_EQ_U32 $sgpr11, 0, implicit-def $scc
+ ; SGPR-SPILL-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; SGPR-SPILL-NEXT: S_CBRANCH_SCC1 %bb.1, implicit killed $scc
+ ; SGPR-SPILL-NEXT: S_BRANCH %bb.2
+ ; SGPR-SPILL-NEXT: {{ $}}
+ ; SGPR-SPILL-NEXT: bb.1:
+ ; SGPR-SPILL-NEXT: successors: %bb.3(0x80000000)
+ ; SGPR-SPILL-NEXT: liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31
+ ; SGPR-SPILL-NEXT: {{ $}}
+ ; SGPR-SPILL-NEXT: S_NOP 0
+ ; SGPR-SPILL-NEXT: S_BRANCH %bb.3
+ ; SGPR-SPILL-NEXT: {{ $}}
+ ; SGPR-SPILL-NEXT: bb.2:
+ ; SGPR-SPILL-NEXT: successors: %bb.4(0x80000000)
+ ; SGPR-SPILL-NEXT: liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31
+ ; SGPR-SPILL-NEXT: {{ $}}
+ ; SGPR-SPILL-NEXT: S_NOP 0
+ ; SGPR-SPILL-NEXT: S_BRANCH %bb.4
+ ; SGPR-SPILL-NEXT: {{ $}}
+ ; SGPR-SPILL-NEXT: bb.3:
+ ; SGPR-SPILL-NEXT: successors: %bb.4(0x40000000), %bb.5(0x40000000)
+ ; SGPR-SPILL-NEXT: liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31
+ ; SGPR-SPILL-NEXT: {{ $}}
+ ; SGPR-SPILL-NEXT: $sgpr10 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 0
+ ; SGPR-SPILL-NEXT: S_CMP_EQ_U32 $sgpr11, 0, implicit-def $scc
+ ; SGPR-SPILL-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR killed $sgpr10, 0, [[DEF]]
+ ; SGPR-SPILL-NEXT: S_CBRANCH_SCC1 %bb.4, implicit killed $scc
+ ; SGPR-SPILL-NEXT: S_BRANCH %bb.5
+ ; SGPR-SPILL-NEXT: {{ $}}
+ ; SGPR-SPILL-NEXT: bb.4:
+ ; SGPR-SPILL-NEXT: successors: %bb.3(0x80000000)
+ ; SGPR-SPILL-NEXT: liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31
+ ; SGPR-SPILL-NEXT: {{ $}}
+ ; SGPR-SPILL-NEXT: $sgpr10 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 0
+ ; SGPR-SPILL-NEXT: $sgpr10 = S_MOV_B32 1
+ ; SGPR-SPILL-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR killed $sgpr10, 0, [[DEF]]
+ ; SGPR-SPILL-NEXT: S_BRANCH %bb.3
+ ; SGPR-SPILL-NEXT: {{ $}}
+ ; SGPR-SPILL-NEXT: bb.5:
+ ; SGPR-SPILL-NEXT: liveins: $sgpr30_sgpr31
+ ; SGPR-SPILL-NEXT: {{ $}}
+ ; SGPR-SPILL-NEXT: S_SETPC_B64 $sgpr30_sgpr31
+ ;
+ ; WWM-REGALLOC-LABEL: name: sgpr_spill_multi_entry_implicit_def
+ ; WWM-REGALLOC: bb.0:
+ ; WWM-REGALLOC-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; WWM-REGALLOC-NEXT: liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31
+ ; WWM-REGALLOC-NEXT: {{ $}}
+ ; WWM-REGALLOC-NEXT: S_CMP_EQ_U32 $sgpr11, 0, implicit-def $scc
+ ; WWM-REGALLOC-NEXT: renamable $vgpr63 = IMPLICIT_DEF
+ ; WWM-REGALLOC-NEXT: S_CBRANCH_SCC1 %bb.1, implicit killed $scc
+ ; WWM-REGALLOC-NEXT: S_BRANCH %bb.2
+ ; WWM-REGALLOC-NEXT: {{ $}}
+ ; WWM-REGALLOC-NEXT: bb.1:
+ ; WWM-REGALLOC-NEXT: successors: %bb.3(0x80000000)
+ ; WWM-REGALLOC-NEXT: liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31
+ ; WWM-REGALLOC-NEXT: {{ $}}
+ ; WWM-REGALLOC-NEXT: S_NOP 0
+ ; WWM-REGALLOC-NEXT: S_BRANCH %bb.3
+ ; WWM-REGALLOC-NEXT: {{ $}}
+ ; WWM-REGALLOC-NEXT: bb.2:
+ ; WWM-REGALLOC-NEXT: successors: %bb.4(0x80000000)
+ ; WWM-REGALLOC-NEXT: liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31
+ ; WWM-REGALLOC-NEXT: {{ $}}
+ ; WWM-REGALLOC-NEXT: S_NOP 0
+ ; WWM-REGALLOC-NEXT: S_BRANCH %bb.4
+ ; WWM-REGALLOC-NEXT: {{ $}}
+ ; WWM-REGALLOC-NEXT: bb.3:
+ ; WWM-REGALLOC-NEXT: successors: %bb.4(0x40000000), %bb.5(0x40000000)
+ ; WWM-REGALLOC-NEXT: liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31
+ ; WWM-REGALLOC-NEXT: {{ $}}
+ ; WWM-REGALLOC-NEXT: $vgpr63 = SI_SPILL_WWM_V32_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5)
+ ; WWM-REGALLOC-NEXT: $sgpr10 = SI_RESTORE_S32_FROM_VGPR $vgpr63, 0
+ ; WWM-REGALLOC-NEXT: S_CMP_EQ_U32 $sgpr11, 0, implicit-def $scc
+ ; WWM-REGALLOC-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR $sgpr10, 0, $vgpr63
+ ; WWM-REGALLOC-NEXT: SI_SPILL_WWM_V32_SAVE killed $vgpr63, %stack.1, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5)
+ ; WWM-REGALLOC-NEXT: S_CBRANCH_SCC1 %bb.4, implicit killed $scc
+ ; WWM-REGALLOC-NEXT: S_BRANCH %bb.5
+ ; WWM-REGALLOC-NEXT: {{ $}}
+ ; WWM-REGALLOC-NEXT: bb.4:
+ ; WWM-REGALLOC-NEXT: successors: %bb.3(0x80000000)
+ ; WWM-REGALLOC-NEXT: liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31
+ ; WWM-REGALLOC-NEXT: {{ $}}
+ ; WWM-REGALLOC-NEXT: $vgpr63 = SI_SPILL_WWM_V32_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5)
+ ; WWM-REGALLOC-NEXT: dead $sgpr10 = SI_RESTORE_S32_FROM_VGPR $vgpr63, 0
+ ; WWM-REGALLOC-NEXT: $sgpr10 = S_MOV_B32 1
+ ; WWM-REGALLOC-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR $sgpr10, 0, $vgpr63
+ ; WWM-REGALLOC-NEXT: SI_SPILL_WWM_V32_SAVE killed $vgpr63, %stack.1, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5)
+ ; WWM-REGALLOC-NEXT: S_BRANCH %bb.3
+ ; WWM-REGALLOC-NEXT: {{ $}}
+ ; WWM-REGALLOC-NEXT: bb.5:
+ ; WWM-REGALLOC-NEXT: liveins: $sgpr30_sgpr31
+ ; WWM-REGALLOC-NEXT: {{ $}}
+ ; WWM-REGALLOC-NEXT: S_SETPC_B64 killed $sgpr30_sgpr31
+ bb.0:
+ liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31
+ S_CMP_EQ_U32 $sgpr11, 0, implicit-def $scc
+ S_CBRANCH_SCC1 %bb.1, implicit killed $scc
+ S_BRANCH %bb.4
+ bb.1:
+ liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31
+ S_NOP 0
+ S_BRANCH %bb.2
+ bb.4:
+ liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31
+ S_NOP 0
+ S_BRANCH %bb.3
+ bb.2:
+ liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31
+ renamable $sgpr10 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32
+ S_CMP_EQ_U32 $sgpr11, 0, implicit-def $scc
+ SI_SPILL_S32_SAVE killed $sgpr10, %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32
+ S_CBRANCH_SCC1 %bb.3, implicit killed $scc
+ S_BRANCH %bb.5
+ bb.3:
+ liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31
+ renamable $sgpr10 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32
+ $sgpr10 = S_MOV_B32 1
+ SI_SPILL_S32_SAVE killed $sgpr10, %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32
+ S_BRANCH %bb.2
+ bb.5:
+ liveins: $sgpr30_sgpr31
+ S_SETPC_B64 $sgpr30_sgpr31
+## NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
>From 70a02f18eca87bb84c27e5b35f48fe3777d2e4d4 Mon Sep 17 00:00:00 2001
From: Chris Jackson <chris.jackson at amd.com>
Date: Tue, 24 Mar 2026 19:32:38 -0500
Subject: [PATCH 05/12] Correct comment.
---
.../CodeGen/AMDGPU/si-lower-sgpr-spills-multi-entry-cycle.mir | 4 +---
1 file changed, 1 insertion(+), 3 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-multi-entry-cycle.mir b/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-multi-entry-cycle.mir
index bc9756efc1ab0..73dd7637e6e25 100644
--- a/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-multi-entry-cycle.mir
+++ b/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-multi-entry-cycle.mir
@@ -3,9 +3,7 @@
# RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -start-before=si-lower-sgpr-spills -stop-after=regallocfast,1 -verify-machineinstrs %s -o - | FileCheck -check-prefix=WWM-REGALLOC %s
# Ensure that for a multi-entry cycle si-lower-sgpr-spills inserts
-# IMPLICIT_DEF into the NCD of the basic blocks with entries
-# to the header.
-
+# IMPLICIT_DEF into the NCD of the cycle's entries.
---
name: sgpr_spill_multi_entry_implicit_def
tracksRegLiveness: true
>From e7deb620c7e5803fdaa94e3641f9b061b2294708 Mon Sep 17 00:00:00 2001
From: Chris Jackson <chris.jackson at amd.com>
Date: Thu, 26 Mar 2026 12:38:23 -0500
Subject: [PATCH 06/12] Enable moving of insertion points from any block in the
cycle.
---
llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp | 29 ++-
.../si-lower-sgpr-spills-cycle-header.ll | 203 ++++++++++++++++++
... => si-lower-sgpr-spills-cycle-header.mir} | 0
...wer-sgpr-spills-initial-insert-in-body.mir | 72 +++++++
...er-sgpr-spills-initial-insert-in-latch.mir | 63 ++++++
.../si-lower-sgpr-spills-loop-preheader.ll | 61 ------
.../spill-vgpr-to-agpr-update-regscavenger.ll | 5 +-
7 files changed, 364 insertions(+), 69 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-cycle-header.ll
rename llvm/test/CodeGen/AMDGPU/{si-lower-sgpr-spills-loop-preheader.mir => si-lower-sgpr-spills-cycle-header.mir} (100%)
create mode 100644 llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-initial-insert-in-body.mir
create mode 100644 llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-initial-insert-in-latch.mir
delete mode 100644 llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-loop-preheader.ll
diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
index ae360d23b9b77..72a77ff1e9512 100644
--- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
@@ -338,6 +338,14 @@ void SILowerSGPRSpills::updateLaneVGPRDomInstr(
MBB->getParent()->getInfo<SIMachineFunctionInfo>();
ArrayRef<SIRegisterInfo::SpilledReg> VGPRSpills =
FuncInfo->getSGPRSpillToVirtualVGPRLanes(FI);
+
+  // The chosen block may lack a terminator; fall back to InsertPt then.
+ auto GetNonEmptyBlockInsertionPt = [](MachineBasicBlock *Block,
+ MachineBasicBlock::iterator Fallback) {
+ auto TermIt = Block->getFirstTerminator();
+ return TermIt != Block->end() ? TermIt : Fallback;
+ };
+
Register PrevLaneVGPR;
for (auto &Spill : VGPRSpills) {
if (PrevLaneVGPR == Spill.VGPR)
@@ -347,7 +355,14 @@ void SILowerSGPRSpills::updateLaneVGPRDomInstr(
auto I = LaneVGPRDomInstr.find(Spill.VGPR);
if (Spill.Lane == 0 && I == LaneVGPRDomInstr.end()) {
// Initially add the spill instruction itself for Insertion point.
- LaneVGPRDomInstr[Spill.VGPR] = InsertPt;
+ // If that point is inside a cycle, move it to a block that dominates all
+ // cycle entries (e.g. the preheader); otherwise IMPLICIT_DEF can be
+ // clobbered on backedges when the only spill sites are in the latch.
+ MachineBasicBlock *Promoted = MBB;
+ if (MachineCycle *C = MCI->getCycle(Promoted))
+ Promoted = getCycleHeaderDomBB(Promoted, C);
+ LaneVGPRDomInstr[Spill.VGPR] =
+ Promoted != MBB ? GetNonEmptyBlockInsertionPt(Promoted, InsertPt) : InsertPt;
} else {
assert(I != LaneVGPRDomInstr.end());
auto PrevInsertPt = I->second;
@@ -366,17 +381,17 @@ void SILowerSGPRSpills::updateLaneVGPRDomInstr(
// Find the common dominator block between PrevInsertPt and the
// current spill.
DomMBB = MDT->findNearestCommonDominator(DomMBB, MBB);
- // If the insertion point is a cycle header move it to the preheader.
- // This ensures the IMPLICIT_DEF dominates and is live-in to the header.
- // WWM regalloc will then insert a wwm restore in the header.
- MachineCycle *C = MCI->getCycle(DomMBB);
- if (C && C->isEntry(DomMBB))
+      // If the insertion point lies anywhere inside a cycle, move it to a
+ // that dominates all entries so IMPLICIT_DEF is not clobbered
+ // on backedges. WWM regalloc will insert restores at cycle entries as
+ // needed.
+ if (MachineCycle *C = MCI->getCycle(DomMBB))
DomMBB = getCycleHeaderDomBB(DomMBB, C);
if (DomMBB == MBB)
I->second = InsertPt;
else if (DomMBB != PrevInsertPt->getParent())
- I->second = &(*DomMBB->getFirstTerminator());
+ I->second = GetNonEmptyBlockInsertionPt(DomMBB, InsertPt);
}
}
}
diff --git a/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-cycle-header.ll b/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-cycle-header.ll
new file mode 100644
index 0000000000000..92a6e49ae4dda
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-cycle-header.ll
@@ -0,0 +1,203 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
+; REQUIRES: amdgpu-registered-target
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -O0 -amdgpu-spill-sgpr-to-vgpr=1 \
+; RUN: -verify-machineinstrs -stop-after=si-lower-sgpr-spills -o - %s | FileCheck %s
+
+;; Ensure that si-lower-sgpr-spills prevents IMPLICIT_DEF assignments from clobbering
+;; backedge writes by placing the assignment in the cycle preheader and not the header.
+
+define amdgpu_kernel void @loop_sgpr_spill_implicit_def_in_preheader(
+ ; CHECK-LABEL: name: loop_sgpr_spill_implicit_def_in_preheader
+ ; CHECK: bb.0.entry:
+ ; CHECK-NEXT: successors: %bb.1(0x80000000)
+ ; CHECK-NEXT: liveins: $sgpr8_sgpr9
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr8_sgpr9, 0, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset, align 16, addrspace 4)
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, [[DEF]], implicit-def $sgpr4_sgpr5, implicit $sgpr4_sgpr5
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR killed $sgpr5, 1, [[DEF]], implicit killed $sgpr4_sgpr5
+ ; CHECK-NEXT: renamable $sgpr4_sgpr5 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $sgpr4 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (load (s32) from `ptr addrspace(4) poison`, addrspace 4)
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr4, 2, [[DEF]]
+ ; CHECK-NEXT: renamable $sgpr11 = COPY renamable $sgpr4
+ ; CHECK-NEXT: renamable $sgpr10 = COPY renamable $sgpr4
+ ; CHECK-NEXT: renamable $sgpr9 = COPY renamable $sgpr4
+ ; CHECK-NEXT: renamable $sgpr8 = COPY renamable $sgpr4
+ ; CHECK-NEXT: renamable $sgpr7 = COPY renamable $sgpr4
+ ; CHECK-NEXT: renamable $sgpr6 = COPY renamable $sgpr4
+ ; CHECK-NEXT: renamable $sgpr5 = COPY renamable $sgpr4
+ ; CHECK-NEXT: renamable $sgpr12 = S_MOV_B32 0
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR killed $sgpr12, 3, [[DEF]]
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR killed $sgpr11, 4, [[DEF]]
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR killed $sgpr10, 5, [[DEF]]
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR killed $sgpr9, 6, [[DEF]]
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR killed $sgpr8, 7, [[DEF]]
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR killed $sgpr7, 8, [[DEF]]
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR killed $sgpr6, 9, [[DEF]]
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR killed $sgpr5, 10, [[DEF]]
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR killed $sgpr4, 11, [[DEF]]
+ ; CHECK-NEXT: S_BRANCH %bb.1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1.loop.header:
+ ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: $sgpr6 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 3
+ ; CHECK-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 4
+ ; CHECK-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 5
+ ; CHECK-NEXT: $sgpr7 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 6
+ ; CHECK-NEXT: $sgpr8 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 7
+ ; CHECK-NEXT: $sgpr9 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 8
+ ; CHECK-NEXT: $sgpr10 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 9
+ ; CHECK-NEXT: $sgpr11 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 10
+ ; CHECK-NEXT: $sgpr12 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 11
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR killed $sgpr12, 12, [[DEF]]
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR killed $sgpr11, 13, [[DEF]]
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR killed $sgpr10, 14, [[DEF]]
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR killed $sgpr9, 15, [[DEF]]
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR killed $sgpr8, 16, [[DEF]]
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR killed $sgpr7, 17, [[DEF]]
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR killed $sgpr5, 18, [[DEF]]
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR killed $sgpr4, 19, [[DEF]]
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr6, 20, [[DEF]]
+ ; CHECK-NEXT: renamable $sgpr4_sgpr5 = S_MOV_B64 -1
+ ; CHECK-NEXT: renamable $sgpr7 = S_MOV_B32 0
+ ; CHECK-NEXT: S_CMP_LG_U32 renamable $sgpr6, killed renamable $sgpr7, implicit-def $scc
+ ; CHECK-NEXT: renamable $sgpr6 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $sgpr6 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $sgpr6 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $sgpr6 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $sgpr6 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $sgpr6 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $sgpr6 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $sgpr6 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $sgpr6 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr4, 21, [[DEF]], implicit-def $sgpr4_sgpr5, implicit $sgpr4_sgpr5
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR killed $sgpr5, 22, [[DEF]], implicit killed $sgpr4_sgpr5
+ ; CHECK-NEXT: S_CBRANCH_SCC1 %bb.3, implicit killed $scc
+ ; CHECK-NEXT: S_BRANCH %bb.2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2.loop.latch:
+ ; CHECK-NEXT: successors: %bb.3(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 20
+ ; CHECK-NEXT: $sgpr6 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 12
+ ; CHECK-NEXT: $sgpr13 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 13
+ ; CHECK-NEXT: $sgpr12 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 14
+ ; CHECK-NEXT: $sgpr11 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 15
+ ; CHECK-NEXT: $sgpr10 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 16
+ ; CHECK-NEXT: $sgpr9 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 17
+ ; CHECK-NEXT: $sgpr8 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 18
+ ; CHECK-NEXT: $sgpr7 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 19
+ ; CHECK-NEXT: renamable $sgpr5 = S_MOV_B32 1
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr5, 23, [[DEF]]
+ ; CHECK-NEXT: renamable $sgpr7 = S_ADD_I32 renamable $sgpr7, renamable $sgpr5, implicit-def dead $scc
+ ; CHECK-NEXT: renamable $sgpr8 = S_ADD_I32 renamable $sgpr8, renamable $sgpr5, implicit-def dead $scc
+ ; CHECK-NEXT: renamable $sgpr9 = S_ADD_I32 renamable $sgpr9, renamable $sgpr5, implicit-def dead $scc
+ ; CHECK-NEXT: renamable $sgpr10 = S_ADD_I32 renamable $sgpr10, renamable $sgpr5, implicit-def dead $scc
+ ; CHECK-NEXT: renamable $sgpr11 = S_ADD_I32 renamable $sgpr11, renamable $sgpr5, implicit-def dead $scc
+ ; CHECK-NEXT: renamable $sgpr12 = S_ADD_I32 renamable $sgpr12, renamable $sgpr5, implicit-def dead $scc
+ ; CHECK-NEXT: renamable $sgpr13 = S_ADD_I32 renamable $sgpr13, renamable $sgpr5, implicit-def dead $scc
+ ; CHECK-NEXT: renamable $sgpr14 = S_ADD_I32 renamable $sgpr6, renamable $sgpr5, implicit-def dead $scc
+ ; CHECK-NEXT: renamable $sgpr6 = nuw nsw S_ADD_I32 renamable $sgpr4, renamable $sgpr5, implicit-def dead $scc
+ ; CHECK-NEXT: renamable $sgpr4_sgpr5 = S_MOV_B64 0
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR killed $sgpr14, 24, [[DEF]]
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR killed $sgpr13, 25, [[DEF]]
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR killed $sgpr12, 26, [[DEF]]
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR killed $sgpr11, 27, [[DEF]]
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR killed $sgpr10, 28, [[DEF]]
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR killed $sgpr9, 29, [[DEF]]
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR killed $sgpr8, 30, [[DEF]]
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR killed $sgpr7, 31, [[DEF]]
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR killed $sgpr6, 32, [[DEF]]
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr4, 21, [[DEF]], implicit-def $sgpr4_sgpr5, implicit $sgpr4_sgpr5
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR killed $sgpr5, 22, [[DEF]], implicit killed $sgpr4_sgpr5
+ ; CHECK-NEXT: S_BRANCH %bb.3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.3.Flow:
+ ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.1(0x40000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 18
+ ; CHECK-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 19
+ ; CHECK-NEXT: $vcc_lo = SI_RESTORE_S32_FROM_VGPR [[DEF]], 21, implicit-def $vcc
+ ; CHECK-NEXT: $vcc_hi = SI_RESTORE_S32_FROM_VGPR [[DEF]], 22
+ ; CHECK-NEXT: $sgpr6 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 24
+ ; CHECK-NEXT: $sgpr7 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 25
+ ; CHECK-NEXT: $sgpr8 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 26
+ ; CHECK-NEXT: $sgpr9 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 27
+ ; CHECK-NEXT: $sgpr10 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 28
+ ; CHECK-NEXT: $sgpr11 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 29
+ ; CHECK-NEXT: $sgpr12 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 30
+ ; CHECK-NEXT: $sgpr13 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 31
+ ; CHECK-NEXT: $sgpr14 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 32
+ ; CHECK-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $vcc, implicit $exec
+ ; CHECK-NEXT: renamable $sgpr15 = S_MOV_B32 1
+ ; CHECK-NEXT: renamable $vcc = V_CMP_NE_U32_e64 [[V_CNDMASK_B32_e64_]], killed $sgpr15, implicit $exec
+ ; CHECK-NEXT: $vcc = S_AND_B64 $exec, killed renamable $vcc, implicit-def dead $scc
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR killed $sgpr14, 3, [[DEF]]
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR killed $sgpr13, 4, [[DEF]]
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR killed $sgpr12, 5, [[DEF]]
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR killed $sgpr11, 6, [[DEF]]
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR killed $sgpr10, 7, [[DEF]]
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR killed $sgpr9, 8, [[DEF]]
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR killed $sgpr8, 9, [[DEF]]
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR killed $sgpr7, 10, [[DEF]]
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR killed $sgpr6, 11, [[DEF]]
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR killed $sgpr5, 33, [[DEF]]
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR killed $sgpr4, 34, [[DEF]]
+ ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc
+ ; CHECK-NEXT: S_BRANCH %bb.4
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.4.exit:
+ ; CHECK-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 0, implicit-def $sgpr4_sgpr5
+ ; CHECK-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 1
+ ; CHECK-NEXT: $sgpr7 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 34
+ ; CHECK-NEXT: $sgpr6 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 33
+ ; CHECK-NEXT: renamable $sgpr6 = S_ADD_I32 killed renamable $sgpr6, killed renamable $sgpr7, implicit-def dead $scc
+ ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY killed renamable $sgpr6
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], [[COPY]], killed renamable $sgpr4_sgpr5, 0, 0, implicit $exec :: (volatile store (s32) into %ir.out.load, addrspace 1)
+ ; CHECK-NEXT: S_ENDPGM 0
+ ptr addrspace(1) %out) local_unnamed_addr #0 {
+entry:
+ %a0 = load i32, ptr addrspace(4) poison
+ %a1 = load i32, ptr addrspace(4) poison
+ %a2 = load i32, ptr addrspace(4) poison
+ %a3 = load i32, ptr addrspace(4) poison
+ %a4 = load i32, ptr addrspace(4) poison
+ %a5 = load i32, ptr addrspace(4) poison
+ %a6 = load i32, ptr addrspace(4) poison
+ %a7 = load i32, ptr addrspace(4) poison
+ br label %loop.header
+
+loop.header:
+ %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ]
+ %v0 = phi i32 [ %a0, %entry ], [ %t0, %loop.latch ]
+ %v1 = phi i32 [ %a1, %entry ], [ %t1, %loop.latch ]
+ %v2 = phi i32 [ %a2, %entry ], [ %t2, %loop.latch ]
+ %v3 = phi i32 [ %a3, %entry ], [ %t3, %loop.latch ]
+ %v4 = phi i32 [ %a4, %entry ], [ %t4, %loop.latch ]
+ %v5 = phi i32 [ %a5, %entry ], [ %t5, %loop.latch ]
+ %v6 = phi i32 [ %a6, %entry ], [ %t6, %loop.latch ]
+ %v7 = phi i32 [ %a7, %entry ], [ %t7, %loop.latch ]
+ %cmp = icmp eq i32 %iv, 0
+ br i1 %cmp, label %loop.latch, label %exit
+
+loop.latch:
+ %t0 = add i32 %v0, 1
+ %t1 = add i32 %v1, 1
+ %t2 = add i32 %v2, 1
+ %t3 = add i32 %v3, 1
+ %t4 = add i32 %v4, 1
+ %t5 = add i32 %v5, 1
+ %t6 = add i32 %v6, 1
+ %t7 = add i32 %v7, 1
+ %iv.next = add nuw nsw i32 %iv, 1
+ br label %loop.header
+
+exit:
+ %sum = add i32 %v0, %v1
+ store volatile i32 %sum, ptr addrspace(1) %out
+ ret void
+}
+
+attributes #0 = { nounwind "amdgpu-num-sgpr"="24" "amdgpu-num-vgpr"="64" }
diff --git a/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-loop-preheader.mir b/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-cycle-header.mir
similarity index 100%
rename from llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-loop-preheader.mir
rename to llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-cycle-header.mir
diff --git a/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-initial-insert-in-body.mir b/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-initial-insert-in-body.mir
new file mode 100644
index 0000000000000..3655376703d72
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-initial-insert-in-body.mir
@@ -0,0 +1,72 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -run-pass=si-lower-sgpr-spills -o - %s | FileCheck -check-prefix=SGPR-SPILL %s
+
+---
+name: sgpr_spill_initial_insert_in_body_moves_to_preheader
+tracksRegLiveness: true
+frameInfo:
+ maxAlignment: 4
+stack:
+ - { id: 0, type: spill-slot, size: 4, alignment: 4, stack-id: sgpr-spill }
+machineFunctionInfo:
+ isEntryFunction: false
+ scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3'
+ stackPtrOffsetReg: '$sgpr32'
+ frameOffsetReg: '$sgpr33'
+ hasSpilledSGPRs: true
+body: |
+ ; SGPR-SPILL-LABEL: name: sgpr_spill_initial_insert_in_body_moves_to_preheader
+ ; SGPR-SPILL: bb.0:
+ ; SGPR-SPILL-NEXT: successors: %bb.1(0x80000000)
+ ; SGPR-SPILL-NEXT: liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31
+ ; SGPR-SPILL-NEXT: {{ $}}
+ ; SGPR-SPILL-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; SGPR-SPILL-NEXT: S_BRANCH %bb.1
+ ; SGPR-SPILL-NEXT: {{ $}}
+ ; SGPR-SPILL-NEXT: bb.1:
+ ; SGPR-SPILL-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000)
+ ; SGPR-SPILL-NEXT: liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31
+ ; SGPR-SPILL-NEXT: {{ $}}
+ ; SGPR-SPILL-NEXT: S_CMP_EQ_U32 $sgpr11, 0, implicit-def $scc
+ ; SGPR-SPILL-NEXT: S_CBRANCH_SCC1 %bb.2, implicit killed $scc
+ ; SGPR-SPILL-NEXT: S_BRANCH %bb.4
+ ; SGPR-SPILL-NEXT: {{ $}}
+ ; SGPR-SPILL-NEXT: bb.2:
+ ; SGPR-SPILL-NEXT: successors: %bb.3(0x80000000)
+ ; SGPR-SPILL-NEXT: liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31
+ ; SGPR-SPILL-NEXT: {{ $}}
+ ; SGPR-SPILL-NEXT: $sgpr10 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 0
+ ; SGPR-SPILL-NEXT: $sgpr10 = S_MOV_B32 1
+ ; SGPR-SPILL-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR killed $sgpr10, 0, [[DEF]]
+ ; SGPR-SPILL-NEXT: S_BRANCH %bb.3
+ ; SGPR-SPILL-NEXT: {{ $}}
+ ; SGPR-SPILL-NEXT: bb.3:
+ ; SGPR-SPILL-NEXT: successors: %bb.1(0x80000000)
+ ; SGPR-SPILL-NEXT: liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31
+ ; SGPR-SPILL-NEXT: {{ $}}
+ ; SGPR-SPILL-NEXT: S_BRANCH %bb.1
+ ; SGPR-SPILL-NEXT: {{ $}}
+ ; SGPR-SPILL-NEXT: bb.4:
+ ; SGPR-SPILL-NEXT: liveins: $sgpr30_sgpr31
+ ; SGPR-SPILL-NEXT: {{ $}}
+ ; SGPR-SPILL-NEXT: S_SETPC_B64 $sgpr30_sgpr31
+ bb.0:
+ liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31
+ S_BRANCH %bb.1
+ bb.1:
+ liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31
+ S_CMP_EQ_U32 $sgpr11, 0, implicit-def $scc
+ S_CBRANCH_SCC1 %bb.2, implicit killed $scc
+ S_BRANCH %bb.4
+ bb.2:
+ liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31
+ renamable $sgpr10 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32
+ $sgpr10 = S_MOV_B32 1
+ SI_SPILL_S32_SAVE killed $sgpr10, %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32
+ S_BRANCH %bb.3
+ bb.3:
+ liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31
+ S_BRANCH %bb.1
+ bb.4:
+ liveins: $sgpr30_sgpr31
+ S_SETPC_B64 $sgpr30_sgpr31
diff --git a/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-initial-insert-in-latch.mir b/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-initial-insert-in-latch.mir
new file mode 100644
index 0000000000000..eb4f63a4e2b0c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-initial-insert-in-latch.mir
@@ -0,0 +1,63 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -run-pass=si-lower-sgpr-spills -o - %s | FileCheck -check-prefix=SGPR-SPILL %s
+
+---
+name: sgpr_spill_initial_insert_in_latch_moves_to_preheader
+tracksRegLiveness: true
+frameInfo:
+ maxAlignment: 4
+stack:
+ - { id: 0, type: spill-slot, size: 4, alignment: 4, stack-id: sgpr-spill }
+machineFunctionInfo:
+ isEntryFunction: false
+ scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3'
+ stackPtrOffsetReg: '$sgpr32'
+ frameOffsetReg: '$sgpr33'
+ hasSpilledSGPRs: true
+body: |
+ ; SGPR-SPILL-LABEL: name: sgpr_spill_initial_insert_in_latch_moves_to_preheader
+ ; SGPR-SPILL: bb.0:
+ ; SGPR-SPILL-NEXT: successors: %bb.1(0x80000000)
+ ; SGPR-SPILL-NEXT: liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31
+ ; SGPR-SPILL-NEXT: {{ $}}
+ ; SGPR-SPILL-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; SGPR-SPILL-NEXT: S_BRANCH %bb.1
+ ; SGPR-SPILL-NEXT: {{ $}}
+ ; SGPR-SPILL-NEXT: bb.1:
+ ; SGPR-SPILL-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
+ ; SGPR-SPILL-NEXT: liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31
+ ; SGPR-SPILL-NEXT: {{ $}}
+ ; SGPR-SPILL-NEXT: S_CMP_EQ_U32 $sgpr11, 0, implicit-def $scc
+ ; SGPR-SPILL-NEXT: S_CBRANCH_SCC1 %bb.2, implicit killed $scc
+ ; SGPR-SPILL-NEXT: S_BRANCH %bb.3
+ ; SGPR-SPILL-NEXT: {{ $}}
+ ; SGPR-SPILL-NEXT: bb.2:
+ ; SGPR-SPILL-NEXT: successors: %bb.1(0x80000000)
+ ; SGPR-SPILL-NEXT: liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31
+ ; SGPR-SPILL-NEXT: {{ $}}
+ ; SGPR-SPILL-NEXT: $sgpr10 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 0
+ ; SGPR-SPILL-NEXT: $sgpr10 = S_MOV_B32 1
+ ; SGPR-SPILL-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR killed $sgpr10, 0, [[DEF]]
+ ; SGPR-SPILL-NEXT: S_BRANCH %bb.1
+ ; SGPR-SPILL-NEXT: {{ $}}
+ ; SGPR-SPILL-NEXT: bb.3:
+ ; SGPR-SPILL-NEXT: liveins: $sgpr30_sgpr31
+ ; SGPR-SPILL-NEXT: {{ $}}
+ ; SGPR-SPILL-NEXT: S_SETPC_B64 $sgpr30_sgpr31
+ bb.0:
+ liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31
+ S_BRANCH %bb.1
+ bb.1:
+ liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31
+ S_CMP_EQ_U32 $sgpr11, 0, implicit-def $scc
+ S_CBRANCH_SCC1 %bb.2, implicit killed $scc
+ S_BRANCH %bb.3
+ bb.2:
+ liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31
+ renamable $sgpr10 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32
+ $sgpr10 = S_MOV_B32 1
+ SI_SPILL_S32_SAVE killed $sgpr10, %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32
+ S_BRANCH %bb.1
+ bb.3:
+ liveins: $sgpr30_sgpr31
+ S_SETPC_B64 $sgpr30_sgpr31
diff --git a/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-loop-preheader.ll b/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-loop-preheader.ll
deleted file mode 100644
index 396a7f1387ae5..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-loop-preheader.ll
+++ /dev/null
@@ -1,61 +0,0 @@
-; REQUIRES: amdgpu-registered-target
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -O0 -amdgpu-spill-sgpr-to-vgpr=1 \
-; RUN: -verify-machineinstrs -stop-after=si-lower-sgpr-spills -o - %s | FileCheck %s
-
-;; Ensure that si-lower-sgpr-spills prevents IMPLICIT_DEF assignments from clobbering
-;; backedge writes by placing the assignment in the cycle preheader and not the header.
-
-; CHECK-LABEL: name: loop_sgpr_spill_implicit_def_in_preheader
-; CHECK: vgpr_32 = IMPLICIT_DEF
-; CHECK: SI_SPILL_S32_TO_VGPR killed $sgpr4, 11
-; CHECK-NEXT: S_BRANCH %bb.[[HDR:[0-9]+]]
-; CHECK: bb.[[HDR]].loop.header:
-; CHECK-NEXT: successors: {{.+}}
-; CHECK-NEXT: {{ $}}
-; CHECK-NEXT: $sgpr{{[0-9]+}} = SI_RESTORE_S32_FROM_VGPR
-
-define amdgpu_kernel void @loop_sgpr_spill_implicit_def_in_preheader(
- ptr addrspace(1) %out) local_unnamed_addr #0 {
-entry:
- %a0 = load i32, ptr addrspace(4) poison
- %a1 = load i32, ptr addrspace(4) poison
- %a2 = load i32, ptr addrspace(4) poison
- %a3 = load i32, ptr addrspace(4) poison
- %a4 = load i32, ptr addrspace(4) poison
- %a5 = load i32, ptr addrspace(4) poison
- %a6 = load i32, ptr addrspace(4) poison
- %a7 = load i32, ptr addrspace(4) poison
- br label %loop.header
-
-loop.header:
- %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ]
- %v0 = phi i32 [ %a0, %entry ], [ %t0, %loop.latch ]
- %v1 = phi i32 [ %a1, %entry ], [ %t1, %loop.latch ]
- %v2 = phi i32 [ %a2, %entry ], [ %t2, %loop.latch ]
- %v3 = phi i32 [ %a3, %entry ], [ %t3, %loop.latch ]
- %v4 = phi i32 [ %a4, %entry ], [ %t4, %loop.latch ]
- %v5 = phi i32 [ %a5, %entry ], [ %t5, %loop.latch ]
- %v6 = phi i32 [ %a6, %entry ], [ %t6, %loop.latch ]
- %v7 = phi i32 [ %a7, %entry ], [ %t7, %loop.latch ]
- %cmp = icmp eq i32 %iv, 0
- br i1 %cmp, label %loop.latch, label %exit
-
-loop.latch:
- %t0 = add i32 %v0, 1
- %t1 = add i32 %v1, 1
- %t2 = add i32 %v2, 1
- %t3 = add i32 %v3, 1
- %t4 = add i32 %v4, 1
- %t5 = add i32 %v5, 1
- %t6 = add i32 %v6, 1
- %t7 = add i32 %v7, 1
- %iv.next = add nuw nsw i32 %iv, 1
- br label %loop.header
-
-exit:
- %sum = add i32 %v0, %v1
- store volatile i32 %sum, ptr addrspace(1) %out
- ret void
-}
-
-attributes #0 = { nounwind "amdgpu-num-sgpr"="24" "amdgpu-num-vgpr"="64" }
diff --git a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll b/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll
index 586579fcaeb93..e43806c7e965c 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll
@@ -13,6 +13,7 @@ define void @test() {
; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1
; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 exec, s[4:5]
+; CHECK-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane
; CHECK-NEXT: .LBB0_1: ; %bb.1
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: s_cbranch_scc1 .LBB0_3
@@ -20,13 +21,15 @@ define void @test() {
; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1
; CHECK-NEXT: .LBB0_3: ; %bb.3
; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1
+; CHECK-NEXT: s_or_saveexec_b64 s[10:11], -1
+; CHECK-NEXT: v_accvgpr_read_b32 v1, a0 ; Reload Reuse
+; CHECK-NEXT: s_mov_b64 exec, s[10:11]
; CHECK-NEXT: ; implicit-def: $sgpr4
; CHECK-NEXT: v_mov_b32_e32 v0, s4
; CHECK-NEXT: v_readfirstlane_b32 s6, v0
; CHECK-NEXT: s_mov_b64 s[4:5], -1
; CHECK-NEXT: s_mov_b32 s7, 0
; CHECK-NEXT: s_cmp_eq_u32 s6, s7
-; CHECK-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane
; CHECK-NEXT: v_writelane_b32 v1, s4, 0
; CHECK-NEXT: v_writelane_b32 v1, s5, 1
; CHECK-NEXT: s_mov_b64 s[10:11], exec
>From e1d87a56a8ded74b67edaf2b891e69e0fcf7c76c Mon Sep 17 00:00:00 2001
From: Chris Jackson <chris.jackson at amd.com>
Date: Thu, 26 Mar 2026 12:47:35 -0500
Subject: [PATCH 07/12] Tidy up.
---
llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp | 8 +++++---
1 file changed, 5 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
index 72a77ff1e9512..5aba942d0e73f 100644
--- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
@@ -339,9 +339,10 @@ void SILowerSGPRSpills::updateLaneVGPRDomInstr(
ArrayRef<SIRegisterInfo::SpilledReg> VGPRSpills =
FuncInfo->getSGPRSpillToVirtualVGPRLanes(FI);
- // Sometimes an empty block is obtained. We cannot set the insertion point there.
+ // Sometimes an empty block is obtained. We cannot set the insertion point
+ // there.
auto GetNonEmptyBlockInsertionPt = [](MachineBasicBlock *Block,
- MachineBasicBlock::iterator Fallback) {
+ MachineBasicBlock::iterator Fallback) {
auto TermIt = Block->getFirstTerminator();
return TermIt != Block->end() ? TermIt : Fallback;
};
@@ -362,7 +363,8 @@ void SILowerSGPRSpills::updateLaneVGPRDomInstr(
if (MachineCycle *C = MCI->getCycle(Promoted))
Promoted = getCycleHeaderDomBB(Promoted, C);
LaneVGPRDomInstr[Spill.VGPR] =
- Promoted != MBB ? GetNonEmptyBlockInsertionPt(Promoted, InsertPt) : InsertPt;
+ Promoted != MBB ? GetNonEmptyBlockInsertionPt(Promoted, InsertPt)
+ : InsertPt;
} else {
assert(I != LaneVGPRDomInstr.end());
auto PrevInsertPt = I->second;
>From 87c9ea25a87871c0938ccb3075d4cafc864609a2 Mon Sep 17 00:00:00 2001
From: Chris Jackson <chris.jackson at amd.com>
Date: Fri, 27 Mar 2026 13:14:05 -0500
Subject: [PATCH 08/12] Account for insertion points in empty basic blocks, add
support for nested cycles and add related test.
---
llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp | 50 ++++++++++---------
.../AMDGPU/schedule-amdgpu-trackers.ll | 1 +
2 files changed, 28 insertions(+), 23 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
index 5aba942d0e73f..983c11ad041a6 100644
--- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
@@ -66,7 +66,8 @@ class SILowerSGPRSpills {
bool spillCalleeSavedRegs(MachineFunction &MF,
SmallVectorImpl<int> &CalleeSavedFIs);
void updateLaneVGPRDomInstr(
- int FI, MachineBasicBlock *MBB, MachineBasicBlock::iterator InsertPt,
+ MachineFunction &MF, int FI, MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator InsertPt,
DenseMap<Register, MachineBasicBlock::iterator> &LaneVGPRDomInstr);
void determineRegsForWWMAllocation(MachineFunction &MF, BitVector &RegMask);
};
@@ -117,6 +118,16 @@ static bool isLiveIntoMBB(MCRegister Reg, MachineBasicBlock &MBB,
return false;
}
+// If the insert point is MBB.end() it is not a machine instruction so
+// it cannot be dereferenced to obtain the parent BB.
+static MachineBasicBlock *
+getParentSafe(MachineFunction &MF, MachineBasicBlock::iterator InsertPt) {
+ for (MachineBasicBlock &MBB : MF)
+ if (InsertPt == MBB.end())
+ return &MBB;
+ return InsertPt->getParent();
+}
+
/// Insert spill code for the callee-saved registers used in the function.
static void insertCSRSaves(MachineBasicBlock &SaveBlock,
ArrayRef<CalleeSavedInfo> CSI, SlotIndexes *Indexes,
@@ -326,7 +337,8 @@ SILowerSGPRSpills::getCycleHeaderDomBB(MachineBasicBlock *MBB,
}
void SILowerSGPRSpills::updateLaneVGPRDomInstr(
- int FI, MachineBasicBlock *MBB, MachineBasicBlock::iterator InsertPt,
+ MachineFunction &MF, int FI, MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator InsertPt,
DenseMap<Register, MachineBasicBlock::iterator> &LaneVGPRDomInstr) {
// For the Def of a virtual LaneVGPR to dominate all its uses, we should
// insert an IMPLICIT_DEF before the dominating spill. Switching to a
@@ -338,15 +350,6 @@ void SILowerSGPRSpills::updateLaneVGPRDomInstr(
MBB->getParent()->getInfo<SIMachineFunctionInfo>();
ArrayRef<SIRegisterInfo::SpilledReg> VGPRSpills =
FuncInfo->getSGPRSpillToVirtualVGPRLanes(FI);
-
- // Sometimes an empty block is obtained. We cannot set the insertion point
- // there.
- auto GetNonEmptyBlockInsertionPt = [](MachineBasicBlock *Block,
- MachineBasicBlock::iterator Fallback) {
- auto TermIt = Block->getFirstTerminator();
- return TermIt != Block->end() ? TermIt : Fallback;
- };
-
Register PrevLaneVGPR;
for (auto &Spill : VGPRSpills) {
if (PrevLaneVGPR == Spill.VGPR)
@@ -357,24 +360,24 @@ void SILowerSGPRSpills::updateLaneVGPRDomInstr(
if (Spill.Lane == 0 && I == LaneVGPRDomInstr.end()) {
// Initially add the spill instruction itself for Insertion point.
// If that point is inside a cycle, move it to a block that dominates all
- // cycle entries (e.g. the preheader); otherwise IMPLICIT_DEF can be
- // clobbered on backedges when the only spill sites are in the latch.
+ // cycle entries.
MachineBasicBlock *Promoted = MBB;
if (MachineCycle *C = MCI->getCycle(Promoted))
Promoted = getCycleHeaderDomBB(Promoted, C);
LaneVGPRDomInstr[Spill.VGPR] =
- Promoted != MBB ? GetNonEmptyBlockInsertionPt(Promoted, InsertPt)
- : InsertPt;
+ Promoted != MBB ? Promoted->getFirstTerminator() : InsertPt;
} else {
assert(I != LaneVGPRDomInstr.end());
auto PrevInsertPt = I->second;
- MachineBasicBlock *DomMBB = PrevInsertPt->getParent();
+ MachineBasicBlock *PrevInsertMBB = getParentSafe(MF, PrevInsertPt);
+ MachineBasicBlock *DomMBB = PrevInsertMBB;
if (DomMBB == MBB) {
// The insertion point earlier selected in a predecessor block whose
// spills are currently being lowered. The earlier InsertPt would be
// the one just before the block terminator and it should be changed
// if we insert any new spill in it.
- if (MDT->dominates(&*InsertPt, &*PrevInsertPt))
+ if (PrevInsertPt == MBB->end() ||
+ MDT->dominates(&*InsertPt, &*PrevInsertPt))
I->second = InsertPt;
continue;
@@ -387,13 +390,13 @@ void SILowerSGPRSpills::updateLaneVGPRDomInstr(
// that dominates all entries so IMPLICIT_DEF is not clobbered
// on backedges. WWM regalloc will insert restores at cycle entries as
// needed.
- if (MachineCycle *C = MCI->getCycle(DomMBB))
+ if (MachineCycle *C = MCI->getTopLevelParentCycle(DomMBB))
DomMBB = getCycleHeaderDomBB(DomMBB, C);
if (DomMBB == MBB)
I->second = InsertPt;
- else if (DomMBB != PrevInsertPt->getParent())
- I->second = GetNonEmptyBlockInsertionPt(DomMBB, InsertPt);
+ else if (DomMBB != PrevInsertMBB)
+ I->second = DomMBB->getFirstTerminator();
}
}
}
@@ -535,7 +538,7 @@ bool SILowerSGPRSpills::run(MachineFunction &MF) {
llvm_unreachable(
"failed to spill SGPR to virtual VGPR lane when allocated");
SpillFIs.set(FI);
- updateLaneVGPRDomInstr(FI, &MBB, MIS.begin(), LaneVGPRDomInstr);
+ updateLaneVGPRDomInstr(MF, FI, &MBB, MIS.begin(), LaneVGPRDomInstr);
SpilledToVirtVGPRLanes = true;
}
}
@@ -545,10 +548,11 @@ bool SILowerSGPRSpills::run(MachineFunction &MF) {
for (auto Reg : FuncInfo->getSGPRSpillVGPRs()) {
auto InsertPt = LaneVGPRDomInstr[Reg];
// Insert the IMPLICIT_DEF at the identified points.
- MachineBasicBlock &Block = *InsertPt->getParent();
+ MachineBasicBlock &Block =
+ *getParentSafe(MF, InsertPt);
DebugLoc DL = Block.findDebugLoc(InsertPt);
auto MIB =
- BuildMI(Block, *InsertPt, DL, TII->get(AMDGPU::IMPLICIT_DEF), Reg);
+ BuildMI(Block, InsertPt, DL, TII->get(AMDGPU::IMPLICIT_DEF), Reg);
// Add WWM flag to the virtual register.
FuncInfo->setFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG);
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll
index 71981e3599b87..9dbc2d3540a6e 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -verify-misched < %s | FileCheck --check-prefixes=GFX11-PAL %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -amdgpu-use-amdgpu-trackers=1 -verify-misched < %s | FileCheck --check-prefixes=GFX11-PAL-GCNTRACKERS %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-scalarize-global-loads=false -verify-misched < %s | FileCheck --check-prefixes=TONGA %s
>From df2a266903c9726e68775265dc0f6d50cef42d50 Mon Sep 17 00:00:00 2001
From: Chris Jackson <chris.jackson at amd.com>
Date: Sat, 28 Mar 2026 09:22:20 -0500
Subject: [PATCH 09/12] Remove getParentSafe() in favour of struct type. Use
IDom when finding cycle dominator, removing MBB fallback.
---
llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp | 77 ++++++++++----------
1 file changed, 37 insertions(+), 40 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
index 983c11ad041a6..d75bd296248e5 100644
--- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
@@ -35,6 +35,18 @@ using MBBVector = SmallVector<MachineBasicBlock *, 4>;
namespace {
+/// Insertion point for IMPLICIT_DEF: iterator may be MBB::end() and can't be
+/// dereferenced so the parent block is stored explicitly.
+struct LaneVGPRInsertPt {
+ MachineBasicBlock *MBB;
+ MachineBasicBlock::iterator It;
+};
+
+static LaneVGPRInsertPt insertPt(MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator It) {
+ return {MBB, It};
+}
+
static cl::opt<unsigned> MaxNumVGPRsForWwmAllocation(
"amdgpu-num-vgprs-for-wwm-alloc",
cl::desc("Max num VGPRs for whole-wave register allocation."),
@@ -54,8 +66,7 @@ class SILowerSGPRSpills {
MBBVector SaveBlocks;
MBBVector RestoreBlocks;
- MachineBasicBlock *getCycleHeaderDomBB(MachineBasicBlock *MBB,
- MachineCycle *C);
+ MachineBasicBlock *getCycleHeaderDomBB(MachineCycle *C);
public:
SILowerSGPRSpills(LiveIntervals *LIS, SlotIndexes *Indexes,
@@ -66,9 +77,8 @@ class SILowerSGPRSpills {
bool spillCalleeSavedRegs(MachineFunction &MF,
SmallVectorImpl<int> &CalleeSavedFIs);
void updateLaneVGPRDomInstr(
- MachineFunction &MF, int FI, MachineBasicBlock *MBB,
- MachineBasicBlock::iterator InsertPt,
- DenseMap<Register, MachineBasicBlock::iterator> &LaneVGPRDomInstr);
+ int FI, MachineBasicBlock *MBB, MachineBasicBlock::iterator InsertPt,
+ DenseMap<Register, LaneVGPRInsertPt> &LaneVGPRDomInstr);
void determineRegsForWWMAllocation(MachineFunction &MF, BitVector &RegMask);
};
@@ -118,16 +128,6 @@ static bool isLiveIntoMBB(MCRegister Reg, MachineBasicBlock &MBB,
return false;
}
-// If the insert point is MBB.end() it is not a machine instruction so
-// it cannot be dereferenced to obtain the parent BB.
-static MachineBasicBlock *
-getParentSafe(MachineFunction &MF, MachineBasicBlock::iterator InsertPt) {
- for (MachineBasicBlock &MBB : MF)
- if (InsertPt == MBB.end())
- return &MBB;
- return InsertPt->getParent();
-}
-
/// Insert spill code for the callee-saved registers used in the function.
static void insertCSRSaves(MachineBasicBlock &SaveBlock,
ArrayRef<CalleeSavedInfo> CSI, SlotIndexes *Indexes,
@@ -317,15 +317,13 @@ bool SILowerSGPRSpills::spillCalleeSavedRegs(
return false;
}
-MachineBasicBlock *
-SILowerSGPRSpills::getCycleHeaderDomBB(MachineBasicBlock *MBB,
- MachineCycle *C) {
+MachineBasicBlock *SILowerSGPRSpills::getCycleHeaderDomBB(MachineCycle *C) {
// If the insertion point lands on a cycle entry, move it to a block that
// dominates all entries.
if (C->isReducible()) {
- if (MachineBasicBlock *Preheader = C->getCyclePreheader())
- return Preheader;
- return MBB;
+ if (auto *IDom = MDT->getNode(C->getHeader())->getIDom())
+ return IDom->getBlock();
+ assert(false && "Expected cycle to have an IDom.");
}
const SmallVectorImpl<MachineBasicBlock *> &Entries = C->getEntries();
@@ -337,9 +335,8 @@ SILowerSGPRSpills::getCycleHeaderDomBB(MachineBasicBlock *MBB,
}
void SILowerSGPRSpills::updateLaneVGPRDomInstr(
- MachineFunction &MF, int FI, MachineBasicBlock *MBB,
- MachineBasicBlock::iterator InsertPt,
- DenseMap<Register, MachineBasicBlock::iterator> &LaneVGPRDomInstr) {
+ int FI, MachineBasicBlock *MBB, MachineBasicBlock::iterator InsertPt,
+ DenseMap<Register, LaneVGPRInsertPt> &LaneVGPRDomInstr) {
// For the Def of a virtual LaneVGPR to dominate all its uses, we should
// insert an IMPLICIT_DEF before the dominating spill. Switching to a
// depth first order doesn't really help since the machine function can be in
@@ -363,13 +360,15 @@ void SILowerSGPRSpills::updateLaneVGPRDomInstr(
// cycle entries.
MachineBasicBlock *Promoted = MBB;
if (MachineCycle *C = MCI->getCycle(Promoted))
- Promoted = getCycleHeaderDomBB(Promoted, C);
+ Promoted = getCycleHeaderDomBB(C);
LaneVGPRDomInstr[Spill.VGPR] =
- Promoted != MBB ? Promoted->getFirstTerminator() : InsertPt;
+ Promoted != MBB ? insertPt(Promoted, Promoted->getFirstTerminator())
+ : insertPt(MBB, InsertPt);
} else {
assert(I != LaneVGPRDomInstr.end());
- auto PrevInsertPt = I->second;
- MachineBasicBlock *PrevInsertMBB = getParentSafe(MF, PrevInsertPt);
+ LaneVGPRInsertPt Prev = I->second;
+ MachineBasicBlock *PrevInsertMBB = Prev.MBB;
+ MachineBasicBlock::iterator PrevInsertPt = Prev.It;
MachineBasicBlock *DomMBB = PrevInsertMBB;
if (DomMBB == MBB) {
// The insertion point earlier selected in a predecessor block whose
@@ -378,7 +377,7 @@ void SILowerSGPRSpills::updateLaneVGPRDomInstr(
// if we insert any new spill in it.
if (PrevInsertPt == MBB->end() ||
MDT->dominates(&*InsertPt, &*PrevInsertPt))
- I->second = InsertPt;
+ I->second = insertPt(MBB, InsertPt);
continue;
}
@@ -391,12 +390,12 @@ void SILowerSGPRSpills::updateLaneVGPRDomInstr(
// on backedges. WWM regalloc will insert restores at cycle entries as
// needed.
if (MachineCycle *C = MCI->getTopLevelParentCycle(DomMBB))
- DomMBB = getCycleHeaderDomBB(DomMBB, C);
+ DomMBB = getCycleHeaderDomBB(C);
if (DomMBB == MBB)
- I->second = InsertPt;
+ I->second = insertPt(MBB, InsertPt);
else if (DomMBB != PrevInsertMBB)
- I->second = DomMBB->getFirstTerminator();
+ I->second = insertPt(DomMBB, DomMBB->getFirstTerminator());
}
}
}
@@ -492,7 +491,7 @@ bool SILowerSGPRSpills::run(MachineFunction &MF) {
BitVector SpillFIs(MFI.getObjectIndexEnd(), false);
// To track the IMPLICIT_DEF insertion point for the lane vgprs.
- DenseMap<Register, MachineBasicBlock::iterator> LaneVGPRDomInstr;
+ DenseMap<Register, LaneVGPRInsertPt> LaneVGPRDomInstr;
for (MachineBasicBlock &MBB : MF) {
for (MachineInstr &MI : llvm::make_early_inc_range(MBB)) {
@@ -538,7 +537,7 @@ bool SILowerSGPRSpills::run(MachineFunction &MF) {
llvm_unreachable(
"failed to spill SGPR to virtual VGPR lane when allocated");
SpillFIs.set(FI);
- updateLaneVGPRDomInstr(MF, FI, &MBB, MIS.begin(), LaneVGPRDomInstr);
+ updateLaneVGPRDomInstr(FI, &MBB, MIS.begin(), LaneVGPRDomInstr);
SpilledToVirtVGPRLanes = true;
}
}
@@ -546,13 +545,11 @@ bool SILowerSGPRSpills::run(MachineFunction &MF) {
}
for (auto Reg : FuncInfo->getSGPRSpillVGPRs()) {
- auto InsertPt = LaneVGPRDomInstr[Reg];
+ LaneVGPRInsertPt IP = LaneVGPRDomInstr[Reg];
// Insert the IMPLICIT_DEF at the identified points.
- MachineBasicBlock &Block =
- *getParentSafe(MF, InsertPt);
- DebugLoc DL = Block.findDebugLoc(InsertPt);
- auto MIB =
- BuildMI(Block, InsertPt, DL, TII->get(AMDGPU::IMPLICIT_DEF), Reg);
+ MachineBasicBlock &Block = *IP.MBB;
+ DebugLoc DL = Block.findDebugLoc(IP.It);
+ auto MIB = BuildMI(Block, IP.It, DL, TII->get(AMDGPU::IMPLICIT_DEF), Reg);
// Add WWM flag to the virtual register.
FuncInfo->setFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG);
>From ff6c638b9ef5c982338c135cce3f48609ab33eec Mon Sep 17 00:00:00 2001
From: Chris Jackson <chris.jackson at amd.com>
Date: Sun, 29 Mar 2026 17:52:33 -0500
Subject: [PATCH 10/12] Correct getCycle() to getTopLevelParentCycle()
---
llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
index d75bd296248e5..f4ea21cf380ef 100644
--- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
@@ -359,7 +359,7 @@ void SILowerSGPRSpills::updateLaneVGPRDomInstr(
// If that point is inside a cycle, move it to a block that dominates all
// cycle entries.
MachineBasicBlock *Promoted = MBB;
- if (MachineCycle *C = MCI->getCycle(Promoted))
+ if (MachineCycle *C = MCI->getTopLevelParentCycle(Promoted))
Promoted = getCycleHeaderDomBB(C);
LaneVGPRDomInstr[Spill.VGPR] =
Promoted != MBB ? insertPt(Promoted, Promoted->getFirstTerminator())
>From 8367f966882468146a0f8e26feec239f3c966d72 Mon Sep 17 00:00:00 2001
From: Chris Jackson <chris.jackson at amd.com>
Date: Tue, 31 Mar 2026 11:00:07 -0500
Subject: [PATCH 11/12] Relocate adjustment for cycles to a single callsite.
---
llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp | 40 ++++++++++----------
1 file changed, 20 insertions(+), 20 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
index f4ea21cf380ef..288c39f792ee2 100644
--- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
@@ -66,7 +66,8 @@ class SILowerSGPRSpills {
MBBVector SaveBlocks;
MBBVector RestoreBlocks;
- MachineBasicBlock *getCycleHeaderDomBB(MachineCycle *C);
+ MachineBasicBlock *getCycleDomBB(MachineCycle *C);
+ LaneVGPRInsertPt adjustInsertPtForCycles(LaneVGPRInsertPt IP);
public:
SILowerSGPRSpills(LiveIntervals *LIS, SlotIndexes *Indexes,
@@ -317,7 +318,7 @@ bool SILowerSGPRSpills::spillCalleeSavedRegs(
return false;
}
-MachineBasicBlock *SILowerSGPRSpills::getCycleHeaderDomBB(MachineCycle *C) {
+MachineBasicBlock *SILowerSGPRSpills::getCycleDomBB(MachineCycle *C) {
// If the insertion point lands on a cycle entry, move it to a block that
// dominates all entries.
if (C->isReducible()) {
@@ -329,11 +330,24 @@ MachineBasicBlock *SILowerSGPRSpills::getCycleHeaderDomBB(MachineCycle *C) {
const SmallVectorImpl<MachineBasicBlock *> &Entries = C->getEntries();
assert(!Entries.empty() && "Expected cycle to have at least one entry.");
MachineBasicBlock *EntryBB = Entries[0];
- for (unsigned i = 1; i < Entries.size(); ++i)
- EntryBB = MDT->findNearestCommonDominator(EntryBB, Entries[i]);
+ for (unsigned I = 1; I < Entries.size(); ++I)
+ EntryBB = MDT->findNearestCommonDominator(EntryBB, Entries[I]);
return EntryBB;
}
+// Ensure that an IMPLICIT_DEF insertion point within a cycle is
+// adjusted to a block that dominates all cycle entries.
+LaneVGPRInsertPt
+SILowerSGPRSpills::adjustInsertPtForCycles(LaneVGPRInsertPt IP) {
+ MachineBasicBlock *MBB = IP.MBB;
+ MachineCycle *C = MCI->getTopLevelParentCycle(MBB);
+ if (!C)
+ return IP;
+
+ MBB = getCycleDomBB(C);
+ return insertPt(MBB, MBB->getFirstTerminator());
+}
+
void SILowerSGPRSpills::updateLaneVGPRDomInstr(
int FI, MachineBasicBlock *MBB, MachineBasicBlock::iterator InsertPt,
DenseMap<Register, LaneVGPRInsertPt> &LaneVGPRDomInstr) {
@@ -355,15 +369,7 @@ void SILowerSGPRSpills::updateLaneVGPRDomInstr(
PrevLaneVGPR = Spill.VGPR;
auto I = LaneVGPRDomInstr.find(Spill.VGPR);
if (Spill.Lane == 0 && I == LaneVGPRDomInstr.end()) {
- // Initially add the spill instruction itself for Insertion point.
- // If that point is inside a cycle, move it to a block that dominates all
- // cycle entries.
- MachineBasicBlock *Promoted = MBB;
- if (MachineCycle *C = MCI->getTopLevelParentCycle(Promoted))
- Promoted = getCycleHeaderDomBB(C);
- LaneVGPRDomInstr[Spill.VGPR] =
- Promoted != MBB ? insertPt(Promoted, Promoted->getFirstTerminator())
- : insertPt(MBB, InsertPt);
+ LaneVGPRDomInstr[Spill.VGPR] = insertPt(MBB, InsertPt);
} else {
assert(I != LaneVGPRDomInstr.end());
LaneVGPRInsertPt Prev = I->second;
@@ -385,12 +391,6 @@ void SILowerSGPRSpills::updateLaneVGPRDomInstr(
// Find the common dominator block between PrevInsertPt and the
// current spill.
DomMBB = MDT->findNearestCommonDominator(DomMBB, MBB);
- // If the insertion point lies anywhere inside a cycle move it to a block
- // that dominates all entries so IMPLICIT_DEF is not clobbered
- // on backedges. WWM regalloc will insert restores at cycle entries as
- // needed.
- if (MachineCycle *C = MCI->getTopLevelParentCycle(DomMBB))
- DomMBB = getCycleHeaderDomBB(C);
if (DomMBB == MBB)
I->second = insertPt(MBB, InsertPt);
@@ -545,7 +545,7 @@ bool SILowerSGPRSpills::run(MachineFunction &MF) {
}
for (auto Reg : FuncInfo->getSGPRSpillVGPRs()) {
- LaneVGPRInsertPt IP = LaneVGPRDomInstr[Reg];
+ LaneVGPRInsertPt IP = adjustInsertPtForCycles(LaneVGPRDomInstr[Reg]);
// Insert the IMPLICIT_DEF at the identified points.
MachineBasicBlock &Block = *IP.MBB;
DebugLoc DL = Block.findDebugLoc(IP.It);
>From 46907650910b690bcb6f2b16d7281816f5cd6618 Mon Sep 17 00:00:00 2001
From: Chris Jackson <chris.jackson at amd.com>
Date: Tue, 31 Mar 2026 11:13:33 -0500
Subject: [PATCH 12/12] Factor out unnecessary function.
---
llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp | 20 +++++---------------
1 file changed, 5 insertions(+), 15 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
index 288c39f792ee2..02d782eda2477 100644
--- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
@@ -67,7 +67,6 @@ class SILowerSGPRSpills {
MBBVector RestoreBlocks;
MachineBasicBlock *getCycleDomBB(MachineCycle *C);
- LaneVGPRInsertPt adjustInsertPtForCycles(LaneVGPRInsertPt IP);
public:
SILowerSGPRSpills(LiveIntervals *LIS, SlotIndexes *Indexes,
@@ -335,19 +334,6 @@ MachineBasicBlock *SILowerSGPRSpills::getCycleDomBB(MachineCycle *C) {
return EntryBB;
}
-// Ensure that an IMPLICIT_DEF insertion point within a cycle is
-// adjusted to a block that dominates all cycle entries.
-LaneVGPRInsertPt
-SILowerSGPRSpills::adjustInsertPtForCycles(LaneVGPRInsertPt IP) {
- MachineBasicBlock *MBB = IP.MBB;
- MachineCycle *C = MCI->getTopLevelParentCycle(MBB);
- if (!C)
- return IP;
-
- MBB = getCycleDomBB(C);
- return insertPt(MBB, MBB->getFirstTerminator());
-}
-
void SILowerSGPRSpills::updateLaneVGPRDomInstr(
int FI, MachineBasicBlock *MBB, MachineBasicBlock::iterator InsertPt,
DenseMap<Register, LaneVGPRInsertPt> &LaneVGPRDomInstr) {
@@ -545,7 +531,11 @@ bool SILowerSGPRSpills::run(MachineFunction &MF) {
}
for (auto Reg : FuncInfo->getSGPRSpillVGPRs()) {
- LaneVGPRInsertPt IP = adjustInsertPtForCycles(LaneVGPRDomInstr[Reg]);
+ LaneVGPRInsertPt IP = LaneVGPRDomInstr[Reg];
+ if (MachineCycle *C = MCI->getTopLevelParentCycle(IP.MBB)) {
+ MachineBasicBlock *AdjMBB = getCycleDomBB(C);
+ IP = insertPt(AdjMBB, AdjMBB->getFirstTerminator());
+ }
// Insert the IMPLICIT_DEF at the identified points.
MachineBasicBlock &Block = *IP.MBB;
DebugLoc DL = Block.findDebugLoc(IP.It);
More information about the llvm-commits
mailing list