[llvm] [AMDGPU][RegAlloc] Correct insertion of IMPLICIT_DEF in loop headers (PR #186348)
Chris Jackson via llvm-commits
llvm-commits at lists.llvm.org
Fri Mar 13 04:54:56 PDT 2026
https://github.com/chrisjbris updated https://github.com/llvm/llvm-project/pull/186348
>From b4cca6d80bd66d6e4e6a74834f87a61bfa7bcd71 Mon Sep 17 00:00:00 2001
From: Chris Jackson <chris.jackson at amd.com>
Date: Fri, 13 Mar 2026 04:22:48 -0500
Subject: [PATCH 1/2] [AMDGPU][RegAlloc] Correct insertion of IMPLICIT_DEF in
loop headers
si-lower-sgpr-spills was observed inserting IMPLICIT_DEF for lane VGPR
restores in the loop header. The virtual VGPR is therefore not live-in
to the header and wwm regallocfast does not insert a restore. This
results in the vgpr being clobbered after each backedge.
Correct this by inserting the IMPLICIT_DEF in the preheader.
---
llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp | 26 +++-
.../si-lower-sgpr-spills-loop-preheader.mir | 117 ++++++++++++++++++
.../AMDGPU/spill-sgpr-to-virtual-vgpr.mir | 6 +-
3 files changed, 143 insertions(+), 6 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-loop-preheader.mir
diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
index 196e551932659..8856398051929 100644
--- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
@@ -273,7 +273,7 @@ bool SILowerSGPRSpills::spillCalleeSavedRegs(
if (SavedRegs.test(Reg)) {
const TargetRegisterClass *RC =
- TRI->getMinimalPhysRegClass(Reg, MVT::i32);
+ TRI->getMinimalPhysRegClass(Reg, MVT::i32);
int JunkFI = MFI.CreateStackObject(TRI->getSpillSize(*RC),
TRI->getSpillAlign(*RC), true);
@@ -345,6 +345,26 @@ void SILowerSGPRSpills::updateLaneVGPRDomInstr(
else if (DomMBB != PrevInsertPt->getParent())
I->second = &(*DomMBB->getFirstTerminator());
}
+ // The insertion point should not be placed in MBB if it is a loop header.
+ // This results in the vgpr being clobbered after every backedge. Instead
+ // insert into the preheader. Then the virtual register is live-in to the
+ // header and wwm-regalloc will insert a wwm restore in the header. Could
+ // use Machine Loop Analysis to locate the preheader but that extra
+ // dependency is unnecessary.
+ auto &BlockInsertPt = LaneVGPRDomInstr[Spill.VGPR];
+ MachineBasicBlock *FinalMBB = BlockInsertPt->getParent();
+ if (FinalMBB == MBB)
+ continue;
+ // If the header dominates a predecessor, then we have a backedge.
+ // The preheader is the immediate dominator of the header.
+ for (MachineBasicBlock *Pred : FinalMBB->predecessors()) {
+ if (MDT->dominates(FinalMBB, Pred)) {
+ if (auto *IDomNode = MDT->getNode(FinalMBB)->getIDom()) {
+ BlockInsertPt = IDomNode->getBlock()->getFirstTerminator();
+ }
+ break;
+ }
+ }
}
}
@@ -424,8 +444,8 @@ bool SILowerSGPRSpills::run(MachineFunction &MF) {
// TODO: CSR VGPRs will never be spilled to AGPRs. These can probably be
// handled as SpilledToReg in regular PrologEpilogInserter.
- const bool HasSGPRSpillToVGPR = TRI->spillSGPRToVGPR() &&
- (HasCSRs || FuncInfo->hasSpilledSGPRs());
+ const bool HasSGPRSpillToVGPR =
+ TRI->spillSGPRToVGPR() && (HasCSRs || FuncInfo->hasSpilledSGPRs());
if (HasSGPRSpillToVGPR) {
// Process all SGPR spills before frame offsets are finalized. Ideally SGPRs
// are spilled to VGPRs, in which case we can eliminate the stack usage.
diff --git a/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-loop-preheader.mir b/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-loop-preheader.mir
new file mode 100644
index 0000000000000..cb9411b615a96
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-loop-preheader.mir
@@ -0,0 +1,117 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
+
+# When SGPR spills to a virtual VGPR lane occur in both a loop header and the latch,
+# the IMPLICIT_DEF for the lane VGPR must be placed in the preheader (not the header).
+# Establish that the virtual VGPR is live-in to the header and wwm regallocfast inserts
+# a restore, preserving the latch writes.
+#
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -run-pass=si-lower-sgpr-spills -o - %s | FileCheck -check-prefix=GCN %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -start-before=si-lower-sgpr-spills -stop-after=regallocfast,1 -verify-machineinstrs %s -o - | FileCheck -check-prefix=WWM %s
+
+---
+name: sgpr_spill_loop_header_and_latch_implicit_def_in_preheader
+tracksRegLiveness: true
+frameInfo:
+ maxAlignment: 4
+stack:
+ - { id: 0, type: spill-slot, size: 4, alignment: 4, stack-id: sgpr-spill }
+machineFunctionInfo:
+ isEntryFunction: false
+ scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3'
+ stackPtrOffsetReg: '$sgpr32'
+ frameOffsetReg: '$sgpr33'
+ hasSpilledSGPRs: true
+body: |
+ ; GCN-LABEL: name: sgpr_spill_loop_header_and_latch_implicit_def_in_preheader
+ ; GCN: bb.0:
+ ; GCN-NEXT: successors: %bb.1(0x80000000)
+ ; GCN-NEXT: liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: S_NOP 0
+ ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: S_BRANCH %bb.1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.1:
+ ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
+ ; GCN-NEXT: liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $sgpr10 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 0
+ ; GCN-NEXT: S_CMP_EQ_U32 $sgpr11, 0, implicit-def $scc
+ ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR killed $sgpr10, 0, [[DEF]]
+ ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit killed $scc
+ ; GCN-NEXT: S_BRANCH %bb.3
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.2:
+ ; GCN-NEXT: successors: %bb.1(0x80000000)
+ ; GCN-NEXT: liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $sgpr10 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 0
+ ; GCN-NEXT: $sgpr10 = S_MOV_B32 1
+ ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR killed $sgpr10, 0, [[DEF]]
+ ; GCN-NEXT: S_BRANCH %bb.1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.3:
+ ; GCN-NEXT: liveins: $sgpr30_sgpr31
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: S_SETPC_B64 $sgpr30_sgpr31
+ ;
+ ; WWM-LABEL: name: sgpr_spill_loop_header_and_latch_implicit_def_in_preheader
+ ; WWM: bb.0:
+ ; WWM-NEXT: successors: %bb.1(0x80000000)
+ ; WWM-NEXT: liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31
+ ; WWM-NEXT: {{ $}}
+ ; WWM-NEXT: S_NOP 0
+ ; WWM-NEXT: renamable $vgpr63 = IMPLICIT_DEF
+ ; WWM-NEXT: S_BRANCH %bb.1
+ ; WWM-NEXT: {{ $}}
+ ; WWM-NEXT: bb.1:
+ ; WWM-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
+ ; WWM-NEXT: liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31
+ ; WWM-NEXT: {{ $}}
+ ; WWM-NEXT: $vgpr63 = SI_SPILL_WWM_V32_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5)
+ ; WWM-NEXT: $sgpr10 = SI_RESTORE_S32_FROM_VGPR $vgpr63, 0
+ ; WWM-NEXT: S_CMP_EQ_U32 $sgpr11, 0, implicit-def $scc
+ ; WWM-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR $sgpr10, 0, $vgpr63
+ ; WWM-NEXT: SI_SPILL_WWM_V32_SAVE killed $vgpr63, %stack.1, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5)
+ ; WWM-NEXT: S_CBRANCH_SCC1 %bb.2, implicit killed $scc
+ ; WWM-NEXT: S_BRANCH %bb.3
+ ; WWM-NEXT: {{ $}}
+ ; WWM-NEXT: bb.2:
+ ; WWM-NEXT: successors: %bb.1(0x80000000)
+ ; WWM-NEXT: liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31
+ ; WWM-NEXT: {{ $}}
+ ; WWM-NEXT: $vgpr63 = SI_SPILL_WWM_V32_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5)
+ ; WWM-NEXT: dead $sgpr10 = SI_RESTORE_S32_FROM_VGPR $vgpr63, 0
+ ; WWM-NEXT: $sgpr10 = S_MOV_B32 1
+ ; WWM-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR $sgpr10, 0, $vgpr63
+ ; WWM-NEXT: SI_SPILL_WWM_V32_SAVE killed $vgpr63, %stack.1, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5)
+ ; WWM-NEXT: S_BRANCH %bb.1
+ ; WWM-NEXT: {{ $}}
+ ; WWM-NEXT: bb.3:
+ ; WWM-NEXT: liveins: $sgpr30_sgpr31
+ ; WWM-NEXT: {{ $}}
+ ; WWM-NEXT: S_SETPC_B64 killed $sgpr30_sgpr31
+ ; Preheader (bb.0):
+
+
+ bb.0:
+ liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31
+ S_NOP 0
+ S_BRANCH %bb.1
+ bb.1:
+ liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31
+ renamable $sgpr10 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32
+ S_CMP_EQ_U32 $sgpr11, 0, implicit-def $scc
+ SI_SPILL_S32_SAVE killed $sgpr10, %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32
+ S_CBRANCH_SCC1 %bb.2, implicit killed $scc
+ S_BRANCH %bb.3
+ bb.2:
+ liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31
+ renamable $sgpr10 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32
+ $sgpr10 = S_MOV_B32 1
+ SI_SPILL_S32_SAVE killed $sgpr10, %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32
+ S_BRANCH %bb.1
+ bb.3:
+ liveins: $sgpr30_sgpr31
+ S_SETPC_B64 $sgpr30_sgpr31
+## NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
diff --git a/llvm/test/CodeGen/AMDGPU/spill-sgpr-to-virtual-vgpr.mir b/llvm/test/CodeGen/AMDGPU/spill-sgpr-to-virtual-vgpr.mir
index fa3fd3bc6da5b..bb47603647733 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-sgpr-to-virtual-vgpr.mir
+++ b/llvm/test/CodeGen/AMDGPU/spill-sgpr-to-virtual-vgpr.mir
@@ -241,13 +241,14 @@ body: |
; GCN-NEXT: liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31
; GCN-NEXT: {{ $}}
; GCN-NEXT: S_NOP 0
+ ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; GCN-NEXT: S_BRANCH %bb.3
; GCN-NEXT: {{ $}}
; GCN-NEXT: bb.1:
; GCN-NEXT: successors: %bb.2(0x80000000)
; GCN-NEXT: liveins: $sgpr10, $sgpr30_sgpr31
; GCN-NEXT: {{ $}}
- ; GCN-NEXT: $sgpr10 = SI_RESTORE_S32_FROM_VGPR %0, 0
+ ; GCN-NEXT: $sgpr10 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 0
; GCN-NEXT: $sgpr10 = S_ADD_I32 $sgpr10, 15, implicit-def dead $scc
; GCN-NEXT: S_BRANCH %bb.2
; GCN-NEXT: {{ $}}
@@ -255,7 +256,7 @@ body: |
; GCN-NEXT: successors: %bb.3(0x80000000)
; GCN-NEXT: liveins: $sgpr10, $sgpr30_sgpr31
; GCN-NEXT: {{ $}}
- ; GCN-NEXT: $sgpr10 = SI_RESTORE_S32_FROM_VGPR %0, 0
+ ; GCN-NEXT: $sgpr10 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 0
; GCN-NEXT: $sgpr10 = S_ADD_I32 $sgpr10, 20, implicit-def dead $scc
; GCN-NEXT: S_BRANCH %bb.3
; GCN-NEXT: {{ $}}
@@ -264,7 +265,6 @@ body: |
; GCN-NEXT: liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31
; GCN-NEXT: {{ $}}
; GCN-NEXT: $sgpr10 = S_MOV_B32 10
- ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR killed $sgpr10, 0, [[DEF]]
; GCN-NEXT: S_CMP_EQ_U32 $sgpr11, 0, implicit-def $scc
; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit killed $scc
>From 738b5b04b3f119e7a8130d6e7832e0764bb6174f Mon Sep 17 00:00:00 2001
From: Chris Jackson <chris.jackson at amd.com>
Date: Fri, 13 Mar 2026 06:54:44 -0500
Subject: [PATCH 2/2] Replace hardcoded vgpr with regex capture
---
.../si-lower-sgpr-spills-loop-preheader.mir | 18 +++++++++---------
1 file changed, 9 insertions(+), 9 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-loop-preheader.mir b/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-loop-preheader.mir
index cb9411b615a96..b392290a16451 100644
--- a/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-loop-preheader.mir
+++ b/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-loop-preheader.mir
@@ -61,18 +61,18 @@ body: |
; WWM-NEXT: liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31
; WWM-NEXT: {{ $}}
; WWM-NEXT: S_NOP 0
- ; WWM-NEXT: renamable $vgpr63 = IMPLICIT_DEF
+ ; WWM-NEXT: renamable [[LANE_VGPR:\$vgpr[0-9]+]] = IMPLICIT_DEF
; WWM-NEXT: S_BRANCH %bb.1
; WWM-NEXT: {{ $}}
; WWM-NEXT: bb.1:
; WWM-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
; WWM-NEXT: liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31
; WWM-NEXT: {{ $}}
- ; WWM-NEXT: $vgpr63 = SI_SPILL_WWM_V32_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5)
- ; WWM-NEXT: $sgpr10 = SI_RESTORE_S32_FROM_VGPR $vgpr63, 0
+ ; WWM-NEXT: [[LANE_VGPR]] = SI_SPILL_WWM_V32_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5)
+ ; WWM-NEXT: $sgpr10 = SI_RESTORE_S32_FROM_VGPR [[LANE_VGPR]], 0
; WWM-NEXT: S_CMP_EQ_U32 $sgpr11, 0, implicit-def $scc
- ; WWM-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR $sgpr10, 0, $vgpr63
- ; WWM-NEXT: SI_SPILL_WWM_V32_SAVE killed $vgpr63, %stack.1, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5)
+ ; WWM-NEXT: [[LANE_VGPR]] = SI_SPILL_S32_TO_VGPR $sgpr10, 0, [[LANE_VGPR]]
+ ; WWM-NEXT: SI_SPILL_WWM_V32_SAVE killed [[LANE_VGPR]], %stack.1, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5)
; WWM-NEXT: S_CBRANCH_SCC1 %bb.2, implicit killed $scc
; WWM-NEXT: S_BRANCH %bb.3
; WWM-NEXT: {{ $}}
@@ -80,11 +80,11 @@ body: |
; WWM-NEXT: successors: %bb.1(0x80000000)
; WWM-NEXT: liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31
; WWM-NEXT: {{ $}}
- ; WWM-NEXT: $vgpr63 = SI_SPILL_WWM_V32_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5)
- ; WWM-NEXT: dead $sgpr10 = SI_RESTORE_S32_FROM_VGPR $vgpr63, 0
+ ; WWM-NEXT: [[LANE_VGPR]] = SI_SPILL_WWM_V32_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5)
+ ; WWM-NEXT: dead $sgpr10 = SI_RESTORE_S32_FROM_VGPR [[LANE_VGPR]], 0
; WWM-NEXT: $sgpr10 = S_MOV_B32 1
- ; WWM-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR $sgpr10, 0, $vgpr63
- ; WWM-NEXT: SI_SPILL_WWM_V32_SAVE killed $vgpr63, %stack.1, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5)
+ ; WWM-NEXT: [[LANE_VGPR]] = SI_SPILL_S32_TO_VGPR $sgpr10, 0, [[LANE_VGPR]]
+ ; WWM-NEXT: SI_SPILL_WWM_V32_SAVE killed [[LANE_VGPR]], %stack.1, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5)
; WWM-NEXT: S_BRANCH %bb.1
; WWM-NEXT: {{ $}}
; WWM-NEXT: bb.3:
More information about the llvm-commits
mailing list