[llvm] [AMDGPU] Fix buggy insertion of DEALLOC_VGPRS message (PR #178401)
Jay Foad via llvm-commits
llvm-commits at lists.llvm.org
Wed Jan 28 03:17:28 PST 2026
https://github.com/jayfoad created https://github.com/llvm/llvm-project/pull/178401
We inserted the DEALLOC_VGPRS message if there were no pending scratch
stores the first time an S_ENDPGM instruction was visited. But because
this pass uses a worklist to revisit blocks until it reaches a fixed
point, it is possible that pending scratch stores are only discovered on
the second or later visit to a block. Fix this by storing a flag for
each S_ENDPGM instruction which can be updated by later visits.
>From 2fbfbafaf861ad67a315be9ed6dd206eb6e9b509 Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Fri, 23 Jan 2026 21:35:19 +0000
Subject: [PATCH] [AMDGPU] Fix buggy insertion of DEALLOC_VGPRS message
We inserted the DEALLOC_VGPRS message if there were no pending scratch
stores the first time an S_ENDPGM instruction was visited. But because
this pass uses a worklist to revisit blocks until it reaches a fixed
point, it is possible that pending scratch stores are only discovered on
the second or later visit to a block. Fix this by storing a flag for
each S_ENDPGM instruction which can be updated by later visits.
---
llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 38 ++++++------
llvm/test/CodeGen/AMDGPU/release-vgprs.mir | 64 +++++++--------------
2 files changed, 39 insertions(+), 63 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 46900223fe670..37ddf7a34b605 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -531,9 +531,10 @@ class SIInsertWaitcnts {
DenseSet<MachineInstr *> CallInsts;
DenseSet<MachineInstr *> ReturnInsts;
- // S_ENDPGM instructions before which we should insert a DEALLOC_VGPRS
- // message.
- DenseSet<MachineInstr *> ReleaseVGPRInsts;
+ // Remember all S_ENDPGM instructions. The boolean flag is true if there might
+ // be outstanding stores but definitely no outstanding scratch stores, to help
+ // with insertion of DEALLOC_VGPRS messages.
+ DenseMap<MachineInstr *, bool> EndPgmInsts;
AMDGPU::HardwareLimits Limits;
@@ -2253,12 +2254,8 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
// completed, but it is only safe to do this if there are no outstanding
// scratch stores.
else if (Opc == AMDGPU::S_ENDPGM || Opc == AMDGPU::S_ENDPGM_SAVED) {
- if (!WCG->isOptNone() &&
- (MI.getMF()->getInfo<SIMachineFunctionInfo>()->isDynamicVGPREnabled() ||
- (ST->getGeneration() >= AMDGPUSubtarget::GFX11 &&
- ScoreBrackets.getScoreRange(STORE_CNT) != 0 &&
- !ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS))))
- ReleaseVGPRInsts.insert(&MI);
+ EndPgmInsts[&MI] = ScoreBrackets.getScoreRange(STORE_CNT) != 0 &&
+ !ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS);
}
// Resolve vm waits before gs-done.
else if ((Opc == AMDGPU::S_SENDMSG || Opc == AMDGPU::S_SENDMSGHALT) &&
@@ -3520,21 +3517,22 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
// (i.e. whether we're in dynamic VGPR mode or not).
// Skip deallocation if kernel is waveslot limited vs VGPR limited. A short
// waveslot limited kernel runs slower with the deallocation.
- if (MFI->isDynamicVGPREnabled()) {
- for (MachineInstr *MI : ReleaseVGPRInsts) {
+ if (!WCG->isOptNone() && MFI->isDynamicVGPREnabled()) {
+ for (auto [MI, _] : EndPgmInsts) {
BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
TII->get(AMDGPU::S_ALLOC_VGPR))
.addImm(0);
Modified = true;
}
- } else {
- if (!ReleaseVGPRInsts.empty() &&
- (MF.getFrameInfo().hasCalls() ||
- ST->getOccupancyWithNumVGPRs(
- TRI->getNumUsedPhysRegs(*MRI, AMDGPU::VGPR_32RegClass),
- /*IsDynamicVGPR=*/false) <
- AMDGPU::IsaInfo::getMaxWavesPerEU(ST))) {
- for (MachineInstr *MI : ReleaseVGPRInsts) {
+ } else if (!WCG->isOptNone() &&
+ ST->getGeneration() >= AMDGPUSubtarget::GFX11 &&
+ (MF.getFrameInfo().hasCalls() ||
+ ST->getOccupancyWithNumVGPRs(
+ TRI->getNumUsedPhysRegs(*MRI, AMDGPU::VGPR_32RegClass),
+ /*IsDynamicVGPR=*/false) <
+ AMDGPU::IsaInfo::getMaxWavesPerEU(ST))) {
+ for (auto [MI, Flag] : EndPgmInsts) {
+ if (Flag) {
if (ST->requiresNopBeforeDeallocVGPRs()) {
BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
TII->get(AMDGPU::S_NOP))
@@ -3550,7 +3548,7 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
CallInsts.clear();
ReturnInsts.clear();
- ReleaseVGPRInsts.clear();
+ EndPgmInsts.clear();
PreheadersToFlush.clear();
SLoadAddresses.clear();
diff --git a/llvm/test/CodeGen/AMDGPU/release-vgprs.mir b/llvm/test/CodeGen/AMDGPU/release-vgprs.mir
index be36c676ff89f..5f381d661feac 100644
--- a/llvm/test/CodeGen/AMDGPU/release-vgprs.mir
+++ b/llvm/test/CodeGen/AMDGPU/release-vgprs.mir
@@ -364,54 +364,32 @@ body: |
S_ENDPGM 0, implicit $vgpr97
...
-# FIXME: The scratch store in the loop may still be pending at S_ENDPGM so we
-# should not insert S_SENDMSG.
+# The scratch store in the loop may still be pending at S_ENDPGM so we should
+# not insert S_SENDMSG.
---
name: multiple_basic_blocks4
machineFunctionInfo:
isEntryFunction: true
body: |
- ; OPT-LABEL: name: multiple_basic_blocks4
- ; OPT: bb.0:
- ; OPT-NEXT: successors: %bb.1(0x80000000)
- ; OPT-NEXT: {{ $}}
- ; OPT-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr2, killed renamable $vgpr0, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec
- ; OPT-NEXT: {{ $}}
- ; OPT-NEXT: bb.1:
- ; OPT-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000)
- ; OPT-NEXT: {{ $}}
- ; OPT-NEXT: S_CBRANCH_SCC1 %bb.3, implicit $scc
- ; OPT-NEXT: {{ $}}
- ; OPT-NEXT: bb.2:
- ; OPT-NEXT: successors: %bb.1(0x80000000)
- ; OPT-NEXT: {{ $}}
- ; OPT-NEXT: SCRATCH_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $sgpr0, 0, 0, implicit $exec, implicit $flat_scr
- ; OPT-NEXT: S_BRANCH %bb.1
- ; OPT-NEXT: {{ $}}
- ; OPT-NEXT: bb.3:
- ; OPT-NEXT: S_NOP 0
- ; OPT-NEXT: S_SENDMSG 3, implicit $exec, implicit $m0
- ; OPT-NEXT: S_ENDPGM 0, implicit $vgpr97
- ;
- ; NOOPT-LABEL: name: multiple_basic_blocks4
- ; NOOPT: bb.0:
- ; NOOPT-NEXT: successors: %bb.1(0x80000000)
- ; NOOPT-NEXT: {{ $}}
- ; NOOPT-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr2, killed renamable $vgpr0, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec
- ; NOOPT-NEXT: {{ $}}
- ; NOOPT-NEXT: bb.1:
- ; NOOPT-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000)
- ; NOOPT-NEXT: {{ $}}
- ; NOOPT-NEXT: S_CBRANCH_SCC1 %bb.3, implicit $scc
- ; NOOPT-NEXT: {{ $}}
- ; NOOPT-NEXT: bb.2:
- ; NOOPT-NEXT: successors: %bb.1(0x80000000)
- ; NOOPT-NEXT: {{ $}}
- ; NOOPT-NEXT: SCRATCH_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $sgpr0, 0, 0, implicit $exec, implicit $flat_scr
- ; NOOPT-NEXT: S_BRANCH %bb.1
- ; NOOPT-NEXT: {{ $}}
- ; NOOPT-NEXT: bb.3:
- ; NOOPT-NEXT: S_ENDPGM 0, implicit $vgpr97
+ ; CHECK-LABEL: name: multiple_basic_blocks4
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.1(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr2, killed renamable $vgpr0, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: S_CBRANCH_SCC1 %bb.3, implicit $scc
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: successors: %bb.1(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $sgpr0, 0, 0, implicit $exec, implicit $flat_scr
+ ; CHECK-NEXT: S_BRANCH %bb.1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.3:
+ ; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr97
bb.0:
GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr2, killed renamable $vgpr0, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec
More information about the llvm-commits
mailing list