[llvm] [AMDGPU] Fix buggy insertion of DEALLOC_VGPRS message (PR #178401)

Wed Jan 28 03:17:28 PST 2026

https://github.com/jayfoad created https://github.com/llvm/llvm-project/pull/178401

We inserted the DEALLOC_VGPRS message if there were no pending scratch
stores the first time an S_ENDPGM instruction was visited. But because
this pass uses a worklist to revisit blocks until it reaches a fixed
point, it is possible that pending scratch stores are only discovered on
the second or later visit to a block. Fix this by storing a flag for
each S_ENDPGM instruction which can be updated by later visits.


>From 2fbfbafaf861ad67a315be9ed6dd206eb6e9b509 Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Fri, 23 Jan 2026 21:35:19 +0000
Subject: [PATCH] [AMDGPU] Fix buggy insertion of DEALLOC_VGPRS message

We inserted the DEALLOC_VGPRS message if there were no pending scratch
stores the first time an S_ENDPGM instruction was visited. But because
this pass uses a worklist to revisit blocks until it reaches a fixed
point, it is possible that pending scratch stores are only discovered on
the second or later visit to a block. Fix this by storing a flag for
each S_ENDPGM instruction which can be updated by later visits.
---
 llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 38 ++++++------
 llvm/test/CodeGen/AMDGPU/release-vgprs.mir  | 64 +++++++--------------
 2 files changed, 39 insertions(+), 63 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 46900223fe670..37ddf7a34b605 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -531,9 +531,10 @@ class SIInsertWaitcnts {
   DenseSet<MachineInstr *> CallInsts;
   DenseSet<MachineInstr *> ReturnInsts;
 
-  // S_ENDPGM instructions before which we should insert a DEALLOC_VGPRS
-  // message.
-  DenseSet<MachineInstr *> ReleaseVGPRInsts;
+  // Remember all S_ENDPGM instructions. The boolean flag is true if there might
+  // be outstanding stores but definitely no outstanding scratch stores, to help
+  // with insertion of DEALLOC_VGPRS messages.
+  DenseMap<MachineInstr *, bool> EndPgmInsts;
 
   AMDGPU::HardwareLimits Limits;
 
@@ -2253,12 +2254,8 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
   // completed, but it is only safe to do this if there are no outstanding
   // scratch stores.
   else if (Opc == AMDGPU::S_ENDPGM || Opc == AMDGPU::S_ENDPGM_SAVED) {
-    if (!WCG->isOptNone() &&
-        (MI.getMF()->getInfo<SIMachineFunctionInfo>()->isDynamicVGPREnabled() ||
-         (ST->getGeneration() >= AMDGPUSubtarget::GFX11 &&
-          ScoreBrackets.getScoreRange(STORE_CNT) != 0 &&
-          !ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS))))
-      ReleaseVGPRInsts.insert(&MI);
+    EndPgmInsts[&MI] = ScoreBrackets.getScoreRange(STORE_CNT) != 0 &&
+                       !ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS);
   }
   // Resolve vm waits before gs-done.
   else if ((Opc == AMDGPU::S_SENDMSG || Opc == AMDGPU::S_SENDMSGHALT) &&
@@ -3520,21 +3517,22 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
   // (i.e. whether we're in dynamic VGPR mode or not).
   // Skip deallocation if kernel is waveslot limited vs VGPR limited. A short
   // waveslot limited kernel runs slower with the deallocation.
-  if (MFI->isDynamicVGPREnabled()) {
-    for (MachineInstr *MI : ReleaseVGPRInsts) {
+  if (!WCG->isOptNone() && MFI->isDynamicVGPREnabled()) {
+    for (auto [MI, _] : EndPgmInsts) {
       BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
               TII->get(AMDGPU::S_ALLOC_VGPR))
           .addImm(0);
       Modified = true;
     }
-  } else {
-    if (!ReleaseVGPRInsts.empty() &&
-        (MF.getFrameInfo().hasCalls() ||
-         ST->getOccupancyWithNumVGPRs(
-             TRI->getNumUsedPhysRegs(*MRI, AMDGPU::VGPR_32RegClass),
-             /*IsDynamicVGPR=*/false) <
-             AMDGPU::IsaInfo::getMaxWavesPerEU(ST))) {
-      for (MachineInstr *MI : ReleaseVGPRInsts) {
+  } else if (!WCG->isOptNone() &&
+             ST->getGeneration() >= AMDGPUSubtarget::GFX11 &&
+             (MF.getFrameInfo().hasCalls() ||
+              ST->getOccupancyWithNumVGPRs(
+                  TRI->getNumUsedPhysRegs(*MRI, AMDGPU::VGPR_32RegClass),
+                  /*IsDynamicVGPR=*/false) <
+                  AMDGPU::IsaInfo::getMaxWavesPerEU(ST))) {
+    for (auto [MI, Flag] : EndPgmInsts) {
+      if (Flag) {
         if (ST->requiresNopBeforeDeallocVGPRs()) {
           BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
                   TII->get(AMDGPU::S_NOP))
@@ -3550,7 +3548,7 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
 
   CallInsts.clear();
   ReturnInsts.clear();
-  ReleaseVGPRInsts.clear();
+  EndPgmInsts.clear();
   PreheadersToFlush.clear();
   SLoadAddresses.clear();
 
diff --git a/llvm/test/CodeGen/AMDGPU/release-vgprs.mir b/llvm/test/CodeGen/AMDGPU/release-vgprs.mir
index be36c676ff89f..5f381d661feac 100644
--- a/llvm/test/CodeGen/AMDGPU/release-vgprs.mir
+++ b/llvm/test/CodeGen/AMDGPU/release-vgprs.mir
@@ -364,54 +364,32 @@ body:             |
     S_ENDPGM 0, implicit $vgpr97
 ...
 
-# FIXME: The scratch store in the loop may still be pending at S_ENDPGM so we
-# should not insert S_SENDMSG.
+# The scratch store in the loop may still be pending at S_ENDPGM so we should
+# not insert S_SENDMSG.
 ---
 name: multiple_basic_blocks4
 machineFunctionInfo:
   isEntryFunction: true
 body: |
-  ; OPT-LABEL: name: multiple_basic_blocks4
-  ; OPT: bb.0:
-  ; OPT-NEXT:   successors: %bb.1(0x80000000)
-  ; OPT-NEXT: {{  $}}
-  ; OPT-NEXT:   GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr2, killed renamable $vgpr0, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec
-  ; OPT-NEXT: {{  $}}
-  ; OPT-NEXT: bb.1:
-  ; OPT-NEXT:   successors: %bb.3(0x40000000), %bb.2(0x40000000)
-  ; OPT-NEXT: {{  $}}
-  ; OPT-NEXT:   S_CBRANCH_SCC1 %bb.3, implicit $scc
-  ; OPT-NEXT: {{  $}}
-  ; OPT-NEXT: bb.2:
-  ; OPT-NEXT:   successors: %bb.1(0x80000000)
-  ; OPT-NEXT: {{  $}}
-  ; OPT-NEXT:   SCRATCH_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $sgpr0, 0, 0, implicit $exec, implicit $flat_scr
-  ; OPT-NEXT:   S_BRANCH %bb.1
-  ; OPT-NEXT: {{  $}}
-  ; OPT-NEXT: bb.3:
-  ; OPT-NEXT:   S_NOP 0
-  ; OPT-NEXT:   S_SENDMSG 3, implicit $exec, implicit $m0
-  ; OPT-NEXT:   S_ENDPGM 0, implicit $vgpr97
-  ;
-  ; NOOPT-LABEL: name: multiple_basic_blocks4
-  ; NOOPT: bb.0:
-  ; NOOPT-NEXT:   successors: %bb.1(0x80000000)
-  ; NOOPT-NEXT: {{  $}}
-  ; NOOPT-NEXT:   GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr2, killed renamable $vgpr0, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec
-  ; NOOPT-NEXT: {{  $}}
-  ; NOOPT-NEXT: bb.1:
-  ; NOOPT-NEXT:   successors: %bb.3(0x40000000), %bb.2(0x40000000)
-  ; NOOPT-NEXT: {{  $}}
-  ; NOOPT-NEXT:   S_CBRANCH_SCC1 %bb.3, implicit $scc
-  ; NOOPT-NEXT: {{  $}}
-  ; NOOPT-NEXT: bb.2:
-  ; NOOPT-NEXT:   successors: %bb.1(0x80000000)
-  ; NOOPT-NEXT: {{  $}}
-  ; NOOPT-NEXT:   SCRATCH_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $sgpr0, 0, 0, implicit $exec, implicit $flat_scr
-  ; NOOPT-NEXT:   S_BRANCH %bb.1
-  ; NOOPT-NEXT: {{  $}}
-  ; NOOPT-NEXT: bb.3:
-  ; NOOPT-NEXT:   S_ENDPGM 0, implicit $vgpr97
+  ; CHECK-LABEL: name: multiple_basic_blocks4
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr2, killed renamable $vgpr0, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.3(0x40000000), %bb.2(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.3, implicit $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   SCRATCH_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $sgpr0, 0, 0, implicit $exec, implicit $flat_scr
+  ; CHECK-NEXT:   S_BRANCH %bb.1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   S_ENDPGM 0, implicit $vgpr97
   bb.0:
     GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr2, killed renamable $vgpr0, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec