[llvm] [AMDGPU] Fix Xcnt handling between blocks (PR #165201)

Sun Oct 26 22:15:29 PDT 2025

https://github.com/easyonaadit updated https://github.com/llvm/llvm-project/pull/165201

>From c79565b5b5c59750e6d4e1f91a0fd6b6602e033b Mon Sep 17 00:00:00 2001
From: Aaditya <Aaditya.AlokDeshpande at amd.com>
Date: Mon, 27 Oct 2025 10:36:18 +0530
Subject: [PATCH] [AMDGPU] Fix Xcnt handling between blocks

On entry to a block, the compiler must
conservatively flush the Xcnt counter when both
SMEM and VMEM events are pending.
---
 llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 33 ++++++++++++++-------
 llvm/test/CodeGen/AMDGPU/wait-xcnt.mir      |  2 +-
 2 files changed, 24 insertions(+), 11 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 6dcbced010a5a..6674f6c20d0a6 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -565,12 +565,12 @@ class SIInsertWaitcnts {
   bool isVmemAccess(const MachineInstr &MI) const;
   bool generateWaitcntInstBefore(MachineInstr &MI,
                                  WaitcntBrackets &ScoreBrackets,
-                                 MachineInstr *OldWaitcntInstr,
-                                 bool FlushVmCnt);
+                                 MachineInstr *OldWaitcntInstr, bool FlushVmCnt,
+                                 bool FlushXCnt);
   bool generateWaitcnt(AMDGPU::Waitcnt Wait,
                        MachineBasicBlock::instr_iterator It,
                        MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets,
-                       MachineInstr *OldWaitcntInstr);
+                       MachineInstr *OldWaitcntInstr, bool FlushXCnt);
   void updateEventWaitcntAfter(MachineInstr &Inst,
                                WaitcntBrackets *ScoreBrackets);
   bool isNextENDPGM(MachineBasicBlock::instr_iterator It,
@@ -1846,7 +1846,8 @@ static bool callWaitsOnFunctionReturn(const MachineInstr &MI) { return true; }
 bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
                                                  WaitcntBrackets &ScoreBrackets,
                                                  MachineInstr *OldWaitcntInstr,
-                                                 bool FlushVmCnt) {
+                                                 bool FlushVmCnt,
+                                                 bool FlushXCnt) {
   setForceEmitWaitcnt();
 
   assert(!MI.isMetaInstruction());
@@ -2101,18 +2102,26 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
       Wait.BvhCnt = 0;
   }
 
+  // Conservatively flush the Xcnt Counter at the start of the block.
+  if (FlushXCnt) {
+    if (ScoreBrackets.hasPendingEvent(SMEM_GROUP) &&
+        ScoreBrackets.hasPendingEvent(VMEM_GROUP))
+      Wait.XCnt = 0;
+  }
+
   if (ForceEmitZeroLoadFlag && Wait.LoadCnt != ~0u)
     Wait.LoadCnt = 0;
 
   return generateWaitcnt(Wait, MI.getIterator(), *MI.getParent(), ScoreBrackets,
-                         OldWaitcntInstr);
+                         OldWaitcntInstr, FlushXCnt);
 }
 
 bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
                                        MachineBasicBlock::instr_iterator It,
                                        MachineBasicBlock &Block,
                                        WaitcntBrackets &ScoreBrackets,
-                                       MachineInstr *OldWaitcntInstr) {
+                                       MachineInstr *OldWaitcntInstr,
+                                       bool FlushXCnt) {
   bool Modified = false;
 
   if (OldWaitcntInstr)
@@ -2141,7 +2150,7 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
   }
 
   // XCnt may be already consumed by a load wait.
-  if (Wait.XCnt != ~0u) {
+  if (Wait.XCnt != ~0u && !FlushXCnt) {
     if (Wait.KmCnt == 0 && !ScoreBrackets.hasPendingEvent(SMEM_GROUP))
       Wait.XCnt = ~0u;
 
@@ -2214,7 +2223,7 @@ bool SIInsertWaitcnts::insertForcedWaitAfter(MachineInstr &Inst,
 
   auto SuccessorIt = std::next(Inst.getIterator());
   bool Result = generateWaitcnt(Wait, SuccessorIt, Block, ScoreBrackets,
-                                /*OldWaitcntInstr=*/nullptr);
+                                /*OldWaitcntInstr=*/nullptr, /*FlushXCnt=*/false);
 
   if (Result && NeedsEndPGMCheck && isNextENDPGM(SuccessorIt, &Block)) {
     BuildMI(Block, SuccessorIt, Inst.getDebugLoc(), TII->get(AMDGPU::S_NOP))
@@ -2454,6 +2463,7 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
 
   // Walk over the instructions.
   MachineInstr *OldWaitcntInstr = nullptr;
+  bool FirstInstInBlock = true;
 
   for (MachineBasicBlock::instr_iterator Iter = Block.instr_begin(),
                                          E = Block.instr_end();
@@ -2475,10 +2485,13 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
 
     bool FlushVmCnt = Block.getFirstTerminator() == Inst &&
                       isPreheaderToFlush(Block, ScoreBrackets);
+    bool FlushXCnt = FirstInstInBlock;
+    if (FirstInstInBlock)
+      FirstInstInBlock = false;
 
     // Generate an s_waitcnt instruction to be placed before Inst, if needed.
     Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr,
-                                          FlushVmCnt);
+                                          FlushVmCnt, FlushXCnt);
     OldWaitcntInstr = nullptr;
 
     // Restore vccz if it's not known to be correct already.
@@ -2567,7 +2580,7 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
 
   // Combine or remove any redundant waitcnts at the end of the block.
   Modified |= generateWaitcnt(Wait, Block.instr_end(), Block, ScoreBrackets,
-                              OldWaitcntInstr);
+                              OldWaitcntInstr, /*FlushXCnt=*/false);
 
   LLVM_DEBUG({
     dbgs() << "*** End Block: ";
diff --git a/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir b/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir
index 1b8e126f19ae1..2a80de849aec7 100644
--- a/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir
+++ b/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir
@@ -945,7 +945,6 @@ body: |
     $vgpr0 = V_MOV_B32_e32 0, implicit $exec
 ...
 
-# FIXME: Missing S_WAIT_XCNT before overwriting vgpr0.
 ---
 name: wait_kmcnt_with_outstanding_vmem_2
 tracksRegLiveness: true
@@ -970,6 +969,7 @@ body: |
   ; GCN-NEXT:   liveins: $sgpr2
   ; GCN-NEXT: {{  $}}
   ; GCN-NEXT:   S_WAIT_KMCNT 0
+  ; GCN-NEXT:   S_WAIT_XCNT 0
   ; GCN-NEXT:   $sgpr2 = S_MOV_B32 $sgpr2
   ; GCN-NEXT:   $vgpr0 = V_MOV_B32_e32 0, implicit $exec
   bb.0: