[llvm] [AMDGPU] Make WaitcntBrackets::simplifyWaitcnt const again (PR #173390)

via llvm-commits llvm-commits at lists.llvm.org
Tue Dec 23 07:55:41 PST 2025


llvmbot wrote:



@llvm/pr-subscribers-backend-amdgpu

Author: Jay Foad (jayfoad)

Changes:

The original design was:
- WaitcntBrackets::simplifyWaitcnt(Wait) updates Wait based on the
  current state of WaitcntBrackets, removing unnecessary waits.
- WaitcntBrackets::applyWaitcnt(Wait) updates WaitcntBrackets based on
  Wait, updating the state by applying the specified waits.
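
To make the contract concrete, here is a minimal sketch of the intended split (not the actual SIInsertWaitcnts.cpp code; it models a single LOADCNT-style counter with the usual score-bracket scheme):

```cpp
// A minimal sketch of the contract described above -- not the actual
// SIInsertWaitcnts.cpp code. It models one LOADCNT-style counter with
// a score bracket: UB advances as events are issued, LB advances as
// waits guarantee their completion.
namespace sketch {

struct Waitcnt {
  static constexpr unsigned NoWait = ~0u;
  unsigned LoadCnt = NoWait; // NoWait means "no wait required"
};

class WaitcntBrackets {
  unsigned LoadLB = 0; // events with score <= LoadLB have completed
  unsigned LoadUB = 0; // score of the most recently issued event

public:
  void issueLoad() { ++LoadUB; }

  // Query only (const): drop a wait the current state already
  // guarantees -- waiting until at most LoadCnt loads are outstanding
  // is redundant if no more than LoadCnt are outstanding now.
  void simplifyWaitcnt(Waitcnt &Wait) const {
    if (Wait.LoadCnt >= LoadUB - LoadLB)
      Wait.LoadCnt = Waitcnt::NoWait;
  }

  // Mutator only: record the effect of executing the wait, advancing
  // the lower bound so later queries see those loads as completed.
  void applyWaitcnt(const Waitcnt &Wait) {
    if (Wait.LoadCnt < LoadUB - LoadLB) // otherwise it is a no-op
      LoadLB = LoadUB - Wait.LoadCnt;
  }
};

} // namespace sketch
```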

This was changed by #164357, which started calling applyWaitcnt from
simplifyWaitcnt.

This patch restores the original design without any significant
functional changes. There is some code duplication because both
simplifyWaitcnt and applyWaitcnt need to understand how XCNT interacts
with other counters like LOADCNT and KMCNT.
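
With the split restored, callers return to a straightforward query, emit, mutate sequence. A usage example against the sketch class above (hypothetical, for illustration only):

```cpp
// Usage of the sketch above: issue two loads, wait until at most one
// is outstanding, and check that the bracket state answers a later
// query correctly.
int main() {
  sketch::WaitcntBrackets Brackets;
  Brackets.issueLoad();
  Brackets.issueLoad(); // two loads outstanding

  sketch::Waitcnt Wait;
  Wait.LoadCnt = 1;
  Brackets.simplifyWaitcnt(Wait); // const query: not redundant, kept
  Brackets.applyWaitcnt(Wait);    // mutator: one load now retired

  sketch::Waitcnt Again;
  Again.LoadCnt = 1;
  Brackets.simplifyWaitcnt(Again); // now redundant: dropped
  return Again.LoadCnt == sketch::Waitcnt::NoWait ? 0 : 1;
}
```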


---
Full diff: https://github.com/llvm/llvm-project/pull/173390.diff


2 Files Affected:

- (modified) llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp (+45-39) 
- (modified) llvm/test/CodeGen/AMDGPU/flat-load-saddr-to-vaddr.ll (+1) 


``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index e21583ae0876f..e927141ae0fe6 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -656,11 +656,11 @@ class WaitcntBrackets {
   bool merge(const WaitcntBrackets &Other);
 
   bool counterOutOfOrder(InstCounterType T) const;
-  void simplifyWaitcnt(AMDGPU::Waitcnt &Wait);
+  void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
   void simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
-  bool hasRedundantXCntWithKmCnt(const AMDGPU::Waitcnt &Wait);
-  bool canOptimizeXCntWithLoadCnt(const AMDGPU::Waitcnt &Wait);
-  void simplifyXcnt(AMDGPU::Waitcnt &CheckWait, AMDGPU::Waitcnt &UpdateWait);
+  bool hasRedundantXCntWithKmCnt(const AMDGPU::Waitcnt &Wait) const;
+  bool canOptimizeXCntWithLoadCnt(const AMDGPU::Waitcnt &Wait) const;
+  void simplifyXcnt(AMDGPU::Waitcnt &Wait) const;
 
   void determineWaitForPhysReg(InstCounterType T, MCPhysReg Reg,
                                AMDGPU::Waitcnt &Wait) const;
@@ -1212,7 +1212,7 @@ void WaitcntBrackets::print(raw_ostream &OS) const {
 
 /// Simplify the waitcnt, in the sense of removing redundant counts, and return
 /// whether a waitcnt instruction is needed at all.
-void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) {
+void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
   simplifyWaitcnt(LOAD_CNT, Wait.LoadCnt);
   simplifyWaitcnt(EXP_CNT, Wait.ExpCnt);
   simplifyWaitcnt(DS_CNT, Wait.DsCnt);
@@ -1220,7 +1220,7 @@ void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) {
   simplifyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);
   simplifyWaitcnt(BVH_CNT, Wait.BvhCnt);
   simplifyWaitcnt(KM_CNT, Wait.KmCnt);
-  simplifyXcnt(Wait, Wait);
+  simplifyXcnt(Wait);
 }
 
 void WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
@@ -1332,16 +1332,32 @@ void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
     setScoreLB(T, UB);
     PendingEvents &= ~Context->WaitEventMaskForInst[T];
   }
+
+  if (T == KM_CNT && Count == 0 && hasPendingEvent(SMEM_GROUP)) {
+    if (!hasMixedPendingEvents(X_CNT))
+      applyWaitcnt(X_CNT, 0);
+    else
+      PendingEvents &= ~(1 << SMEM_GROUP);
+  }
+  if (T == LOAD_CNT && hasPendingEvent(VMEM_GROUP) &&
+      !hasPendingEvent(STORE_CNT)) {
+    if (!hasMixedPendingEvents(X_CNT))
+      applyWaitcnt(X_CNT, Count);
+    else if (Count == 0)
+      PendingEvents &= ~(1 << VMEM_GROUP);
+  }
 }
 
-bool WaitcntBrackets::hasRedundantXCntWithKmCnt(const AMDGPU::Waitcnt &Wait) {
+bool WaitcntBrackets::hasRedundantXCntWithKmCnt(
+    const AMDGPU::Waitcnt &Wait) const {
   // Wait on XCNT is redundant if we are already waiting for a load to complete.
   // SMEM can return out of order, so only omit XCNT wait if we are waiting till
   // zero.
   return Wait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP);
 }
 
-bool WaitcntBrackets::canOptimizeXCntWithLoadCnt(const AMDGPU::Waitcnt &Wait) {
+bool WaitcntBrackets::canOptimizeXCntWithLoadCnt(
+    const AMDGPU::Waitcnt &Wait) const {
   // If we have pending store we cannot optimize XCnt because we do not wait for
   // stores. VMEM loads retun in order, so if we only have loads XCnt is
   // decremented to the same number as LOADCnt.
@@ -1349,27 +1365,17 @@ bool WaitcntBrackets::canOptimizeXCntWithLoadCnt(const AMDGPU::Waitcnt &Wait) {
          !hasPendingEvent(STORE_CNT);
 }
 
-void WaitcntBrackets::simplifyXcnt(AMDGPU::Waitcnt &CheckWait,
-                                   AMDGPU::Waitcnt &UpdateWait) {
+void WaitcntBrackets::simplifyXcnt(AMDGPU::Waitcnt &Wait) const {
   // Try to simplify xcnt further by checking for joint kmcnt and loadcnt
   // optimizations. On entry to a block with multiple predescessors, there may
   // be pending SMEM and VMEM events active at the same time.
   // In such cases, only clear one active event at a time.
   // TODO: Revisit xcnt optimizations for gfx1250.
-  if (hasRedundantXCntWithKmCnt(CheckWait)) {
-    if (!hasMixedPendingEvents(X_CNT)) {
-      applyWaitcnt(X_CNT, 0);
-    } else {
-      PendingEvents &= ~(1 << SMEM_GROUP);
-    }
-  } else if (canOptimizeXCntWithLoadCnt(CheckWait)) {
-    if (!hasMixedPendingEvents(X_CNT)) {
-      applyWaitcnt(X_CNT, std::min(CheckWait.XCnt, CheckWait.LoadCnt));
-    } else if (CheckWait.LoadCnt == 0) {
-      PendingEvents &= ~(1 << VMEM_GROUP);
-    }
-  }
-  simplifyWaitcnt(X_CNT, UpdateWait.XCnt);
+  if (hasRedundantXCntWithKmCnt(Wait))
+    Wait.XCnt = ~0u;
+  if (canOptimizeXCntWithLoadCnt(Wait) && Wait.XCnt >= Wait.LoadCnt)
+    Wait.XCnt = ~0u;
+  simplifyWaitcnt(X_CNT, Wait.XCnt);
 }
 
 // Where there are multiple types of event in the bracket of a counter,
@@ -1656,6 +1662,9 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
       dbgs() << *It;
   });
 
+  // Accumulate waits that should not be simplified.
+  AMDGPU::Waitcnt RequiredWait;
+
   for (auto &II :
        make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) {
     LLVM_DEBUG(dbgs() << "pre-existing iter: " << II);
@@ -1682,16 +1691,18 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
           TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
       AMDGPU::Waitcnt OldWait = AMDGPU::decodeLoadcntDscnt(IV, OldEnc);
       if (TrySimplify)
-        ScoreBrackets.simplifyWaitcnt(OldWait);
-      Wait = Wait.combined(OldWait);
+        Wait = Wait.combined(OldWait);
+      else
+        RequiredWait = RequiredWait.combined(OldWait);
       UpdatableInstr = &CombinedLoadDsCntInstr;
     } else if (Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT) {
       unsigned OldEnc =
           TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
       AMDGPU::Waitcnt OldWait = AMDGPU::decodeStorecntDscnt(IV, OldEnc);
       if (TrySimplify)
-        ScoreBrackets.simplifyWaitcnt(OldWait);
-      Wait = Wait.combined(OldWait);
+        Wait = Wait.combined(OldWait);
+      else
+        RequiredWait = RequiredWait.combined(OldWait);
       UpdatableInstr = &CombinedStoreDsCntInstr;
     } else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) {
       // Architectures higher than GFX10 do not have direct loads to
@@ -1704,8 +1715,9 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
       unsigned OldCnt =
           TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
       if (TrySimplify)
-        ScoreBrackets.simplifyWaitcnt(CT.value(), OldCnt);
-      addWait(Wait, CT.value(), OldCnt);
+        addWait(Wait, CT.value(), OldCnt);
+      else
+        addWait(RequiredWait, CT.value(), OldCnt);
       UpdatableInstr = &WaitInstrs[CT.value()];
     }
 
@@ -1718,8 +1730,9 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
     }
   }
 
-  // Save the pre combine waitcnt in order to make xcnt checks.
-  AMDGPU::Waitcnt PreCombine = Wait;
+  ScoreBrackets.simplifyWaitcnt(Wait);
+  Wait = Wait.combined(RequiredWait);
+
   if (CombinedLoadDsCntInstr) {
     // Only keep an S_WAIT_LOADCNT_DSCNT if both counters actually need
     // to be waited for. Otherwise, let the instruction be deleted so
@@ -1810,13 +1823,6 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
   }
 
   for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
-    if ((CT == KM_CNT && ScoreBrackets.hasRedundantXCntWithKmCnt(PreCombine)) ||
-        (CT == LOAD_CNT &&
-         ScoreBrackets.canOptimizeXCntWithLoadCnt(PreCombine))) {
-      // Xcnt may need to be updated depending on a pre-existing KM/LOAD_CNT
-      // due to taking the backedge of a block.
-      ScoreBrackets.simplifyXcnt(PreCombine, Wait);
-    }
     if (!WaitInstrs[CT])
       continue;
 
diff --git a/llvm/test/CodeGen/AMDGPU/flat-load-saddr-to-vaddr.ll b/llvm/test/CodeGen/AMDGPU/flat-load-saddr-to-vaddr.ll
index 29aad9c5f9dc1..58a679fa8bf9d 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-load-saddr-to-vaddr.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-load-saddr-to-vaddr.ll
@@ -28,6 +28,7 @@ define amdgpu_kernel void @test_move_load_address_to_vgpr(ptr addrspace(1) nocap
 ; GCN-NEXT:    s_wait_dscnt 0x0
 ; GCN-NEXT:    flat_load_b32 v3, v[0:1] scope:SCOPE_SYS
 ; GCN-NEXT:    s_wait_loadcnt 0x0
+; GCN-NEXT:    s_wait_xcnt 0x0
 ; GCN-NEXT:    v_add_nc_u64_e32 v[0:1], 4, v[0:1]
 ; GCN-NEXT:    v_add_co_u32 v2, s0, v2, 1
 ; GCN-NEXT:    s_and_b32 vcc_lo, exec_lo, s0

``````````



https://github.com/llvm/llvm-project/pull/173390


More information about the llvm-commits mailing list