[llvm] [AMDGPU][SIInsertWaitCnts] Gfx12.5 - Refactor xcnt optimization (PR #164357)

Tue Nov 4 00:06:20 PST 2025

================
@@ -1287,40 +1289,38 @@ void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
   }
 }
 
-void WaitcntBrackets::applyXcnt(const AMDGPU::Waitcnt &Wait) {
-  // On entry to a block with multiple predescessors, there may
-  // be pending SMEM and VMEM events active at the same time.
-  // In such cases, only clear one active event at a time.
-  auto applyPendingXcntGroup = [this](unsigned E) {
-    unsigned LowerBound = getScoreLB(X_CNT);
-    applyWaitcnt(X_CNT, 0);
-    PendingEvents |= (1 << E);
-    setScoreLB(X_CNT, LowerBound);
-  };
-
+bool WaitcntBrackets::hasRedundantXCntWithKmCnt(const AMDGPU::Waitcnt &Wait) {
   // Wait on XCNT is redundant if we are already waiting for a load to complete.
   // SMEM can return out of order, so only omit XCNT wait if we are waiting till
   // zero.
-  if (Wait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP)) {
-    if (hasPendingEvent(VMEM_GROUP))
-      applyPendingXcntGroup(VMEM_GROUP);
-    else
-      applyWaitcnt(X_CNT, 0);
-    return;
-  }
+  return Wait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP);
+}
 
+bool WaitcntBrackets::canOptimizeXCntWithLoadCnt(const AMDGPU::Waitcnt &Wait) {
   // If we have pending store we cannot optimize XCnt because we do not wait for
   // stores. VMEM loads retun in order, so if we only have loads XCnt is
   // decremented to the same number as LOADCnt.
-  if (Wait.LoadCnt != ~0u && hasPendingEvent(VMEM_GROUP) &&
-      !hasPendingEvent(STORE_CNT)) {
-    if (hasPendingEvent(SMEM_GROUP))
-      applyPendingXcntGroup(SMEM_GROUP);
-    else
-      applyWaitcnt(X_CNT, std::min(Wait.XCnt, Wait.LoadCnt));
+  return Wait.LoadCnt != ~0u && hasPendingEvent(VMEM_GROUP) &&
+         !hasPendingEvent(STORE_CNT) && !hasPendingEvent(SMEM_GROUP);
+}
+
+void WaitcntBrackets::applyXcnt(const AMDGPU::Waitcnt &Wait) {
+  if (hasRedundantXCntWithKmCnt(Wait)) {
+    if (hasPendingEvent(VMEM_GROUP)) {
+      // Only clear the SMEM_GROUP event, but VMEM_GROUP could still require
+      // handling.
+      PendingEvents &= ~(1 << SMEM_GROUP);
+    } else {
+      applyWaitcnt(X_CNT, 0);
+    }
     return;
   }
-
+  if (canOptimizeXCntWithLoadCnt(Wait)) {
+    // On entry to a block with multiple predecessors, there may
+    // be pending SMEM and VMEM events active at the same time.
+    // In such cases, only clear one active event at a time.
----------------
easyonaadit wrote:

Small nit: this comment seems misplaced now; maybe it could be moved to the top of the function.

https://github.com/llvm/llvm-project/pull/164357