[llvm] [AMDGPU] Fix missing S_WAIT_XCNT with multiple pending VMEMs (PR #166779)

Thu Nov 6 06:20:03 PST 2025

https://github.com/jayfoad created https://github.com/llvm/llvm-project/pull/166779

- **Tweak test case**
- **Simplify and inline applyPendingXcntGroup. NFC.**
- **Refactor with hasMixedPendingEvents. NFC.**
- **Refactor. NFC.**
- **[AMDGPU] Fix missing S_WAIT_XCNT with multiple pending VMEMs**


>From 5806f111a171214a4fbff82d2a9205543ab867ad Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Thu, 6 Nov 2025 13:55:01 +0000
Subject: [PATCH 1/5] Tweak test case

Using different offsets for the two global loads just makes it more
obvious that the second one could fail address translation even if the
first one succeeds.
---
 llvm/test/CodeGen/AMDGPU/wait-xcnt.mir | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir b/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir
index f964480dcc633..6be6edc13b813 100644
--- a/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir
+++ b/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir
@@ -1088,8 +1088,8 @@ body: |
   ; GCN-NEXT:   successors: %bb.2(0x80000000)
   ; GCN-NEXT:   liveins: $vgpr0_vgpr1, $sgpr2
   ; GCN-NEXT: {{  $}}
-  ; GCN-NEXT:   $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
-  ; GCN-NEXT:   $vgpr3 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
+  ; GCN-NEXT:   $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 100, 0, implicit $exec
+  ; GCN-NEXT:   $vgpr3 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 200, 0, implicit $exec
   ; GCN-NEXT: {{  $}}
   ; GCN-NEXT: bb.2:
   ; GCN-NEXT:   liveins: $sgpr2, $vgpr2
@@ -1105,8 +1105,8 @@ body: |
     S_CBRANCH_SCC1 %bb.2, implicit $scc
   bb.1:
     liveins: $vgpr0_vgpr1, $sgpr2
-    $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
-    $vgpr3 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
+    $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 100, 0, implicit $exec
+    $vgpr3 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 200, 0, implicit $exec
   bb.2:
     liveins: $sgpr2, $vgpr2
     $vgpr2 = V_MOV_B32_e32 $vgpr2, implicit $exec

>From 5b828b05b04c22eb41a316dba25500283edfbd14 Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Thu, 6 Nov 2025 13:58:35 +0000
Subject: [PATCH 2/5] Simplify and inline applyPendingXcntGroup. NFC.

---
 llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index b7fa899678ec7..06824d202feee 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1291,19 +1291,13 @@ void WaitcntBrackets::applyXcnt(const AMDGPU::Waitcnt &Wait) {
   // On entry to a block with multiple predescessors, there may
   // be pending SMEM and VMEM events active at the same time.
   // In such cases, only clear one active event at a time.
-  auto applyPendingXcntGroup = [this](unsigned E) {
-    unsigned LowerBound = getScoreLB(X_CNT);
-    applyWaitcnt(X_CNT, 0);
-    PendingEvents |= (1 << E);
-    setScoreLB(X_CNT, LowerBound);
-  };
 
   // Wait on XCNT is redundant if we are already waiting for a load to complete.
   // SMEM can return out of order, so only omit XCNT wait if we are waiting till
   // zero.
   if (Wait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP)) {
     if (hasPendingEvent(VMEM_GROUP))
-      applyPendingXcntGroup(VMEM_GROUP);
+      PendingEvents &= ~(1 << SMEM_GROUP);
     else
       applyWaitcnt(X_CNT, 0);
     return;
@@ -1315,7 +1309,7 @@ void WaitcntBrackets::applyXcnt(const AMDGPU::Waitcnt &Wait) {
   if (Wait.LoadCnt != ~0u && hasPendingEvent(VMEM_GROUP) &&
       !hasPendingEvent(STORE_CNT)) {
     if (hasPendingEvent(SMEM_GROUP))
-      applyPendingXcntGroup(SMEM_GROUP);
+      PendingEvents &= ~(1 << VMEM_GROUP);
     else
       applyWaitcnt(X_CNT, std::min(Wait.XCnt, Wait.LoadCnt));
     return;

>From 80420954851cda47ac9e149a33b913c8aff9e15e Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Thu, 6 Nov 2025 14:01:28 +0000
Subject: [PATCH 3/5] Refactor with hasMixedPendingEvents. NFC.

This is just stylistic. It allows using the identical condition in the
SMEM and VMEM cases and potentially guards against more X_CNT event
types being added in future.
---
 llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 06824d202feee..eb247c0dbcdd3 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1296,7 +1296,7 @@ void WaitcntBrackets::applyXcnt(const AMDGPU::Waitcnt &Wait) {
   // SMEM can return out of order, so only omit XCNT wait if we are waiting till
   // zero.
   if (Wait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP)) {
-    if (hasPendingEvent(VMEM_GROUP))
+    if (hasMixedPendingEvents(X_CNT))
       PendingEvents &= ~(1 << SMEM_GROUP);
     else
       applyWaitcnt(X_CNT, 0);
@@ -1308,7 +1308,7 @@ void WaitcntBrackets::applyXcnt(const AMDGPU::Waitcnt &Wait) {
   // decremented to the same number as LOADCnt.
   if (Wait.LoadCnt != ~0u && hasPendingEvent(VMEM_GROUP) &&
       !hasPendingEvent(STORE_CNT)) {
-    if (hasPendingEvent(SMEM_GROUP))
+    if (hasMixedPendingEvents(X_CNT))
       PendingEvents &= ~(1 << VMEM_GROUP);
     else
       applyWaitcnt(X_CNT, std::min(Wait.XCnt, Wait.LoadCnt));

>From fb6364c49b4a7f73bfff2ff6af69399014ce34ee Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Thu, 6 Nov 2025 14:17:24 +0000
Subject: [PATCH 4/5] Refactor. NFC.

---
 llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index eb247c0dbcdd3..e5b8ef7a8d333 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1296,10 +1296,10 @@ void WaitcntBrackets::applyXcnt(const AMDGPU::Waitcnt &Wait) {
   // SMEM can return out of order, so only omit XCNT wait if we are waiting till
   // zero.
   if (Wait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP)) {
-    if (hasMixedPendingEvents(X_CNT))
-      PendingEvents &= ~(1 << SMEM_GROUP);
-    else
+    if (!hasMixedPendingEvents(X_CNT))
       applyWaitcnt(X_CNT, 0);
+    else
+      PendingEvents &= ~(1 << SMEM_GROUP);
     return;
   }
 
@@ -1308,10 +1308,10 @@ void WaitcntBrackets::applyXcnt(const AMDGPU::Waitcnt &Wait) {
   // decremented to the same number as LOADCnt.
   if (Wait.LoadCnt != ~0u && hasPendingEvent(VMEM_GROUP) &&
       !hasPendingEvent(STORE_CNT)) {
-    if (hasMixedPendingEvents(X_CNT))
-      PendingEvents &= ~(1 << VMEM_GROUP);
-    else
+    if (!hasMixedPendingEvents(X_CNT))
       applyWaitcnt(X_CNT, std::min(Wait.XCnt, Wait.LoadCnt));
+    else
+      PendingEvents &= ~(1 << VMEM_GROUP);
     return;
   }
 

>From e3e274ffebf5086d013b1e43b62a5f2436dea753 Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Thu, 6 Nov 2025 14:16:12 +0000
Subject: [PATCH 5/5] [AMDGPU] Fix missing S_WAIT_XCNT with multiple pending
 VMEMs

---
 llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 2 +-
 llvm/test/CodeGen/AMDGPU/wait-xcnt.mir      | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index e5b8ef7a8d333..306d59d0867cd 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1310,7 +1310,7 @@ void WaitcntBrackets::applyXcnt(const AMDGPU::Waitcnt &Wait) {
       !hasPendingEvent(STORE_CNT)) {
     if (!hasMixedPendingEvents(X_CNT))
       applyWaitcnt(X_CNT, std::min(Wait.XCnt, Wait.LoadCnt));
-    else
+    else if (Wait.LoadCnt == 0)
       PendingEvents &= ~(1 << VMEM_GROUP);
     return;
   }
diff --git a/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir b/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir
index 6be6edc13b813..fe16f0d44dd1c 100644
--- a/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir
+++ b/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir
@@ -1069,7 +1069,6 @@ body: |
     $sgpr0 = S_MOV_B32 $sgpr0
 ...
 
-# FIXME: Missing S_WAIT_XCNT before overwriting vgpr0.
 ---
 name: mixed_pending_events
 tracksRegLiveness: true
@@ -1098,6 +1097,7 @@ body: |
   ; GCN-NEXT:   $vgpr2 = V_MOV_B32_e32 $vgpr2, implicit $exec
   ; GCN-NEXT:   S_WAIT_KMCNT 0
   ; GCN-NEXT:   $sgpr2 = S_MOV_B32 $sgpr2
+  ; GCN-NEXT:   S_WAIT_XCNT 0
   ; GCN-NEXT:   $vgpr0 = V_MOV_B32_e32 0, implicit $exec
   bb.0:
     liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc