[llvm] [AMDGPU] Fix broken XCNT Test cases (PR #160812)

Tue Sep 30 06:44:59 PDT 2025

https://github.com/easyonaadit updated https://github.com/llvm/llvm-project/pull/160812

>From f3389a27ff55e00cd14d433960568022b814e6cc Mon Sep 17 00:00:00 2001
From: Aaditya <Aaditya.AlokDeshpande at amd.com>
Date: Fri, 26 Sep 2025 11:26:22 +0530
Subject: [PATCH 1/8] Prototye #1 The hardware inserts an implicit xcnt between
 alternating SMEM and VMEM events. If we encounter an SMEM event, we need to
 check if there are any pending VMEM events in the queue, and mark those as
 completed. And visa-versa.

---
 llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 50 +++++++++++++++------
 1 file changed, 36 insertions(+), 14 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index ae75fb529dade..adcf219675298 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -117,12 +117,12 @@ struct HardwareLimits {
   DECL(VMEM_BVH_READ_ACCESS)     /* vmem BVH read (gfx12+ only) */             \
   DECL(VMEM_WRITE_ACCESS)        /* vmem write that is not scratch */          \
   DECL(SCRATCH_WRITE_ACCESS)     /* vmem write that may be scratch */          \
-  DECL(VMEM_GROUP)               /* vmem group */                              \
+  DECL(VMEM_GROUP) /**Bit:7 in Pending events */               /* vmem group */                              \
   DECL(LDS_ACCESS)               /* lds read & write */                        \
   DECL(GDS_ACCESS)               /* gds read & write */                        \
   DECL(SQ_MESSAGE)               /* send message */                            \
   DECL(SMEM_ACCESS)              /* scalar-memory read & write */              \
-  DECL(SMEM_GROUP)               /* scalar-memory group */                     \
+  DECL(SMEM_GROUP) /**Bit:12 */               /* scalar-memory group */                     \
   DECL(EXP_GPR_LOCK)             /* export holding on its data src */          \
   DECL(GDS_GPR_LOCK)             /* GDS holding on its data and addr src */    \
   DECL(EXP_POS_ACCESS)           /* write to export position */                \
@@ -830,7 +830,7 @@ RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
   RegInterval Result;
 
   MCRegister MCReg = AMDGPU::getMCReg(Op.getReg(), *Context->ST);
-  unsigned RegIdx = TRI->getHWRegIndex(MCReg);
+  unsigned RegIdx = TRI->getHWRegIndex(MCReg); // regidx = 2 iff vgpr2
 
   const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Op.getReg());
   unsigned Size = TRI->getRegSizeInBits(*RC);
@@ -908,6 +908,16 @@ bool WaitcntBrackets::hasPointSamplePendingVmemTypes(
   return hasOtherPendingVmemTypes(Interval, VMEM_NOSAMPLER);
 }
 
+template <size_t N>
+static unsigned getMaxVal(unsigned (&arr)[N]) {
+  unsigned max = arr[0];
+  // size_t size = sizeof(arr)/sizeof(unsigned);
+  for(size_t i = 0; i < N; i++) {
+    if(arr[i] > max) max = arr[i];
+  }
+  return max;
+}
+
 void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
                                     const SIRegisterInfo *TRI,
                                     const MachineRegisterInfo *MRI,
@@ -915,7 +925,7 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
   InstCounterType T = eventCounter(Context->WaitEventMaskForInst, E);
 
   unsigned UB = getScoreUB(T);
-  unsigned CurrScore = UB + 1;
+  unsigned CurrScore = UB + 1; // there is one more of this type of event in the queue.
   if (CurrScore == 0)
     report_fatal_error("InsertWaitcnt score wraparound");
   // PendingEvents and ScoreUB need to be update regardless if this event
@@ -1003,7 +1013,19 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
           setScoreByOperand(&Inst, TRI, MRI, Op, EXP_CNT, CurrScore);
       }
     }
-  } else if (T == X_CNT) {
+  } else if (T == X_CNT) { // Here would be a good place to add check for interleaved smem vmem access. something like: 1) alternate_wait_mask = E == SMEM_GROUP ? VMEM_GROUP : SMEM_GROUP; (2)checkflag = PendingEvents & alternate_wait_mask; (3) if(checkflag != 0) this means that we already have a pending wait event for the alternate group(ie, if we are currently checking for SMEM_GROUP, then this is telling us that we already have a pending VMEM_GROUP event, or vica versa. Since there is an implicit hardware xcnt in between them, we can remove this previous event => (3a)clear that bit in pending events (3b)update the sgpr/vgpr scores for that event.)
+    WaitEventType alt = E == SMEM_GROUP ? VMEM_GROUP : SMEM_GROUP;
+    bool altEventIsActive = (PendingEvents & (1 << alt)) != 0;
+    if(altEventIsActive) {
+      // Hardware inserts an implicit xcnt between SMEM
+      // and VMEM operations. so no need to insert here.
+      // Mark the previous Operation as completed.
+      unsigned max = E == SMEM_GROUP ? getMaxVal(VgprScores[T])
+                                     : getMaxVal(SgprScores[1]);
+      // unsigned newLB = getScoreLB(T); /*getLB score of alt envent type*/
+      setScoreLB(T, max);
+      PendingEvents &= ~(1 << alt);
+    }
     for (const MachineOperand &Op : Inst.all_uses())
       setScoreByOperand(&Inst, TRI, MRI, Op, T, CurrScore);
   } else /* LGKM_CNT || EXP_CNT || VS_CNT || NUM_INST_CNTS */ {
@@ -1211,7 +1233,7 @@ void WaitcntBrackets::determineWait(InstCounterType T, RegInterval Interval,
       } else {
         // If a counter has been maxed out avoid overflow by waiting for
         // MAX(CounterType) - 1 instead.
-        unsigned NeededWait =
+        unsigned NeededWait = // we need to wait till the counter is this value. for eg: if needtowait = 0; -> means: S_WAIT_XCNT 0
             std::min(UB - ScoreToWait, Context->getWaitCountMax(T) - 1);
         addWait(Wait, T, NeededWait);
       }
@@ -1239,8 +1261,8 @@ void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
       return;
     setScoreLB(T, std::max(getScoreLB(T), UB - Count));
   } else {
-    setScoreLB(T, UB);
-    PendingEvents &= ~Context->WaitEventMaskForInst[T];
+    setScoreLB(T, UB); // LB and UB are a sort of sliding window over the number of wait events which need to be completed.
+    PendingEvents &= ~Context->WaitEventMaskForInst[T]; // mark this event as no longer pending // i think the issue is, when we apply xcnt for say SMEM, we also wipe out the pending events mask for vmem
   }
 }
 
@@ -1248,7 +1270,7 @@ void WaitcntBrackets::applyXcnt(const AMDGPU::Waitcnt &Wait) {
   // Wait on XCNT is redundant if we are already waiting for a load to complete.
   // SMEM can return out of order, so only omit XCNT wait if we are waiting till
   // zero.
-  if (Wait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP))
+  if (Wait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP)) // i cant understand this? if we have kmcnt = 0, that means we are waiting till the smem load will be complete. so then we dont need the xcnt right? or maybe we apply this for now, and then it gets merged with the kmcnt later.
     return applyWaitcnt(X_CNT, 0);
 
   // If we have pending store we cannot optimize XCnt because we do not wait for
@@ -1979,7 +2001,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
           // RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the
           // previous write and this write are the same type of VMEM
           // instruction, in which case they are (in some architectures)
-          // guaranteed to write their results in order anyway.
+          // guaranteed to write their results in order anyway. // I feel this would be a good point to insert the logic that hardware adds an implicit xcnt between vmem and smem. Still need to figure out how to know if the previous op was an smem or vmem?
           // Additionally check instructions where Point Sample Acceleration
           // might be applied.
           if (Op.isUse() || !updateVMCntOnly(MI) ||
@@ -2001,7 +2023,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
           ScoreBrackets.determineWait(SmemAccessCounter, Interval, Wait);
         }
 
-        if (hasXcnt() && Op.isDef())
+        if (hasXcnt() && Op.isDef()) //here i can maybe add something like: if(MI.isVMEM && xcnt != 0)clear(xcnt) if(MI.isSMEM && xcnt != 0)clear(xcnt)
           ScoreBrackets.determineWait(X_CNT, Interval, Wait);
       }
     }
@@ -2081,7 +2103,7 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
 
   if (OldWaitcntInstr)
     // Try to merge the required wait with preexisting waitcnt instructions.
-    // Also erase redundant waitcnt.
+    // Also erase redundant waitcnt. // check how xcnt is being merged. maybe this is leading to some issues.
     Modified =
         WCG->applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, It);
 
@@ -2256,7 +2278,7 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
                                                WaitcntBrackets *ScoreBrackets) {
   // Now look at the instruction opcode. If it is a memory access
   // instruction, update the upper-bound of the appropriate counter's
-  // bracket and the destination operand scores.
+  // bracket and the destination operand scores. // add comment about if this is xcnt then we need to update the source opernads as well.
   // TODO: Use the (TSFlags & SIInstrFlags::DS_CNT) property everywhere.
 
   bool IsVMEMAccess = false;
@@ -2357,7 +2379,7 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
   if (!hasXcnt())
     return;
 
-  if (IsVMEMAccess)
+  if (IsVMEMAccess) // this might also be a good place to add the logic for implicit xcnt inserted by the hardware. 
     ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_GROUP, Inst);
 
   if (IsSMEMAccess)

>From 4a6440e4065950fc1cff16553ab8157706b8eb01 Mon Sep 17 00:00:00 2001
From: Aaditya <Aaditya.AlokDeshpande at amd.com>
Date: Fri, 26 Sep 2025 11:33:36 +0530
Subject: [PATCH 2/8] Updating test case

---
 llvm/test/CodeGen/AMDGPU/wait-xcnt.mir | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir b/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir
index af8b9e7c8cd82..0385cc7c32cff 100644
--- a/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir
+++ b/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir
@@ -520,6 +520,7 @@ body: |
     ; GCN-NEXT: GLOBAL_STORE_DWORDX2 $vgpr0_vgpr1, $vgpr4_vgpr5, 16, 0, implicit $exec
     ; GCN-NEXT: S_WAIT_KMCNT 0
     ; GCN-NEXT: $sgpr2 = S_ADD_I32 $sgpr0, 100, implicit-def $scc
+    ; GCN-NEXT: S_WAIT_XCNT 0
     ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 20, implicit $exec
     $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM $sgpr0_sgpr1, 0, 0 :: (load (s64), addrspace 4)
     $vgpr0 = V_MOV_B32_e32 1, implicit $exec
@@ -937,6 +938,7 @@ body: |
     ; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
     ; GCN-NEXT: S_WAIT_KMCNT 0
     ; GCN-NEXT: $sgpr2 = S_MOV_B32 $sgpr2
+    ; GCN-NEXT: S_WAIT_XCNT 0
     ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
     $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
     $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
@@ -960,6 +962,7 @@ body: |
     ; GCN-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
     ; GCN-NEXT: S_WAIT_LOADCNT 0
     ; GCN-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr2, implicit $exec
+    ; GCN-NEXT: S_WAIT_XCNT 0
     ; GCN-NEXT: $sgpr0 = S_MOV_B32 0
     $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
     $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
@@ -981,7 +984,6 @@ body: |
     ; GCN-NEXT: {{  $}}
     ; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
     ; GCN-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
-    ; GCN-NEXT: S_WAIT_XCNT 0
     ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
     $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
     $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
@@ -1002,7 +1004,6 @@ body: |
     ; GCN-NEXT: {{  $}}
     ; GCN-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
     ; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
-    ; GCN-NEXT: S_WAIT_XCNT 0
     ; GCN-NEXT: $sgpr0 = S_MOV_B32 0
     $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
     $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec

>From f1434fe7ab0a69adb904e95ccadddd62d43bb040 Mon Sep 17 00:00:00 2001
From: Aaditya <Aaditya.AlokDeshpande at amd.com>
Date: Fri, 26 Sep 2025 11:46:12 +0530
Subject: [PATCH 3/8] Code cleanup

---
 llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 46 ++++++++++++++-------
 1 file changed, 31 insertions(+), 15 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index adcf219675298..dc0b0694aaaa1 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -117,12 +117,12 @@ struct HardwareLimits {
   DECL(VMEM_BVH_READ_ACCESS)     /* vmem BVH read (gfx12+ only) */             \
   DECL(VMEM_WRITE_ACCESS)        /* vmem write that is not scratch */          \
   DECL(SCRATCH_WRITE_ACCESS)     /* vmem write that may be scratch */          \
-  DECL(VMEM_GROUP) /**Bit:7 in Pending events */               /* vmem group */                              \
+  DECL(VMEM_GROUP)               /* vmem group */                              \
   DECL(LDS_ACCESS)               /* lds read & write */                        \
   DECL(GDS_ACCESS)               /* gds read & write */                        \
   DECL(SQ_MESSAGE)               /* send message */                            \
   DECL(SMEM_ACCESS)              /* scalar-memory read & write */              \
-  DECL(SMEM_GROUP) /**Bit:12 */               /* scalar-memory group */                     \
+  DECL(SMEM_GROUP)               /* scalar-memory group */                     \
   DECL(EXP_GPR_LOCK)             /* export holding on its data src */          \
   DECL(GDS_GPR_LOCK)             /* GDS holding on its data and addr src */    \
   DECL(EXP_POS_ACCESS)           /* write to export position */                \
@@ -830,7 +830,7 @@ RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
   RegInterval Result;
 
   MCRegister MCReg = AMDGPU::getMCReg(Op.getReg(), *Context->ST);
-  unsigned RegIdx = TRI->getHWRegIndex(MCReg); // regidx = 2 iff vgpr2
+  unsigned RegIdx = TRI->getHWRegIndex(MCReg);
 
   const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Op.getReg());
   unsigned Size = TRI->getRegSizeInBits(*RC);
@@ -911,7 +911,6 @@ bool WaitcntBrackets::hasPointSamplePendingVmemTypes(
 template <size_t N>
 static unsigned getMaxVal(unsigned (&arr)[N]) {
   unsigned max = arr[0];
-  // size_t size = sizeof(arr)/sizeof(unsigned);
   for(size_t i = 0; i < N; i++) {
     if(arr[i] > max) max = arr[i];
   }
@@ -925,7 +924,7 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
   InstCounterType T = eventCounter(Context->WaitEventMaskForInst, E);
 
   unsigned UB = getScoreUB(T);
-  unsigned CurrScore = UB + 1; // there is one more of this type of event in the queue.
+  unsigned CurrScore = UB + 1;
   if (CurrScore == 0)
     report_fatal_error("InsertWaitcnt score wraparound");
   // PendingEvents and ScoreUB need to be update regardless if this event
@@ -1013,16 +1012,33 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
           setScoreByOperand(&Inst, TRI, MRI, Op, EXP_CNT, CurrScore);
       }
     }
-  } else if (T == X_CNT) { // Here would be a good place to add check for interleaved smem vmem access. something like: 1) alternate_wait_mask = E == SMEM_GROUP ? VMEM_GROUP : SMEM_GROUP; (2)checkflag = PendingEvents & alternate_wait_mask; (3) if(checkflag != 0) this means that we already have a pending wait event for the alternate group(ie, if we are currently checking for SMEM_GROUP, then this is telling us that we already have a pending VMEM_GROUP event, or vica versa. Since there is an implicit hardware xcnt in between them, we can remove this previous event => (3a)clear that bit in pending events (3b)update the sgpr/vgpr scores for that event.)
+  } else if (T == X_CNT) {
+    // Here would be a good place to add check for interleaved
+    // smem vmem access. something like: 
+    // 1) alternate_wait_mask = E == SMEM_GROUP ? VMEM_GROUP : SMEM_GROUP; 
+    // (2)checkflag = PendingEvents & alternate_wait_mask; 
+    // (3) if(checkflag != 0) this means that we already have a pending 
+    //          wait event for the alternate group(ie, if we are currently 
+    //          checking for SMEM_GROUP, then this is telling us that we 
+    //          already have a pending VMEM_GROUP event, or vica versa. 
+    //          Since there is an implicit hardware xcnt in between them, 
+    //          we can remove this previous event => 
+    //    (3a)clear that bit in pending events 
+    //    (3b)update the sgpr/vgpr scores for that event.)
     WaitEventType alt = E == SMEM_GROUP ? VMEM_GROUP : SMEM_GROUP;
     bool altEventIsActive = (PendingEvents & (1 << alt)) != 0;
     if(altEventIsActive) {
       // Hardware inserts an implicit xcnt between SMEM
       // and VMEM operations. so no need to insert here.
-      // Mark the previous Operation as completed.
+      // Mark the previous mem Operation as completed.
+
+      // LB and UB scores are a sliding window about the total
+      // number of mem operations launched, vs number of mem ops
+      // completed uptil this point.
+      // set the new lower bound as the previous upper bound
+      // to clear that event.
       unsigned max = E == SMEM_GROUP ? getMaxVal(VgprScores[T])
                                      : getMaxVal(SgprScores[1]);
-      // unsigned newLB = getScoreLB(T); /*getLB score of alt envent type*/
       setScoreLB(T, max);
       PendingEvents &= ~(1 << alt);
     }
@@ -1233,7 +1249,7 @@ void WaitcntBrackets::determineWait(InstCounterType T, RegInterval Interval,
       } else {
         // If a counter has been maxed out avoid overflow by waiting for
         // MAX(CounterType) - 1 instead.
-        unsigned NeededWait = // we need to wait till the counter is this value. for eg: if needtowait = 0; -> means: S_WAIT_XCNT 0
+        unsigned NeededWait =
             std::min(UB - ScoreToWait, Context->getWaitCountMax(T) - 1);
         addWait(Wait, T, NeededWait);
       }
@@ -1262,7 +1278,7 @@ void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
     setScoreLB(T, std::max(getScoreLB(T), UB - Count));
   } else {
     setScoreLB(T, UB); // LB and UB are a sort of sliding window over the number of wait events which need to be completed.
-    PendingEvents &= ~Context->WaitEventMaskForInst[T]; // mark this event as no longer pending // i think the issue is, when we apply xcnt for say SMEM, we also wipe out the pending events mask for vmem
+    PendingEvents &= ~Context->WaitEventMaskForInst[T]; // mark this event as no longer pending
   }
 }
 
@@ -1270,7 +1286,7 @@ void WaitcntBrackets::applyXcnt(const AMDGPU::Waitcnt &Wait) {
   // Wait on XCNT is redundant if we are already waiting for a load to complete.
   // SMEM can return out of order, so only omit XCNT wait if we are waiting till
   // zero.
-  if (Wait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP)) // i cant understand this? if we have kmcnt = 0, that means we are waiting till the smem load will be complete. so then we dont need the xcnt right? or maybe we apply this for now, and then it gets merged with the kmcnt later.
+  if (Wait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP))
     return applyWaitcnt(X_CNT, 0);
 
   // If we have pending store we cannot optimize XCnt because we do not wait for
@@ -2001,7 +2017,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
           // RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the
           // previous write and this write are the same type of VMEM
           // instruction, in which case they are (in some architectures)
-          // guaranteed to write their results in order anyway. // I feel this would be a good point to insert the logic that hardware adds an implicit xcnt between vmem and smem. Still need to figure out how to know if the previous op was an smem or vmem?
+          // guaranteed to write their results in order anyway.
           // Additionally check instructions where Point Sample Acceleration
           // might be applied.
           if (Op.isUse() || !updateVMCntOnly(MI) ||
@@ -2023,7 +2039,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
           ScoreBrackets.determineWait(SmemAccessCounter, Interval, Wait);
         }
 
-        if (hasXcnt() && Op.isDef()) //here i can maybe add something like: if(MI.isVMEM && xcnt != 0)clear(xcnt) if(MI.isSMEM && xcnt != 0)clear(xcnt)
+        if (hasXcnt() && Op.isDef())
           ScoreBrackets.determineWait(X_CNT, Interval, Wait);
       }
     }
@@ -2103,7 +2119,7 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
 
   if (OldWaitcntInstr)
     // Try to merge the required wait with preexisting waitcnt instructions.
-    // Also erase redundant waitcnt. // check how xcnt is being merged. maybe this is leading to some issues.
+    // Also erase redundant waitcnt.
     Modified =
         WCG->applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, It);
 
@@ -2379,7 +2395,7 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
   if (!hasXcnt())
     return;
 
-  if (IsVMEMAccess) // this might also be a good place to add the logic for implicit xcnt inserted by the hardware. 
+  if (IsVMEMAccess)
     ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_GROUP, Inst);
 
   if (IsSMEMAccess)

>From 6822ebfd557c40b172e82fa96ca43dc46afd89b6 Mon Sep 17 00:00:00 2001
From: Aaditya <Aaditya.AlokDeshpande at amd.com>
Date: Fri, 26 Sep 2025 11:55:46 +0530
Subject: [PATCH 4/8] Updating one more test case

---
 llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll
index 243f0ed3a8d0d..f8655a702180e 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll
@@ -256,7 +256,6 @@ define amdgpu_kernel void @uniform_unconditional_min_long_forward_branch(ptr add
 ; GCN-NEXT:    s_wait_storecnt 0x0
 ; GCN-NEXT:  .LBB5_3: ; %bb4
 ; GCN-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GCN-NEXT:    s_wait_xcnt 0x0
 ; GCN-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 63
 ; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS

>From 04e97c57107adf53c7669eb9a4cf09c29a9a3b26 Mon Sep 17 00:00:00 2001
From: Aaditya <Aaditya.AlokDeshpande at amd.com>
Date: Fri, 26 Sep 2025 11:59:37 +0530
Subject: [PATCH 5/8] Running clang format

---
 llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 44 +++++++++++----------
 1 file changed, 24 insertions(+), 20 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index dc0b0694aaaa1..de01621d72524 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -908,11 +908,11 @@ bool WaitcntBrackets::hasPointSamplePendingVmemTypes(
   return hasOtherPendingVmemTypes(Interval, VMEM_NOSAMPLER);
 }
 
-template <size_t N>
-static unsigned getMaxVal(unsigned (&arr)[N]) {
+template <size_t N> static unsigned getMaxVal(unsigned (&arr)[N]) {
   unsigned max = arr[0];
-  for(size_t i = 0; i < N; i++) {
-    if(arr[i] > max) max = arr[i];
+  for (size_t i = 0; i < N; i++) {
+    if (arr[i] > max)
+      max = arr[i];
   }
   return max;
 }
@@ -1014,20 +1014,20 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
     }
   } else if (T == X_CNT) {
     // Here would be a good place to add check for interleaved
-    // smem vmem access. something like: 
-    // 1) alternate_wait_mask = E == SMEM_GROUP ? VMEM_GROUP : SMEM_GROUP; 
-    // (2)checkflag = PendingEvents & alternate_wait_mask; 
-    // (3) if(checkflag != 0) this means that we already have a pending 
-    //          wait event for the alternate group(ie, if we are currently 
-    //          checking for SMEM_GROUP, then this is telling us that we 
-    //          already have a pending VMEM_GROUP event, or vica versa. 
-    //          Since there is an implicit hardware xcnt in between them, 
-    //          we can remove this previous event => 
-    //    (3a)clear that bit in pending events 
+    // smem vmem access. something like:
+    // 1) alternate_wait_mask = E == SMEM_GROUP ? VMEM_GROUP : SMEM_GROUP;
+    // (2)checkflag = PendingEvents & alternate_wait_mask;
+    // (3) if(checkflag != 0) this means that we already have a pending
+    //          wait event for the alternate group(ie, if we are currently
+    //          checking for SMEM_GROUP, then this is telling us that we
+    //          already have a pending VMEM_GROUP event, or vica versa.
+    //          Since there is an implicit hardware xcnt in between them,
+    //          we can remove this previous event =>
+    //    (3a)clear that bit in pending events
     //    (3b)update the sgpr/vgpr scores for that event.)
     WaitEventType alt = E == SMEM_GROUP ? VMEM_GROUP : SMEM_GROUP;
     bool altEventIsActive = (PendingEvents & (1 << alt)) != 0;
-    if(altEventIsActive) {
+    if (altEventIsActive) {
       // Hardware inserts an implicit xcnt between SMEM
       // and VMEM operations. so no need to insert here.
       // Mark the previous mem Operation as completed.
@@ -1037,8 +1037,8 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
       // completed uptil this point.
       // set the new lower bound as the previous upper bound
       // to clear that event.
-      unsigned max = E == SMEM_GROUP ? getMaxVal(VgprScores[T])
-                                     : getMaxVal(SgprScores[1]);
+      unsigned max =
+          E == SMEM_GROUP ? getMaxVal(VgprScores[T]) : getMaxVal(SgprScores[1]);
       setScoreLB(T, max);
       PendingEvents &= ~(1 << alt);
     }
@@ -1277,8 +1277,11 @@ void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
       return;
     setScoreLB(T, std::max(getScoreLB(T), UB - Count));
   } else {
-    setScoreLB(T, UB); // LB and UB are a sort of sliding window over the number of wait events which need to be completed.
-    PendingEvents &= ~Context->WaitEventMaskForInst[T]; // mark this event as no longer pending
+    setScoreLB(T, UB); // LB and UB are a sort of sliding window over the number
+                       // of wait events which need to be completed.
+    PendingEvents &=
+        ~Context
+             ->WaitEventMaskForInst[T]; // mark this event as no longer pending
   }
 }
 
@@ -2294,7 +2297,8 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
                                                WaitcntBrackets *ScoreBrackets) {
   // Now look at the instruction opcode. If it is a memory access
   // instruction, update the upper-bound of the appropriate counter's
-  // bracket and the destination operand scores. // add comment about if this is xcnt then we need to update the source opernads as well.
+  // bracket and the destination operand scores. // add comment about if this is
+  // xcnt then we need to update the source opernads as well.
   // TODO: Use the (TSFlags & SIInstrFlags::DS_CNT) property everywhere.
 
   bool IsVMEMAccess = false;

>From 64511c147d661cd160d2aecb5f0348c51c74d875 Mon Sep 17 00:00:00 2001
From: Aaditya <Aaditya.AlokDeshpande at amd.com>
Date: Fri, 26 Sep 2025 12:06:17 +0530
Subject: [PATCH 6/8] clean up comments

---
 llvm/test/CodeGen/AMDGPU/wait-xcnt.mir | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir b/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir
index 0385cc7c32cff..6fe99d895c7bb 100644
--- a/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir
+++ b/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir
@@ -922,7 +922,6 @@ body: |
     $vgpr2 = V_MOV_B32_e32 1, implicit $exec
 ...
 
-# FIXME: Missing S_WAIT_XCNT before overwriting vgpr0.
 ---
 name: wait_kmcnt_with_outstanding_vmem
 tracksRegLiveness: true
@@ -946,7 +945,6 @@ body: |
     $vgpr0 = V_MOV_B32_e32 0, implicit $exec
 ...
 
-# FIXME: Missing S_WAIT_XCNT before overwriting sgpr0.
 ---
 name: wait_loadcnt_with_outstanding_smem
 tracksRegLiveness: true
@@ -970,7 +968,6 @@ body: |
     $sgpr0 = S_MOV_B32 0
 ...
 
-# TODO: Unnecessary wait before overwriting vgpr0.
 ---
 name: overwrite_vgpr_after_smem
 tracksRegLiveness: true
@@ -990,7 +987,6 @@ body: |
     $vgpr0 = V_MOV_B32_e32 0, implicit $exec
 ...
 
-# TODO: Unnecessary wait before overwriting sgpr0.
 ---
 name: overwrite_sgpr_after_vmem
 tracksRegLiveness: true

>From 0d08d253c1adb1b4926ec23aa6f93e9a303f2cf7 Mon Sep 17 00:00:00 2001
From: Aaditya <Aaditya.AlokDeshpande at amd.com>
Date: Fri, 26 Sep 2025 12:11:14 +0530
Subject: [PATCH 7/8] Fix some more comments

---
 llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index de01621d72524..d8d8bafe73613 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1277,11 +1277,8 @@ void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
       return;
     setScoreLB(T, std::max(getScoreLB(T), UB - Count));
   } else {
-    setScoreLB(T, UB); // LB and UB are a sort of sliding window over the number
-                       // of wait events which need to be completed.
-    PendingEvents &=
-        ~Context
-             ->WaitEventMaskForInst[T]; // mark this event as no longer pending
+    setScoreLB(T, UB);
+    PendingEvents &= ~Context->WaitEventMaskForInst[T];
   }
 }
 
@@ -2297,8 +2294,9 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
                                                WaitcntBrackets *ScoreBrackets) {
   // Now look at the instruction opcode. If it is a memory access
   // instruction, update the upper-bound of the appropriate counter's
-  // bracket and the destination operand scores. // add comment about if this is
-  // xcnt then we need to update the source opernads as well.
+  // bracket and the destination operand scores.
+  // For architectures with X_CNT, mark the source address operands
+  //  with the appropriate counter values.
   // TODO: Use the (TSFlags & SIInstrFlags::DS_CNT) property everywhere.
 
   bool IsVMEMAccess = false;

>From f55856b0d04137679533641a2c4f440c383cca9f Mon Sep 17 00:00:00 2001
From: Aaditya <Aaditya.AlokDeshpande at amd.com>
Date: Tue, 30 Sep 2025 18:59:51 +0530
Subject: [PATCH 8/8] Updating maxVal logic.

---
 llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 21 +++++----------------
 1 file changed, 5 insertions(+), 16 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index d8d8bafe73613..fdaa475deab27 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -908,15 +908,6 @@ bool WaitcntBrackets::hasPointSamplePendingVmemTypes(
   return hasOtherPendingVmemTypes(Interval, VMEM_NOSAMPLER);
 }
 
-template <size_t N> static unsigned getMaxVal(unsigned (&arr)[N]) {
-  unsigned max = arr[0];
-  for (size_t i = 0; i < N; i++) {
-    if (arr[i] > max)
-      max = arr[i];
-  }
-  return max;
-}
-
 void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
                                     const SIRegisterInfo *TRI,
                                     const MachineRegisterInfo *MRI,
@@ -1025,9 +1016,9 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
     //          we can remove this previous event =>
     //    (3a)clear that bit in pending events
     //    (3b)update the sgpr/vgpr scores for that event.)
-    WaitEventType alt = E == SMEM_GROUP ? VMEM_GROUP : SMEM_GROUP;
-    bool altEventIsActive = (PendingEvents & (1 << alt)) != 0;
-    if (altEventIsActive) {
+    WaitEventType OtherEvent = E == SMEM_GROUP ? VMEM_GROUP : SMEM_GROUP;
+    // bool OtherEventIsActive = (PendingEvents & (1 << OtherEvent)) != 0;
+    if (PendingEvents & (1 << OtherEvent)) {
       // Hardware inserts an implicit xcnt between SMEM
       // and VMEM operations. so no need to insert here.
       // Mark the previous mem Operation as completed.
@@ -1037,10 +1028,8 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
       // completed uptil this point.
       // set the new lower bound as the previous upper bound
       // to clear that event.
-      unsigned max =
-          E == SMEM_GROUP ? getMaxVal(VgprScores[T]) : getMaxVal(SgprScores[1]);
-      setScoreLB(T, max);
-      PendingEvents &= ~(1 << alt);
+      setScoreLB(T, CurrScore - 1);
+      PendingEvents &= ~(1 << OtherEvent);
     }
     for (const MachineOperand &Op : Inst.all_uses())
       setScoreByOperand(&Inst, TRI, MRI, Op, T, CurrScore);