[llvm] [AMDGPU][NFC][SIInsertWaitcnts] Remove redundant checks for invalidate instructions (PR #166139)
Stephen Thomas via llvm-commits
llvm-commits at lists.llvm.org
Tue Nov 25 04:12:49 PST 2025
https://github.com/stepthomas updated https://github.com/llvm/llvm-project/pull/166139
>From c7232f8947bb0dc9f1ac72b2ca33d65fdf2527c4 Mon Sep 17 00:00:00 2001
From: Stephen Thomas <Stephen.Thomas at amd.com>
Date: Fri, 31 Oct 2025 13:52:18 +0000
Subject: [PATCH 1/4] remove redundant checks for invalidate instructions
SIInsertWaitcnts::getVmemWaitEventType() tests for GLOBAL_INV, GLOBAL_WB, and
GLOBAL_WBINV instructions, but in each case where it is used, either a
check has already been made for these instructions, or it is known that
the instruction definitely will not be one of them.
Move the checks for these instructions out of getVmemWaitEventType() into
a new function, getInvOrWBWaitEventType(), which returns an optional WaitEventType
value, and use that function at the one call site where the instruction could be
an invalidate or writeback.
SIInstrInfo::isGFX12CacheInvOrWBInst() is now itself redundant and is removed.
---
llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 17 +++++++++++------
llvm/lib/Target/AMDGPU/SIInstrInfo.h | 5 -----
2 files changed, 11 insertions(+), 11 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index b7fa899678ec7..1fabb3fdabf66 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -527,9 +527,10 @@ class SIInsertWaitcnts {
#endif // NDEBUG
}
- // Return the appropriate VMEM_*_ACCESS type for Inst, which must be a VMEM
- // instruction.
- WaitEventType getVmemWaitEventType(const MachineInstr &Inst) const {
+ // Return an optional WaitEventType value if Inst is a cache
+ // invalidate or WB instruction.
+ std::optional<WaitEventType>
+ getInvOrWBWaitEventType(const MachineInstr &Inst) const {
switch (Inst.getOpcode()) {
case AMDGPU::GLOBAL_INV:
return VMEM_READ_ACCESS; // tracked using loadcnt
@@ -537,9 +538,13 @@ class SIInsertWaitcnts {
case AMDGPU::GLOBAL_WBINV:
return VMEM_WRITE_ACCESS; // tracked using storecnt
default:
- break;
+ return {};
}
+ }
+ // Return the appropriate VMEM_*_ACCESS type for Inst, which must be a VMEM
+ // instruction.
+ WaitEventType getVmemWaitEventType(const MachineInstr &Inst) const {
// Maps VMEM access types to their corresponding WaitEventType.
static const WaitEventType VmemReadMapping[NUM_VMEM_TYPES] = {
VMEM_READ_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS};
@@ -2265,8 +2270,8 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
ScoreBrackets->updateByEvent(LDS_ACCESS, Inst);
}
} else if (TII->isFLAT(Inst)) {
- if (SIInstrInfo::isGFX12CacheInvOrWBInst(Inst.getOpcode())) {
- ScoreBrackets->updateByEvent(getVmemWaitEventType(Inst), Inst);
+ if (std::optional<WaitEventType> ET = getInvOrWBWaitEventType(Inst)) {
+ ScoreBrackets->updateByEvent(*ET, Inst);
return;
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index dc23a21f959ce..adb6002230fce 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1089,11 +1089,6 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
Opcode == AMDGPU::DS_GWS_BARRIER;
}
- static bool isGFX12CacheInvOrWBInst(unsigned Opc) {
- return Opc == AMDGPU::GLOBAL_INV || Opc == AMDGPU::GLOBAL_WB ||
- Opc == AMDGPU::GLOBAL_WBINV;
- }
-
static bool isF16PseudoScalarTrans(unsigned Opcode) {
return Opcode == AMDGPU::V_S_EXP_F16_e64 ||
Opcode == AMDGPU::V_S_LOG_F16_e64 ||
>From d3b154223132fd294516dca264589329305837c1 Mon Sep 17 00:00:00 2001
From: Stephen Thomas <Stephen.Thomas at amd.com>
Date: Mon, 3 Nov 2025 10:34:25 +0000
Subject: [PATCH 2/4] Move getInvOrWBWaitEventType() out of SIInsertWaitcnts
class
---
llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 29 ++++++++++-----------
1 file changed, 14 insertions(+), 15 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 1fabb3fdabf66..c4c35849d7d9b 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -224,6 +224,20 @@ VmemType getVmemType(const MachineInstr &Inst) {
return VMEM_NOSAMPLER;
}
+// Return an optional WaitEventType value if Inst is a cache invalidate
+// or WB instruction.
+std::optional<WaitEventType> getInvOrWBWaitEventType(const MachineInstr &Inst) {
+ switch (Inst.getOpcode()) {
+ case AMDGPU::GLOBAL_INV:
+ return VMEM_READ_ACCESS; // tracked using loadcnt
+ case AMDGPU::GLOBAL_WB:
+ case AMDGPU::GLOBAL_WBINV:
+ return VMEM_WRITE_ACCESS; // tracked using storecnt
+ default:
+ return {};
+ }
+}
+
unsigned &getCounterRef(AMDGPU::Waitcnt &Wait, InstCounterType T) {
switch (T) {
case LOAD_CNT:
@@ -527,21 +541,6 @@ class SIInsertWaitcnts {
#endif // NDEBUG
}
- // Return an optional WaitEventType value if Inst is a cache
- // invalidate or WB instruction.
- std::optional<WaitEventType>
- getInvOrWBWaitEventType(const MachineInstr &Inst) const {
- switch (Inst.getOpcode()) {
- case AMDGPU::GLOBAL_INV:
- return VMEM_READ_ACCESS; // tracked using loadcnt
- case AMDGPU::GLOBAL_WB:
- case AMDGPU::GLOBAL_WBINV:
- return VMEM_WRITE_ACCESS; // tracked using storecnt
- default:
- return {};
- }
- }
-
// Return the appropriate VMEM_*_ACCESS type for Inst, which must be a VMEM
// instruction.
WaitEventType getVmemWaitEventType(const MachineInstr &Inst) const {
>From 2b4e83ce2539fbae2612e23ae426e7429ee5adac Mon Sep 17 00:00:00 2001
From: Stephen Thomas <Stephen.Thomas at amd.com>
Date: Mon, 3 Nov 2025 11:44:06 +0000
Subject: [PATCH 3/4] Update getVmemWaitEventType()'s description comment
---
llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index c4c35849d7d9b..4206dcb95ef5e 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -542,7 +542,8 @@ class SIInsertWaitcnts {
}
// Return the appropriate VMEM_*_ACCESS type for Inst, which must be a VMEM
- // instruction.
+ // instruction that is not an invalidate or WB instruction, which are
+ // checked for using getInvOrWBWaitEventType().
WaitEventType getVmemWaitEventType(const MachineInstr &Inst) const {
// Maps VMEM access types to their corresponding WaitEventType.
static const WaitEventType VmemReadMapping[NUM_VMEM_TYPES] = {
>From 178665d3f9d5b2e7c346e2f0be8ba3264860908d Mon Sep 17 00:00:00 2001
From: Stephen Thomas <Stephen.Thomas at amd.com>
Date: Tue, 25 Nov 2025 11:55:58 +0000
Subject: [PATCH 4/4] Address review comments.
Fold the logic that checks for invalidate/writeback instructions back into
getVmemWaitEventType(), but refactor updateEventWaitcntAfter() so that it
now calls getVmemWaitEventType() only once for FLAT instructions while still
producing the correct wait events.
This required some refactoring of mayAccessVMEMThroughFlat() so that it
no longer checks SIInstrInfo::usesVM_CNT() (users must now check that
themselves) and in the absence of memory operands it only returns true
if the instruction loads or stores. This allows the invalidate/writeback
instructions to not be considered as VMEM accesses.
---
llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 45 +++++++++------------
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 8 +---
2 files changed, 20 insertions(+), 33 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 23490ce1f511d..d05ee0829e83b 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -224,21 +224,6 @@ VmemType getVmemType(const MachineInstr &Inst) {
return VMEM_NOSAMPLER;
}
-// Return an optional WaitEventType value if Inst is a cache invalidate
-// or WB instruction.
-std::optional<WaitEventType> getInvOrWBWaitEventType(const MachineInstr &Inst) {
- switch (Inst.getOpcode()) {
- // FIXME: GLOBAL_INV needs to be tracked with xcnt too.
- case AMDGPU::GLOBAL_INV:
- return VMEM_READ_ACCESS; // tracked using loadcnt
- case AMDGPU::GLOBAL_WB:
- case AMDGPU::GLOBAL_WBINV:
- return VMEM_WRITE_ACCESS; // tracked using storecnt
- default:
- return {};
- }
-}
-
unsigned &getCounterRef(AMDGPU::Waitcnt &Wait, InstCounterType T) {
switch (T) {
case LOAD_CNT:
@@ -546,6 +531,16 @@ class SIInsertWaitcnts {
// instruction that is not an invalidate or WB instruction, which are
// checked for using getInvOrWBWaitEventType().
WaitEventType getVmemWaitEventType(const MachineInstr &Inst) const {
+ switch (Inst.getOpcode()) {
+ // FIXME: GLOBAL_INV needs to be tracked with xcnt too.
+ case AMDGPU::GLOBAL_INV:
+ return VMEM_READ_ACCESS; // tracked using loadcnt
+ case AMDGPU::GLOBAL_WB:
+ case AMDGPU::GLOBAL_WBINV:
+ return VMEM_WRITE_ACCESS; // tracked using storecnt
+ default:
+ break;
+ }
// Maps VMEM access types to their corresponding WaitEventType.
static const WaitEventType VmemReadMapping[NUM_VMEM_TYPES] = {
VMEM_READ_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS};
@@ -2193,7 +2188,8 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
}
bool SIInsertWaitcnts::isVmemAccess(const MachineInstr &MI) const {
- return (TII->isFLAT(MI) && TII->mayAccessVMEMThroughFlat(MI)) ||
+ return (TII->isFLAT(MI) && SIInstrInfo::usesVM_CNT(MI) &&
+ TII->mayAccessVMEMThroughFlat(MI)) ||
(TII->isVMEM(MI) && !AMDGPU::getMUBUFIsBufferInv(MI.getOpcode()));
}
@@ -2276,19 +2272,14 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
ScoreBrackets->updateByEvent(LDS_ACCESS, Inst);
}
} else if (TII->isFLAT(Inst)) {
- if (std::optional<WaitEventType> ET = getInvOrWBWaitEventType(Inst)) {
- ScoreBrackets->updateByEvent(*ET, Inst);
- return;
- }
-
- assert(Inst.mayLoadOrStore());
-
int FlatASCount = 0;
- if (TII->mayAccessVMEMThroughFlat(Inst)) {
- ++FlatASCount;
- IsVMEMAccess = true;
+ if (SIInstrInfo::usesVM_CNT(Inst)) {
ScoreBrackets->updateByEvent(getVmemWaitEventType(Inst), Inst);
+ if (TII->mayAccessVMEMThroughFlat(Inst)) {
+ ++FlatASCount;
+ IsVMEMAccess = true;
+ }
}
if (TII->mayAccessLDSThroughFlat(Inst)) {
@@ -2636,7 +2627,7 @@ bool SIInsertWaitcnts::isPreheaderToFlush(
bool SIInsertWaitcnts::isVMEMOrFlatVMEM(const MachineInstr &MI) const {
if (SIInstrInfo::isFLAT(MI))
- return TII->mayAccessVMEMThroughFlat(MI);
+ return SIInstrInfo::usesVM_CNT(MI) && TII->mayAccessVMEMThroughFlat(MI);
return SIInstrInfo::isVMEM(MI);
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 7cb7f47ddb220..83e9a79b0ad84 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -4427,14 +4427,10 @@ bool SIInstrInfo::mayAccessScratchThroughFlat(const MachineInstr &MI) const {
bool SIInstrInfo::mayAccessVMEMThroughFlat(const MachineInstr &MI) const {
assert(isFLAT(MI));
- // All flat instructions use the VMEM counter except prefetch.
- if (!usesVM_CNT(MI))
- return false;
-
// If there are no memory operands then conservatively assume the flat
- // operation may access VMEM.
+ // operation may access VMEM if it loads or stores at all.
if (MI.memoperands_empty())
- return true;
+ return MI.mayLoadOrStore();
// See if any memory operand specifies an address space that involves VMEM.
// Flat operations only supported FLAT, LOCAL (LDS), or address spaces
More information about the llvm-commits
mailing list