[llvm] [AMDGPU] CodeGen for GFX12 S_WAIT_* instructions (PR #77438)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Jan 9 02:30:20 PST 2024
llvmbot wrote:
@llvm/pr-subscribers-llvm-globalisel
Author: Jay Foad (jayfoad)
Changes:
Update SIMemoryLegalizer and SIInsertWaitcnts to use separate wait
instructions per counter (e.g. S_WAIT_LOADCNT) and split VMCNT into
separate LOADCNT, SAMPLECNT and BVHCNT counters.
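
To make the split concrete: a single pre-GFX12 `s_waitcnt vmcnt(N)` covered plain loads, image-sample reads and BVH reads alike, whereas GFX12 tracks and waits on them independently. Below is a minimal standalone C++ sketch (not code from the patch; the enums and mask tables are trimmed to the entries relevant here) of how per-target event masks route the same events to different counters:

```cpp
#include <cassert>
#include <cstdio>

// Counters and wait events, simplified from the patch.
enum InstCounterType { LOAD_CNT, DS_CNT, EXP_CNT, STORE_CNT,
                       SAMPLE_CNT, BVH_CNT, KM_CNT, NUM_INST_CNTS };
enum WaitEventType { VMEM_READ_ACCESS, VMEM_SAMPLER_READ_ACCESS,
                     VMEM_BVH_READ_ACCESS, NUM_WAIT_EVENTS };

// Pre-GFX12: plain, sampler and BVH reads all count against VMcnt (LOAD_CNT).
static const unsigned MaskPreGFX12[NUM_INST_CNTS] = {
    (1u << VMEM_READ_ACCESS) | (1u << VMEM_SAMPLER_READ_ACCESS) |
        (1u << VMEM_BVH_READ_ACCESS),
    0, 0, 0, 0, 0, 0};

// GFX12+: sampler and BVH reads move to their own SAMPLEcnt/BVHcnt counters.
static const unsigned MaskGFX12Plus[NUM_INST_CNTS] = {
    (1u << VMEM_READ_ACCESS), 0, 0, 0,
    (1u << VMEM_SAMPLER_READ_ACCESS), (1u << VMEM_BVH_READ_ACCESS), 0};

// Same scheme as the patch's eventCounter(): the counter for an event is the
// first one whose mask contains that event.
static InstCounterType eventCounter(const unsigned *Masks, WaitEventType E) {
  for (int T = 0; T != NUM_INST_CNTS; ++T)
    if (Masks[T] & (1u << E))
      return static_cast<InstCounterType>(T);
  return NUM_INST_CNTS; // unreachable for well-formed masks
}

int main() {
  assert(eventCounter(MaskPreGFX12, VMEM_SAMPLER_READ_ACCESS) == LOAD_CNT);
  assert(eventCounter(MaskGFX12Plus, VMEM_SAMPLER_READ_ACCESS) == SAMPLE_CNT);
  assert(eventCounter(MaskGFX12Plus, VMEM_BVH_READ_ACCESS) == BVH_CNT);
  std::puts("sampler/BVH reads: VMcnt pre-GFX12, SAMPLEcnt/BVHcnt on GFX12+");
}
```

The real tables live in WaitcntGeneratorPreGFX12::getWaitEventMask() and WaitcntGeneratorGFX12Plus::getWaitEventMask() in the diff below.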
---
Patch is 1.32 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/77438.diff
101 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp (+2-1)
- (modified) llvm/lib/Target/AMDGPU/GCNSubtarget.h (+4)
- (modified) llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp (+803-217)
- (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.cpp (+1-2)
- (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.h (+16-21)
- (modified) llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp (+180)
- (modified) llvm/lib/Target/AMDGPU/SOPInstructions.td (+7)
- (modified) llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp (+143-3)
- (modified) llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h (+98-25)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/addsubu64.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-fmed3-const-combine.ll (+40-8)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-minmax-const-combine.ll (+85-17)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll (+18-6)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll (+129-91)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3-min-max-const-combine.ll (+90-18)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll (+27-11)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.a16.ll (+42-42)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.ll (+42-42)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll (+14-14)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.dim.ll (+18-18)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.getresinfo.a16.ll (+8-8)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.getresinfo.ll (+11-11)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll (+14-14)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.ll (+15-15)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2d.ll (+3-3)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll (+3-3)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.ll (+3-3)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll (+3-3)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.ll (+3-3)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll (+11-11)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.rsq.clamp.ll (+40-8)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll (+90-50)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll (+42-42)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll (+51-15)
- (modified) llvm/test/CodeGen/AMDGPU/add.ll (+21-21)
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll (+46-46)
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll (+64-64)
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll (+40-40)
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll (+48-48)
- (modified) llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll (+30-15)
- (modified) llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-smem.ll (+6-6)
- (modified) llvm/test/CodeGen/AMDGPU/clamp.ll (+92-92)
- (modified) llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll (+21-9)
- (modified) llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll (+66-18)
- (modified) llvm/test/CodeGen/AMDGPU/flat-scratch-i8-i16.ll (+36-36)
- (modified) llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll (+126-126)
- (modified) llvm/test/CodeGen/AMDGPU/flat-scratch.ll (+380-268)
- (modified) llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll (+238-293)
- (modified) llvm/test/CodeGen/AMDGPU/fmaximum.ll (+10-10)
- (modified) llvm/test/CodeGen/AMDGPU/fminimum.ll (+10-10)
- (modified) llvm/test/CodeGen/AMDGPU/fp-min-max-num-flat-atomics.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/fp-min-max-num-global-atomics.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/global-saddr-load.ll (+188-188)
- (modified) llvm/test/CodeGen/AMDGPU/global-saddr-store.ll (+5-5)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll (+211-211)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.ordered.add.b64.ll (+6-6)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.a16.dim.ll (+27-27)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.a16.encode.ll (+27-27)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll (+54-54)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.a16.dim.ll (+14-14)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.getlod.dim.ll (+8-6)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.msaa.load.ll (+13-13)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll (+39-39)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll (+8-8)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll (+62-62)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll (+11-11)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll (+11-11)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.direct.load.ll (+7-3)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.param.load.ll (+9-5)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll (+38-38)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.var.ll (+52-52)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll (+43-43)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.d16.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.ll (+12-12)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll (+5-5)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll (+6-6)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll (+133-85)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll (+36-36)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.sleep.var.ll (+11-3)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.ll (+26-26)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.d16.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.ll (+15-15)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll (+5-5)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/load-constant-f32.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/load-constant-f64.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/load-constant-i1.ll (+88-88)
- (modified) llvm/test/CodeGen/AMDGPU/load-constant-i16.ll (+79-79)
- (modified) llvm/test/CodeGen/AMDGPU/load-constant-i32.ll (+50-50)
- (modified) llvm/test/CodeGen/AMDGPU/load-constant-i64.ll (+12-12)
- (modified) llvm/test/CodeGen/AMDGPU/load-constant-i8.ll (+104-104)
- (modified) llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll (+12-12)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll (+24-24)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll (+28-28)
- (modified) llvm/test/CodeGen/AMDGPU/mul.ll (+38-34)
- (modified) llvm/test/CodeGen/AMDGPU/offset-split-flat.ll (+369-197)
- (modified) llvm/test/CodeGen/AMDGPU/offset-split-global.ll (+369-197)
- (modified) llvm/test/CodeGen/AMDGPU/readcyclecounter.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/sub.ll (+27-27)
- (modified) llvm/test/CodeGen/AMDGPU/waitcnt-global-inv-wb.mir (+1-1)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index a7d8ff0242b801..7ca7722a5cebd1 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -1242,7 +1242,8 @@ bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
case AMDGPU::S_WAITCNT: {
const int64_t Imm = MI.getOperand(0).getImm();
AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm);
- return (Decoded.LgkmCnt == 0);
+ // DsCnt corresponds to LGKMCnt here.
+ return (Decoded.DsCnt == 0);
}
default:
// SOPP instructions cannot mitigate the hazard.
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index f6f37f5170a403..f3d38c018d2709 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1175,6 +1175,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool hasRestrictedSOffset() const { return HasRestrictedSOffset; }
+ /// \returns true if the target uses LOADcnt/SAMPLEcnt/BVHcnt, DScnt/KMcnt
+ /// and STOREcnt rather than VMcnt, LGKMcnt and VScnt respectively.
+ bool hasExtendedWaitCounts() const { return getGeneration() >= GFX12; }
+
/// Return the maximum number of waves per SIMD for kernels using \p SGPRs
/// SGPRs
unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 1cb1d32707f2d7..37890881d57544 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -57,7 +57,18 @@ namespace {
// associated with the operand. Used for determining whether
// s_waitcnt instruction needs to be emitted.
-enum InstCounterType { VM_CNT = 0, LGKM_CNT, EXP_CNT, VS_CNT, NUM_INST_CNTS };
+enum InstCounterType {
+ LOAD_CNT = 0, // VMcnt prior to gfx12.
+ DS_CNT, // LGKMcnt prior to gfx12.
+ EXP_CNT, //
+ STORE_CNT, // VScnt in gfx10/gfx11.
+ NUM_NORMAL_INST_CNTS,
+ SAMPLE_CNT = NUM_NORMAL_INST_CNTS, // gfx12+ only.
+ BVH_CNT, // gfx12+ only.
+ KM_CNT, // gfx12+ only.
+ NUM_EXTENDED_INST_CNTS,
+ NUM_INST_CNTS = NUM_EXTENDED_INST_CNTS
+};
} // namespace
namespace llvm {
@@ -67,15 +78,23 @@ template <> struct enum_iteration_traits<InstCounterType> {
} // namespace llvm
namespace {
-auto inst_counter_types() { return enum_seq(VM_CNT, NUM_INST_CNTS); }
+// Return an iterator over all counters between LOAD_CNT (the first counter)
+// and \c MaxCounter (exclusive, default value yields an enumeration over
+// all counters).
+auto inst_counter_types(InstCounterType MaxCounter = NUM_INST_CNTS) {
+ return enum_seq(LOAD_CNT, MaxCounter);
+}
using RegInterval = std::pair<int, int>;
struct HardwareLimits {
- unsigned VmcntMax;
+ unsigned LoadcntMax; // Corresponds to VMcnt prior to gfx12.
unsigned ExpcntMax;
- unsigned LgkmcntMax;
- unsigned VscntMax;
+ unsigned DscntMax; // Corresponds to LGKMcnt prior to gfx12.
+ unsigned StorecntMax; // Corresponds to VScnt in gfx10/gfx11.
+ unsigned SamplecntMax; // gfx12+ only.
+ unsigned BvhcntMax; // gfx12+ only.
+ unsigned KmcntMax; // gfx12+ only.
};
struct RegisterEncoding {
@@ -86,31 +105,25 @@ struct RegisterEncoding {
};
enum WaitEventType {
- VMEM_ACCESS, // vector-memory read & write
- VMEM_READ_ACCESS, // vector-memory read
- VMEM_WRITE_ACCESS, // vector-memory write that is not scratch
- SCRATCH_WRITE_ACCESS, // vector-memory write that may be scratch
- LDS_ACCESS, // lds read & write
- GDS_ACCESS, // gds read & write
- SQ_MESSAGE, // send message
- SMEM_ACCESS, // scalar-memory read & write
- EXP_GPR_LOCK, // export holding on its data src
- GDS_GPR_LOCK, // GDS holding on its data and addr src
- EXP_POS_ACCESS, // write to export position
- EXP_PARAM_ACCESS, // write to export parameter
- VMW_GPR_LOCK, // vector-memory write holding on its data src
- EXP_LDS_ACCESS, // read by ldsdir counting as export
+ VMEM_ACCESS, // vector-memory read & write
+ VMEM_READ_ACCESS, // vector-memory read
+ VMEM_SAMPLER_READ_ACCESS, // vector-memory SAMPLER read (gfx12+ only)
+ VMEM_BVH_READ_ACCESS, // vector-memory BVH read (gfx12+ only)
+ VMEM_WRITE_ACCESS, // vector-memory write that is not scratch
+ SCRATCH_WRITE_ACCESS, // vector-memory write that may be scratch
+ LDS_ACCESS, // lds read & write
+ GDS_ACCESS, // gds read & write
+ SQ_MESSAGE, // send message
+ SMEM_ACCESS, // scalar-memory read & write
+ EXP_GPR_LOCK, // export holding on its data src
+ GDS_GPR_LOCK, // GDS holding on its data and addr src
+ EXP_POS_ACCESS, // write to export position
+ EXP_PARAM_ACCESS, // write to export parameter
+ VMW_GPR_LOCK, // vector-memory write holding on its data src
+ EXP_LDS_ACCESS, // read by ldsdir counting as export
NUM_WAIT_EVENTS,
};
-static const unsigned WaitEventMaskForInst[NUM_INST_CNTS] = {
- (1 << VMEM_ACCESS) | (1 << VMEM_READ_ACCESS),
- (1 << SMEM_ACCESS) | (1 << LDS_ACCESS) | (1 << GDS_ACCESS) |
- (1 << SQ_MESSAGE),
- (1 << EXP_GPR_LOCK) | (1 << GDS_GPR_LOCK) | (1 << VMW_GPR_LOCK) |
- (1 << EXP_PARAM_ACCESS) | (1 << EXP_POS_ACCESS) | (1 << EXP_LDS_ACCESS),
- (1 << VMEM_WRITE_ACCESS) | (1 << SCRATCH_WRITE_ACCESS)};
-
// The mapping is:
// 0 .. SQ_MAX_PGM_VGPRS-1 real VGPRs
// SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1 extra VGPR-like slots
@@ -137,17 +150,33 @@ enum VmemType {
// MIMG instructions with a sampler.
VMEM_SAMPLER,
// BVH instructions
- VMEM_BVH
+ VMEM_BVH,
+ NUM_VMEM_TYPES
};
+// Maps values of InstCounterType to the instruction that waits on that
+// counter. Only used if GCNSubtarget::hasExtendedWaitCounts()
+// returns true.
+static const unsigned instrsForExtendedCounterTypes[NUM_EXTENDED_INST_CNTS] = {
+ AMDGPU::S_WAIT_LOADCNT, AMDGPU::S_WAIT_DSCNT, AMDGPU::S_WAIT_EXPCNT,
+ AMDGPU::S_WAIT_STORECNT, AMDGPU::S_WAIT_SAMPLECNT, AMDGPU::S_WAIT_BVHCNT,
+ AMDGPU::S_WAIT_KMCNT};
+
static bool updateVMCntOnly(const MachineInstr &Inst) {
return SIInstrInfo::isVMEM(Inst) || SIInstrInfo::isFLATGlobal(Inst) ||
SIInstrInfo::isFLATScratch(Inst);
}
+#ifndef NDEBUG
+static bool isNormalMode(InstCounterType MaxCounter) {
+ return MaxCounter == NUM_NORMAL_INST_CNTS;
+}
+#endif // NDEBUG
+
VmemType getVmemType(const MachineInstr &Inst) {
assert(updateVMCntOnly(Inst));
- if (!SIInstrInfo::isMIMG(Inst))
+ if (!SIInstrInfo::isMIMG(Inst) && !SIInstrInfo::isVIMAGE(Inst) &&
+ !SIInstrInfo::isVSAMPLE(Inst))
return VMEM_NOSAMPLER;
const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Inst.getOpcode());
const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
@@ -156,25 +185,49 @@ VmemType getVmemType(const MachineInstr &Inst) {
: BaseInfo->Sampler ? VMEM_SAMPLER : VMEM_NOSAMPLER;
}
-void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
+unsigned &getCounterRef(AMDGPU::Waitcnt &Wait, InstCounterType T) {
switch (T) {
- case VM_CNT:
- Wait.VmCnt = std::min(Wait.VmCnt, Count);
- break;
+ case LOAD_CNT:
+ return Wait.LoadCnt;
case EXP_CNT:
- Wait.ExpCnt = std::min(Wait.ExpCnt, Count);
- break;
- case LGKM_CNT:
- Wait.LgkmCnt = std::min(Wait.LgkmCnt, Count);
- break;
- case VS_CNT:
- Wait.VsCnt = std::min(Wait.VsCnt, Count);
- break;
+ return Wait.ExpCnt;
+ case DS_CNT:
+ return Wait.DsCnt;
+ case STORE_CNT:
+ return Wait.StoreCnt;
+ case SAMPLE_CNT:
+ return Wait.SampleCnt;
+ case BVH_CNT:
+ return Wait.BvhCnt;
+ case KM_CNT:
+ return Wait.KmCnt;
default:
llvm_unreachable("bad InstCounterType");
}
}
+void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
+ unsigned &WC = getCounterRef(Wait, T);
+ WC = std::min(WC, Count);
+}
+
+void setNoWait(AMDGPU::Waitcnt &Wait, InstCounterType T) {
+ getCounterRef(Wait, T) = ~0u;
+}
+
+unsigned getWait(AMDGPU::Waitcnt &Wait, InstCounterType T) {
+ return getCounterRef(Wait, T);
+}
+
+// Mapping from event to counter according to the table masks.
+InstCounterType eventCounter(const unsigned *masks, WaitEventType E) {
+ for (auto T : inst_counter_types()) {
+ if (masks[T] & (1 << E))
+ return T;
+ }
+ llvm_unreachable("event type has no associated counter");
+}
+
// This object maintains the current score brackets of each wait counter, and
// a per-register scoreboard for each wait counter.
//
@@ -185,20 +238,30 @@ void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
// "s_waitcnt 0" before use.
class WaitcntBrackets {
public:
- WaitcntBrackets(const GCNSubtarget *SubTarget, HardwareLimits Limits,
- RegisterEncoding Encoding)
- : ST(SubTarget), Limits(Limits), Encoding(Encoding) {}
+ WaitcntBrackets(const GCNSubtarget *SubTarget, InstCounterType MaxCounter,
+ HardwareLimits Limits, RegisterEncoding Encoding,
+ const unsigned *WaitEventMaskForInst,
+ InstCounterType SmemAccessCounter)
+ : ST(SubTarget), MaxCounter(MaxCounter), Limits(Limits),
+ Encoding(Encoding), WaitEventMaskForInst(WaitEventMaskForInst),
+ SmemAccessCounter(SmemAccessCounter) {}
unsigned getWaitCountMax(InstCounterType T) const {
switch (T) {
- case VM_CNT:
- return Limits.VmcntMax;
- case LGKM_CNT:
- return Limits.LgkmcntMax;
+ case LOAD_CNT:
+ return Limits.LoadcntMax;
+ case DS_CNT:
+ return Limits.DscntMax;
case EXP_CNT:
return Limits.ExpcntMax;
- case VS_CNT:
- return Limits.VscntMax;
+ case STORE_CNT:
+ return Limits.StorecntMax;
+ case SAMPLE_CNT:
+ return Limits.SamplecntMax;
+ case BVH_CNT:
+ return Limits.BvhcntMax;
+ case KM_CNT:
+ return Limits.KmcntMax;
default:
break;
}
@@ -219,20 +282,11 @@ class WaitcntBrackets {
return getScoreUB(T) - getScoreLB(T);
}
- // Mapping from event to counter.
- InstCounterType eventCounter(WaitEventType E) const {
- for (auto T : inst_counter_types()) {
- if (WaitEventMaskForInst[T] & (1 << E))
- return T;
- }
- llvm_unreachable("event type has no associated counter");
- }
-
unsigned getRegScore(int GprNo, InstCounterType T) const {
if (GprNo < NUM_ALL_VGPRS) {
return VgprScores[T][GprNo];
}
- assert(T == LGKM_CNT);
+ assert(T == SmemAccessCounter);
return SgprScores[GprNo - NUM_ALL_VGPRS];
}
@@ -269,15 +323,15 @@ class WaitcntBrackets {
}
bool hasPendingFlat() const {
- return ((LastFlat[LGKM_CNT] > ScoreLBs[LGKM_CNT] &&
- LastFlat[LGKM_CNT] <= ScoreUBs[LGKM_CNT]) ||
- (LastFlat[VM_CNT] > ScoreLBs[VM_CNT] &&
- LastFlat[VM_CNT] <= ScoreUBs[VM_CNT]));
+ return ((LastFlat[DS_CNT] > ScoreLBs[DS_CNT] &&
+ LastFlat[DS_CNT] <= ScoreUBs[DS_CNT]) ||
+ (LastFlat[LOAD_CNT] > ScoreLBs[LOAD_CNT] &&
+ LastFlat[LOAD_CNT] <= ScoreUBs[LOAD_CNT]));
}
void setPendingFlat() {
- LastFlat[VM_CNT] = ScoreUBs[VM_CNT];
- LastFlat[LGKM_CNT] = ScoreUBs[LGKM_CNT];
+ LastFlat[LOAD_CNT] = ScoreUBs[LOAD_CNT];
+ LastFlat[DS_CNT] = ScoreUBs[DS_CNT];
}
// Return true if there might be pending writes to the specified vgpr by VMEM
@@ -293,8 +347,8 @@ class WaitcntBrackets {
}
void setNonKernelFunctionInitialState() {
- setScoreUB(VS_CNT, getWaitCountMax(VS_CNT));
- PendingEvents |= WaitEventMaskForInst[VS_CNT];
+ setScoreUB(STORE_CNT, getWaitCountMax(STORE_CNT));
+ PendingEvents |= WaitEventMaskForInst[STORE_CNT];
}
void print(raw_ostream &);
@@ -331,7 +385,7 @@ class WaitcntBrackets {
VgprUB = std::max(VgprUB, GprNo);
VgprScores[T][GprNo] = Val;
} else {
- assert(T == LGKM_CNT);
+ assert(T == SmemAccessCounter);
SgprUB = std::max(SgprUB, GprNo - NUM_ALL_VGPRS);
SgprScores[GprNo - NUM_ALL_VGPRS] = Val;
}
@@ -342,8 +396,11 @@ class WaitcntBrackets {
unsigned OpNo, unsigned Val);
const GCNSubtarget *ST = nullptr;
+ InstCounterType MaxCounter = NUM_EXTENDED_INST_CNTS;
HardwareLimits Limits = {};
RegisterEncoding Encoding = {};
+ const unsigned *WaitEventMaskForInst;
+ InstCounterType SmemAccessCounter;
unsigned ScoreLBs[NUM_INST_CNTS] = {0};
unsigned ScoreUBs[NUM_INST_CNTS] = {0};
unsigned PendingEvents = 0;
@@ -354,20 +411,139 @@ class WaitcntBrackets {
int VgprUB = -1;
int SgprUB = -1;
unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}};
- // Wait cnt scores for every sgpr, only lgkmcnt is relevant.
+ // Wait cnt scores for every sgpr, only DS_CNT (corresponding to LGKMcnt
+ // pre-gfx12) or KM_CNT (gfx12+ only) are relevant.
unsigned SgprScores[SQ_MAX_PGM_SGPRS] = {0};
// Bitmask of the VmemTypes of VMEM instructions that might have a pending
// write to each vgpr.
unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0};
};
+// This abstracts the logic for generating and updating S_WAIT* instructions
+// away from the analysis that determines where they are needed. This was
+// done because the set of counters and instructions for waiting on them
+// underwent a major shift with gfx12, sufficiently so that having this
+// abstraction allows the main analysis logic to be simpler than it would
+// otherwise have had to become.
+class WaitcntGenerator {
+protected:
+ const GCNSubtarget *ST = nullptr;
+ const SIInstrInfo *TII = nullptr;
+ AMDGPU::IsaVersion IV;
+ InstCounterType MaxCounter;
+
+public:
+ WaitcntGenerator() {}
+ WaitcntGenerator(const GCNSubtarget *ST, InstCounterType MaxCounter)
+ : ST(ST), TII(ST->getInstrInfo()),
+ IV(AMDGPU::getIsaVersion(ST->getCPU())), MaxCounter(MaxCounter) {}
+
+ // Edits an existing sequence of wait count instructions according
+ // to an incoming Waitcnt value, which is itself updated to reflect
+ // any new wait count instructions which may need to be generated by
+ // WaitcntGenerator::createNewWaitcnt(). It will return true if any edits
+ // were made.
+ //
+ // This editing will usually merely update operands, but it may also
+ // delete instructions if the incoming Wait value indicates they are not
+ // needed. It may also remove existing instructions for which a wait
+ // is needed if it can be determined that it is better to generate new
+ // instructions later, as can happen on gfx12.
+ virtual bool
+ applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
+ MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
+ MachineBasicBlock::instr_iterator It) const = 0;
+
+ // Transform a soft waitcnt into a normal one.
+ bool promoteSoftWaitCnt(MachineInstr *Waitcnt) const;
+
+ // Generates new wait count instructions according to the value of
+ // Wait, returning true if any new instructions were created.
+ virtual bool createNewWaitcnt(MachineBasicBlock &Block,
+ MachineBasicBlock::instr_iterator It,
+ AMDGPU::Waitcnt Wait) = 0;
+
+ // Returns an array of bit masks which can be used to map values in
+ // WaitEventType to corresponding counter values in InstCounterType.
+ virtual const unsigned *getWaitEventMask() const = 0;
+
+ virtual ~WaitcntGenerator() = default;
+};
+
+class WaitcntGeneratorPreGFX12 : public WaitcntGenerator {
+public:
+ WaitcntGeneratorPreGFX12() {}
+ WaitcntGeneratorPreGFX12(const GCNSubtarget *ST)
+ : WaitcntGenerator(ST, NUM_NORMAL_INST_CNTS) {}
+
+ bool
+ applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
+ MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
+ MachineBasicBlock::instr_iterator It) const override;
+
+ bool createNewWaitcnt(MachineBasicBlock &Block,
+ MachineBasicBlock::instr_iterator It,
+ AMDGPU::Waitcnt Wait) override;
+
+ const unsigned *getWaitEventMask() const override {
+ assert(ST);
+
+ static const unsigned WaitEventMaskForInstPreGFX12[NUM_INST_CNTS] = {
+ (1 << VMEM_ACCESS) | (1 << VMEM_READ_ACCESS) |
+ (1 << VMEM_SAMPLER_READ_ACCESS) | (1 << VMEM_BVH_READ_ACCESS),
+ (1 << SMEM_ACCESS) | (1 << LDS_ACCESS) | (1 << GDS_ACCESS) |
+ (1 << SQ_MESSAGE),
+ (1 << EXP_GPR_LOCK) | (1 << GDS_GPR_LOCK) | (1 << VMW_GPR_LOCK) |
+ (1 << EXP_PARAM_ACCESS) | (1 << EXP_POS_ACCESS) |
+ (1 << EXP_LDS_ACCESS),
+ (1 << VMEM_WRITE_ACCESS) | (1 << SCRATCH_WRITE_ACCESS),
+ 0,
+ 0,
+ 0};
+
+ return WaitEventMaskForInstPreGFX12;
+ }
+};
+
+class WaitcntGeneratorGFX12Plus : public WaitcntGenerator {
+public:
+ WaitcntGeneratorGFX12Plus() {}
+ WaitcntGeneratorGFX12Plus(const GCNSubtarget *ST, InstCounterType MaxCounter)
+ : WaitcntGenerator(ST, MaxCounter) {}
+
+ bool
+ applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
+ MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
+ MachineBasicBlock::instr_iterator It) const override;
+
+ bool createNewWaitcnt(MachineBasicBlock &Block,
+ MachineBasicBlock::instr_iterator It,
+ AMDGPU::Waitcnt Wait) override;
+
+ const unsigned *getWaitEventMask() const override {
+ assert(ST);
+
+ static const unsigned WaitEventMaskForInstGFX12Plus[NUM_INST_CNTS] = {
+ (1 << VMEM_ACCESS) | (1 << VMEM_READ_ACCESS),
+ (1 << LDS_ACCESS) | (1 << GDS_ACCESS),
+ (1 << EXP_GPR_LOCK) | (1 << GDS_GPR_LOCK) | (1 << VMW_GPR_LOCK) |
+ (1 << EXP_PARAM_ACCESS) | (1 << EXP_POS_ACCESS) |
+ (1 << EXP_LDS_ACCESS),
+ (1 << VMEM_WRITE_ACCESS) | (1 << SCRATCH_WRITE_ACCESS),
+ (1 << VMEM_SAMPLER_READ_ACCESS),
+ (1 << VMEM_BVH_READ_ACCESS),
+ (1 << SMEM_ACCESS) | (1 << SQ_MESSAGE)};
+
+ return WaitEventMaskForInstGFX12Plus;
+ }
+};
+
class SIInsertWaitcnts : public MachineFunctionPass {
private:
const GCNSubtarget *ST = nullptr;
const SIInstrInfo *TII = nullptr;
const SIRegisterInfo *TRI = nullptr;
const MachineRegisterInfo *MRI = nullptr;
- AMDGPU::IsaVersion IV;
DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
DenseMap<MachineBasicBlock *, bool> PreheadersToFlush;
@@ -379,6 +555,8 @@ class SIInsertWaitcnts : public MachineFunctionPass {
bool Dirty = true;
};
+ InstCounterType SmemAccessCounter;
+
MapVector<MachineBasicBlock *, BlockInfo> BlockInfos;
// ForceEmitZeroWaitcnts: force all waitcnts insts to be s_waitcnt 0
@@ -388,10 +566,20 @@ class SIInsertWaitcnts : public MachineFunctionPass {
bool OptNone;
+ // In any given run of this pass, WCG will point to one of these two
+ // generator objects, which must have been re-initialised before use
+ // from a value constructed with the current subtarget.
+ WaitcntGeneratorPreGFX12 WCGPreGFX12;
+ WaitcntGeneratorGFX12Plus WCGGFX12Plus;
+
+ WaitcntGenerator *WCG = nullptr;
+
// S_ENDPGM instructions before which we should insert a DEALLOC_VGPRS
// message.
DenseSet<MachineInstr *> ReleaseVGPRInsts;
+ InstCounterType MaxCounter = NUM_NORMAL_INST_CNTS;
+
public:
static char ID;
@@ -438,16 +626,22 @@ class SIInsertWaitcnts : public MachineFunctionPass {
if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
DebugCounter::shouldExecute(ForceLgkmCounter)) {
- ForceEmitWaitcnt[LGKM_CNT] = true;
+ ForceEmitWaitcnt[DS_CNT] = true;
+ ForceEmitWaitcnt[KM_CNT] = true;
} else {
- ForceEmitWaitcnt[LGKM_CNT] = false;
+ ForceEmitWaitcnt[DS_CNT] = false;
+ ForceEmitWaitcnt[KM_CNT] = false;
}
if (DebugCounter::isCounterSet(ForceVMCounter) &&
DebugCounter::shouldExecute(ForceVMCounter)) {
- ForceEmitWaitcnt[VM_CNT] = true;
+ ForceEmitWaitcnt[LOAD_CNT] = true;
+ ForceEmitWaitcnt[SAMPLE_CNT] = ...
[truncated]
``````````
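
One detail worth noting in the refactored helpers above (getCounterRef/addWait): wait requirements merge with std::min because a wait operand is an upper bound on how many operations may still be outstanding, so the smaller value is the stricter requirement. A hedged, self-contained sketch (the field names mirror AMDGPU::Waitcnt, but this is not the patch itself):

```cpp
#include <algorithm>
#include <cassert>

// Stand-in for AMDGPU::Waitcnt with a few of the new per-counter fields;
// ~0u means "no wait required on this counter".
struct Waitcnt {
  unsigned LoadCnt = ~0u;
  unsigned SampleCnt = ~0u;
  unsigned DsCnt = ~0u;
};

// As in the patch's addWait(): merging two requirements keeps the smaller
// (stricter) bound, since waiting for <=N outstanding ops subsumes <=M
// whenever N <= M.
static void addWait(unsigned &Counter, unsigned Count) {
  Counter = std::min(Counter, Count);
}

int main() {
  Waitcnt Wait;
  addWait(Wait.LoadCnt, 2); // tolerate 2 outstanding loads
  addWait(Wait.LoadCnt, 0); // later requirement: all loads done
  assert(Wait.LoadCnt == 0);     // the stricter bound wins
  assert(Wait.SampleCnt == ~0u); // untouched counter: no wait emitted
}
```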
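Finally, the pass now dispatches between WaitcntGeneratorPreGFX12 and WaitcntGeneratorGFX12Plus based on GCNSubtarget::hasExtendedWaitCounts(), and the GFX12+ generator picks one dedicated wait opcode per counter from a lookup table. A sketch of that table (the opcode names here are placeholders for the tablegen-generated AMDGPU::S_WAIT_* enumerators):

```cpp
#include <cassert>

enum InstCounterType { LOAD_CNT, DS_CNT, EXP_CNT, STORE_CNT,
                       SAMPLE_CNT, BVH_CNT, KM_CNT, NUM_EXTENDED_INST_CNTS };

// Placeholder opcode names standing in for AMDGPU::S_WAIT_* enumerators.
enum Opcode { S_WAIT_LOADCNT, S_WAIT_DSCNT, S_WAIT_EXPCNT, S_WAIT_STORECNT,
              S_WAIT_SAMPLECNT, S_WAIT_BVHCNT, S_WAIT_KMCNT };

// Mirrors instrsForExtendedCounterTypes from the diff: on GFX12+ each counter
// has a dedicated wait instruction, so selecting one is an array lookup
// instead of packing several fields into a single S_WAITCNT immediate.
static const Opcode WaitOpcodeFor[NUM_EXTENDED_INST_CNTS] = {
    S_WAIT_LOADCNT, S_WAIT_DSCNT, S_WAIT_EXPCNT, S_WAIT_STORECNT,
    S_WAIT_SAMPLECNT, S_WAIT_BVHCNT, S_WAIT_KMCNT};

int main() {
  // WaitcntGeneratorGFX12Plus would emit one such instruction per counter
  // that has a finite wait requirement.
  assert(WaitOpcodeFor[SAMPLE_CNT] == S_WAIT_SAMPLECNT);
  assert(WaitOpcodeFor[KM_CNT] == S_WAIT_KMCNT);
}
```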
https://github.com/llvm/llvm-project/pull/77438