[llvm] [NFC][AMDGPU] print more info when debugging InsertWaitCnts pass (PR #144629)

Wed Jun 18 23:17:53 PDT 2025

https://github.com/ssahasra updated https://github.com/llvm/llvm-project/pull/144629

>From 64fc16f2accb433f6947ec88860e58facc780a98 Mon Sep 17 00:00:00 2001
From: Sameer Sahasrabuddhe <sameer.sahasrabuddhe at amd.com>
Date: Wed, 11 Jun 2025 11:27:26 +0530
Subject: [PATCH 1/4] [NFC][AMDGPU] print more info when debugging
 InsertWaitCnts pass

---
 .../lib/Target/AMDGPU/AMDGPUWaitEventType.def | 32 ++++++++
 llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp   | 74 +++++++++++++------
 2 files changed, 84 insertions(+), 22 deletions(-)
 create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUWaitEventType.def

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUWaitEventType.def b/llvm/lib/Target/AMDGPU/AMDGPUWaitEventType.def
new file mode 100644
index 0000000000000..271db53c2801d
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUWaitEventType.def
@@ -0,0 +1,32 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// An enumeration of all the event types handled by SIInsertWaitcnts.cpp
+//
+//===----------------------------------------------------------------------===//
+
+// NOTE: NO INCLUDE GUARD DESIRED!
+
+AMDGPU_WAIT_EVENT(VMEM_ACCESS)              // vector-memory read & write
+AMDGPU_WAIT_EVENT(VMEM_READ_ACCESS)         // vector-memory read
+AMDGPU_WAIT_EVENT(VMEM_SAMPLER_READ_ACCESS) // vector-memory SAMPLER read (gfx12+ only)
+AMDGPU_WAIT_EVENT(VMEM_BVH_READ_ACCESS)     // vector-memory BVH read (gfx12+ only)
+AMDGPU_WAIT_EVENT(VMEM_WRITE_ACCESS)        // vector-memory write that is not scratch
+AMDGPU_WAIT_EVENT(SCRATCH_WRITE_ACCESS)     // vector-memory write that may be scratch
+AMDGPU_WAIT_EVENT(LDS_ACCESS)               // lds read & write
+AMDGPU_WAIT_EVENT(GDS_ACCESS)               // gds read & write
+AMDGPU_WAIT_EVENT(SQ_MESSAGE)               // send message
+AMDGPU_WAIT_EVENT(SMEM_ACCESS)              // scalar-memory read & write
+AMDGPU_WAIT_EVENT(EXP_GPR_LOCK)             // export holding on its data src
+AMDGPU_WAIT_EVENT(GDS_GPR_LOCK)             // GDS holding on its data and addr src
+AMDGPU_WAIT_EVENT(EXP_POS_ACCESS)           // write to export position
+AMDGPU_WAIT_EVENT(EXP_PARAM_ACCESS)         // write to export parameter
+AMDGPU_WAIT_EVENT(VMW_GPR_LOCK)             // vector-memory write holding on its data src
+AMDGPU_WAIT_EVENT(EXP_LDS_ACCESS)           // read by ldsdir counting as export
+
+#undef AMDGPU_WAIT_EVENT
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index ca8e3244edd15..03a2dc0302780 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -104,24 +104,17 @@ struct HardwareLimits {
   unsigned KmcntMax;     // gfx12+ only.
 };
 
+#define AMDGPU_WAIT_EVENT(Name) Name,
+
 enum WaitEventType {
-  VMEM_ACCESS,              // vector-memory read & write
-  VMEM_READ_ACCESS,         // vector-memory read
-  VMEM_SAMPLER_READ_ACCESS, // vector-memory SAMPLER read (gfx12+ only)
-  VMEM_BVH_READ_ACCESS,     // vector-memory BVH read (gfx12+ only)
-  VMEM_WRITE_ACCESS,        // vector-memory write that is not scratch
-  SCRATCH_WRITE_ACCESS,     // vector-memory write that may be scratch
-  LDS_ACCESS,               // lds read & write
-  GDS_ACCESS,               // gds read & write
-  SQ_MESSAGE,               // send message
-  SMEM_ACCESS,              // scalar-memory read & write
-  EXP_GPR_LOCK,             // export holding on its data src
-  GDS_GPR_LOCK,             // GDS holding on its data and addr src
-  EXP_POS_ACCESS,           // write to export position
-  EXP_PARAM_ACCESS,         // write to export parameter
-  VMW_GPR_LOCK,             // vector-memory write holding on its data src
-  EXP_LDS_ACCESS,           // read by ldsdir counting as export
-  NUM_WAIT_EVENTS,
+#include "AMDGPUWaitEventType.def"
+  NUM_WAIT_EVENTS
+};
+
+#define AMDGPU_WAIT_EVENT(Name) #Name,
+
+static constexpr StringLiteral WaitEventTypeName[] = {
+#include "AMDGPUWaitEventType.def"
 };
 
 // The mapping is:
@@ -1100,6 +1093,20 @@ void WaitcntBrackets::print(raw_ostream &OS) const {
     }
     OS << '\n';
   }
+
+  OS << "Pending Events: ";
+  if (hasPendingEvent()) {
+    ListSeparator LS;
+    for (unsigned I = 0; I != NUM_WAIT_EVENTS; ++I) {
+      if (hasPendingEvent((WaitEventType)I)) {
+        OS << LS << WaitEventTypeName[I];
+      }
+    }
+  } else {
+    OS << "none";
+  }
+  OS << '\n';
+
   OS << '\n';
 }
 
@@ -1265,10 +1272,15 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
   MachineInstr *WaitcntInstr = nullptr;
   MachineInstr *WaitcntVsCntInstr = nullptr;
 
+  LLVM_DEBUG(dbgs() << "PreGFX12::applyPreexistingWaitcnt at: " << *It << "\n");
+
   for (auto &II :
        make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) {
-    if (II.isMetaInstruction())
+    LLVM_DEBUG(dbgs() << "pre-existing iter: " << II << "\n");
+    if (II.isMetaInstruction()) {
+      LLVM_DEBUG(dbgs() << "------ skipped\n");
       continue;
+    }
 
     unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
     bool TrySimplify = Opcode != II.getOpcode() && !OptNone;
@@ -1413,10 +1425,16 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
   MachineInstr *CombinedStoreDsCntInstr = nullptr;
   MachineInstr *WaitInstrs[NUM_EXTENDED_INST_CNTS] = {};
 
+  LLVM_DEBUG(dbgs() << "GFX12Plus::applyPreexistingWaitcnt at: " << *It
+                    << "\n");
+
   for (auto &II :
        make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) {
-    if (II.isMetaInstruction())
+    LLVM_DEBUG(dbgs() << "pre-existing iter: " << II << "\n");
+    if (II.isMetaInstruction()) {
+      LLVM_DEBUG(dbgs() << "------ skipped\n");
       continue;
+    }
 
     MachineInstr **UpdatableInstr;
 
@@ -2306,7 +2324,9 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
   bool Modified = false;
 
   LLVM_DEBUG({
-    dbgs() << "*** Block" << Block.getNumber() << " ***";
+    dbgs() << "*** Block " << Block.getNumber() << ": ";
+    Block.printName(dbgs());
+    dbgs() << " ***";
     ScoreBrackets.dump();
   });
 
@@ -2437,6 +2457,12 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
   Modified |= generateWaitcnt(Wait, Block.instr_end(), Block, ScoreBrackets,
                               OldWaitcntInstr);
 
+  LLVM_DEBUG({
+    dbgs() << "*** Block end state: " << Block.getNumber() << ": ";
+    Block.printName(dbgs());
+    ScoreBrackets.dump();
+  });
+
   return Modified;
 }
 
@@ -2699,8 +2725,10 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
           BlockInfo &SuccBI = SuccBII->second;
           if (!SuccBI.Incoming) {
             SuccBI.Dirty = true;
-            if (SuccBII <= BII)
+            if (SuccBII <= BII) {
+              LLVM_DEBUG(dbgs() << "repeat on backedge\n");
               Repeat = true;
+            }
             if (!MoveBracketsToSucc) {
               MoveBracketsToSucc = &SuccBI;
             } else {
@@ -2708,8 +2736,10 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
             }
           } else if (SuccBI.Incoming->merge(*Brackets)) {
             SuccBI.Dirty = true;
-            if (SuccBII <= BII)
+            if (SuccBII <= BII) {
+              LLVM_DEBUG(dbgs() << "repeat on backedge\n");
               Repeat = true;
+            }
           }
         }
         if (MoveBracketsToSucc)

>From c96c9f73509cd4f36e5607fb6e774495780afeb5 Mon Sep 17 00:00:00 2001
From: Sameer Sahasrabuddhe <sameer.sahasrabuddhe at amd.com>
Date: Wed, 18 Jun 2025 11:58:00 +0530
Subject: [PATCH 2/4] minor cleanups

---
 llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 03a2dc0302780..cda843ead94cf 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -2324,9 +2324,8 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
   bool Modified = false;
 
   LLVM_DEBUG({
-    dbgs() << "*** Block " << Block.getNumber() << ": ";
+    dbgs() << "*** Begin Block: ";
     Block.printName(dbgs());
-    dbgs() << " ***";
     ScoreBrackets.dump();
   });
 
@@ -2458,7 +2457,7 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
                               OldWaitcntInstr);
 
   LLVM_DEBUG({
-    dbgs() << "*** Block end state: " << Block.getNumber() << ": ";
+    dbgs() << "*** End Block: ";
     Block.printName(dbgs());
     ScoreBrackets.dump();
   });

>From 826fcae60a91e6e504d7b0c92e30288a7ba8b07e Mon Sep 17 00:00:00 2001
From: Sameer Sahasrabuddhe <sameer.sahasrabuddhe at amd.com>
Date: Thu, 19 Jun 2025 11:24:45 +0530
Subject: [PATCH 3/4] whitespace cleanup and a local macro

---
 .../lib/Target/AMDGPU/AMDGPUWaitEventType.def | 32 ----------
 llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp   | 63 ++++++++++++-------
 2 files changed, 40 insertions(+), 55 deletions(-)
 delete mode 100644 llvm/lib/Target/AMDGPU/AMDGPUWaitEventType.def

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUWaitEventType.def b/llvm/lib/Target/AMDGPU/AMDGPUWaitEventType.def
deleted file mode 100644
index 271db53c2801d..0000000000000
--- a/llvm/lib/Target/AMDGPU/AMDGPUWaitEventType.def
+++ /dev/null
@@ -1,32 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// An enumeration of all the event types handled by SIInsertWaitcnts.cpp
-//
-//===----------------------------------------------------------------------===//
-
-// NOTE: NO INCLUDE GUARD DESIRED!
-
-AMDGPU_WAIT_EVENT(VMEM_ACCESS)              // vector-memory read & write
-AMDGPU_WAIT_EVENT(VMEM_READ_ACCESS)         // vector-memory read
-AMDGPU_WAIT_EVENT(VMEM_SAMPLER_READ_ACCESS) // vector-memory SAMPLER read (gfx12+ only)
-AMDGPU_WAIT_EVENT(VMEM_BVH_READ_ACCESS)     // vector-memory BVH read (gfx12+ only)
-AMDGPU_WAIT_EVENT(VMEM_WRITE_ACCESS)        // vector-memory write that is not scratch
-AMDGPU_WAIT_EVENT(SCRATCH_WRITE_ACCESS)     // vector-memory write that may be scratch
-AMDGPU_WAIT_EVENT(LDS_ACCESS)               // lds read & write
-AMDGPU_WAIT_EVENT(GDS_ACCESS)               // gds read & write
-AMDGPU_WAIT_EVENT(SQ_MESSAGE)               // send message
-AMDGPU_WAIT_EVENT(SMEM_ACCESS)              // scalar-memory read & write
-AMDGPU_WAIT_EVENT(EXP_GPR_LOCK)             // export holding on its data src
-AMDGPU_WAIT_EVENT(GDS_GPR_LOCK)             // GDS holding on its data and addr src
-AMDGPU_WAIT_EVENT(EXP_POS_ACCESS)           // write to export position
-AMDGPU_WAIT_EVENT(EXP_PARAM_ACCESS)         // write to export parameter
-AMDGPU_WAIT_EVENT(VMW_GPR_LOCK)             // vector-memory write holding on its data src
-AMDGPU_WAIT_EVENT(EXP_LDS_ACCESS)           // read by ldsdir counting as export
-
-#undef AMDGPU_WAIT_EVENT
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index cda843ead94cf..f41a3dd612039 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -104,18 +104,36 @@ struct HardwareLimits {
   unsigned KmcntMax;     // gfx12+ only.
 };
 
-#define AMDGPU_WAIT_EVENT(Name) Name,
-
+#define AMDGPU_DECLARE_WAIT_EVENTS(DECL)                                    \
+  DECL(VMEM_ACCESS)              /* vmem read & write */                    \
+  DECL(VMEM_READ_ACCESS)         /* vmem read */                            \
+  DECL(VMEM_SAMPLER_READ_ACCESS) /* vmem SAMPLER read (gfx12+ only) */      \
+  DECL(VMEM_BVH_READ_ACCESS)     /* vmem BVH read (gfx12+ only) */          \
+  DECL(VMEM_WRITE_ACCESS)        /* vmem write that is not scratch */       \
+  DECL(SCRATCH_WRITE_ACCESS)     /* vmem write that may be scratch */       \
+  DECL(LDS_ACCESS)               /* lds read & write */                     \
+  DECL(GDS_ACCESS)               /* gds read & write */                     \
+  DECL(SQ_MESSAGE)               /* send message */                         \
+  DECL(SMEM_ACCESS)              /* scalar-memory read & write */           \
+  DECL(EXP_GPR_LOCK)             /* export holding on its data src */       \
+  DECL(GDS_GPR_LOCK)             /* GDS holding on its data and addr src */ \
+  DECL(EXP_POS_ACCESS)           /* write to export position */             \
+  DECL(EXP_PARAM_ACCESS)         /* write to export parameter */            \
+  DECL(VMW_GPR_LOCK)             /* vmem write holding on its data src */   \
+  DECL(EXP_LDS_ACCESS)           /* read by ldsdir counting as export */
+
+#define AMDGPU_EVENT_ENUM(Name) Name,
 enum WaitEventType {
-#include "AMDGPUWaitEventType.def"
+  AMDGPU_DECLARE_WAIT_EVENTS(AMDGPU_EVENT_ENUM)
   NUM_WAIT_EVENTS
 };
+#undef AMDGPU_EVENT_ENUM
 
-#define AMDGPU_WAIT_EVENT(Name) #Name,
-
+#define AMDGPU_EVENT_NAME(Name) #Name,
 static constexpr StringLiteral WaitEventTypeName[] = {
-#include "AMDGPUWaitEventType.def"
+  AMDGPU_DECLARE_WAIT_EVENTS(AMDGPU_EVENT_NAME)
 };
+#undef AMDGPU_EVENT_NAME
 
 // The mapping is:
 //  0                .. SQ_MAX_PGM_VGPRS-1               real VGPRs
@@ -1272,13 +1290,13 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
   MachineInstr *WaitcntInstr = nullptr;
   MachineInstr *WaitcntVsCntInstr = nullptr;
 
-  LLVM_DEBUG(dbgs() << "PreGFX12::applyPreexistingWaitcnt at: " << *It << "\n");
+  LLVM_DEBUG(dbgs() << "PreGFX12::applyPreexistingWaitcnt at: " << *It);
 
   for (auto &II :
        make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) {
-    LLVM_DEBUG(dbgs() << "pre-existing iter: " << II << "\n");
+    LLVM_DEBUG(dbgs() << "pre-existing iter: " << II);
     if (II.isMetaInstruction()) {
-      LLVM_DEBUG(dbgs() << "------ skipped\n");
+      LLVM_DEBUG(dbgs() << "skipped meta instruction\n");
       continue;
     }
 
@@ -1332,9 +1350,9 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
 
     LLVM_DEBUG(It == WaitcntInstr->getParent()->end()
                    ? dbgs()
-                         << "applyPreexistingWaitcnt\n"
+                         << "applied pre-existing waitcnt\n"
                          << "New Instr at block end: " << *WaitcntInstr << '\n'
-                   : dbgs() << "applyPreexistingWaitcnt\n"
+                   : dbgs() << "applied pre-existing waitcnt\n"
                             << "Old Instr: " << *It
                             << "New Instr: " << *WaitcntInstr << '\n');
   }
@@ -1348,10 +1366,10 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
     Wait.StoreCnt = ~0u;
 
     LLVM_DEBUG(It == WaitcntVsCntInstr->getParent()->end()
-                   ? dbgs() << "applyPreexistingWaitcnt\n"
+                   ? dbgs() << "applied pre-existing waitcnt\n"
                             << "New Instr at block end: " << *WaitcntVsCntInstr
                             << '\n'
-                   : dbgs() << "applyPreexistingWaitcnt\n"
+                   : dbgs() << "applied pre-existing waitcnt\n"
                             << "Old Instr: " << *It
                             << "New Instr: " << *WaitcntVsCntInstr << '\n');
   }
@@ -1425,14 +1443,13 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
   MachineInstr *CombinedStoreDsCntInstr = nullptr;
   MachineInstr *WaitInstrs[NUM_EXTENDED_INST_CNTS] = {};
 
-  LLVM_DEBUG(dbgs() << "GFX12Plus::applyPreexistingWaitcnt at: " << *It
-                    << "\n");
+  LLVM_DEBUG(dbgs() << "GFX12Plus::applyPreexistingWaitcnt at: " << *It);
 
   for (auto &II :
        make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) {
-    LLVM_DEBUG(dbgs() << "pre-existing iter: " << II << "\n");
+    LLVM_DEBUG(dbgs() << "pre-existing iter: " << II);
     if (II.isMetaInstruction()) {
-      LLVM_DEBUG(dbgs() << "------ skipped\n");
+      LLVM_DEBUG(dbgs() << "skipped meta instruction\n");
       continue;
     }
 
@@ -1504,10 +1521,10 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
       Wait.DsCnt = ~0u;
 
       LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
-                     ? dbgs() << "applyPreexistingWaitcnt\n"
+                     ? dbgs() << "applied pre-existing waitcnt\n"
                               << "New Instr at block end: "
                               << *CombinedLoadDsCntInstr << '\n'
-                     : dbgs() << "applyPreexistingWaitcnt\n"
+                     : dbgs() << "applied pre-existing waitcnt\n"
                               << "Old Instr: " << *It << "New Instr: "
                               << *CombinedLoadDsCntInstr << '\n');
     } else {
@@ -1529,10 +1546,10 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
       Wait.DsCnt = ~0u;
 
       LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
-                     ? dbgs() << "applyPreexistingWaitcnt\n"
+                     ? dbgs() << "applied pre-existing waitcnt\n"
                               << "New Instr at block end: "
                               << *CombinedStoreDsCntInstr << '\n'
-                     : dbgs() << "applyPreexistingWaitcnt\n"
+                     : dbgs() << "applied pre-existing waitcnt\n"
                               << "Old Instr: " << *It << "New Instr: "
                               << *CombinedStoreDsCntInstr << '\n');
     } else {
@@ -1588,10 +1605,10 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
       setNoWait(Wait, CT);
 
       LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
-                     ? dbgs() << "applyPreexistingWaitcnt\n"
+                     ? dbgs() << "applied pre-existing waitcnt\n"
                               << "New Instr at block end: " << *WaitInstrs[CT]
                               << '\n'
-                     : dbgs() << "applyPreexistingWaitcnt\n"
+                     : dbgs() << "applied pre-existing waitcnt\n"
                               << "Old Instr: " << *It
                               << "New Instr: " << *WaitInstrs[CT] << '\n');
     } else {

>From 7435a892ff9e306207c22228f96d350d7a08621e Mon Sep 17 00:00:00 2001
From: Sameer Sahasrabuddhe <sameer.sahasrabuddhe at amd.com>
Date: Thu, 19 Jun 2025 11:47:02 +0530
Subject: [PATCH 4/4] clang-format

---
 llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 34 +++++++++++----------
 1 file changed, 18 insertions(+), 16 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index f41a3dd612039..f7b88bf2d5ebc 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -104,24 +104,25 @@ struct HardwareLimits {
   unsigned KmcntMax;     // gfx12+ only.
 };
 
-#define AMDGPU_DECLARE_WAIT_EVENTS(DECL)                                    \
-  DECL(VMEM_ACCESS)              /* vmem read & write */                    \
-  DECL(VMEM_READ_ACCESS)         /* vmem read */                            \
-  DECL(VMEM_SAMPLER_READ_ACCESS) /* vmem SAMPLER read (gfx12+ only) */      \
-  DECL(VMEM_BVH_READ_ACCESS)     /* vmem BVH read (gfx12+ only) */          \
-  DECL(VMEM_WRITE_ACCESS)        /* vmem write that is not scratch */       \
-  DECL(SCRATCH_WRITE_ACCESS)     /* vmem write that may be scratch */       \
-  DECL(LDS_ACCESS)               /* lds read & write */                     \
-  DECL(GDS_ACCESS)               /* gds read & write */                     \
-  DECL(SQ_MESSAGE)               /* send message */                         \
-  DECL(SMEM_ACCESS)              /* scalar-memory read & write */           \
-  DECL(EXP_GPR_LOCK)             /* export holding on its data src */       \
-  DECL(GDS_GPR_LOCK)             /* GDS holding on its data and addr src */ \
-  DECL(EXP_POS_ACCESS)           /* write to export position */             \
-  DECL(EXP_PARAM_ACCESS)         /* write to export parameter */            \
-  DECL(VMW_GPR_LOCK)             /* vmem write holding on its data src */   \
+#define AMDGPU_DECLARE_WAIT_EVENTS(DECL)                                       \
+  DECL(VMEM_ACCESS)              /* vmem read & write */                       \
+  DECL(VMEM_READ_ACCESS)         /* vmem read */                               \
+  DECL(VMEM_SAMPLER_READ_ACCESS) /* vmem SAMPLER read (gfx12+ only) */         \
+  DECL(VMEM_BVH_READ_ACCESS)     /* vmem BVH read (gfx12+ only) */             \
+  DECL(VMEM_WRITE_ACCESS)        /* vmem write that is not scratch */          \
+  DECL(SCRATCH_WRITE_ACCESS)     /* vmem write that may be scratch */          \
+  DECL(LDS_ACCESS)               /* lds read & write */                        \
+  DECL(GDS_ACCESS)               /* gds read & write */                        \
+  DECL(SQ_MESSAGE)               /* send message */                            \
+  DECL(SMEM_ACCESS)              /* scalar-memory read & write */              \
+  DECL(EXP_GPR_LOCK)             /* export holding on its data src */          \
+  DECL(GDS_GPR_LOCK)             /* GDS holding on its data and addr src */    \
+  DECL(EXP_POS_ACCESS)           /* write to export position */                \
+  DECL(EXP_PARAM_ACCESS)         /* write to export parameter */               \
+  DECL(VMW_GPR_LOCK)             /* vmem write holding on its data src */      \
   DECL(EXP_LDS_ACCESS)           /* read by ldsdir counting as export */
 
+// clang-format off
 #define AMDGPU_EVENT_ENUM(Name) Name,
 enum WaitEventType {
   AMDGPU_DECLARE_WAIT_EVENTS(AMDGPU_EVENT_ENUM)
@@ -134,6 +135,7 @@ static constexpr StringLiteral WaitEventTypeName[] = {
   AMDGPU_DECLARE_WAIT_EVENTS(AMDGPU_EVENT_NAME)
 };
 #undef AMDGPU_EVENT_NAME
+// clang-format on
 
 // The mapping is:
 //  0                .. SQ_MAX_PGM_VGPRS-1               real VGPRs