[llvm] [AMDGPU] Add a debug option `-amdgpu-snop-padding` for `GCNHazardRecognizer` (PR #146587)

Tue Jul 1 11:20:27 PDT 2025

https://github.com/shiltian created https://github.com/llvm/llvm-project/pull/146587

This can help to identify whether there are potential hazards.

Co-authored-by: Byrnes, Jeffrey <Jeffrey.Byrnes at amd.com>

>From dc5bb66a57f7e0ed2b255a27e9b932c4493313cd Mon Sep 17 00:00:00 2001
From: Shilei Tian <i at tianshilei.me>
Date: Tue, 1 Jul 2025 14:16:17 -0400
Subject: [PATCH] [AMDGPU] Add a debug option `-amdgpu-snop-padding` for
 `GCNHazardRecognizer`

This can help to identify whether there are potential hazards.

Co-authored-by: Byrnes, Jeffrey <Jeffrey.Byrnes at amd.com>
---
 .../lib/Target/AMDGPU/GCNHazardRecognizer.cpp |   8 +-
 .../CodeGen/AMDGPU/amdgpu-snop-padding.mir    | 132 ++++++++++++++++++
 2 files changed, 139 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-snop-padding.mir

diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index bc95d3f040e1d..f7f24c86948c4 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -44,6 +44,11 @@ static cl::opt<unsigned, false, MFMAPaddingRatioParser>
                      cl::desc("Fill a percentage of the latency between "
                               "neighboring MFMA with s_nops."));
 
+// Debug-only knob: pads every instruction with s_nops to help expose hazards.
+static cl::opt<unsigned>
+    NopPadding("amdgpu-snop-padding", cl::Hidden,
+               cl::desc("Insert an s_nop x before every instruction"));
+
 //===----------------------------------------------------------------------===//
 // Hazard Recognizer Implementation
 //===----------------------------------------------------------------------===//
@@ -300,7 +305,8 @@ unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
   unsigned W = PreEmitNoopsCommon(MI);
   fixHazards(MI);
   CurrCycleInstr = nullptr;
-  return W;
+  unsigned NopPad = NopPadding.getNumOccurrences() ? NopPadding : 0;
+  return std::max(W, NopPad);
 }
 
 unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-snop-padding.mir b/llvm/test/CodeGen/AMDGPU/amdgpu-snop-padding.mir
new file mode 100644
index 0000000000000..22c913496b734
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-snop-padding.mir
@@ -0,0 +1,132 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -amdgpu-snop-padding=8 -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=GCN8 %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -amdgpu-snop-padding=16 -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=GCN16 %s
+
+---
+name:            test_snop_padding
+tracksRegLiveness: true
+frameInfo:
+  maxAlignment:    4
+stack:
+  - { id: 0, type: spill-slot, size: 4, alignment: 4, stack-id: sgpr-spill }
+machineFunctionInfo:
+  isEntryFunction: false
+  scratchRSrcReg:  '$sgpr0_sgpr1_sgpr2_sgpr3'
+  stackPtrOffsetReg: '$sgpr32'
+  frameOffsetReg: '$sgpr33'
+  hasSpilledSGPRs: true
+body:             |
+  ; GCN8-LABEL: name: test_snop_padding
+  ; GCN8: bb.0:
+  ; GCN8-NEXT:   successors: %bb.1(0x80000000)
+  ; GCN8-NEXT:   liveins: $sgpr6, $sgpr10_sgpr11
+  ; GCN8-NEXT: {{  $}}
+  ; GCN8-NEXT:   S_NOP 7
+  ; GCN8-NEXT:   S_BRANCH %bb.1
+  ; GCN8-NEXT: {{  $}}
+  ; GCN8-NEXT: bb.1:
+  ; GCN8-NEXT:   successors: %bb.3(0x40000000), %bb.2(0x40000000)
+  ; GCN8-NEXT:   liveins: $sgpr6, $sgpr10_sgpr11
+  ; GCN8-NEXT: {{  $}}
+  ; GCN8-NEXT:   S_NOP 7
+  ; GCN8-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 10, implicit $exec
+  ; GCN8-NEXT:   S_NOP 7
+  ; GCN8-NEXT:   S_CBRANCH_EXECZ %bb.3, implicit $exec
+  ; GCN8-NEXT: {{  $}}
+  ; GCN8-NEXT: bb.2:
+  ; GCN8-NEXT:   successors: %bb.3(0x80000000)
+  ; GCN8-NEXT:   liveins: $sgpr6, $sgpr10_sgpr11
+  ; GCN8-NEXT: {{  $}}
+  ; GCN8-NEXT:   S_NOP 7
+  ; GCN8-NEXT:   SI_SPILL_S32_SAVE killed $sgpr6, %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32
+  ; GCN8-NEXT:   S_NOP 7
+  ; GCN8-NEXT:   S_NOP 0
+  ; GCN8-NEXT:   S_NOP 7
+  ; GCN8-NEXT:   renamable $sgpr6 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32
+  ; GCN8-NEXT:   S_NOP 7
+  ; GCN8-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 20, implicit $exec
+  ; GCN8-NEXT:   S_NOP 7
+  ; GCN8-NEXT:   S_BRANCH %bb.3
+  ; GCN8-NEXT: {{  $}}
+  ; GCN8-NEXT: bb.3:
+  ; GCN8-NEXT:   liveins: $sgpr10_sgpr11
+  ; GCN8-NEXT: {{  $}}
+  ; GCN8-NEXT:   S_NOP 7
+  ; GCN8-NEXT:   $sgpr5 = V_READFIRSTLANE_B32 [[V_MOV_B32_e32_]], implicit $exec
+  ; GCN8-NEXT:   S_NOP 7
+  ; GCN8-NEXT:   S_STORE_DWORD_IMM $sgpr5, $sgpr10_sgpr11, 0, 0
+  ; GCN8-NEXT:   S_NOP 7
+  ; GCN8-NEXT:   SI_RETURN
+  ;
+  ; GCN16-LABEL: name: test_snop_padding
+  ; GCN16: bb.0:
+  ; GCN16-NEXT:   successors: %bb.1(0x80000000)
+  ; GCN16-NEXT:   liveins: $sgpr6, $sgpr10_sgpr11
+  ; GCN16-NEXT: {{  $}}
+  ; GCN16-NEXT:   S_NOP 7
+  ; GCN16-NEXT:   S_NOP 7
+  ; GCN16-NEXT:   S_BRANCH %bb.1
+  ; GCN16-NEXT: {{  $}}
+  ; GCN16-NEXT: bb.1:
+  ; GCN16-NEXT:   successors: %bb.3(0x40000000), %bb.2(0x40000000)
+  ; GCN16-NEXT:   liveins: $sgpr6, $sgpr10_sgpr11
+  ; GCN16-NEXT: {{  $}}
+  ; GCN16-NEXT:   S_NOP 7
+  ; GCN16-NEXT:   S_NOP 7
+  ; GCN16-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 10, implicit $exec
+  ; GCN16-NEXT:   S_NOP 7
+  ; GCN16-NEXT:   S_NOP 7
+  ; GCN16-NEXT:   S_CBRANCH_EXECZ %bb.3, implicit $exec
+  ; GCN16-NEXT: {{  $}}
+  ; GCN16-NEXT: bb.2:
+  ; GCN16-NEXT:   successors: %bb.3(0x80000000)
+  ; GCN16-NEXT:   liveins: $sgpr6, $sgpr10_sgpr11
+  ; GCN16-NEXT: {{  $}}
+  ; GCN16-NEXT:   S_NOP 7
+  ; GCN16-NEXT:   S_NOP 7
+  ; GCN16-NEXT:   SI_SPILL_S32_SAVE killed $sgpr6, %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32
+  ; GCN16-NEXT:   S_NOP 7
+  ; GCN16-NEXT:   S_NOP 7
+  ; GCN16-NEXT:   S_NOP 0
+  ; GCN16-NEXT:   S_NOP 7
+  ; GCN16-NEXT:   S_NOP 7
+  ; GCN16-NEXT:   renamable $sgpr6 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32
+  ; GCN16-NEXT:   S_NOP 7
+  ; GCN16-NEXT:   S_NOP 7
+  ; GCN16-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 20, implicit $exec
+  ; GCN16-NEXT:   S_NOP 7
+  ; GCN16-NEXT:   S_NOP 7
+  ; GCN16-NEXT:   S_BRANCH %bb.3
+  ; GCN16-NEXT: {{  $}}
+  ; GCN16-NEXT: bb.3:
+  ; GCN16-NEXT:   liveins: $sgpr10_sgpr11
+  ; GCN16-NEXT: {{  $}}
+  ; GCN16-NEXT:   S_NOP 7
+  ; GCN16-NEXT:   S_NOP 7
+  ; GCN16-NEXT:   $sgpr5 = V_READFIRSTLANE_B32 [[V_MOV_B32_e32_]], implicit $exec
+  ; GCN16-NEXT:   S_NOP 7
+  ; GCN16-NEXT:   S_NOP 7
+  ; GCN16-NEXT:   S_STORE_DWORD_IMM $sgpr5, $sgpr10_sgpr11, 0, 0
+  ; GCN16-NEXT:   S_NOP 7
+  ; GCN16-NEXT:   S_NOP 7
+  ; GCN16-NEXT:   SI_RETURN
+  bb.0:
+    liveins: $sgpr6, $sgpr10_sgpr11
+    S_BRANCH %bb.1
+  bb.1:
+    liveins: $sgpr6, $sgpr10_sgpr11
+    %0:vgpr_32 = V_MOV_B32_e32 10, implicit $exec
+    S_CBRANCH_EXECZ %bb.3, implicit $exec
+  bb.2:
+    liveins: $sgpr6, $sgpr10_sgpr11
+    SI_SPILL_S32_SAVE killed $sgpr6, %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32
+    S_NOP 0
+    renamable $sgpr6 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32
+    %0:vgpr_32 = V_MOV_B32_e32 20, implicit $exec
+    S_BRANCH %bb.3
+  bb.3:
+    liveins: $sgpr10_sgpr11
+    $sgpr5 = V_READFIRSTLANE_B32 %0:vgpr_32, implicit $exec
+    S_STORE_DWORD_IMM $sgpr5, $sgpr10_sgpr11, 0, 0
+    SI_RETURN
+...