[llvm] [AMDGPU] Add a debug option `-amdgpu-snop-padding` for `GCNHazardRecognizer` (PR #146587)
Shilei Tian via llvm-commits
llvm-commits at lists.llvm.org
Wed Jul 2 05:07:02 PDT 2025
https://github.com/shiltian updated https://github.com/llvm/llvm-project/pull/146587
>From 975e492867430ebcf83d136eea9161f35ab88a6f Mon Sep 17 00:00:00 2001
From: Shilei Tian <i at tianshilei.me>
Date: Tue, 1 Jul 2025 14:16:17 -0400
Subject: [PATCH 1/3] [AMDGPU] Add a debug option `-amdgpu-snop-padding` for
`GCNHazardRecognizer`
This can help identify whether there are potential hazards.
Co-authored-by: Byrnes, Jeffrey <Jeffrey.Byrnes at amd.com>
---
.../lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 8 +-
.../CodeGen/AMDGPU/amdgpu-snop-padding.mir | 132 ++++++++++++++++++
2 files changed, 139 insertions(+), 1 deletion(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-snop-padding.mir
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index bc95d3f040e1d..f7f24c86948c4 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -44,6 +44,11 @@ static cl::opt<unsigned, false, MFMAPaddingRatioParser>
cl::desc("Fill a percentage of the latency between "
"neighboring MFMA with s_nops."));
+// This is intended for debugging purposes only.
+static cl::opt<unsigned>
+ NopPadding("amdgpu-snop-padding", cl::Hidden,
+ cl::desc("Insert a s_nop x between every isntruction"));
+
//===----------------------------------------------------------------------===//
// Hazard Recognizer Implementation
//===----------------------------------------------------------------------===//
@@ -300,7 +305,8 @@ unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
unsigned W = PreEmitNoopsCommon(MI);
fixHazards(MI);
CurrCycleInstr = nullptr;
- return W;
+ unsigned NopPad = NopPadding.getNumOccurrences() ? NopPadding : 0;
+ return std::max(W, NopPad);
}
unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-snop-padding.mir b/llvm/test/CodeGen/AMDGPU/amdgpu-snop-padding.mir
new file mode 100644
index 0000000000000..22c913496b734
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-snop-padding.mir
@@ -0,0 +1,132 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -amdgpu-snop-padding=8 -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=GCN8 %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -amdgpu-snop-padding=16 -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=GCN16 %s
+
+---
+name: test_snop_padding
+tracksRegLiveness: true
+frameInfo:
+ maxAlignment: 4
+stack:
+ - { id: 0, type: spill-slot, size: 4, alignment: 4, stack-id: sgpr-spill }
+machineFunctionInfo:
+ isEntryFunction: false
+ scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3'
+ stackPtrOffsetReg: '$sgpr32'
+ frameOffsetReg: '$sgpr33'
+ hasSpilledSGPRs: true
+body: |
+ ; GCN8-LABEL: name: test_snop_padding
+ ; GCN8: bb.0:
+ ; GCN8-NEXT: successors: %bb.1(0x80000000)
+ ; GCN8-NEXT: liveins: $sgpr6, $sgpr10_sgpr11
+ ; GCN8-NEXT: {{ $}}
+ ; GCN8-NEXT: S_NOP 7
+ ; GCN8-NEXT: S_BRANCH %bb.1
+ ; GCN8-NEXT: {{ $}}
+ ; GCN8-NEXT: bb.1:
+ ; GCN8-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000)
+ ; GCN8-NEXT: liveins: $sgpr6, $sgpr10_sgpr11
+ ; GCN8-NEXT: {{ $}}
+ ; GCN8-NEXT: S_NOP 7
+ ; GCN8-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 10, implicit $exec
+ ; GCN8-NEXT: S_NOP 7
+ ; GCN8-NEXT: S_CBRANCH_EXECZ %bb.3, implicit $exec
+ ; GCN8-NEXT: {{ $}}
+ ; GCN8-NEXT: bb.2:
+ ; GCN8-NEXT: successors: %bb.3(0x80000000)
+ ; GCN8-NEXT: liveins: $sgpr6, $sgpr10_sgpr11
+ ; GCN8-NEXT: {{ $}}
+ ; GCN8-NEXT: S_NOP 7
+ ; GCN8-NEXT: SI_SPILL_S32_SAVE killed $sgpr6, %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32
+ ; GCN8-NEXT: S_NOP 7
+ ; GCN8-NEXT: S_NOP 0
+ ; GCN8-NEXT: S_NOP 7
+ ; GCN8-NEXT: renamable $sgpr6 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32
+ ; GCN8-NEXT: S_NOP 7
+ ; GCN8-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 20, implicit $exec
+ ; GCN8-NEXT: S_NOP 7
+ ; GCN8-NEXT: S_BRANCH %bb.3
+ ; GCN8-NEXT: {{ $}}
+ ; GCN8-NEXT: bb.3:
+ ; GCN8-NEXT: liveins: $sgpr10_sgpr11
+ ; GCN8-NEXT: {{ $}}
+ ; GCN8-NEXT: S_NOP 7
+ ; GCN8-NEXT: $sgpr5 = V_READFIRSTLANE_B32 [[V_MOV_B32_e32_]], implicit $exec
+ ; GCN8-NEXT: S_NOP 7
+ ; GCN8-NEXT: S_STORE_DWORD_IMM $sgpr5, $sgpr10_sgpr11, 0, 0
+ ; GCN8-NEXT: S_NOP 7
+ ; GCN8-NEXT: SI_RETURN
+ ;
+ ; GCN16-LABEL: name: test_snop_padding
+ ; GCN16: bb.0:
+ ; GCN16-NEXT: successors: %bb.1(0x80000000)
+ ; GCN16-NEXT: liveins: $sgpr6, $sgpr10_sgpr11
+ ; GCN16-NEXT: {{ $}}
+ ; GCN16-NEXT: S_NOP 7
+ ; GCN16-NEXT: S_NOP 7
+ ; GCN16-NEXT: S_BRANCH %bb.1
+ ; GCN16-NEXT: {{ $}}
+ ; GCN16-NEXT: bb.1:
+ ; GCN16-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000)
+ ; GCN16-NEXT: liveins: $sgpr6, $sgpr10_sgpr11
+ ; GCN16-NEXT: {{ $}}
+ ; GCN16-NEXT: S_NOP 7
+ ; GCN16-NEXT: S_NOP 7
+ ; GCN16-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 10, implicit $exec
+ ; GCN16-NEXT: S_NOP 7
+ ; GCN16-NEXT: S_NOP 7
+ ; GCN16-NEXT: S_CBRANCH_EXECZ %bb.3, implicit $exec
+ ; GCN16-NEXT: {{ $}}
+ ; GCN16-NEXT: bb.2:
+ ; GCN16-NEXT: successors: %bb.3(0x80000000)
+ ; GCN16-NEXT: liveins: $sgpr6, $sgpr10_sgpr11
+ ; GCN16-NEXT: {{ $}}
+ ; GCN16-NEXT: S_NOP 7
+ ; GCN16-NEXT: S_NOP 7
+ ; GCN16-NEXT: SI_SPILL_S32_SAVE killed $sgpr6, %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32
+ ; GCN16-NEXT: S_NOP 7
+ ; GCN16-NEXT: S_NOP 7
+ ; GCN16-NEXT: S_NOP 0
+ ; GCN16-NEXT: S_NOP 7
+ ; GCN16-NEXT: S_NOP 7
+ ; GCN16-NEXT: renamable $sgpr6 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32
+ ; GCN16-NEXT: S_NOP 7
+ ; GCN16-NEXT: S_NOP 7
+ ; GCN16-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 20, implicit $exec
+ ; GCN16-NEXT: S_NOP 7
+ ; GCN16-NEXT: S_NOP 7
+ ; GCN16-NEXT: S_BRANCH %bb.3
+ ; GCN16-NEXT: {{ $}}
+ ; GCN16-NEXT: bb.3:
+ ; GCN16-NEXT: liveins: $sgpr10_sgpr11
+ ; GCN16-NEXT: {{ $}}
+ ; GCN16-NEXT: S_NOP 7
+ ; GCN16-NEXT: S_NOP 7
+ ; GCN16-NEXT: $sgpr5 = V_READFIRSTLANE_B32 [[V_MOV_B32_e32_]], implicit $exec
+ ; GCN16-NEXT: S_NOP 7
+ ; GCN16-NEXT: S_NOP 7
+ ; GCN16-NEXT: S_STORE_DWORD_IMM $sgpr5, $sgpr10_sgpr11, 0, 0
+ ; GCN16-NEXT: S_NOP 7
+ ; GCN16-NEXT: S_NOP 7
+ ; GCN16-NEXT: SI_RETURN
+ bb.0:
+ liveins: $sgpr6, $sgpr10_sgpr11
+ S_BRANCH %bb.1
+ bb.1:
+ liveins: $sgpr6, $sgpr10_sgpr11
+ %0:vgpr_32 = V_MOV_B32_e32 10, implicit $exec
+ S_CBRANCH_EXECZ %bb.3, implicit $exec
+ bb.2:
+ liveins: $sgpr6, $sgpr10_sgpr11
+ SI_SPILL_S32_SAVE killed $sgpr6, %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32
+ S_NOP 0
+ renamable $sgpr6 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32
+ %0:vgpr_32 = V_MOV_B32_e32 20, implicit $exec
+ S_BRANCH %bb.3
+ bb.3:
+ liveins: $sgpr10_sgpr11
+ $sgpr5 = V_READFIRSTLANE_B32 %0:vgpr_32, implicit $exec
+ S_STORE_DWORD_IMM $sgpr5, $sgpr10_sgpr11, 0, 0
+ SI_RETURN
+...
>From 18cd25113a3af4c02d6e9a8f9b5fde878e429052 Mon Sep 17 00:00:00 2001
From: Shilei Tian <i at tianshilei.me>
Date: Tue, 1 Jul 2025 23:43:01 -0400
Subject: [PATCH 2/3] resolve review comments
---
llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 7 +++----
1 file changed, 3 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index f7f24c86948c4..5dde658ed0a79 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -46,8 +46,8 @@ static cl::opt<unsigned, false, MFMAPaddingRatioParser>
// This is intended for debugging purposes only.
static cl::opt<unsigned>
- NopPadding("amdgpu-snop-padding", cl::Hidden,
- cl::desc("Insert a s_nop x between every isntruction"));
+ NopPadding("amdgpu-snop-padding", cl::init(0), cl::Hidden,
+ cl::desc("Insert a s_nop x between every instruction"));
//===----------------------------------------------------------------------===//
// Hazard Recognizer Implementation
@@ -305,8 +305,7 @@ unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
unsigned W = PreEmitNoopsCommon(MI);
fixHazards(MI);
CurrCycleInstr = nullptr;
- unsigned NopPad = NopPadding.getNumOccurrences() ? NopPadding : 0;
- return std::max(W, NopPad);
+ return std::max(W, NopPadding.getValue());
}
unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
>From 54bace4403cfb601b19a8be76e792390c1dc3b01 Mon Sep 17 00:00:00 2001
From: Shilei Tian <i at tianshilei.me>
Date: Wed, 2 Jul 2025 08:06:39 -0400
Subject: [PATCH 3/3] fix comments
---
llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 5dde658ed0a79..0976fccf78d86 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -47,7 +47,7 @@ static cl::opt<unsigned, false, MFMAPaddingRatioParser>
// This is intended for debugging purposes only.
static cl::opt<unsigned>
NopPadding("amdgpu-snop-padding", cl::init(0), cl::Hidden,
- cl::desc("Insert a s_nop x between every instruction"));
+ cl::desc("Insert a s_nop x before every instruction"));
//===----------------------------------------------------------------------===//
// Hazard Recognizer Implementation
More information about the llvm-commits
mailing list