[llvm] [AMDGPU] Add flag to force emit s_nop (PR #117839)
Jeffrey Byrnes via llvm-commits
llvm-commits at lists.llvm.org
Mon Dec 2 11:31:09 PST 2024
https://github.com/jrbyrnes updated https://github.com/llvm/llvm-project/pull/117839
>From 1b645cf4482c13a1fff78a5dded2df66ae1a8c84 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Tue, 26 Nov 2024 19:31:56 -0800
Subject: [PATCH 1/4] [AMDGPU] Add flag to force emit s_nop
Change-Id: I1bddb498d73e0138d14a8cb312082e8794da2e47
---
.../lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 15 ++-
.../AMDGPU/snop-padding-terminator.mir | 71 ++++++++++
llvm/test/CodeGen/AMDGPU/snop-padding.ll | 124 ++++++++++++++++++
3 files changed, 206 insertions(+), 4 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/snop-padding-terminator.mir
create mode 100644 llvm/test/CodeGen/AMDGPU/snop-padding.ll
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 44afccb0690d0d..870db75bc65b1b 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -24,8 +24,8 @@ using namespace llvm;
namespace {
-struct MFMAPaddingRatioParser : public cl::parser<unsigned> {
- MFMAPaddingRatioParser(cl::Option &O) : cl::parser<unsigned>(O) {}
+struct PaddingRatioParser : public cl::parser<unsigned> {
+ PaddingRatioParser(cl::Option &O) : cl::parser<unsigned>(O) {}
bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, unsigned &Value) {
if (Arg.getAsInteger(0, Value))
@@ -40,7 +40,7 @@ struct MFMAPaddingRatioParser : public cl::parser<unsigned> {
} // end anonymous namespace
-static cl::opt<unsigned, false, MFMAPaddingRatioParser>
+static cl::opt<unsigned, false, PaddingRatioParser>
MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(0), cl::Hidden,
cl::desc("Fill a percentage of the latency between "
"neighboring MFMA with s_nops."));
@@ -49,6 +49,11 @@ static cl::opt<unsigned> MaxExhaustiveHazardSearch(
"amdgpu-max-exhaustive-hazard-search", cl::init(128), cl::Hidden,
cl::desc("Maximum function size for exhausive hazard search"));
+static cl::opt<unsigned, false, PaddingRatioParser>
+ NopPadding("amdgpu-snop-padding", cl::Hidden,
+ cl::desc("Insert a s_nop between every instruction for a given "
+ "number of cycles."));
+
//===----------------------------------------------------------------------===//
// Hazard Recognizer Implementation
//===----------------------------------------------------------------------===//
@@ -325,7 +330,9 @@ unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
unsigned W = PreEmitNoopsCommon(MI);
fixHazards(MI);
CurrCycleInstr = nullptr;
- return W;
+ unsigned NopPad =
+ NopPadding.getNumOccurrences() && !MI->isTerminator() ? NopPadding : 0;
+ return std::max(W, NopPad);
}
unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
diff --git a/llvm/test/CodeGen/AMDGPU/snop-padding-terminator.mir b/llvm/test/CodeGen/AMDGPU/snop-padding-terminator.mir
new file mode 100644
index 00000000000000..6960ac5b0b4bf6
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/snop-padding-terminator.mir
@@ -0,0 +1,71 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 --amdgpu-snop-padding=0 --run-pass=post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 --amdgpu-snop-padding=1 --run-pass=post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN-NOP1 %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 --amdgpu-snop-padding=20 --run-pass=post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN-NOP20 %s
+
+---
+name: waitcnt-debug-non-first-terminators
+liveins:
+machineFunctionInfo:
+ isEntryFunction: true
+body: |
+ ; GCN-LABEL: name: waitcnt-debug-non-first-terminators
+ ; GCN: bb.0:
+ ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: S_CBRANCH_SCC1 %bb.1, implicit $scc
+ ; GCN-NEXT: S_BRANCH %bb.2, implicit $scc
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.1:
+ ; GCN-NEXT: successors: %bb.2(0x80000000)
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: S_NOP 0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.2:
+ ; GCN-NEXT: S_NOP 0
+ ;
+ ; GCN-NOP1-LABEL: name: waitcnt-debug-non-first-terminators
+ ; GCN-NOP1: bb.0:
+ ; GCN-NOP1-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; GCN-NOP1-NEXT: {{ $}}
+ ; GCN-NOP1-NEXT: S_CBRANCH_SCC1 %bb.1, implicit $scc
+ ; GCN-NOP1-NEXT: S_BRANCH %bb.2, implicit $scc
+ ; GCN-NOP1-NEXT: {{ $}}
+ ; GCN-NOP1-NEXT: bb.1:
+ ; GCN-NOP1-NEXT: successors: %bb.2(0x80000000)
+ ; GCN-NOP1-NEXT: {{ $}}
+ ; GCN-NOP1-NEXT: S_NOP 0
+ ; GCN-NOP1-NEXT: S_NOP 0
+ ; GCN-NOP1-NEXT: {{ $}}
+ ; GCN-NOP1-NEXT: bb.2:
+ ; GCN-NOP1-NEXT: S_NOP 0
+ ; GCN-NOP1-NEXT: S_NOP 0
+ ;
+ ; GCN-NOP20-LABEL: name: waitcnt-debug-non-first-terminators
+ ; GCN-NOP20: bb.0:
+ ; GCN-NOP20-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; GCN-NOP20-NEXT: {{ $}}
+ ; GCN-NOP20-NEXT: S_CBRANCH_SCC1 %bb.1, implicit $scc
+ ; GCN-NOP20-NEXT: S_BRANCH %bb.2, implicit $scc
+ ; GCN-NOP20-NEXT: {{ $}}
+ ; GCN-NOP20-NEXT: bb.1:
+ ; GCN-NOP20-NEXT: successors: %bb.2(0x80000000)
+ ; GCN-NOP20-NEXT: {{ $}}
+ ; GCN-NOP20-NEXT: S_NOP 7
+ ; GCN-NOP20-NEXT: S_NOP 7
+ ; GCN-NOP20-NEXT: S_NOP 3
+ ; GCN-NOP20-NEXT: S_NOP 0
+ ; GCN-NOP20-NEXT: {{ $}}
+ ; GCN-NOP20-NEXT: bb.2:
+ ; GCN-NOP20-NEXT: S_NOP 7
+ ; GCN-NOP20-NEXT: S_NOP 7
+ ; GCN-NOP20-NEXT: S_NOP 3
+ ; GCN-NOP20-NEXT: S_NOP 0
+ bb.0:
+ S_CBRANCH_SCC1 %bb.1, implicit $scc
+ S_BRANCH %bb.2, implicit $scc
+ bb.1:
+ S_NOP 0
+ bb.2:
+ S_NOP 0
+...
diff --git a/llvm/test/CodeGen/AMDGPU/snop-padding.ll b/llvm/test/CodeGen/AMDGPU/snop-padding.ll
new file mode 100644
index 00000000000000..254507a54a2f75
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/snop-padding.ll
@@ -0,0 +1,124 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx908 < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s
+; RUN: llc -mtriple=amdgcn --amdgpu-snop-padding=0 -mcpu=gfx908 < %s | FileCheck -check-prefixes=GCN %s
+; RUN: llc -mtriple=amdgcn --amdgpu-snop-padding=1 -mcpu=gfx908 < %s | FileCheck -check-prefixes=GCN-NOP1 %s
+; RUN: llc -mtriple=amdgcn --amdgpu-snop-padding=20 -mcpu=gfx908 < %s | FileCheck -check-prefixes=GCN-NOP20 %s
+
+
+define amdgpu_kernel void @fadd_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+; GCN-LABEL: fadd_v4f32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GCN-NEXT: v_mov_b32_e32 v4, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s7
+; GCN-NEXT: v_mov_b32_e32 v1, s6
+; GCN-NEXT: v_mov_b32_e32 v5, s5
+; GCN-NEXT: v_mov_b32_e32 v6, s4
+; GCN-NEXT: v_add_f32_e32 v3, s3, v0
+; GCN-NEXT: v_add_f32_e32 v2, s2, v1
+; GCN-NEXT: v_add_f32_e32 v1, s1, v5
+; GCN-NEXT: v_add_f32_e32 v0, s0, v6
+; GCN-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9]
+; GCN-NEXT: s_endpgm
+;
+; GCN-NOP1-LABEL: fadd_v4f32:
+; GCN-NOP1: ; %bb.0:
+; GCN-NOP1-NEXT: s_nop 0
+; GCN-NOP1-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GCN-NOP1-NEXT: s_nop 0
+; GCN-NOP1-NEXT: v_mov_b32_e32 v4, 0
+; GCN-NOP1-NEXT: s_nop 0
+; GCN-NOP1-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NOP1-NEXT: s_nop 0
+; GCN-NOP1-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
+; GCN-NOP1-NEXT: s_nop 0
+; GCN-NOP1-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NOP1-NEXT: s_nop 0
+; GCN-NOP1-NEXT: v_mov_b32_e32 v0, s7
+; GCN-NOP1-NEXT: s_nop 0
+; GCN-NOP1-NEXT: v_mov_b32_e32 v1, s6
+; GCN-NOP1-NEXT: s_nop 0
+; GCN-NOP1-NEXT: v_mov_b32_e32 v5, s5
+; GCN-NOP1-NEXT: s_nop 0
+; GCN-NOP1-NEXT: v_mov_b32_e32 v6, s4
+; GCN-NOP1-NEXT: s_nop 0
+; GCN-NOP1-NEXT: v_add_f32_e32 v3, s3, v0
+; GCN-NOP1-NEXT: s_nop 0
+; GCN-NOP1-NEXT: v_add_f32_e32 v2, s2, v1
+; GCN-NOP1-NEXT: s_nop 0
+; GCN-NOP1-NEXT: v_add_f32_e32 v1, s1, v5
+; GCN-NOP1-NEXT: s_nop 0
+; GCN-NOP1-NEXT: v_add_f32_e32 v0, s0, v6
+; GCN-NOP1-NEXT: s_nop 0
+; GCN-NOP1-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9]
+; GCN-NOP1-NEXT: s_endpgm
+;
+; GCN-NOP20-LABEL: fadd_v4f32:
+; GCN-NOP20: ; %bb.0:
+; GCN-NOP20-NEXT: s_nop 7
+; GCN-NOP20-NEXT: s_nop 7
+; GCN-NOP20-NEXT: s_nop 3
+; GCN-NOP20-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GCN-NOP20-NEXT: s_nop 7
+; GCN-NOP20-NEXT: s_nop 7
+; GCN-NOP20-NEXT: s_nop 3
+; GCN-NOP20-NEXT: v_mov_b32_e32 v4, 0
+; GCN-NOP20-NEXT: s_nop 7
+; GCN-NOP20-NEXT: s_nop 7
+; GCN-NOP20-NEXT: s_nop 3
+; GCN-NOP20-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NOP20-NEXT: s_nop 7
+; GCN-NOP20-NEXT: s_nop 7
+; GCN-NOP20-NEXT: s_nop 3
+; GCN-NOP20-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
+; GCN-NOP20-NEXT: s_nop 7
+; GCN-NOP20-NEXT: s_nop 7
+; GCN-NOP20-NEXT: s_nop 3
+; GCN-NOP20-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NOP20-NEXT: s_nop 7
+; GCN-NOP20-NEXT: s_nop 7
+; GCN-NOP20-NEXT: s_nop 3
+; GCN-NOP20-NEXT: v_mov_b32_e32 v0, s7
+; GCN-NOP20-NEXT: s_nop 7
+; GCN-NOP20-NEXT: s_nop 7
+; GCN-NOP20-NEXT: s_nop 3
+; GCN-NOP20-NEXT: v_mov_b32_e32 v1, s6
+; GCN-NOP20-NEXT: s_nop 7
+; GCN-NOP20-NEXT: s_nop 7
+; GCN-NOP20-NEXT: s_nop 3
+; GCN-NOP20-NEXT: v_mov_b32_e32 v5, s5
+; GCN-NOP20-NEXT: s_nop 7
+; GCN-NOP20-NEXT: s_nop 7
+; GCN-NOP20-NEXT: s_nop 3
+; GCN-NOP20-NEXT: v_mov_b32_e32 v6, s4
+; GCN-NOP20-NEXT: s_nop 7
+; GCN-NOP20-NEXT: s_nop 7
+; GCN-NOP20-NEXT: s_nop 3
+; GCN-NOP20-NEXT: v_add_f32_e32 v3, s3, v0
+; GCN-NOP20-NEXT: s_nop 7
+; GCN-NOP20-NEXT: s_nop 7
+; GCN-NOP20-NEXT: s_nop 3
+; GCN-NOP20-NEXT: v_add_f32_e32 v2, s2, v1
+; GCN-NOP20-NEXT: s_nop 7
+; GCN-NOP20-NEXT: s_nop 7
+; GCN-NOP20-NEXT: s_nop 3
+; GCN-NOP20-NEXT: v_add_f32_e32 v1, s1, v5
+; GCN-NOP20-NEXT: s_nop 7
+; GCN-NOP20-NEXT: s_nop 7
+; GCN-NOP20-NEXT: s_nop 3
+; GCN-NOP20-NEXT: v_add_f32_e32 v0, s0, v6
+; GCN-NOP20-NEXT: s_nop 7
+; GCN-NOP20-NEXT: s_nop 7
+; GCN-NOP20-NEXT: s_nop 3
+; GCN-NOP20-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9]
+; GCN-NOP20-NEXT: s_endpgm
+ %b_ptr = getelementptr <4 x float>, ptr addrspace(1) %in, i32 1
+ %a = load <4 x float>, ptr addrspace(1) %in, align 16
+ %b = load <4 x float>, ptr addrspace(1) %b_ptr, align 16
+ %result = fadd <4 x float> %a, %b
+ store <4 x float> %result, ptr addrspace(1) %out, align 16
+ ret void
+}
>From 844decba4cb04260ca6283508f7a773effb3fb2c Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Tue, 26 Nov 2024 19:41:26 -0800
Subject: [PATCH 2/4] Fix comment
Change-Id: Ia83c67144b7c23fdec2912c72dd30956eba17f03
---
llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 870db75bc65b1b..d7466aeada43e7 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -43,7 +43,7 @@ struct PaddingRatioParser : public cl::parser<unsigned> {
static cl::opt<unsigned, false, PaddingRatioParser>
MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(0), cl::Hidden,
cl::desc("Fill a percentage of the latency between "
- "neighboring MFMA with s_nops."));
+ "neighboring instructions with s_nops."));
static cl::opt<unsigned> MaxExhaustiveHazardSearch(
"amdgpu-max-exhaustive-hazard-search", cl::init(128), cl::Hidden,
>From 3fc160b4f56410b5859351f055554095da1d21d5 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Wed, 27 Nov 2024 14:22:31 -0800
Subject: [PATCH 3/4] Review comments
Change-Id: Icdd0c8777ce360a8fcb4dc599f95263a59b8fc4f
---
.../lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 5 +-
.../AMDGPU/snop-padding-terminator.mir | 71 -------
llvm/test/CodeGen/AMDGPU/snop-padding.ll | 124 ------------
llvm/test/CodeGen/AMDGPU/snop-padding.mir | 185 ++++++++++++++++++
4 files changed, 189 insertions(+), 196 deletions(-)
delete mode 100644 llvm/test/CodeGen/AMDGPU/snop-padding-terminator.mir
delete mode 100644 llvm/test/CodeGen/AMDGPU/snop-padding.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/snop-padding.mir
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index d7466aeada43e7..e5d3b09165b6cb 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -51,7 +51,7 @@ static cl::opt<unsigned> MaxExhaustiveHazardSearch(
static cl::opt<unsigned, false, PaddingRatioParser>
NopPadding("amdgpu-snop-padding", cl::Hidden,
- cl::desc("Insert a s_nop between every instruction for a given "
+ cl::desc("Insert a s_nop before every instruction for a given "
"number of cycles."));
//===----------------------------------------------------------------------===//
@@ -291,6 +291,9 @@ void GCNHazardRecognizer::processBundle() {
for (; MI != E && MI->isInsideBundle(); ++MI) {
CurrCycleInstr = &*MI;
unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);
+ unsigned NopPad =
+ NopPadding.getNumOccurrences() && !MI->isTerminator() ? NopPadding : 0;
+ WaitStates = std::max(WaitStates, NopPad);
if (IsHazardRecognizerMode) {
fixHazards(CurrCycleInstr);
diff --git a/llvm/test/CodeGen/AMDGPU/snop-padding-terminator.mir b/llvm/test/CodeGen/AMDGPU/snop-padding-terminator.mir
deleted file mode 100644
index 6960ac5b0b4bf6..00000000000000
--- a/llvm/test/CodeGen/AMDGPU/snop-padding-terminator.mir
+++ /dev/null
@@ -1,71 +0,0 @@
-# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 --amdgpu-snop-padding=0 --run-pass=post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN %s
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 --amdgpu-snop-padding=1 --run-pass=post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN-NOP1 %s
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 --amdgpu-snop-padding=20 --run-pass=post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN-NOP20 %s
-
----
-name: waitcnt-debug-non-first-terminators
-liveins:
-machineFunctionInfo:
- isEntryFunction: true
-body: |
- ; GCN-LABEL: name: waitcnt-debug-non-first-terminators
- ; GCN: bb.0:
- ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
- ; GCN-NEXT: {{ $}}
- ; GCN-NEXT: S_CBRANCH_SCC1 %bb.1, implicit $scc
- ; GCN-NEXT: S_BRANCH %bb.2, implicit $scc
- ; GCN-NEXT: {{ $}}
- ; GCN-NEXT: bb.1:
- ; GCN-NEXT: successors: %bb.2(0x80000000)
- ; GCN-NEXT: {{ $}}
- ; GCN-NEXT: S_NOP 0
- ; GCN-NEXT: {{ $}}
- ; GCN-NEXT: bb.2:
- ; GCN-NEXT: S_NOP 0
- ;
- ; GCN-NOP1-LABEL: name: waitcnt-debug-non-first-terminators
- ; GCN-NOP1: bb.0:
- ; GCN-NOP1-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
- ; GCN-NOP1-NEXT: {{ $}}
- ; GCN-NOP1-NEXT: S_CBRANCH_SCC1 %bb.1, implicit $scc
- ; GCN-NOP1-NEXT: S_BRANCH %bb.2, implicit $scc
- ; GCN-NOP1-NEXT: {{ $}}
- ; GCN-NOP1-NEXT: bb.1:
- ; GCN-NOP1-NEXT: successors: %bb.2(0x80000000)
- ; GCN-NOP1-NEXT: {{ $}}
- ; GCN-NOP1-NEXT: S_NOP 0
- ; GCN-NOP1-NEXT: S_NOP 0
- ; GCN-NOP1-NEXT: {{ $}}
- ; GCN-NOP1-NEXT: bb.2:
- ; GCN-NOP1-NEXT: S_NOP 0
- ; GCN-NOP1-NEXT: S_NOP 0
- ;
- ; GCN-NOP20-LABEL: name: waitcnt-debug-non-first-terminators
- ; GCN-NOP20: bb.0:
- ; GCN-NOP20-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
- ; GCN-NOP20-NEXT: {{ $}}
- ; GCN-NOP20-NEXT: S_CBRANCH_SCC1 %bb.1, implicit $scc
- ; GCN-NOP20-NEXT: S_BRANCH %bb.2, implicit $scc
- ; GCN-NOP20-NEXT: {{ $}}
- ; GCN-NOP20-NEXT: bb.1:
- ; GCN-NOP20-NEXT: successors: %bb.2(0x80000000)
- ; GCN-NOP20-NEXT: {{ $}}
- ; GCN-NOP20-NEXT: S_NOP 7
- ; GCN-NOP20-NEXT: S_NOP 7
- ; GCN-NOP20-NEXT: S_NOP 3
- ; GCN-NOP20-NEXT: S_NOP 0
- ; GCN-NOP20-NEXT: {{ $}}
- ; GCN-NOP20-NEXT: bb.2:
- ; GCN-NOP20-NEXT: S_NOP 7
- ; GCN-NOP20-NEXT: S_NOP 7
- ; GCN-NOP20-NEXT: S_NOP 3
- ; GCN-NOP20-NEXT: S_NOP 0
- bb.0:
- S_CBRANCH_SCC1 %bb.1, implicit $scc
- S_BRANCH %bb.2, implicit $scc
- bb.1:
- S_NOP 0
- bb.2:
- S_NOP 0
-...
diff --git a/llvm/test/CodeGen/AMDGPU/snop-padding.ll b/llvm/test/CodeGen/AMDGPU/snop-padding.ll
deleted file mode 100644
index 254507a54a2f75..00000000000000
--- a/llvm/test/CodeGen/AMDGPU/snop-padding.ll
+++ /dev/null
@@ -1,124 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn -mcpu=gfx908 < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s
-; RUN: llc -mtriple=amdgcn --amdgpu-snop-padding=0 -mcpu=gfx908 < %s | FileCheck -check-prefixes=GCN %s
-; RUN: llc -mtriple=amdgcn --amdgpu-snop-padding=1 -mcpu=gfx908 < %s | FileCheck -check-prefixes=GCN-NOP1 %s
-; RUN: llc -mtriple=amdgcn --amdgpu-snop-padding=20 -mcpu=gfx908 < %s | FileCheck -check-prefixes=GCN-NOP20 %s
-
-
-define amdgpu_kernel void @fadd_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
-; GCN-LABEL: fadd_v4f32:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
-; GCN-NEXT: v_mov_b32_e32 v4, 0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s7
-; GCN-NEXT: v_mov_b32_e32 v1, s6
-; GCN-NEXT: v_mov_b32_e32 v5, s5
-; GCN-NEXT: v_mov_b32_e32 v6, s4
-; GCN-NEXT: v_add_f32_e32 v3, s3, v0
-; GCN-NEXT: v_add_f32_e32 v2, s2, v1
-; GCN-NEXT: v_add_f32_e32 v1, s1, v5
-; GCN-NEXT: v_add_f32_e32 v0, s0, v6
-; GCN-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9]
-; GCN-NEXT: s_endpgm
-;
-; GCN-NOP1-LABEL: fadd_v4f32:
-; GCN-NOP1: ; %bb.0:
-; GCN-NOP1-NEXT: s_nop 0
-; GCN-NOP1-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
-; GCN-NOP1-NEXT: s_nop 0
-; GCN-NOP1-NEXT: v_mov_b32_e32 v4, 0
-; GCN-NOP1-NEXT: s_nop 0
-; GCN-NOP1-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOP1-NEXT: s_nop 0
-; GCN-NOP1-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
-; GCN-NOP1-NEXT: s_nop 0
-; GCN-NOP1-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOP1-NEXT: s_nop 0
-; GCN-NOP1-NEXT: v_mov_b32_e32 v0, s7
-; GCN-NOP1-NEXT: s_nop 0
-; GCN-NOP1-NEXT: v_mov_b32_e32 v1, s6
-; GCN-NOP1-NEXT: s_nop 0
-; GCN-NOP1-NEXT: v_mov_b32_e32 v5, s5
-; GCN-NOP1-NEXT: s_nop 0
-; GCN-NOP1-NEXT: v_mov_b32_e32 v6, s4
-; GCN-NOP1-NEXT: s_nop 0
-; GCN-NOP1-NEXT: v_add_f32_e32 v3, s3, v0
-; GCN-NOP1-NEXT: s_nop 0
-; GCN-NOP1-NEXT: v_add_f32_e32 v2, s2, v1
-; GCN-NOP1-NEXT: s_nop 0
-; GCN-NOP1-NEXT: v_add_f32_e32 v1, s1, v5
-; GCN-NOP1-NEXT: s_nop 0
-; GCN-NOP1-NEXT: v_add_f32_e32 v0, s0, v6
-; GCN-NOP1-NEXT: s_nop 0
-; GCN-NOP1-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9]
-; GCN-NOP1-NEXT: s_endpgm
-;
-; GCN-NOP20-LABEL: fadd_v4f32:
-; GCN-NOP20: ; %bb.0:
-; GCN-NOP20-NEXT: s_nop 7
-; GCN-NOP20-NEXT: s_nop 7
-; GCN-NOP20-NEXT: s_nop 3
-; GCN-NOP20-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
-; GCN-NOP20-NEXT: s_nop 7
-; GCN-NOP20-NEXT: s_nop 7
-; GCN-NOP20-NEXT: s_nop 3
-; GCN-NOP20-NEXT: v_mov_b32_e32 v4, 0
-; GCN-NOP20-NEXT: s_nop 7
-; GCN-NOP20-NEXT: s_nop 7
-; GCN-NOP20-NEXT: s_nop 3
-; GCN-NOP20-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOP20-NEXT: s_nop 7
-; GCN-NOP20-NEXT: s_nop 7
-; GCN-NOP20-NEXT: s_nop 3
-; GCN-NOP20-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
-; GCN-NOP20-NEXT: s_nop 7
-; GCN-NOP20-NEXT: s_nop 7
-; GCN-NOP20-NEXT: s_nop 3
-; GCN-NOP20-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOP20-NEXT: s_nop 7
-; GCN-NOP20-NEXT: s_nop 7
-; GCN-NOP20-NEXT: s_nop 3
-; GCN-NOP20-NEXT: v_mov_b32_e32 v0, s7
-; GCN-NOP20-NEXT: s_nop 7
-; GCN-NOP20-NEXT: s_nop 7
-; GCN-NOP20-NEXT: s_nop 3
-; GCN-NOP20-NEXT: v_mov_b32_e32 v1, s6
-; GCN-NOP20-NEXT: s_nop 7
-; GCN-NOP20-NEXT: s_nop 7
-; GCN-NOP20-NEXT: s_nop 3
-; GCN-NOP20-NEXT: v_mov_b32_e32 v5, s5
-; GCN-NOP20-NEXT: s_nop 7
-; GCN-NOP20-NEXT: s_nop 7
-; GCN-NOP20-NEXT: s_nop 3
-; GCN-NOP20-NEXT: v_mov_b32_e32 v6, s4
-; GCN-NOP20-NEXT: s_nop 7
-; GCN-NOP20-NEXT: s_nop 7
-; GCN-NOP20-NEXT: s_nop 3
-; GCN-NOP20-NEXT: v_add_f32_e32 v3, s3, v0
-; GCN-NOP20-NEXT: s_nop 7
-; GCN-NOP20-NEXT: s_nop 7
-; GCN-NOP20-NEXT: s_nop 3
-; GCN-NOP20-NEXT: v_add_f32_e32 v2, s2, v1
-; GCN-NOP20-NEXT: s_nop 7
-; GCN-NOP20-NEXT: s_nop 7
-; GCN-NOP20-NEXT: s_nop 3
-; GCN-NOP20-NEXT: v_add_f32_e32 v1, s1, v5
-; GCN-NOP20-NEXT: s_nop 7
-; GCN-NOP20-NEXT: s_nop 7
-; GCN-NOP20-NEXT: s_nop 3
-; GCN-NOP20-NEXT: v_add_f32_e32 v0, s0, v6
-; GCN-NOP20-NEXT: s_nop 7
-; GCN-NOP20-NEXT: s_nop 7
-; GCN-NOP20-NEXT: s_nop 3
-; GCN-NOP20-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9]
-; GCN-NOP20-NEXT: s_endpgm
- %b_ptr = getelementptr <4 x float>, ptr addrspace(1) %in, i32 1
- %a = load <4 x float>, ptr addrspace(1) %in, align 16
- %b = load <4 x float>, ptr addrspace(1) %b_ptr, align 16
- %result = fadd <4 x float> %a, %b
- store <4 x float> %result, ptr addrspace(1) %out, align 16
- ret void
-}
diff --git a/llvm/test/CodeGen/AMDGPU/snop-padding.mir b/llvm/test/CodeGen/AMDGPU/snop-padding.mir
new file mode 100644
index 00000000000000..d33d59b45c470b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/snop-padding.mir
@@ -0,0 +1,185 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 --run-pass=post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 --amdgpu-snop-padding=0 --run-pass=post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 --amdgpu-snop-padding=1 --run-pass=post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN-NOP1 %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 --amdgpu-snop-padding=20 --run-pass=post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN-NOP20 %s
+
+# do not insert s_nop between terminators
+
+---
+name: multiple-terminators
+body: |
+ ; GCN-LABEL: name: multiple-terminators
+ ; GCN: bb.0:
+ ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; GCN-NEXT: liveins: $sgpr0, $sgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: S_CBRANCH_SCC1 %bb.1, implicit $scc
+ ; GCN-NEXT: S_BRANCH %bb.2, implicit $scc
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.1:
+ ; GCN-NEXT: successors: %bb.2(0x80000000)
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 4, implicit-def $scc
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.2:
+ ; GCN-NEXT: $sgpr1 = S_ADD_U32 $sgpr1, 2, implicit-def $scc
+ ;
+ ; GCN-NOP1-LABEL: name: multiple-terminators
+ ; GCN-NOP1: bb.0:
+ ; GCN-NOP1-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; GCN-NOP1-NEXT: liveins: $sgpr0, $sgpr1
+ ; GCN-NOP1-NEXT: {{ $}}
+ ; GCN-NOP1-NEXT: S_CBRANCH_SCC1 %bb.1, implicit $scc
+ ; GCN-NOP1-NEXT: S_BRANCH %bb.2, implicit $scc
+ ; GCN-NOP1-NEXT: {{ $}}
+ ; GCN-NOP1-NEXT: bb.1:
+ ; GCN-NOP1-NEXT: successors: %bb.2(0x80000000)
+ ; GCN-NOP1-NEXT: {{ $}}
+ ; GCN-NOP1-NEXT: S_NOP 0
+ ; GCN-NOP1-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 4, implicit-def $scc
+ ; GCN-NOP1-NEXT: {{ $}}
+ ; GCN-NOP1-NEXT: bb.2:
+ ; GCN-NOP1-NEXT: S_NOP 0
+ ; GCN-NOP1-NEXT: $sgpr1 = S_ADD_U32 $sgpr1, 2, implicit-def $scc
+ ;
+ ; GCN-NOP20-LABEL: name: multiple-terminators
+ ; GCN-NOP20: bb.0:
+ ; GCN-NOP20-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; GCN-NOP20-NEXT: liveins: $sgpr0, $sgpr1
+ ; GCN-NOP20-NEXT: {{ $}}
+ ; GCN-NOP20-NEXT: S_CBRANCH_SCC1 %bb.1, implicit $scc
+ ; GCN-NOP20-NEXT: S_BRANCH %bb.2, implicit $scc
+ ; GCN-NOP20-NEXT: {{ $}}
+ ; GCN-NOP20-NEXT: bb.1:
+ ; GCN-NOP20-NEXT: successors: %bb.2(0x80000000)
+ ; GCN-NOP20-NEXT: {{ $}}
+ ; GCN-NOP20-NEXT: S_NOP 7
+ ; GCN-NOP20-NEXT: S_NOP 7
+ ; GCN-NOP20-NEXT: S_NOP 3
+ ; GCN-NOP20-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 4, implicit-def $scc
+ ; GCN-NOP20-NEXT: {{ $}}
+ ; GCN-NOP20-NEXT: bb.2:
+ ; GCN-NOP20-NEXT: S_NOP 7
+ ; GCN-NOP20-NEXT: S_NOP 7
+ ; GCN-NOP20-NEXT: S_NOP 3
+ ; GCN-NOP20-NEXT: $sgpr1 = S_ADD_U32 $sgpr1, 2, implicit-def $scc
+ bb.0:
+ liveins: $sgpr0, $sgpr1
+
+ S_CBRANCH_SCC1 %bb.1, implicit $scc
+ S_BRANCH %bb.2, implicit $scc
+ bb.1:
+ $sgpr0 = S_ADD_U32 $sgpr0, 4, implicit-def $scc
+ bb.2:
+ $sgpr1 = S_ADD_U32 $sgpr1, 2, implicit-def $scc
+...
+
+# insert s_nop inside bundles
+
+---
+name: bundle
+body: |
+ bb.0:
+ ; GCN-LABEL: name: bundle
+ ; GCN: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
+ ; GCN-NEXT: BUNDLE implicit-def $sgpr0_sgpr1 {
+ ; GCN-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
+ ; GCN-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 4, implicit-def $scc
+ ; GCN-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 2, implicit-def $scc, implicit $scc
+ ; GCN-NEXT: }
+ ; GCN-NEXT: S_ENDPGM 0
+ ;
+ ; GCN-NOP1-LABEL: name: bundle
+ ; GCN-NOP1: S_NOP 0
+ ; GCN-NOP1-NEXT: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
+ ; GCN-NOP1-NEXT: S_NOP 0
+ ; GCN-NOP1-NEXT: BUNDLE implicit-def $sgpr0_sgpr1 {
+ ; GCN-NOP1-NEXT: S_NOP 0
+ ; GCN-NOP1-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
+ ; GCN-NOP1-NEXT: S_NOP 0
+ ; GCN-NOP1-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 4, implicit-def $scc
+ ; GCN-NOP1-NEXT: S_NOP 0
+ ; GCN-NOP1-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 2, implicit-def $scc, implicit $scc
+ ; GCN-NOP1-NEXT: }
+ ; GCN-NOP1-NEXT: S_ENDPGM 0
+ ;
+ ; GCN-NOP20-LABEL: name: bundle
+ ; GCN-NOP20: S_NOP 7
+ ; GCN-NOP20-NEXT: S_NOP 7
+ ; GCN-NOP20-NEXT: S_NOP 3
+ ; GCN-NOP20-NEXT: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
+ ; GCN-NOP20-NEXT: S_NOP 7
+ ; GCN-NOP20-NEXT: S_NOP 7
+ ; GCN-NOP20-NEXT: S_NOP 3
+ ; GCN-NOP20-NEXT: BUNDLE implicit-def $sgpr0_sgpr1 {
+ ; GCN-NOP20-NEXT: S_NOP 7
+ ; GCN-NOP20-NEXT: S_NOP 7
+ ; GCN-NOP20-NEXT: S_NOP 3
+ ; GCN-NOP20-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
+ ; GCN-NOP20-NEXT: S_NOP 7
+ ; GCN-NOP20-NEXT: S_NOP 7
+ ; GCN-NOP20-NEXT: S_NOP 3
+ ; GCN-NOP20-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 4, implicit-def $scc
+ ; GCN-NOP20-NEXT: S_NOP 7
+ ; GCN-NOP20-NEXT: S_NOP 7
+ ; GCN-NOP20-NEXT: S_NOP 3
+ ; GCN-NOP20-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 2, implicit-def $scc, implicit $scc
+ ; GCN-NOP20-NEXT: }
+ ; GCN-NOP20-NEXT: S_ENDPGM 0
+ $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
+ BUNDLE implicit-def $sgpr0_sgpr1 {
+ $sgpr0_sgpr1 = S_GETPC_B64
+ $sgpr0 = S_ADD_U32 $sgpr0, 4, implicit-def $scc
+ $sgpr1 = S_ADDC_U32 $sgpr1, 2, implicit-def $scc, implicit $scc
+ }
+ S_ENDPGM 0
+...
+
+
+---
+name: standard
+body: |
+ bb.0:
+ ; GCN-LABEL: name: standard
+ ; GCN: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
+ ; GCN-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
+ ; GCN-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 4, implicit-def $scc
+ ; GCN-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 2, implicit-def $scc, implicit $scc
+ ; GCN-NEXT: S_ENDPGM 0
+ ;
+ ; GCN-NOP1-LABEL: name: standard
+ ; GCN-NOP1: S_NOP 0
+ ; GCN-NOP1-NEXT: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
+ ; GCN-NOP1-NEXT: S_NOP 0
+ ; GCN-NOP1-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
+ ; GCN-NOP1-NEXT: S_NOP 0
+ ; GCN-NOP1-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 4, implicit-def $scc
+ ; GCN-NOP1-NEXT: S_NOP 0
+ ; GCN-NOP1-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 2, implicit-def $scc, implicit $scc
+ ; GCN-NOP1-NEXT: S_ENDPGM 0
+ ;
+ ; GCN-NOP20-LABEL: name: standard
+ ; GCN-NOP20: S_NOP 7
+ ; GCN-NOP20-NEXT: S_NOP 7
+ ; GCN-NOP20-NEXT: S_NOP 3
+ ; GCN-NOP20-NEXT: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
+ ; GCN-NOP20-NEXT: S_NOP 7
+ ; GCN-NOP20-NEXT: S_NOP 7
+ ; GCN-NOP20-NEXT: S_NOP 3
+ ; GCN-NOP20-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
+ ; GCN-NOP20-NEXT: S_NOP 7
+ ; GCN-NOP20-NEXT: S_NOP 7
+ ; GCN-NOP20-NEXT: S_NOP 3
+ ; GCN-NOP20-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 4, implicit-def $scc
+ ; GCN-NOP20-NEXT: S_NOP 7
+ ; GCN-NOP20-NEXT: S_NOP 7
+ ; GCN-NOP20-NEXT: S_NOP 3
+ ; GCN-NOP20-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 2, implicit-def $scc, implicit $scc
+ ; GCN-NOP20-NEXT: S_ENDPGM 0
+ $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
+ $sgpr0_sgpr1 = S_GETPC_B64
+ $sgpr0 = S_ADD_U32 $sgpr0, 4, implicit-def $scc
+ $sgpr1 = S_ADDC_U32 $sgpr1, 2, implicit-def $scc, implicit $scc
+ S_ENDPGM 0
+...
>From 241abc33a43dd8dea3952003adab91f266672f3b Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Mon, 2 Dec 2024 11:17:34 -0800
Subject: [PATCH 4/4] Fix bundle handling
Change-Id: I7de1185831662e8e8b8b5612c52f703a05a62d88
---
.../lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 11 +-
llvm/test/CodeGen/AMDGPU/snop-padding.mir | 127 ++++++++++--------
2 files changed, 74 insertions(+), 64 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index e5d3b09165b6cb..a6b8c4da47803b 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -49,10 +49,10 @@ static cl::opt<unsigned> MaxExhaustiveHazardSearch(
"amdgpu-max-exhaustive-hazard-search", cl::init(128), cl::Hidden,
cl::desc("Maximum function size for exhausive hazard search"));
-static cl::opt<unsigned, false, PaddingRatioParser>
- NopPadding("amdgpu-snop-padding", cl::Hidden,
- cl::desc("Insert a s_nop before every instruction for a given "
- "number of cycles."));
+static cl::opt<unsigned, false, PaddingRatioParser> NopPadding(
+ "amdgpu-snop-padding", cl::Hidden,
+ cl::desc("Insert a s_nop before every instruction for a given "
+ "number of cycles. Does not insert nops into bundles."));
//===----------------------------------------------------------------------===//
// Hazard Recognizer Implementation
@@ -291,9 +291,6 @@ void GCNHazardRecognizer::processBundle() {
for (; MI != E && MI->isInsideBundle(); ++MI) {
CurrCycleInstr = &*MI;
unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);
- unsigned NopPad =
- NopPadding.getNumOccurrences() && !MI->isTerminator() ? NopPadding : 0;
- WaitStates = std::max(WaitStates, NopPad);
if (IsHazardRecognizerMode) {
fixHazards(CurrCycleInstr);
diff --git a/llvm/test/CodeGen/AMDGPU/snop-padding.mir b/llvm/test/CodeGen/AMDGPU/snop-padding.mir
index d33d59b45c470b..765d1205eb8872 100644
--- a/llvm/test/CodeGen/AMDGPU/snop-padding.mir
+++ b/llvm/test/CodeGen/AMDGPU/snop-padding.mir
@@ -6,6 +6,21 @@
# do not insert s_nop between terminators
+--- |
+ define amdgpu_kernel void @multiple-terminators() { ret void }
+ define amdgpu_kernel void @standard() { ret void }
+
+ define amdgpu_kernel void @bundle() #0 {
+ %1 = call i32 @func()
+ ret void
+ }
+
+ declare hidden i32 @func() #0
+
+ attributes #0 = { nounwind }
+...
+
+
---
name: multiple-terminators
body: |
@@ -75,36 +90,30 @@ body: |
$sgpr1 = S_ADD_U32 $sgpr1, 2, implicit-def $scc
...
-# insert s_nop inside bundles
---
-name: bundle
+name: standard
body: |
bb.0:
- ; GCN-LABEL: name: bundle
+ ; GCN-LABEL: name: standard
; GCN: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
- ; GCN-NEXT: BUNDLE implicit-def $sgpr0_sgpr1 {
- ; GCN-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
- ; GCN-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 4, implicit-def $scc
- ; GCN-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 2, implicit-def $scc, implicit $scc
- ; GCN-NEXT: }
+ ; GCN-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
+ ; GCN-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 4, implicit-def $scc
+ ; GCN-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 2, implicit-def $scc, implicit $scc
; GCN-NEXT: S_ENDPGM 0
;
- ; GCN-NOP1-LABEL: name: bundle
+ ; GCN-NOP1-LABEL: name: standard
; GCN-NOP1: S_NOP 0
; GCN-NOP1-NEXT: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
; GCN-NOP1-NEXT: S_NOP 0
- ; GCN-NOP1-NEXT: BUNDLE implicit-def $sgpr0_sgpr1 {
- ; GCN-NOP1-NEXT: S_NOP 0
- ; GCN-NOP1-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
- ; GCN-NOP1-NEXT: S_NOP 0
- ; GCN-NOP1-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 4, implicit-def $scc
- ; GCN-NOP1-NEXT: S_NOP 0
- ; GCN-NOP1-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 2, implicit-def $scc, implicit $scc
- ; GCN-NOP1-NEXT: }
+ ; GCN-NOP1-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
+ ; GCN-NOP1-NEXT: S_NOP 0
+ ; GCN-NOP1-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 4, implicit-def $scc
+ ; GCN-NOP1-NEXT: S_NOP 0
+ ; GCN-NOP1-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 2, implicit-def $scc, implicit $scc
; GCN-NOP1-NEXT: S_ENDPGM 0
;
- ; GCN-NOP20-LABEL: name: bundle
+ ; GCN-NOP20-LABEL: name: standard
; GCN-NOP20: S_NOP 7
; GCN-NOP20-NEXT: S_NOP 7
; GCN-NOP20-NEXT: S_NOP 3
@@ -112,54 +121,53 @@ body: |
; GCN-NOP20-NEXT: S_NOP 7
; GCN-NOP20-NEXT: S_NOP 7
; GCN-NOP20-NEXT: S_NOP 3
- ; GCN-NOP20-NEXT: BUNDLE implicit-def $sgpr0_sgpr1 {
- ; GCN-NOP20-NEXT: S_NOP 7
- ; GCN-NOP20-NEXT: S_NOP 7
- ; GCN-NOP20-NEXT: S_NOP 3
- ; GCN-NOP20-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
- ; GCN-NOP20-NEXT: S_NOP 7
- ; GCN-NOP20-NEXT: S_NOP 7
- ; GCN-NOP20-NEXT: S_NOP 3
- ; GCN-NOP20-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 4, implicit-def $scc
- ; GCN-NOP20-NEXT: S_NOP 7
- ; GCN-NOP20-NEXT: S_NOP 7
- ; GCN-NOP20-NEXT: S_NOP 3
- ; GCN-NOP20-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 2, implicit-def $scc, implicit $scc
- ; GCN-NOP20-NEXT: }
+ ; GCN-NOP20-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
+ ; GCN-NOP20-NEXT: S_NOP 7
+ ; GCN-NOP20-NEXT: S_NOP 7
+ ; GCN-NOP20-NEXT: S_NOP 3
+ ; GCN-NOP20-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 4, implicit-def $scc
+ ; GCN-NOP20-NEXT: S_NOP 7
+ ; GCN-NOP20-NEXT: S_NOP 7
+ ; GCN-NOP20-NEXT: S_NOP 3
+ ; GCN-NOP20-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 2, implicit-def $scc, implicit $scc
; GCN-NOP20-NEXT: S_ENDPGM 0
$vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
- BUNDLE implicit-def $sgpr0_sgpr1 {
- $sgpr0_sgpr1 = S_GETPC_B64
- $sgpr0 = S_ADD_U32 $sgpr0, 4, implicit-def $scc
- $sgpr1 = S_ADDC_U32 $sgpr1, 2, implicit-def $scc, implicit $scc
- }
+ $sgpr0_sgpr1 = S_GETPC_B64
+ $sgpr0 = S_ADD_U32 $sgpr0, 4, implicit-def $scc
+ $sgpr1 = S_ADDC_U32 $sgpr1, 2, implicit-def $scc, implicit $scc
S_ENDPGM 0
...
+# Do not insert s_nop inside bundles, so that potential PC-relative offsets are preserved.
---
-name: standard
+name: bundle
body: |
bb.0:
- ; GCN-LABEL: name: standard
+ ; GCN-LABEL: name: bundle
; GCN: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
- ; GCN-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
- ; GCN-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 4, implicit-def $scc
- ; GCN-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 2, implicit-def $scc, implicit $scc
+ ; GCN-NEXT: BUNDLE implicit-def $sgpr0_sgpr1 {
+ ; GCN-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
+ ; GCN-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, target-flags(amdgpu-rel32-lo) @func + 4, implicit-def $scc
+ ; GCN-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, target-flags(amdgpu-rel32-hi) @func + 4, implicit-def $scc, implicit $scc
+ ; GCN-NEXT: }
+ ; GCN-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def dead $vgpr0
; GCN-NEXT: S_ENDPGM 0
;
- ; GCN-NOP1-LABEL: name: standard
+ ; GCN-NOP1-LABEL: name: bundle
; GCN-NOP1: S_NOP 0
; GCN-NOP1-NEXT: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
; GCN-NOP1-NEXT: S_NOP 0
- ; GCN-NOP1-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
- ; GCN-NOP1-NEXT: S_NOP 0
- ; GCN-NOP1-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 4, implicit-def $scc
+ ; GCN-NOP1-NEXT: BUNDLE implicit-def $sgpr0_sgpr1 {
+ ; GCN-NOP1-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
+ ; GCN-NOP1-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, target-flags(amdgpu-rel32-lo) @func + 4, implicit-def $scc
+ ; GCN-NOP1-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, target-flags(amdgpu-rel32-hi) @func + 4, implicit-def $scc, implicit $scc
+ ; GCN-NOP1-NEXT: }
; GCN-NOP1-NEXT: S_NOP 0
- ; GCN-NOP1-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 2, implicit-def $scc, implicit $scc
+ ; GCN-NOP1-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def dead $vgpr0
; GCN-NOP1-NEXT: S_ENDPGM 0
;
- ; GCN-NOP20-LABEL: name: standard
+ ; GCN-NOP20-LABEL: name: bundle
; GCN-NOP20: S_NOP 7
; GCN-NOP20-NEXT: S_NOP 7
; GCN-NOP20-NEXT: S_NOP 3
@@ -167,19 +175,24 @@ body: |
; GCN-NOP20-NEXT: S_NOP 7
; GCN-NOP20-NEXT: S_NOP 7
; GCN-NOP20-NEXT: S_NOP 3
- ; GCN-NOP20-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
- ; GCN-NOP20-NEXT: S_NOP 7
- ; GCN-NOP20-NEXT: S_NOP 7
- ; GCN-NOP20-NEXT: S_NOP 3
- ; GCN-NOP20-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 4, implicit-def $scc
+ ; GCN-NOP20-NEXT: BUNDLE implicit-def $sgpr0_sgpr1 {
+ ; GCN-NOP20-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
+ ; GCN-NOP20-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, target-flags(amdgpu-rel32-lo) @func + 4, implicit-def $scc
+ ; GCN-NOP20-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, target-flags(amdgpu-rel32-hi) @func + 4, implicit-def $scc, implicit $scc
+ ; GCN-NOP20-NEXT: }
; GCN-NOP20-NEXT: S_NOP 7
; GCN-NOP20-NEXT: S_NOP 7
; GCN-NOP20-NEXT: S_NOP 3
- ; GCN-NOP20-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 2, implicit-def $scc, implicit $scc
+ ; GCN-NOP20-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def dead $vgpr0
; GCN-NOP20-NEXT: S_ENDPGM 0
$vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
- $sgpr0_sgpr1 = S_GETPC_B64
- $sgpr0 = S_ADD_U32 $sgpr0, 4, implicit-def $scc
- $sgpr1 = S_ADDC_U32 $sgpr1, 2, implicit-def $scc, implicit $scc
+ BUNDLE implicit-def $sgpr0_sgpr1 {
+ $sgpr0_sgpr1 = S_GETPC_B64
+ $sgpr0 = S_ADD_U32 $sgpr0, target-flags(amdgpu-rel32-lo) @func + 4, implicit-def $scc
+ $sgpr1 = S_ADDC_U32 $sgpr1, target-flags(amdgpu-rel32-hi) @func + 4, implicit-def $scc, implicit $scc
+ }
+ dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def dead $vgpr0
S_ENDPGM 0
...
+
+
More information about the llvm-commits
mailing list