[llvm] [AMDGPU] Add flag to force emit s_nop (PR #117839)

Jeffrey Byrnes via llvm-commits llvm-commits at lists.llvm.org
Mon Dec 2 11:31:09 PST 2024


https://github.com/jrbyrnes updated https://github.com/llvm/llvm-project/pull/117839

From 1b645cf4482c13a1fff78a5dded2df66ae1a8c84 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Tue, 26 Nov 2024 19:31:56 -0800
Subject: [PATCH 1/4] [AMDGPU] Add flag to force emit s_nop

Change-Id: I1bddb498d73e0138d14a8cb312082e8794da2e47
---
 .../lib/Target/AMDGPU/GCNHazardRecognizer.cpp |  15 ++-
 .../AMDGPU/snop-padding-terminator.mir        |  71 ++++++++++
 llvm/test/CodeGen/AMDGPU/snop-padding.ll      | 124 ++++++++++++++++++
 3 files changed, 206 insertions(+), 4 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/snop-padding-terminator.mir
 create mode 100644 llvm/test/CodeGen/AMDGPU/snop-padding.ll

diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 44afccb0690d0d..870db75bc65b1b 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -24,8 +24,8 @@ using namespace llvm;
 
 namespace {
 
-struct MFMAPaddingRatioParser : public cl::parser<unsigned> {
-  MFMAPaddingRatioParser(cl::Option &O) : cl::parser<unsigned>(O) {}
+struct PaddingRatioParser : public cl::parser<unsigned> {
+  PaddingRatioParser(cl::Option &O) : cl::parser<unsigned>(O) {}
 
   bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, unsigned &Value) {
     if (Arg.getAsInteger(0, Value))
@@ -40,7 +40,7 @@ struct MFMAPaddingRatioParser : public cl::parser<unsigned> {
 
 } // end anonymous namespace
 
-static cl::opt<unsigned, false, MFMAPaddingRatioParser>
+static cl::opt<unsigned, false, PaddingRatioParser>
     MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(0), cl::Hidden,
                      cl::desc("Fill a percentage of the latency between "
                               "neighboring MFMA with s_nops."));
@@ -49,6 +49,11 @@ static cl::opt<unsigned> MaxExhaustiveHazardSearch(
     "amdgpu-max-exhaustive-hazard-search", cl::init(128), cl::Hidden,
     cl::desc("Maximum function size for exhausive hazard search"));
 
+static cl::opt<unsigned, false, PaddingRatioParser>
+    NopPadding("amdgpu-snop-padding", cl::Hidden,
+               cl::desc("Insert a s_nop between every instruction for a given "
+                        "number of cycles."));
+
 //===----------------------------------------------------------------------===//
 // Hazard Recognizer Implementation
 //===----------------------------------------------------------------------===//
@@ -325,7 +330,9 @@ unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
   unsigned W = PreEmitNoopsCommon(MI);
   fixHazards(MI);
   CurrCycleInstr = nullptr;
-  return W;
+  unsigned NopPad =
+      NopPadding.getNumOccurrences() && !MI->isTerminator() ? NopPadding : 0;
+  return std::max(W, NopPad);
 }
 
 unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
diff --git a/llvm/test/CodeGen/AMDGPU/snop-padding-terminator.mir b/llvm/test/CodeGen/AMDGPU/snop-padding-terminator.mir
new file mode 100644
index 00000000000000..6960ac5b0b4bf6
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/snop-padding-terminator.mir
@@ -0,0 +1,71 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 --amdgpu-snop-padding=0 --run-pass=post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 --amdgpu-snop-padding=1 --run-pass=post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN-NOP1 %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 --amdgpu-snop-padding=20 --run-pass=post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN-NOP20 %s
+
+---
+name: waitcnt-debug-non-first-terminators
+liveins:
+machineFunctionInfo:
+  isEntryFunction: true
+body:             |
+  ; GCN-LABEL: name: waitcnt-debug-non-first-terminators
+  ; GCN: bb.0:
+  ; GCN-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT:   S_CBRANCH_SCC1 %bb.1, implicit $scc
+  ; GCN-NEXT:   S_BRANCH %bb.2, implicit $scc
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT: bb.1:
+  ; GCN-NEXT:   successors: %bb.2(0x80000000)
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT:   S_NOP 0
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT: bb.2:
+  ; GCN-NEXT:   S_NOP 0
+  ;
+  ; GCN-NOP1-LABEL: name: waitcnt-debug-non-first-terminators
+  ; GCN-NOP1: bb.0:
+  ; GCN-NOP1-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GCN-NOP1-NEXT: {{  $}}
+  ; GCN-NOP1-NEXT:   S_CBRANCH_SCC1 %bb.1, implicit $scc
+  ; GCN-NOP1-NEXT:   S_BRANCH %bb.2, implicit $scc
+  ; GCN-NOP1-NEXT: {{  $}}
+  ; GCN-NOP1-NEXT: bb.1:
+  ; GCN-NOP1-NEXT:   successors: %bb.2(0x80000000)
+  ; GCN-NOP1-NEXT: {{  $}}
+  ; GCN-NOP1-NEXT:   S_NOP 0
+  ; GCN-NOP1-NEXT:   S_NOP 0
+  ; GCN-NOP1-NEXT: {{  $}}
+  ; GCN-NOP1-NEXT: bb.2:
+  ; GCN-NOP1-NEXT:   S_NOP 0
+  ; GCN-NOP1-NEXT:   S_NOP 0
+  ;
+  ; GCN-NOP20-LABEL: name: waitcnt-debug-non-first-terminators
+  ; GCN-NOP20: bb.0:
+  ; GCN-NOP20-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GCN-NOP20-NEXT: {{  $}}
+  ; GCN-NOP20-NEXT:   S_CBRANCH_SCC1 %bb.1, implicit $scc
+  ; GCN-NOP20-NEXT:   S_BRANCH %bb.2, implicit $scc
+  ; GCN-NOP20-NEXT: {{  $}}
+  ; GCN-NOP20-NEXT: bb.1:
+  ; GCN-NOP20-NEXT:   successors: %bb.2(0x80000000)
+  ; GCN-NOP20-NEXT: {{  $}}
+  ; GCN-NOP20-NEXT:   S_NOP 7
+  ; GCN-NOP20-NEXT:   S_NOP 7
+  ; GCN-NOP20-NEXT:   S_NOP 3
+  ; GCN-NOP20-NEXT:   S_NOP 0
+  ; GCN-NOP20-NEXT: {{  $}}
+  ; GCN-NOP20-NEXT: bb.2:
+  ; GCN-NOP20-NEXT:   S_NOP 7
+  ; GCN-NOP20-NEXT:   S_NOP 7
+  ; GCN-NOP20-NEXT:   S_NOP 3
+  ; GCN-NOP20-NEXT:   S_NOP 0
+  bb.0:
+    S_CBRANCH_SCC1 %bb.1, implicit $scc
+    S_BRANCH %bb.2, implicit $scc
+  bb.1:
+    S_NOP 0
+  bb.2:
+    S_NOP 0
+...
diff --git a/llvm/test/CodeGen/AMDGPU/snop-padding.ll b/llvm/test/CodeGen/AMDGPU/snop-padding.ll
new file mode 100644
index 00000000000000..254507a54a2f75
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/snop-padding.ll
@@ -0,0 +1,124 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx908 < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s
+; RUN: llc -mtriple=amdgcn  --amdgpu-snop-padding=0 -mcpu=gfx908 < %s | FileCheck -check-prefixes=GCN %s
+; RUN: llc -mtriple=amdgcn  --amdgpu-snop-padding=1 -mcpu=gfx908 < %s | FileCheck -check-prefixes=GCN-NOP1 %s
+; RUN: llc -mtriple=amdgcn  --amdgpu-snop-padding=20 -mcpu=gfx908 < %s | FileCheck -check-prefixes=GCN-NOP20 %s
+
+
+define amdgpu_kernel void @fadd_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+; GCN-LABEL: fadd_v4f32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GCN-NEXT:    v_mov_b32_e32 v4, 0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_load_dwordx8 s[0:7], s[10:11], 0x0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v0, s7
+; GCN-NEXT:    v_mov_b32_e32 v1, s6
+; GCN-NEXT:    v_mov_b32_e32 v5, s5
+; GCN-NEXT:    v_mov_b32_e32 v6, s4
+; GCN-NEXT:    v_add_f32_e32 v3, s3, v0
+; GCN-NEXT:    v_add_f32_e32 v2, s2, v1
+; GCN-NEXT:    v_add_f32_e32 v1, s1, v5
+; GCN-NEXT:    v_add_f32_e32 v0, s0, v6
+; GCN-NEXT:    global_store_dwordx4 v4, v[0:3], s[8:9]
+; GCN-NEXT:    s_endpgm
+;
+; GCN-NOP1-LABEL: fadd_v4f32:
+; GCN-NOP1:       ; %bb.0:
+; GCN-NOP1-NEXT:    s_nop 0
+; GCN-NOP1-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GCN-NOP1-NEXT:    s_nop 0
+; GCN-NOP1-NEXT:    v_mov_b32_e32 v4, 0
+; GCN-NOP1-NEXT:    s_nop 0
+; GCN-NOP1-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NOP1-NEXT:    s_nop 0
+; GCN-NOP1-NEXT:    s_load_dwordx8 s[0:7], s[10:11], 0x0
+; GCN-NOP1-NEXT:    s_nop 0
+; GCN-NOP1-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NOP1-NEXT:    s_nop 0
+; GCN-NOP1-NEXT:    v_mov_b32_e32 v0, s7
+; GCN-NOP1-NEXT:    s_nop 0
+; GCN-NOP1-NEXT:    v_mov_b32_e32 v1, s6
+; GCN-NOP1-NEXT:    s_nop 0
+; GCN-NOP1-NEXT:    v_mov_b32_e32 v5, s5
+; GCN-NOP1-NEXT:    s_nop 0
+; GCN-NOP1-NEXT:    v_mov_b32_e32 v6, s4
+; GCN-NOP1-NEXT:    s_nop 0
+; GCN-NOP1-NEXT:    v_add_f32_e32 v3, s3, v0
+; GCN-NOP1-NEXT:    s_nop 0
+; GCN-NOP1-NEXT:    v_add_f32_e32 v2, s2, v1
+; GCN-NOP1-NEXT:    s_nop 0
+; GCN-NOP1-NEXT:    v_add_f32_e32 v1, s1, v5
+; GCN-NOP1-NEXT:    s_nop 0
+; GCN-NOP1-NEXT:    v_add_f32_e32 v0, s0, v6
+; GCN-NOP1-NEXT:    s_nop 0
+; GCN-NOP1-NEXT:    global_store_dwordx4 v4, v[0:3], s[8:9]
+; GCN-NOP1-NEXT:    s_endpgm
+;
+; GCN-NOP20-LABEL: fadd_v4f32:
+; GCN-NOP20:       ; %bb.0:
+; GCN-NOP20-NEXT:    s_nop 7
+; GCN-NOP20-NEXT:    s_nop 7
+; GCN-NOP20-NEXT:    s_nop 3
+; GCN-NOP20-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GCN-NOP20-NEXT:    s_nop 7
+; GCN-NOP20-NEXT:    s_nop 7
+; GCN-NOP20-NEXT:    s_nop 3
+; GCN-NOP20-NEXT:    v_mov_b32_e32 v4, 0
+; GCN-NOP20-NEXT:    s_nop 7
+; GCN-NOP20-NEXT:    s_nop 7
+; GCN-NOP20-NEXT:    s_nop 3
+; GCN-NOP20-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NOP20-NEXT:    s_nop 7
+; GCN-NOP20-NEXT:    s_nop 7
+; GCN-NOP20-NEXT:    s_nop 3
+; GCN-NOP20-NEXT:    s_load_dwordx8 s[0:7], s[10:11], 0x0
+; GCN-NOP20-NEXT:    s_nop 7
+; GCN-NOP20-NEXT:    s_nop 7
+; GCN-NOP20-NEXT:    s_nop 3
+; GCN-NOP20-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NOP20-NEXT:    s_nop 7
+; GCN-NOP20-NEXT:    s_nop 7
+; GCN-NOP20-NEXT:    s_nop 3
+; GCN-NOP20-NEXT:    v_mov_b32_e32 v0, s7
+; GCN-NOP20-NEXT:    s_nop 7
+; GCN-NOP20-NEXT:    s_nop 7
+; GCN-NOP20-NEXT:    s_nop 3
+; GCN-NOP20-NEXT:    v_mov_b32_e32 v1, s6
+; GCN-NOP20-NEXT:    s_nop 7
+; GCN-NOP20-NEXT:    s_nop 7
+; GCN-NOP20-NEXT:    s_nop 3
+; GCN-NOP20-NEXT:    v_mov_b32_e32 v5, s5
+; GCN-NOP20-NEXT:    s_nop 7
+; GCN-NOP20-NEXT:    s_nop 7
+; GCN-NOP20-NEXT:    s_nop 3
+; GCN-NOP20-NEXT:    v_mov_b32_e32 v6, s4
+; GCN-NOP20-NEXT:    s_nop 7
+; GCN-NOP20-NEXT:    s_nop 7
+; GCN-NOP20-NEXT:    s_nop 3
+; GCN-NOP20-NEXT:    v_add_f32_e32 v3, s3, v0
+; GCN-NOP20-NEXT:    s_nop 7
+; GCN-NOP20-NEXT:    s_nop 7
+; GCN-NOP20-NEXT:    s_nop 3
+; GCN-NOP20-NEXT:    v_add_f32_e32 v2, s2, v1
+; GCN-NOP20-NEXT:    s_nop 7
+; GCN-NOP20-NEXT:    s_nop 7
+; GCN-NOP20-NEXT:    s_nop 3
+; GCN-NOP20-NEXT:    v_add_f32_e32 v1, s1, v5
+; GCN-NOP20-NEXT:    s_nop 7
+; GCN-NOP20-NEXT:    s_nop 7
+; GCN-NOP20-NEXT:    s_nop 3
+; GCN-NOP20-NEXT:    v_add_f32_e32 v0, s0, v6
+; GCN-NOP20-NEXT:    s_nop 7
+; GCN-NOP20-NEXT:    s_nop 7
+; GCN-NOP20-NEXT:    s_nop 3
+; GCN-NOP20-NEXT:    global_store_dwordx4 v4, v[0:3], s[8:9]
+; GCN-NOP20-NEXT:    s_endpgm
+  %b_ptr = getelementptr <4 x float>, ptr addrspace(1) %in, i32 1
+  %a = load <4 x float>, ptr addrspace(1) %in, align 16
+  %b = load <4 x float>, ptr addrspace(1) %b_ptr, align 16
+  %result = fadd <4 x float> %a, %b
+  store <4 x float> %result, ptr addrspace(1) %out, align 16
+  ret void
+}

From 844decba4cb04260ca6283508f7a773effb3fb2c Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Tue, 26 Nov 2024 19:41:26 -0800
Subject: [PATCH 2/4] Fix comment

Change-Id: Ia83c67144b7c23fdec2912c72dd30956eba17f03
---
 llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 870db75bc65b1b..d7466aeada43e7 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -43,7 +43,7 @@ struct PaddingRatioParser : public cl::parser<unsigned> {
 static cl::opt<unsigned, false, PaddingRatioParser>
     MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(0), cl::Hidden,
                      cl::desc("Fill a percentage of the latency between "
-                              "neighboring MFMA with s_nops."));
+                              "neighboring instructions with s_nops."));
 
 static cl::opt<unsigned> MaxExhaustiveHazardSearch(
     "amdgpu-max-exhaustive-hazard-search", cl::init(128), cl::Hidden,

From 3fc160b4f56410b5859351f055554095da1d21d5 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Wed, 27 Nov 2024 14:22:31 -0800
Subject: [PATCH 3/4] Review comments

Change-Id: Icdd0c8777ce360a8fcb4dc599f95263a59b8fc4f
---
 .../lib/Target/AMDGPU/GCNHazardRecognizer.cpp |   5 +-
 .../AMDGPU/snop-padding-terminator.mir        |  71 -------
 llvm/test/CodeGen/AMDGPU/snop-padding.ll      | 124 ------------
 llvm/test/CodeGen/AMDGPU/snop-padding.mir     | 185 ++++++++++++++++++
 4 files changed, 189 insertions(+), 196 deletions(-)
 delete mode 100644 llvm/test/CodeGen/AMDGPU/snop-padding-terminator.mir
 delete mode 100644 llvm/test/CodeGen/AMDGPU/snop-padding.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/snop-padding.mir

diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index d7466aeada43e7..e5d3b09165b6cb 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -51,7 +51,7 @@ static cl::opt<unsigned> MaxExhaustiveHazardSearch(
 
 static cl::opt<unsigned, false, PaddingRatioParser>
     NopPadding("amdgpu-snop-padding", cl::Hidden,
-               cl::desc("Insert a s_nop between every instruction for a given "
+               cl::desc("Insert a s_nop before every instruction for a given "
                         "number of cycles."));
 
 //===----------------------------------------------------------------------===//
@@ -291,6 +291,9 @@ void GCNHazardRecognizer::processBundle() {
   for (; MI != E && MI->isInsideBundle(); ++MI) {
     CurrCycleInstr = &*MI;
     unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);
+    unsigned NopPad =
+        NopPadding.getNumOccurrences() && !MI->isTerminator() ? NopPadding : 0;
+    WaitStates = std::max(WaitStates, NopPad);
 
     if (IsHazardRecognizerMode) {
       fixHazards(CurrCycleInstr);
diff --git a/llvm/test/CodeGen/AMDGPU/snop-padding-terminator.mir b/llvm/test/CodeGen/AMDGPU/snop-padding-terminator.mir
deleted file mode 100644
index 6960ac5b0b4bf6..00000000000000
--- a/llvm/test/CodeGen/AMDGPU/snop-padding-terminator.mir
+++ /dev/null
@@ -1,71 +0,0 @@
-# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 --amdgpu-snop-padding=0 --run-pass=post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN %s
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 --amdgpu-snop-padding=1 --run-pass=post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN-NOP1 %s
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 --amdgpu-snop-padding=20 --run-pass=post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN-NOP20 %s
-
----
-name: waitcnt-debug-non-first-terminators
-liveins:
-machineFunctionInfo:
-  isEntryFunction: true
-body:             |
-  ; GCN-LABEL: name: waitcnt-debug-non-first-terminators
-  ; GCN: bb.0:
-  ; GCN-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
-  ; GCN-NEXT: {{  $}}
-  ; GCN-NEXT:   S_CBRANCH_SCC1 %bb.1, implicit $scc
-  ; GCN-NEXT:   S_BRANCH %bb.2, implicit $scc
-  ; GCN-NEXT: {{  $}}
-  ; GCN-NEXT: bb.1:
-  ; GCN-NEXT:   successors: %bb.2(0x80000000)
-  ; GCN-NEXT: {{  $}}
-  ; GCN-NEXT:   S_NOP 0
-  ; GCN-NEXT: {{  $}}
-  ; GCN-NEXT: bb.2:
-  ; GCN-NEXT:   S_NOP 0
-  ;
-  ; GCN-NOP1-LABEL: name: waitcnt-debug-non-first-terminators
-  ; GCN-NOP1: bb.0:
-  ; GCN-NOP1-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
-  ; GCN-NOP1-NEXT: {{  $}}
-  ; GCN-NOP1-NEXT:   S_CBRANCH_SCC1 %bb.1, implicit $scc
-  ; GCN-NOP1-NEXT:   S_BRANCH %bb.2, implicit $scc
-  ; GCN-NOP1-NEXT: {{  $}}
-  ; GCN-NOP1-NEXT: bb.1:
-  ; GCN-NOP1-NEXT:   successors: %bb.2(0x80000000)
-  ; GCN-NOP1-NEXT: {{  $}}
-  ; GCN-NOP1-NEXT:   S_NOP 0
-  ; GCN-NOP1-NEXT:   S_NOP 0
-  ; GCN-NOP1-NEXT: {{  $}}
-  ; GCN-NOP1-NEXT: bb.2:
-  ; GCN-NOP1-NEXT:   S_NOP 0
-  ; GCN-NOP1-NEXT:   S_NOP 0
-  ;
-  ; GCN-NOP20-LABEL: name: waitcnt-debug-non-first-terminators
-  ; GCN-NOP20: bb.0:
-  ; GCN-NOP20-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
-  ; GCN-NOP20-NEXT: {{  $}}
-  ; GCN-NOP20-NEXT:   S_CBRANCH_SCC1 %bb.1, implicit $scc
-  ; GCN-NOP20-NEXT:   S_BRANCH %bb.2, implicit $scc
-  ; GCN-NOP20-NEXT: {{  $}}
-  ; GCN-NOP20-NEXT: bb.1:
-  ; GCN-NOP20-NEXT:   successors: %bb.2(0x80000000)
-  ; GCN-NOP20-NEXT: {{  $}}
-  ; GCN-NOP20-NEXT:   S_NOP 7
-  ; GCN-NOP20-NEXT:   S_NOP 7
-  ; GCN-NOP20-NEXT:   S_NOP 3
-  ; GCN-NOP20-NEXT:   S_NOP 0
-  ; GCN-NOP20-NEXT: {{  $}}
-  ; GCN-NOP20-NEXT: bb.2:
-  ; GCN-NOP20-NEXT:   S_NOP 7
-  ; GCN-NOP20-NEXT:   S_NOP 7
-  ; GCN-NOP20-NEXT:   S_NOP 3
-  ; GCN-NOP20-NEXT:   S_NOP 0
-  bb.0:
-    S_CBRANCH_SCC1 %bb.1, implicit $scc
-    S_BRANCH %bb.2, implicit $scc
-  bb.1:
-    S_NOP 0
-  bb.2:
-    S_NOP 0
-...
diff --git a/llvm/test/CodeGen/AMDGPU/snop-padding.ll b/llvm/test/CodeGen/AMDGPU/snop-padding.ll
deleted file mode 100644
index 254507a54a2f75..00000000000000
--- a/llvm/test/CodeGen/AMDGPU/snop-padding.ll
+++ /dev/null
@@ -1,124 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn -mcpu=gfx908 < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s
-; RUN: llc -mtriple=amdgcn  --amdgpu-snop-padding=0 -mcpu=gfx908 < %s | FileCheck -check-prefixes=GCN %s
-; RUN: llc -mtriple=amdgcn  --amdgpu-snop-padding=1 -mcpu=gfx908 < %s | FileCheck -check-prefixes=GCN-NOP1 %s
-; RUN: llc -mtriple=amdgcn  --amdgpu-snop-padding=20 -mcpu=gfx908 < %s | FileCheck -check-prefixes=GCN-NOP20 %s
-
-
-define amdgpu_kernel void @fadd_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
-; GCN-LABEL: fadd_v4f32:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
-; GCN-NEXT:    v_mov_b32_e32 v4, 0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_load_dwordx8 s[0:7], s[10:11], 0x0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s7
-; GCN-NEXT:    v_mov_b32_e32 v1, s6
-; GCN-NEXT:    v_mov_b32_e32 v5, s5
-; GCN-NEXT:    v_mov_b32_e32 v6, s4
-; GCN-NEXT:    v_add_f32_e32 v3, s3, v0
-; GCN-NEXT:    v_add_f32_e32 v2, s2, v1
-; GCN-NEXT:    v_add_f32_e32 v1, s1, v5
-; GCN-NEXT:    v_add_f32_e32 v0, s0, v6
-; GCN-NEXT:    global_store_dwordx4 v4, v[0:3], s[8:9]
-; GCN-NEXT:    s_endpgm
-;
-; GCN-NOP1-LABEL: fadd_v4f32:
-; GCN-NOP1:       ; %bb.0:
-; GCN-NOP1-NEXT:    s_nop 0
-; GCN-NOP1-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
-; GCN-NOP1-NEXT:    s_nop 0
-; GCN-NOP1-NEXT:    v_mov_b32_e32 v4, 0
-; GCN-NOP1-NEXT:    s_nop 0
-; GCN-NOP1-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOP1-NEXT:    s_nop 0
-; GCN-NOP1-NEXT:    s_load_dwordx8 s[0:7], s[10:11], 0x0
-; GCN-NOP1-NEXT:    s_nop 0
-; GCN-NOP1-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOP1-NEXT:    s_nop 0
-; GCN-NOP1-NEXT:    v_mov_b32_e32 v0, s7
-; GCN-NOP1-NEXT:    s_nop 0
-; GCN-NOP1-NEXT:    v_mov_b32_e32 v1, s6
-; GCN-NOP1-NEXT:    s_nop 0
-; GCN-NOP1-NEXT:    v_mov_b32_e32 v5, s5
-; GCN-NOP1-NEXT:    s_nop 0
-; GCN-NOP1-NEXT:    v_mov_b32_e32 v6, s4
-; GCN-NOP1-NEXT:    s_nop 0
-; GCN-NOP1-NEXT:    v_add_f32_e32 v3, s3, v0
-; GCN-NOP1-NEXT:    s_nop 0
-; GCN-NOP1-NEXT:    v_add_f32_e32 v2, s2, v1
-; GCN-NOP1-NEXT:    s_nop 0
-; GCN-NOP1-NEXT:    v_add_f32_e32 v1, s1, v5
-; GCN-NOP1-NEXT:    s_nop 0
-; GCN-NOP1-NEXT:    v_add_f32_e32 v0, s0, v6
-; GCN-NOP1-NEXT:    s_nop 0
-; GCN-NOP1-NEXT:    global_store_dwordx4 v4, v[0:3], s[8:9]
-; GCN-NOP1-NEXT:    s_endpgm
-;
-; GCN-NOP20-LABEL: fadd_v4f32:
-; GCN-NOP20:       ; %bb.0:
-; GCN-NOP20-NEXT:    s_nop 7
-; GCN-NOP20-NEXT:    s_nop 7
-; GCN-NOP20-NEXT:    s_nop 3
-; GCN-NOP20-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
-; GCN-NOP20-NEXT:    s_nop 7
-; GCN-NOP20-NEXT:    s_nop 7
-; GCN-NOP20-NEXT:    s_nop 3
-; GCN-NOP20-NEXT:    v_mov_b32_e32 v4, 0
-; GCN-NOP20-NEXT:    s_nop 7
-; GCN-NOP20-NEXT:    s_nop 7
-; GCN-NOP20-NEXT:    s_nop 3
-; GCN-NOP20-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOP20-NEXT:    s_nop 7
-; GCN-NOP20-NEXT:    s_nop 7
-; GCN-NOP20-NEXT:    s_nop 3
-; GCN-NOP20-NEXT:    s_load_dwordx8 s[0:7], s[10:11], 0x0
-; GCN-NOP20-NEXT:    s_nop 7
-; GCN-NOP20-NEXT:    s_nop 7
-; GCN-NOP20-NEXT:    s_nop 3
-; GCN-NOP20-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOP20-NEXT:    s_nop 7
-; GCN-NOP20-NEXT:    s_nop 7
-; GCN-NOP20-NEXT:    s_nop 3
-; GCN-NOP20-NEXT:    v_mov_b32_e32 v0, s7
-; GCN-NOP20-NEXT:    s_nop 7
-; GCN-NOP20-NEXT:    s_nop 7
-; GCN-NOP20-NEXT:    s_nop 3
-; GCN-NOP20-NEXT:    v_mov_b32_e32 v1, s6
-; GCN-NOP20-NEXT:    s_nop 7
-; GCN-NOP20-NEXT:    s_nop 7
-; GCN-NOP20-NEXT:    s_nop 3
-; GCN-NOP20-NEXT:    v_mov_b32_e32 v5, s5
-; GCN-NOP20-NEXT:    s_nop 7
-; GCN-NOP20-NEXT:    s_nop 7
-; GCN-NOP20-NEXT:    s_nop 3
-; GCN-NOP20-NEXT:    v_mov_b32_e32 v6, s4
-; GCN-NOP20-NEXT:    s_nop 7
-; GCN-NOP20-NEXT:    s_nop 7
-; GCN-NOP20-NEXT:    s_nop 3
-; GCN-NOP20-NEXT:    v_add_f32_e32 v3, s3, v0
-; GCN-NOP20-NEXT:    s_nop 7
-; GCN-NOP20-NEXT:    s_nop 7
-; GCN-NOP20-NEXT:    s_nop 3
-; GCN-NOP20-NEXT:    v_add_f32_e32 v2, s2, v1
-; GCN-NOP20-NEXT:    s_nop 7
-; GCN-NOP20-NEXT:    s_nop 7
-; GCN-NOP20-NEXT:    s_nop 3
-; GCN-NOP20-NEXT:    v_add_f32_e32 v1, s1, v5
-; GCN-NOP20-NEXT:    s_nop 7
-; GCN-NOP20-NEXT:    s_nop 7
-; GCN-NOP20-NEXT:    s_nop 3
-; GCN-NOP20-NEXT:    v_add_f32_e32 v0, s0, v6
-; GCN-NOP20-NEXT:    s_nop 7
-; GCN-NOP20-NEXT:    s_nop 7
-; GCN-NOP20-NEXT:    s_nop 3
-; GCN-NOP20-NEXT:    global_store_dwordx4 v4, v[0:3], s[8:9]
-; GCN-NOP20-NEXT:    s_endpgm
-  %b_ptr = getelementptr <4 x float>, ptr addrspace(1) %in, i32 1
-  %a = load <4 x float>, ptr addrspace(1) %in, align 16
-  %b = load <4 x float>, ptr addrspace(1) %b_ptr, align 16
-  %result = fadd <4 x float> %a, %b
-  store <4 x float> %result, ptr addrspace(1) %out, align 16
-  ret void
-}
diff --git a/llvm/test/CodeGen/AMDGPU/snop-padding.mir b/llvm/test/CodeGen/AMDGPU/snop-padding.mir
new file mode 100644
index 00000000000000..d33d59b45c470b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/snop-padding.mir
@@ -0,0 +1,185 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 --run-pass=post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 --amdgpu-snop-padding=0 --run-pass=post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 --amdgpu-snop-padding=1 --run-pass=post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN-NOP1 %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 --amdgpu-snop-padding=20 --run-pass=post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN-NOP20 %s
+
+# do not insert s_nop between terminators
+
+---
+name: multiple-terminators
+body:             |
+  ; GCN-LABEL: name: multiple-terminators
+  ; GCN: bb.0:
+  ; GCN-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GCN-NEXT:   liveins: $sgpr0, $sgpr1
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT:   S_CBRANCH_SCC1 %bb.1, implicit $scc
+  ; GCN-NEXT:   S_BRANCH %bb.2, implicit $scc
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT: bb.1:
+  ; GCN-NEXT:   successors: %bb.2(0x80000000)
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT:   $sgpr0 = S_ADD_U32 $sgpr0, 4, implicit-def $scc
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT: bb.2:
+  ; GCN-NEXT:   $sgpr1 = S_ADD_U32 $sgpr1, 2, implicit-def $scc
+  ;
+  ; GCN-NOP1-LABEL: name: multiple-terminators
+  ; GCN-NOP1: bb.0:
+  ; GCN-NOP1-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GCN-NOP1-NEXT:   liveins: $sgpr0, $sgpr1
+  ; GCN-NOP1-NEXT: {{  $}}
+  ; GCN-NOP1-NEXT:   S_CBRANCH_SCC1 %bb.1, implicit $scc
+  ; GCN-NOP1-NEXT:   S_BRANCH %bb.2, implicit $scc
+  ; GCN-NOP1-NEXT: {{  $}}
+  ; GCN-NOP1-NEXT: bb.1:
+  ; GCN-NOP1-NEXT:   successors: %bb.2(0x80000000)
+  ; GCN-NOP1-NEXT: {{  $}}
+  ; GCN-NOP1-NEXT:   S_NOP 0
+  ; GCN-NOP1-NEXT:   $sgpr0 = S_ADD_U32 $sgpr0, 4, implicit-def $scc
+  ; GCN-NOP1-NEXT: {{  $}}
+  ; GCN-NOP1-NEXT: bb.2:
+  ; GCN-NOP1-NEXT:   S_NOP 0
+  ; GCN-NOP1-NEXT:   $sgpr1 = S_ADD_U32 $sgpr1, 2, implicit-def $scc
+  ;
+  ; GCN-NOP20-LABEL: name: multiple-terminators
+  ; GCN-NOP20: bb.0:
+  ; GCN-NOP20-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GCN-NOP20-NEXT:   liveins: $sgpr0, $sgpr1
+  ; GCN-NOP20-NEXT: {{  $}}
+  ; GCN-NOP20-NEXT:   S_CBRANCH_SCC1 %bb.1, implicit $scc
+  ; GCN-NOP20-NEXT:   S_BRANCH %bb.2, implicit $scc
+  ; GCN-NOP20-NEXT: {{  $}}
+  ; GCN-NOP20-NEXT: bb.1:
+  ; GCN-NOP20-NEXT:   successors: %bb.2(0x80000000)
+  ; GCN-NOP20-NEXT: {{  $}}
+  ; GCN-NOP20-NEXT:   S_NOP 7
+  ; GCN-NOP20-NEXT:   S_NOP 7
+  ; GCN-NOP20-NEXT:   S_NOP 3
+  ; GCN-NOP20-NEXT:   $sgpr0 = S_ADD_U32 $sgpr0, 4, implicit-def $scc
+  ; GCN-NOP20-NEXT: {{  $}}
+  ; GCN-NOP20-NEXT: bb.2:
+  ; GCN-NOP20-NEXT:   S_NOP 7
+  ; GCN-NOP20-NEXT:   S_NOP 7
+  ; GCN-NOP20-NEXT:   S_NOP 3
+  ; GCN-NOP20-NEXT:   $sgpr1 = S_ADD_U32 $sgpr1, 2, implicit-def $scc
+  bb.0:
+    liveins: $sgpr0, $sgpr1
+
+    S_CBRANCH_SCC1 %bb.1, implicit $scc
+    S_BRANCH %bb.2, implicit $scc
+  bb.1:
+    $sgpr0 = S_ADD_U32 $sgpr0, 4, implicit-def $scc
+  bb.2:
+    $sgpr1 = S_ADD_U32 $sgpr1, 2, implicit-def $scc
+...
+
+# insert s_nop inside bundles
+
+---
+name:            bundle
+body:            |
+  bb.0:
+    ; GCN-LABEL: name: bundle
+    ; GCN: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
+    ; GCN-NEXT: BUNDLE implicit-def $sgpr0_sgpr1 {
+    ; GCN-NEXT:   $sgpr0_sgpr1 = S_GETPC_B64
+    ; GCN-NEXT:   $sgpr0 = S_ADD_U32 $sgpr0, 4, implicit-def $scc
+    ; GCN-NEXT:   $sgpr1 = S_ADDC_U32 $sgpr1, 2, implicit-def $scc, implicit $scc
+    ; GCN-NEXT: }
+    ; GCN-NEXT: S_ENDPGM 0
+    ;
+    ; GCN-NOP1-LABEL: name: bundle
+    ; GCN-NOP1: S_NOP 0
+    ; GCN-NOP1-NEXT: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
+    ; GCN-NOP1-NEXT: S_NOP 0
+    ; GCN-NOP1-NEXT: BUNDLE implicit-def $sgpr0_sgpr1 {
+    ; GCN-NOP1-NEXT:   S_NOP 0
+    ; GCN-NOP1-NEXT:   $sgpr0_sgpr1 = S_GETPC_B64
+    ; GCN-NOP1-NEXT:   S_NOP 0
+    ; GCN-NOP1-NEXT:   $sgpr0 = S_ADD_U32 $sgpr0, 4, implicit-def $scc
+    ; GCN-NOP1-NEXT:   S_NOP 0
+    ; GCN-NOP1-NEXT:   $sgpr1 = S_ADDC_U32 $sgpr1, 2, implicit-def $scc, implicit $scc
+    ; GCN-NOP1-NEXT: }
+    ; GCN-NOP1-NEXT: S_ENDPGM 0
+    ;
+    ; GCN-NOP20-LABEL: name: bundle
+    ; GCN-NOP20: S_NOP 7
+    ; GCN-NOP20-NEXT: S_NOP 7
+    ; GCN-NOP20-NEXT: S_NOP 3
+    ; GCN-NOP20-NEXT: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
+    ; GCN-NOP20-NEXT: S_NOP 7
+    ; GCN-NOP20-NEXT: S_NOP 7
+    ; GCN-NOP20-NEXT: S_NOP 3
+    ; GCN-NOP20-NEXT: BUNDLE implicit-def $sgpr0_sgpr1 {
+    ; GCN-NOP20-NEXT:   S_NOP 7
+    ; GCN-NOP20-NEXT:   S_NOP 7
+    ; GCN-NOP20-NEXT:   S_NOP 3
+    ; GCN-NOP20-NEXT:   $sgpr0_sgpr1 = S_GETPC_B64
+    ; GCN-NOP20-NEXT:   S_NOP 7
+    ; GCN-NOP20-NEXT:   S_NOP 7
+    ; GCN-NOP20-NEXT:   S_NOP 3
+    ; GCN-NOP20-NEXT:   $sgpr0 = S_ADD_U32 $sgpr0, 4, implicit-def $scc
+    ; GCN-NOP20-NEXT:   S_NOP 7
+    ; GCN-NOP20-NEXT:   S_NOP 7
+    ; GCN-NOP20-NEXT:   S_NOP 3
+    ; GCN-NOP20-NEXT:   $sgpr1 = S_ADDC_U32 $sgpr1, 2, implicit-def $scc, implicit $scc
+    ; GCN-NOP20-NEXT: }
+    ; GCN-NOP20-NEXT: S_ENDPGM 0
+    $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
+    BUNDLE implicit-def $sgpr0_sgpr1 {
+      $sgpr0_sgpr1 = S_GETPC_B64
+      $sgpr0 = S_ADD_U32 $sgpr0, 4, implicit-def $scc
+      $sgpr1 = S_ADDC_U32 $sgpr1, 2, implicit-def $scc, implicit $scc
+    }
+    S_ENDPGM 0
+...
+
+
+---
+name:            standard
+body:            |
+  bb.0:
+    ; GCN-LABEL: name: standard
+    ; GCN: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
+    ; GCN-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
+    ; GCN-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 4, implicit-def $scc
+    ; GCN-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 2, implicit-def $scc, implicit $scc
+    ; GCN-NEXT: S_ENDPGM 0
+    ;
+    ; GCN-NOP1-LABEL: name: standard
+    ; GCN-NOP1: S_NOP 0
+    ; GCN-NOP1-NEXT: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
+    ; GCN-NOP1-NEXT: S_NOP 0
+    ; GCN-NOP1-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
+    ; GCN-NOP1-NEXT: S_NOP 0
+    ; GCN-NOP1-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 4, implicit-def $scc
+    ; GCN-NOP1-NEXT: S_NOP 0
+    ; GCN-NOP1-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 2, implicit-def $scc, implicit $scc
+    ; GCN-NOP1-NEXT: S_ENDPGM 0
+    ;
+    ; GCN-NOP20-LABEL: name: standard
+    ; GCN-NOP20: S_NOP 7
+    ; GCN-NOP20-NEXT: S_NOP 7
+    ; GCN-NOP20-NEXT: S_NOP 3
+    ; GCN-NOP20-NEXT: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
+    ; GCN-NOP20-NEXT: S_NOP 7
+    ; GCN-NOP20-NEXT: S_NOP 7
+    ; GCN-NOP20-NEXT: S_NOP 3
+    ; GCN-NOP20-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
+    ; GCN-NOP20-NEXT: S_NOP 7
+    ; GCN-NOP20-NEXT: S_NOP 7
+    ; GCN-NOP20-NEXT: S_NOP 3
+    ; GCN-NOP20-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 4, implicit-def $scc
+    ; GCN-NOP20-NEXT: S_NOP 7
+    ; GCN-NOP20-NEXT: S_NOP 7
+    ; GCN-NOP20-NEXT: S_NOP 3
+    ; GCN-NOP20-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 2, implicit-def $scc, implicit $scc
+    ; GCN-NOP20-NEXT: S_ENDPGM 0
+    $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
+    $sgpr0_sgpr1 = S_GETPC_B64
+    $sgpr0 = S_ADD_U32 $sgpr0, 4, implicit-def $scc
+    $sgpr1 = S_ADDC_U32 $sgpr1, 2, implicit-def $scc, implicit $scc
+    S_ENDPGM 0
+...

From 241abc33a43dd8dea3952003adab91f266672f3b Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Mon, 2 Dec 2024 11:17:34 -0800
Subject: [PATCH 4/4] Fix bundle handling

Change-Id: I7de1185831662e8e8b8b5612c52f703a05a62d88
---
 .../lib/Target/AMDGPU/GCNHazardRecognizer.cpp |  11 +-
 llvm/test/CodeGen/AMDGPU/snop-padding.mir     | 127 ++++++++++--------
 2 files changed, 74 insertions(+), 64 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index e5d3b09165b6cb..a6b8c4da47803b 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -49,10 +49,10 @@ static cl::opt<unsigned> MaxExhaustiveHazardSearch(
     "amdgpu-max-exhaustive-hazard-search", cl::init(128), cl::Hidden,
     cl::desc("Maximum function size for exhausive hazard search"));
 
-static cl::opt<unsigned, false, PaddingRatioParser>
-    NopPadding("amdgpu-snop-padding", cl::Hidden,
-               cl::desc("Insert a s_nop before every instruction for a given "
-                        "number of cycles."));
+static cl::opt<unsigned, false, PaddingRatioParser> NopPadding(
+    "amdgpu-snop-padding", cl::Hidden,
+    cl::desc("Insert a s_nop before every instruction for a given "
+             "number of cycles. Does not insert nops into bundles."));
 
 //===----------------------------------------------------------------------===//
 // Hazard Recognizer Implementation
@@ -291,9 +291,6 @@ void GCNHazardRecognizer::processBundle() {
   for (; MI != E && MI->isInsideBundle(); ++MI) {
     CurrCycleInstr = &*MI;
     unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);
-    unsigned NopPad =
-        NopPadding.getNumOccurrences() && !MI->isTerminator() ? NopPadding : 0;
-    WaitStates = std::max(WaitStates, NopPad);
 
     if (IsHazardRecognizerMode) {
       fixHazards(CurrCycleInstr);
diff --git a/llvm/test/CodeGen/AMDGPU/snop-padding.mir b/llvm/test/CodeGen/AMDGPU/snop-padding.mir
index d33d59b45c470b..765d1205eb8872 100644
--- a/llvm/test/CodeGen/AMDGPU/snop-padding.mir
+++ b/llvm/test/CodeGen/AMDGPU/snop-padding.mir
@@ -6,6 +6,21 @@
 
 # do not insert s_nop between terminators
 
+--- |
+  define amdgpu_kernel void @multiple-terminators() { ret void }
+  define amdgpu_kernel void @standard() { ret void }
+
+  define amdgpu_kernel void @bundle() #0 {
+    %1 = call i32 @func()
+    ret void
+  }
+
+  declare hidden i32 @func() #0
+
+  attributes #0 = { nounwind }
+...
+
+
 ---
 name: multiple-terminators
 body:             |
@@ -75,36 +90,30 @@ body:             |
     $sgpr1 = S_ADD_U32 $sgpr1, 2, implicit-def $scc
 ...
 
-# insert s_nop inside bundles
 
 ---
-name:            bundle
+name:            standard
 body:            |
   bb.0:
-    ; GCN-LABEL: name: bundle
+    ; GCN-LABEL: name: standard
     ; GCN: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
-    ; GCN-NEXT: BUNDLE implicit-def $sgpr0_sgpr1 {
-    ; GCN-NEXT:   $sgpr0_sgpr1 = S_GETPC_B64
-    ; GCN-NEXT:   $sgpr0 = S_ADD_U32 $sgpr0, 4, implicit-def $scc
-    ; GCN-NEXT:   $sgpr1 = S_ADDC_U32 $sgpr1, 2, implicit-def $scc, implicit $scc
-    ; GCN-NEXT: }
+    ; GCN-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
+    ; GCN-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 4, implicit-def $scc
+    ; GCN-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 2, implicit-def $scc, implicit $scc
     ; GCN-NEXT: S_ENDPGM 0
     ;
-    ; GCN-NOP1-LABEL: name: bundle
+    ; GCN-NOP1-LABEL: name: standard
     ; GCN-NOP1: S_NOP 0
     ; GCN-NOP1-NEXT: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
     ; GCN-NOP1-NEXT: S_NOP 0
-    ; GCN-NOP1-NEXT: BUNDLE implicit-def $sgpr0_sgpr1 {
-    ; GCN-NOP1-NEXT:   S_NOP 0
-    ; GCN-NOP1-NEXT:   $sgpr0_sgpr1 = S_GETPC_B64
-    ; GCN-NOP1-NEXT:   S_NOP 0
-    ; GCN-NOP1-NEXT:   $sgpr0 = S_ADD_U32 $sgpr0, 4, implicit-def $scc
-    ; GCN-NOP1-NEXT:   S_NOP 0
-    ; GCN-NOP1-NEXT:   $sgpr1 = S_ADDC_U32 $sgpr1, 2, implicit-def $scc, implicit $scc
-    ; GCN-NOP1-NEXT: }
+    ; GCN-NOP1-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
+    ; GCN-NOP1-NEXT: S_NOP 0
+    ; GCN-NOP1-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 4, implicit-def $scc
+    ; GCN-NOP1-NEXT: S_NOP 0
+    ; GCN-NOP1-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 2, implicit-def $scc, implicit $scc
     ; GCN-NOP1-NEXT: S_ENDPGM 0
     ;
-    ; GCN-NOP20-LABEL: name: bundle
+    ; GCN-NOP20-LABEL: name: standard
     ; GCN-NOP20: S_NOP 7
     ; GCN-NOP20-NEXT: S_NOP 7
     ; GCN-NOP20-NEXT: S_NOP 3
@@ -112,54 +121,53 @@ body:            |
     ; GCN-NOP20-NEXT: S_NOP 7
     ; GCN-NOP20-NEXT: S_NOP 7
     ; GCN-NOP20-NEXT: S_NOP 3
-    ; GCN-NOP20-NEXT: BUNDLE implicit-def $sgpr0_sgpr1 {
-    ; GCN-NOP20-NEXT:   S_NOP 7
-    ; GCN-NOP20-NEXT:   S_NOP 7
-    ; GCN-NOP20-NEXT:   S_NOP 3
-    ; GCN-NOP20-NEXT:   $sgpr0_sgpr1 = S_GETPC_B64
-    ; GCN-NOP20-NEXT:   S_NOP 7
-    ; GCN-NOP20-NEXT:   S_NOP 7
-    ; GCN-NOP20-NEXT:   S_NOP 3
-    ; GCN-NOP20-NEXT:   $sgpr0 = S_ADD_U32 $sgpr0, 4, implicit-def $scc
-    ; GCN-NOP20-NEXT:   S_NOP 7
-    ; GCN-NOP20-NEXT:   S_NOP 7
-    ; GCN-NOP20-NEXT:   S_NOP 3
-    ; GCN-NOP20-NEXT:   $sgpr1 = S_ADDC_U32 $sgpr1, 2, implicit-def $scc, implicit $scc
-    ; GCN-NOP20-NEXT: }
+    ; GCN-NOP20-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
+    ; GCN-NOP20-NEXT: S_NOP 7
+    ; GCN-NOP20-NEXT: S_NOP 7
+    ; GCN-NOP20-NEXT: S_NOP 3
+    ; GCN-NOP20-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 4, implicit-def $scc
+    ; GCN-NOP20-NEXT: S_NOP 7
+    ; GCN-NOP20-NEXT: S_NOP 7
+    ; GCN-NOP20-NEXT: S_NOP 3
+    ; GCN-NOP20-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 2, implicit-def $scc, implicit $scc
     ; GCN-NOP20-NEXT: S_ENDPGM 0
     $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
-    BUNDLE implicit-def $sgpr0_sgpr1 {
-      $sgpr0_sgpr1 = S_GETPC_B64
-      $sgpr0 = S_ADD_U32 $sgpr0, 4, implicit-def $scc
-      $sgpr1 = S_ADDC_U32 $sgpr1, 2, implicit-def $scc, implicit $scc
-    }
+    $sgpr0_sgpr1 = S_GETPC_B64
+    $sgpr0 = S_ADD_U32 $sgpr0, 4, implicit-def $scc
+    $sgpr1 = S_ADDC_U32 $sgpr1, 2, implicit-def $scc, implicit $scc
     S_ENDPGM 0
 ...
 
+# Do not insert s_nop inside bundles, so that potential PC-relative offsets are preserved.
 
 ---
-name:            standard
+name:            bundle
 body:            |
   bb.0:
-    ; GCN-LABEL: name: standard
+    ; GCN-LABEL: name: bundle
     ; GCN: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
-    ; GCN-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
-    ; GCN-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 4, implicit-def $scc
-    ; GCN-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 2, implicit-def $scc, implicit $scc
+    ; GCN-NEXT: BUNDLE implicit-def $sgpr0_sgpr1 {
+    ; GCN-NEXT:   $sgpr0_sgpr1 = S_GETPC_B64
+    ; GCN-NEXT:   $sgpr0 = S_ADD_U32 $sgpr0, target-flags(amdgpu-rel32-lo) @func + 4, implicit-def $scc
+    ; GCN-NEXT:   $sgpr1 = S_ADDC_U32 $sgpr1, target-flags(amdgpu-rel32-hi) @func + 4, implicit-def $scc, implicit $scc
+    ; GCN-NEXT: }
+    ; GCN-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def dead $vgpr0
     ; GCN-NEXT: S_ENDPGM 0
     ;
-    ; GCN-NOP1-LABEL: name: standard
+    ; GCN-NOP1-LABEL: name: bundle
     ; GCN-NOP1: S_NOP 0
     ; GCN-NOP1-NEXT: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
     ; GCN-NOP1-NEXT: S_NOP 0
-    ; GCN-NOP1-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
-    ; GCN-NOP1-NEXT: S_NOP 0
-    ; GCN-NOP1-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 4, implicit-def $scc
+    ; GCN-NOP1-NEXT: BUNDLE implicit-def $sgpr0_sgpr1 {
+    ; GCN-NOP1-NEXT:   $sgpr0_sgpr1 = S_GETPC_B64
+    ; GCN-NOP1-NEXT:   $sgpr0 = S_ADD_U32 $sgpr0, target-flags(amdgpu-rel32-lo) @func + 4, implicit-def $scc
+    ; GCN-NOP1-NEXT:   $sgpr1 = S_ADDC_U32 $sgpr1, target-flags(amdgpu-rel32-hi) @func + 4, implicit-def $scc, implicit $scc
+    ; GCN-NOP1-NEXT: }
     ; GCN-NOP1-NEXT: S_NOP 0
-    ; GCN-NOP1-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 2, implicit-def $scc, implicit $scc
+    ; GCN-NOP1-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def dead $vgpr0
     ; GCN-NOP1-NEXT: S_ENDPGM 0
     ;
-    ; GCN-NOP20-LABEL: name: standard
+    ; GCN-NOP20-LABEL: name: bundle
     ; GCN-NOP20: S_NOP 7
     ; GCN-NOP20-NEXT: S_NOP 7
     ; GCN-NOP20-NEXT: S_NOP 3
@@ -167,19 +175,24 @@ body:            |
     ; GCN-NOP20-NEXT: S_NOP 7
     ; GCN-NOP20-NEXT: S_NOP 7
     ; GCN-NOP20-NEXT: S_NOP 3
-    ; GCN-NOP20-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
-    ; GCN-NOP20-NEXT: S_NOP 7
-    ; GCN-NOP20-NEXT: S_NOP 7
-    ; GCN-NOP20-NEXT: S_NOP 3
-    ; GCN-NOP20-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 4, implicit-def $scc
+    ; GCN-NOP20-NEXT: BUNDLE implicit-def $sgpr0_sgpr1 {
+    ; GCN-NOP20-NEXT:   $sgpr0_sgpr1 = S_GETPC_B64
+    ; GCN-NOP20-NEXT:   $sgpr0 = S_ADD_U32 $sgpr0, target-flags(amdgpu-rel32-lo) @func + 4, implicit-def $scc
+    ; GCN-NOP20-NEXT:   $sgpr1 = S_ADDC_U32 $sgpr1, target-flags(amdgpu-rel32-hi) @func + 4, implicit-def $scc, implicit $scc
+    ; GCN-NOP20-NEXT: }
     ; GCN-NOP20-NEXT: S_NOP 7
     ; GCN-NOP20-NEXT: S_NOP 7
     ; GCN-NOP20-NEXT: S_NOP 3
-    ; GCN-NOP20-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 2, implicit-def $scc, implicit $scc
+    ; GCN-NOP20-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def dead $vgpr0
     ; GCN-NOP20-NEXT: S_ENDPGM 0
     $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
-    $sgpr0_sgpr1 = S_GETPC_B64
-    $sgpr0 = S_ADD_U32 $sgpr0, 4, implicit-def $scc
-    $sgpr1 = S_ADDC_U32 $sgpr1, 2, implicit-def $scc, implicit $scc
+    BUNDLE implicit-def $sgpr0_sgpr1 {
+      $sgpr0_sgpr1 = S_GETPC_B64
+      $sgpr0 = S_ADD_U32 $sgpr0, target-flags(amdgpu-rel32-lo) @func + 4, implicit-def $scc
+      $sgpr1 = S_ADDC_U32 $sgpr1, target-flags(amdgpu-rel32-hi) @func + 4, implicit-def $scc, implicit $scc
+    }
+    dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def dead $vgpr0
     S_ENDPGM 0
 ...
+
+



More information about the llvm-commits mailing list