[llvm] [AMDGPU] Support bottom-up postRA scheduling. (PR #135295)
Harrison Hao via llvm-commits
llvm-commits at lists.llvm.org
Sun Apr 20 20:05:26 PDT 2025
https://github.com/harrisonGPU updated https://github.com/llvm/llvm-project/pull/135295
>From 64c887fb8a873d648446a1c840ee87c941669167 Mon Sep 17 00:00:00 2001
From: Harrison Hao <tsworld1314 at gmail.com>
Date: Wed, 9 Apr 2025 17:52:33 +0800
Subject: [PATCH 1/3] [AMDGPU] Support bottom-up postRA scheduling.
---
.../lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 46 ++++++++++++++++-
llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h | 1 +
.../CodeGen/AMDGPU/sched-barrier-post-RA.mir | 50 ++++++++++++++++++-
3 files changed, 95 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index aaefe27b1324f..43d04cbeb9451 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -284,6 +284,25 @@ void GCNHazardRecognizer::processBundle() {
CurrCycleInstr = nullptr;
}
+void GCNHazardRecognizer::reverseProcessBundle() {
+ MachineBasicBlock::instr_iterator MI =
+ std::next(CurrCycleInstr->getIterator());
+ MachineBasicBlock::instr_iterator E =
+ CurrCycleInstr->getParent()->instr_end();
+
+ for (; MI != E && MI->isInsideBundle(); ++MI) {
+ CurrCycleInstr = &*MI;
+ for (unsigned I = 0, E = MaxLookAhead - 1; I < E; ++I) {
+ if (!EmittedInstrs.empty())
+ EmittedInstrs.pop_back();
+ }
+
+ EmittedInstrs.push_back(CurrCycleInstr);
+ EmittedInstrs.resize(MaxLookAhead);
+ }
+ CurrCycleInstr = nullptr;
+}
+
void GCNHazardRecognizer::runOnInstruction(MachineInstr *MI) {
assert(IsHazardRecognizerMode);
@@ -423,7 +442,32 @@ void GCNHazardRecognizer::AdvanceCycle() {
}
void GCNHazardRecognizer::RecedeCycle() {
- llvm_unreachable("hazard recognizer does not support bottom-up scheduling.");
+ if (!CurrCycleInstr) {
+ if (!EmittedInstrs.empty())
+ EmittedInstrs.pop_back();
+ return;
+ }
+
+ if (CurrCycleInstr->isBundle()) {
+ reverseProcessBundle();
+ return;
+ }
+
+ unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);
+ if (!NumWaitStates) {
+ CurrCycleInstr = nullptr;
+ return;
+ }
+
+ EmittedInstrs.push_back(CurrCycleInstr);
+ for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead()); i < e;
+ ++i) {
+ if (!EmittedInstrs.empty())
+ EmittedInstrs.pop_back();
+ }
+
+ EmittedInstrs.resize(getMaxLookAhead());
+ CurrCycleInstr = nullptr;
}
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
index bbc55851bf967..eed2561bad231 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
@@ -69,6 +69,7 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
// Advance over a MachineInstr bundle. Look for hazards in the bundled
// instructions.
void processBundle();
+ void reverseProcessBundle();
// Run on an individual instruction in hazard recognizer mode. This can be
// used on a newly inserted instruction before returning from PreEmitNoops.
diff --git a/llvm/test/CodeGen/AMDGPU/sched-barrier-post-RA.mir b/llvm/test/CodeGen/AMDGPU/sched-barrier-post-RA.mir
index 7bdb8f5b35ec5..02ebffca84bda 100644
--- a/llvm/test/CodeGen/AMDGPU/sched-barrier-post-RA.mir
+++ b/llvm/test/CodeGen/AMDGPU/sched-barrier-post-RA.mir
@@ -1,5 +1,6 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -misched-cluster=false -run-pass=postmisched -verify-misched -o - %s | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -misched-cluster=false -run-pass=postmisched -verify-misched -o - %s | FileCheck -check-prefix=CHECK %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -misched-cluster=false -run-pass=postmisched -misched-postra-direction=bottomup -verify-misched -o - %s | FileCheck -check-prefix=CHECK-BOTTOMUP %s
--- |
define amdgpu_kernel void @no_sched_barrier(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) { ret void }
@@ -29,6 +30,21 @@ body: |
; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
; CHECK-NEXT: }
; CHECK-NEXT: S_ENDPGM 0
+ ;
+ ; CHECK-BOTTOMUP-LABEL: name: no_sched_barrier
+ ; CHECK-BOTTOMUP: renamable $vgpr0 = IMPLICIT_DEF
+ ; CHECK-BOTTOMUP-NEXT: renamable $sgpr0_sgpr1 = IMPLICIT_DEF
+ ; CHECK-BOTTOMUP-NEXT: BUNDLE implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit $sgpr0_sgpr1, implicit $vgpr0, implicit $exec {
+ ; CHECK-BOTTOMUP-NEXT: renamable $vgpr1 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, renamable $vgpr0, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+ ; CHECK-BOTTOMUP-NEXT: renamable $vgpr2 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, renamable $vgpr0, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+ ; CHECK-BOTTOMUP-NEXT: }
+ ; CHECK-BOTTOMUP-NEXT: renamable $vgpr1 = nsw V_MUL_LO_U32_e64 killed $vgpr1, $vgpr1, implicit $exec
+ ; CHECK-BOTTOMUP-NEXT: renamable $vgpr2 = nsw V_MUL_LO_U32_e64 killed $vgpr2, $vgpr2, implicit $exec
+ ; CHECK-BOTTOMUP-NEXT: BUNDLE implicit killed $vgpr0, implicit killed $vgpr1, implicit killed $sgpr0_sgpr1, implicit $exec, implicit killed $vgpr2 {
+ ; CHECK-BOTTOMUP-NEXT: GLOBAL_STORE_DWORD_SADDR renamable $vgpr0, killed renamable $vgpr1, renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+ ; CHECK-BOTTOMUP-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+ ; CHECK-BOTTOMUP-NEXT: }
+ ; CHECK-BOTTOMUP-NEXT: S_ENDPGM 0
renamable $sgpr0_sgpr1 = IMPLICIT_DEF
renamable $vgpr0 = IMPLICIT_DEF
BUNDLE implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit $sgpr0_sgpr1, implicit $vgpr0, implicit $exec {
@@ -66,6 +82,22 @@ body: |
; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
; CHECK-NEXT: }
; CHECK-NEXT: S_ENDPGM 0
+ ;
+ ; CHECK-BOTTOMUP-LABEL: name: sched_barrier_mask_0
+ ; CHECK-BOTTOMUP: renamable $vgpr0 = IMPLICIT_DEF
+ ; CHECK-BOTTOMUP-NEXT: renamable $sgpr0_sgpr1 = IMPLICIT_DEF
+ ; CHECK-BOTTOMUP-NEXT: BUNDLE implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit $sgpr0_sgpr1, implicit $vgpr0, implicit $exec {
+ ; CHECK-BOTTOMUP-NEXT: renamable $vgpr1 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, renamable $vgpr0, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+ ; CHECK-BOTTOMUP-NEXT: renamable $vgpr2 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, renamable $vgpr0, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+ ; CHECK-BOTTOMUP-NEXT: }
+ ; CHECK-BOTTOMUP-NEXT: renamable $vgpr1 = nsw V_MUL_LO_U32_e64 killed $vgpr1, $vgpr1, implicit $exec
+ ; CHECK-BOTTOMUP-NEXT: SCHED_BARRIER 0
+ ; CHECK-BOTTOMUP-NEXT: renamable $vgpr2 = nsw V_MUL_LO_U32_e64 killed $vgpr2, $vgpr2, implicit $exec
+ ; CHECK-BOTTOMUP-NEXT: BUNDLE implicit killed $vgpr0, implicit killed $vgpr1, implicit killed $sgpr0_sgpr1, implicit $exec, implicit killed $vgpr2 {
+ ; CHECK-BOTTOMUP-NEXT: GLOBAL_STORE_DWORD_SADDR renamable $vgpr0, killed renamable $vgpr1, renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+ ; CHECK-BOTTOMUP-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+ ; CHECK-BOTTOMUP-NEXT: }
+ ; CHECK-BOTTOMUP-NEXT: S_ENDPGM 0
renamable $sgpr0_sgpr1 = IMPLICIT_DEF
renamable $vgpr0 = IMPLICIT_DEF
BUNDLE implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit $sgpr0_sgpr1, implicit $vgpr0, implicit $exec {
@@ -105,6 +137,22 @@ body: |
; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
; CHECK-NEXT: }
; CHECK-NEXT: S_ENDPGM 0
+ ;
+ ; CHECK-BOTTOMUP-LABEL: name: sched_barrier_mask_1
+ ; CHECK-BOTTOMUP: renamable $vgpr0 = IMPLICIT_DEF
+ ; CHECK-BOTTOMUP-NEXT: renamable $sgpr0_sgpr1 = IMPLICIT_DEF
+ ; CHECK-BOTTOMUP-NEXT: BUNDLE implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit $sgpr0_sgpr1, implicit $vgpr0, implicit $exec {
+ ; CHECK-BOTTOMUP-NEXT: renamable $vgpr1 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, renamable $vgpr0, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+ ; CHECK-BOTTOMUP-NEXT: renamable $vgpr2 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, renamable $vgpr0, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+ ; CHECK-BOTTOMUP-NEXT: }
+ ; CHECK-BOTTOMUP-NEXT: renamable $vgpr1 = nsw V_MUL_LO_U32_e64 killed $vgpr1, $vgpr1, implicit $exec
+ ; CHECK-BOTTOMUP-NEXT: renamable $vgpr2 = nsw V_MUL_LO_U32_e64 killed $vgpr2, $vgpr2, implicit $exec
+ ; CHECK-BOTTOMUP-NEXT: SCHED_BARRIER 1
+ ; CHECK-BOTTOMUP-NEXT: BUNDLE implicit killed $vgpr0, implicit killed $vgpr1, implicit killed $sgpr0_sgpr1, implicit $exec, implicit killed $vgpr2 {
+ ; CHECK-BOTTOMUP-NEXT: GLOBAL_STORE_DWORD_SADDR renamable $vgpr0, killed renamable $vgpr1, renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+ ; CHECK-BOTTOMUP-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+ ; CHECK-BOTTOMUP-NEXT: }
+ ; CHECK-BOTTOMUP-NEXT: S_ENDPGM 0
renamable $sgpr0_sgpr1 = IMPLICIT_DEF
renamable $vgpr0 = IMPLICIT_DEF
BUNDLE implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit $sgpr0_sgpr1, implicit $vgpr0, implicit $exec {
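For readers following the RecedeCycle change above: the hazard recognizer models
wait states with a fixed-size window of recently emitted instructions. The
standalone sketch below is not LLVM code; "Inst", the wait-state counts, and
MaxLookAhead = 5 are illustrative assumptions. It only mimics the per-cycle
bookkeeping the new bottom-up path performs.

#include <algorithm>
#include <cstdio>
#include <list>

using Inst = const char *; // stands in for MachineInstr*

struct WaitStateWindow {
  unsigned MaxLookAhead = 5; // largest lookahead we ever need (assumed)
  std::list<Inst> Emitted;   // newest entry at the back in this sketch

  // Mirrors one RecedeCycle step from the patch above.
  void recede(Inst I, unsigned NumWaitStates) {
    if (!I) { // no instruction issued this cycle: pop one placeholder
      if (!Emitted.empty())
        Emitted.pop_back();
      return;
    }
    if (NumWaitStates == 0) // zero-wait-state instructions are not tracked
      return;
    Emitted.push_back(I);
    // Each additional wait state consumes one slot of the window.
    for (unsigned K = 1, E = std::min(NumWaitStates, MaxLookAhead); K < E; ++K)
      if (!Emitted.empty())
        Emitted.pop_back();
    Emitted.resize(MaxLookAhead); // null-pad or trim to the window size
  }
};

int main() {
  WaitStateWindow W;
  W.recede("v_mul_lo_u32", 1);
  W.recede("global_load_dword", 2);
  W.recede(nullptr, 0); // an idle cycle pops one slot
  std::printf("tracked slots: %zu\n", W.Emitted.size());
  return 0;
}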
>From 84e68452fcfd59a3efab0fb8bbc8b1094d7256c9 Mon Sep 17 00:00:00 2001
From: Harrison Hao <tsworld1314 at gmail.com>
Date: Thu, 17 Apr 2025 11:10:54 +0800
Subject: [PATCH 2/3] [AMDGPU] Update comments.
---
.../lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 33 ++++++++++++++-----
llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h | 5 ++-
2 files changed, 29 insertions(+), 9 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 43d04cbeb9451..205cb912126e7 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -284,22 +284,30 @@ void GCNHazardRecognizer::processBundle() {
CurrCycleInstr = nullptr;
}
-void GCNHazardRecognizer::reverseProcessBundle() {
+void GCNHazardRecognizer::processBundleBottomUp() {
+ // Step through each instruction in the bundle in bottom-up order.
MachineBasicBlock::instr_iterator MI =
std::next(CurrCycleInstr->getIterator());
MachineBasicBlock::instr_iterator E =
CurrCycleInstr->getParent()->instr_end();
+ // Evict stale entries to maintain a fixed lookahead window.
+ // TODO: Hazard detection is not yet implemented. This scheduling
+ // is intended for GFX11 and newer.
for (; MI != E && MI->isInsideBundle(); ++MI) {
CurrCycleInstr = &*MI;
- for (unsigned I = 0, E = MaxLookAhead - 1; I < E; ++I) {
- if (!EmittedInstrs.empty())
- EmittedInstrs.pop_back();
- }
+
+ // Remove up to (MaxLookAhead - 1) oldest entries.
+ for (unsigned I = 0, E = MaxLookAhead - 1; I < E && !EmittedInstrs.empty();
+ ++I)
+ EmittedInstrs.pop_back();
EmittedInstrs.push_back(CurrCycleInstr);
+
+ // Keep only the most recent MaxLookAhead entries.
EmittedInstrs.resize(MaxLookAhead);
}
+
CurrCycleInstr = nullptr;
}
@@ -442,14 +450,16 @@ void GCNHazardRecognizer::AdvanceCycle() {
}
void GCNHazardRecognizer::RecedeCycle() {
+ // If no instruction was issued this cycle, pop the oldest placeholder.
if (!CurrCycleInstr) {
if (!EmittedInstrs.empty())
EmittedInstrs.pop_back();
return;
}
+ // If this is a bundle header, handle the entire bundle here.
if (CurrCycleInstr->isBundle()) {
- reverseProcessBundle();
+ processBundleBottomUp();
return;
}
@@ -459,14 +469,21 @@ void GCNHazardRecognizer::RecedeCycle() {
return;
}
+ // Add current instruction to the emitted list.
EmittedInstrs.push_back(CurrCycleInstr);
- for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead()); i < e;
- ++i) {
+
+ // Model remaining wait states by removing older placeholders.
+ for (unsigned I = 1, E = std::min(NumWaitStates, getMaxLookAhead()); I < E;
+ ++I) {
if (!EmittedInstrs.empty())
EmittedInstrs.pop_back();
}
+ // getMaxLookAhead() is the largest number of wait states we will ever need
+ // to insert, so there is no point in keeping track of more than that many
+ // wait states.
EmittedInstrs.resize(getMaxLookAhead());
+
CurrCycleInstr = nullptr;
}
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
index eed2561bad231..88c7426be552d 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
@@ -69,7 +69,10 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
// Advance over a MachineInstr bundle. Look for hazards in the bundled
// instructions.
void processBundle();
- void reverseProcessBundle();
+ // Recede over a MachineInstr bundle. Adds bundled instructions to the
+ // EmittedInstrs queue in bottom-up scheduling mode.
+ // TODO: Hazard detection is not yet implemented.
+ void processBundleBottomUp();
// Run on an individual instruction in hazard recognizer mode. This can be
// used on a newly inserted instruction before returning from PreEmitNoops.
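As a side note on processBundleBottomUp: the loop starts at the instruction
after the BUNDLE header and keeps going while isInsideBundle() holds, feeding
each bundled instruction into the same fixed window. Below is a simplified,
self-contained sketch of that walk; it is plain C++, not LLVM code, and the
Inst struct, opcode names, and window size are made up for illustration.

#include <cstdio>
#include <list>
#include <vector>

struct Inst {
  const char *Name;
  bool InsideBundle; // stands in for MachineInstr::isInsideBundle()
};

int main() {
  // A BUNDLE header followed by its two bundled loads, then a normal op.
  std::vector<Inst> Block = {{"BUNDLE", false},
                             {"GLOBAL_LOAD_DWORD_SADDR", true},
                             {"GLOBAL_LOAD_DWORD_SADDR", true},
                             {"V_MUL_LO_U32_e64", false}};
  const unsigned MaxLookAhead = 5;
  std::list<const Inst *> Emitted;

  // Walk forward from the instruction after the header while the
  // inside-bundle flag holds, mirroring the instr_iterator loop.
  for (size_t I = 1; I < Block.size() && Block[I].InsideBundle; ++I) {
    // Evict up to MaxLookAhead - 1 entries so the newest one fits.
    for (unsigned K = 0; K + 1 < MaxLookAhead && !Emitted.empty(); ++K)
      Emitted.pop_back();
    Emitted.push_back(&Block[I]);
    Emitted.resize(MaxLookAhead); // null-pad to the window size
  }
  std::printf("slots after bundle: %zu\n", Emitted.size());
  return 0;
}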
>From 4323b2bec8803500ab007a1c0199758070047783 Mon Sep 17 00:00:00 2001
From: Harrison Hao <tsworld1314 at gmail.com>
Date: Mon, 21 Apr 2025 11:05:09 +0800
Subject: [PATCH 3/3] [AMDGPU] Assert hazard recognizer mode is off in bottom-up scheduling.
---
llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 205cb912126e7..ea2dc494dabd9 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -285,6 +285,12 @@ void GCNHazardRecognizer::processBundle() {
}
void GCNHazardRecognizer::processBundleBottomUp() {
+ // Walk through the instructions in this bundle in bottom-up order.
+ // We only use this during post-RA scheduling, so hazard recognizer mode
+ // should never be active here (it always runs top-down).
+ assert(!IsHazardRecognizerMode &&
+ "Bottom-up scheduling shouldn't run in hazard recognizer mode");
+
// Step through each instruction in the bundle in bottom-up order.
MachineBasicBlock::instr_iterator MI =
std::next(CurrCycleInstr->getIterator());
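To exercise the new path end to end, the second RUN line from the test above
can be invoked directly on the checked-in MIR file:

  llc -mtriple=amdgcn -mcpu=gfx908 -misched-cluster=false -run-pass=postmisched -misched-postra-direction=bottomup -verify-misched -o - llvm/test/CodeGen/AMDGPU/sched-barrier-post-RA.mir

With -misched-postra-direction=bottomup, the post-RA machine scheduler drives
the hazard recognizer via RecedeCycle rather than AdvanceCycle, which is why
the old llvm_unreachable had to be replaced with a real implementation.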