[llvm] f10dc76 - [AMDGPU][NPM] Port SIInsertWaitcnts to NPM (#130061)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Mar 24 09:06:49 PDT 2025
Author: Akshat Oke
Date: 2025-03-24T21:36:45+05:30
New Revision: f10dc76f034848879340c3004e573e7014293e01
URL: https://github.com/llvm/llvm-project/commit/f10dc76f034848879340c3004e573e7014293e01
DIFF: https://github.com/llvm/llvm-project/commit/f10dc76f034848879340c3004e573e7014293e01.diff
LOG: [AMDGPU][NPM] Port SIInsertWaitcnts to NPM (#130061)
Added:
Modified:
llvm/lib/Target/AMDGPU/AMDGPU.h
llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
llvm/test/CodeGen/AMDGPU/call-waw-waitcnt.mir
llvm/test/CodeGen/AMDGPU/insert-waitcnts-hang.mir
llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index eebc33aea7a86..ddb0c10604537 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -378,6 +378,13 @@ class AMDGPUMarkLastScratchLoadPass
MachineFunctionAnalysisManager &AM);
};
+class SIInsertWaitcntsPass : public PassInfoMixin<SIInsertWaitcntsPass> {
+public:
+ PreservedAnalyses run(MachineFunction &MF,
+ MachineFunctionAnalysisManager &MFAM);
+ static bool isRequired() { return true; }
+};
+
FunctionPass *createAMDGPUAnnotateUniformValuesLegacy();
ModulePass *createAMDGPUPrintfRuntimeBinding();
@@ -454,7 +461,7 @@ extern char &AMDGPUInsertDelayAluID;
void initializeSIInsertHardClausesPass(PassRegistry &);
extern char &SIInsertHardClausesID;
-void initializeSIInsertWaitcntsPass(PassRegistry&);
+void initializeSIInsertWaitcntsLegacyPass(PassRegistry &);
extern char &SIInsertWaitcntsID;
void initializeSIFormMemoryClausesLegacyPass(PassRegistry &);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index 388c390edad6a..68f2321432402 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -111,6 +111,7 @@ MACHINE_FUNCTION_PASS("si-fix-vgpr-copies", SIFixVGPRCopiesPass())
MACHINE_FUNCTION_PASS("si-fold-operands", SIFoldOperandsPass());
MACHINE_FUNCTION_PASS("si-form-memory-clauses", SIFormMemoryClausesPass())
MACHINE_FUNCTION_PASS("si-i1-copies", SILowerI1CopiesPass())
+MACHINE_FUNCTION_PASS("si-insert-waitcnts", SIInsertWaitcntsPass())
MACHINE_FUNCTION_PASS("si-load-store-opt", SILoadStoreOptimizerPass())
MACHINE_FUNCTION_PASS("si-lower-control-flow", SILowerControlFlowPass())
MACHINE_FUNCTION_PASS("si-lower-sgpr-spills", SILowerSGPRSpillsPass())
@@ -133,7 +134,6 @@ DUMMY_MACHINE_FUNCTION_PASS("amdgpu-rewrite-partial-reg-uses", GCNRewritePartial
DUMMY_MACHINE_FUNCTION_PASS("amdgpu-set-wave-priority", AMDGPUSetWavePriorityPass())
DUMMY_MACHINE_FUNCTION_PASS("si-insert-hard-clauses", SIInsertHardClausesPass())
-DUMMY_MACHINE_FUNCTION_PASS("si-insert-waitcnts", SIInsertWaitcntsPass())
DUMMY_MACHINE_FUNCTION_PASS("si-late-branch-lowering", SILateBranchLoweringPass())
DUMMY_MACHINE_FUNCTION_PASS("si-pre-emit-peephole", SIPreEmitPeepholePass())
// TODO: Move amdgpu-preload-kern-arg-prolog to MACHINE_FUNCTION_PASS since it
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 827216f8fde59..0eb0594d87dac 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -536,7 +536,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeSIAnnotateControlFlowLegacyPass(*PR);
initializeAMDGPUInsertDelayAluLegacyPass(*PR);
initializeSIInsertHardClausesPass(*PR);
- initializeSIInsertWaitcntsPass(*PR);
+ initializeSIInsertWaitcntsLegacyPass(*PR);
initializeSIModeRegisterLegacyPass(*PR);
initializeSIWholeQuadModeLegacyPass(*PR);
initializeSILowerControlFlowLegacyPass(*PR);
@@ -2158,7 +2158,7 @@ void AMDGPUCodeGenPassBuilder::addPreEmitPass(AddMachinePass &addPass) const {
}
addPass(SIMemoryLegalizerPass());
- // TODO: addPass(SIInsertWaitcntsPass());
+ addPass(SIInsertWaitcntsPass());
// TODO: addPass(SIModeRegisterPass());
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 5906b1caeed3d..0edacadc4884f 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -33,6 +33,7 @@
#include "llvm/ADT/Sequence.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachinePassManager.h"
#include "llvm/CodeGen/MachinePostDominators.h"
#include "llvm/Support/DebugCounter.h"
#include "llvm/TargetParser/TargetParser.h"
@@ -597,7 +598,7 @@ class WaitcntGeneratorGFX12Plus : public WaitcntGenerator {
AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
};
-class SIInsertWaitcnts : public MachineFunctionPass {
+class SIInsertWaitcnts {
private:
const GCNSubtarget *ST = nullptr;
const SIInstrInfo *TII = nullptr;
@@ -636,9 +637,9 @@ class SIInsertWaitcnts : public MachineFunctionPass {
InstCounterType MaxCounter = NUM_NORMAL_INST_CNTS;
public:
- static char ID;
-
- SIInsertWaitcnts() : MachineFunctionPass(ID) {
+ SIInsertWaitcnts(MachineLoopInfo *MLI, MachinePostDominatorTree *PDT,
+ AliasAnalysis *AA)
+ : MLI(MLI), PDT(PDT), AA(AA) {
(void)ForceExpCounter;
(void)ForceLgkmCounter;
(void)ForceVMCounter;
@@ -648,20 +649,7 @@ class SIInsertWaitcnts : public MachineFunctionPass {
bool isPreheaderToFlush(MachineBasicBlock &MBB,
WaitcntBrackets &ScoreBrackets);
bool isVMEMOrFlatVMEM(const MachineInstr &MI) const;
- bool runOnMachineFunction(MachineFunction &MF) override;
-
- StringRef getPassName() const override {
- return "SI insert wait instructions";
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- AU.addRequired<MachineLoopInfoWrapperPass>();
- AU.addRequired<MachinePostDominatorTreeWrapperPass>();
- AU.addUsedIfAvailable<AAResultsWrapperPass>();
- AU.addPreserved<AAResultsWrapperPass>();
- MachineFunctionPass::getAnalysisUsage(AU);
- }
+ bool run(MachineFunction &MF);
bool isForceEmitWaitcnt() const {
for (auto T : inst_counter_types())
@@ -749,6 +737,27 @@ class SIInsertWaitcnts : public MachineFunctionPass {
WaitcntBrackets &ScoreBrackets);
};
+class SIInsertWaitcntsLegacy : public MachineFunctionPass {
+public:
+ static char ID;
+ SIInsertWaitcntsLegacy() : MachineFunctionPass(ID) {}
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ StringRef getPassName() const override {
+ return "SI insert wait instructions";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<MachineLoopInfoWrapperPass>();
+ AU.addRequired<MachinePostDominatorTreeWrapperPass>();
+ AU.addUsedIfAvailable<AAResultsWrapperPass>();
+ AU.addPreserved<AAResultsWrapperPass>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+
} // end anonymous namespace
RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
@@ -1133,19 +1142,19 @@ bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
return hasMixedPendingEvents(T);
}
-INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
- false)
+INITIALIZE_PASS_BEGIN(SIInsertWaitcntsLegacy, DEBUG_TYPE, "SI Insert Waitcnts",
+ false, false)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass)
-INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
- false)
+INITIALIZE_PASS_END(SIInsertWaitcntsLegacy, DEBUG_TYPE, "SI Insert Waitcnts",
+ false, false)
-char SIInsertWaitcnts::ID = 0;
+char SIInsertWaitcntsLegacy::ID = 0;
-char &llvm::SIInsertWaitcntsID = SIInsertWaitcnts::ID;
+char &llvm::SIInsertWaitcntsID = SIInsertWaitcntsLegacy::ID;
FunctionPass *llvm::createSIInsertWaitcntsPass() {
- return new SIInsertWaitcnts();
+ return new SIInsertWaitcntsLegacy();
}
static bool updateOperandIfDifferent(MachineInstr &MI, AMDGPU::OpName OpName,
@@ -2481,16 +2490,40 @@ bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML,
return HasVMemLoad && UsesVgprLoadedOutside && ST->hasVmemWriteVgprInOrder();
}
-bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
+bool SIInsertWaitcntsLegacy::runOnMachineFunction(MachineFunction &MF) {
+ auto *MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
+ auto *PDT =
+ &getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
+ AliasAnalysis *AA = nullptr;
+ if (auto *AAR = getAnalysisIfAvailable<AAResultsWrapperPass>())
+ AA = &AAR->getAAResults();
+
+ return SIInsertWaitcnts(MLI, PDT, AA).run(MF);
+}
+
+PreservedAnalyses
+SIInsertWaitcntsPass::run(MachineFunction &MF,
+ MachineFunctionAnalysisManager &MFAM) {
+ auto *MLI = &MFAM.getResult<MachineLoopAnalysis>(MF);
+ auto *PDT = &MFAM.getResult<MachinePostDominatorTreeAnalysis>(MF);
+ auto *AA = MFAM.getResult<FunctionAnalysisManagerMachineFunctionProxy>(MF)
+ .getManager()
+ .getCachedResult<AAManager>(MF.getFunction());
+
+ if (!SIInsertWaitcnts(MLI, PDT, AA).run(MF))
+ return PreservedAnalyses::all();
+
+ return getMachineFunctionPassPreservedAnalyses()
+ .preserveSet<CFGAnalyses>()
+ .preserve<AAManager>();
+}
+
+bool SIInsertWaitcnts::run(MachineFunction &MF) {
ST = &MF.getSubtarget<GCNSubtarget>();
TII = ST->getInstrInfo();
TRI = &TII->getRegisterInfo();
MRI = &MF.getRegInfo();
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
- MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
- PDT = &getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
- if (auto *AAR = getAnalysisIfAvailable<AAResultsWrapperPass>())
- AA = &AAR->getAAResults();
AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST->getCPU());
diff --git a/llvm/test/CodeGen/AMDGPU/call-waw-waitcnt.mir b/llvm/test/CodeGen/AMDGPU/call-waw-waitcnt.mir
index b6dc75db3edc1..f776c22866296 100644
--- a/llvm/test/CodeGen/AMDGPU/call-waw-waitcnt.mir
+++ b/llvm/test/CodeGen/AMDGPU/call-waw-waitcnt.mir
@@ -1,5 +1,6 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass=si-insert-waitcnts %s -o - | FileCheck -check-prefix=GCN %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -passes=si-insert-waitcnts %s -o - | FileCheck -check-prefix=GCN %s
# $sgpr30_sgpr31 will hold the return address. We need a waitcnt before SI_CALL so
# that the return address is not clobbered in the callee by the outstanding load.
diff --git a/llvm/test/CodeGen/AMDGPU/insert-waitcnts-hang.mir b/llvm/test/CodeGen/AMDGPU/insert-waitcnts-hang.mir
index 28d79efc00b0d..2834ca5fa6858 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-waitcnts-hang.mir
+++ b/llvm/test/CodeGen/AMDGPU/insert-waitcnts-hang.mir
@@ -1,5 +1,6 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass si-insert-waitcnts %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -passes si-insert-waitcnts %s -o - | FileCheck %s
---
name: test
diff --git a/llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir b/llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir
index 17e3d93ed393b..f5321591a3c88 100644
--- a/llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir
+++ b/llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir
@@ -2,6 +2,8 @@
# RUN: llc -run-pass=si-insert-waitcnts -mtriple=amdgcn -mcpu=gfx900 -o - %s | FileCheck %s -check-prefixes=CHECK,GFX9
# RUN: llc -run-pass=si-insert-waitcnts -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -o - %s | FileCheck %s
# RUN: llc -run-pass=si-insert-waitcnts -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -o - %s | FileCheck %s
+
+# RUN: llc -passes=si-insert-waitcnts -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -o - %s | FileCheck %s
---
# CHECK-LABEL: name: vccz_corrupt_workaround
# CHECK: $vcc = V_CMP_EQ_F32
More information about the llvm-commits
mailing list