[llvm-branch-commits] [llvm] [AMDGPU][NPM] Port SIPreEmitPeephole to NPM (PR #130065)
Akshat Oke via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Sun Mar 9 21:42:30 PDT 2025
https://github.com/optimisan updated https://github.com/llvm/llvm-project/pull/130065
>From 78bcc3a3576cc1f0dba5c9feb5ed781a62877ffe Mon Sep 17 00:00:00 2001
From: Akshat Oke <Akshat.Oke at amd.com>
Date: Mon, 10 Mar 2025 04:31:20 +0000
Subject: [PATCH 1/9] [AMDGPU][NFC] Format GCNCreateVOPD.cpp
---
llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp | 18 +++++++++---------
1 file changed, 9 insertions(+), 9 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp b/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp
index d40a1a2a10d9b..798279b279da3 100644
--- a/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp
@@ -38,15 +38,15 @@ namespace {
class GCNCreateVOPD : public MachineFunctionPass {
private:
- class VOPDCombineInfo {
- public:
- VOPDCombineInfo() = default;
- VOPDCombineInfo(MachineInstr *First, MachineInstr *Second)
- : FirstMI(First), SecondMI(Second) {}
-
- MachineInstr *FirstMI;
- MachineInstr *SecondMI;
- };
+ class VOPDCombineInfo {
+ public:
+ VOPDCombineInfo() = default;
+ VOPDCombineInfo(MachineInstr *First, MachineInstr *Second)
+ : FirstMI(First), SecondMI(Second) {}
+
+ MachineInstr *FirstMI;
+ MachineInstr *SecondMI;
+ };
public:
static char ID;
>From ab31097bd24434b6dca9eedae15acda3a50d5fbb Mon Sep 17 00:00:00 2001
From: Akshat Oke <Akshat.Oke at amd.com>
Date: Wed, 5 Mar 2025 10:52:00 +0000
Subject: [PATCH 2/9] [AMDGPU][NPM] Port GCNCreateVOPD to NPM
---
llvm/lib/Target/AMDGPU/AMDGPU.h | 7 ++-
llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def | 1 +
.../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 4 +-
llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp | 53 ++++++++++++-------
4 files changed, 43 insertions(+), 22 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 57297288eecb4..f208a8bb9964b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -358,6 +358,11 @@ class SIModeRegisterPass : public PassInfoMixin<SIModeRegisterPass> {
PreservedAnalyses run(MachineFunction &F, MachineFunctionAnalysisManager &AM);
};
+class GCNCreateVOPDPass : public PassInfoMixin<GCNCreateVOPDPass> {
+public:
+ PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &AM);
+};
+
FunctionPass *createAMDGPUAnnotateUniformValuesLegacy();
ModulePass *createAMDGPUPrintfRuntimeBinding();
@@ -443,7 +448,7 @@ extern char &SIFormMemoryClausesID;
void initializeSIPostRABundlerLegacyPass(PassRegistry &);
extern char &SIPostRABundlerLegacyID;
-void initializeGCNCreateVOPDPass(PassRegistry &);
+void initializeGCNCreateVOPDLegacyPass(PassRegistry &);
extern char &GCNCreateVOPDID;
void initializeAMDGPUUnifyDivergentExitNodesPass(PassRegistry&);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index 1050855176c04..0e3dcb4267ede 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -103,6 +103,7 @@ MACHINE_FUNCTION_PASS("amdgpu-rewrite-partial-reg-uses", GCNRewritePartialRegUse
MACHINE_FUNCTION_PASS("amdgpu-pre-ra-optimizations", GCNPreRAOptimizationsPass())
MACHINE_FUNCTION_PASS("amdgpu-nsa-reassign", GCNNSAReassignPass())
MACHINE_FUNCTION_PASS("gcn-dpp-combine", GCNDPPCombinePass())
+MACHINE_FUNCTION_PASS("gcn-create-vopd", GCNCreateVOPDPass())
MACHINE_FUNCTION_PASS("si-fix-sgpr-copies", SIFixSGPRCopiesPass())
MACHINE_FUNCTION_PASS("si-fix-vgpr-copies", SIFixVGPRCopiesPass())
MACHINE_FUNCTION_PASS("si-fold-operands", SIFoldOperandsPass());
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index ce3dcd920bce3..73ae9135eb319 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -546,7 +546,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeSIPreAllocateWWMRegsLegacyPass(*PR);
initializeSIFormMemoryClausesLegacyPass(*PR);
initializeSIPostRABundlerLegacyPass(*PR);
- initializeGCNCreateVOPDPass(*PR);
+ initializeGCNCreateVOPDLegacyPass(*PR);
initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
initializeAMDGPUAAWrapperPassPass(*PR);
initializeAMDGPUExternalAAWrapperPass(*PR);
@@ -2149,7 +2149,7 @@ void AMDGPUCodeGenPassBuilder::addPostRegAlloc(AddMachinePass &addPass) const {
void AMDGPUCodeGenPassBuilder::addPreEmitPass(AddMachinePass &addPass) const {
if (isPassEnabled(EnableVOPD, CodeGenOptLevel::Less)) {
- // TODO: addPass(GCNCreateVOPDPass());
+ addPass(GCNCreateVOPDPass());
}
// TODO: addPass(SIMemoryLegalizerPass());
// TODO: addPass(SIInsertWaitcntsPass());
diff --git a/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp b/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp
index 798279b279da3..32a26469d616b 100644
--- a/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp
@@ -27,6 +27,7 @@
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachinePassManager.h"
#include "llvm/Support/Debug.h"
#define DEBUG_TYPE "gcn-create-vopd"
@@ -36,7 +37,7 @@ using namespace llvm;
namespace {
-class GCNCreateVOPD : public MachineFunctionPass {
+class GCNCreateVOPD {
private:
class VOPDCombineInfo {
public:
@@ -49,20 +50,8 @@ class GCNCreateVOPD : public MachineFunctionPass {
};
public:
- static char ID;
const GCNSubtarget *ST = nullptr;
- GCNCreateVOPD() : MachineFunctionPass(ID) {}
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- MachineFunctionPass::getAnalysisUsage(AU);
- }
-
- StringRef getPassName() const override {
- return "GCN Create VOPD Instructions";
- }
-
bool doReplace(const SIInstrInfo *SII, VOPDCombineInfo &CI) {
auto *FirstMI = CI.FirstMI;
auto *SecondMI = CI.SecondMI;
@@ -112,9 +101,7 @@ class GCNCreateVOPD : public MachineFunctionPass {
return true;
}
- bool runOnMachineFunction(MachineFunction &MF) override {
- if (skipFunction(MF.getFunction()))
- return false;
+ bool run(MachineFunction &MF) {
ST = &MF.getSubtarget<GCNSubtarget>();
if (!AMDGPU::hasVOPD(*ST) || !ST->isWave32())
return false;
@@ -163,11 +150,39 @@ class GCNCreateVOPD : public MachineFunctionPass {
}
};
+class GCNCreateVOPDLegacy : public MachineFunctionPass {
+public:
+ static char ID;
+ GCNCreateVOPDLegacy() : MachineFunctionPass(ID) {}
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ StringRef getPassName() const override {
+ return "GCN Create VOPD Instructions";
+ }
+ bool runOnMachineFunction(MachineFunction &MF) override {
+ if (skipFunction(MF.getFunction()))
+ return false;
+
+ return GCNCreateVOPD().run(MF);
+ }
+};
+
} // namespace
-char GCNCreateVOPD::ID = 0;
+PreservedAnalyses llvm::GCNCreateVOPDPass::run(MachineFunction &MF,
+ MachineFunctionAnalysisManager &AM) {
+ if (!GCNCreateVOPD().run(MF))
+ return PreservedAnalyses::all();
+ return getMachineFunctionPassPreservedAnalyses().preserveSet<CFGAnalyses>();
+}
+
+char GCNCreateVOPDLegacy::ID = 0;
-char &llvm::GCNCreateVOPDID = GCNCreateVOPD::ID;
+char &llvm::GCNCreateVOPDID = GCNCreateVOPDLegacy::ID;
-INITIALIZE_PASS(GCNCreateVOPD, DEBUG_TYPE, "GCN Create VOPD Instructions",
+INITIALIZE_PASS(GCNCreateVOPDLegacy, DEBUG_TYPE, "GCN Create VOPD Instructions",
false, false)
>From 9d01cd56b51b13083c61b92cf10a97cfa4ac077b Mon Sep 17 00:00:00 2001
From: Akshat Oke <Akshat.Oke at amd.com>
Date: Mon, 10 Mar 2025 04:27:24 +0000
Subject: [PATCH 3/9] clang format
---
llvm/lib/Target/AMDGPU/AMDGPU.h | 3 +-
llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp | 186 ++++++++++++-----------
2 files changed, 97 insertions(+), 92 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index f208a8bb9964b..f331f741e3993 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -360,7 +360,8 @@ class SIModeRegisterPass : public PassInfoMixin<SIModeRegisterPass> {
class GCNCreateVOPDPass : public PassInfoMixin<GCNCreateVOPDPass> {
public:
- PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &AM);
+ PreservedAnalyses run(MachineFunction &MF,
+ MachineFunctionAnalysisManager &AM);
};
FunctionPass *createAMDGPUAnnotateUniformValuesLegacy();
diff --git a/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp b/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp
index 32a26469d616b..22123f738c948 100644
--- a/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp
@@ -49,105 +49,108 @@ class GCNCreateVOPD {
MachineInstr *SecondMI;
};
-public:
- const GCNSubtarget *ST = nullptr;
-
- bool doReplace(const SIInstrInfo *SII, VOPDCombineInfo &CI) {
- auto *FirstMI = CI.FirstMI;
- auto *SecondMI = CI.SecondMI;
- unsigned Opc1 = FirstMI->getOpcode();
- unsigned Opc2 = SecondMI->getOpcode();
- unsigned EncodingFamily =
- AMDGPU::getVOPDEncodingFamily(SII->getSubtarget());
- int NewOpcode =
- AMDGPU::getVOPDFull(AMDGPU::getVOPDOpcode(Opc1),
- AMDGPU::getVOPDOpcode(Opc2), EncodingFamily);
- assert(NewOpcode != -1 &&
- "Should have previously determined this as a possible VOPD\n");
-
- auto VOPDInst = BuildMI(*FirstMI->getParent(), FirstMI,
- FirstMI->getDebugLoc(), SII->get(NewOpcode))
- .setMIFlags(FirstMI->getFlags() | SecondMI->getFlags());
-
- namespace VOPD = AMDGPU::VOPD;
- MachineInstr *MI[] = {FirstMI, SecondMI};
- auto InstInfo =
- AMDGPU::getVOPDInstInfo(FirstMI->getDesc(), SecondMI->getDesc());
-
- for (auto CompIdx : VOPD::COMPONENTS) {
- auto MCOprIdx = InstInfo[CompIdx].getIndexOfDstInMCOperands();
- VOPDInst.add(MI[CompIdx]->getOperand(MCOprIdx));
- }
-
- for (auto CompIdx : VOPD::COMPONENTS) {
- auto CompSrcOprNum = InstInfo[CompIdx].getCompSrcOperandsNum();
- for (unsigned CompSrcIdx = 0; CompSrcIdx < CompSrcOprNum; ++CompSrcIdx) {
- auto MCOprIdx = InstInfo[CompIdx].getIndexOfSrcInMCOperands(CompSrcIdx);
+ public:
+ const GCNSubtarget *ST = nullptr;
+
+ bool doReplace(const SIInstrInfo *SII, VOPDCombineInfo &CI) {
+ auto *FirstMI = CI.FirstMI;
+ auto *SecondMI = CI.SecondMI;
+ unsigned Opc1 = FirstMI->getOpcode();
+ unsigned Opc2 = SecondMI->getOpcode();
+ unsigned EncodingFamily =
+ AMDGPU::getVOPDEncodingFamily(SII->getSubtarget());
+ int NewOpcode =
+ AMDGPU::getVOPDFull(AMDGPU::getVOPDOpcode(Opc1),
+ AMDGPU::getVOPDOpcode(Opc2), EncodingFamily);
+ assert(NewOpcode != -1 &&
+ "Should have previously determined this as a possible VOPD\n");
+
+ auto VOPDInst =
+ BuildMI(*FirstMI->getParent(), FirstMI, FirstMI->getDebugLoc(),
+ SII->get(NewOpcode))
+ .setMIFlags(FirstMI->getFlags() | SecondMI->getFlags());
+
+ namespace VOPD = AMDGPU::VOPD;
+ MachineInstr *MI[] = {FirstMI, SecondMI};
+ auto InstInfo =
+ AMDGPU::getVOPDInstInfo(FirstMI->getDesc(), SecondMI->getDesc());
+
+ for (auto CompIdx : VOPD::COMPONENTS) {
+ auto MCOprIdx = InstInfo[CompIdx].getIndexOfDstInMCOperands();
VOPDInst.add(MI[CompIdx]->getOperand(MCOprIdx));
}
- }
- SII->fixImplicitOperands(*VOPDInst);
- for (auto CompIdx : VOPD::COMPONENTS)
- VOPDInst.copyImplicitOps(*MI[CompIdx]);
+ for (auto CompIdx : VOPD::COMPONENTS) {
+ auto CompSrcOprNum = InstInfo[CompIdx].getCompSrcOperandsNum();
+ for (unsigned CompSrcIdx = 0; CompSrcIdx < CompSrcOprNum;
+ ++CompSrcIdx) {
+ auto MCOprIdx =
+ InstInfo[CompIdx].getIndexOfSrcInMCOperands(CompSrcIdx);
+ VOPDInst.add(MI[CompIdx]->getOperand(MCOprIdx));
+ }
+ }
- LLVM_DEBUG(dbgs() << "VOPD Fused: " << *VOPDInst << " from\tX: "
- << *CI.FirstMI << "\tY: " << *CI.SecondMI << "\n");
+ SII->fixImplicitOperands(*VOPDInst);
+ for (auto CompIdx : VOPD::COMPONENTS)
+ VOPDInst.copyImplicitOps(*MI[CompIdx]);
- for (auto CompIdx : VOPD::COMPONENTS)
- MI[CompIdx]->eraseFromParent();
+ LLVM_DEBUG(dbgs() << "VOPD Fused: " << *VOPDInst << " from\tX: "
+ << *CI.FirstMI << "\tY: " << *CI.SecondMI << "\n");
- ++NumVOPDCreated;
- return true;
- }
+ for (auto CompIdx : VOPD::COMPONENTS)
+ MI[CompIdx]->eraseFromParent();
- bool run(MachineFunction &MF) {
- ST = &MF.getSubtarget<GCNSubtarget>();
- if (!AMDGPU::hasVOPD(*ST) || !ST->isWave32())
- return false;
- LLVM_DEBUG(dbgs() << "CreateVOPD Pass:\n");
-
- const SIInstrInfo *SII = ST->getInstrInfo();
- bool Changed = false;
-
- SmallVector<VOPDCombineInfo> ReplaceCandidates;
-
- for (auto &MBB : MF) {
- auto MII = MBB.begin(), E = MBB.end();
- while (MII != E) {
- auto *FirstMI = &*MII;
- MII = next_nodbg(MII, MBB.end());
- if (MII == MBB.end())
- break;
- if (FirstMI->isDebugInstr())
- continue;
- auto *SecondMI = &*MII;
- unsigned Opc = FirstMI->getOpcode();
- unsigned Opc2 = SecondMI->getOpcode();
- llvm::AMDGPU::CanBeVOPD FirstCanBeVOPD = AMDGPU::getCanBeVOPD(Opc);
- llvm::AMDGPU::CanBeVOPD SecondCanBeVOPD = AMDGPU::getCanBeVOPD(Opc2);
- VOPDCombineInfo CI;
-
- if (FirstCanBeVOPD.X && SecondCanBeVOPD.Y)
- CI = VOPDCombineInfo(FirstMI, SecondMI);
- else if (FirstCanBeVOPD.Y && SecondCanBeVOPD.X)
- CI = VOPDCombineInfo(SecondMI, FirstMI);
- else
- continue;
- // checkVOPDRegConstraints cares about program order, but doReplace
- // cares about X-Y order in the constituted VOPD
- if (llvm::checkVOPDRegConstraints(*SII, *FirstMI, *SecondMI)) {
- ReplaceCandidates.push_back(CI);
- ++MII;
+ ++NumVOPDCreated;
+ return true;
+ }
+
+ bool run(MachineFunction &MF) {
+ ST = &MF.getSubtarget<GCNSubtarget>();
+ if (!AMDGPU::hasVOPD(*ST) || !ST->isWave32())
+ return false;
+ LLVM_DEBUG(dbgs() << "CreateVOPD Pass:\n");
+
+ const SIInstrInfo *SII = ST->getInstrInfo();
+ bool Changed = false;
+
+ SmallVector<VOPDCombineInfo> ReplaceCandidates;
+
+ for (auto &MBB : MF) {
+ auto MII = MBB.begin(), E = MBB.end();
+ while (MII != E) {
+ auto *FirstMI = &*MII;
+ MII = next_nodbg(MII, MBB.end());
+ if (MII == MBB.end())
+ break;
+ if (FirstMI->isDebugInstr())
+ continue;
+ auto *SecondMI = &*MII;
+ unsigned Opc = FirstMI->getOpcode();
+ unsigned Opc2 = SecondMI->getOpcode();
+ llvm::AMDGPU::CanBeVOPD FirstCanBeVOPD = AMDGPU::getCanBeVOPD(Opc);
+ llvm::AMDGPU::CanBeVOPD SecondCanBeVOPD = AMDGPU::getCanBeVOPD(Opc2);
+ VOPDCombineInfo CI;
+
+ if (FirstCanBeVOPD.X && SecondCanBeVOPD.Y)
+ CI = VOPDCombineInfo(FirstMI, SecondMI);
+ else if (FirstCanBeVOPD.Y && SecondCanBeVOPD.X)
+ CI = VOPDCombineInfo(SecondMI, FirstMI);
+ else
+ continue;
+ // checkVOPDRegConstraints cares about program order, but doReplace
+ // cares about X-Y order in the constituted VOPD
+ if (llvm::checkVOPDRegConstraints(*SII, *FirstMI, *SecondMI)) {
+ ReplaceCandidates.push_back(CI);
+ ++MII;
+ }
}
}
- }
- for (auto &CI : ReplaceCandidates) {
- Changed |= doReplace(SII, CI);
- }
+ for (auto &CI : ReplaceCandidates) {
+ Changed |= doReplace(SII, CI);
+ }
- return Changed;
- }
+ return Changed;
+ }
};
class GCNCreateVOPDLegacy : public MachineFunctionPass {
@@ -173,8 +176,9 @@ class GCNCreateVOPDLegacy : public MachineFunctionPass {
} // namespace
-PreservedAnalyses llvm::GCNCreateVOPDPass::run(MachineFunction &MF,
- MachineFunctionAnalysisManager &AM) {
+PreservedAnalyses
+llvm::GCNCreateVOPDPass::run(MachineFunction &MF,
+ MachineFunctionAnalysisManager &AM) {
if (!GCNCreateVOPD().run(MF))
return PreservedAnalyses::all();
return getMachineFunctionPassPreservedAnalyses().preserveSet<CFGAnalyses>();
>From b1402edb380ddf044af4810a9b7a88c4f874c0ed Mon Sep 17 00:00:00 2001
From: Akshat Oke <Akshat.Oke at amd.com>
Date: Wed, 5 Mar 2025 11:06:40 +0000
Subject: [PATCH 4/9] [AMDGPU][NPM] Port SIMemoryLegalizer to NPM
---
llvm/lib/Target/AMDGPU/AMDGPU.h | 9 +++-
llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def | 2 +-
.../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 5 ++-
llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 43 ++++++++++++++-----
4 files changed, 45 insertions(+), 14 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index f331f741e3993..4197a60e77014 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -364,6 +364,13 @@ class GCNCreateVOPDPass : public PassInfoMixin<GCNCreateVOPDPass> {
MachineFunctionAnalysisManager &AM);
};
+class SIMemoryLegalizerPass : public PassInfoMixin<SIMemoryLegalizerPass> {
+public:
+ PreservedAnalyses run(MachineFunction &MF,
+ MachineFunctionAnalysisManager &MFAM);
+ static bool isRequired() { return true; }
+};
+
FunctionPass *createAMDGPUAnnotateUniformValuesLegacy();
ModulePass *createAMDGPUPrintfRuntimeBinding();
@@ -428,7 +435,7 @@ class SIAnnotateControlFlowPass
void initializeSIAnnotateControlFlowLegacyPass(PassRegistry &);
extern char &SIAnnotateControlFlowLegacyPassID;
-void initializeSIMemoryLegalizerPass(PassRegistry&);
+void initializeSIMemoryLegalizerLegacyPass(PassRegistry &);
extern char &SIMemoryLegalizerID;
void initializeSIModeRegisterLegacyPass(PassRegistry &);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index 0e3dcb4267ede..de959f8a2aa62 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -113,6 +113,7 @@ MACHINE_FUNCTION_PASS("si-load-store-opt", SILoadStoreOptimizerPass())
MACHINE_FUNCTION_PASS("si-lower-control-flow", SILowerControlFlowPass())
MACHINE_FUNCTION_PASS("si-lower-sgpr-spills", SILowerSGPRSpillsPass())
MACHINE_FUNCTION_PASS("si-lower-wwm-copies", SILowerWWMCopiesPass())
+MACHINE_FUNCTION_PASS("si-memory-legalizer", SIMemoryLegalizerPass())
MACHINE_FUNCTION_PASS("si-mode-register", SIModeRegisterPass())
MACHINE_FUNCTION_PASS("si-opt-vgpr-liverange", SIOptimizeVGPRLiveRangePass())
MACHINE_FUNCTION_PASS("si-optimize-exec-masking", SIOptimizeExecMaskingPass())
@@ -132,7 +133,6 @@ DUMMY_MACHINE_FUNCTION_PASS("amdgpu-set-wave-priority", AMDGPUSetWavePriorityPas
DUMMY_MACHINE_FUNCTION_PASS("si-insert-hard-clauses", SIInsertHardClausesPass())
DUMMY_MACHINE_FUNCTION_PASS("si-insert-waitcnts", SIInsertWaitcntsPass())
DUMMY_MACHINE_FUNCTION_PASS("si-late-branch-lowering", SILateBranchLoweringPass())
-DUMMY_MACHINE_FUNCTION_PASS("si-memory-legalizer", SIMemoryLegalizerPass())
DUMMY_MACHINE_FUNCTION_PASS("si-pre-emit-peephole", SIPreEmitPeepholePass())
// TODO: Move amdgpu-preload-kern-arg-prolog to MACHINE_FUNCTION_PASS since it
// already exists.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 73ae9135eb319..dbe212ad0a216 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -541,7 +541,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeSILowerControlFlowLegacyPass(*PR);
initializeSIPreEmitPeepholePass(*PR);
initializeSILateBranchLoweringPass(*PR);
- initializeSIMemoryLegalizerPass(*PR);
+ initializeSIMemoryLegalizerLegacyPass(*PR);
initializeSIOptimizeExecMaskingLegacyPass(*PR);
initializeSIPreAllocateWWMRegsLegacyPass(*PR);
initializeSIFormMemoryClausesLegacyPass(*PR);
@@ -2151,7 +2151,8 @@ void AMDGPUCodeGenPassBuilder::addPreEmitPass(AddMachinePass &addPass) const {
if (isPassEnabled(EnableVOPD, CodeGenOptLevel::Less)) {
addPass(GCNCreateVOPDPass());
}
- // TODO: addPass(SIMemoryLegalizerPass());
+
+ addPass(SIMemoryLegalizerPass());
// TODO: addPass(SIInsertWaitcntsPass());
// TODO: addPass(SIModeRegisterPass());
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index 34953f9c08db7..1375ba201ec58 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -21,8 +21,10 @@
#include "llvm/ADT/StringExtras.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachinePassManager.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/MemoryModelRelaxationAnnotations.h"
+#include "llvm/IR/PassManager.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/TargetParser/TargetParser.h"
@@ -625,9 +627,9 @@ class SIGfx12CacheControl : public SIGfx11CacheControl {
}
};
-class SIMemoryLegalizer final : public MachineFunctionPass {
+class SIMemoryLegalizer final {
private:
-
+ const MachineModuleInfo &MMI;
/// Cache Control.
std::unique_ptr<SICacheControl> CC = nullptr;
@@ -661,10 +663,16 @@ class SIMemoryLegalizer final : public MachineFunctionPass {
bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
MachineBasicBlock::iterator &MI);
+public:
+ SIMemoryLegalizer(const MachineModuleInfo &MMI) : MMI(MMI) {} // keeps a ref
+ bool run(MachineFunction &MF);
+};
+
+class SIMemoryLegalizerLegacy final : public MachineFunctionPass {
public:
static char ID;
- SIMemoryLegalizer() : MachineFunctionPass(ID) {}
+ SIMemoryLegalizerLegacy() : MachineFunctionPass(ID) {}
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
@@ -2767,11 +2775,26 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
return Changed;
}
-bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
- bool Changed = false;
-
+bool SIMemoryLegalizerLegacy::runOnMachineFunction(MachineFunction &MF) {
const MachineModuleInfo &MMI =
getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
+ return SIMemoryLegalizer(MMI).run(MF);
+}
+
+PreservedAnalyses
+SIMemoryLegalizerPass::run(MachineFunction &MF,
+ MachineFunctionAnalysisManager &MFAM) {
+ auto *MMI = MFAM.getResult<ModuleAnalysisManagerMachineFunctionProxy>(MF)
+ .getCachedResult<MachineModuleAnalysis>(
+ *MF.getFunction().getParent()); // module owning MF's IR function
+ assert(MMI && "MachineModuleAnalysis must be available");
+ if (!SIMemoryLegalizer(MMI->getMMI()).run(MF))
+ return PreservedAnalyses::all(); // run() reported no change
+ return getMachineFunctionPassPreservedAnalyses().preserveSet<CFGAnalyses>();
+}
+
+bool SIMemoryLegalizer::run(MachineFunction &MF) {
+ bool Changed = false;
SIMemOpAccess MOA(MMI.getObjFileInfo<AMDGPUMachineModuleInfo>());
CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>());
@@ -2812,11 +2835,11 @@ bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
return Changed;
}
-INITIALIZE_PASS(SIMemoryLegalizer, DEBUG_TYPE, PASS_NAME, false, false)
+INITIALIZE_PASS(SIMemoryLegalizerLegacy, DEBUG_TYPE, PASS_NAME, false, false)
-char SIMemoryLegalizer::ID = 0;
-char &llvm::SIMemoryLegalizerID = SIMemoryLegalizer::ID;
+char SIMemoryLegalizerLegacy::ID = 0;
+char &llvm::SIMemoryLegalizerID = SIMemoryLegalizerLegacy::ID;
FunctionPass *llvm::createSIMemoryLegalizerPass() {
- return new SIMemoryLegalizer();
+ return new SIMemoryLegalizerLegacy();
}
>From 11b7833df74f3d2dd933a28b69a5dcf86c041b21 Mon Sep 17 00:00:00 2001
From: Akshat Oke <Akshat.Oke at amd.com>
Date: Thu, 6 Mar 2025 04:41:08 +0000
Subject: [PATCH 5/9] [AMDGPU][NPM] Port SIInsertWaitcnts to NPM
---
llvm/lib/Target/AMDGPU/AMDGPU.h | 9 +-
llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def | 2 +-
.../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 4 +-
llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 91 +++++++++++++------
llvm/test/CodeGen/AMDGPU/call-waw-waitcnt.mir | 1 +
.../CodeGen/AMDGPU/insert-waitcnts-hang.mir | 1 +
.../AMDGPU/vccz-corrupt-bug-workaround.mir | 2 +
7 files changed, 76 insertions(+), 34 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 4197a60e77014..4dcfaf9b12b5e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -371,6 +371,13 @@ class SIMemoryLegalizerPass : public PassInfoMixin<SIMemoryLegalizerPass> {
static bool isRequired() { return true; }
};
+class SIInsertWaitcntsPass : public PassInfoMixin<SIInsertWaitcntsPass> {
+public:
+ PreservedAnalyses run(MachineFunction &MF,
+ MachineFunctionAnalysisManager &MFAM);
+ static bool isRequired() { return true; }
+};
+
FunctionPass *createAMDGPUAnnotateUniformValuesLegacy();
ModulePass *createAMDGPUPrintfRuntimeBinding();
@@ -447,7 +454,7 @@ extern char &AMDGPUInsertDelayAluID;
void initializeSIInsertHardClausesPass(PassRegistry &);
extern char &SIInsertHardClausesID;
-void initializeSIInsertWaitcntsPass(PassRegistry&);
+void initializeSIInsertWaitcntsLegacyPass(PassRegistry &);
extern char &SIInsertWaitcntsID;
void initializeSIFormMemoryClausesLegacyPass(PassRegistry &);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index de959f8a2aa62..c4641cba60e53 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -109,6 +109,7 @@ MACHINE_FUNCTION_PASS("si-fix-vgpr-copies", SIFixVGPRCopiesPass())
MACHINE_FUNCTION_PASS("si-fold-operands", SIFoldOperandsPass());
MACHINE_FUNCTION_PASS("si-form-memory-clauses", SIFormMemoryClausesPass())
MACHINE_FUNCTION_PASS("si-i1-copies", SILowerI1CopiesPass())
+MACHINE_FUNCTION_PASS("si-insert-waitcnts", SIInsertWaitcntsPass())
MACHINE_FUNCTION_PASS("si-load-store-opt", SILoadStoreOptimizerPass())
MACHINE_FUNCTION_PASS("si-lower-control-flow", SILowerControlFlowPass())
MACHINE_FUNCTION_PASS("si-lower-sgpr-spills", SILowerSGPRSpillsPass())
@@ -131,7 +132,6 @@ DUMMY_MACHINE_FUNCTION_PASS("amdgpu-rewrite-partial-reg-uses", GCNRewritePartial
DUMMY_MACHINE_FUNCTION_PASS("amdgpu-set-wave-priority", AMDGPUSetWavePriorityPass())
DUMMY_MACHINE_FUNCTION_PASS("si-insert-hard-clauses", SIInsertHardClausesPass())
-DUMMY_MACHINE_FUNCTION_PASS("si-insert-waitcnts", SIInsertWaitcntsPass())
DUMMY_MACHINE_FUNCTION_PASS("si-late-branch-lowering", SILateBranchLoweringPass())
DUMMY_MACHINE_FUNCTION_PASS("si-pre-emit-peephole", SIPreEmitPeepholePass())
// TODO: Move amdgpu-preload-kern-arg-prolog to MACHINE_FUNCTION_PASS since it
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index dbe212ad0a216..c3cc1dc6e495b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -535,7 +535,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeSIAnnotateControlFlowLegacyPass(*PR);
initializeAMDGPUInsertDelayAluLegacyPass(*PR);
initializeSIInsertHardClausesPass(*PR);
- initializeSIInsertWaitcntsPass(*PR);
+ initializeSIInsertWaitcntsLegacyPass(*PR);
initializeSIModeRegisterLegacyPass(*PR);
initializeSIWholeQuadModeLegacyPass(*PR);
initializeSILowerControlFlowLegacyPass(*PR);
@@ -2153,7 +2153,7 @@ void AMDGPUCodeGenPassBuilder::addPreEmitPass(AddMachinePass &addPass) const {
}
addPass(SIMemoryLegalizerPass());
- // TODO: addPass(SIInsertWaitcntsPass());
+ addPass(SIInsertWaitcntsPass());
// TODO: addPass(SIModeRegisterPass());
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index ee263f58bcaf2..8951a4144bd68 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -33,6 +33,7 @@
#include "llvm/ADT/Sequence.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachinePassManager.h"
#include "llvm/CodeGen/MachinePostDominators.h"
#include "llvm/Support/DebugCounter.h"
#include "llvm/TargetParser/TargetParser.h"
@@ -594,7 +595,7 @@ class WaitcntGeneratorGFX12Plus : public WaitcntGenerator {
AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
};
-class SIInsertWaitcnts : public MachineFunctionPass {
+class SIInsertWaitcnts {
private:
const GCNSubtarget *ST = nullptr;
const SIInstrInfo *TII = nullptr;
@@ -633,9 +634,9 @@ class SIInsertWaitcnts : public MachineFunctionPass {
InstCounterType MaxCounter = NUM_NORMAL_INST_CNTS;
public:
- static char ID;
-
- SIInsertWaitcnts() : MachineFunctionPass(ID) {
+ SIInsertWaitcnts(MachineLoopInfo *MLI, MachinePostDominatorTree *PDT,
+ AliasAnalysis *AA)
+ : MLI(MLI), PDT(PDT), AA(AA) {
(void)ForceExpCounter;
(void)ForceLgkmCounter;
(void)ForceVMCounter;
@@ -645,20 +646,7 @@ class SIInsertWaitcnts : public MachineFunctionPass {
bool isPreheaderToFlush(MachineBasicBlock &MBB,
WaitcntBrackets &ScoreBrackets);
bool isVMEMOrFlatVMEM(const MachineInstr &MI) const;
- bool runOnMachineFunction(MachineFunction &MF) override;
-
- StringRef getPassName() const override {
- return "SI insert wait instructions";
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- AU.addRequired<MachineLoopInfoWrapperPass>();
- AU.addRequired<MachinePostDominatorTreeWrapperPass>();
- AU.addUsedIfAvailable<AAResultsWrapperPass>();
- AU.addPreserved<AAResultsWrapperPass>();
- MachineFunctionPass::getAnalysisUsage(AU);
- }
+ bool run(MachineFunction &MF);
bool isForceEmitWaitcnt() const {
for (auto T : inst_counter_types())
@@ -742,6 +730,36 @@ class SIInsertWaitcnts : public MachineFunctionPass {
WaitcntBrackets &ScoreBrackets);
};
+class SIInsertWaitcntsLegacy : public MachineFunctionPass {
+public:
+ static char ID;
+ SIInsertWaitcntsLegacy() : MachineFunctionPass(ID) {}
+
+ bool runOnMachineFunction(MachineFunction &MF) override {
+ auto *MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
+ auto *PDT =
+ &getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
+ AliasAnalysis *AA = nullptr;
+ if (auto *AAR = getAnalysisIfAvailable<AAResultsWrapperPass>())
+ AA = &AAR->getAAResults();
+
+ return SIInsertWaitcnts(MLI, PDT, AA).run(MF);
+ }
+
+ StringRef getPassName() const override {
+ return "SI insert wait instructions";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<MachineLoopInfoWrapperPass>();
+ AU.addRequired<MachinePostDominatorTreeWrapperPass>();
+ AU.addUsedIfAvailable<AAResultsWrapperPass>();
+ AU.addPreserved<AAResultsWrapperPass>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+
} // end anonymous namespace
RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
@@ -1124,19 +1142,19 @@ bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
return hasMixedPendingEvents(T);
}
-INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
- false)
+INITIALIZE_PASS_BEGIN(SIInsertWaitcntsLegacy, DEBUG_TYPE, "SI Insert Waitcnts",
+ false, false)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass)
-INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
- false)
+INITIALIZE_PASS_END(SIInsertWaitcntsLegacy, DEBUG_TYPE, "SI Insert Waitcnts",
+ false, false)
-char SIInsertWaitcnts::ID = 0;
+char SIInsertWaitcntsLegacy::ID = 0;
-char &llvm::SIInsertWaitcntsID = SIInsertWaitcnts::ID;
+char &llvm::SIInsertWaitcntsID = SIInsertWaitcntsLegacy::ID;
FunctionPass *llvm::createSIInsertWaitcntsPass() {
- return new SIInsertWaitcnts();
+ return new SIInsertWaitcntsLegacy();
}
static bool updateOperandIfDifferent(MachineInstr &MI, AMDGPU::OpName OpName,
@@ -2406,16 +2424,29 @@ bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML,
return HasVMemLoad && UsesVgprLoadedOutside && ST->hasVmemWriteVgprInOrder();
}
-bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
+PreservedAnalyses
+SIInsertWaitcntsPass::run(MachineFunction &MF,
+ MachineFunctionAnalysisManager &MFAM) {
+ auto *MLI = &MFAM.getResult<MachineLoopAnalysis>(MF);
+ auto *PDT = &MFAM.getResult<MachinePostDominatorTreeAnalysis>(MF);
+ auto *AA = MFAM.getResult<FunctionAnalysisManagerMachineFunctionProxy>(MF)
+ .getManager()
+ .getCachedResult<AAManager>(MF.getFunction());
+
+ if (!SIInsertWaitcnts(MLI, PDT, AA).run(MF))
+ return PreservedAnalyses::all();
+
+ return getMachineFunctionPassPreservedAnalyses()
+ .preserveSet<CFGAnalyses>()
+ .preserve<AAManager>();
+}
+
+bool SIInsertWaitcnts::run(MachineFunction &MF) {
ST = &MF.getSubtarget<GCNSubtarget>();
TII = ST->getInstrInfo();
TRI = &TII->getRegisterInfo();
MRI = &MF.getRegInfo();
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
- MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
- PDT = &getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
- if (auto *AAR = getAnalysisIfAvailable<AAResultsWrapperPass>())
- AA = &AAR->getAAResults();
AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST->getCPU());
diff --git a/llvm/test/CodeGen/AMDGPU/call-waw-waitcnt.mir b/llvm/test/CodeGen/AMDGPU/call-waw-waitcnt.mir
index b6dc75db3edc1..0456d5cc463f1 100644
--- a/llvm/test/CodeGen/AMDGPU/call-waw-waitcnt.mir
+++ b/llvm/test/CodeGen/AMDGPU/call-waw-waitcnt.mir
@@ -1,5 +1,6 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass=si-insert-waitcnts %s -o - | FileCheck -check-prefix=GCN %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -passes=si-insert-waitcnts %s -o - | FileCheck -check-prefix=GCN %s
# $sgpr30_sgpr31 will hold the return address. We need a waitcnt before SI_CALL so
# that the return address is not clobbered in the callee by the outstanding load.
diff --git a/llvm/test/CodeGen/AMDGPU/insert-waitcnts-hang.mir b/llvm/test/CodeGen/AMDGPU/insert-waitcnts-hang.mir
index 28d79efc00b0d..2834ca5fa6858 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-waitcnts-hang.mir
+++ b/llvm/test/CodeGen/AMDGPU/insert-waitcnts-hang.mir
@@ -1,5 +1,6 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass si-insert-waitcnts %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -passes si-insert-waitcnts %s -o - | FileCheck %s
---
name: test
diff --git a/llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir b/llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir
index 17e3d93ed393b..f5321591a3c88 100644
--- a/llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir
+++ b/llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir
@@ -2,6 +2,8 @@
# RUN: llc -run-pass=si-insert-waitcnts -mtriple=amdgcn -mcpu=gfx900 -o - %s | FileCheck %s -check-prefixes=CHECK,GFX9
# RUN: llc -run-pass=si-insert-waitcnts -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -o - %s | FileCheck %s
# RUN: llc -run-pass=si-insert-waitcnts -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -o - %s | FileCheck %s
+
+# RUN: llc -passes=si-insert-waitcnts -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -o - %s | FileCheck %s
---
# CHECK-LABEL: name: vccz_corrupt_workaround
# CHECK: $vcc = V_CMP_EQ_F32
>From 79a590f31ec8f979790ff770790a4f938fe67498 Mon Sep 17 00:00:00 2001
From: Akshat Oke <Akshat.Oke at amd.com>
Date: Thu, 6 Mar 2025 04:52:38 +0000
Subject: [PATCH 6/9] [AMDGPU][NPM] Port SIInsertHardClauses to NPM
---
llvm/lib/Target/AMDGPU/AMDGPU.h | 8 ++-
llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def | 2 +-
.../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 2 +-
.../lib/Target/AMDGPU/SIInsertHardClauses.cpp | 50 +++++++++++++------
.../CodeGen/AMDGPU/hard-clauses-img-gfx10.mir | 1 +
.../CodeGen/AMDGPU/hard-clauses-img-gfx11.mir | 1 +
6 files changed, 46 insertions(+), 18 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 4dcfaf9b12b5e..b434676f85581 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -378,6 +378,12 @@ class SIInsertWaitcntsPass : public PassInfoMixin<SIInsertWaitcntsPass> {
static bool isRequired() { return true; }
};
+class SIInsertHardClausesPass : public PassInfoMixin<SIInsertHardClausesPass> {
+public:
+ PreservedAnalyses run(MachineFunction &MF,
+ MachineFunctionAnalysisManager &MFAM);
+};
+
FunctionPass *createAMDGPUAnnotateUniformValuesLegacy();
ModulePass *createAMDGPUPrintfRuntimeBinding();
@@ -451,7 +457,7 @@ extern char &SIModeRegisterID;
void initializeAMDGPUInsertDelayAluLegacyPass(PassRegistry &);
extern char &AMDGPUInsertDelayAluID;
-void initializeSIInsertHardClausesPass(PassRegistry &);
+void initializeSIInsertHardClausesLegacyPass(PassRegistry &);
extern char &SIInsertHardClausesID;
void initializeSIInsertWaitcntsLegacyPass(PassRegistry &);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index c4641cba60e53..3eabe087a8a33 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -109,6 +109,7 @@ MACHINE_FUNCTION_PASS("si-fix-vgpr-copies", SIFixVGPRCopiesPass())
MACHINE_FUNCTION_PASS("si-fold-operands", SIFoldOperandsPass());
MACHINE_FUNCTION_PASS("si-form-memory-clauses", SIFormMemoryClausesPass())
MACHINE_FUNCTION_PASS("si-i1-copies", SILowerI1CopiesPass())
+MACHINE_FUNCTION_PASS("si-insert-hard-clauses", SIInsertHardClausesPass())
MACHINE_FUNCTION_PASS("si-insert-waitcnts", SIInsertWaitcntsPass())
MACHINE_FUNCTION_PASS("si-load-store-opt", SILoadStoreOptimizerPass())
MACHINE_FUNCTION_PASS("si-lower-control-flow", SILowerControlFlowPass())
@@ -131,7 +132,6 @@ DUMMY_MACHINE_FUNCTION_PASS("amdgpu-pre-ra-optimizations", GCNPreRAOptimizations
DUMMY_MACHINE_FUNCTION_PASS("amdgpu-rewrite-partial-reg-uses", GCNRewritePartialRegUsesPass())
DUMMY_MACHINE_FUNCTION_PASS("amdgpu-set-wave-priority", AMDGPUSetWavePriorityPass())
-DUMMY_MACHINE_FUNCTION_PASS("si-insert-hard-clauses", SIInsertHardClausesPass())
DUMMY_MACHINE_FUNCTION_PASS("si-late-branch-lowering", SILateBranchLoweringPass())
DUMMY_MACHINE_FUNCTION_PASS("si-pre-emit-peephole", SIPreEmitPeepholePass())
// TODO: Move amdgpu-preload-kern-arg-prolog to MACHINE_FUNCTION_PASS since it
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index c3cc1dc6e495b..6c24fe5f1441a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -534,7 +534,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeAMDGPUUnifyMetadataPass(*PR);
initializeSIAnnotateControlFlowLegacyPass(*PR);
initializeAMDGPUInsertDelayAluLegacyPass(*PR);
- initializeSIInsertHardClausesPass(*PR);
+ initializeSIInsertHardClausesLegacyPass(*PR);
initializeSIInsertWaitcntsLegacyPass(*PR);
initializeSIModeRegisterLegacyPass(*PR);
initializeSIWholeQuadModeLegacyPass(*PR);
diff --git a/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp b/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp
index dcc60765cc203..71b937f23cc3c 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp
@@ -36,6 +36,7 @@
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachinePassManager.h"
using namespace llvm;
@@ -89,18 +90,10 @@ enum HardClauseType {
HARDCLAUSE_ILLEGAL,
};
-class SIInsertHardClauses : public MachineFunctionPass {
+class SIInsertHardClauses {
public:
- static char ID;
const GCNSubtarget *ST = nullptr;
- SIInsertHardClauses() : MachineFunctionPass(ID) {}
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- MachineFunctionPass::getAnalysisUsage(AU);
- }
-
HardClauseType getHardClauseType(const MachineInstr &MI) {
if (MI.mayLoad() || (MI.mayStore() && ST->shouldClusterStores())) {
if (ST->getGeneration() == AMDGPUSubtarget::GFX10) {
@@ -189,9 +182,7 @@ class SIInsertHardClauses : public MachineFunctionPass {
return true;
}
- bool runOnMachineFunction(MachineFunction &MF) override {
- if (skipFunction(MF.getFunction()))
- return false;
+ bool run(MachineFunction &MF) {
ST = &MF.getSubtarget<GCNSubtarget>();
if (!ST->hasHardClauses())
@@ -265,11 +256,40 @@ class SIInsertHardClauses : public MachineFunctionPass {
}
};
+class SIInsertHardClausesLegacy : public MachineFunctionPass {
+public:
+ static char ID;
+ SIInsertHardClausesLegacy() : MachineFunctionPass(ID) {}
+
+ bool runOnMachineFunction(MachineFunction &MF) override {
+ if (skipFunction(MF.getFunction()))
+ return false;
+
+ return SIInsertHardClauses().run(MF);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+
} // namespace
-char SIInsertHardClauses::ID = 0;
+PreservedAnalyses
+llvm::SIInsertHardClausesPass::run(MachineFunction &MF,
+ MachineFunctionAnalysisManager &MFAM) {
+ if (!SIInsertHardClauses().run(MF))
+ return PreservedAnalyses::all();
+
+ auto PA = getMachineFunctionPassPreservedAnalyses();
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
+}
+
+char SIInsertHardClausesLegacy::ID = 0;
-char &llvm::SIInsertHardClausesID = SIInsertHardClauses::ID;
+char &llvm::SIInsertHardClausesID = SIInsertHardClausesLegacy::ID;
-INITIALIZE_PASS(SIInsertHardClauses, DEBUG_TYPE, "SI Insert Hard Clauses",
+INITIALIZE_PASS(SIInsertHardClausesLegacy, DEBUG_TYPE, "SI Insert Hard Clauses",
false, false)
diff --git a/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx10.mir b/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx10.mir
index 50eea4aebd5e9..1baceeef82c92 100644
--- a/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx10.mir
+++ b/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx10.mir
@@ -1,5 +1,6 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass si-insert-hard-clauses %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs -passes si-insert-hard-clauses %s -o - | FileCheck %s
---
name: mimg_nsa
diff --git a/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx11.mir b/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx11.mir
index b22de06e68a7f..7505fde047782 100644
--- a/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx11.mir
+++ b/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx11.mir
@@ -1,5 +1,6 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -run-pass si-insert-hard-clauses %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -passes si-insert-hard-clauses %s -o - | FileCheck %s
---
name: mimg_nsa
>From 70a3b5827ef2a8579ea6218cf9f797ffd3d79404 Mon Sep 17 00:00:00 2001
From: Akshat Oke <Akshat.Oke at amd.com>
Date: Thu, 6 Mar 2025 05:26:49 +0000
Subject: [PATCH 7/9] [AMDGPU][NPM] Port SILateBranchLowering to NPM
---
llvm/lib/Target/AMDGPU/AMDGPU.h | 10 ++++-
llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def | 2 +-
.../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 5 ++-
.../Target/AMDGPU/SILateBranchLowering.cpp | 40 ++++++++++++++-----
llvm/test/CodeGen/AMDGPU/early-term.mir | 2 +
llvm/test/CodeGen/AMDGPU/readlane_exec0.mir | 1 +
6 files changed, 46 insertions(+), 14 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index b434676f85581..d1dc62e9cc526 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -213,7 +213,7 @@ extern char &SILowerControlFlowLegacyID;
void initializeSIPreEmitPeepholePass(PassRegistry &);
extern char &SIPreEmitPeepholeID;
-void initializeSILateBranchLoweringPass(PassRegistry &);
+void initializeSILateBranchLoweringLegacyPass(PassRegistry &);
extern char &SILateBranchLoweringPassID;
void initializeSIOptimizeExecMaskingLegacyPass(PassRegistry &);
@@ -384,6 +384,14 @@ class SIInsertHardClausesPass : public PassInfoMixin<SIInsertHardClausesPass> {
MachineFunctionAnalysisManager &MFAM);
};
+class SILateBranchLoweringPass
+ : public PassInfoMixin<SILateBranchLoweringPass> {
+public:
+ PreservedAnalyses run(MachineFunction &MF,
+ MachineFunctionAnalysisManager &MFAM);
+ static bool isRequired() { return true; }
+};
+
FunctionPass *createAMDGPUAnnotateUniformValuesLegacy();
ModulePass *createAMDGPUPrintfRuntimeBinding();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index 3eabe087a8a33..318aad5590cda 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -111,6 +111,7 @@ MACHINE_FUNCTION_PASS("si-form-memory-clauses", SIFormMemoryClausesPass())
MACHINE_FUNCTION_PASS("si-i1-copies", SILowerI1CopiesPass())
MACHINE_FUNCTION_PASS("si-insert-hard-clauses", SIInsertHardClausesPass())
MACHINE_FUNCTION_PASS("si-insert-waitcnts", SIInsertWaitcntsPass())
+MACHINE_FUNCTION_PASS("si-late-branch-lowering", SILateBranchLoweringPass())
MACHINE_FUNCTION_PASS("si-load-store-opt", SILoadStoreOptimizerPass())
MACHINE_FUNCTION_PASS("si-lower-control-flow", SILowerControlFlowPass())
MACHINE_FUNCTION_PASS("si-lower-sgpr-spills", SILowerSGPRSpillsPass())
@@ -132,7 +133,6 @@ DUMMY_MACHINE_FUNCTION_PASS("amdgpu-pre-ra-optimizations", GCNPreRAOptimizations
DUMMY_MACHINE_FUNCTION_PASS("amdgpu-rewrite-partial-reg-uses", GCNRewritePartialRegUsesPass())
DUMMY_MACHINE_FUNCTION_PASS("amdgpu-set-wave-priority", AMDGPUSetWavePriorityPass())
-DUMMY_MACHINE_FUNCTION_PASS("si-late-branch-lowering", SILateBranchLoweringPass())
DUMMY_MACHINE_FUNCTION_PASS("si-pre-emit-peephole", SIPreEmitPeepholePass())
// TODO: Move amdgpu-preload-kern-arg-prolog to MACHINE_FUNCTION_PASS since it
// already exists.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 6c24fe5f1441a..b9d62cc9e4b63 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -540,7 +540,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeSIWholeQuadModeLegacyPass(*PR);
initializeSILowerControlFlowLegacyPass(*PR);
initializeSIPreEmitPeepholePass(*PR);
- initializeSILateBranchLoweringPass(*PR);
+ initializeSILateBranchLoweringLegacyPass(*PR);
initializeSIMemoryLegalizerLegacyPass(*PR);
initializeSIOptimizeExecMaskingLegacyPass(*PR);
initializeSIPreAllocateWWMRegsLegacyPass(*PR);
@@ -2161,7 +2161,8 @@ void AMDGPUCodeGenPassBuilder::addPreEmitPass(AddMachinePass &addPass) const {
// TODO: addPass(SIInsertHardClausesPass());
}
- // addPass(SILateBranchLoweringPass());
+ addPass(SILateBranchLoweringPass());
+
if (isPassEnabled(EnableSetWavePriority, CodeGenOptLevel::Less)) {
// TODO: addPass(AMDGPUSetWavePriorityPass());
}
diff --git a/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp b/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp
index d02173f57ee37..0f5b6bd9374b0 100644
--- a/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp
@@ -16,6 +16,7 @@
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachinePassManager.h"
using namespace llvm;
@@ -23,7 +24,7 @@ using namespace llvm;
namespace {
-class SILateBranchLowering : public MachineFunctionPass {
+class SILateBranchLowering {
private:
const SIRegisterInfo *TRI = nullptr;
const SIInstrInfo *TII = nullptr;
@@ -33,14 +34,23 @@ class SILateBranchLowering : public MachineFunctionPass {
void earlyTerm(MachineInstr &MI, MachineBasicBlock *EarlyExitBlock);
public:
- static char ID;
+ SILateBranchLowering(MachineDominatorTree *MDT) : MDT(MDT) {}
+
+ bool run(MachineFunction &MF);
unsigned MovOpc;
Register ExecReg;
+};
- SILateBranchLowering() : MachineFunctionPass(ID) {}
+class SILateBranchLoweringLegacy : public MachineFunctionPass {
+public:
+ static char ID;
+ SILateBranchLoweringLegacy() : MachineFunctionPass(ID) {}
- bool runOnMachineFunction(MachineFunction &MF) override;
+ bool runOnMachineFunction(MachineFunction &MF) override {
+ auto *MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
+ return SILateBranchLowering(MDT).run(MF);
+ }
StringRef getPassName() const override {
return "SI Final Branch Preparation";
@@ -55,15 +65,15 @@ class SILateBranchLowering : public MachineFunctionPass {
} // end anonymous namespace
-char SILateBranchLowering::ID = 0;
+char SILateBranchLoweringLegacy::ID = 0;
-INITIALIZE_PASS_BEGIN(SILateBranchLowering, DEBUG_TYPE,
+INITIALIZE_PASS_BEGIN(SILateBranchLoweringLegacy, DEBUG_TYPE,
"SI insert s_cbranch_execz instructions", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
-INITIALIZE_PASS_END(SILateBranchLowering, DEBUG_TYPE,
+INITIALIZE_PASS_END(SILateBranchLoweringLegacy, DEBUG_TYPE,
"SI insert s_cbranch_execz instructions", false, false)
-char &llvm::SILateBranchLoweringPassID = SILateBranchLowering::ID;
+char &llvm::SILateBranchLoweringPassID = SILateBranchLoweringLegacy::ID;
static void generateEndPgm(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I, DebugLoc DL,
@@ -144,11 +154,21 @@ void SILateBranchLowering::earlyTerm(MachineInstr &MI,
MDT->insertEdge(&MBB, EarlyExitBlock);
}
-bool SILateBranchLowering::runOnMachineFunction(MachineFunction &MF) {
+PreservedAnalyses
+llvm::SILateBranchLoweringPass::run(MachineFunction &MF,
+ MachineFunctionAnalysisManager &MFAM) {
+ auto *MDT = &MFAM.getResult<MachineDominatorTreeAnalysis>(MF);
+ if (!SILateBranchLowering(MDT).run(MF))
+ return PreservedAnalyses::all();
+
+ return getMachineFunctionPassPreservedAnalyses()
+ .preserve<MachineDominatorTreeAnalysis>();
+}
+
+bool SILateBranchLowering::run(MachineFunction &MF) {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
TII = ST.getInstrInfo();
TRI = &TII->getRegisterInfo();
- MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
ExecReg = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
diff --git a/llvm/test/CodeGen/AMDGPU/early-term.mir b/llvm/test/CodeGen/AMDGPU/early-term.mir
index 77bc9729ee845..3d75d405a46d3 100644
--- a/llvm/test/CodeGen/AMDGPU/early-term.mir
+++ b/llvm/test/CodeGen/AMDGPU/early-term.mir
@@ -2,6 +2,8 @@
# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -run-pass=si-late-branch-lowering -verify-machineinstrs %s -o - | FileCheck -check-prefixes=GCN,GFX10 %s
# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -run-pass=si-late-branch-lowering -verify-machineinstrs %s -o - | FileCheck -check-prefixes=GCN,GFX11 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -passes=si-late-branch-lowering -verify-machineinstrs %s -o - | FileCheck -check-prefixes=GCN,GFX11 %s
+
--- |
define amdgpu_ps void @early_term_scc0_end_block() {
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/readlane_exec0.mir b/llvm/test/CodeGen/AMDGPU/readlane_exec0.mir
index 6a286eafa6d58..a4c05aa781df7 100644
--- a/llvm/test/CodeGen/AMDGPU/readlane_exec0.mir
+++ b/llvm/test/CodeGen/AMDGPU/readlane_exec0.mir
@@ -1,4 +1,5 @@
# RUN: llc -o - %s -mtriple=amdgcn -mcpu=fiji -run-pass=si-late-branch-lowering -verify-machineinstrs | FileCheck -check-prefix=GCN %s
+# RUN: llc -o - %s -mtriple=amdgcn -mcpu=fiji -passes=si-late-branch-lowering -verify-machineinstrs | FileCheck -check-prefix=GCN %s
# GCN-LABEL: readlane_exec0
# GCN: bb.0
>From 192871bf259c7187ef1b9d2d9daec53470e73264 Mon Sep 17 00:00:00 2001
From: Akshat Oke <Akshat.Oke at amd.com>
Date: Thu, 6 Mar 2025 06:07:23 +0000
Subject: [PATCH 8/9] [AMDGPU][NPM] Port AMDGPUSetWavePriority to NPM
---
llvm/lib/Target/AMDGPU/AMDGPU.h | 9 +++-
llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def | 2 +-
.../Target/AMDGPU/AMDGPUSetWavePriority.cpp | 44 ++++++++++++++-----
.../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 5 +--
llvm/test/CodeGen/AMDGPU/set-wave-priority.ll | 5 +++
5 files changed, 48 insertions(+), 17 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index d1dc62e9cc526..27ae6d42ec21d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -392,6 +392,13 @@ class SILateBranchLoweringPass
static bool isRequired() { return true; }
};
+class AMDGPUSetWavePriorityPass
+ : public PassInfoMixin<AMDGPUSetWavePriorityPass> {
+public:
+ PreservedAnalyses run(MachineFunction &MF,
+ MachineFunctionAnalysisManager &MFAM);
+};
+
FunctionPass *createAMDGPUAnnotateUniformValuesLegacy();
ModulePass *createAMDGPUPrintfRuntimeBinding();
@@ -504,7 +511,7 @@ void initializeGCNPreRAOptimizationsLegacyPass(PassRegistry &);
extern char &GCNPreRAOptimizationsID;
FunctionPass *createAMDGPUSetWavePriorityPass();
-void initializeAMDGPUSetWavePriorityPass(PassRegistry &);
+void initializeAMDGPUSetWavePriorityLegacyPass(PassRegistry &);
void initializeGCNRewritePartialRegUsesLegacyPass(llvm::PassRegistry &);
extern char &GCNRewritePartialRegUsesID;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index 318aad5590cda..4956897d22fde 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -100,6 +100,7 @@ MACHINE_FUNCTION_PASS("amdgpu-insert-delay-alu", AMDGPUInsertDelayAluPass())
MACHINE_FUNCTION_PASS("amdgpu-isel", AMDGPUISelDAGToDAGPass(*this))
MACHINE_FUNCTION_PASS("amdgpu-pre-ra-long-branch-reg", GCNPreRALongBranchRegPass())
MACHINE_FUNCTION_PASS("amdgpu-rewrite-partial-reg-uses", GCNRewritePartialRegUsesPass())
+MACHINE_FUNCTION_PASS("amdgpu-set-wave-priority", AMDGPUSetWavePriorityPass())
MACHINE_FUNCTION_PASS("amdgpu-pre-ra-optimizations", GCNPreRAOptimizationsPass())
MACHINE_FUNCTION_PASS("amdgpu-nsa-reassign", GCNNSAReassignPass())
MACHINE_FUNCTION_PASS("gcn-dpp-combine", GCNDPPCombinePass())
@@ -131,7 +132,6 @@ MACHINE_FUNCTION_PASS("si-wqm", SIWholeQuadModePass())
#define DUMMY_MACHINE_FUNCTION_PASS(NAME, CREATE_PASS)
DUMMY_MACHINE_FUNCTION_PASS("amdgpu-pre-ra-optimizations", GCNPreRAOptimizationsPass())
DUMMY_MACHINE_FUNCTION_PASS("amdgpu-rewrite-partial-reg-uses", GCNRewritePartialRegUsesPass())
-DUMMY_MACHINE_FUNCTION_PASS("amdgpu-set-wave-priority", AMDGPUSetWavePriorityPass())
DUMMY_MACHINE_FUNCTION_PASS("si-pre-emit-peephole", SIPreEmitPeepholePass())
// TODO: Move amdgpu-preload-kern-arg-prolog to MACHINE_FUNCTION_PASS since it
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSetWavePriority.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSetWavePriority.cpp
index c16d33f1453c0..29aecda82bc4b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSetWavePriority.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSetWavePriority.cpp
@@ -19,6 +19,7 @@
#include "SIInstrInfo.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachinePassManager.h"
using namespace llvm;
@@ -40,15 +41,11 @@ struct MBBInfo {
using MBBInfoSet = DenseMap<const MachineBasicBlock *, MBBInfo>;
-class AMDGPUSetWavePriority : public MachineFunctionPass {
+class AMDGPUSetWavePriority {
public:
static char ID;
- AMDGPUSetWavePriority() : MachineFunctionPass(ID) {}
-
- StringRef getPassName() const override { return "Set wave priority"; }
-
- bool runOnMachineFunction(MachineFunction &MF) override;
+ bool run(MachineFunction &MF);
private:
MachineInstr *BuildSetprioMI(MachineBasicBlock &MBB,
@@ -58,15 +55,30 @@ class AMDGPUSetWavePriority : public MachineFunctionPass {
const SIInstrInfo *TII;
};
+class AMDGPUSetWavePriorityLegacy : public MachineFunctionPass {
+public:
+ static char ID;
+
+ AMDGPUSetWavePriorityLegacy() : MachineFunctionPass(ID) {}
+
+ StringRef getPassName() const override { return "Set wave priority"; }
+
+ bool runOnMachineFunction(MachineFunction &MF) override {
+ if (skipFunction(MF.getFunction()))
+ return false;
+ return AMDGPUSetWavePriority().run(MF);
+ }
+};
+
} // End anonymous namespace.
-INITIALIZE_PASS(AMDGPUSetWavePriority, DEBUG_TYPE, "Set wave priority", false,
- false)
+INITIALIZE_PASS(AMDGPUSetWavePriorityLegacy, DEBUG_TYPE, "Set wave priority",
+ false, false)
-char AMDGPUSetWavePriority::ID = 0;
+char AMDGPUSetWavePriorityLegacy::ID = 0;
FunctionPass *llvm::createAMDGPUSetWavePriorityPass() {
- return new AMDGPUSetWavePriority();
+ return new AMDGPUSetWavePriorityLegacy();
}
MachineInstr *
@@ -96,12 +108,20 @@ static bool isVMEMLoad(const MachineInstr &MI) {
return SIInstrInfo::isVMEM(MI) && MI.mayLoad();
}
-bool AMDGPUSetWavePriority::runOnMachineFunction(MachineFunction &MF) {
+PreservedAnalyses
+llvm::AMDGPUSetWavePriorityPass::run(MachineFunction &MF,
+ MachineFunctionAnalysisManager &MFAM) {
+ if (!AMDGPUSetWavePriority().run(MF))
+ return PreservedAnalyses::all();
+ return getMachineFunctionPassPreservedAnalyses();
+}
+
+bool AMDGPUSetWavePriority::run(MachineFunction &MF) {
const unsigned HighPriority = 3;
const unsigned LowPriority = 0;
Function &F = MF.getFunction();
- if (skipFunction(F) || !AMDGPU::isEntryFunctionCC(F.getCallingConv()))
+ if (!AMDGPU::isEntryFunctionCC(F.getCallingConv()))
return false;
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index b9d62cc9e4b63..857af30b348cb 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -2163,9 +2163,8 @@ void AMDGPUCodeGenPassBuilder::addPreEmitPass(AddMachinePass &addPass) const {
addPass(SILateBranchLoweringPass());
- if (isPassEnabled(EnableSetWavePriority, CodeGenOptLevel::Less)) {
- // TODO: addPass(AMDGPUSetWavePriorityPass());
- }
+ if (isPassEnabled(EnableSetWavePriority, CodeGenOptLevel::Less))
+ addPass(AMDGPUSetWavePriorityPass());
if (TM.getOptLevel() > CodeGenOptLevel::None) {
// TODO: addPass(SIPreEmitPeepholePass());
diff --git a/llvm/test/CodeGen/AMDGPU/set-wave-priority.ll b/llvm/test/CodeGen/AMDGPU/set-wave-priority.ll
index ab6877ac4e6d1..a27d1217031ca 100644
--- a/llvm/test/CodeGen/AMDGPU/set-wave-priority.ll
+++ b/llvm/test/CodeGen/AMDGPU/set-wave-priority.ll
@@ -1,6 +1,11 @@
; RUN: llc -mtriple=amdgcn -amdgpu-set-wave-priority=true -o - %s | \
; RUN: FileCheck %s
+; RUN: llc -mtriple=amdgcn -stop-after=si-late-branch-lowering -o - %s | \
+; RUN: llc -x mir -mtriple=amdgcn -passes=amdgpu-set-wave-priority -o - | \
+; RUN: llc -x mir -mtriple=amdgcn -start-after=si-late-branch-lowering -o - | \
+; RUN: FileCheck %s
+
; CHECK-LABEL: no_setprio:
; CHECK-NOT: s_setprio
; CHECK: ; return to shader part epilog
>From bedfcb1378d22a7d58cde2547fba71f7ff3f235f Mon Sep 17 00:00:00 2001
From: Akshat Oke <Akshat.Oke at amd.com>
Date: Thu, 6 Mar 2025 06:20:13 +0000
Subject: [PATCH 9/9] [AMDGPU][NPM] Port SIPreEmitPeephole to NPM
---
llvm/lib/Target/AMDGPU/AMDGPU.h | 9 +++++-
llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def | 2 +-
.../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 7 ++---
llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp | 31 ++++++++++++++-----
.../AMDGPU/insert-handle-flat-vmem-ds.mir | 1 +
...ort-exec-branches-special-instructions.mir | 1 +
.../CodeGen/AMDGPU/set-gpr-idx-peephole.mir | 1 +
7 files changed, 38 insertions(+), 14 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 27ae6d42ec21d..b8f5d85ef0b9a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -210,7 +210,7 @@ extern char &SIWholeQuadModeID;
void initializeSILowerControlFlowLegacyPass(PassRegistry &);
extern char &SILowerControlFlowLegacyID;
-void initializeSIPreEmitPeepholePass(PassRegistry &);
+void initializeSIPreEmitPeepholeLegacyPass(PassRegistry &);
extern char &SIPreEmitPeepholeID;
void initializeSILateBranchLoweringLegacyPass(PassRegistry &);
@@ -392,6 +392,13 @@ class SILateBranchLoweringPass
static bool isRequired() { return true; }
};
+class SIPreEmitPeepholePass : public PassInfoMixin<SIPreEmitPeepholePass> {
+public:
+ PreservedAnalyses run(MachineFunction &MF,
+ MachineFunctionAnalysisManager &MFAM);
+ static bool isRequired() { return true; }
+};
+
class AMDGPUSetWavePriorityPass
: public PassInfoMixin<AMDGPUSetWavePriorityPass> {
public:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index 4956897d22fde..f14499d0d3146 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -125,6 +125,7 @@ MACHINE_FUNCTION_PASS("si-optimize-exec-masking-pre-ra", SIOptimizeExecMaskingPr
MACHINE_FUNCTION_PASS("si-peephole-sdwa", SIPeepholeSDWAPass())
MACHINE_FUNCTION_PASS("si-post-ra-bundler", SIPostRABundlerPass())
MACHINE_FUNCTION_PASS("si-pre-allocate-wwm-regs", SIPreAllocateWWMRegsPass())
+MACHINE_FUNCTION_PASS("si-pre-emit-peephole", SIPreEmitPeepholePass())
MACHINE_FUNCTION_PASS("si-shrink-instructions", SIShrinkInstructionsPass())
MACHINE_FUNCTION_PASS("si-wqm", SIWholeQuadModePass())
#undef MACHINE_FUNCTION_PASS
@@ -133,7 +134,6 @@ MACHINE_FUNCTION_PASS("si-wqm", SIWholeQuadModePass())
DUMMY_MACHINE_FUNCTION_PASS("amdgpu-pre-ra-optimizations", GCNPreRAOptimizationsPass())
DUMMY_MACHINE_FUNCTION_PASS("amdgpu-rewrite-partial-reg-uses", GCNRewritePartialRegUsesPass())
-DUMMY_MACHINE_FUNCTION_PASS("si-pre-emit-peephole", SIPreEmitPeepholePass())
// TODO: Move amdgpu-preload-kern-arg-prolog to MACHINE_FUNCTION_PASS since it
// already exists.
DUMMY_MACHINE_FUNCTION_PASS("amdgpu-preload-kern-arg-prolog", AMDGPUPreloadKernArgPrologPass())
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 857af30b348cb..05eb609956199 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -539,7 +539,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeSIModeRegisterLegacyPass(*PR);
initializeSIWholeQuadModeLegacyPass(*PR);
initializeSILowerControlFlowLegacyPass(*PR);
- initializeSIPreEmitPeepholePass(*PR);
+ initializeSIPreEmitPeepholeLegacyPass(*PR);
initializeSILateBranchLoweringLegacyPass(*PR);
initializeSIMemoryLegalizerLegacyPass(*PR);
initializeSIOptimizeExecMaskingLegacyPass(*PR);
@@ -2166,9 +2166,8 @@ void AMDGPUCodeGenPassBuilder::addPreEmitPass(AddMachinePass &addPass) const {
if (isPassEnabled(EnableSetWavePriority, CodeGenOptLevel::Less))
addPass(AMDGPUSetWavePriorityPass());
- if (TM.getOptLevel() > CodeGenOptLevel::None) {
- // TODO: addPass(SIPreEmitPeepholePass());
- }
+ if (TM.getOptLevel() > CodeGenOptLevel::None)
+ addPass(SIPreEmitPeepholePass());
// The hazard recognizer that runs as part of the post-ra scheduler does not
// guarantee to be able to handle all hazards correctly. This is because if there
diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
index 2bb70c138a50c..9db2118f2997b 100644
--- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
@@ -24,7 +24,7 @@ using namespace llvm;
namespace {
-class SIPreEmitPeephole : public MachineFunctionPass {
+class SIPreEmitPeephole {
private:
const SIInstrInfo *TII = nullptr;
const SIRegisterInfo *TRI = nullptr;
@@ -40,24 +40,31 @@ class SIPreEmitPeephole : public MachineFunctionPass {
const MachineBasicBlock &To) const;
bool removeExeczBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB);
+public:
+ bool run(MachineFunction &MF);
+};
+
+class SIPreEmitPeepholeLegacy : public MachineFunctionPass {
public:
static char ID;
- SIPreEmitPeephole() : MachineFunctionPass(ID) {
- initializeSIPreEmitPeepholePass(*PassRegistry::getPassRegistry());
+ SIPreEmitPeepholeLegacy() : MachineFunctionPass(ID) {
+ initializeSIPreEmitPeepholeLegacyPass(*PassRegistry::getPassRegistry());
}
- bool runOnMachineFunction(MachineFunction &MF) override;
+ bool runOnMachineFunction(MachineFunction &MF) override {
+ return SIPreEmitPeephole().run(MF);
+ }
};
} // End anonymous namespace.
-INITIALIZE_PASS(SIPreEmitPeephole, DEBUG_TYPE,
+INITIALIZE_PASS(SIPreEmitPeepholeLegacy, DEBUG_TYPE,
"SI peephole optimizations", false, false)
-char SIPreEmitPeephole::ID = 0;
+char SIPreEmitPeepholeLegacy::ID = 0;
-char &llvm::SIPreEmitPeepholeID = SIPreEmitPeephole::ID;
+char &llvm::SIPreEmitPeepholeID = SIPreEmitPeepholeLegacy::ID;
bool SIPreEmitPeephole::optimizeVccBranch(MachineInstr &MI) const {
// Match:
@@ -410,7 +417,15 @@ bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI,
return true;
}
-bool SIPreEmitPeephole::runOnMachineFunction(MachineFunction &MF) {
+PreservedAnalyses
+llvm::SIPreEmitPeepholePass::run(MachineFunction &MF,
+ MachineFunctionAnalysisManager &MFAM) {
+ if (!SIPreEmitPeephole().run(MF))
+ return PreservedAnalyses::all();
+ return getMachineFunctionPassPreservedAnalyses();
+}
+
+bool SIPreEmitPeephole::run(MachineFunction &MF) {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
TII = ST.getInstrInfo();
TRI = &TII->getRegisterInfo();
diff --git a/llvm/test/CodeGen/AMDGPU/insert-handle-flat-vmem-ds.mir b/llvm/test/CodeGen/AMDGPU/insert-handle-flat-vmem-ds.mir
index d89f306c96a36..2e8c8ca9c7a6c 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-handle-flat-vmem-ds.mir
+++ b/llvm/test/CodeGen/AMDGPU/insert-handle-flat-vmem-ds.mir
@@ -1,5 +1,6 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=amdgcn -mcpu=polaris10 -run-pass si-pre-emit-peephole -verify-machineinstrs %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=polaris10 -passes si-pre-emit-peephole -verify-machineinstrs %s -o - | FileCheck %s
---
diff --git a/llvm/test/CodeGen/AMDGPU/remove-short-exec-branches-special-instructions.mir b/llvm/test/CodeGen/AMDGPU/remove-short-exec-branches-special-instructions.mir
index 20de119471ba3..92a9a195fc4c7 100644
--- a/llvm/test/CodeGen/AMDGPU/remove-short-exec-branches-special-instructions.mir
+++ b/llvm/test/CodeGen/AMDGPU/remove-short-exec-branches-special-instructions.mir
@@ -1,5 +1,6 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=si-pre-emit-peephole -verify-machineinstrs %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -passes=si-pre-emit-peephole -verify-machineinstrs %s -o - | FileCheck %s
# Make sure mandatory skips are not removed around mode defs.
---
diff --git a/llvm/test/CodeGen/AMDGPU/set-gpr-idx-peephole.mir b/llvm/test/CodeGen/AMDGPU/set-gpr-idx-peephole.mir
index 796a70cfe8a39..1d0a6db36ea3b 100644
--- a/llvm/test/CodeGen/AMDGPU/set-gpr-idx-peephole.mir
+++ b/llvm/test/CodeGen/AMDGPU/set-gpr-idx-peephole.mir
@@ -1,5 +1,6 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass si-pre-emit-peephole -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s -implicit-check-not=S_SET_GPR_IDX
+# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -passes si-pre-emit-peephole -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s -implicit-check-not=S_SET_GPR_IDX
---
name: simple
More information about the llvm-branch-commits
mailing list