[llvm] [AMDGPU] IGLP: Fix static variables (PR #137549)
Robert Imschweiler via llvm-commits
llvm-commits at lists.llvm.org
Sun Apr 27 14:09:32 PDT 2025
https://github.com/ro-i created https://github.com/llvm/llvm-project/pull/137549
Replace global and class-level static variables with instance members and guarantee thread safety.
@arsenm as discussed
Some notable implications of these changes:
- `MFMASmallGemmSingleWaveOpt::applyIGLPStrategy()`: no longer caches results between scheduling phases
- `MFMAExpInterleaveOpt::analyzeDAG()`: now also has to run on the post-RA DAG, which particularly affects the dependency handling (see the sketches below)
I tried to keep the changes as non-invasive as possible and free of functional changes.
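For context, a minimal sketch of the pattern being fixed (simplified names, not the actual LLVM classes; the patch itself keeps the existing `NumSchedGroups` counter and updates it via the `__atomic_fetch_add` builtin, which is equivalent to the `std::atomic` fetch-add shown here):

```cpp
#include <atomic>

// Before: all instances share mutable static state, which races when two
// schedulers run concurrently in one process.
struct SchedGroupBefore {
  static unsigned NumSchedGroups; // unsynchronized
  unsigned SGID;
  SchedGroupBefore() { SGID = NumSchedGroups++; } // data race
};
unsigned SchedGroupBefore::NumSchedGroups = 0;

// After: per-DAG analysis results become instance members; the one counter
// that must remain shared is updated atomically.
struct SchedGroupAfter {
  unsigned SGID;
  SchedGroupAfter() {
    static std::atomic<unsigned> NumSchedGroups{0};
    SGID = NumSchedGroups.fetch_add(1, std::memory_order_seq_cst);
  }
};
```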
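And a sketch of the new kind-filtered reachability query (a hypothetical standalone form; the real implementation lives in `ScheduleDAGTopologicalSort::DFS`/`IsReachable` and additionally exploits the topological index bounds):

```cpp
#include <optional>
#include <vector>

enum class DepKind { Data, Anti, Output, Order };

struct Dep {
  unsigned SuccIdx;
  DepKind Kind;
};
struct Node {
  std::vector<Dep> Succs;
};

// Returns true if Target is reachable from Src. If OnlyDepKind is given,
// only edges of that kind are followed -- e.g. DepKind::Data for post-RA
// queries, where anti/output edges would yield false positives.
bool isReachable(const std::vector<Node> &DAG, unsigned Src, unsigned Target,
                 std::optional<DepKind> OnlyDepKind = std::nullopt) {
  std::vector<bool> Visited(DAG.size(), false);
  std::vector<unsigned> WorkList{Src};
  while (!WorkList.empty()) {
    unsigned N = WorkList.back();
    WorkList.pop_back();
    if (N == Target)
      return true;
    if (Visited[N])
      continue;
    Visited[N] = true;
    for (const Dep &D : DAG[N].Succs)
      if (!OnlyDepKind || D.Kind == *OnlyDepKind)
        WorkList.push_back(D.SuccIdx);
  }
  return false;
}
```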
From 0f693669e4e9682f69907932e46e219c09d421c2 Mon Sep 17 00:00:00 2001
From: Robert Imschweiler <robert.imschweiler at amd.com>
Date: Sun, 27 Apr 2025 15:27:29 -0500
Subject: [PATCH] [AMDGPU] IGLP: Fix static variables
Replace global and class-level static variables with instance members and
guarantee thread safety.
---
llvm/include/llvm/CodeGen/ScheduleDAG.h | 7 +-
llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h | 6 +-
llvm/lib/CodeGen/ScheduleDAG.cpp | 12 +-
llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp | 191 ++++++++----------
4 files changed, 106 insertions(+), 110 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/ScheduleDAG.h b/llvm/include/llvm/CodeGen/ScheduleDAG.h
index 1c8d92d149adc..f57bc55bf3131 100644
--- a/llvm/include/llvm/CodeGen/ScheduleDAG.h
+++ b/llvm/include/llvm/CodeGen/ScheduleDAG.h
@@ -734,7 +734,8 @@ class TargetRegisterInfo;
/// Makes a DFS traversal and marks all nodes affected by the edge insertion.
/// These nodes will later get new topological indexes by means of the Shift
/// method.
- void DFS(const SUnit *SU, int UpperBound, bool& HasLoop);
+ void DFS(const SUnit *SU, int UpperBound, bool &HasLoop,
+ std::optional<SDep::Kind> OnlyDepKind = std::nullopt);
/// Reassigns topological indexes for the nodes in the DAG to
/// preserve the topological ordering.
@@ -767,7 +768,9 @@ class TargetRegisterInfo;
bool &Success);
/// Checks if \p SU is reachable from \p TargetSU.
- bool IsReachable(const SUnit *SU, const SUnit *TargetSU);
+ /// If OnlyDepKind is given, consider only dependencies of this kind.
+ bool IsReachable(const SUnit *SU, const SUnit *TargetSU,
+ std::optional<SDep::Kind> OnlyDepKind = std::nullopt);
/// Returns true if addPred(TargetSU, SU) creates a cycle.
bool WillCreateCycle(SUnit *TargetSU, SUnit *SU);
diff --git a/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h b/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h
index e79b03c57a1e8..f1d8852c75005 100644
--- a/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h
+++ b/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h
@@ -287,8 +287,10 @@ namespace llvm {
}
/// IsReachable - Checks if SU is reachable from TargetSU.
- bool IsReachable(SUnit *SU, SUnit *TargetSU) {
- return Topo.IsReachable(SU, TargetSU);
+ /// If OnlyDepKind is given, only dependencies of this kind are considered.
+ bool IsReachable(SUnit *SU, SUnit *TargetSU,
+ std::optional<SDep::Kind> OnlyDepKind = std::nullopt) {
+ return Topo.IsReachable(SU, TargetSU, OnlyDepKind);
}
/// Whether regions with a single MI should be scheduled.
diff --git a/llvm/lib/CodeGen/ScheduleDAG.cpp b/llvm/lib/CodeGen/ScheduleDAG.cpp
index 26857edd871e2..dec508f2f763d 100644
--- a/llvm/lib/CodeGen/ScheduleDAG.cpp
+++ b/llvm/lib/CodeGen/ScheduleDAG.cpp
@@ -570,7 +570,8 @@ void ScheduleDAGTopologicalSort::RemovePred(SUnit *M, SUnit *N) {
}
void ScheduleDAGTopologicalSort::DFS(const SUnit *SU, int UpperBound,
- bool &HasLoop) {
+ bool &HasLoop,
+ std::optional<SDep::Kind> OnlyDepKind) {
std::vector<const SUnit*> WorkList;
WorkList.reserve(SUnits.size());
@@ -580,6 +581,8 @@ void ScheduleDAGTopologicalSort::DFS(const SUnit *SU, int UpperBound,
WorkList.pop_back();
Visited.set(SU->NodeNum);
for (const SDep &SuccDep : llvm::reverse(SU->Succs)) {
+ if (OnlyDepKind && SuccDep.getKind() != *OnlyDepKind)
+ continue;
unsigned s = SuccDep.getSUnit()->NodeNum;
// Edges to non-SUnits are allowed but ignored (e.g. ExitSU).
if (s >= Node2Index.size())
@@ -722,8 +725,9 @@ void ScheduleDAGTopologicalSort::AddSUnitWithoutPredecessors(const SUnit *SU) {
Visited.resize(Node2Index.size());
}
-bool ScheduleDAGTopologicalSort::IsReachable(const SUnit *SU,
- const SUnit *TargetSU) {
+bool ScheduleDAGTopologicalSort::IsReachable(
+ const SUnit *SU, const SUnit *TargetSU,
+ std::optional<SDep::Kind> OnlyDepKind) {
assert(TargetSU != nullptr && "Invalid target SUnit");
assert(SU != nullptr && "Invalid SUnit");
FixOrder();
@@ -737,7 +741,7 @@ bool ScheduleDAGTopologicalSort::IsReachable(const SUnit *SU,
if (LowerBound < UpperBound) {
Visited.reset();
// There may be a path from TargetSU to SU. Check for it.
- DFS(TargetSU, UpperBound, HasLoop);
+ DFS(TargetSU, UpperBound, HasLoop, OnlyDepKind);
}
return HasLoop;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index a2b7cd088093a..4e5650d18b35a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -229,13 +229,13 @@ class SchedGroup {
SchedGroup(SchedGroupMask SGMask, std::optional<unsigned> MaxSize,
ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
: SGMask(SGMask), MaxSize(MaxSize), DAG(DAG), TII(TII) {
- SGID = NumSchedGroups++;
+ SGID = __atomic_fetch_add(&NumSchedGroups, 1, __ATOMIC_SEQ_CST);
}
SchedGroup(SchedGroupMask SGMask, std::optional<unsigned> MaxSize, int SyncID,
ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
: SGMask(SGMask), MaxSize(MaxSize), SyncID(SyncID), DAG(DAG), TII(TII) {
- SGID = NumSchedGroups++;
+ SGID = __atomic_fetch_add(&NumSchedGroups, 1, __ATOMIC_SEQ_CST);
}
};
@@ -887,26 +887,26 @@ bool MFMASmallGemmOpt::applyIGLPStrategy(
class MFMAExpInterleaveOpt final : public IGLPStrategy {
private:
// The count of TRANS SUs involved in the interleaved pipeline
- static unsigned TransPipeCount;
+ unsigned TransPipeCount = 0;
// The count of MFMA SUs involved in the interleaved pipeline
- static unsigned MFMAPipeCount;
+ unsigned MFMAPipeCount = 0;
// The count of Add SUs involved in the interleaved pipeline
- static unsigned AddPipeCount;
+ unsigned AddPipeCount = 0;
// The number of transitive MFMA successors for each TRANS SU
- static unsigned MFMAEnablement;
+ unsigned MFMAEnablement = 0;
// The number of transitive TRANS predecessors for each MFMA SU
- static unsigned ExpRequirement;
+ unsigned ExpRequirement = 0;
// The count of independent "chains" of MFMA instructions in the pipeline
- static unsigned MFMAChains;
+ unsigned MFMAChains = 0;
// The length of each independent "chain" of MFMA instructions
- static unsigned MFMAChainLength;
+ unsigned MFMAChainLength = 0;
// Whether or not the pipeline has V_CVT instructions
- static bool HasCvt;
+ bool HasCvt = false;
// Whether or not there are instructions between the TRANS instruction and
// V_CVT
- static bool HasChainBetweenCvt;
+ bool HasChainBetweenCvt = false;
// The first occurring DS_READ which feeds an MFMA chain
- static std::optional<unsigned> FirstPipeDSR;
+ std::optional<unsigned> FirstPipeDSR = std::nullopt;
// The MFMAPipe SUs with no MFMA predecessors
SmallVector<SUnit *, 4> MFMAChainSeeds;
// Compute the heuristics for the pipeline, returning whether or not the DAG
@@ -1325,17 +1325,8 @@ class MFMAExpInterleaveOpt final : public IGLPStrategy {
}
};
-unsigned MFMAExpInterleaveOpt::TransPipeCount = 0;
-unsigned MFMAExpInterleaveOpt::MFMAPipeCount = 0;
-unsigned MFMAExpInterleaveOpt::AddPipeCount = 0;
-unsigned MFMAExpInterleaveOpt::MFMAEnablement = 0;
-unsigned MFMAExpInterleaveOpt::ExpRequirement = 0;
-unsigned MFMAExpInterleaveOpt::MFMAChains = 0;
-unsigned MFMAExpInterleaveOpt::MFMAChainLength = 0;
-bool MFMAExpInterleaveOpt::HasCvt = false;
-bool MFMAExpInterleaveOpt::HasChainBetweenCvt = false;
-std::optional<unsigned> MFMAExpInterleaveOpt::FirstPipeDSR = std::nullopt;
-
+// Note: we only want to check for true (data) dependencies (see SDep::Kind)
+// so that the logic also works in the PostRA phase.
bool MFMAExpInterleaveOpt::analyzeDAG(const SIInstrInfo *TII) {
SmallVector<SUnit *, 10> ExpPipeCands;
SmallVector<SUnit *, 10> MFMAPipeCands;
@@ -1353,17 +1344,25 @@ bool MFMAExpInterleaveOpt::analyzeDAG(const SIInstrInfo *TII) {
auto isAdd = [](unsigned Opc) { return Opc == AMDGPU::V_ADD_F32_e32; };
+ // Heuristic helper function (see below)
+ auto IsFMACDataDep = [](SDep &Dep) {
+ return Dep.getKind() == SDep::Kind::Data &&
+ Dep.getSUnit()->getInstr()->getOpcode() == AMDGPU::V_FMAC_F32_e32;
+ };
+
AddPipeCount = 0;
for (SUnit &SU : DAG->SUnits) {
auto Opc = SU.getInstr()->getOpcode();
if (TII->isTRANS(Opc)) {
// Avoid counting a potential bonus V_EXP which all the MFMA depend on
- if (SU.Succs.size() >= 7)
+ // FIXME: This heuristic needs improvement/clarification!
+ // In general, the pipeline seems to look like this:
+ // fma_f32 -> exp_f32 -> cvt_f16_f32 -> v_pack_b32_f16 -> mfma_.._f16
+ // (with potential arithmetic between exp and cvt)
+ // see
+ // https://github.com/llvm/llvm-project/pull/80370#discussion_r1483660378
+ if (SU.Succs.size() >= 7 && any_of(SU.Succs, IsFMACDataDep))
continue;
- for (auto &Succ : SU.Succs) {
- if (Succ.getSUnit()->Succs.size() >= 7)
- continue;
- }
ExpPipeCands.push_back(&SU);
}
@@ -1390,7 +1389,7 @@ bool MFMAExpInterleaveOpt::analyzeDAG(const SIInstrInfo *TII) {
// Count the number of EXPs that reach an MFMA
for (auto &PredSU : ExpPipeCands) {
for (auto &SuccSU : MFMAPipeCands) {
- if (DAG->IsReachable(SuccSU, PredSU)) {
+ if (DAG->IsReachable(SuccSU, PredSU, SDep::Kind::Data)) {
if (!TempExp) {
TempExp = PredSU;
TempMFMA = SuccSU;
@@ -1418,7 +1417,7 @@ bool MFMAExpInterleaveOpt::analyzeDAG(const SIInstrInfo *TII) {
continue;
for (auto &PredSU : ExpPipeCands) {
- if (DAG->IsReachable(SuccSU, PredSU)) {
+ if (DAG->IsReachable(SuccSU, PredSU, SDep::Kind::Data)) {
MFMAPipeSUs.push_back(SuccSU);
break;
}
@@ -1432,7 +1431,7 @@ bool MFMAExpInterleaveOpt::analyzeDAG(const SIInstrInfo *TII) {
std::optional<SUnit *> TempCvt;
for (auto &SuccSU : CvtSUs) {
- if (DAG->IsReachable(SuccSU, *TempExp)) {
+ if (DAG->IsReachable(SuccSU, *TempExp, SDep::Kind::Data)) {
TempCvt = SuccSU;
break;
}
@@ -1441,13 +1440,14 @@ bool MFMAExpInterleaveOpt::analyzeDAG(const SIInstrInfo *TII) {
HasCvt = false;
if (TempCvt.has_value()) {
for (auto &SuccSU : MFMAPipeSUs) {
- if (DAG->IsReachable(SuccSU, *TempCvt)) {
+ if (DAG->IsReachable(SuccSU, *TempCvt, SDep::Kind::Data)) {
HasCvt = true;
break;
}
}
}
+ MFMAChainSeeds.clear();
MFMAChains = 0;
for (auto &MFMAPipeSU : MFMAPipeSUs) {
if (is_contained(MFMAChainSeeds, MFMAPipeSU))
@@ -1474,7 +1474,7 @@ bool MFMAExpInterleaveOpt::analyzeDAG(const SIInstrInfo *TII) {
// The number of bit pack operations that depend on a single V_EXP
unsigned PackSuccCount =
llvm::count_if(PackSUs, [this, &TempExp](SUnit *VPack) {
- return DAG->IsReachable(VPack, *TempExp);
+ return DAG->IsReachable(VPack, *TempExp, SDep::Kind::Data);
});
// The number of bit pack operations an MFMA depends on
@@ -1504,10 +1504,10 @@ bool MFMAExpInterleaveOpt::analyzeDAG(const SIInstrInfo *TII) {
MFMAEnablement *= PackSuccCount;
// The number of V_EXPs required to resolve all dependencies for an MFMA
- ExpRequirement =
- llvm::count_if(ExpPipeCands, [this, &PackPred](SUnit *ExpBase) {
- return DAG->IsReachable(PackPred->getSUnit(), ExpBase);
- });
+ ExpRequirement = llvm::count_if(ExpPipeCands, [this,
+ &PackPred](SUnit *ExpBase) {
+ return DAG->IsReachable(PackPred->getSUnit(), ExpBase, SDep::Kind::Data);
+ });
ExpRequirement *= PackPredCount;
return true;
@@ -1518,12 +1518,7 @@ bool MFMAExpInterleaveOpt::shouldApplyStrategy(ScheduleDAGInstrs *DAG,
const GCNSubtarget &ST = DAG->MF.getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
- if (Phase != AMDGPU::SchedulingPhase::PostRA)
- MFMAChainSeeds.clear();
- if (Phase != AMDGPU::SchedulingPhase::PostRA && !analyzeDAG(TII))
- return false;
-
- return true;
+ return analyzeDAG(TII);
}
bool MFMAExpInterleaveOpt::applyIGLPStrategy(
@@ -1550,18 +1545,18 @@ bool MFMAExpInterleaveOpt::applyIGLPStrategy(
unsigned CurrMFMAForTransPosition = 0;
auto incrementTransPosition = [&MFMAChain, &PositionInChain,
- &CurrMFMAForTransPosition]() {
+ &CurrMFMAForTransPosition, this]() {
CurrMFMAForTransPosition += MFMAEnablement;
PositionInChain = (CurrMFMAForTransPosition / MFMAChains);
MFMAChain = CurrMFMAForTransPosition % MFMAChains;
};
- auto getNextTransPositionInChain = [&CurrMFMAForTransPosition]() {
+ auto getNextTransPositionInChain = [&CurrMFMAForTransPosition, this]() {
auto TempMFMAForTrans = CurrMFMAForTransPosition + MFMAEnablement;
return (TempMFMAForTrans / MFMAChains);
};
- auto getNextTransMFMAChain = [&CurrMFMAForTransPosition]() {
+ auto getNextTransMFMAChain = [&CurrMFMAForTransPosition, this]() {
auto TempMFMAForTrans = CurrMFMAForTransPosition + MFMAEnablement;
return TempMFMAForTrans % MFMAChains;
};
@@ -1571,7 +1566,7 @@ bool MFMAExpInterleaveOpt::applyIGLPStrategy(
unsigned PositionInChainForMFMA = 0;
auto incrementMFMAPosition = [&CurrMFMAPosition, &MFMAChainForMFMA,
- &PositionInChainForMFMA]() {
+ &PositionInChainForMFMA, this]() {
++CurrMFMAPosition;
MFMAChainForMFMA = CurrMFMAPosition % MFMAChains;
PositionInChainForMFMA = CurrMFMAPosition / MFMAChains;
@@ -2062,22 +2057,16 @@ class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy {
}
};
-static unsigned DSWCount = 0;
-static unsigned DSWWithPermCount = 0;
-static unsigned DSWWithSharedVMEMCount = 0;
-
bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
AMDGPU::SchedulingPhase Phase) {
unsigned MFMACount = 0;
unsigned DSRCount = 0;
+ unsigned DSWCount = 0;
+ unsigned DSWWithPermCount = 0;
+ unsigned DSWWithSharedVMEMCount = 0;
- bool IsInitial = Phase == AMDGPU::SchedulingPhase::Initial;
-
- assert((!IsInitial || (DSWCount == 0 && DSWWithPermCount == 0 &&
- DSWWithSharedVMEMCount == 0)) &&
- "DSWCounters should be zero in pre-RA scheduling!");
SmallVector<SUnit *, 6> DSWithPerms;
for (auto &SU : DAG->SUnits) {
auto *I = SU.getInstr();
@@ -2086,7 +2075,7 @@ bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
else if (TII->isDS(*I)) {
if (I->mayLoad())
++DSRCount;
- else if (I->mayStore() && IsInitial) {
+ else if (I->mayStore()) {
++DSWCount;
for (auto Pred : SU.Preds) {
if (Pred.getSUnit()->getInstr()->getOpcode() ==
@@ -2099,58 +2088,56 @@ bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
}
}
- if (IsInitial) {
- DSWWithPermCount = DSWithPerms.size();
- auto *I = DSWithPerms.begin();
- auto *E = DSWithPerms.end();
-
- // Get the count of DS_WRITES with V_PERM predecessors which
- // have loop carried dependencies (WAR) on the same VMEM_READs.
- // We consider partial overlap as a miss -- in other words,
- // for a given DS_W, we only consider another DS_W as matching
- // if there is a corresponding (in terms of the VMEM_R it uses) V_PERM pred
- // for every V_PERM pred of this DS_W.
- DenseMap<MachineInstr *, SUnit *> VMEMLookup;
- SmallVector<SUnit *, 6> Counted;
- for (; I != E; I++) {
- SUnit *Cand = nullptr;
- bool MissedAny = false;
- for (auto &Pred : (*I)->Preds) {
- if (Pred.getSUnit()->getInstr()->getOpcode() != AMDGPU::V_PERM_B32_e64)
- continue;
+ DSWWithPermCount = DSWithPerms.size();
+ auto *I = DSWithPerms.begin();
+ auto *E = DSWithPerms.end();
+
+ // Get the count of DS_WRITES with V_PERM predecessors which
+ // have loop carried dependencies (WAR) on the same VMEM_READs.
+ // We consider partial overlap as a miss -- in other words,
+ // for a given DS_W, we only consider another DS_W as matching
+ // if there is a corresponding (in terms of the VMEM_R it uses) V_PERM pred
+ // for every V_PERM pred of this DS_W.
+ DenseMap<MachineInstr *, SUnit *> VMEMLookup;
+ SmallVector<SUnit *, 6> Counted;
+ for (; I != E; I++) {
+ SUnit *Cand = nullptr;
+ bool MissedAny = false;
+ for (auto &Pred : (*I)->Preds) {
+ if (Pred.getSUnit()->getInstr()->getOpcode() != AMDGPU::V_PERM_B32_e64)
+ continue;
- if (Cand && llvm::is_contained(Counted, Cand))
- break;
+ if (Cand && llvm::is_contained(Counted, Cand))
+ break;
- for (auto &Succ : Pred.getSUnit()->Succs) {
- auto *MI = Succ.getSUnit()->getInstr();
- if (!TII->isVMEM(*MI) || !MI->mayLoad())
- continue;
+ for (auto &Succ : Pred.getSUnit()->Succs) {
+ auto *MI = Succ.getSUnit()->getInstr();
+ if (!TII->isVMEM(*MI) || !MI->mayLoad())
+ continue;
- if (MissedAny || !VMEMLookup.size()) {
- MissedAny = true;
- VMEMLookup[MI] = *I;
- continue;
- }
+ if (MissedAny || !VMEMLookup.size()) {
+ MissedAny = true;
+ VMEMLookup[MI] = *I;
+ continue;
+ }
- auto [It, Inserted] = VMEMLookup.try_emplace(MI, *I);
- if (Inserted) {
- MissedAny = true;
- continue;
- }
+ auto [It, Inserted] = VMEMLookup.try_emplace(MI, *I);
+ if (Inserted) {
+ MissedAny = true;
+ continue;
+ }
- Cand = It->second;
- if (llvm::is_contained(Counted, Cand)) {
- MissedAny = true;
- break;
- }
+ Cand = It->second;
+ if (llvm::is_contained(Counted, Cand)) {
+ MissedAny = true;
+ break;
}
}
- if (!MissedAny && Cand) {
- DSWWithSharedVMEMCount += 2;
- Counted.push_back(Cand);
- Counted.push_back(*I);
- }
+ }
+ if (!MissedAny && Cand) {
+ DSWWithSharedVMEMCount += 2;
+ Counted.push_back(Cand);
+ Counted.push_back(*I);
}
}