[llvm] [AMDGPU] IGLP: Fix static variables (PR #137549)
Robert Imschweiler via llvm-commits
llvm-commits at lists.llvm.org
Wed Nov 26 07:15:50 PST 2025
https://github.com/ro-i updated https://github.com/llvm/llvm-project/pull/137549
From f67dbe3da7b601e1cd1fb02b081fb259842560bb Mon Sep 17 00:00:00 2001
From: Robert Imschweiler <robert.imschweiler at amd.com>
Date: Wed, 26 Nov 2025 09:14:11 -0600
Subject: [PATCH] [AMDGPU] IGLP: Fix static variables
Replace global and class-level static variables with instance members,
caching results that must survive across scheduling phases in
SIMachineFunctionInfo, to guarantee thread safety.
---
llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp | 210 ++++++++++++------
.../lib/Target/AMDGPU/SIMachineFunctionInfo.h | 60 ++++-
.../CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll | 28 +++
3 files changed, 231 insertions(+), 67 deletions(-)
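Note for readers skimming the diff: the mechanism is a per-instruction
analysis cache keyed by the IGLP_OPT MachineInstr and stored on the
function info, so every scheduling region and phase that sees the same
intrinsic reuses one analysis instead of sharing mutable statics. A
minimal standalone sketch of that pattern follows — illustrative names
only: CachedAnalysis and FunctionInfo stand in for the real
MFMAExpInterleaveCache and SIMachineFunctionInfo, and the real code uses
a DenseMap rather than std::unordered_map.

  #include <cassert>
  #include <unordered_map>

  // Stand-in for one of the analysis structs the patch adds to
  // SIMachineFunctionInfo (e.g. MFMAExpInterleaveCache).
  struct CachedAnalysis {
    unsigned TransPipeCount = 0;
    bool AnalysisResult = false;
  };

  // Stand-in for SIMachineFunctionInfo, which owns the per-instruction
  // caches for the whole machine function.
  class FunctionInfo {
    std::unordered_map<const void *, CachedAnalysis> Caches;

  public:
    // Return the cache entry for this instruction, or nullptr if no
    // earlier scheduling phase has stored one yet.
    const CachedAnalysis *getCache(const void *MI) const {
      auto It = Caches.find(MI);
      return It == Caches.end() ? nullptr : &It->second;
    }

    void setCache(const void *MI, const CachedAnalysis &C) {
      assert(MI && "cache key must not be null");
      Caches[MI] = C;
    }
  };

Keying by the intrinsic's MachineInstr means two iglp_opt calls in
different functions (or regions) can never observe each other's state,
which is exactly the bug the new test below pins down.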
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index 85addb13aef8d..e2673566f2653 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -821,6 +821,8 @@ class IGLPStrategy {
const SIInstrInfo *TII;
+ const MachineInstr *IGLPOptMI;
+
public:
/// Add SchedGroups to \p SyncedSchedGroups to implement this Strategy.
virtual bool applyIGLPStrategy(
@@ -834,8 +836,9 @@ class IGLPStrategy {
bool IsBottomUp = true;
- IGLPStrategy(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
- : DAG(DAG), TII(TII) {}
+ IGLPStrategy(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII,
+ const MachineInstr *IGLPOptMI)
+ : DAG(DAG), TII(TII), IGLPOptMI(IGLPOptMI) {}
virtual ~IGLPStrategy() = default;
};
@@ -853,8 +856,9 @@ class MFMASmallGemmOpt final : public IGLPStrategy {
return true;
}
- MFMASmallGemmOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
- : IGLPStrategy(DAG, TII) {
+ MFMASmallGemmOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII,
+ const MachineInstr *IGLPOptMI)
+ : IGLPStrategy(DAG, TII, IGLPOptMI) {
IsBottomUp = true;
}
};
@@ -887,31 +891,36 @@ bool MFMASmallGemmOpt::applyIGLPStrategy(
class MFMAExpInterleaveOpt final : public IGLPStrategy {
private:
// The count of TRANS SUs involved in the interleaved pipeline
- static unsigned TransPipeCount;
+ unsigned TransPipeCount = 0;
// The count of MFMA SUs involved in the interleaved pipeline
- static unsigned MFMAPipeCount;
+ unsigned MFMAPipeCount = 0;
// The count of Add SUs involved in the interleaved pipeline
- static unsigned AddPipeCount;
+ unsigned AddPipeCount = 0;
// The number of transitive MFMA successors for each TRANS SU
- static unsigned MFMAEnablement;
+ unsigned MFMAEnablement = 0;
// The number of transitive TRANS predecessors for each MFMA SU
- static unsigned ExpRequirement;
+ unsigned ExpRequirement = 0;
// The count of independent "chains" of MFMA instructions in the pipeline
- static unsigned MFMAChains;
+ unsigned MFMAChains = 0;
// The length of each independent "chain" of MFMA instructions
- static unsigned MFMAChainLength;
+ unsigned MFMAChainLength = 0;
// Whether or not the pipeline has V_CVT instructions
- static bool HasCvt;
+ bool HasCvt = false;
// Whether or not there are instructions between the TRANS instruction and
// V_CVT
- static bool HasChainBetweenCvt;
+ bool HasChainBetweenCvt = false;
// The first occurring DS_READ which feeds an MFMA chain
- static std::optional<unsigned> FirstPipeDSR;
+ std::optional<unsigned> FirstPipeDSR = std::nullopt;
// The MFMAPipe SUs with no MFMA predecessors
SmallVector<SUnit *, 4> MFMAChainSeeds;
// Compute the heuristics for the pipeline, returning whether or not the DAG
// is well formatted for the mutation
- bool analyzeDAG(const SIInstrInfo *TII);
+ bool analyzeDAG(const SIInstrInfo *TII, AMDGPU::SchedulingPhase Phase);
+ bool computeDAGAnalysis(const SIInstrInfo *TII,
+ SIMachineFunctionInfo::MFMAExpInterleaveCache &Cache);
+ void initializeFromCache(
+ const SIMachineFunctionInfo::MFMAExpInterleaveCache &Cache);
+ bool AnalysisResult;
/// Whether or not the instruction is a transitive predecessor of an MFMA
/// instruction
@@ -1316,29 +1325,22 @@ class MFMAExpInterleaveOpt final : public IGLPStrategy {
bool shouldApplyStrategy(ScheduleDAGInstrs *DAG,
AMDGPU::SchedulingPhase Phase) override;
- MFMAExpInterleaveOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
- : IGLPStrategy(DAG, TII) {
+ MFMAExpInterleaveOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII,
+ const MachineInstr *IGLPOptMI)
+ : IGLPStrategy(DAG, TII, IGLPOptMI) {
IsBottomUp = false;
}
};
-unsigned MFMAExpInterleaveOpt::TransPipeCount = 0;
-unsigned MFMAExpInterleaveOpt::MFMAPipeCount = 0;
-unsigned MFMAExpInterleaveOpt::AddPipeCount = 0;
-unsigned MFMAExpInterleaveOpt::MFMAEnablement = 0;
-unsigned MFMAExpInterleaveOpt::ExpRequirement = 0;
-unsigned MFMAExpInterleaveOpt::MFMAChains = 0;
-unsigned MFMAExpInterleaveOpt::MFMAChainLength = 0;
-bool MFMAExpInterleaveOpt::HasCvt = false;
-bool MFMAExpInterleaveOpt::HasChainBetweenCvt = false;
-std::optional<unsigned> MFMAExpInterleaveOpt::FirstPipeDSR = std::nullopt;
-
-bool MFMAExpInterleaveOpt::analyzeDAG(const SIInstrInfo *TII) {
+bool MFMAExpInterleaveOpt::computeDAGAnalysis(
+ const SIInstrInfo *TII,
+ SIMachineFunctionInfo::MFMAExpInterleaveCache &Cache) {
SmallVector<SUnit *, 10> ExpPipeCands;
SmallVector<SUnit *, 10> MFMAPipeCands;
SmallVector<SUnit *, 10> MFMAPipeSUs;
SmallVector<SUnit *, 10> PackSUs;
SmallVector<SUnit *, 10> CvtSUs;
+ const MachineInstr *FirstPipeDSRInstr = nullptr;
auto isBitPack = [](unsigned Opc) {
return Opc == AMDGPU::V_PACK_B32_F16_e64 || Opc == AMDGPU::V_PERM_B32_e64;
@@ -1355,12 +1357,14 @@ bool MFMAExpInterleaveOpt::analyzeDAG(const SIInstrInfo *TII) {
auto Opc = SU.getInstr()->getOpcode();
if (TII->isTRANS(Opc)) {
// Avoid counting a potential bonus V_EXP which all the MFMA depend on
+ // FIXME: This heuristic needs improvement/clarification!
+ // In general, the pipeline seems to look like this:
+ // fma_f32 -> exp_f32 -> cvt_f16_f32 -> v_pack_b32_f16 -> mfma_.._f16
+ // (with potential arithmetic between exp and cvt)
+ // see
+ // https://github.com/llvm/llvm-project/pull/80370#discussion_r1483660378
if (SU.Succs.size() >= 7)
continue;
- for (auto &Succ : SU.Succs) {
- if (Succ.getSUnit()->Succs.size() >= 7)
- continue;
- }
ExpPipeCands.push_back(&SU);
}
@@ -1445,6 +1449,7 @@ bool MFMAExpInterleaveOpt::analyzeDAG(const SIInstrInfo *TII) {
}
}
+ MFMAChainSeeds.clear();
MFMAChains = 0;
for (auto &MFMAPipeSU : MFMAPipeSUs) {
if (is_contained(MFMAChainSeeds, MFMAPipeSU))
@@ -1462,8 +1467,10 @@ bool MFMAExpInterleaveOpt::analyzeDAG(const SIInstrInfo *TII) {
for (auto Pred : MFMAChainSeeds[0]->Preds) {
if (TII->isDS(Pred.getSUnit()->getInstr()->getOpcode()) &&
- Pred.getSUnit()->getInstr()->mayLoad())
+ Pred.getSUnit()->getInstr()->mayLoad()) {
FirstPipeDSR = Pred.getSUnit()->NodeNum;
+ FirstPipeDSRInstr = Pred.getSUnit()->getInstr();
+ }
}
MFMAChainLength = MFMAPipeCount / MFMAChains;
@@ -1507,20 +1514,76 @@ bool MFMAExpInterleaveOpt::analyzeDAG(const SIInstrInfo *TII) {
});
ExpRequirement *= PackPredCount;
+
+ // Cache the results for later scheduling phases.
+ Cache.TransPipeCount = TransPipeCount;
+ Cache.MFMAPipeCount = MFMAPipeCount;
+ Cache.AddPipeCount = AddPipeCount;
+ Cache.MFMAEnablement = MFMAEnablement;
+ Cache.ExpRequirement = ExpRequirement;
+ Cache.MFMAChains = MFMAChains;
+ Cache.MFMAChainLength = MFMAChainLength;
+ Cache.HasCvt = HasCvt;
+ Cache.HasChainBetweenCvt = HasChainBetweenCvt;
+ Cache.FirstPipeDSRInstr = FirstPipeDSRInstr;
+ Cache.MFMAChainSeedInstrs.clear();
+ Cache.MFMAChainSeedInstrs.reserve(MFMAChainSeeds.size());
+ for (SUnit *Seed : MFMAChainSeeds)
+ Cache.MFMAChainSeedInstrs.push_back(Seed->getInstr());
+ Cache.AnalysisResult = true;
+
return true;
}
+void MFMAExpInterleaveOpt::initializeFromCache(
+ const SIMachineFunctionInfo::MFMAExpInterleaveCache &Cache) {
+ TransPipeCount = Cache.TransPipeCount;
+ MFMAPipeCount = Cache.MFMAPipeCount;
+ AddPipeCount = Cache.AddPipeCount;
+ MFMAEnablement = Cache.MFMAEnablement;
+ ExpRequirement = Cache.ExpRequirement;
+ MFMAChains = Cache.MFMAChains;
+ MFMAChainLength = Cache.MFMAChainLength;
+ HasCvt = Cache.HasCvt;
+ HasChainBetweenCvt = Cache.HasChainBetweenCvt;
+ SUnit *SU =
+ DAG->getSUnit(const_cast<MachineInstr *>(Cache.FirstPipeDSRInstr));
+ assert(SU && "FirstPipeDSRInstr instruction not found in DAG");
+ FirstPipeDSR = SU->NodeNum;
+ MFMAChainSeeds.clear();
+ for (const MachineInstr *MI : Cache.MFMAChainSeedInstrs) {
+ SUnit *SeedSU = DAG->getSUnit(const_cast<MachineInstr *>(MI));
+ assert(SeedSU && "MFMAChainSeed instruction not found in DAG");
+ MFMAChainSeeds.push_back(SeedSU);
+ }
+ AnalysisResult = Cache.AnalysisResult;
+}
+
+bool MFMAExpInterleaveOpt::analyzeDAG(const SIInstrInfo *TII,
+ AMDGPU::SchedulingPhase Phase) {
+ SIMachineFunctionInfo &MFI = *DAG->MF.getInfo<SIMachineFunctionInfo>();
+
+ if (const SIMachineFunctionInfo::MFMAExpInterleaveCache *Cache =
+ MFI.getMFMAExpInterleaveCache(IGLPOptMI)) {
+ initializeFromCache(*Cache);
+ return AnalysisResult;
+ }
+
+ assert(Phase != AMDGPU::SchedulingPhase::PostRA &&
+ "PostRA phase not expected to require analyzing DAG");
+ SIMachineFunctionInfo::MFMAExpInterleaveCache Cache;
+ AnalysisResult = computeDAGAnalysis(TII, Cache);
+ if (AnalysisResult)
+ MFI.setMFMAExpInterleaveCache(IGLPOptMI, Cache);
+ return AnalysisResult;
+}
+
bool MFMAExpInterleaveOpt::shouldApplyStrategy(ScheduleDAGInstrs *DAG,
AMDGPU::SchedulingPhase Phase) {
const GCNSubtarget &ST = DAG->MF.getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
- if (Phase != AMDGPU::SchedulingPhase::PostRA)
- MFMAChainSeeds.clear();
- if (Phase != AMDGPU::SchedulingPhase::PostRA && !analyzeDAG(TII))
- return false;
-
- return true;
+ return analyzeDAG(TII, Phase);
}
bool MFMAExpInterleaveOpt::applyIGLPStrategy(
@@ -1528,6 +1591,8 @@ bool MFMAExpInterleaveOpt::applyIGLPStrategy(
DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
AMDGPU::SchedulingPhase Phase) {
+ assert(AnalysisResult && "no or failed DAG analysis");
+
bool IsSmallKernelType =
MFMAEnablement == 2 && ExpRequirement == 4 && TransPipeCount == 32;
bool IsLargeKernelType =
@@ -1547,18 +1612,18 @@ bool MFMAExpInterleaveOpt::applyIGLPStrategy(
unsigned CurrMFMAForTransPosition = 0;
auto incrementTransPosition = [&MFMAChain, &PositionInChain,
- &CurrMFMAForTransPosition]() {
+ &CurrMFMAForTransPosition, this]() {
CurrMFMAForTransPosition += MFMAEnablement;
PositionInChain = (CurrMFMAForTransPosition / MFMAChains);
MFMAChain = CurrMFMAForTransPosition % MFMAChains;
};
- auto getNextTransPositionInChain = [&CurrMFMAForTransPosition]() {
+ auto getNextTransPositionInChain = [&CurrMFMAForTransPosition, this]() {
auto TempMFMAForTrans = CurrMFMAForTransPosition + MFMAEnablement;
return (TempMFMAForTrans / MFMAChains);
};
- auto getNextTransMFMAChain = [&CurrMFMAForTransPosition]() {
+ auto getNextTransMFMAChain = [&CurrMFMAForTransPosition, this]() {
auto TempMFMAForTrans = CurrMFMAForTransPosition + MFMAEnablement;
return TempMFMAForTrans % MFMAChains;
};
@@ -1568,7 +1633,7 @@ bool MFMAExpInterleaveOpt::applyIGLPStrategy(
unsigned PositionInChainForMFMA = 0;
auto incrementMFMAPosition = [&CurrMFMAPosition, &MFMAChainForMFMA,
- &PositionInChainForMFMA]() {
+ &PositionInChainForMFMA, this]() {
++CurrMFMAPosition;
MFMAChainForMFMA = CurrMFMAPosition % MFMAChains;
PositionInChainForMFMA = CurrMFMAPosition / MFMAChains;
@@ -1826,8 +1891,9 @@ class MFMAExpSimpleInterleaveOpt final : public IGLPStrategy {
return true;
}
- MFMAExpSimpleInterleaveOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
- : IGLPStrategy(DAG, TII) {
+ MFMAExpSimpleInterleaveOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII,
+ const MachineInstr *IGLPOptMI)
+ : IGLPStrategy(DAG, TII, IGLPOptMI) {
IsBottomUp = true;
}
};
@@ -2053,25 +2119,36 @@ class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy {
return true;
}
- MFMASmallGemmSingleWaveOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
- : IGLPStrategy(DAG, TII) {
+ MFMASmallGemmSingleWaveOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII,
+ const MachineInstr *IGLPOptMI)
+ : IGLPStrategy(DAG, TII, IGLPOptMI) {
IsBottomUp = false;
}
};
-static unsigned DSWCount = 0;
-static unsigned DSWWithPermCount = 0;
-static unsigned DSWWithSharedVMEMCount = 0;
-
bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
AMDGPU::SchedulingPhase Phase) {
unsigned MFMACount = 0;
unsigned DSRCount = 0;
+ unsigned DSWCount = 0;
+ unsigned DSWWithPermCount = 0;
+ unsigned DSWWithSharedVMEMCount = 0;
bool IsInitial = Phase == AMDGPU::SchedulingPhase::Initial;
+ if (!IsInitial) {
+ const SIMachineFunctionInfo &MFI =
+ *DAG->MF.getInfo<SIMachineFunctionInfo>();
+ const SIMachineFunctionInfo::MFMASmallGemmSingleWaveCache *Cache =
+ MFI.getMFMASmallGemmSingleWaveCache(IGLPOptMI);
+ assert(Cache && "no cache found");
+ DSWCount = Cache->DSWCount;
+ DSWWithPermCount = Cache->DSWWithPermCount;
+ DSWWithSharedVMEMCount = Cache->DSWWithSharedVMEMCount;
+ }
+
assert((!IsInitial || (DSWCount == 0 && DSWWithPermCount == 0 &&
DSWWithSharedVMEMCount == 0)) &&
"DSWCounters should be zero in pre-RA scheduling!");
@@ -2098,8 +2175,6 @@ bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
if (IsInitial) {
DSWWithPermCount = DSWithPerms.size();
- auto *I = DSWithPerms.begin();
- auto *E = DSWithPerms.end();
// Get the count of DS_WRITES with V_PERM predecessors which
// have loop carried dependencies (WAR) on the same VMEM_READs.
@@ -2109,10 +2184,10 @@ bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
// for every V_PERM pred of this DS_W.
DenseMap<MachineInstr *, SUnit *> VMEMLookup;
SmallVector<SUnit *, 6> Counted;
- for (; I != E; I++) {
+ for (SUnit *DSWrite : DSWithPerms) {
SUnit *Cand = nullptr;
bool MissedAny = false;
- for (auto &Pred : (*I)->Preds) {
+ for (auto &Pred : DSWrite->Preds) {
if (Pred.getSUnit()->getInstr()->getOpcode() != AMDGPU::V_PERM_B32_e64)
continue;
@@ -2126,11 +2201,11 @@ bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
if (MissedAny || !VMEMLookup.size()) {
MissedAny = true;
- VMEMLookup[MI] = *I;
+ VMEMLookup[MI] = DSWrite;
continue;
}
- auto [It, Inserted] = VMEMLookup.try_emplace(MI, *I);
+ auto [It, Inserted] = VMEMLookup.try_emplace(MI, DSWrite);
if (Inserted) {
MissedAny = true;
continue;
@@ -2146,9 +2221,14 @@ bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
if (!MissedAny && Cand) {
DSWWithSharedVMEMCount += 2;
Counted.push_back(Cand);
- Counted.push_back(*I);
+ Counted.push_back(DSWrite);
}
}
+
+ SIMachineFunctionInfo::MFMASmallGemmSingleWaveCache Cache = {
+ DSWCount, DSWWithPermCount, DSWWithSharedVMEMCount};
+ SIMachineFunctionInfo &MFI = *DAG->MF.getInfo<SIMachineFunctionInfo>();
+ MFI.setMFMASmallGemmSingleWaveCache(IGLPOptMI, Cache);
}
assert(DSWWithSharedVMEMCount <= DSWWithPermCount);
@@ -2310,16 +2390,16 @@ bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
static std::unique_ptr<IGLPStrategy>
createIGLPStrategy(IGLPStrategyID ID, ScheduleDAGInstrs *DAG,
- const SIInstrInfo *TII) {
+ const SIInstrInfo *TII, const MachineInstr *IGLPOptMI) {
switch (ID) {
case MFMASmallGemmOptID:
- return std::make_unique<MFMASmallGemmOpt>(DAG, TII);
+ return std::make_unique<MFMASmallGemmOpt>(DAG, TII, IGLPOptMI);
case MFMASmallGemmSingleWaveOptID:
- return std::make_unique<MFMASmallGemmSingleWaveOpt>(DAG, TII);
+ return std::make_unique<MFMASmallGemmSingleWaveOpt>(DAG, TII, IGLPOptMI);
case MFMAExpInterleaveID:
- return std::make_unique<MFMAExpInterleaveOpt>(DAG, TII);
+ return std::make_unique<MFMAExpInterleaveOpt>(DAG, TII, IGLPOptMI);
case MFMAExpSimpleInterleaveID:
- return std::make_unique<MFMAExpSimpleInterleaveOpt>(DAG, TII);
+ return std::make_unique<MFMAExpSimpleInterleaveOpt>(DAG, TII, IGLPOptMI);
}
llvm_unreachable("Unknown IGLPStrategyID");
@@ -2671,7 +2751,7 @@ void IGroupLPDAGMutation::initSchedGroupBarrierPipelineStage(
bool IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) {
IGLPStrategyID StrategyID =
(IGLPStrategyID)SU.getInstr()->getOperand(0).getImm();
- auto S = createIGLPStrategy(StrategyID, DAG, TII);
+ auto S = createIGLPStrategy(StrategyID, DAG, TII, SU.getInstr());
if (!S->shouldApplyStrategy(DAG, Phase))
return false;
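(Reusing the illustrative FunctionInfo/CachedAnalysis stand-ins from the
sketch near the top of this mail, the lookup-or-compute flow that the new
analyzeDAG implements is roughly the following; computeAnalysis is a
placeholder for computeDAGAnalysis, not the literal patch code:)

  // Compute the analysis once per IGLP_OPT instruction and reuse it in
  // later scheduling phases.
  bool analyzeOnce(FunctionInfo &MFI, const void *IGLPOptMI,
                   bool (*computeAnalysis)(CachedAnalysis &)) {
    // Later phases (e.g. PostRA) take this path and reuse the results
    // that the initial phase computed.
    if (const CachedAnalysis *C = MFI.getCache(IGLPOptMI))
      return C->AnalysisResult;

    CachedAnalysis C;
    C.AnalysisResult = computeAnalysis(C);
    if (C.AnalysisResult)
      MFI.setCache(IGLPOptMI, C); // only a successful analysis is cached
    return C.AnalysisResult;
  }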
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index d901f4c216551..96347042fd4f6 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -20,6 +20,7 @@
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "SIModeRegisterDefaults.h"
+#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MIRYamlMapping.h"
@@ -29,6 +30,8 @@
namespace llvm {
+class MachineInstr;
+
class MachineFrameInfo;
class MachineFunction;
class SIMachineFunctionInfo;
@@ -552,6 +555,27 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
bool IsDead = false;
};
+ struct MFMASmallGemmSingleWaveCache {
+ unsigned DSWCount = 0;
+ unsigned DSWWithPermCount = 0;
+ unsigned DSWWithSharedVMEMCount = 0;
+ };
+
+ struct MFMAExpInterleaveCache {
+ SmallVector<const MachineInstr *, 4> MFMAChainSeedInstrs;
+ const MachineInstr *FirstPipeDSRInstr = nullptr;
+ unsigned TransPipeCount = 0;
+ unsigned MFMAPipeCount = 0;
+ unsigned AddPipeCount = 0;
+ unsigned MFMAEnablement = 0;
+ unsigned ExpRequirement = 0;
+ unsigned MFMAChains = 0;
+ unsigned MFMAChainLength = 0;
+ bool HasCvt = false;
+ bool HasChainBetweenCvt = false;
+ bool AnalysisResult = false;
+ };
+
private:
// To track virtual VGPR + lane index for each subregister of the SGPR spilled
// to frameindex key during SILowerSGPRSpills pass.
@@ -616,7 +640,11 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
// load/store is enabled.
IndexedMap<uint32_t, VGPRBlock2IndexFunctor> MaskForVGPRBlockOps;
-private:
+ DenseMap<const MachineInstr *, MFMASmallGemmSingleWaveCache>
+ MFMASmallGemmSingleWaveCaches;
+ DenseMap<const MachineInstr *, MFMAExpInterleaveCache>
+ MFMAExpInterleaveCaches;
+
Register VGPRForAGPRCopy;
bool allocateVirtualVGPRForSGPRSpills(MachineFunction &MF, int FI,
@@ -634,6 +662,35 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
VGPRForAGPRCopy = NewVGPRForAGPRCopy;
}
+ const MFMASmallGemmSingleWaveCache *
+ getMFMASmallGemmSingleWaveCache(const MachineInstr *MI) const {
+ if (!MI)
+ return nullptr;
+ auto It = MFMASmallGemmSingleWaveCaches.find(MI);
+ return It == MFMASmallGemmSingleWaveCaches.end() ? nullptr : &It->second;
+ }
+
+ void
+ setMFMASmallGemmSingleWaveCache(const MachineInstr *MI,
+ const MFMASmallGemmSingleWaveCache &Cache) {
+ assert(MI && "MachineInstr pointer must not be null");
+ MFMASmallGemmSingleWaveCaches[MI] = Cache;
+ }
+
+ const MFMAExpInterleaveCache *
+ getMFMAExpInterleaveCache(const MachineInstr *MI) const {
+ if (!MI)
+ return nullptr;
+ auto It = MFMAExpInterleaveCaches.find(MI);
+ return It == MFMAExpInterleaveCaches.end() ? nullptr : &It->second;
+ }
+
+ void setMFMAExpInterleaveCache(const MachineInstr *MI,
+ const MFMAExpInterleaveCache &Cache) {
+ assert(MI && "MachineInstr pointer must not be null");
+ MFMAExpInterleaveCaches[MI] = Cache;
+ }
+
bool isCalleeSavedReg(const MCPhysReg *CSRegs, MCPhysReg Reg) const;
void setMaskForVGPRBlockOps(Register RegisterBlock, uint32_t Mask) {
@@ -649,7 +706,6 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
return MaskForVGPRBlockOps.inBounds(RegisterBlock);
}
-public:
SIMachineFunctionInfo(const SIMachineFunctionInfo &MFI) = default;
SIMachineFunctionInfo(const Function &F, const GCNSubtarget *STI);
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll
index 750f72e775a95..74c7caa38d746 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll
@@ -285,6 +285,34 @@ entry:
ret void
}
+; If we run this function after test_iglp_opt_rev_mfma_gemm, we get:
+; > Assertion `(!IsInitial || (DSWCount == 0 && DSWWithPermCount == 0 &&
+; > DSWWithSharedVMEMCount == 0)) && "DSWCounters should be zero in pre-RA
+; > scheduling!"' failed.
+; This is because, previously, the counters were global static variables which
+; weren't reset.
+define amdgpu_kernel void @test_after_test_iglp_opt_rev_mfma_gemm(ptr %src, ptr addrspace(3) %dst) {
+; GCN-LABEL: test_after_test_iglp_opt_rev_mfma_gemm:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GCN-NEXT: ; iglp_opt mask(0x00000001)
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GCN-NEXT: flat_load_ubyte v0, v[0:1]
+; GCN-NEXT: s_load_dword s0, s[4:5], 0x2c
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v1, s0
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v0, 1, v0
+; GCN-NEXT: ds_write_b8 v1, v0
+; GCN-NEXT: s_endpgm
+entry:
+ %a = load i1, ptr %src, align 1
+ call void @llvm.amdgcn.iglp.opt(i32 1)
+ store i1 %a, ptr addrspace(3) %dst, align 1
+ ret void
+}
+
define amdgpu_kernel void @test_iglp_opt_asm_sideeffect(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 {
; GCN-LABEL: test_iglp_opt_asm_sideeffect:
; GCN: ; %bb.0: ; %entry