[llvm] [AMDGPU] IGLP: Fix static variables (PR #137549)

Robert Imschweiler via llvm-commits llvm-commits at lists.llvm.org
Wed Nov 26 07:15:50 PST 2025


https://github.com/ro-i updated https://github.com/llvm/llvm-project/pull/137549

From f67dbe3da7b601e1cd1fb02b081fb259842560bb Mon Sep 17 00:00:00 2001
From: Robert Imschweiler <robert.imschweiler at amd.com>
Date: Wed, 26 Nov 2025 09:14:11 -0600
Subject: [PATCH] [AMDGPU] IGLP: Fix static variables

Replace global and class-level static variables with instance members,
caching cross-phase analysis results in SIMachineFunctionInfo, to
guarantee thread safety.
---
 llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp     | 210 ++++++++++++------
 .../lib/Target/AMDGPU/SIMachineFunctionInfo.h |  60 ++++-
 .../CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll    |  28 +++
 3 files changed, 231 insertions(+), 67 deletions(-)
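
In short, the counters that were previously file-scope or class-level
statics become instance members, and the results of the expensive pre-RA
DAG analysis are cached in SIMachineFunctionInfo, keyed by the iglp_opt
MachineInstr, so later scheduling phases can reuse them. A minimal sketch
of the resulting lookup-or-compute pattern (AnalysisCache, Caches, analyze,
and computeAnalysis are illustrative placeholders, not the exact API the
patch adds):

#include "llvm/ADT/DenseMap.h"
#include "llvm/CodeGen/MachineInstr.h"

// Sketch only: per-function cache keyed by the iglp_opt instruction.
struct AnalysisCache {              // placeholder for the real cache structs
  unsigned Count = 0;
  bool Result = false;
};

llvm::DenseMap<const llvm::MachineInstr *, AnalysisCache> Caches;
bool computeAnalysis(AnalysisCache &C); // stands in for the real DAG walk

bool analyze(const llvm::MachineInstr *IGLPOptMI) {
  auto It = Caches.find(IGLPOptMI);
  if (It != Caches.end())           // later scheduling phases hit the cache
    return It->second.Result;
  AnalysisCache C;
  bool Result = computeAnalysis(C); // expensive, run in pre-RA phases only
  if (Result)
    Caches[IGLPOptMI] = C;          // keep for subsequent phases
  return Result;
}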

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index 85addb13aef8d..e2673566f2653 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -821,6 +821,8 @@ class IGLPStrategy {
 
   const SIInstrInfo *TII;
 
+  const MachineInstr *IGLPOptMI;
+
 public:
   /// Add SchedGroups to \p SyncedSchedGroups to implement this Strategy.
   virtual bool applyIGLPStrategy(
@@ -834,8 +836,9 @@ class IGLPStrategy {
 
   bool IsBottomUp = true;
 
-  IGLPStrategy(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
-      : DAG(DAG), TII(TII) {}
+  IGLPStrategy(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII,
+               const MachineInstr *IGLPOptMI)
+      : DAG(DAG), TII(TII), IGLPOptMI(IGLPOptMI) {}
 
   virtual ~IGLPStrategy() = default;
 };
@@ -853,8 +856,9 @@ class MFMASmallGemmOpt final : public IGLPStrategy {
     return true;
   }
 
-  MFMASmallGemmOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
-      : IGLPStrategy(DAG, TII) {
+  MFMASmallGemmOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII,
+                   const MachineInstr *IGLPOptMI)
+      : IGLPStrategy(DAG, TII, IGLPOptMI) {
     IsBottomUp = true;
   }
 };
@@ -887,31 +891,36 @@ bool MFMASmallGemmOpt::applyIGLPStrategy(
 class MFMAExpInterleaveOpt final : public IGLPStrategy {
 private:
   // The count of TRANS SUs involved in the interleaved pipeline
-  static unsigned TransPipeCount;
+  unsigned TransPipeCount = 0;
   // The count of MFMA SUs involved in the interleaved pipeline
-  static unsigned MFMAPipeCount;
+  unsigned MFMAPipeCount = 0;
   // The count of Add SUs involved in the interleaved pipeline
-  static unsigned AddPipeCount;
+  unsigned AddPipeCount = 0;
   // The number of transitive MFMA successors for each TRANS SU
-  static unsigned MFMAEnablement;
+  unsigned MFMAEnablement = 0;
   // The number of transitive TRANS predecessors for each MFMA SU
-  static unsigned ExpRequirement;
+  unsigned ExpRequirement = 0;
   // The count of independent "chains" of MFMA instructions in the pipeline
-  static unsigned MFMAChains;
+  unsigned MFMAChains = 0;
   // The length of each independent "chain" of MFMA instructions
-  static unsigned MFMAChainLength;
+  unsigned MFMAChainLength = 0;
   // Whether or not the pipeline has V_CVT instructions
-  static bool HasCvt;
+  bool HasCvt = false;
   // Whether or not there are instructions between the TRANS instruction and
   // V_CVT
-  static bool HasChainBetweenCvt;
+  bool HasChainBetweenCvt = false;
   // The first occurring DS_READ which feeds an MFMA chain
-  static std::optional<unsigned> FirstPipeDSR;
+  std::optional<unsigned> FirstPipeDSR = std::nullopt;
   // The MFMAPipe SUs with no MFMA predecessors
   SmallVector<SUnit *, 4> MFMAChainSeeds;
   // Compute the heuristics for the pipeline, returning whether or not the DAG
   // is well formatted for the mutation
-  bool analyzeDAG(const SIInstrInfo *TII);
+  bool analyzeDAG(const SIInstrInfo *TII, AMDGPU::SchedulingPhase Phase);
+  bool computeDAGAnalysis(const SIInstrInfo *TII,
+                          SIMachineFunctionInfo::MFMAExpInterleaveCache &Cache);
+  void initializeFromCache(
+      const SIMachineFunctionInfo::MFMAExpInterleaveCache &Cache);
+  bool AnalysisResult;
 
   /// Whether or not the instruction is a transitive predecessor of an MFMA
   /// instruction
@@ -1316,29 +1325,22 @@ class MFMAExpInterleaveOpt final : public IGLPStrategy {
   bool shouldApplyStrategy(ScheduleDAGInstrs *DAG,
                            AMDGPU::SchedulingPhase Phase) override;
 
-  MFMAExpInterleaveOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
-      : IGLPStrategy(DAG, TII) {
+  MFMAExpInterleaveOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII,
+                       const MachineInstr *IGLPOptMI)
+      : IGLPStrategy(DAG, TII, IGLPOptMI) {
     IsBottomUp = false;
   }
 };
 
-unsigned MFMAExpInterleaveOpt::TransPipeCount = 0;
-unsigned MFMAExpInterleaveOpt::MFMAPipeCount = 0;
-unsigned MFMAExpInterleaveOpt::AddPipeCount = 0;
-unsigned MFMAExpInterleaveOpt::MFMAEnablement = 0;
-unsigned MFMAExpInterleaveOpt::ExpRequirement = 0;
-unsigned MFMAExpInterleaveOpt::MFMAChains = 0;
-unsigned MFMAExpInterleaveOpt::MFMAChainLength = 0;
-bool MFMAExpInterleaveOpt::HasCvt = false;
-bool MFMAExpInterleaveOpt::HasChainBetweenCvt = false;
-std::optional<unsigned> MFMAExpInterleaveOpt::FirstPipeDSR = std::nullopt;
-
-bool MFMAExpInterleaveOpt::analyzeDAG(const SIInstrInfo *TII) {
+bool MFMAExpInterleaveOpt::computeDAGAnalysis(
+    const SIInstrInfo *TII,
+    SIMachineFunctionInfo::MFMAExpInterleaveCache &Cache) {
   SmallVector<SUnit *, 10> ExpPipeCands;
   SmallVector<SUnit *, 10> MFMAPipeCands;
   SmallVector<SUnit *, 10> MFMAPipeSUs;
   SmallVector<SUnit *, 10> PackSUs;
   SmallVector<SUnit *, 10> CvtSUs;
+  const MachineInstr *FirstPipeDSRInstr = nullptr;
 
   auto isBitPack = [](unsigned Opc) {
     return Opc == AMDGPU::V_PACK_B32_F16_e64 || Opc == AMDGPU::V_PERM_B32_e64;
@@ -1355,12 +1357,14 @@ bool MFMAExpInterleaveOpt::analyzeDAG(const SIInstrInfo *TII) {
     auto Opc = SU.getInstr()->getOpcode();
     if (TII->isTRANS(Opc)) {
       // Avoid counting a potential bonus V_EXP which all the MFMA depend on
+      // FIXME: This heuristic needs improvement/clarification!
+      // In general, the pipeline seems to look like this:
+      //   fma_f32 -> exp_f32 -> cvt_f16_f32 -> v_pack_b32_f16 -> mfma_.._f16
+      //   (with potential arithmetic between exp and cvt)
+      //   see
+      //   https://github.com/llvm/llvm-project/pull/80370#discussion_r1483660378
       if (SU.Succs.size() >= 7)
         continue;
-      for (auto &Succ : SU.Succs) {
-        if (Succ.getSUnit()->Succs.size() >= 7)
-          continue;
-      }
       ExpPipeCands.push_back(&SU);
     }
 
@@ -1445,6 +1449,7 @@ bool MFMAExpInterleaveOpt::analyzeDAG(const SIInstrInfo *TII) {
     }
   }
 
+  MFMAChainSeeds.clear();
   MFMAChains = 0;
   for (auto &MFMAPipeSU : MFMAPipeSUs) {
     if (is_contained(MFMAChainSeeds, MFMAPipeSU))
@@ -1462,8 +1467,10 @@ bool MFMAExpInterleaveOpt::analyzeDAG(const SIInstrInfo *TII) {
 
   for (auto Pred : MFMAChainSeeds[0]->Preds) {
     if (TII->isDS(Pred.getSUnit()->getInstr()->getOpcode()) &&
-        Pred.getSUnit()->getInstr()->mayLoad())
+        Pred.getSUnit()->getInstr()->mayLoad()) {
       FirstPipeDSR = Pred.getSUnit()->NodeNum;
+      FirstPipeDSRInstr = Pred.getSUnit()->getInstr();
+    }
   }
 
   MFMAChainLength = MFMAPipeCount / MFMAChains;
@@ -1507,20 +1514,76 @@ bool MFMAExpInterleaveOpt::analyzeDAG(const SIInstrInfo *TII) {
       });
 
   ExpRequirement *= PackPredCount;
+
+  // Cache the results for later scheduling phases.
+  Cache.TransPipeCount = TransPipeCount;
+  Cache.MFMAPipeCount = MFMAPipeCount;
+  Cache.AddPipeCount = AddPipeCount;
+  Cache.MFMAEnablement = MFMAEnablement;
+  Cache.ExpRequirement = ExpRequirement;
+  Cache.MFMAChains = MFMAChains;
+  Cache.MFMAChainLength = MFMAChainLength;
+  Cache.HasCvt = HasCvt;
+  Cache.HasChainBetweenCvt = HasChainBetweenCvt;
+  Cache.FirstPipeDSRInstr = FirstPipeDSRInstr;
+  Cache.MFMAChainSeedInstrs.clear();
+  Cache.MFMAChainSeedInstrs.reserve(MFMAChainSeeds.size());
+  for (SUnit *Seed : MFMAChainSeeds)
+    Cache.MFMAChainSeedInstrs.push_back(Seed->getInstr());
+  Cache.AnalysisResult = true;
+
   return true;
 }
 
+void MFMAExpInterleaveOpt::initializeFromCache(
+    const SIMachineFunctionInfo::MFMAExpInterleaveCache &Cache) {
+  TransPipeCount = Cache.TransPipeCount;
+  MFMAPipeCount = Cache.MFMAPipeCount;
+  AddPipeCount = Cache.AddPipeCount;
+  MFMAEnablement = Cache.MFMAEnablement;
+  ExpRequirement = Cache.ExpRequirement;
+  MFMAChains = Cache.MFMAChains;
+  MFMAChainLength = Cache.MFMAChainLength;
+  HasCvt = Cache.HasCvt;
+  HasChainBetweenCvt = Cache.HasChainBetweenCvt;
+  SUnit *SU =
+      DAG->getSUnit(const_cast<MachineInstr *>(Cache.FirstPipeDSRInstr));
+  assert(SU && "FirstPipeDSRInstr instruction not found in DAG");
+  FirstPipeDSR = SU->NodeNum;
+  MFMAChainSeeds.clear();
+  for (const MachineInstr *MI : Cache.MFMAChainSeedInstrs) {
+    SUnit *SeedSU = DAG->getSUnit(const_cast<MachineInstr *>(MI));
+    assert(SeedSU && "MFMAChainSeed instruction not found in DAG");
+    MFMAChainSeeds.push_back(SeedSU);
+  }
+  AnalysisResult = Cache.AnalysisResult;
+}
+
+bool MFMAExpInterleaveOpt::analyzeDAG(const SIInstrInfo *TII,
+                                      AMDGPU::SchedulingPhase Phase) {
+  SIMachineFunctionInfo &MFI = *DAG->MF.getInfo<SIMachineFunctionInfo>();
+
+  if (const SIMachineFunctionInfo::MFMAExpInterleaveCache *Cache =
+          MFI.getMFMAExpInterleaveCache(IGLPOptMI)) {
+    initializeFromCache(*Cache);
+    return AnalysisResult;
+  }
+
+  assert(Phase != AMDGPU::SchedulingPhase::PostRA &&
+         "PostRA phase not expected to require analyzing DAG");
+  SIMachineFunctionInfo::MFMAExpInterleaveCache Cache;
+  AnalysisResult = computeDAGAnalysis(TII, Cache);
+  if (AnalysisResult)
+    MFI.setMFMAExpInterleaveCache(IGLPOptMI, Cache);
+  return AnalysisResult;
+}
+
 bool MFMAExpInterleaveOpt::shouldApplyStrategy(ScheduleDAGInstrs *DAG,
                                                AMDGPU::SchedulingPhase Phase) {
   const GCNSubtarget &ST = DAG->MF.getSubtarget<GCNSubtarget>();
   const SIInstrInfo *TII = ST.getInstrInfo();
 
-  if (Phase != AMDGPU::SchedulingPhase::PostRA)
-    MFMAChainSeeds.clear();
-  if (Phase != AMDGPU::SchedulingPhase::PostRA && !analyzeDAG(TII))
-    return false;
-
-  return true;
+  return analyzeDAG(TII, Phase);
 }
 
 bool MFMAExpInterleaveOpt::applyIGLPStrategy(
@@ -1528,6 +1591,8 @@ bool MFMAExpInterleaveOpt::applyIGLPStrategy(
     DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
     AMDGPU::SchedulingPhase Phase) {
 
+  assert(AnalysisResult && "no or failed DAG analysis");
+
   bool IsSmallKernelType =
       MFMAEnablement == 2 && ExpRequirement == 4 && TransPipeCount == 32;
   bool IsLargeKernelType =
@@ -1547,18 +1612,18 @@ bool MFMAExpInterleaveOpt::applyIGLPStrategy(
   unsigned CurrMFMAForTransPosition = 0;
 
   auto incrementTransPosition = [&MFMAChain, &PositionInChain,
-                                 &CurrMFMAForTransPosition]() {
+                                 &CurrMFMAForTransPosition, this]() {
     CurrMFMAForTransPosition += MFMAEnablement;
     PositionInChain = (CurrMFMAForTransPosition / MFMAChains);
     MFMAChain = CurrMFMAForTransPosition % MFMAChains;
   };
 
-  auto getNextTransPositionInChain = [&CurrMFMAForTransPosition]() {
+  auto getNextTransPositionInChain = [&CurrMFMAForTransPosition, this]() {
     auto TempMFMAForTrans = CurrMFMAForTransPosition + MFMAEnablement;
     return (TempMFMAForTrans / MFMAChains);
   };
 
-  auto getNextTransMFMAChain = [&CurrMFMAForTransPosition]() {
+  auto getNextTransMFMAChain = [&CurrMFMAForTransPosition, this]() {
     auto TempMFMAForTrans = CurrMFMAForTransPosition + MFMAEnablement;
     return TempMFMAForTrans % MFMAChains;
   };
@@ -1568,7 +1633,7 @@ bool MFMAExpInterleaveOpt::applyIGLPStrategy(
   unsigned PositionInChainForMFMA = 0;
 
   auto incrementMFMAPosition = [&CurrMFMAPosition, &MFMAChainForMFMA,
-                                &PositionInChainForMFMA]() {
+                                &PositionInChainForMFMA, this]() {
     ++CurrMFMAPosition;
     MFMAChainForMFMA = CurrMFMAPosition % MFMAChains;
     PositionInChainForMFMA = CurrMFMAPosition / MFMAChains;
@@ -1826,8 +1891,9 @@ class MFMAExpSimpleInterleaveOpt final : public IGLPStrategy {
     return true;
   }
 
-  MFMAExpSimpleInterleaveOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
-      : IGLPStrategy(DAG, TII) {
+  MFMAExpSimpleInterleaveOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII,
+                             const MachineInstr *IGLPOptMI)
+      : IGLPStrategy(DAG, TII, IGLPOptMI) {
     IsBottomUp = true;
   }
 };
@@ -2053,25 +2119,36 @@ class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy {
     return true;
   }
 
-  MFMASmallGemmSingleWaveOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
-      : IGLPStrategy(DAG, TII) {
+  MFMASmallGemmSingleWaveOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII,
+                             const MachineInstr *IGLPOptMI)
+      : IGLPStrategy(DAG, TII, IGLPOptMI) {
     IsBottomUp = false;
   }
 };
 
-static unsigned DSWCount = 0;
-static unsigned DSWWithPermCount = 0;
-static unsigned DSWWithSharedVMEMCount = 0;
-
 bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
     DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
     DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
     AMDGPU::SchedulingPhase Phase) {
   unsigned MFMACount = 0;
   unsigned DSRCount = 0;
+  unsigned DSWCount = 0;
+  unsigned DSWWithPermCount = 0;
+  unsigned DSWWithSharedVMEMCount = 0;
 
   bool IsInitial = Phase == AMDGPU::SchedulingPhase::Initial;
 
+  if (!IsInitial) {
+    const SIMachineFunctionInfo &MFI =
+        *DAG->MF.getInfo<SIMachineFunctionInfo>();
+    const SIMachineFunctionInfo::MFMASmallGemmSingleWaveCache *Cache =
+        MFI.getMFMASmallGemmSingleWaveCache(IGLPOptMI);
+    assert(Cache && "no cache found");
+    DSWCount = Cache->DSWCount;
+    DSWWithPermCount = Cache->DSWWithPermCount;
+    DSWWithSharedVMEMCount = Cache->DSWWithSharedVMEMCount;
+  }
+
   assert((!IsInitial || (DSWCount == 0 && DSWWithPermCount == 0 &&
                          DSWWithSharedVMEMCount == 0)) &&
          "DSWCounters should be zero in pre-RA scheduling!");
@@ -2098,8 +2175,6 @@ bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
 
   if (IsInitial) {
     DSWWithPermCount = DSWithPerms.size();
-    auto *I = DSWithPerms.begin();
-    auto *E = DSWithPerms.end();
 
     // Get the count of DS_WRITES with V_PERM predecessors which
     // have loop carried dependencies (WAR) on the same VMEM_READs.
@@ -2109,10 +2184,10 @@ bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
     // for every V_PERM pred of this DS_W.
     DenseMap<MachineInstr *, SUnit *> VMEMLookup;
     SmallVector<SUnit *, 6> Counted;
-    for (; I != E; I++) {
+    for (SUnit *DSWrite : DSWithPerms) {
       SUnit *Cand = nullptr;
       bool MissedAny = false;
-      for (auto &Pred : (*I)->Preds) {
+      for (auto &Pred : DSWrite->Preds) {
         if (Pred.getSUnit()->getInstr()->getOpcode() != AMDGPU::V_PERM_B32_e64)
           continue;
 
@@ -2126,11 +2201,11 @@ bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
 
           if (MissedAny || !VMEMLookup.size()) {
             MissedAny = true;
-            VMEMLookup[MI] = *I;
+            VMEMLookup[MI] = DSWrite;
             continue;
           }
 
-          auto [It, Inserted] = VMEMLookup.try_emplace(MI, *I);
+          auto [It, Inserted] = VMEMLookup.try_emplace(MI, DSWrite);
           if (Inserted) {
             MissedAny = true;
             continue;
@@ -2146,9 +2221,14 @@ bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
       if (!MissedAny && Cand) {
         DSWWithSharedVMEMCount += 2;
         Counted.push_back(Cand);
-        Counted.push_back(*I);
+        Counted.push_back(DSWrite);
       }
     }
+
+    SIMachineFunctionInfo::MFMASmallGemmSingleWaveCache Cache = {
+        DSWCount, DSWWithPermCount, DSWWithSharedVMEMCount};
+    SIMachineFunctionInfo &MFI = *DAG->MF.getInfo<SIMachineFunctionInfo>();
+    MFI.setMFMASmallGemmSingleWaveCache(IGLPOptMI, Cache);
   }
 
   assert(DSWWithSharedVMEMCount <= DSWWithPermCount);
@@ -2310,16 +2390,16 @@ bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
 
 static std::unique_ptr<IGLPStrategy>
 createIGLPStrategy(IGLPStrategyID ID, ScheduleDAGInstrs *DAG,
-                   const SIInstrInfo *TII) {
+                   const SIInstrInfo *TII, const MachineInstr *IGLPOptMI) {
   switch (ID) {
   case MFMASmallGemmOptID:
-    return std::make_unique<MFMASmallGemmOpt>(DAG, TII);
+    return std::make_unique<MFMASmallGemmOpt>(DAG, TII, IGLPOptMI);
   case MFMASmallGemmSingleWaveOptID:
-    return std::make_unique<MFMASmallGemmSingleWaveOpt>(DAG, TII);
+    return std::make_unique<MFMASmallGemmSingleWaveOpt>(DAG, TII, IGLPOptMI);
   case MFMAExpInterleaveID:
-    return std::make_unique<MFMAExpInterleaveOpt>(DAG, TII);
+    return std::make_unique<MFMAExpInterleaveOpt>(DAG, TII, IGLPOptMI);
   case MFMAExpSimpleInterleaveID:
-    return std::make_unique<MFMAExpSimpleInterleaveOpt>(DAG, TII);
+    return std::make_unique<MFMAExpSimpleInterleaveOpt>(DAG, TII, IGLPOptMI);
   }
 
   llvm_unreachable("Unknown IGLPStrategyID");
@@ -2671,7 +2751,7 @@ void IGroupLPDAGMutation::initSchedGroupBarrierPipelineStage(
 bool IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) {
   IGLPStrategyID StrategyID =
       (IGLPStrategyID)SU.getInstr()->getOperand(0).getImm();
-  auto S = createIGLPStrategy(StrategyID, DAG, TII);
+  auto S = createIGLPStrategy(StrategyID, DAG, TII, SU.getInstr());
   if (!S->shouldApplyStrategy(DAG, Phase))
     return false;
 
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index d901f4c216551..96347042fd4f6 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -20,6 +20,7 @@
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "SIInstrInfo.h"
 #include "SIModeRegisterDefaults.h"
+#include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/CodeGen/MIRYamlMapping.h"
@@ -29,6 +30,8 @@
 
 namespace llvm {
 
+class MachineInstr;
+
 class MachineFrameInfo;
 class MachineFunction;
 class SIMachineFunctionInfo;
@@ -552,6 +555,27 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
     bool IsDead = false;
   };
 
+  struct MFMASmallGemmSingleWaveCache {
+    unsigned DSWCount = 0;
+    unsigned DSWWithPermCount = 0;
+    unsigned DSWWithSharedVMEMCount = 0;
+  };
+
+  struct MFMAExpInterleaveCache {
+    SmallVector<const MachineInstr *, 4> MFMAChainSeedInstrs;
+    const MachineInstr *FirstPipeDSRInstr = nullptr;
+    unsigned TransPipeCount = 0;
+    unsigned MFMAPipeCount = 0;
+    unsigned AddPipeCount = 0;
+    unsigned MFMAEnablement = 0;
+    unsigned ExpRequirement = 0;
+    unsigned MFMAChains = 0;
+    unsigned MFMAChainLength = 0;
+    bool HasCvt = false;
+    bool HasChainBetweenCvt = false;
+    bool AnalysisResult = false;
+  };
+
 private:
   // To track virtual VGPR + lane index for each subregister of the SGPR spilled
   // to frameindex key during SILowerSGPRSpills pass.
@@ -616,7 +640,11 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
   // load/store is enabled.
   IndexedMap<uint32_t, VGPRBlock2IndexFunctor> MaskForVGPRBlockOps;
 
-private:
+  DenseMap<const MachineInstr *, MFMASmallGemmSingleWaveCache>
+      MFMASmallGemmSingleWaveCaches;
+  DenseMap<const MachineInstr *, MFMAExpInterleaveCache>
+      MFMAExpInterleaveCaches;
+
   Register VGPRForAGPRCopy;
 
   bool allocateVirtualVGPRForSGPRSpills(MachineFunction &MF, int FI,
@@ -634,6 +662,35 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
     VGPRForAGPRCopy = NewVGPRForAGPRCopy;
   }
 
+  const MFMASmallGemmSingleWaveCache *
+  getMFMASmallGemmSingleWaveCache(const MachineInstr *MI) const {
+    if (!MI)
+      return nullptr;
+    auto It = MFMASmallGemmSingleWaveCaches.find(MI);
+    return It == MFMASmallGemmSingleWaveCaches.end() ? nullptr : &It->second;
+  }
+
+  void
+  setMFMASmallGemmSingleWaveCache(const MachineInstr *MI,
+                                  const MFMASmallGemmSingleWaveCache &Cache) {
+    assert(MI && "MachineInstr pointer must not be null");
+    MFMASmallGemmSingleWaveCaches[MI] = Cache;
+  }
+
+  const MFMAExpInterleaveCache *
+  getMFMAExpInterleaveCache(const MachineInstr *MI) const {
+    if (!MI)
+      return nullptr;
+    auto It = MFMAExpInterleaveCaches.find(MI);
+    return It == MFMAExpInterleaveCaches.end() ? nullptr : &It->second;
+  }
+
+  void setMFMAExpInterleaveCache(const MachineInstr *MI,
+                                 const MFMAExpInterleaveCache &Cache) {
+    assert(MI && "MachineInstr pointer must not be null");
+    MFMAExpInterleaveCaches[MI] = Cache;
+  }
+
   bool isCalleeSavedReg(const MCPhysReg *CSRegs, MCPhysReg Reg) const;
 
   void setMaskForVGPRBlockOps(Register RegisterBlock, uint32_t Mask) {
@@ -649,7 +706,6 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
     return MaskForVGPRBlockOps.inBounds(RegisterBlock);
   }
 
-public:
   SIMachineFunctionInfo(const SIMachineFunctionInfo &MFI) = default;
   SIMachineFunctionInfo(const Function &F, const GCNSubtarget *STI);
 
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll
index 750f72e775a95..74c7caa38d746 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll
@@ -285,6 +285,34 @@ entry:
   ret void
 }
 
+; If we run this function after test_iglp_opt_rev_mfma_gemm, we get:
+; > Assertion `(!IsInitial || (DSWCount == 0 && DSWWithPermCount == 0 &&
+; > DSWWithSharedVMEMCount == 0)) && "DSWCounters should be zero in pre-RA
+; > scheduling!"' failed.
+; This is because, previously, the counters were global static variables which
+; weren't reset.
+define amdgpu_kernel void @test_after_test_iglp_opt_rev_mfma_gemm(ptr %src, ptr addrspace(3) %dst) {
+; GCN-LABEL: test_after_test_iglp_opt_rev_mfma_gemm:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GCN-NEXT:    ; iglp_opt mask(0x00000001)
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GCN-NEXT:    flat_load_ubyte v0, v[0:1]
+; GCN-NEXT:    s_load_dword s0, s[4:5], 0x2c
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v1, s0
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
+; GCN-NEXT:    ds_write_b8 v1, v0
+; GCN-NEXT:    s_endpgm
+entry:
+  %a = load i1, ptr %src, align 1
+  call void @llvm.amdgcn.iglp.opt(i32 1)
+  store i1 %a, ptr addrspace(3) %dst, align 1
+  ret void
+}
+
 define amdgpu_kernel void @test_iglp_opt_asm_sideeffect(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 {
 ; GCN-LABEL: test_iglp_opt_asm_sideeffect:
 ; GCN:       ; %bb.0: ; %entry
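
For context on the failure mode the new test above guards against: mutable
static counters persist across compilations of different functions in the
same process (and are shared across threads), so the pre-RA zero-count
assertion only holds for the first kernel. A standalone illustration of
that pattern, deliberately independent of LLVM (Count stands in for the
old DSWCount-style statics):

#include <cassert>

// Stand-in for the old file-scope statics (DSWCount et al.).
static unsigned Count = 0;

void applyStrategy(bool IsInitial, unsigned NumWrites) {
  // Mirrors the "DSWCounters should be zero in pre-RA scheduling!" assert:
  // it only holds for the first function compiled in this process.
  assert(!IsInitial || Count == 0);
  if (IsInitial)
    Count += NumWrites;
}

int main() {
  applyStrategy(/*IsInitial=*/true, 4); // first kernel: passes
  applyStrategy(/*IsInitial=*/true, 4); // second kernel: assertion fires
}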


