[llvm] [AMDGPU] IGLP: Fix static variables (PR #137549)
Robert Imschweiler via llvm-commits
llvm-commits at lists.llvm.org
Sun Apr 27 14:09:32 PDT 2025
https://github.com/ro-i created https://github.com/llvm/llvm-project/pull/137549
Replace global and class-level static variables with instance members and guarantee thread safety.
@arsenm as discussed
Some notable implications of these changes:
- `MFMASmallGemmSingleWaveOpt::applyIGLPStrategy()`: no longer caches results between scheduling phases
- `MFMAExpInterleaveOpt::analyzeDAG()`: now also has to run on the post-RA DAG, which particularly affects the dependency handling (see the sketches below)
I tried to keep the changes as non-invasive as possible and free of functional changes.
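For context, a minimal sketch of the pattern being fixed (simplified names, not the actual LLVM classes; the patch itself keeps the existing `NumSchedGroups` counter and updates it via the `__atomic_fetch_add` builtin, which is equivalent to the `std::atomic` fetch-add shown here):

```cpp
#include <atomic>

// Before: all instances share mutable static state, which races when two
// schedulers run concurrently in one process.
struct SchedGroupBefore {
  static unsigned NumSchedGroups; // unsynchronized
  unsigned SGID;
  SchedGroupBefore() { SGID = NumSchedGroups++; } // data race
};
unsigned SchedGroupBefore::NumSchedGroups = 0;

// After: per-DAG analysis results become instance members; the one counter
// that must remain shared is updated atomically.
struct SchedGroupAfter {
  unsigned SGID;
  SchedGroupAfter() {
    static std::atomic<unsigned> NumSchedGroups{0};
    SGID = NumSchedGroups.fetch_add(1, std::memory_order_seq_cst);
  }
};
```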
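And a sketch of the new kind-filtered reachability query (a hypothetical standalone form; the real implementation lives in `ScheduleDAGTopologicalSort::DFS`/`IsReachable` and additionally exploits the topological index bounds):

```cpp
#include <optional>
#include <vector>

enum class DepKind { Data, Anti, Output, Order };

struct Dep {
  unsigned SuccIdx;
  DepKind Kind;
};
struct Node {
  std::vector<Dep> Succs;
};

// Returns true if Target is reachable from Src. If OnlyDepKind is given,
// only edges of that kind are followed -- e.g. DepKind::Data for post-RA
// queries, where anti/output edges would yield false positives.
bool isReachable(const std::vector<Node> &DAG, unsigned Src, unsigned Target,
                 std::optional<DepKind> OnlyDepKind = std::nullopt) {
  std::vector<bool> Visited(DAG.size(), false);
  std::vector<unsigned> WorkList{Src};
  while (!WorkList.empty()) {
    unsigned N = WorkList.back();
    WorkList.pop_back();
    if (N == Target)
      return true;
    if (Visited[N])
      continue;
    Visited[N] = true;
    for (const Dep &D : DAG[N].Succs)
      if (!OnlyDepKind || D.Kind == *OnlyDepKind)
        WorkList.push_back(D.SuccIdx);
  }
  return false;
}
```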
From 0f693669e4e9682f69907932e46e219c09d421c2 Mon Sep 17 00:00:00 2001
From: Robert Imschweiler <robert.imschweiler at amd.com>
Date: Sun, 27 Apr 2025 15:27:29 -0500
Subject: [PATCH] [AMDGPU] IGLP: Fix static variables
Replace global and class-level static variables with instance members and
guarantee thread safety.
---
llvm/include/llvm/CodeGen/ScheduleDAG.h | 7 +-
llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h | 6 +-
llvm/lib/CodeGen/ScheduleDAG.cpp | 12 +-
llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp | 191 ++++++++----------
4 files changed, 106 insertions(+), 110 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/ScheduleDAG.h b/llvm/include/llvm/CodeGen/ScheduleDAG.h
index 1c8d92d149adc..f57bc55bf3131 100644
--- a/llvm/include/llvm/CodeGen/ScheduleDAG.h
+++ b/llvm/include/llvm/CodeGen/ScheduleDAG.h
@@ -734,7 +734,8 @@ class TargetRegisterInfo;
/// Makes a DFS traversal and marks all nodes affected by the edge insertion.
/// These nodes will later get new topological indexes by means of the Shift
/// method.
- void DFS(const SUnit *SU, int UpperBound, bool& HasLoop);
+ void DFS(const SUnit *SU, int UpperBound, bool &HasLoop,
+ std::optional<SDep::Kind> OnlyDepKind = std::nullopt);
/// Reassigns topological indexes for the nodes in the DAG to
/// preserve the topological ordering.
@@ -767,7 +768,9 @@ class TargetRegisterInfo;
bool &Success);
/// Checks if \p SU is reachable from \p TargetSU.
- bool IsReachable(const SUnit *SU, const SUnit *TargetSU);
+ /// If OnlyDepKind is given, consider only dependencies of this kind.
+ bool IsReachable(const SUnit *SU, const SUnit *TargetSU,
+ std::optional<SDep::Kind> OnlyDepKind = std::nullopt);
/// Returns true if addPred(TargetSU, SU) creates a cycle.
bool WillCreateCycle(SUnit *TargetSU, SUnit *SU);
diff --git a/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h b/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h
index e79b03c57a1e8..f1d8852c75005 100644
--- a/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h
+++ b/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h
@@ -287,8 +287,10 @@ namespace llvm {
}
/// IsReachable - Checks if SU is reachable from TargetSU.
- bool IsReachable(SUnit *SU, SUnit *TargetSU) {
- return Topo.IsReachable(SU, TargetSU);
+ /// If OnlyDepKind is given, only dependencies of this kind are considered.
+ bool IsReachable(SUnit *SU, SUnit *TargetSU,
+ std::optional<SDep::Kind> OnlyDepKind = std::nullopt) {
+ return Topo.IsReachable(SU, TargetSU, OnlyDepKind);
}
/// Whether regions with a single MI should be scheduled.
diff --git a/llvm/lib/CodeGen/ScheduleDAG.cpp b/llvm/lib/CodeGen/ScheduleDAG.cpp
index 26857edd871e2..dec508f2f763d 100644
--- a/llvm/lib/CodeGen/ScheduleDAG.cpp
+++ b/llvm/lib/CodeGen/ScheduleDAG.cpp
@@ -570,7 +570,8 @@ void ScheduleDAGTopologicalSort::RemovePred(SUnit *M, SUnit *N) {
}
void ScheduleDAGTopologicalSort::DFS(const SUnit *SU, int UpperBound,
- bool &HasLoop) {
+ bool &HasLoop,
+ std::optional<SDep::Kind> OnlyDepKind) {
std::vector<const SUnit*> WorkList;
WorkList.reserve(SUnits.size());
@@ -580,6 +581,8 @@ void ScheduleDAGTopologicalSort::DFS(const SUnit *SU, int UpperBound,
WorkList.pop_back();
Visited.set(SU->NodeNum);
for (const SDep &SuccDep : llvm::reverse(SU->Succs)) {
+ if (OnlyDepKind && SuccDep.getKind() != *OnlyDepKind)
+ continue;
unsigned s = SuccDep.getSUnit()->NodeNum;
// Edges to non-SUnits are allowed but ignored (e.g. ExitSU).
if (s >= Node2Index.size())
@@ -722,8 +725,9 @@ void ScheduleDAGTopologicalSort::AddSUnitWithoutPredecessors(const SUnit *SU) {
Visited.resize(Node2Index.size());
}
-bool ScheduleDAGTopologicalSort::IsReachable(const SUnit *SU,
- const SUnit *TargetSU) {
+bool ScheduleDAGTopologicalSort::IsReachable(
+ const SUnit *SU, const SUnit *TargetSU,
+ std::optional<SDep::Kind> OnlyDepKind) {
assert(TargetSU != nullptr && "Invalid target SUnit");
assert(SU != nullptr && "Invalid SUnit");
FixOrder();
@@ -737,7 +741,7 @@ bool ScheduleDAGTopologicalSort::IsReachable(const SUnit *SU,
if (LowerBound < UpperBound) {
Visited.reset();
// There may be a path from TargetSU to SU. Check for it.
- DFS(TargetSU, UpperBound, HasLoop);
+ DFS(TargetSU, UpperBound, HasLoop, OnlyDepKind);
}
return HasLoop;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index a2b7cd088093a..4e5650d18b35a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -229,13 +229,13 @@ class SchedGroup {
SchedGroup(SchedGroupMask SGMask, std::optional<unsigned> MaxSize,
ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
: SGMask(SGMask), MaxSize(MaxSize), DAG(DAG), TII(TII) {
- SGID = NumSchedGroups++;
+ SGID = __atomic_fetch_add(&NumSchedGroups, 1, __ATOMIC_SEQ_CST);
}
SchedGroup(SchedGroupMask SGMask, std::optional<unsigned> MaxSize, int SyncID,
ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
: SGMask(SGMask), MaxSize(MaxSize), SyncID(SyncID), DAG(DAG), TII(TII) {
- SGID = NumSchedGroups++;
+ SGID = __atomic_fetch_add(&NumSchedGroups, 1, __ATOMIC_SEQ_CST);
}
};
@@ -887,26 +887,26 @@ bool MFMASmallGemmOpt::applyIGLPStrategy(
class MFMAExpInterleaveOpt final : public IGLPStrategy {
private:
// The count of TRANS SUs involved in the interleaved pipeline
- static unsigned TransPipeCount;
+ unsigned TransPipeCount = 0;
// The count of MFMA SUs involved in the interleaved pipeline
- static unsigned MFMAPipeCount;
+ unsigned MFMAPipeCount = 0;
// The count of Add SUs involved in the interleaved pipeline
- static unsigned AddPipeCount;
+ unsigned AddPipeCount = 0;
// The number of transitive MFMA successors for each TRANS SU
- static unsigned MFMAEnablement;
+ unsigned MFMAEnablement = 0;
// The number of transitive TRANS predecessors for each MFMA SU
- static unsigned ExpRequirement;
+ unsigned ExpRequirement = 0;
// The count of independent "chains" of MFMA instructions in the pipeline
- static unsigned MFMAChains;
+ unsigned MFMAChains = 0;
// The length of each independent "chain" of MFMA instructions
- static unsigned MFMAChainLength;
+ unsigned MFMAChainLength = 0;
// Whether or not the pipeline has V_CVT instructions
- static bool HasCvt;
+ bool HasCvt = false;
// Whether or not there are instructions between the TRANS instruction and
// V_CVT
- static bool HasChainBetweenCvt;
+ bool HasChainBetweenCvt = false;
// The first occurring DS_READ which feeds an MFMA chain
- static std::optional<unsigned> FirstPipeDSR;
+ std::optional<unsigned> FirstPipeDSR = std::nullopt;
// The MFMAPipe SUs with no MFMA predecessors
SmallVector<SUnit *, 4> MFMAChainSeeds;
// Compute the heuristics for the pipeline, returning whether or not the DAG
@@ -1325,17 +1325,8 @@ class MFMAExpInterleaveOpt final : public IGLPStrategy {
}
};
-unsigned MFMAExpInterleaveOpt::TransPipeCount = 0;
-unsigned MFMAExpInterleaveOpt::MFMAPipeCount = 0;
-unsigned MFMAExpInterleaveOpt::AddPipeCount = 0;
-unsigned MFMAExpInterleaveOpt::MFMAEnablement = 0;
-unsigned MFMAExpInterleaveOpt::ExpRequirement = 0;
-unsigned MFMAExpInterleaveOpt::MFMAChains = 0;
-unsigned MFMAExpInterleaveOpt::MFMAChainLength = 0;
-bool MFMAExpInterleaveOpt::HasCvt = false;
-bool MFMAExpInterleaveOpt::HasChainBetweenCvt = false;
-std::optional<unsigned> MFMAExpInterleaveOpt::FirstPipeDSR = std::nullopt;
-
+// Note: we only want to check for true (data) dependencies (see SDep::Kind)
+// so that the logic also works in the PostRA phase.
bool MFMAExpInterleaveOpt::analyzeDAG(const SIInstrInfo *TII) {
SmallVector<SUnit *, 10> ExpPipeCands;
SmallVector<SUnit *, 10> MFMAPipeCands;
@@ -1353,17 +1344,25 @@ bool MFMAExpInterleaveOpt::analyzeDAG(const SIInstrInfo *TII) {
auto isAdd = [](unsigned Opc) { return Opc == AMDGPU::V_ADD_F32_e32; };
+ // Heuristic helper function (see below)
+ auto IsFMACDataDep = [](SDep &Dep) {
+ return Dep.getKind() == SDep::Kind::Data &&
+ Dep.getSUnit()->getInstr()->getOpcode() == AMDGPU::V_FMAC_F32_e32;
+ };
+
AddPipeCount = 0;
for (SUnit &SU : DAG->SUnits) {
auto Opc = SU.getInstr()->getOpcode();
if (TII->isTRANS(Opc)) {
// Avoid counting a potential bonus V_EXP which all the MFMA depend on
- if (SU.Succs.size() >= 7)
+ // FIXME: This heuristic needs improvement/clarification!
+ // In general, the pipeline seems to look like this:
+ // fma_f32 -> exp_f32 -> cvt_f16_f32 -> v_pack_b32_f16 -> mfma_.._f16
+ // (with potential arithmetic between exp and cvt)
+ // see
+ // https://github.com/llvm/llvm-project/pull/80370#discussion_r1483660378
+ if (SU.Succs.size() >= 7 && any_of(SU.Succs, IsFMACDataDep))
continue;
- for (auto &Succ : SU.Succs) {
- if (Succ.getSUnit()->Succs.size() >= 7)
- continue;
- }
ExpPipeCands.push_back(&SU);
}
@@ -1390,7 +1389,7 @@ bool MFMAExpInterleaveOpt::analyzeDAG(const SIInstrInfo *TII) {
// Count the number of EXPs that reach an MFMA
for (auto &PredSU : ExpPipeCands) {
for (auto &SuccSU : MFMAPipeCands) {
- if (DAG->IsReachable(SuccSU, PredSU)) {
+ if (DAG->IsReachable(SuccSU, PredSU, SDep::Kind::Data)) {
if (!TempExp) {
TempExp = PredSU;
TempMFMA = SuccSU;
@@ -1418,7 +1417,7 @@ bool MFMAExpInterleaveOpt::analyzeDAG(const SIInstrInfo *TII) {
continue;
for (auto &PredSU : ExpPipeCands) {
- if (DAG->IsReachable(SuccSU, PredSU)) {
+ if (DAG->IsReachable(SuccSU, PredSU, SDep::Kind::Data)) {
MFMAPipeSUs.push_back(SuccSU);
break;
}
@@ -1432,7 +1431,7 @@ bool MFMAExpInterleaveOpt::analyzeDAG(const SIInstrInfo *TII) {
std::optional<SUnit *> TempCvt;
for (auto &SuccSU : CvtSUs) {
- if (DAG->IsReachable(SuccSU, *TempExp)) {
+ if (DAG->IsReachable(SuccSU, *TempExp, SDep::Kind::Data)) {
TempCvt = SuccSU;
break;
}
@@ -1441,13 +1440,14 @@ bool MFMAExpInterleaveOpt::analyzeDAG(const SIInstrInfo *TII) {
HasCvt = false;
if (TempCvt.has_value()) {
for (auto &SuccSU : MFMAPipeSUs) {
- if (DAG->IsReachable(SuccSU, *TempCvt)) {
+ if (DAG->IsReachable(SuccSU, *TempCvt, SDep::Kind::Data)) {
HasCvt = true;
break;
}
}
}
+ MFMAChainSeeds.clear();
MFMAChains = 0;
for (auto &MFMAPipeSU : MFMAPipeSUs) {
if (is_contained(MFMAChainSeeds, MFMAPipeSU))
@@ -1474,7 +1474,7 @@ bool MFMAExpInterleaveOpt::analyzeDAG(const SIInstrInfo *TII) {
// The number of bit pack operations that depend on a single V_EXP
unsigned PackSuccCount =
llvm::count_if(PackSUs, [this, &TempExp](SUnit *VPack) {
- return DAG->IsReachable(VPack, *TempExp);
+ return DAG->IsReachable(VPack, *TempExp, SDep::Kind::Data);
});
// The number of bit pack operations an MFMA depends on
@@ -1504,10 +1504,10 @@ bool MFMAExpInterleaveOpt::analyzeDAG(const SIInstrInfo *TII) {
MFMAEnablement *= PackSuccCount;
// The number of V_EXPs required to resolve all dependencies for an MFMA
- ExpRequirement =
- llvm::count_if(ExpPipeCands, [this, &PackPred](SUnit *ExpBase) {
- return DAG->IsReachable(PackPred->getSUnit(), ExpBase);
- });
+ ExpRequirement = llvm::count_if(ExpPipeCands, [this,
+ &PackPred](SUnit *ExpBase) {
+ return DAG->IsReachable(PackPred->getSUnit(), ExpBase, SDep::Kind::Data);
+ });
ExpRequirement *= PackPredCount;
return true;
@@ -1518,12 +1518,7 @@ bool MFMAExpInterleaveOpt::shouldApplyStrategy(ScheduleDAGInstrs *DAG,
const GCNSubtarget &ST = DAG->MF.getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
- if (Phase != AMDGPU::SchedulingPhase::PostRA)
- MFMAChainSeeds.clear();
- if (Phase != AMDGPU::SchedulingPhase::PostRA && !analyzeDAG(TII))
- return false;
-
- return true;
+ return analyzeDAG(TII);
}
bool MFMAExpInterleaveOpt::applyIGLPStrategy(
@@ -1550,18 +1545,18 @@ bool MFMAExpInterleaveOpt::applyIGLPStrategy(
unsigned CurrMFMAForTransPosition = 0;
auto incrementTransPosition = [&MFMAChain, &PositionInChain,
- &CurrMFMAForTransPosition]() {
+ &CurrMFMAForTransPosition, this]() {
CurrMFMAForTransPosition += MFMAEnablement;
PositionInChain = (CurrMFMAForTransPosition / MFMAChains);
MFMAChain = CurrMFMAForTransPosition % MFMAChains;
};
- auto getNextTransPositionInChain = [&CurrMFMAForTransPosition]() {
+ auto getNextTransPositionInChain = [&CurrMFMAForTransPosition, this]() {
auto TempMFMAForTrans = CurrMFMAForTransPosition + MFMAEnablement;
return (TempMFMAForTrans / MFMAChains);
};
- auto getNextTransMFMAChain = [&CurrMFMAForTransPosition]() {
+ auto getNextTransMFMAChain = [&CurrMFMAForTransPosition, this]() {
auto TempMFMAForTrans = CurrMFMAForTransPosition + MFMAEnablement;
return TempMFMAForTrans % MFMAChains;
};
@@ -1571,7 +1566,7 @@ bool MFMAExpInterleaveOpt::applyIGLPStrategy(
unsigned PositionInChainForMFMA = 0;
auto incrementMFMAPosition = [&CurrMFMAPosition, &MFMAChainForMFMA,
- &PositionInChainForMFMA]() {
+ &PositionInChainForMFMA, this]() {
++CurrMFMAPosition;
MFMAChainForMFMA = CurrMFMAPosition % MFMAChains;
PositionInChainForMFMA = CurrMFMAPosition / MFMAChains;
@@ -2062,22 +2057,16 @@ class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy {
}
};
-static unsigned DSWCount = 0;
-static unsigned DSWWithPermCount = 0;
-static unsigned DSWWithSharedVMEMCount = 0;
-
bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
AMDGPU::SchedulingPhase Phase) {
unsigned MFMACount = 0;
unsigned DSRCount = 0;
+ unsigned DSWCount = 0;
+ unsigned DSWWithPermCount = 0;
+ unsigned DSWWithSharedVMEMCount = 0;
- bool IsInitial = Phase == AMDGPU::SchedulingPhase::Initial;
-
- assert((!IsInitial || (DSWCount == 0 && DSWWithPermCount == 0 &&
- DSWWithSharedVMEMCount == 0)) &&
- "DSWCounters should be zero in pre-RA scheduling!");
SmallVector<SUnit *, 6> DSWithPerms;
for (auto &SU : DAG->SUnits) {
auto *I = SU.getInstr();
@@ -2086,7 +2075,7 @@ bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
else if (TII->isDS(*I)) {
if (I->mayLoad())
++DSRCount;
- else if (I->mayStore() && IsInitial) {
+ else if (I->mayStore()) {
++DSWCount;
for (auto Pred : SU.Preds) {
if (Pred.getSUnit()->getInstr()->getOpcode() ==
@@ -2099,58 +2088,56 @@ bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
}
}
- if (IsInitial) {
- DSWWithPermCount = DSWithPerms.size();
- auto *I = DSWithPerms.begin();
- auto *E = DSWithPerms.end();
-
- // Get the count of DS_WRITES with V_PERM predecessors which
- // have loop carried dependencies (WAR) on the same VMEM_READs.
- // We consider partial overlap as a miss -- in other words,
- // for a given DS_W, we only consider another DS_W as matching
- // if there is a corresponding (in terms of the VMEM_R it uses) V_PERM pred
- // for every V_PERM pred of this DS_W.
- DenseMap<MachineInstr *, SUnit *> VMEMLookup;
- SmallVector<SUnit *, 6> Counted;
- for (; I != E; I++) {
- SUnit *Cand = nullptr;
- bool MissedAny = false;
- for (auto &Pred : (*I)->Preds) {
- if (Pred.getSUnit()->getInstr()->getOpcode() != AMDGPU::V_PERM_B32_e64)
- continue;
+ DSWWithPermCount = DSWithPerms.size();
+ auto *I = DSWithPerms.begin();
+ auto *E = DSWithPerms.end();
+
+ // Get the count of DS_WRITES with V_PERM predecessors which
+ // have loop carried dependencies (WAR) on the same VMEM_READs.
+ // We consider partial overlap as a miss -- in other words,
+ // for a given DS_W, we only consider another DS_W as matching
+ // if there is a corresponding (in terms of the VMEM_R it uses) V_PERM pred
+ // for every V_PERM pred of this DS_W.
+ DenseMap<MachineInstr *, SUnit *> VMEMLookup;
+ SmallVector<SUnit *, 6> Counted;
+ for (; I != E; I++) {
+ SUnit *Cand = nullptr;
+ bool MissedAny = false;
+ for (auto &Pred : (*I)->Preds) {
+ if (Pred.getSUnit()->getInstr()->getOpcode() != AMDGPU::V_PERM_B32_e64)
+ continue;
- if (Cand && llvm::is_contained(Counted, Cand))
- break;
+ if (Cand && llvm::is_contained(Counted, Cand))
+ break;
- for (auto &Succ : Pred.getSUnit()->Succs) {
- auto *MI = Succ.getSUnit()->getInstr();
- if (!TII->isVMEM(*MI) || !MI->mayLoad())
- continue;
+ for (auto &Succ : Pred.getSUnit()->Succs) {
+ auto *MI = Succ.getSUnit()->getInstr();
+ if (!TII->isVMEM(*MI) || !MI->mayLoad())
+ continue;
- if (MissedAny || !VMEMLookup.size()) {
- MissedAny = true;
- VMEMLookup[MI] = *I;
- continue;
- }
+ if (MissedAny || !VMEMLookup.size()) {
+ MissedAny = true;
+ VMEMLookup[MI] = *I;
+ continue;
+ }
- auto [It, Inserted] = VMEMLookup.try_emplace(MI, *I);
- if (Inserted) {
- MissedAny = true;
- continue;
- }
+ auto [It, Inserted] = VMEMLookup.try_emplace(MI, *I);
+ if (Inserted) {
+ MissedAny = true;
+ continue;
+ }
- Cand = It->second;
- if (llvm::is_contained(Counted, Cand)) {
- MissedAny = true;
- break;
- }
+ Cand = It->second;
+ if (llvm::is_contained(Counted, Cand)) {
+ MissedAny = true;
+ break;
}
}
- if (!MissedAny && Cand) {
- DSWWithSharedVMEMCount += 2;
- Counted.push_back(Cand);
- Counted.push_back(*I);
- }
+ }
+ if (!MissedAny && Cand) {
+ DSWWithSharedVMEMCount += 2;
+ Counted.push_back(Cand);
+ Counted.push_back(*I);
}
}