[llvm] AMDGPU: Delete FillMFMAShadowMutation (PR #123861)

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Wed Jan 22 05:47:00 PST 2025


https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/123861

From 6329b8198fa1334af6ec05a66d8d3393bbad9b63 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Tue, 21 Jan 2025 14:48:33 +0700
Subject: [PATCH 1/3] AMDGPU: Delete FillMFMAShadowMutation

No test changes with this removed, and it appears
to be obsolete.
---
 .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp |   1 -
 llvm/lib/Target/AMDGPU/GCNSubtarget.cpp       | 111 ------------------
 llvm/lib/Target/AMDGPU/GCNSubtarget.h         |   7 --
 3 files changed, 119 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 3fe17457cb3606..4751aab5f826d3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -1074,7 +1074,6 @@ class GCNPassConfig final : public AMDGPUPassConfig {
     DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
     if (ST.shouldClusterStores())
       DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
-    DAG->addMutation(ST.createFillMFMAShadowMutation(DAG->TII));
     DAG->addMutation(
         createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PostRA));
     if (isPassEnabled(EnableVOPD, CodeGenOptLevel::Less))
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
index 117afc4a8e8c60..75baf52e427a8a 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
@@ -586,117 +586,6 @@ void GCNSubtarget::adjustSchedDependency(
   }
 }
 
-namespace {
-struct FillMFMAShadowMutation : ScheduleDAGMutation {
-  const SIInstrInfo *TII;
-
-  ScheduleDAGMI *DAG;
-
-  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}
-
-  bool isSALU(const SUnit *SU) const {
-    const MachineInstr *MI = SU->getInstr();
-    return MI && TII->isSALU(*MI) && !MI->isTerminator();
-  }
-
-  bool isVALU(const SUnit *SU) const {
-    const MachineInstr *MI = SU->getInstr();
-    return MI && TII->isVALU(*MI);
-  }
-
-  // Link as many SALU instructions in chain as possible. Return the size
-  // of the chain. Links up to MaxChain instructions.
-  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
-                         SmallPtrSetImpl<SUnit *> &Visited) const {
-    SmallVector<SUnit *, 8> Worklist({To});
-    unsigned Linked = 0;
-
-    while (!Worklist.empty() && MaxChain-- > 0) {
-      SUnit *SU = Worklist.pop_back_val();
-      if (!Visited.insert(SU).second)
-        continue;
-
-      LLVM_DEBUG(dbgs() << "Inserting edge from\n"; DAG->dumpNode(*From);
-                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');
-
-      if (SU != From && From != &DAG->ExitSU && DAG->canAddEdge(SU, From))
-        if (DAG->addEdge(SU, SDep(From, SDep::Artificial)))
-          ++Linked;
-
-      for (SDep &SI : From->Succs) {
-        SUnit *SUv = SI.getSUnit();
-        if (SUv != From && SU != &DAG->ExitSU && isVALU(SUv) &&
-            DAG->canAddEdge(SUv, SU))
-          DAG->addEdge(SUv, SDep(SU, SDep::Artificial));
-      }
-
-      for (SDep &SI : SU->Succs) {
-        SUnit *Succ = SI.getSUnit();
-        if (Succ != SU && isSALU(Succ))
-          Worklist.push_back(Succ);
-      }
-    }
-
-    return Linked;
-  }
-
-  void apply(ScheduleDAGInstrs *DAGInstrs) override {
-    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
-    if (!ST.hasMAIInsts())
-      return;
-    DAG = static_cast<ScheduleDAGMI *>(DAGInstrs);
-    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
-    if (!TSchedModel || DAG->SUnits.empty())
-      return;
-
-    // Scan for MFMA long latency instructions and try to add a dependency
-    // of available SALU instructions to give them a chance to fill MFMA
-    // shadow. That is desirable to fill MFMA shadow with SALU instructions
-    // rather than VALU to prevent power consumption bursts and throttle.
-    auto LastSALU = DAG->SUnits.begin();
-    auto E = DAG->SUnits.end();
-    SmallPtrSet<SUnit *, 32> Visited;
-    for (SUnit &SU : DAG->SUnits) {
-      MachineInstr &MAI = *SU.getInstr();
-      if (!TII->isMAI(MAI) ||
-          MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
-          MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64)
-        continue;
-
-      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;
-
-      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
-                 dbgs() << "Need " << Lat
-                        << " instructions to cover latency.\n");
-
-      // Find up to Lat independent scalar instructions as early as
-      // possible such that they can be scheduled after this MFMA.
-      for (; Lat && LastSALU != E; ++LastSALU) {
-        if (Visited.count(&*LastSALU))
-          continue;
-
-        if (&SU == &DAG->ExitSU || &SU == &*LastSALU || !isSALU(&*LastSALU) ||
-            !DAG->canAddEdge(&*LastSALU, &SU))
-          continue;
-
-        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
-      }
-    }
-  }
-};
-} // namespace
-
-void GCNSubtarget::getPostRAMutations(
-    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
-  Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
-}
-
-std::unique_ptr<ScheduleDAGMutation>
-GCNSubtarget::createFillMFMAShadowMutation(const TargetInstrInfo *TII) const {
-  return EnablePowerSched ? std::make_unique<FillMFMAShadowMutation>(&InstrInfo)
-                          : nullptr;
-}
-
 unsigned GCNSubtarget::getNSAThreshold(const MachineFunction &MF) const {
   if (getGeneration() >= AMDGPUSubtarget::GFX12)
     return 0; // Not MIMG encoding.
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 3388bc3c5a8de1..e0b0000f757faf 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1575,13 +1575,6 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   /// unit requirement.
   unsigned getMaxNumVGPRs(const MachineFunction &MF) const;
 
-  void getPostRAMutations(
-      std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations)
-      const override;
-
-  std::unique_ptr<ScheduleDAGMutation>
-  createFillMFMAShadowMutation(const TargetInstrInfo *TII) const;
-
   bool isWave32() const {
     return getWavefrontSize() == 32;
   }

From 810d7f9b16559594a813f2fbb8ba06c8b096fa80 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Wed, 22 Jan 2025 09:38:59 +0700
Subject: [PATCH 2/3] Remove EnablePowerSched

---
 llvm/lib/Target/AMDGPU/GCNSubtarget.cpp | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
index 75baf52e427a8a..413c2884c034ea 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
@@ -38,11 +38,6 @@ using namespace llvm;
 #include "AMDGPUGenSubtargetInfo.inc"
 #undef AMDGPUSubtarget
 
-static cl::opt<bool>
-    EnablePowerSched("amdgpu-enable-power-sched",
-                     cl::desc("Enable scheduling to minimize mAI power bursts"),
-                     cl::init(false));
-
 static cl::opt<bool> EnableVGPRIndexMode(
     "amdgpu-vgpr-index-mode",
     cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),

From 4c32b5c7b4647287af7439ef6d1d4c1bfedfba36 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Wed, 22 Jan 2025 20:46:29 +0700
Subject: [PATCH 3/3] Delete test for the flag

---
 .../CodeGen/AMDGPU/power-sched-no-cycle.mir   | 26 -------------------
 1 file changed, 26 deletions(-)
 delete mode 100644 llvm/test/CodeGen/AMDGPU/power-sched-no-cycle.mir

diff --git a/llvm/test/CodeGen/AMDGPU/power-sched-no-cycle.mir b/llvm/test/CodeGen/AMDGPU/power-sched-no-cycle.mir
deleted file mode 100644
index 2c1880c88631ec..00000000000000
--- a/llvm/test/CodeGen/AMDGPU/power-sched-no-cycle.mir
+++ /dev/null
@@ -1,26 +0,0 @@
-# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -mcpu=gfx90a -run-pass=postmisched %s -o - -amdgpu-enable-power-sched=true  2>&1 | FileCheck %s
-# This test represents a pattern which caused power-sched to introduce cycles into the scheduling graph. The fact that this test does not crash indicates it has completed successfully.
-
----
-name:            power_sched_cycle_condition
-tracksRegLiveness: true
-body:             |
-  bb.0:
-    liveins: $vgpr1, $sgpr2_sgpr3, $sgpr1, $vgpr72_vgpr73, $vgpr2_vgpr3, $sgpr22, $sgpr6_sgpr7, $sgpr10_sgpr11
-    ; CHECK-LABEL: name: power_sched_cycle_condition
-    ; CHECK: liveins: $vgpr1, $sgpr2_sgpr3, $sgpr1, $vgpr72_vgpr73, $vgpr2_vgpr3, $sgpr22, $sgpr6_sgpr7, $sgpr10_sgpr11
-    ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: $sgpr1 = S_LOAD_DWORD_IMM $sgpr2_sgpr3, 56, 0 :: (dereferenceable invariant load (s64), addrspace 4)
-    ; CHECK-NEXT: $sgpr4 = S_LSHL_B32 killed $sgpr22, 1, implicit-def dead $scc
-    ; CHECK-NEXT: $sgpr22_sgpr23 = S_LOAD_DWORDX2_IMM killed $sgpr2_sgpr3, 36, 0 :: (dereferenceable invariant load (s64), addrspace 4)
-    ; CHECK-NEXT: $vgpr2 = nsw V_MUL_LO_U32_e64 killed $vgpr1, killed $sgpr1, implicit $exec
-    ; CHECK-NEXT: early-clobber $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21 = V_MFMA_F32_32X32X8F16_vgprcd_e64 killed $vgpr2_vgpr3, $vgpr72_vgpr73, 0, 0, 0, 0, implicit $mode, implicit $exec
-    $sgpr1 = S_LOAD_DWORD_IMM $sgpr2_sgpr3, 56, 0 :: (dereferenceable invariant load (s64), addrspace 4)
-    $vgpr2 = nsw V_MUL_LO_U32_e64 $vgpr1, $sgpr1, implicit $exec
-    $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21 =  V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr2_vgpr3, $vgpr72_vgpr73, 0, 0, 0, 0, implicit $mode, implicit $exec
-    $sgpr4 = S_LSHL_B32 $sgpr22, 1, implicit-def dead $scc
-    $sgpr22_sgpr23 = S_LOAD_DWORDX2_IMM $sgpr2_sgpr3, 36, 0 :: (dereferenceable invariant load (s64), addrspace 4)
-...
-
-


