[llvm] Reenable sched mfma rewrite (PR #180751)

Tue Feb 10 07:15:59 PST 2026

llvmbot wrote:




@llvm/pr-subscribers-backend-amdgpu

Author: Tony Linthicum (tlinthic)

<details>
<summary>Changes</summary>

After performance testing, it was determined that the cost of the large number
of copies inserted outside the loop is more than offset by better register
allocation within the loop as a result of the rewrite.  Additionally, there are
two other fairly minor changes in this commit:

  - Fix for upstream issue #177696.  This fix will no longer be needed once
    we stop doing an early conversion to MFMA form in order to get a proper
    spill cost analysis of the potentially rewritten code.

  - Convert cost logic to double from int64.  This cleans up the logic a bit.


---
Full diff: https://github.com/llvm/llvm-project/pull/180751.diff


2 Files Affected:

- (modified) llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp (+45-39) 
- (modified) llvm/lib/Target/AMDGPU/GCNSchedStrategy.h (+1-1) 


``````````diff

diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 92c09b12c1230..09e24f3074e89 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -101,7 +101,7 @@ static cl::opt<bool> PrintMaxRPRegUsageAfterScheduler(
 
 static cl::opt<bool> DisableRewriteMFMAFormSchedStage(
     "amdgpu-disable-rewrite-mfma-form-sched-stage", cl::Hidden,
-    cl::desc("Disable rewrie mfma rewrite scheduling stage"), cl::init(true));
+    cl::desc("Disable rewrie mfma rewrite scheduling stage"), cl::init(false));
 
 const unsigned ScheduleMetrics::ScaleFactor = 100;
 
@@ -1330,11 +1330,11 @@ bool RewriteMFMAFormStage::initGCNSchedStage() {
   if (!initHeuristics(RewriteCands, CopyForUse, CopyForDef))
     return false;
 
-  int64_t Cost = getRewriteCost(RewriteCands, CopyForUse, CopyForDef);
+  double Cost = getRewriteCost(RewriteCands, CopyForUse, CopyForDef);
 
   // If we haven't found the beneficial conditions, prefer the VGPR form which
   // may result in less cross RC copies.
-  if (Cost > 0)
+  if (Cost > 0.0)
     return false;
 
   return rewrite(RewriteCands);
@@ -2182,7 +2182,6 @@ bool RewriteMFMAFormStage::initHeuristics(
 
       int ReplacementOp = AMDGPU::getMFMASrcCVDstAGPROp(MI.getOpcode());
       assert(ReplacementOp != -1);
-
       RewriteCands.push_back({&MI, MI.getOpcode()});
       MI.setDesc(TII->get(ReplacementOp));
 
@@ -2234,8 +2233,14 @@ bool RewriteMFMAFormStage::initHeuristics(
       const TargetRegisterClass *VGPRRC = DAG.MRI.getRegClass(Dst.getReg());
       const TargetRegisterClass *AGPRRC = SRI->getEquivalentAGPRClass(VGPRRC);
       DAG.MRI.setRegClass(Dst.getReg(), AGPRRC);
-      if (Src2->isReg())
+      if (Src2->isReg()) {
+        // Have to get src types separately since subregs may cause C and D
+        // registers to be different types even though the actual operand is
+        // the same size.
+        AGPRRC = DAG.MRI.getRegClass(Src2->getReg());
+        VGPRRC = SRI->getEquivalentVGPRClass(AGPRRC);
         DAG.MRI.setRegClass(Src2->getReg(), AGPRRC);
+      }
       Changed = true;
     }
   }
@@ -2243,15 +2248,14 @@ bool RewriteMFMAFormStage::initHeuristics(
   return Changed;
 }
 
-int64_t RewriteMFMAFormStage::getRewriteCost(
+double RewriteMFMAFormStage::getRewriteCost(
     const std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands,
     const DenseMap<MachineBasicBlock *, std::set<Register>> &CopyForUse,
     const SmallPtrSetImpl<MachineInstr *> &CopyForDef) {
   MachineBlockFrequencyInfo *MBFI = DAG.MBFI;
 
-  int64_t BestSpillCost = 0;
-  int64_t Cost = 0;
-  uint64_t EntryFreq = MBFI->getEntryFreq().getFrequency();
+  double BestSpillCost = 0.0;
+  double Cost = 0.0;
 
   std::pair<unsigned, unsigned> MaxVectorRegs =
       ST.getMaxNumVectorRegs(MF.getFunction());
@@ -2266,6 +2270,8 @@ int64_t RewriteMFMAFormStage::getRewriteCost(
     GCNRegPressure &PressureBefore = DAG.Pressure[Region];
     unsigned SpillCostBefore = PressureBefore.getVGPRSpills(
         MF, ArchVGPRThreshold, AGPRThreshold, CombinedThreshold);
+    LLVM_DEBUG(dbgs() << "RewriteMFMA: Region " << Region
+                      << " spill cost before: " << SpillCostBefore << "\n");
 
     // For the cases we care about (i.e. ArchVGPR usage is greater than the
     // addressable limit), rewriting alone should bring pressure to manageable
@@ -2274,29 +2280,21 @@ int64_t RewriteMFMAFormStage::getRewriteCost(
     GCNRegPressure PressureAfter = DAG.getRealRegPressure(Region);
     unsigned SpillCostAfter = PressureAfter.getVGPRSpills(
         MF, ArchVGPRThreshold, AGPRThreshold, CombinedThreshold);
+    LLVM_DEBUG(dbgs() << "RewriteMFMA: Region " << Region
+                      << " spill cost after: " << SpillCostAfter << "\n");
 
-    uint64_t BlockFreq =
-        MBFI->getBlockFreq(DAG.Regions[Region].first->getParent())
-            .getFrequency();
-
-    bool RelativeFreqIsDenom = EntryFreq > BlockFreq;
-    uint64_t RelativeFreq = EntryFreq && BlockFreq
-                                ? (RelativeFreqIsDenom ? EntryFreq / BlockFreq
-                                                       : BlockFreq / EntryFreq)
-                                : 1;
+    MachineBasicBlock *MBB = DAG.Regions[Region].first->getParent();
+    double BlockFreq = MBFI->getBlockFreqRelativeToEntryBlock(MBB);
 
     // This assumes perfect spilling / splitting -- using one spill / copy
     // instruction and one restoreFrom / copy for each excess register,
-    int64_t SpillCost = ((int)SpillCostAfter - (int)SpillCostBefore) * 2;
+    double SpillCost = ((double)SpillCostAfter - (double)SpillCostBefore) * 2;
 
     // Also account for the block frequency.
-    if (RelativeFreqIsDenom)
-      SpillCost /= (int64_t)RelativeFreq;
-    else
-      SpillCost *= (int64_t)RelativeFreq;
+    SpillCost *= BlockFreq;
 
     // If we have increased spilling in any block, just bail.
-    if (SpillCost > 0)
+    if (SpillCost > 0.0)
       return SpillCost;
 
     if (SpillCost < BestSpillCost)
@@ -2305,35 +2303,37 @@ int64_t RewriteMFMAFormStage::getRewriteCost(
 
   // Set the cost to the largest decrease in spill cost in order to not double
   // count spill reductions.
+  LLVM_DEBUG(dbgs() << "RewriteMFMA: BestSpillCost: " << BestSpillCost << "\n");
   Cost = BestSpillCost;
-  assert(Cost <= 0);
-
-  unsigned CopyCost = 0;
+  assert(Cost <= 0.0);
 
   // For each CopyForDef, increase the cost by the register size while
   // accounting for block frequency.
+  double DefCopyCost = 0.0;
   for (MachineInstr *DefMI : CopyForDef) {
     Register DefReg = DefMI->getOperand(0).getReg();
-    uint64_t DefFreq =
-        EntryFreq
-            ? MBFI->getBlockFreq(DefMI->getParent()).getFrequency() / EntryFreq
-            : 1;
+    MachineBasicBlock *DefMBB = DefMI->getParent();
+    double DefFreq = MBFI->getBlockFreqRelativeToEntryBlock(DefMBB);
 
     const TargetRegisterClass *RC = DAG.MRI.getRegClass(DefReg);
-    CopyCost += RC->getCopyCost() * DefFreq;
+    DefCopyCost += (double)RC->getCopyCost() * DefFreq;
   }
+  LLVM_DEBUG(dbgs() << "RewriteMFMA: Def copy Costs: " << DefCopyCost << "\n");
 
   // Account for CopyForUse copies in each block that the register is used.
+  double UseCopyCost = 0.0;
   for (auto &[UseBlock, UseRegs] : CopyForUse) {
-    uint64_t UseFreq =
-        EntryFreq ? MBFI->getBlockFreq(UseBlock).getFrequency() / EntryFreq : 1;
+    uint64_t UseFreq = MBFI->getBlockFreqRelativeToEntryBlock(UseBlock);
 
     for (Register UseReg : UseRegs) {
       const TargetRegisterClass *RC = DAG.MRI.getRegClass(UseReg);
-      CopyCost += RC->getCopyCost() * UseFreq;
+      UseCopyCost += (double)RC->getCopyCost() * UseFreq;
     }
   }
 
+  LLVM_DEBUG(dbgs() << "RewriteMFMA: Use copy Costs: " << UseCopyCost << "\n");
+  double CopyCost = UseCopyCost + DefCopyCost;
+
   // Reset the classes that were changed to AGPR for better RB analysis.
   // We must do rewriting after copy-insertion, as some defs of the register
   // may require VGPR.  Additionally, if we bail out and don't perform the
@@ -2343,14 +2343,20 @@ int64_t RewriteMFMAFormStage::getRewriteCost(
     const TargetRegisterClass *AGPRRC =
         DAG.MRI.getRegClass(MI->getOperand(0).getReg());
     const TargetRegisterClass *VGPRRC = SRI->getEquivalentVGPRClass(AGPRRC);
+    DAG.MRI.setRegClass(MI->getOperand(0).getReg(), VGPRRC);
+    MI->setDesc(TII->get(OriginalOpcode));
 
     MachineOperand *Src2 = TII->getNamedOperand(*MI, AMDGPU::OpName::src2);
     assert(Src2);
+    if (!Src2->isReg())
+      continue;
 
-    if (Src2->isReg())
-      DAG.MRI.setRegClass(Src2->getReg(), VGPRRC);
-    DAG.MRI.setRegClass(MI->getOperand(0).getReg(), VGPRRC);
-    MI->setDesc(TII->get(OriginalOpcode));
+    // Have to get src types separately since subregs may cause C and D
+    // registers to be different types even though the actual operand is
+    // the same size.
+    AGPRRC = DAG.MRI.getRegClass(Src2->getReg());
+    VGPRRC = SRI->getEquivalentVGPRClass(AGPRRC);
+    DAG.MRI.setRegClass(Src2->getReg(), VGPRRC);
   }
 
   return Cost + CopyCost;
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index ea97e8e74f41b..59897eab95235 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -442,7 +442,7 @@ class RewriteMFMAFormStage : public GCNSchedStage {
   /// Calculate the rewrite cost and undo the state change (e.g. rewriting) done
   /// in initHeuristics. Uses \p CopyForUse and \p CopyForDef to calculate copy
   /// costs, and \p RewriteCands to undo rewriting.
-  int64_t getRewriteCost(
+  double getRewriteCost(
       const std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands,
       const DenseMap<MachineBasicBlock *, std::set<Register>> &CopyForUse,
       const SmallPtrSetImpl<MachineInstr *> &CopyForDef);

``````````

</details>


https://github.com/llvm/llvm-project/pull/180751