[llvm] [AMDGPU] Add scheduling stage to rewrite MFMA from VGPR to AGPR (PR #149367)

Tue Aug 26 09:18:32 PDT 2025

================
@@ -1642,6 +1755,534 @@ void GCNSchedStage::revertScheduling() {
   DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd);
 }
 
+bool RewriteScheduleStage::isRewriteCandidate(MachineInstr *MI) const {
+
+  if (!static_cast<const SIInstrInfo *>(DAG.TII)->isMAI(*MI))
+    return false;
+  return AMDGPU::getMFMASrcCVDstAGPROp(MI->getOpcode()) != -1;
+}
+
+bool RewriteScheduleStage::initHeuristics(
+    std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands,
+    DenseMap<MachineBasicBlock *, std::set<Register>> &CopyForUse,
+    SmallPtrSetImpl<MachineInstr *> &CopyForDef) {
+  // Prepare for the heuristics
+  for (auto &MBB : MF) {
+    for (auto &MI : MBB) {
+      if (isRewriteCandidate(&MI)) {
+        int ReplacementOp = AMDGPU::getMFMASrcCVDstAGPROp(MI.getOpcode());
+        if (ReplacementOp == -1)
+          continue;
+
+        RewriteCands.push_back({&MI, MI.getOpcode()});
+        MI.setDesc(TII->get(ReplacementOp));
+
+        MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
+        if (Src2->isReg()) {
+          SmallVector<SlotIndex, 8> Src2ReachingDefs;
+          findReachingDefs(*Src2, DAG.LIS, Src2ReachingDefs);
+
+          // For any definition of the src2 register which is non-MFMA, we
+          // insert a copy.
+          for (SlotIndex RDIdx : Src2ReachingDefs) {
+            MachineInstr *RD = DAG.LIS->getInstructionFromIndex(RDIdx);
+            if (!TII->isMAI(*RD))
+              CopyForDef.insert(RD);
+          }
+        }
+
+        MachineOperand &Dst = MI.getOperand(0);
+        SmallVector<MachineOperand *, 8> DstReachingUses;
+
+        findReachingUses(&MI, DAG.LIS, DstReachingUses);
+
+        for (MachineOperand *RUOp : DstReachingUses) {
+          if (TII->isMAI(*RUOp->getParent()))
+            continue;
+
+          // For any user of the result of the MFMA which is not an MFMA, we
+          // insert a copy. For a given register, we will only insert one copy
+          // per user block.
+          CopyForUse[RUOp->getParent()->getParent()].insert(RUOp->getReg());
+
+          SmallVector<SlotIndex, 8> DstUsesReachingDefs;
+          findReachingDefs(*RUOp, DAG.LIS, DstUsesReachingDefs);
+
+          for (auto RDIndex : DstUsesReachingDefs) {
+            MachineInstr *RD = DAG.LIS->getInstructionFromIndex(RDIndex);
+            if (TII->isMAI(*RD))
+              continue;
+
+            // For any definition of the user of the MFMA which is not an MFMA,
+            // we insert a copy. We do this to transform all the reaching defs
+            // of this use to AGPR. By doing this, we can insert a copy from
+            // AGPR to VGPR at the user rather than after the MFMA.
+            CopyForDef.insert(RD);
+          }
+        }
+
+        // Do the rewrite to allow for updated RP calculation.
+        const TargetRegisterClass *VGPRRC = DAG.MRI.getRegClass(Dst.getReg());
+        const TargetRegisterClass *AGPRRC = SRI->getEquivalentAGPRClass(VGPRRC);
+        DAG.MRI.setRegClass(Dst.getReg(), AGPRRC);
+        if (Src2->isReg())
+          DAG.MRI.setRegClass(Src2->getReg(), AGPRRC);
+      }
+    }
+  }
+
+  return true;
+}
+
+int64_t RewriteScheduleStage::getRewriteCost(
+    std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands,
+    DenseMap<MachineBasicBlock *, std::set<Register>> &CopyForUse,
+    SmallPtrSetImpl<MachineInstr *> &CopyForDef) {
+  MBFI.calculate(MF, MBPI, *DAG.MLI);
+  int64_t BestSpillCost = 0;
+  int64_t Cost = 0;
+
+  for (unsigned Region = 0; Region < DAG.Regions.size(); Region++) {
+    if (!RegionsWithExcessArchVGPR[Region])
+      continue;
+
+    auto PressureBefore = DAG.Pressure[Region];
+    unsigned SpillCostBefore = PressureBefore.getVGPRSpills(ST, MF);
+
+    // For the cases we care about (i.e. ArchVGPR usage is greater than the
+    // addressable limit), rewriting alone should bring pressure to manageable
+    // level. If we find any such region, then the rewrite is potentially
+    // beneficial.
+    auto PressureAfter = DAG.getRealRegPressure(Region);
+    unsigned SpillCostAfter = PressureAfter.getVGPRSpills(ST, MF);
+
+    uint64_t EntryFreq = MBFI.getEntryFreq().getFrequency();
+    uint64_t BlockFreq =
+        MBFI.getBlockFreq(DAG.Regions[Region].first->getParent())
+            .getFrequency();
+
+    bool RelativeFreqIsDenom = EntryFreq > BlockFreq;
+    uint64_t RelativeFreq = EntryFreq && BlockFreq
+                                ? (RelativeFreqIsDenom ? EntryFreq / BlockFreq
+                                                       : BlockFreq / EntryFreq)
+                                : 1;
+
+    // This assumes perfect spilling / splitting -- using one spill / copy
+    // instruction and one restoreFrom / copy for each excess register,
+    int64_t SpillCost = ((int)SpillCostAfter - (int)SpillCostBefore) * 2;
+
+    // Also account for the block frequency.
+    if (RelativeFreqIsDenom)
+      SpillCost /= (int64_t)RelativeFreq;
+    else
+      SpillCost *= (int64_t)RelativeFreq;
+
+    // If we have increased spilling in any block, just bail.
+    if (SpillCost > 0)
+      return SpillCost;
+
+    if (SpillCost < BestSpillCost)
+      BestSpillCost = SpillCost;
+  }
+
+  // Set the cost to the largest decrease in spill cost in order to not double
+  // count spill reductions.
+  Cost = BestSpillCost;
+
+  assert(Cost <= 0);
+
+  unsigned CopyCost = 0;
+
+  uint64_t EntryFreq = MBFI.getEntryFreq().getFrequency();
+
+  // For each CopyForDef, increase the cost by the register size while
+  // accounting for block frequency.
+  for (auto *DefMI : CopyForDef) {
+    auto DefReg = DefMI->getOperand(0).getReg();
+    uint64_t DefFreq =
+        EntryFreq
+            ? MBFI.getBlockFreq(DefMI->getParent()).getFrequency() / EntryFreq
+            : 1;
+
+    unsigned RegSize = DAG.TRI->getRegSizeInBits(*DAG.MRI.getRegClass(DefReg));
+    unsigned NumRegs = std::max(RegSize / 32, (unsigned)1);
+    CopyCost += NumRegs * DefFreq;
+  }
+
+  // Account for CopyForUse copies in each block that the register is used.
+  for (auto &UseEntry : CopyForUse) {
+    uint64_t UseFreq =
+        EntryFreq ? MBFI.getBlockFreq(UseEntry.first).getFrequency() / EntryFreq
+                  : 1;
+
+    for (auto UseReg : UseEntry.second) {
+      unsigned RegSize =
+          DAG.TRI->getRegSizeInBits(*DAG.MRI.getRegClass(UseReg));
+      unsigned NumRegs = std::max(RegSize / 32, (unsigned)1);
+      CopyCost += NumRegs * UseFreq;
+    }
+  }
+
+  Cost += CopyCost;
+
+  // Reset to the vgpr form. We must do rewriting after copy-insertion, as some
+  // defs of the register may require VGPR.
+  for (auto RI : RewriteCands) {
+    MachineInstr *MI = RI.first;
+
+    assert(TII->isMAI(*MI));
+    const TargetRegisterClass *AGPRRC =
+        DAG.MRI.getRegClass(MI->getOperand(0).getReg());
+    const TargetRegisterClass *VGPRRC = SRI->getEquivalentVGPRClass(AGPRRC);
+
+    MachineOperand *Src2 = TII->getNamedOperand(*MI, AMDGPU::OpName::src2);
+    assert(Src2);
+
+    if (Src2->isReg()) {
+      DAG.MRI.setRegClass(Src2->getReg(), VGPRRC);
+    }
+    DAG.MRI.setRegClass(MI->getOperand(0).getReg(), VGPRRC);
+    MI->setDesc(TII->get(RI.second));
+  }
+
+  return Cost;
+}
+
+bool RewriteScheduleStage::rewrite(
+    std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands) {
+  DenseMap<MachineInstr *, unsigned> FirstMIToRegion;
+  DenseMap<MachineInstr *, unsigned> LastMIToRegion;
+
+  for (unsigned Region = 0; Region < DAG.Regions.size(); Region++) {
+    auto Entry = DAG.Regions[Region];
+    if (Entry.first == Entry.second)
+      continue;
+
+    FirstMIToRegion[&*Entry.first] = Region;
+    if (Entry.second != Entry.first->getParent()->end())
+      LastMIToRegion[&*Entry.second] = Region;
+  }
+
+  // Rewrite the MFMAs to AGPR, and insert any copies as needed.
+  // The general assumption of the algorithm (and the previous cost calculation)
+  // is that it is better to insert the copies in the MBB of the def of the src2
+  // operands, and in the MBB of the user of the dest operands. This is based on
+  // the assumption that the MFMAs are likely to appear in loop bodies, while
+  // the src2 and dest operands are live-in / live-out of the loop. Due to this
+  // design, the algorithm for finding copy insertion points is more
+  // complicated.
+  //
+  // There are three main cases to handle: 1. the reaching defs of the src2
+  // operands, 2. the reaching uses of the dst operands, and 3. the reaching
+  // defs of the reaching uses of the dst operand.
+  //
+  // In the first case, we simply insert copies after each of the reaching
+  // definitions. In the second case, we collect all the uses of a given dest
+  // and organize them by MBB. Then, we insert 1 copy for each MBB before the
+  // earliest use. Since the use may have multiple reaching defs, and since we
+  // want to replace the register it is using with the result of the copy, we
+  // must handle case 3. In the third case, we simply insert a copy after each
+  // of the reaching defs to connect to the copy of the reaching uses of the dst
+  // reg. This allows us to avoid inserting copies next to the' MFMAs.
----------------
lucas-rami wrote:

```suggestion
  // reg. This allows us to avoid inserting copies next to the MFMAs.
```

https://github.com/llvm/llvm-project/pull/149367