[llvm] [AMDGPU] Add scheduling stage to rewrite MFMA from VGPR to AGPR (PR #149367)
Lucas Ramirez via llvm-commits
llvm-commits at lists.llvm.org
Tue Aug 26 09:18:32 PDT 2025
================
@@ -1642,6 +1755,534 @@ void GCNSchedStage::revertScheduling() {
DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd);
}
+/// Returns true when \p MI is an MAI instruction for which
+/// AMDGPU::getMFMASrcCVDstAGPROp reports a replacement opcode (i.e. the
+/// lookup does not yield -1).
+bool RewriteScheduleStage::isRewriteCandidate(MachineInstr *MI) const {
+  const auto *SII = static_cast<const SIInstrInfo *>(DAG.TII);
+  return SII->isMAI(*MI) &&
+         AMDGPU::getMFMASrcCVDstAGPROp(MI->getOpcode()) != -1;
+}
+
+/// Collects the inputs for the rewrite cost heuristic and provisionally
+/// switches every rewrite candidate to its replacement opcode with AGPR
+/// register classes, so that register pressure can be recomputed on the
+/// rewritten form.
+///
+/// \param RewriteCands receives each candidate paired with its original
+///        opcode, which is what allows the provisional rewrite to be undone.
+/// \param CopyForUse receives, per basic block, the registers read by non-MAI
+///        users of a candidate's result.
+/// \param CopyForDef receives the non-MAI instructions that define either a
+///        candidate's src2 or a register read by a non-MAI user of a
+///        candidate's result.
+/// \returns true unconditionally.
+bool RewriteScheduleStage::initHeuristics(
+    std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands,
+    DenseMap<MachineBasicBlock *, std::set<Register>> &CopyForUse,
+    SmallPtrSetImpl<MachineInstr *> &CopyForDef) {
+  for (MachineBasicBlock &MBB : MF) {
+    for (MachineInstr &MI : MBB) {
+      if (!isRewriteCandidate(&MI))
+        continue;
+
+      // isRewriteCandidate() has just performed this same lookup and seen a
+      // valid opcode, so a miss here would be a logic error rather than a
+      // case to skip.
+      const int ReplacementOp = AMDGPU::getMFMASrcCVDstAGPROp(MI.getOpcode());
+      assert(ReplacementOp != -1 && "rewrite candidate without AGPR form");
+
+      // Remember the original opcode before switching the descriptor.
+      RewriteCands.push_back({&MI, MI.getOpcode()});
+      MI.setDesc(TII->get(ReplacementOp));
+
+      MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
+      assert(Src2 && "rewrite candidate has no src2 operand");
+      const bool Src2IsReg = Src2->isReg();
+
+      if (Src2IsReg) {
+        SmallVector<SlotIndex, 8> Src2ReachingDefs;
+        findReachingDefs(*Src2, DAG.LIS, Src2ReachingDefs);
+
+        // For any definition of the src2 register which is non-MFMA, we
+        // insert a copy.
+        // NOTE(review): assumes every reaching-def index maps to an
+        // instruction (RD is dereferenced unchecked) -- confirm against
+        // findReachingDefs.
+        for (SlotIndex RDIdx : Src2ReachingDefs) {
+          MachineInstr *RD = DAG.LIS->getInstructionFromIndex(RDIdx);
+          if (!TII->isMAI(*RD))
+            CopyForDef.insert(RD);
+        }
+      }
+
+      MachineOperand &Dst = MI.getOperand(0);
+      SmallVector<MachineOperand *, 8> DstReachingUses;
+      findReachingUses(&MI, DAG.LIS, DstReachingUses);
+
+      for (MachineOperand *RUOp : DstReachingUses) {
+        MachineInstr *UseMI = RUOp->getParent();
+        if (TII->isMAI(*UseMI))
+          continue;
+
+        // For any user of the result of the MFMA which is not an MFMA, we
+        // insert a copy. For a given register, we will only insert one copy
+        // per user block.
+        CopyForUse[UseMI->getParent()].insert(RUOp->getReg());
+
+        SmallVector<SlotIndex, 8> DstUsesReachingDefs;
+        findReachingDefs(*RUOp, DAG.LIS, DstUsesReachingDefs);
+
+        for (SlotIndex RDIdx : DstUsesReachingDefs) {
+          MachineInstr *RD = DAG.LIS->getInstructionFromIndex(RDIdx);
+          if (TII->isMAI(*RD))
+            continue;
+
+          // For any definition of the user of the MFMA which is not an MFMA,
+          // we insert a copy. We do this to transform all the reaching defs
+          // of this use to AGPR. By doing this, we can insert a copy from
+          // AGPR to VGPR at the user rather than after the MFMA.
+          CopyForDef.insert(RD);
+        }
+      }
+
+      // Do the rewrite to allow for updated RP calculation. This happens
+      // only after the reaching defs / uses above have been collected.
+      const TargetRegisterClass *VGPRRC = DAG.MRI.getRegClass(Dst.getReg());
+      const TargetRegisterClass *AGPRRC = SRI->getEquivalentAGPRClass(VGPRRC);
+      DAG.MRI.setRegClass(Dst.getReg(), AGPRRC);
+      if (Src2IsReg)
+        DAG.MRI.setRegClass(Src2->getReg(), AGPRRC);
+    }
+  }
+
+  return true;
+}
+
+/// Estimates the net cost of rewriting the candidates, then puts every
+/// candidate back into its original (VGPR) form.
+///
+/// The result is the largest frequency-weighted reduction in VGPR spill cost
+/// over the regions flagged in RegionsWithExcessArchVGPR, plus the
+/// frequency-weighted number of 32-bit registers that would have to be copied
+/// for CopyForDef and CopyForUse. If any flagged region would spill more
+/// after the rewrite, that region's positive spill cost is returned
+/// immediately.
+///
+/// \param RewriteCands candidates paired with their original opcode.
+/// \param CopyForUse per-block registers that need a copy before their use.
+/// \param CopyForDef defining instructions that need a copy after them.
+/// \returns the estimated cost; a value <= 0 means the rewrite does not look
+///          harmful by this estimate.
+int64_t RewriteScheduleStage::getRewriteCost(
+    std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands,
+    DenseMap<MachineBasicBlock *, std::set<Register>> &CopyForUse,
+    SmallPtrSetImpl<MachineInstr *> &CopyForDef) {
+  // Reset to the vgpr form. We must do rewriting after copy-insertion, as some
+  // defs of the register may require VGPR. This has to run on every exit path,
+  // including the early bail-out below; otherwise the candidates would be left
+  // with the replacement opcode and AGPR register classes.
+  auto RestoreVGPRForm = [&]() {
+    for (const auto &[MI, OrigOpcode] : RewriteCands) {
+      assert(TII->isMAI(*MI));
+      const Register DstReg = MI->getOperand(0).getReg();
+      const TargetRegisterClass *AGPRRC = DAG.MRI.getRegClass(DstReg);
+      const TargetRegisterClass *VGPRRC = SRI->getEquivalentVGPRClass(AGPRRC);
+
+      MachineOperand *Src2 = TII->getNamedOperand(*MI, AMDGPU::OpName::src2);
+      assert(Src2);
+
+      if (Src2->isReg())
+        DAG.MRI.setRegClass(Src2->getReg(), VGPRRC);
+      DAG.MRI.setRegClass(DstReg, VGPRRC);
+      MI->setDesc(TII->get(OrigOpcode));
+    }
+  };
+
+  MBFI.calculate(MF, MBPI, *DAG.MLI);
+  const uint64_t EntryFreq = MBFI.getEntryFreq().getFrequency();
+
+  // Block frequency relative to the entry block (integer division); 1 when
+  // the entry frequency is zero.
+  auto RelativeFreqOf = [&](const MachineBasicBlock *MBB) -> uint64_t {
+    return EntryFreq ? MBFI.getBlockFreq(MBB).getFrequency() / EntryFreq : 1;
+  };
+
+  // Number of 32-bit units covered by the register class of Reg, at least 1.
+  auto NumRegsOf = [&](Register Reg) -> unsigned {
+    const unsigned RegSize =
+        DAG.TRI->getRegSizeInBits(*DAG.MRI.getRegClass(Reg));
+    return std::max(RegSize / 32, 1u);
+  };
+
+  int64_t BestSpillCost = 0;
+
+  for (unsigned Region = 0; Region < DAG.Regions.size(); ++Region) {
+    if (!RegionsWithExcessArchVGPR[Region])
+      continue;
+
+    auto PressureBefore = DAG.Pressure[Region];
+    const unsigned SpillCostBefore = PressureBefore.getVGPRSpills(ST, MF);
+
+    // For the cases we care about (i.e. ArchVGPR usage is greater than the
+    // addressable limit), rewriting alone should bring pressure to manageable
+    // level. If we find any such region, then the rewrite is potentially
+    // beneficial.
+    auto PressureAfter = DAG.getRealRegPressure(Region);
+    const unsigned SpillCostAfter = PressureAfter.getVGPRSpills(ST, MF);
+
+    const uint64_t BlockFreq =
+        MBFI.getBlockFreq(DAG.Regions[Region].first->getParent())
+            .getFrequency();
+
+    // The ratio is kept as an integer >= 1 together with a flag saying
+    // whether it divides or multiplies the spill cost.
+    const bool RelativeFreqIsDenom = EntryFreq > BlockFreq;
+    uint64_t RelativeFreq = 1;
+    if (EntryFreq && BlockFreq)
+      RelativeFreq = RelativeFreqIsDenom ? EntryFreq / BlockFreq
+                                         : BlockFreq / EntryFreq;
+
+    // This assumes perfect spilling / splitting -- using one spill / copy
+    // instruction and one restore / copy for each excess register.
+    int64_t SpillCost = (static_cast<int64_t>(SpillCostAfter) -
+                         static_cast<int64_t>(SpillCostBefore)) *
+                        2;
+
+    // Also account for the block frequency.
+    if (RelativeFreqIsDenom)
+      SpillCost /= static_cast<int64_t>(RelativeFreq);
+    else
+      SpillCost *= static_cast<int64_t>(RelativeFreq);
+
+    // If we have increased spilling in any block, just bail.
+    if (SpillCost > 0) {
+      RestoreVGPRForm();
+      return SpillCost;
+    }
+
+    BestSpillCost = std::min(BestSpillCost, SpillCost);
+  }
+
+  // Set the cost to the largest decrease in spill cost in order to not double
+  // count spill reductions.
+  int64_t Cost = BestSpillCost;
+  assert(Cost <= 0);
+
+  // Accumulate in 64 bits: the per-copy term is a product with a 64-bit
+  // frequency ratio.
+  uint64_t CopyCost = 0;
+
+  // For each CopyForDef, increase the cost by the register size while
+  // accounting for block frequency.
+  for (MachineInstr *DefMI : CopyForDef) {
+    const Register DefReg = DefMI->getOperand(0).getReg();
+    CopyCost += NumRegsOf(DefReg) * RelativeFreqOf(DefMI->getParent());
+  }
+
+  // Account for CopyForUse copies in each block that the register is used.
+  for (auto &UseEntry : CopyForUse) {
+    const uint64_t UseFreq = RelativeFreqOf(UseEntry.first);
+    for (Register UseReg : UseEntry.second)
+      CopyCost += NumRegsOf(UseReg) * UseFreq;
+  }
+
+  Cost += static_cast<int64_t>(CopyCost);
+
+  RestoreVGPRForm();
+  return Cost;
+}
+
+bool RewriteScheduleStage::rewrite(
+ std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands) {
+ DenseMap<MachineInstr *, unsigned> FirstMIToRegion;
+ DenseMap<MachineInstr *, unsigned> LastMIToRegion;
+
+ for (unsigned Region = 0; Region < DAG.Regions.size(); Region++) {
+ auto Entry = DAG.Regions[Region];
+ if (Entry.first == Entry.second)
+ continue;
+
+ FirstMIToRegion[&*Entry.first] = Region;
+ if (Entry.second != Entry.first->getParent()->end())
+ LastMIToRegion[&*Entry.second] = Region;
+ }
+
+ // Rewrite the MFMAs to AGPR, and insert any copies as needed.
+ // The general assumption of the algorithm (and the previous cost calculation)
+ // is that it is better to insert the copies in the MBB of the def of the src2
+ // operands, and in the MBB of the user of the dest operands. This is based on
+ // the assumption that the MFMAs are likely to appear in loop bodies, while
+ // the src2 and dest operands are live-in / live-out of the loop. Due to this
+ // design, the algorithm for finding copy insertion points is more
+ // complicated.
+ //
+ // There are three main cases to handle: 1. the reaching defs of the src2
+ // operands, 2. the reaching uses of the dst operands, and 3. the reaching
+ // defs of the reaching uses of the dst operand.
+ //
+ // In the first case, we simply insert copies after each of the reaching
+ // definitions. In the second case, we collect all the uses of a given dest
+ // and organize them by MBB. Then, we insert 1 copy for each MBB before the
+ // earliest use. Since the use may have multiple reaching defs, and since we
+ // want to replace the register it is using with the result of the copy, we
+ // must handle case 3. In the third case, we simply insert a copy after each
+ // of the reaching defs to connect to the copy of the reaching uses of the dst
+ // reg. This allows us to avoid inserting copies next to the' MFMAs.
----------------
lucas-rami wrote:
```suggestion
// reg. This allows us to avoid inserting copies next to the MFMAs.
```
https://github.com/llvm/llvm-project/pull/149367
More information about the llvm-commits
mailing list