[llvm] [AMDGPU][Scheduler] Refactor ArchVGPR rematerialization during scheduling (PR #125885)

Lucas Ramirez via llvm-commits llvm-commits at lists.llvm.org
Tue Mar 18 07:31:03 PDT 2025


================
@@ -1673,174 +1682,333 @@ bool PreRARematStage::allUsesAvailableAt(const MachineInstr *InstToRemat,
   return true;
 }
 
-void PreRARematStage::collectRematerializableInstructions() {
-  const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(DAG.TRI);
-  for (unsigned I = 0, E = DAG.MRI.getNumVirtRegs(); I != E; ++I) {
-    Register Reg = Register::index2VirtReg(I);
-    if (!DAG.LIS->hasInterval(Reg))
-      continue;
-
-    // TODO: Handle AGPR and SGPR rematerialization
-    if (!SRI->isVGPRClass(DAG.MRI.getRegClass(Reg)) ||
-        !DAG.MRI.hasOneDef(Reg) || !DAG.MRI.hasOneNonDBGUse(Reg))
-      continue;
+bool PreRARematStage::hasExcessVGPRs(const GCNRegPressure &RP,
+                                     unsigned MaxVGPRs,
+                                     unsigned &ExcessArchVGPRs,
+                                     bool &AGPRLimited) {
+  unsigned NumAGPRs = RP.getAGPRNum();
+  if (!ST.hasGFX90AInsts() || !NumAGPRs) {
+    // Non-unified RF. We can only reduce ArchVGPR excess pressure at this
+    // point, but still want to identify when there is AGPR excess pressure.
+    bool HasSpill = false;
+    unsigned NumArchVGPRs = RP.getArchVGPRNum();
+    if (NumArchVGPRs > MaxVGPRs) {
+      ExcessArchVGPRs = NumArchVGPRs - MaxVGPRs;
+      HasSpill = true;
+    }
+    if (NumAGPRs > MaxVGPRs) {
+      ExcessArchVGPRs = NumArchVGPRs;
+      AGPRLimited = true;
+      HasSpill = true;
+    }
+    return HasSpill;
+  }
+  if (RP.getVGPRNum(true) > MaxVGPRs) {
+    // Unified RF. We can only remat ArchVGPRs; AGPR pressure alone may prevent
+    // us from eliminating spilling.
+    unsigned NumArchVGPRs = RP.getArchVGPRNum();
+    if (NumAGPRs >= MaxVGPRs) {
+      AGPRLimited = true;
+      ExcessArchVGPRs = NumArchVGPRs;
+    } else {
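+      // E.g. (illustrative numbers, not from the patch): with MaxVGPRs = 512,
+      // NumAGPRs = 260, and NumArchVGPRs = 300, ArchVGPRs only get the
+      // granule-aligned space AGPRs leave free, so the excess is
+      // 300 - alignDown(512 - 260, 4) = 300 - 252 = 48.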
+      ExcessArchVGPRs = NumArchVGPRs - alignDown(MaxVGPRs - NumAGPRs, 4);
+    }
+    return true;
+  }
+  return false;
+}
 
-    MachineOperand *Op = DAG.MRI.getOneDef(Reg);
-    MachineInstr *Def = Op->getParent();
-    if (Op->getSubReg() != 0 || !isTriviallyReMaterializable(*Def))
-      continue;
+bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
+  const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(DAG.TRI);
 
-    MachineInstr *UseI = &*DAG.MRI.use_instr_nodbg_begin(Reg);
-    if (Def->getParent() == UseI->getParent())
-      continue;
+  REMAT_DEBUG(dbgs() << "Collecting rematerializable instructions in "
+                     << MF.getFunction().getName() << '\n');
+
+  // Maps optimizable regions (i.e., regions at minimum and VGPR-limited
+  // occupancy, or regions with VGPR spilling) to their excess RP.
+  DenseMap<unsigned, unsigned> OptRegions;
+  const Function &F = MF.getFunction();
+  const bool UnifiedRF = ST.hasGFX90AInsts();
+
+  // Adjust workgroup-size-induced occupancy bounds with the
+  // "amdgpu-waves-per-eu" attribute. This should be offloaded to a subtarget
+  // method, but at this point it is unclear how other parts of the codebase
+  // interpret this attribute, and the default behavior produces unexpected
+  // bounds. Here we want to allow users to ask for target occupancies lower
+  // than the default lower bound.
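+  // E.g. (illustrative): with workgroup-size-induced bounds {4, 10} and
+  // "amdgpu-waves-per-eu"="2,8", the adjusted bounds become {2, 8}.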
+  std::pair<unsigned, unsigned> OccBounds =
+      ST.getOccupancyWithWorkGroupSizes(MF);
+  std::pair<unsigned, unsigned> WavesPerEU =
+      AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", {0, 0}, true);
+  if (WavesPerEU.first <= WavesPerEU.second) {
+    if (WavesPerEU.first && WavesPerEU.first <= OccBounds.second)
+      OccBounds.first = WavesPerEU.first;
+    if (WavesPerEU.second)
+      OccBounds.second = std::min(OccBounds.second, WavesPerEU.second);
+  }
 
-    bool HasRematDependency = false;
-    // Check if this instruction uses any registers that are planned to be
-    // rematerialized
-    for (auto &RematEntry : RematerializableInsts) {
-      if (find_if(RematEntry.second,
-                  [&Def](std::pair<MachineInstr *, MachineInstr *> &Remat) {
-                    for (MachineOperand &MO : Def->operands()) {
-                      if (!MO.isReg())
-                        continue;
-                      if (MO.getReg() == Remat.first->getOperand(0).getReg())
-                        return true;
-                    }
-                    return false;
-                  }) != RematEntry.second.end()) {
-        HasRematDependency = true;
-        break;
-      }
+  // We call the "base max functions" directly because the regular versions
+  // would apply the subtarget's logic for combining "amdgpu-waves-per-eu"
+  // with the function's workgroup-size-induced occupancy bounds, producing
+  // unexpected results.
+  const unsigned MaxSGPRsNoSpill = ST.getBaseMaxNumSGPRs(
+      F, OccBounds, ST.getMaxNumPreloadedSGPRs(), ST.getReservedNumSGPRs(F));
+  const unsigned MaxVGPRsNoSpill =
+      ST.getBaseMaxNumVGPRs(F, {ST.getMinNumVGPRs(OccBounds.second),
+                                ST.getMaxNumVGPRs(OccBounds.first)});
+  const unsigned MaxSGPRsIncOcc =
+      ST.getMaxNumSGPRs(DAG.MinOccupancy + 1, false);
+  const unsigned MaxVGPRsIncOcc = ST.getMaxNumVGPRs(DAG.MinOccupancy + 1);
+  IncreaseOccupancy = OccBounds.second > DAG.MinOccupancy;
+
+  auto ClearOptRegionsIf = [&](bool Cond) -> bool {
+    if (Cond) {
+      // We won't try to increase occupancy.
+      IncreaseOccupancy = false;
+      OptRegions.clear();
     }
-    // Do not rematerialize an instruction if it uses an instruction that we
-    // have designated for rematerialization.
-    // FIXME: Allow for rematerialization chains: this requires 1. updating
-    // remat points to account for uses that are rematerialized, and 2. either
-    // rematerializing the candidates in careful ordering, or deferring the MBB
-    // RP walk until the entire chain has been rematerialized.
-    if (HasRematDependency)
-      continue;
-
-    // Similarly, check if the UseI is planned to be remat.
-    for (auto &RematEntry : RematerializableInsts) {
-      if (find_if(RematEntry.second,
-                  [&UseI](std::pair<MachineInstr *, MachineInstr *> &Remat) {
-                    return Remat.first == UseI;
-                  }) != RematEntry.second.end()) {
-        HasRematDependency = true;
-        break;
+    return Cond;
+  };
+
+  // Collect optimizable regions. If there is spilling in any region, we will
+  // just try to reduce ArchVGPR spilling. Otherwise, we will try to increase
+  // occupancy by one across the whole function.
+  for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
+    GCNRegPressure &RP = DAG.Pressure[I];
+    unsigned ExcessRP = 0;
+    unsigned NumSGPRs = RP.getSGPRNum();
+
+    // Check whether SGPR pressure prevents us from eliminating spilling.
+    if (NumSGPRs > MaxSGPRsNoSpill)
+      ClearOptRegionsIf(IncreaseOccupancy);
+
+    bool OccAGPRLimited = false;
+    if (hasExcessVGPRs(RP, MaxVGPRsNoSpill, ExcessRP, OccAGPRLimited)) {
+      ClearOptRegionsIf(IncreaseOccupancy);
+      REMAT_DEBUG({
+        if (ExcessRP) {
+          StringRef RegClass = UnifiedRF ? "VGPRs" : "ArchVGPRs";
+          dbgs() << "Region " << I << " is spilling " << RegClass << ", save "
+                 << ExcessRP << " to eliminate " << RegClass << "-spilling\n";
+        }
+      });
+    } else if (IncreaseOccupancy) {
+      // Check whether SGPR pressure prevents us from increasing occupancy.
+      if (ClearOptRegionsIf(NumSGPRs > MaxSGPRsIncOcc)) {
+        if (DAG.MinOccupancy >= OccBounds.first)
+          return false;
+        continue;
       }
-    }
-
-    if (HasRematDependency)
-      break;
 
-    // We are only collecting defs that are defined in another block and are
-    // live-through or used inside regions at MinOccupancy. This means that the
-    // register must be in the live-in set for the region.
-    bool AddedToRematList = false;
-    for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
-      auto It = DAG.LiveIns[I].find(Reg);
-      if (It != DAG.LiveIns[I].end() && !It->second.none()) {
-        if (DAG.RegionsWithMinOcc[I]) {
-          SlotIndex DefIdx = DAG.LIS->getInstructionIndex(*Def);
-          SlotIndex UseIdx =
-              DAG.LIS->getInstructionIndex(*UseI).getRegSlot(true);
-          if (allUsesAvailableAt(Def, DefIdx, UseIdx)) {
-            RematerializableInsts[I][Def] = UseI;
-            AddedToRematList = true;
-          }
+      if (hasExcessVGPRs(RP, MaxVGPRsIncOcc, ExcessRP, OccAGPRLimited)) {
+        // Check whether AGPR pressure prevents us from increasing occupancy.
+        if (ClearOptRegionsIf(OccAGPRLimited)) {
+          if (DAG.MinOccupancy >= OccBounds.first)
+            return false;
+          continue;
         }
 
-        // Collect regions with rematerializable reg as live-in to avoid
-        // searching later when updating RP.
-        RematDefToLiveInRegions[Def].push_back(I);
+        // Occupancy could be increased by rematerializing ArchVGPRs.
+        REMAT_DEBUG({
+          if (ExcessRP) {
+            StringRef RegClass = UnifiedRF ? "VGPRs" : "ArchVGPRs";
+            dbgs() << "Region " << I << " has min. occupancy: save " << ExcessRP
+                   << " " << RegClass << " to improve occupancy\n";
+          }
+        });
       }
     }
-    if (!AddedToRematList)
-      RematDefToLiveInRegions.erase(Def);
+    if (ExcessRP)
+      OptRegions.insert({I, ExcessRP});
   }
-}
+  if (OptRegions.empty())
+    return false;
 
-bool PreRARematStage::sinkTriviallyRematInsts(const GCNSubtarget &ST,
-                                              const TargetInstrInfo *TII) {
-  // Temporary copies of cached variables we will be modifying and replacing if
-  // sinking succeeds.
-  SmallVector<
-      std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>, 32>
-      NewRegions;
-  DenseMap<unsigned, GCNRPTracker::LiveRegSet> NewLiveIns;
-  DenseMap<unsigned, GCNRegPressure> NewPressure;
-  BitVector NewRescheduleRegions;
-  LiveIntervals *LIS = DAG.LIS;
+  // When we are reducing spilling, the target is the minimum achievable
+  // occupancy implied by workgroup sizes / the "amdgpu-waves-per-eu" attribute.
+  TargetOcc = IncreaseOccupancy ? DAG.MinOccupancy + 1 : OccBounds.first;
+
+  // Accounts for a reduction in RP in an optimizable region. Returns whether we
+  // estimate that we have identified enough rematerialization opportunities to
+  // achieve our goal.
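+  // E.g. (illustrative): rematerializing a register whose live-out mask
+  // covers two 32-bit registers (getNumCoveredRegs == 2) shrinks that
+  // region's excess by 2; the region drops out of OptRegions once its
+  // excess reaches zero.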
+  auto ReduceRPInRegion = [&](auto OptIt, LaneBitmask Mask) -> bool {
+    auto NumRegs = SIRegisterInfo::getNumCoveredRegs(Mask);
+    unsigned I = OptIt->getFirst();
+    unsigned &Excess = OptIt->getSecond();
+    if (NumRegs >= Excess)
+      OptRegions.erase(I);
+    else
+      Excess -= NumRegs;
+    return OptRegions.empty();
+  };
+
+  // We need up-to-date live-out info to query live-out register masks in
+  // regions containing rematerializable instructions.
+  DAG.RegionLiveOuts.buildLiveRegMap();
+
+  // Cache set of registers that are going to be rematerialized.
+  DenseSet<unsigned> RematRegs;
+
+  // Identify rematerializable instructions in the function.
+  for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
+    auto Region = DAG.Regions[I];
+    for (auto MI = Region.first; MI != Region.second; ++MI) {
+      // The instruction must be trivially rematerializable.
+      MachineInstr &DefMI = *MI;
+      if (!isTriviallyReMaterializable(DefMI))
+        continue;
 
-  NewRegions.resize(DAG.Regions.size());
-  NewRescheduleRegions.resize(DAG.Regions.size());
+      // We only support rematerializing virtual VGPRs with one definition.
+      Register Reg = DefMI.getOperand(0).getReg();
+      if (!Reg.isVirtual() || !DAG.LIS->hasInterval(Reg) ||
+          !SRI->isVGPRClass(DAG.MRI.getRegClass(Reg)) ||
+          !DAG.MRI.hasOneDef(Reg))
+        continue;
 
-  // Collect only regions that has a rematerializable def as a live-in.
-  SmallSet<unsigned, 16> ImpactedRegions;
-  for (const auto &It : RematDefToLiveInRegions)
-    ImpactedRegions.insert(It.second.begin(), It.second.end());
+      // We only care to rematerialize the instruction if it has a single
+      // non-debug user in a different block.
+      MachineInstr *UseMI = DAG.MRI.getOneNonDBGUser(Reg);
+      if (!UseMI || DefMI.getParent() == UseMI->getParent())
----------------
lucas-rami wrote:

Yes, we can easily do that now. Rolling back is already "region-based", so this doesn't require any change there.

https://github.com/llvm/llvm-project/pull/125885
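
As a standalone illustration of the region-excess bookkeeping above (a minimal sketch with hypothetical names and made-up numbers, not the patch's code):

#include <cstdio>
#include <unordered_map>

// Sketch of the OptRegions bookkeeping in canIncreaseOccupancyOrReduceSpill:
// each optimizable region maps to its excess register pressure, and every
// rematerialization opportunity found decrements the excess of the regions
// it helps.
using RegionExcessMap = std::unordered_map<unsigned, unsigned>;

// Account for a remat that saves NumRegs registers in a region. Returns true
// once every optimizable region's excess has been covered, i.e. once we
// estimate the remats found so far achieve the goal.
static bool reduceRPInRegion(RegionExcessMap &OptRegions, unsigned Region,
                             unsigned NumRegs) {
  auto It = OptRegions.find(Region);
  if (It != OptRegions.end()) {
    if (NumRegs >= It->second)
      OptRegions.erase(It); // Region no longer has excess pressure.
    else
      It->second -= NumRegs;
  }
  return OptRegions.empty();
}

int main() {
  // Two regions with assumed excess ArchVGPR pressure of 3 and 2.
  RegionExcessMap OptRegions = {{0u, 3u}, {1u, 2u}};
  reduceRPInRegion(OptRegions, 0, 2);             // region 0: excess 3 -> 1
  reduceRPInRegion(OptRegions, 1, 2);             // region 1: erased
  bool Done = reduceRPInRegion(OptRegions, 0, 1); // region 0: erased
  printf("%s\n", Done ? "enough remats found" : "still searching");
  return 0;
}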

