[llvm] [AMDGPU][Scheduler] Refactor VGPR rematerialization during scheduling (PR #118722)

Fri Jan 31 10:56:22 PST 2025

================
@@ -1467,284 +1488,460 @@ void GCNSchedStage::revertScheduling() {
   DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd);
 }
 
-void PreRARematStage::collectRematerializableInstructions() {
+bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
   const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(DAG.TRI);
-  for (unsigned I = 0, E = DAG.MRI.getNumVirtRegs(); I != E; ++I) {
-    Register Reg = Register::index2VirtReg(I);
-    if (!DAG.LIS->hasInterval(Reg))
-      continue;
-
-    // TODO: Handle AGPR and SGPR rematerialization
-    if (!SRI->isVGPRClass(DAG.MRI.getRegClass(Reg)) ||
-        !DAG.MRI.hasOneDef(Reg) || !DAG.MRI.hasOneNonDBGUse(Reg))
-      continue;
 
-    MachineOperand *Op = DAG.MRI.getOneDef(Reg);
-    MachineInstr *Def = Op->getParent();
-    if (Op->getSubReg() != 0 || !isTriviallyReMaterializable(*Def))
-      continue;
-
-    MachineInstr *UseI = &*DAG.MRI.use_instr_nodbg_begin(Reg);
-    if (Def->getParent() == UseI->getParent())
+  RA_DEBUG(dbgs() << "Collecting rematerializable instructions in "
+                  << MF.getFunction().getName() << '\n');
+
+  // Models excess register pressure in a region.
+  struct ExcessRP {
+    // Amount of VGPR spilling, if any.
+    unsigned Spilling = 0;
+    // Number of VGPRs to save to increase occupancy. std::nullopt when
+    // occupancy is SGPR-limited.
+    std::optional<unsigned> Occupancy = std::nullopt;
+  };
+  // Maps optimizable regions (i.e., regions at minimum and VGPR-limited
+  // occupancy, or regions with VGPR spilling) to their excess RP.
+  DenseMap<unsigned, ExcessRP> OptRegions;
+
+  // Collect optimizable regions.
+  unsigned NumRegionsSpilling = 0;
+  bool CanIncreaseOccupancy = true;
+  for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
+    if (!DAG.RegionsWithMinOcc[I])
       continue;
+    GCNRegPressure &RP = DAG.Pressure[I];
+
+    unsigned NumVGPRs = RP.getVGPRNum(ST.hasGFX90AInsts());
+    ExcessRP Excess;
+    if ((Excess.Spilling = ST.getNumVGPRsToEliminateSpilling(NumVGPRs))) {
+      // Region has VGPR spilling, we may not be able to increase occupancy but
+      // we can at least try to reduce spilling as much as possible.
+      ++NumRegionsSpilling;
+      RA_DEBUG(dbgs() << "Region " << I << " is spilling VGPRs, save "
+                      << Excess.Spilling << " VGPR(s) to eliminate spilling\n");
+    }
+    if (ST.getOccupancyWithNumSGPRs(RP.getSGPRNum()) != DAG.MinOccupancy) {
+      // Occupancy is VGPR-limited. If occupancy is minimal and there is
+      // spilling, the number of registers to save to increase occupancy
+      // includes the number of spilled registers to save; deduct the latter
+      // from the former to only get the number of registers to save to
+      // increase occupancy as if there was no spilling.
+      Excess.Occupancy =
+          ST.getNumVGPRsToIncreaseOccupancy(NumVGPRs) - Excess.Spilling;
+      RA_DEBUG(dbgs() << "Region " << I << " has min. occupancy: save "
+                      << Excess.Occupancy << " VGPR(s) to improve occupancy\n");
+    } else {
+      CanIncreaseOccupancy = false;
+    }
+    if (Excess.Spilling || Excess.Occupancy)
+      OptRegions.insert({I, Excess});
+  }
+  if (OptRegions.empty())
+    return false;
 
-    // We are only collecting defs that are defined in another block and are
-    // live-through or used inside regions at MinOccupancy. This means that the
-    // register must be in the live-in set for the region.
-    bool AddedToRematList = false;
-    for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
-      auto It = DAG.LiveIns[I].find(Reg);
-      if (It != DAG.LiveIns[I].end() && !It->second.none()) {
-        if (DAG.RegionsWithMinOcc[I]) {
-          RematerializableInsts[I][Def] = UseI;
-          AddedToRematList = true;
+  bool FuncHasSpilling = NumRegionsSpilling != 0;
+  unsigned RematSpillingCutoff = 0;
+  TargetOcc = DAG.MinOccupancy + (CanIncreaseOccupancy ? 1 : 0);
+
+  // Accounts for a reduction in RP in an optimizable region. Returns whether we
+  // estimate that we have identified enough rematerialization opportunities to
+  // increase function occupancy.
+  auto ReduceRPInRegion = [&](auto OptIt, LaneBitmask Mask) -> bool {
+    auto NumRegs = SIRegisterInfo::getNumCoveredRegs(Mask);
+    unsigned I = OptIt->getFirst();
+    ExcessRP &Excess = OptIt->getSecond();
+
+    // While there is spilling, saved registers only serve to reduce it.
+    if (Excess.Spilling) {
+      unsigned Reduction = std::min(NumRegs, Excess.Spilling);
+      Excess.Spilling -= Reduction;
+      NumRegs -= Reduction;
+      if (!Excess.Spilling) {
+        // We have eliminated spilling in the region.
+        LLVM_DEBUG(dbgs() << "sinking eliminates spilling in region " << I
+                          << '\n');
+        if (--NumRegionsSpilling)
+          RematSpillingCutoff = Rematerializations.size();
+        if (!Excess.Occupancy) {
+          // Occupancy cannot be increased, so we are done with this region.
+          OptRegions.erase(I);
+          return OptRegions.empty();
         }
-
-        // Collect regions with rematerializable reg as live-in to avoid
-        // searching later when updating RP.
-        RematDefToLiveInRegions[Def].push_back(I);
+      } else {
+        LLVM_DEBUG(dbgs() << "sinking reduces spilling in region " << I
+                          << " by " << Reduction << '\n');
       }
     }
-    if (!AddedToRematList)
-      RematDefToLiveInRegions.erase(Def);
-  }
-}
-
-bool PreRARematStage::sinkTriviallyRematInsts(const GCNSubtarget &ST,
-                                              const TargetInstrInfo *TII) {
-  // Temporary copies of cached variables we will be modifying and replacing if
-  // sinking succeeds.
-  SmallVector<
-      std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>, 32>
-      NewRegions;
-  DenseMap<unsigned, GCNRPTracker::LiveRegSet> NewLiveIns;
-  DenseMap<unsigned, GCNRegPressure> NewPressure;
-  BitVector NewRescheduleRegions;
-  LiveIntervals *LIS = DAG.LIS;
-
-  NewRegions.resize(DAG.Regions.size());
-  NewRescheduleRegions.resize(DAG.Regions.size());
-
-  // Collect only regions that has a rematerializable def as a live-in.
-  SmallSet<unsigned, 16> ImpactedRegions;
-  for (const auto &It : RematDefToLiveInRegions)
-    ImpactedRegions.insert(It.second.begin(), It.second.end());
-
-  // Make copies of register pressure and live-ins cache that will be updated
-  // as we rematerialize.
-  for (auto Idx : ImpactedRegions) {
-    NewPressure[Idx] = DAG.Pressure[Idx];
-    NewLiveIns[Idx] = DAG.LiveIns[Idx];
-  }
-  NewRegions = DAG.Regions;
-  NewRescheduleRegions.reset();
-
-  DenseMap<MachineInstr *, MachineInstr *> InsertedMIToOldDef;
-  bool Improved = false;
-  for (auto I : ImpactedRegions) {
-    if (!DAG.RegionsWithMinOcc[I])
-      continue;
 
-    Improved = false;
-    int VGPRUsage = NewPressure[I].getVGPRNum(ST.hasGFX90AInsts());
-    int SGPRUsage = NewPressure[I].getSGPRNum();
+    // Once spilling has been eliminated, saved registers serve to increase
+    // occupancy.
+    if (NumRegs && Excess.Occupancy) {
+      if (NumRegs >= *Excess.Occupancy) {
+        OptRegions.erase(I);
+        LLVM_DEBUG(dbgs() << "sinking increases occupancy in region " << I
+                          << '\n');
+      } else {
+        *Excess.Occupancy -= NumRegs;
+        LLVM_DEBUG(dbgs() << "sinking reduces excess pressure in region " << I
+                          << " by " << NumRegs << '\n');
+      }
+    }
+    return OptRegions.empty();
+  };
+
+  // We need up-to-date live-out info. to query live-out register masks in
+  // regions containing rematerializable instructions.
+  DAG.RegionLiveOuts.buildLiveRegMap();
+
+  for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
+    auto Region = DAG.Regions[I];
+    for (auto MI = Region.first; MI != Region.second; ++MI) {
+      // The instruction must be trivially rematerializable.
+      MachineInstr &DefMI = *MI;
+      if (!isTriviallyReMaterializable(DefMI))
+        continue;
 
-    // TODO: Handle occupancy drop due to AGPR and SGPR.
-    // Check if cause of occupancy drop is due to VGPR usage and not SGPR.
-    if (ST.getOccupancyWithNumSGPRs(SGPRUsage) == DAG.MinOccupancy)
-      break;
+      // We only support rematerializing virtual VGPRs with one definition.
+      Register Reg = DefMI.getOperand(0).getReg();
+      if (!Reg.isVirtual() || !DAG.LIS->hasInterval(Reg) ||
+          !SRI->isVGPRClass(DAG.MRI.getRegClass(Reg)) ||
+          !DAG.MRI.hasOneDef(Reg))
+        continue;
 
-    // The occupancy of this region could have been improved by a previous
-    // iteration's sinking of defs.
-    if (NewPressure[I].getOccupancy(ST) > DAG.MinOccupancy) {
-      NewRescheduleRegions[I] = true;
-      Improved = true;
-      continue;
-    }
+      // We only care to rematerialize the instruction if it has a single
+      // non-debug user in a different block.
+      MachineInstr *UseMI = DAG.MRI.getOneNonDBGUser(Reg);
+      if (!UseMI || DefMI.getParent() == UseMI->getParent())
+        continue;
 
-    // First check if we have enough trivially rematerializable instructions to
-    // improve occupancy. Optimistically assume all instructions we are able to
-    // sink decreased RP.
-    int TotalSinkableRegs = 0;
-    for (const auto &It : RematerializableInsts[I]) {
-      MachineInstr *Def = It.first;
-      Register DefReg = Def->getOperand(0).getReg();
-      TotalSinkableRegs +=
-          SIRegisterInfo::getNumCoveredRegs(NewLiveIns[I][DefReg]);
-    }
-    int VGPRsAfterSink = VGPRUsage - TotalSinkableRegs;
-    unsigned OptimisticOccupancy = ST.getOccupancyWithNumVGPRs(VGPRsAfterSink);
-    // If in the most optimistic scenario, we cannot improve occupancy, then do
-    // not attempt to sink any instructions.
-    if (OptimisticOccupancy <= DAG.MinOccupancy)
-      break;
+      RA_DEBUG(dbgs() << "In region " << I << ", rematerializable instruction ";
+               DefMI.print(dbgs(), true, false, false, false);
+               dbgs() << " with single use " << *UseMI);
+      auto &Remat = Rematerializations.emplace_back(&DefMI, I, UseMI);
+
+      bool RematUseful = false;
+      if (auto It = OptRegions.find(I); It != OptRegions.end()) {
+        // Optimistically consider that moving the instruction out of its
+        // defining region will reduce RP in the latter; this assumes that
+        // maximum RP in the region is reached somewhere between the defining
+        // instruction and the end of the region.
+        RA_DEBUG(dbgs() << "  Instruction's defining region is optimizable: ");
+        RematUseful = true;
+        LaneBitmask RegMask =
+            DAG.RegionLiveOuts.getLiveRegsForRegionIdx(I)[Reg];
+        if (ReduceRPInRegion(It, RegMask))
+          return true;
+      }
 
-    unsigned ImproveOccupancy = 0;
-    SmallVector<MachineInstr *, 4> SinkedDefs;
-    for (auto &It : RematerializableInsts[I]) {
-      MachineInstr *Def = It.first;
-      MachineBasicBlock::iterator InsertPos =
-          MachineBasicBlock::iterator(It.second);
-      Register Reg = Def->getOperand(0).getReg();
-      // Rematerialize MI to its use block. Since we are only rematerializing
-      // instructions that do not have any virtual reg uses, we do not need to
-      // call LiveRangeEdit::allUsesAvailableAt() and
-      // LiveRangeEdit::canRematerializeAt().
-      TII->reMaterialize(*InsertPos->getParent(), InsertPos, Reg,
-                         Def->getOperand(0).getSubReg(), *Def, *DAG.TRI);
-      MachineInstr *NewMI = &*std::prev(InsertPos);
-      LIS->InsertMachineInstrInMaps(*NewMI);
-      LIS->removeInterval(Reg);
-      LIS->createAndComputeVirtRegInterval(Reg);
-      InsertedMIToOldDef[NewMI] = Def;
-
-      // Update region boundaries in scheduling region we sinked from since we
-      // may sink an instruction that was at the beginning or end of its region
-      DAG.updateRegionBoundaries(NewRegions, Def, /*NewMI =*/nullptr,
-                                 /*Removing =*/true);
-
-      // Update region boundaries in region we sinked to.
-      DAG.updateRegionBoundaries(NewRegions, InsertPos, NewMI);
-
-      LaneBitmask PrevMask = NewLiveIns[I][Reg];
-      // FIXME: Also update cached pressure for where the def was sinked from.
-      // Update RP for all regions that has this reg as a live-in and remove
-      // the reg from all regions as a live-in.
-      for (auto Idx : RematDefToLiveInRegions[Def]) {
-        NewLiveIns[Idx].erase(Reg);
-        if (InsertPos->getParent() != DAG.Regions[Idx].first->getParent()) {
-          // Def is live-through and not used in this block.
-          NewPressure[Idx].inc(Reg, PrevMask, LaneBitmask::getNone(), DAG.MRI);
+      for (unsigned LIRegion = 0; LIRegion != E; ++LIRegion) {
+        // We are only collecting regions in which the register is a live-in
+        // (and may be live-through).
+        auto It = DAG.LiveIns[LIRegion].find(Reg);
+        if (It == DAG.LiveIns[LIRegion].end() || It->second.none())
+          continue;
+        Remat.LiveInRegions.insert(LIRegion);
+        RA_DEBUG(dbgs() << "  Def is live-in in region " << LIRegion << ": ");
+
+        // Account for the reduction in RP due to the rematerialization in an
+        // optimizable region in which the defined register is a live-in. This
+        // is exact for live-through region but optimistic in the using region,
+        // where RP is actually reduced only if maximum RP is reached somewhere
+        // between the beginning of the region and the rematerializable
+        // instruction's use.
+        if (auto It = OptRegions.find(LIRegion); It != OptRegions.end()) {
+          RematUseful = true;
+          if (ReduceRPInRegion(It, DAG.LiveIns[LIRegion][Reg]))
+            return true;
         } else {
-          // Def is used and rematerialized into this block.
-          GCNDownwardRPTracker RPT(*LIS);
-          auto *NonDbgMI = &*skipDebugInstructionsForward(
-              NewRegions[Idx].first, NewRegions[Idx].second);
-          RPT.reset(*NonDbgMI, &NewLiveIns[Idx]);
-          RPT.advance(NewRegions[Idx].second);
-          NewPressure[Idx] = RPT.moveMaxPressure();
+          LLVM_DEBUG(dbgs() << "unoptimizable region\n");
         }
       }
 
-      SinkedDefs.push_back(Def);
-      ImproveOccupancy = NewPressure[I].getOccupancy(ST);
-      if (ImproveOccupancy > DAG.MinOccupancy)
-        break;
+      // If the instruction is not a live-in or live-out in any optimizable
+      // region then there is no point in rematerializing it.
+      if (!RematUseful) {
+        Rematerializations.pop_back();
+        RA_DEBUG(
+            dbgs()
+            << "  No impact on any optimizable region, dropping instruction\n");
+      }
     }
-
-    // Remove defs we just sinked from all regions' list of sinkable defs
-    for (auto &Def : SinkedDefs)
-      for (auto TrackedIdx : RematDefToLiveInRegions[Def])
-        RematerializableInsts[TrackedIdx].erase(Def);
-
-    if (ImproveOccupancy <= DAG.MinOccupancy)
-      break;
-
-    NewRescheduleRegions[I] = true;
-    Improved = true;
   }
 
-  if (!Improved) {
-    // Occupancy was not improved for all regions that were at MinOccupancy.
-    // Undo sinking and remove newly rematerialized instructions.
-    for (auto &Entry : InsertedMIToOldDef) {
-      MachineInstr *MI = Entry.first;
-      MachineInstr *OldMI = Entry.second;
-      Register Reg = MI->getOperand(0).getReg();
-      LIS->RemoveMachineInstrFromMaps(*MI);
-      MI->eraseFromParent();
-      OldMI->clearRegisterDeads(Reg);
-      LIS->removeInterval(Reg);
-      LIS->createAndComputeVirtRegInterval(Reg);
+  if (FuncHasSpilling && !Rematerializations.empty()) {
+    // We won't be able to increase occupancy, but we still want to reduce
+    // spilling. Drop any instruction we collected to increase occupancy.
+    TargetOcc = DAG.MinOccupancy;
+    if (!NumRegionsSpilling) {
+      Rematerializations.truncate(RematSpillingCutoff);
+      RA_DEBUG(dbgs() << "Can only eliminate spilling\n");
+    } else {
+      RA_DEBUG(dbgs() << "Can only reduce spilling\n");
     }
-    return false;
-  }
-
-  // Occupancy was improved for all regions.
-  for (auto &Entry : InsertedMIToOldDef) {
-    MachineInstr *MI = Entry.first;
-    MachineInstr *OldMI = Entry.second;
-
-    // Remove OldMI from BBLiveInMap since we are sinking it from its MBB.
-    DAG.BBLiveInMap.erase(OldMI);
-
-    // Remove OldMI and update LIS
-    Register Reg = MI->getOperand(0).getReg();
-    LIS->RemoveMachineInstrFromMaps(*OldMI);
-    OldMI->eraseFromParent();
-    LIS->removeInterval(Reg);
-    LIS->createAndComputeVirtRegInterval(Reg);
+    return true;
   }
+  RA_DEBUG(dbgs() << "Cannot increase occupancy through rematerialization\n");
+  Rematerializations.clear();
+  return false;
+}
 
-  // Update live-ins, register pressure, and regions caches.
-  for (auto Idx : ImpactedRegions) {
-    DAG.LiveIns[Idx] = NewLiveIns[Idx];
-    DAG.Pressure[Idx] = NewPressure[Idx];
-    DAG.MBBLiveIns.erase(DAG.Regions[Idx].first->getParent());
+void PreRARematStage::rematerialize() {
+  const auto *TII =
+      static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
+
+  // Collect regions whose RP changes in unpredictable way; we will have to
+  // fully recompute their RP after all rematerailizations.
+  DenseSet<unsigned> RecomputeRP;
+  SlotIndexes *Slots = DAG.LIS->getSlotIndexes();
+
+  // Rematerialize all instructions.
+  for (RematInstruction &Remat : Rematerializations) {
+    MachineInstr &DefMI = *Remat.DefMI;
+    MachineBasicBlock::iterator InsertPos(Remat.UseMI);
+    Register Reg = DefMI.getOperand(0).getReg();
+    unsigned SubReg = DefMI.getOperand(0).getSubReg();
+
+    // Rematerialize DefMI to its use block. Since we are only rematerializing
+    // instructions that do not have any virtual reg uses, we do not need to
+    // call LiveRangeEdit::allUsesAvailableAt() and
+    // LiveRangeEdit::canRematerializeAt().
+    TII->reMaterialize(*InsertPos->getParent(), InsertPos, Reg, SubReg, DefMI,
+                       *DAG.TRI);
+    Remat.RematMI = &*std::prev(InsertPos);
+    Remat.RematMI->getOperand(0).setSubReg(SubReg);
+    DAG.LIS->InsertMachineInstrInMaps(*Remat.RematMI);
+
+    // Update region boundaries in regions we sinked from (remove defining MI)
+    // and to (insert MI rematerialized in use block). Only then we can erase
+    // the original MI.
+    DAG.updateRegionBoundaries(DAG.Regions, DefMI, nullptr);
+    DAG.updateRegionBoundaries(DAG.Regions, InsertPos, Remat.RematMI);
+    DefMI.eraseFromParent();
+    DAG.LIS->RemoveMachineInstrFromMaps(DefMI);
+
+    // Collect all regions impacted by the rematerialization and update their
+    // live-in/RP information.
+    for (unsigned I : Remat.LiveInRegions) {
+      ImpactedRegions.insert({I, DAG.Pressure[I]});
+      GCNRPTracker::LiveRegSet &RegionLiveIns = DAG.LiveIns[I];
+
+      // The register is no longer a live-in in all regions but the one that
+      // contains the single use. In live-through regions, maximum register
+      // pressure decreases predictably so we can directly update it. In the
+      // using region, maximum RP may or may not decrease, so we will mark it
+      // for re-computation after all materializations have taken place.
+      RegionLiveIns.erase(Reg);
+      LaneBitmask PrevMask = RegionLiveIns[Reg];
----------------
jrbyrnes wrote:

Accessing the element after erasing?

https://github.com/llvm/llvm-project/pull/118722