[llvm] [SystemZ] Add a SystemZ specific pre-RA scheduling strategy. (PR #135076)
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Mon Apr 21 06:16:52 PDT 2025
================
@@ -5,22 +5,421 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-//
-// -------------------------- Post RA scheduling ---------------------------- //
-// SystemZPostRASchedStrategy is a scheduling strategy which is plugged into
-// the MachineScheduler. It has a sorted Available set of SUs and a pickNode()
-// implementation that looks to optimize decoder grouping and balance the
-// usage of processor resources. Scheduler states are saved for the end
-// region of each MBB, so that a successor block can learn from it.
-//===----------------------------------------------------------------------===//
#include "SystemZMachineScheduler.h"
+#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
using namespace llvm;
#define DEBUG_TYPE "machine-scheduler"
+/// Pre-RA scheduling ///
+
+static cl::opt<unsigned> TinyRegionLim(
+ "tiny-region-lim", cl::Hidden, cl::init(10),
+ cl::desc("Run limited pre-ra scheduling on regions of this size or "
+ "smaller. Mainly for testing."));
+
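+// Small operand predicates used below to inspect MIs directly (some data
+// edges near the region boundaries are not modeled in the DAG).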
+static bool isRegDef(const MachineOperand &MO) {
+ return MO.isReg() && MO.isDef();
+}
+
+static bool isVirtRegDef(const MachineOperand &MO) {
+ return isRegDef(MO) && MO.getReg().isVirtual();
+}
+
+static bool isPhysRegDef(const MachineOperand &MO) {
+ return isRegDef(MO) && MO.getReg().isPhysical();
+}
+
+static bool isVirtRegUse(const MachineOperand &MO) {
+ return MO.isReg() && MO.isUse() && MO.readsReg() && MO.getReg().isVirtual();
+}
+
+void SystemZPreRASchedStrategy::initializePrioRegClasses(
+ const TargetRegisterInfo *TRI) {
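+ // Collect the IDs of the register classes that can hold a scalar FP
+ // value; these are the classes prioritized by the liveness heuristics
+ // below.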
+ for (const TargetRegisterClass *RC : TRI->regclasses()) {
+ for (MVT VT : MVT::fp_valuetypes())
+ if (TRI->isTypeLegalForClass(*RC, VT)) {
+ PrioRegClasses.insert(RC->getID());
+ break;
+ }
+
+ // On SystemZ, vector and FP registers overlap, so also add any vector RC.
+ if (!PrioRegClasses.count(RC->getID()))
+ for (MVT VT : MVT::fp_fixedlen_vector_valuetypes())
+ if (TRI->isTypeLegalForClass(*RC, VT)) {
+ PrioRegClasses.insert(RC->getID());
+ break;
+ }
+ }
+}
+
+void SystemZPreRASchedStrategy::VRegSet::dump(std::string Msg) {
+ dbgs() << Msg;
+ bool First = true;
+ for (auto R : *this) {
+ if (!First)
+ dbgs() << ", ";
+ else
+ First = false;
+ dbgs() << "%" << R.virtRegIndex();
+ }
+ dbgs() << "\n";
+}
+
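+// Lazily compute and cache the remaining latency of the bottom zone
+// (RemLat is invalidated by resetting it to ~0U).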
+unsigned SystemZPreRASchedStrategy::getRemLat(SchedBoundary *Zone) const {
+ if (RemLat == ~0U)
+ RemLat = computeRemLatency(*Zone);
+ return RemLat;
+}
+
+void SystemZPreRASchedStrategy::initializeStoresGroup() {
+ StoresGroup.clear();
+ FirstStoreInGroupScheduled = false;
+
+ unsigned CurrMaxDepth = 0;
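+ // Walk the region bottom-up (in reverse node order) while tracking the
+ // maximum depth seen so far.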
+ for (unsigned Idx = DAG->SUnits.size() - 1; Idx + 1 != 0; --Idx) {
+ const SUnit *SU = &DAG->SUnits[Idx];
+ const MachineInstr *MI = SU->getInstr();
+ if (!MI->getNumOperands() || MI->isCopy())
+ continue;
+
+ bool HasVirtDef = false;
+ bool HasVirtUse = false;
+ for (unsigned I = 0; I < MI->getDesc().getNumOperands(); ++I) {
+ const MachineOperand &MO = MI->getOperand(I);
+ if (isVirtRegDef(MO) && !MO.isDead())
+ HasVirtDef = true;
+ else if (isVirtRegUse(MO) &&
+ MI->getDesc().operands()[I].OperandType != MCOI::OPERAND_MEMORY)
+ HasVirtUse = true;
+ }
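+ // An instruction that uses vregs without defining any live vreg is
+ // treated as a store.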
+ bool IsStore = !HasVirtDef && HasVirtUse;
+
+ // Find a group of stores that are all at the bottom (maximum depth) of
+ // the region, while giving up on regions that also contain a store
+ // group at a lesser depth.
+ if (SU->getDepth() > CurrMaxDepth) {
+ CurrMaxDepth = SU->getDepth();
+ bool PrevGroup = StoresGroup.size() > 1;
+ StoresGroup.clear();
+ if (PrevGroup)
+ return;
+ if (IsStore)
+ StoresGroup.insert(SU);
+ } else if (IsStore && !StoresGroup.empty() &&
+ SU->getDepth() == CurrMaxDepth) {
+ // The group members should all have the same opcode.
+ if ((*StoresGroup.begin())->getInstr()->getOpcode() != MI->getOpcode()) {
+ StoresGroup.clear();
+ return;
+ }
+ StoresGroup.insert(SU);
+ }
+ }
+
+ // A minimum group size of 8 handles a known regression (which had a
+ // group of 20).
+ // TODO: Would some other value be better?
+ if (StoresGroup.size() < 8)
+ StoresGroup.clear();
+}
+
+static int biasPhysRegExtra(const SUnit *SU) {
+ if (int Res = biasPhysReg(SU, /*isTop=*/false))
+ return Res;
+
+ // Also recognize a Load Address of a stack slot. There are (at least
+ // currently) no instructions here that define a physreg while also
+ // using a vreg.
+ const MachineInstr *MI = SU->getInstr();
+ if (MI->getNumOperands() && !MI->isCopy()) {
+ const MachineOperand &DefMO = MI->getOperand(0);
+ if (isPhysRegDef(DefMO))
+ return 1;
+ }
+
+ return 0;
+}
+
+int SystemZPreRASchedStrategy::
+computeSULivenessScore(SchedCandidate &C, ScheduleDAGMILive *DAG,
+ SchedBoundary *Zone) const {
+ // Not all data deps are modeled around the SUnit - some data edges near
+ // region boundaries are missing, so look directly at the MI operands
+ // instead.
+ const SUnit *SU = C.SU;
+ const MachineInstr *MI = SU->getInstr();
+ if (!MI->getNumOperands() || MI->isCopy())
+ return 0;
+
+ // Find uses of registers that are not already live (kills).
+ bool PrioKill = false;
+ bool GPRKill = false;
+ bool AddrKill = false;
+ bool HasPrioUse = false;
+ for (unsigned I = 0; I < MI->getDesc().getNumOperands(); ++I) {
+ const MachineOperand &MO = MI->getOperand(I);
+ if (!isVirtRegUse(MO))
+ continue;
+ HasPrioUse |= isPrioVirtReg(MO.getReg(), &DAG->MRI);
+ if (LiveRegs.count(MO.getReg()))
+ continue;
+ if (isPrioVirtReg(MO.getReg(), &DAG->MRI))
+ PrioKill = true;
+ else if (MI->getDesc().operands()[I].OperandType != MCOI::OPERAND_MEMORY)
+ GPRKill = true;
+ else
+ AddrKill = true;
+ }
+
+ // Find the interesting properties: treat the SU as a "load" if it
+ // defines a live vreg, and as a "store" if it defines none.
+ const MachineOperand &DefMO = MI->getOperand(0);
+ assert(!isPhysRegDef(DefMO) && "Did not expect physreg def!");
+ bool IsLoad =
+ isRegDef(DefMO) && !DefMO.isDead() && !IsRedefining[SU->NodeNum];
+ bool IsStore = (!isRegDef(DefMO) || DefMO.isDead());
+ // Prioritize FP: Ignore GPR/Addr kills with an FP def.
+ bool UsesLivePrio =
+ IsLoad && !PrioKill &&
+ (isPrioVirtReg(DefMO.getReg(), &DAG->MRI) || (!GPRKill && !AddrKill));
+ bool UsesLiveAll = !PrioKill && !GPRKill && !AddrKill;
+ bool PreservesSchedLat = SU->getHeight() <= Zone->getScheduledLatency();
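+ // Check if enough unscheduled instructions remain above this SU to
+ // cover its latency once it has been issued (an issue-width based
+ // margin; the Cycles constant appears to be a tuning value).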
+ const unsigned Cycles = 2;
+ unsigned Margin = SchedModel->getIssueWidth() * (Cycles + SU->Latency - 1);
+ bool HasDistToTop = NumLeft > Margin;
+
+ // Pull down a defining SU if it preserves the scheduled latency while not
+ // causing any (prioritized) register uses to become live. If however there
+ // will be relatively many SUs scheduled above this one and all uses are
+ // already live it should not be a problem to increase the scheduled
+ // latency given the OOO execution.
+ // TODO: Try scheduling small (DFSResult) subtrees as a unit.
+ bool SchedLow = IsLoad && ((PreservesSchedLat && UsesLivePrio) ||
+ (HasDistToTop && UsesLiveAll));
+
+ // This handles regions with many chained stores of the same depth at the
+ // bottom in the input order (cactus). Push them upwards during scheduling.
+ bool SchedHigh = IsStore && FirstStoreInGroupScheduled &&
+ StoresGroup.count(SU) &&
+ (PrioKill || (!HasPrioUse && GPRKill));
+
+ if (SchedLow)
+ return -1;
+ if (SchedHigh)
+ return 1;
+ return 0;
+}
+
+bool SystemZPreRASchedStrategy::tryCandidate(SchedCandidate &Cand,
+ SchedCandidate &TryCand,
+ SchedBoundary *Zone) const {
+ assert(Zone && !Zone->isTop() && "Bottom-Up scheduling only.");
+
+ // Initialize the candidate if needed.
+ if (!Cand.isValid()) {
+ TryCand.Reason = FirstValid;
+ return true;
+ }
+
+ // Bias physreg defs and copies to their uses and definitions respectively.
+ int TryCandPRegBias = biasPhysRegExtra(TryCand.SU);
+ int CandPRegBias = biasPhysRegExtra(Cand.SU);
+ if (tryGreater(TryCandPRegBias, CandPRegBias, TryCand, Cand, PhysReg))
+ return TryCand.Reason != NoCand;
+ if (TryCandPRegBias && CandPRegBias) {
+ // Both are biased the same way: break the tie on node order.
+ tryGreater(TryCand.SU->NodeNum, Cand.SU->NodeNum, TryCand, Cand, NodeOrder);
+ return TryCand.Reason != NoCand;
+ }
+
+ if (TinyRegion) {
+ // Prioritize instructions that read unbuffered resources by stall cycles.
+ // TODO: Try this in bigger regions as well.
+ if (tryLess(Zone->getLatencyStallCycles(TryCand.SU),
+ Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
+ return TryCand.Reason != NoCand;
+ } else {
+ // Look for an opportunity to reduce register liveness.
+ int TryCandScore = computeSULivenessScore(TryCand, DAG, Zone);
+ int CandScore = computeSULivenessScore(Cand, DAG, Zone);
+ if (tryLess(TryCandScore, CandScore, TryCand, Cand, LivenessReduce))
+ return TryCand.Reason != NoCand;
+
+ // Don't extend the scheduled latency.
+ if (ShouldReduceLatency && TryCand.SU->getHeight() != Cand.SU->getHeight() &&
+ (std::max(TryCand.SU->getHeight(), Cand.SU->getHeight()) >
+ Zone->getScheduledLatency())) {
+ unsigned HigherSUDepth = TryCand.SU->getHeight() < Cand.SU->getHeight() ?
+ Cand.SU->getDepth() : TryCand.SU->getDepth();
+ if (HigherSUDepth != getRemLat(Zone) &&
+ tryLess(TryCand.SU->getHeight(), Cand.SU->getHeight(),
+ TryCand, Cand, GenericSchedulerBase::BotHeightReduce)) {
+ return TryCand.Reason != NoCand;
+ }
+ }
+ }
+
+ // Weak edges are for clustering and other constraints.
+ if (tryLess(TryCand.SU->WeakSuccsLeft, Cand.SU->WeakSuccsLeft,
+ TryCand, Cand, Weak))
+ return TryCand.Reason != NoCand;
+
+ // Fall through to original instruction order.
+ if (TryCand.SU->NodeNum > Cand.SU->NodeNum) {
+ TryCand.Reason = NodeOrder;
+ return true;
+ }
+
+ return false;
+}
+
+void SystemZPreRASchedStrategy::initPolicy(MachineBasicBlock::iterator Begin,
+ MachineBasicBlock::iterator End,
+ unsigned NumRegionInstrs) {
+ // Keep track of live regs instead of using the generic reg pressure tracking.
+ RegionPolicy.ShouldTrackPressure = false;
+ // These heuristics have so far seemed to work better without adding a
+ // top-down boundary.
+ RegionPolicy.OnlyBottomUp = true;
+}
+
+void SystemZPreRASchedStrategy::initialize(ScheduleDAGMI *dag) {
+ GenericScheduler::initialize(dag);
+
+ TinyRegion = DAG->SUnits.size() <= TinyRegionLim;
+ const SystemZInstrInfo *TII = static_cast<const SystemZInstrInfo *>(DAG->TII);
+ if (TinyRegion) {
+ // A tiny region with long latency instructions is better handled using
+ // normal heuristics, except in regions that have COPYs of a physreg
+ // both into and out of the region and/or have a compare-0 likely to be
+ // eliminated.
+ const SUnit *CmpZeroSU = nullptr;
+ const SUnit *CmpSrcSU = nullptr;
+ Register CmpSrcReg = 0;
+ bool OtherCCClob = false;
+ unsigned MaxLat = 0;
+ std::set<Register> PRegs;
+ bool CopiesPRegDep = false;
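+ // Scan the region bottom-up for: a compare-with-zero together with the
+ // CC-setting definition of its source register, any other CC clobber,
+ // the maximum instruction latency, and COPYs moving a physreg both
+ // into and out of the region.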
+ for (unsigned Idx = DAG->SUnits.size() - 1; Idx + 1 != 0; --Idx) {
+ const SUnit *SU = &DAG->SUnits[Idx];
+ const MachineInstr *MI = SU->getInstr();
+
+ // Check for a (likely) eliminable compare-0.
+ if (TII->isCompareZero(*MI)) {
+ CmpZeroSU = SU;
+ CmpSrcReg = TII->getCompareSourceReg(*MI);
+ continue;
+ }
+ if (MI->getNumOperands()) {
+ const MachineOperand &DefMO = MI->getOperand(0);
+ // Checking the MI operands directly instead of the SU data preds
+ // happens to also handle the case where CmpSrcReg is redefined.
+ if (isVirtRegDef(DefMO) && DefMO.getReg() == CmpSrcReg &&
+ MI->getDesc().hasImplicitDefOfPhysReg(SystemZ::CC))
+ CmpSrcSU = SU;
+ }
+ if (SU != CmpZeroSU && SU != CmpSrcSU &&
+ MI->getDesc().hasImplicitDefOfPhysReg(SystemZ::CC))
+ OtherCCClob = true;
+
+ // Check for long latency instructions.
+ MaxLat = std::max(MaxLat, unsigned(SU->Latency));
+
+ // Check for COPYs of pregs both in and out of the region.
+ if (MI->isCopy()) {
+ Register DstReg = MI->getOperand(0).getReg();
+ Register SrcReg = MI->getOperand(1).getReg();
+ if (DstReg.isPhysical() && DAG->MRI.isAllocatable(DstReg) &&
+ SrcReg.isVirtual())
+ PRegs.insert(DstReg);
+ else if (SrcReg.isPhysical() && DAG->MRI.isAllocatable(SrcReg) &&
+ DstReg.isVirtual()) {
+ if (!PRegs.insert(SrcReg).second)
+ CopiesPRegDep = true;
+ }
+ }
+ }
+ bool CmpElimRegion = CmpZeroSU && CmpSrcSU && OtherCCClob;
+
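+ // Let somewhat larger regions with long latency instructions fall
+ // through to the normal heuristics, unless one of the special patterns
+ // above was found.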
+ if (DAG->SUnits.size() > 6 && MaxLat >= 6 && !CopiesPRegDep &&
+ !CmpElimRegion)
+ TinyRegion = false;
+ }
+ LLVM_DEBUG(dbgs() << "Region is" << (TinyRegion ? "" : " not") << " tiny.\n");
+ if (TinyRegion)
+ return;
+
+ NumLeft = DAG->SUnits.size();
+ RemLat = ~0U;
+
+ // It seems to work best to include the latencies in this heuristic (as
+ // opposed to something like a "unit SU height" with all latencies counted
+ // as 1).
+ unsigned DAGHeight = 0;
+ for (unsigned Idx = 0, End = DAG->SUnits.size(); Idx != End; ++Idx)
+ DAGHeight = std::max(DAGHeight, DAG->SUnits[Idx].getHeight());
+ ShouldReduceLatency = DAG->SUnits.size() < 3 * std::max(DAGHeight, 1u);
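+ // In other words, only try to reduce the scheduled latency in regions
+ // that are relatively deep compared to their number of instructions.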
+ LLVM_DEBUG(if (ShouldReduceLatency) dbgs() << "Latency scheduling enabled.\n";
+ else dbgs() << "Latency scheduling disabled.\n";);
+
+ // Find the registers that are live at the bottom, before scheduling.
+ LiveRegs.clear();
+ for (unsigned I = 0, E = DAG->MRI.getNumVirtRegs(); I != E; ++I) {
----------------
arsenm wrote:
This looks heavy, can you just look at operations in the region?
https://github.com/llvm/llvm-project/pull/135076