[llvm] r189120 - Adds cyclic critical path computation and heuristics, temporarily disabled.
Andrew Trick
atrick at apple.com
Fri Aug 23 11:50:25 PDT 2013
On Aug 23, 2013, at 11:18 AM, Hal Finkel <hfinkel at anl.gov> wrote:
> Andy,
>
> Can you please provide a slightly more verbose explanation of what this is doing? The commit message seems to imply that inside a loop there could be both a cyclic and acyclic critical path, and these two are being compared. Is that right? If so, what is the acyclic critical path inside a loop?
That’s right. I meant to add more comments before this feature ended up in a commit branch. I’ll do that now.
Loop:
i = phi(c)
a = op
b = op(a)
c = op(b, i)
a->b->c is the acyclic critical path
c1->c2 is the cyclic critical path.
The difference is the “lag”. If the lag is longer than the time it takes to issue all the in-flight instructions, the OOO engine will saturate and stall waiting for long latency instructions to execute/retire and free resources. I came up with this formula quickly based on intuition. It would be interesting to observe this in a simulator and verify the logic.
-Andy
> Thanks again,
> Hal
>
> ----- Original Message -----
>> Author: atrick
>> Date: Fri Aug 23 12:48:43 2013
>> New Revision: 189120
>>
>> URL: http://llvm.org/viewvc/llvm-project?rev=189120&view=rev
>> Log:
>> Adds cyclic critical path computation and heuristics, temporarily
>> disabled.
>>
>> Estimate the cyclic critical path within a single block loop. If the
>> acyclic critical path is longer, then the loop will exhaust OOO
>> resources after some number of iterations. If lag between the acyclic
>> critical path and cyclic critical path is longer than the time it
>> takes
>> to issue those loop iterations, then aggressively schedule for
>> latency.
>>
>> Modified:
>> llvm/trunk/include/llvm/CodeGen/ScheduleDAGInstrs.h
>> llvm/trunk/lib/CodeGen/MachineScheduler.cpp
>> llvm/trunk/lib/CodeGen/ScheduleDAGInstrs.cpp
>>
>> Modified: llvm/trunk/include/llvm/CodeGen/ScheduleDAGInstrs.h
>> URL:
>> http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/CodeGen/ScheduleDAGInstrs.h?rev=189120&r1=189119&r2=189120&view=diff
>> ==============================================================================
>> --- llvm/trunk/include/llvm/CodeGen/ScheduleDAGInstrs.h (original)
>> +++ llvm/trunk/include/llvm/CodeGen/ScheduleDAGInstrs.h Fri Aug 23
>> 12:48:43 2013
>> @@ -197,6 +197,9 @@ namespace llvm {
>> /// input.
>> void buildSchedGraph(AliasAnalysis *AA, RegPressureTracker
>> *RPTracker = 0);
>>
>> + /// Compute the cyclic critical path through the DAG.
>> + unsigned computeCyclicCriticalPath();
>> +
>> /// addSchedBarrierDeps - Add dependencies from instructions in
>> the current
>> /// list of instructions being scheduled to scheduling barrier.
>> We want to
>> /// make sure instructions which define registers that are
>> either used by
>>
>> Modified: llvm/trunk/lib/CodeGen/MachineScheduler.cpp
>> URL:
>> http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/MachineScheduler.cpp?rev=189120&r1=189119&r2=189120&view=diff
>> ==============================================================================
>> --- llvm/trunk/lib/CodeGen/MachineScheduler.cpp (original)
>> +++ llvm/trunk/lib/CodeGen/MachineScheduler.cpp Fri Aug 23 12:48:43
>> 2013
>> @@ -53,6 +53,9 @@ static cl::opt<unsigned> MISchedCutoff("
>> static bool ViewMISchedDAGs = false;
>> #endif // NDEBUG
>>
>> +static cl::opt<bool> EnableCyclicPath("misched-cyclicpath",
>> cl::Hidden,
>> + cl::desc("Enable cyclic critical path analysis."),
>> cl::init(false));
>> +
>> static cl::opt<bool> EnableLoadCluster("misched-cluster",
>> cl::Hidden,
>> cl::desc("Enable load clustering."), cl::init(true));
>>
>> @@ -1207,16 +1210,21 @@ public:
>> struct SchedRemainder {
>> // Critical path through the DAG in expected latency.
>> unsigned CriticalPath;
>> + unsigned CyclicCritPath;
>>
>> // Scaled count of micro-ops left to schedule.
>> unsigned RemIssueCount;
>>
>> + bool IsAcyclicLatencyLimited;
>> +
>> // Unscheduled resources
>> SmallVector<unsigned, 16> RemainingCounts;
>>
>> void reset() {
>> CriticalPath = 0;
>> + CyclicCritPath = 0;
>> RemIssueCount = 0;
>> + IsAcyclicLatencyLimited = false;
>> RemainingCounts.clear();
>> }
>>
>> @@ -1434,6 +1442,8 @@ public:
>> virtual void registerRoots();
>>
>> protected:
>> + void checkAcyclicLatency();
>> +
>> void tryCandidate(SchedCandidate &Cand,
>> SchedCandidate &TryCand,
>> SchedBoundary &Zone,
>> @@ -1547,8 +1557,32 @@ void ConvergingScheduler::releaseBottomN
>> Bot.releaseNode(SU, SU->BotReadyCycle);
>> }
>>
>> +void ConvergingScheduler::checkAcyclicLatency() {
>> + if (Rem.CyclicCritPath == 0 || Rem.CyclicCritPath >=
>> Rem.CriticalPath)
>> + return;
>> +
>> + unsigned BufferLimit =
>> + SchedModel->getMicroOpBufferSize() *
>> SchedModel->getMicroOpFactor();
>> + unsigned LatencyLag = Rem.CriticalPath - Rem.CyclicCritPath;
>> + Rem.IsAcyclicLatencyLimited =
>> + (LatencyLag * SchedModel->getLatencyFactor()) > BufferLimit;
>> +
>> + DEBUG(dbgs() << "BufferLimit " << BufferLimit << "u / "
>> + << Rem.RemIssueCount << "u = "
>> + << (BufferLimit + Rem.RemIssueCount) / Rem.RemIssueCount <<
>> " iters. "
>> + << "Latency = " << LatencyLag << "c = "
>> + << LatencyLag * SchedModel->getLatencyFactor() << "u\n";
>> + if (Rem.IsAcyclicLatencyLimited)
>> + dbgs() << " ACYCLIC LATENCY LIMIT\n");
>> +}
>> +
>> void ConvergingScheduler::registerRoots() {
>> Rem.CriticalPath = DAG->ExitSU.getDepth();
>> +
>> + if (EnableCyclicPath) {
>> + Rem.CyclicCritPath = DAG->computeCyclicCriticalPath();
>> + checkAcyclicLatency();
>> + }
>> // Some roots may not feed into ExitSU. Check all of them in case.
>> for (std::vector<SUnit*>::const_iterator
>> I = Bot.Available.begin(), E = Bot.Available.end(); I != E;
>> ++I) {
>> @@ -2096,6 +2130,32 @@ static int biasPhysRegCopy(const SUnit *
>> return 0;
>> }
>>
>> +static bool tryLatency(ConvergingScheduler::SchedCandidate &TryCand,
>> + ConvergingScheduler::SchedCandidate &Cand,
>> + ConvergingScheduler::SchedBoundary &Zone) {
>> + if (Zone.isTop()) {
>> + if (Cand.SU->getDepth() > Zone.getScheduledLatency()) {
>> + if (tryLess(TryCand.SU->getDepth(), Cand.SU->getDepth(),
>> + TryCand, Cand,
>> ConvergingScheduler::TopDepthReduce))
>> + return true;
>> + }
>> + if (tryGreater(TryCand.SU->getHeight(), Cand.SU->getHeight(),
>> + TryCand, Cand,
>> ConvergingScheduler::TopPathReduce))
>> + return true;
>> + }
>> + else {
>> + if (Cand.SU->getHeight() > Zone.getScheduledLatency()) {
>> + if (tryLess(TryCand.SU->getHeight(), Cand.SU->getHeight(),
>> + TryCand, Cand,
>> ConvergingScheduler::BotHeightReduce))
>> + return true;
>> + }
>> + if (tryGreater(TryCand.SU->getDepth(), Cand.SU->getDepth(),
>> + TryCand, Cand,
>> ConvergingScheduler::BotPathReduce))
>> + return true;
>> + }
>> + return false;
>> +}
>> +
>> /// Apply a set of heuristics to a new candidate. Heuristics are
>> currently
>> /// hierarchical. This may be more efficient than a graduated cost
>> model because
>> /// we don't need to evaluate all aspects of the model for each node
>> in the
>> @@ -2135,6 +2195,10 @@ void ConvergingScheduler::tryCandidate(S
>> RegExcess))
>> return;
>>
>> + // For loops that are acyclic path limited, aggressively schedule
>> for latency.
>> + if (Rem.IsAcyclicLatencyLimited && tryLatency(TryCand, Cand,
>> Zone))
>> + return;
>> +
>> // Avoid increasing the max critical pressure in the scheduled
>> region.
>> if (tryPressure(TryCand.RPDelta.CriticalMax,
>> Cand.RPDelta.CriticalMax,
>> TryCand, Cand, RegCritical))
>> @@ -2174,27 +2238,10 @@ void ConvergingScheduler::tryCandidate(S
>> return;
>>
>> // Avoid serializing long latency dependence chains.
>> - if (Cand.Policy.ReduceLatency) {
>> - if (Zone.isTop()) {
>> - if (Cand.SU->getDepth() > Zone.getScheduledLatency()) {
>> - if (tryLess(TryCand.SU->getDepth(), Cand.SU->getDepth(),
>> - TryCand, Cand, TopDepthReduce))
>> - return;
>> - }
>> - if (tryGreater(TryCand.SU->getHeight(), Cand.SU->getHeight(),
>> - TryCand, Cand, TopPathReduce))
>> - return;
>> - }
>> - else {
>> - if (Cand.SU->getHeight() > Zone.getScheduledLatency()) {
>> - if (tryLess(TryCand.SU->getHeight(), Cand.SU->getHeight(),
>> - TryCand, Cand, BotHeightReduce))
>> - return;
>> - }
>> - if (tryGreater(TryCand.SU->getDepth(), Cand.SU->getDepth(),
>> - TryCand, Cand, BotPathReduce))
>> - return;
>> - }
>> + // For acyclic path limited loops, latency was already checked
>> above.
>> + if (Cand.Policy.ReduceLatency && !Rem.IsAcyclicLatencyLimited
>> + && tryLatency(TryCand, Cand, Zone)) {
>> + return;
>> }
>>
>> // Prefer immediate defs/users of the last scheduled instruction.
>> This is a
>>
>> Modified: llvm/trunk/lib/CodeGen/ScheduleDAGInstrs.cpp
>> URL:
>> http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/ScheduleDAGInstrs.cpp?rev=189120&r1=189119&r2=189120&view=diff
>> ==============================================================================
>> --- llvm/trunk/lib/CodeGen/ScheduleDAGInstrs.cpp (original)
>> +++ llvm/trunk/lib/CodeGen/ScheduleDAGInstrs.cpp Fri Aug 23 12:48:43
>> 2013
>> @@ -36,6 +36,8 @@
>> #include "llvm/Target/TargetMachine.h"
>> #include "llvm/Target/TargetRegisterInfo.h"
>> #include "llvm/Target/TargetSubtargetInfo.h"
>> +#include <queue>
>> +
>> using namespace llvm;
>>
>> static cl::opt<bool> EnableAASchedMI("enable-aa-sched-mi",
>> cl::Hidden,
>> @@ -979,6 +981,65 @@ void ScheduleDAGInstrs::buildSchedGraph(
>> PendingLoads.clear();
>> }
>>
>> +/// Compute the max cyclic critical path through the DAG. For loops
>> that span
>> +/// basic blocks, MachineTraceMetrics should be used for this
>> instead.
>> +unsigned ScheduleDAGInstrs::computeCyclicCriticalPath() {
>> + // This only applies to single block loop.
>> + if (!BB->isSuccessor(BB))
>> + return 0;
>> +
>> + unsigned MaxCyclicLatency = 0;
>> + // Visit each live out vreg def to find def/use pairs that cross
>> iterations.
>> + for (SUnit::const_pred_iterator
>> + PI = ExitSU.Preds.begin(), PE = ExitSU.Preds.end(); PI !=
>> PE; ++PI) {
>> + MachineInstr *MI = PI->getSUnit()->getInstr();
>> + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
>> + const MachineOperand &MO = MI->getOperand(i);
>> + if (!MO.isReg() || !MO.isDef())
>> + break;
>> + unsigned Reg = MO.getReg();
>> + if (!Reg || TRI->isPhysicalRegister(Reg))
>> + continue;
>> +
>> + const LiveInterval &LI = LIS->getInterval(Reg);
>> + unsigned LiveOutHeight = PI->getSUnit()->getHeight();
>> + unsigned LiveOutDepth = PI->getSUnit()->getDepth() +
>> PI->getLatency();
>> + // Visit all local users of the vreg def.
>> + for (VReg2UseMap::iterator
>> + UI = VRegUses.find(Reg); UI != VRegUses.end(); ++UI) {
>> + if (UI->SU == &ExitSU)
>> + continue;
>> +
>> + // Only consider uses of the phi.
>> + LiveRangeQuery LRQ(LI,
>> LIS->getInstructionIndex(UI->SU->getInstr()));
>> + if (!LRQ.valueIn()->isPHIDef())
>> + continue;
>> +
>> + // Cheat a bit and assume that a path spanning two
>> iterations is a
>> + // cycle, which could overestimate in strange cases. This
>> allows cyclic
>> + // latency to be estimated as the minimum height or depth
>> slack.
>> + unsigned CyclicLatency = 0;
>> + if (LiveOutDepth > UI->SU->getDepth())
>> + CyclicLatency = LiveOutDepth - UI->SU->getDepth();
>> + unsigned LiveInHeight = UI->SU->getHeight() +
>> PI->getLatency();
>> + if (LiveInHeight > LiveOutHeight) {
>> + if (LiveInHeight - LiveOutHeight < CyclicLatency)
>> + CyclicLatency = LiveInHeight - LiveOutHeight;
>> + }
>> + else
>> + CyclicLatency = 0;
>> + DEBUG(dbgs() << "Cyclic Path: SU(" <<
>> PI->getSUnit()->NodeNum
>> + << ") -> SU(" << UI->SU->NodeNum << ") = "
>> + << CyclicLatency << "\n");
>> + if (CyclicLatency > MaxCyclicLatency)
>> + MaxCyclicLatency = CyclicLatency;
>> + }
>> + }
>> + }
>> + DEBUG(dbgs() << "Cyclic Critical Path: " << MaxCyclicLatency <<
>> "\n");
>> + return MaxCyclicLatency;
>> +}
>> +
>> void ScheduleDAGInstrs::dumpNode(const SUnit *SU) const {
>> #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
>> SU->getInstr()->dump();
>>
>>
>> _______________________________________________
>> llvm-commits mailing list
>> llvm-commits at cs.uiuc.edu
>> http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits
>>
>
> --
> Hal Finkel
> Assistant Computational Scientist
> Leadership Computing Facility
> Argonne National Laboratory
More information about the llvm-commits
mailing list