[llvm] r189120 - Adds cyclic critical path computation and heuristics, temporarily disabled.

Andrew Trick atrick at apple.com
Fri Aug 23 11:50:25 PDT 2013


On Aug 23, 2013, at 11:18 AM, Hal Finkel <hfinkel at anl.gov> wrote:

> Andy,
> 
> Can you please provide a slightly more verbose explanation of what this is doing? The commit message seems to imply that inside a loop there could be both a cyclic and an acyclic critical path, and that the two are being compared. Is that right? If so, what is the acyclic critical path inside a loop?

That’s right. I meant to add more comments before this feature ended up in a commit. I’ll do that now.

Loop:
i = phi(c)
a = op
b = op(a)
c = op(b, i)

a->b->c is the acyclic critical path.

c1->c2 is the cyclic critical path: the def of c in one iteration feeds, through the phi, the computation of c in the next.
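
Spelled out, this is just the example unrolled across two iterations (subscripts give the iteration number):

i1 = phi(c0)
a1 = op
b1 = op(a1)
c1 = op(b1, i1)

i2 = phi(c1)
a2 = op
b2 = op(a2)
c2 = op(b2, i2)

Only c1->i2->c2 crosses the back edge; the a->b->c chain restarts fresh each iteration, so the OOO engine can overlap many copies of it across iterations.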

The difference between the two is the “lag”. If the lag is longer than the time it takes to issue all of the in-flight instructions, the OOO engine will saturate and stall, waiting for long-latency instructions to execute/retire and free resources. I came up with this formula quickly, based on intuition; it would be interesting to observe this in a simulator and verify the logic.
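
Concretely, checkAcyclicLatency() in the patch below reduces to a comparison like this (the numbers are made up for a hypothetical target with a 32-entry micro-op buffer and unit factors, purely for illustration):

  unsigned BufferLimit = 32 * 1;  // getMicroOpBufferSize() * getMicroOpFactor()
  unsigned LatencyLag  = 40 - 2;  // CriticalPath (40c) - CyclicCritPath (2c)
  bool Limited = (LatencyLag * 1 /*getLatencyFactor()*/) > BufferLimit;  // 38u > 32u

A 38-cycle lag with only 32 micro-ops of buffering means the window fills before the long acyclic chains retire, so IsAcyclicLatencyLimited is set and tryCandidate() schedules aggressively for latency.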

-Andy

> Thanks again,
> Hal
> 
> ----- Original Message -----
>> Author: atrick
>> Date: Fri Aug 23 12:48:43 2013
>> New Revision: 189120
>> 
>> URL: http://llvm.org/viewvc/llvm-project?rev=189120&view=rev
>> Log:
>> Adds cyclic critical path computation and heuristics, temporarily
>> disabled.
>> 
>> Estimate the cyclic critical path within a single block loop. If the
>> acyclic critical path is longer, then the loop will exhaust OOO
>> resources after some number of iterations. If the lag between the
>> acyclic critical path and the cyclic critical path is longer than
>> the time it takes to issue those loop iterations, then aggressively
>> schedule for latency.
>> 
>> Modified:
>>    llvm/trunk/include/llvm/CodeGen/ScheduleDAGInstrs.h
>>    llvm/trunk/lib/CodeGen/MachineScheduler.cpp
>>    llvm/trunk/lib/CodeGen/ScheduleDAGInstrs.cpp
>> 
>> Modified: llvm/trunk/include/llvm/CodeGen/ScheduleDAGInstrs.h
>> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/CodeGen/ScheduleDAGInstrs.h?rev=189120&r1=189119&r2=189120&view=diff
>> ==============================================================================
>> --- llvm/trunk/include/llvm/CodeGen/ScheduleDAGInstrs.h (original)
>> +++ llvm/trunk/include/llvm/CodeGen/ScheduleDAGInstrs.h Fri Aug 23 12:48:43 2013
>> @@ -197,6 +197,9 @@ namespace llvm {
>>     /// input.
>>     void buildSchedGraph(AliasAnalysis *AA, RegPressureTracker *RPTracker = 0);
>> 
>> +    /// Compute the cyclic critical path through the DAG.
>> +    unsigned computeCyclicCriticalPath();
>> +
>>     /// addSchedBarrierDeps - Add dependencies from instructions in the current
>>     /// list of instructions being scheduled to scheduling barrier. We want to
>>     /// make sure instructions which define registers that are either used by
>> 
>> Modified: llvm/trunk/lib/CodeGen/MachineScheduler.cpp
>> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/MachineScheduler.cpp?rev=189120&r1=189119&r2=189120&view=diff
>> ==============================================================================
>> --- llvm/trunk/lib/CodeGen/MachineScheduler.cpp (original)
>> +++ llvm/trunk/lib/CodeGen/MachineScheduler.cpp Fri Aug 23 12:48:43 2013
>> @@ -53,6 +53,9 @@ static cl::opt<unsigned> MISchedCutoff("
>> static bool ViewMISchedDAGs = false;
>> #endif // NDEBUG
>> 
>> +static cl::opt<bool> EnableCyclicPath("misched-cyclicpath", cl::Hidden,
>> +  cl::desc("Enable cyclic critical path analysis."), cl::init(false));
>> +
>> static cl::opt<bool> EnableLoadCluster("misched-cluster", cl::Hidden,
>>   cl::desc("Enable load clustering."), cl::init(true));
>> 
>> @@ -1207,16 +1210,21 @@ public:
>>   struct SchedRemainder {
>>     // Critical path through the DAG in expected latency.
>>     unsigned CriticalPath;
>> +    unsigned CyclicCritPath;
>> 
>>     // Scaled count of micro-ops left to schedule.
>>     unsigned RemIssueCount;
>> 
>> +    bool IsAcyclicLatencyLimited;
>> +
>>     // Unscheduled resources
>>     SmallVector<unsigned, 16> RemainingCounts;
>> 
>>     void reset() {
>>       CriticalPath = 0;
>> +      CyclicCritPath = 0;
>>       RemIssueCount = 0;
>> +      IsAcyclicLatencyLimited = false;
>>       RemainingCounts.clear();
>>     }
>> 
>> @@ -1434,6 +1442,8 @@ public:
>>   virtual void registerRoots();
>> 
>> protected:
>> +  void checkAcyclicLatency();
>> +
>>   void tryCandidate(SchedCandidate &Cand,
>>                     SchedCandidate &TryCand,
>>                     SchedBoundary &Zone,
>> @@ -1547,8 +1557,32 @@ void ConvergingScheduler::releaseBottomN
>>   Bot.releaseNode(SU, SU->BotReadyCycle);
>> }
>> 
>> +void ConvergingScheduler::checkAcyclicLatency() {
>> +  if (Rem.CyclicCritPath == 0 || Rem.CyclicCritPath >= Rem.CriticalPath)
>> +    return;
>> +
>> +  unsigned BufferLimit =
>> +    SchedModel->getMicroOpBufferSize() * SchedModel->getMicroOpFactor();
>> +  unsigned LatencyLag = Rem.CriticalPath - Rem.CyclicCritPath;
>> +  Rem.IsAcyclicLatencyLimited =
>> +    (LatencyLag * SchedModel->getLatencyFactor()) > BufferLimit;
>> +
>> +  DEBUG(dbgs() << "BufferLimit " << BufferLimit << "u / "
>> +        << Rem.RemIssueCount << "u = "
>> +        << (BufferLimit + Rem.RemIssueCount) / Rem.RemIssueCount << " iters. "
>> +        << "Latency = " << LatencyLag << "c = "
>> +        << LatencyLag * SchedModel->getLatencyFactor() << "u\n";
>> +        if (Rem.IsAcyclicLatencyLimited)
>> +          dbgs() << "  ACYCLIC LATENCY LIMIT\n");
>> +}
>> +
>> void ConvergingScheduler::registerRoots() {
>>   Rem.CriticalPath = DAG->ExitSU.getDepth();
>> +
>> +  if (EnableCyclicPath) {
>> +    Rem.CyclicCritPath = DAG->computeCyclicCriticalPath();
>> +    checkAcyclicLatency();
>> +  }
>>   // Some roots may not feed into ExitSU. Check all of them in case.
>>   for (std::vector<SUnit*>::const_iterator
>>          I = Bot.Available.begin(), E = Bot.Available.end(); I != E; ++I) {
>> @@ -2096,6 +2130,32 @@ static int biasPhysRegCopy(const SUnit *
>>   return 0;
>> }
>> 
>> +static bool tryLatency(ConvergingScheduler::SchedCandidate &TryCand,
>> +                       ConvergingScheduler::SchedCandidate &Cand,
>> +                       ConvergingScheduler::SchedBoundary &Zone) {
>> +  if (Zone.isTop()) {
>> +    if (Cand.SU->getDepth() > Zone.getScheduledLatency()) {
>> +      if (tryLess(TryCand.SU->getDepth(), Cand.SU->getDepth(),
>> +                  TryCand, Cand, ConvergingScheduler::TopDepthReduce))
>> +        return true;
>> +    }
>> +    if (tryGreater(TryCand.SU->getHeight(), Cand.SU->getHeight(),
>> +                   TryCand, Cand, ConvergingScheduler::TopPathReduce))
>> +      return true;
>> +  }
>> +  else {
>> +    if (Cand.SU->getHeight() > Zone.getScheduledLatency()) {
>> +      if (tryLess(TryCand.SU->getHeight(), Cand.SU->getHeight(),
>> +                  TryCand, Cand, ConvergingScheduler::BotHeightReduce))
>> +        return true;
>> +    }
>> +    if (tryGreater(TryCand.SU->getDepth(), Cand.SU->getDepth(),
>> +                   TryCand, Cand, ConvergingScheduler::BotPathReduce))
>> +      return true;
>> +  }
>> +  return false;
>> +}
>> +
>> /// Apply a set of heuristics to a new candidate. Heuristics are currently
>> /// hierarchical. This may be more efficient than a graduated cost model because
>> /// we don't need to evaluate all aspects of the model for each node in the
>> @@ -2135,6 +2195,10 @@ void ConvergingScheduler::tryCandidate(S
>>                   RegExcess))
>>     return;
>> 
>> +  // For loops that are acyclic path limited, aggressively schedule for latency.
>> +  if (Rem.IsAcyclicLatencyLimited && tryLatency(TryCand, Cand, Zone))
>> +    return;
>> +
>>   // Avoid increasing the max critical pressure in the scheduled region.
>>   if (tryPressure(TryCand.RPDelta.CriticalMax, Cand.RPDelta.CriticalMax,
>>                   TryCand, Cand, RegCritical))
>> @@ -2174,27 +2238,10 @@ void ConvergingScheduler::tryCandidate(S
>>     return;
>> 
>>   // Avoid serializing long latency dependence chains.
>> -  if (Cand.Policy.ReduceLatency) {
>> -    if (Zone.isTop()) {
>> -      if (Cand.SU->getDepth() > Zone.getScheduledLatency()) {
>> -        if (tryLess(TryCand.SU->getDepth(), Cand.SU->getDepth(),
>> -                    TryCand, Cand, TopDepthReduce))
>> -          return;
>> -      }
>> -      if (tryGreater(TryCand.SU->getHeight(), Cand.SU->getHeight(),
>> -                     TryCand, Cand, TopPathReduce))
>> -        return;
>> -    }
>> -    else {
>> -      if (Cand.SU->getHeight() > Zone.getScheduledLatency()) {
>> -        if (tryLess(TryCand.SU->getHeight(), Cand.SU->getHeight(),
>> -                    TryCand, Cand, BotHeightReduce))
>> -          return;
>> -      }
>> -      if (tryGreater(TryCand.SU->getDepth(), Cand.SU->getDepth(),
>> -                     TryCand, Cand, BotPathReduce))
>> -        return;
>> -    }
>> +  // For acyclic path limited loops, latency was already checked above.
>> +  if (Cand.Policy.ReduceLatency && !Rem.IsAcyclicLatencyLimited
>> +      && tryLatency(TryCand, Cand, Zone)) {
>> +    return;
>>   }
>> 
>>   // Prefer immediate defs/users of the last scheduled instruction. This is a
>> 
>> Modified: llvm/trunk/lib/CodeGen/ScheduleDAGInstrs.cpp
>> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/ScheduleDAGInstrs.cpp?rev=189120&r1=189119&r2=189120&view=diff
>> ==============================================================================
>> --- llvm/trunk/lib/CodeGen/ScheduleDAGInstrs.cpp (original)
>> +++ llvm/trunk/lib/CodeGen/ScheduleDAGInstrs.cpp Fri Aug 23 12:48:43 2013
>> @@ -36,6 +36,8 @@
>> #include "llvm/Target/TargetMachine.h"
>> #include "llvm/Target/TargetRegisterInfo.h"
>> #include "llvm/Target/TargetSubtargetInfo.h"
>> +#include <queue>
>> +
>> using namespace llvm;
>> 
>> static cl::opt<bool> EnableAASchedMI("enable-aa-sched-mi", cl::Hidden,
>> @@ -979,6 +981,65 @@ void ScheduleDAGInstrs::buildSchedGraph(
>>   PendingLoads.clear();
>> }
>> 
>> +/// Compute the max cyclic critical path through the DAG. For loops that span
>> +/// basic blocks, MachineTraceMetrics should be used for this instead.
>> +unsigned ScheduleDAGInstrs::computeCyclicCriticalPath() {
>> +  // This only applies to a single block loop.
>> +  if (!BB->isSuccessor(BB))
>> +    return 0;
>> +
>> +  unsigned MaxCyclicLatency = 0;
>> +  // Visit each live out vreg def to find def/use pairs that cross
>> iterations.
>> +  for (SUnit::const_pred_iterator
>> +         PI = ExitSU.Preds.begin(), PE = ExitSU.Preds.end(); PI != PE; ++PI) {
>> +    MachineInstr *MI = PI->getSUnit()->getInstr();
>> +    for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
>> +      const MachineOperand &MO = MI->getOperand(i);
>> +      if (!MO.isReg() || !MO.isDef())
>> +        break;
>> +      unsigned Reg = MO.getReg();
>> +      if (!Reg || TRI->isPhysicalRegister(Reg))
>> +        continue;
>> +
>> +      const LiveInterval &LI = LIS->getInterval(Reg);
>> +      unsigned LiveOutHeight = PI->getSUnit()->getHeight();
>> +      unsigned LiveOutDepth = PI->getSUnit()->getDepth() + PI->getLatency();
>> +      // Visit all local users of the vreg def.
>> +      for (VReg2UseMap::iterator
>> +             UI = VRegUses.find(Reg); UI != VRegUses.end(); ++UI) {
>> +        if (UI->SU == &ExitSU)
>> +          continue;
>> +
>> +        // Only consider uses of the phi.
>> +        LiveRangeQuery LRQ(LI, LIS->getInstructionIndex(UI->SU->getInstr()));
>> +        if (!LRQ.valueIn()->isPHIDef())
>> +          continue;
>> +
>> +        // Cheat a bit and assume that a path spanning two iterations is a
>> +        // cycle, which could overestimate in strange cases. This allows cyclic
>> +        // latency to be estimated as the minimum height or depth slack.
>> +        unsigned CyclicLatency = 0;
>> +        if (LiveOutDepth > UI->SU->getDepth())
>> +          CyclicLatency = LiveOutDepth - UI->SU->getDepth();
>> +        unsigned LiveInHeight = UI->SU->getHeight() + PI->getLatency();
>> +        if (LiveInHeight > LiveOutHeight) {
>> +          if (LiveInHeight - LiveOutHeight < CyclicLatency)
>> +            CyclicLatency = LiveInHeight - LiveOutHeight;
>> +        }
>> +        else
>> +          CyclicLatency = 0;
>> +        DEBUG(dbgs() << "Cyclic Path: SU(" <<
>> PI->getSUnit()->NodeNum
>> +              << ") -> SU(" << UI->SU->NodeNum << ") = "
>> +              << CyclicLatency << "\n");
>> +        if (CyclicLatency > MaxCyclicLatency)
>> +          MaxCyclicLatency = CyclicLatency;
>> +      }
>> +    }
>> +  }
>> +  DEBUG(dbgs() << "Cyclic Critical Path: " << MaxCyclicLatency <<
>> "\n");
>> +  return MaxCyclicLatency;
>> +}
>> +
>> void ScheduleDAGInstrs::dumpNode(const SUnit *SU) const {
>> #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
>>   SU->getInstr()->dump();
>> 
>> 
> 
> -- 
> Hal Finkel
> Assistant Computational Scientist
> Leadership Computing Facility
> Argonne National Laboratory




