<div dir="ltr"><span style="color:rgb(0,0,0);font-family:arial,sans-serif;font-size:13px">+/// Compute the max cyclic critical path through the DAG. For loops that span</span><br style="color:rgb(0,0,0);font-family:arial,sans-serif;font-size:13px">
<span style="color:rgb(0,0,0);font-family:arial,sans-serif;font-size:13px">+/// basic blocks, MachineTraceMetrics should be used for this instead.</span><br style="color:rgb(0,0,0);font-family:arial,sans-serif;font-size:13px">
<span style="color:rgb(0,0,0);font-family:arial,sans-serif;font-size:13px">+unsigned ScheduleDAGInstrs::</span><span style="color:rgb(0,0,0);font-family:arial,sans-serif;font-size:13px">computeCyclicCriticalPath() {</span><br>
<div><span style="color:rgb(0,0,0);font-family:arial,sans-serif;font-size:13px"><br></span></div><div><span style="color:rgb(0,0,0);font-family:arial,sans-serif;font-size:13px">This seems to suggest that MachineTraceMetrics provides a superset of the functionality that this routine provides (>1BB vs 1BB). What is the rationale for having the routine then?</span></div>
<div><span style="color:rgb(0,0,0);font-family:arial,sans-serif;font-size:13px"><br></span></div><div><span style="color:rgb(0,0,0);font-family:arial,sans-serif;font-size:13px">-- Sean Silva</span></div></div><div class="gmail_extra">
<br><br><div class="gmail_quote">On Fri, Aug 23, 2013 at 1:48 PM, Andrew Trick <span dir="ltr">&lt;<a href="mailto:atrick@apple.com" target="_blank">atrick@apple.com</a>&gt;</span> wrote:<br><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
Author: atrick<br>
Date: Fri Aug 23 12:48:43 2013<br>
New Revision: 189120<br>
<br>
URL: <a href="http://llvm.org/viewvc/llvm-project?rev=189120&view=rev" target="_blank">http://llvm.org/viewvc/llvm-project?rev=189120&view=rev</a><br>
Log:<br>
Adds cyclic critical path computation and heuristics, temporarily disabled.<br>
<br>
Estimate the cyclic critical path within a single block loop. If the<br>
acyclic critical path is longer, then the loop will exhaust OOO<br>
resources after some number of iterations. If lag between the acyclic<br>
critical path and cyclic critical path is longer than the time it takes<br>
to issue those loop iterations, then aggressively schedule for<br>
latency.<br>
<br>
Modified:<br>
llvm/trunk/include/llvm/CodeGen/ScheduleDAGInstrs.h<br>
llvm/trunk/lib/CodeGen/MachineScheduler.cpp<br>
llvm/trunk/lib/CodeGen/ScheduleDAGInstrs.cpp<br>
<br>
Modified: llvm/trunk/include/llvm/CodeGen/ScheduleDAGInstrs.h<br>
URL: <a href="http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/CodeGen/ScheduleDAGInstrs.h?rev=189120&r1=189119&r2=189120&view=diff" target="_blank">http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/CodeGen/ScheduleDAGInstrs.h?rev=189120&r1=189119&r2=189120&view=diff</a><br>
==============================================================================<br>
--- llvm/trunk/include/llvm/CodeGen/ScheduleDAGInstrs.h (original)<br>
+++ llvm/trunk/include/llvm/CodeGen/ScheduleDAGInstrs.h Fri Aug 23 12:48:43 2013<br>
@@ -197,6 +197,9 @@ namespace llvm {<br>
/// input.<br>
void buildSchedGraph(AliasAnalysis *AA, RegPressureTracker *RPTracker = 0);<br>
<br>
+ /// Compute the cyclic critical path through the DAG.<br>
+ unsigned computeCyclicCriticalPath();<br>
+<br>
/// addSchedBarrierDeps - Add dependencies from instructions in the current<br>
/// list of instructions being scheduled to scheduling barrier. We want to<br>
/// make sure instructions which define registers that are either used by<br>
<br>
Modified: llvm/trunk/lib/CodeGen/MachineScheduler.cpp<br>
URL: <a href="http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/MachineScheduler.cpp?rev=189120&r1=189119&r2=189120&view=diff" target="_blank">http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/MachineScheduler.cpp?rev=189120&r1=189119&r2=189120&view=diff</a><br>
==============================================================================<br>
--- llvm/trunk/lib/CodeGen/MachineScheduler.cpp (original)<br>
+++ llvm/trunk/lib/CodeGen/MachineScheduler.cpp Fri Aug 23 12:48:43 2013<br>
@@ -53,6 +53,9 @@ static cl::opt&lt;unsigned&gt; MISchedCutoff("<br>
static bool ViewMISchedDAGs = false;<br>
#endif // NDEBUG<br>
<br>
+static cl::opt&lt;bool&gt; EnableCyclicPath("misched-cyclicpath", cl::Hidden,<br>
+ cl::desc("Enable cyclic critical path analysis."), cl::init(false));<br>
+<br>
static cl::opt&lt;bool&gt; EnableLoadCluster("misched-cluster", cl::Hidden,<br>
cl::desc("Enable load clustering."), cl::init(true));<br>
<br>
@@ -1207,16 +1210,21 @@ public:<br>
struct SchedRemainder {<br>
// Critical path through the DAG in expected latency.<br>
unsigned CriticalPath;<br>
+ unsigned CyclicCritPath;<br>
<br>
// Scaled count of micro-ops left to schedule.<br>
unsigned RemIssueCount;<br>
<br>
+ bool IsAcyclicLatencyLimited;<br>
+<br>
// Unscheduled resources<br>
SmallVector&lt;unsigned, 16&gt; RemainingCounts;<br>
<br>
void reset() {<br>
CriticalPath = 0;<br>
+ CyclicCritPath = 0;<br>
RemIssueCount = 0;<br>
+ IsAcyclicLatencyLimited = false;<br>
RemainingCounts.clear();<br>
}<br>
<br>
@@ -1434,6 +1442,8 @@ public:<br>
virtual void registerRoots();<br>
<br>
protected:<br>
+ void checkAcyclicLatency();<br>
+<br>
void tryCandidate(SchedCandidate &Cand,<br>
SchedCandidate &TryCand,<br>
SchedBoundary &Zone,<br>
@@ -1547,8 +1557,32 @@ void ConvergingScheduler::releaseBottomN<br>
Bot.releaseNode(SU, SU->BotReadyCycle);<br>
}<br>
<br>
+void ConvergingScheduler::checkAcyclicLatency() {<br>
+ if (Rem.CyclicCritPath == 0 || Rem.CyclicCritPath >= Rem.CriticalPath)<br>
+ return;<br>
+<br>
+ unsigned BufferLimit =<br>
+ SchedModel->getMicroOpBufferSize() * SchedModel->getMicroOpFactor();<br>
+ unsigned LatencyLag = Rem.CriticalPath - Rem.CyclicCritPath;<br>
+ Rem.IsAcyclicLatencyLimited =<br>
+ (LatencyLag * SchedModel->getLatencyFactor()) > BufferLimit;<br>
+<br>
+ DEBUG(dbgs() << "BufferLimit " << BufferLimit << "u / "<br>
+ << Rem.RemIssueCount << "u = "<br>
+ << (BufferLimit + Rem.RemIssueCount) / Rem.RemIssueCount << " iters. "<br>
+ << "Latency = " << LatencyLag << "c = "<br>
+ << LatencyLag * SchedModel->getLatencyFactor() << "u\n";<br>
+ if (Rem.IsAcyclicLatencyLimited)<br>
+ dbgs() << " ACYCLIC LATENCY LIMIT\n");<br>
+}<br>
+<br>
void ConvergingScheduler::registerRoots() {<br>
Rem.CriticalPath = DAG->ExitSU.getDepth();<br>
+<br>
+ if (EnableCyclicPath) {<br>
+ Rem.CyclicCritPath = DAG->computeCyclicCriticalPath();<br>
+ checkAcyclicLatency();<br>
+ }<br>
// Some roots may not feed into ExitSU. Check all of them in case.<br>
for (std::vector&lt;SUnit*&gt;::const_iterator<br>
I = Bot.Available.begin(), E = Bot.Available.end(); I != E; ++I) {<br>
@@ -2096,6 +2130,32 @@ static int biasPhysRegCopy(const SUnit *<br>
return 0;<br>
}<br>
<br>
+static bool tryLatency(ConvergingScheduler::SchedCandidate &TryCand,<br>
+ ConvergingScheduler::SchedCandidate &Cand,<br>
+ ConvergingScheduler::SchedBoundary &Zone) {<br>
+ if (Zone.isTop()) {<br>
+ if (Cand.SU->getDepth() > Zone.getScheduledLatency()) {<br>
+ if (tryLess(TryCand.SU->getDepth(), Cand.SU->getDepth(),<br>
+ TryCand, Cand, ConvergingScheduler::TopDepthReduce))<br>
+ return true;<br>
+ }<br>
+ if (tryGreater(TryCand.SU->getHeight(), Cand.SU->getHeight(),<br>
+ TryCand, Cand, ConvergingScheduler::TopPathReduce))<br>
+ return true;<br>
+ }<br>
+ else {<br>
+ if (Cand.SU->getHeight() > Zone.getScheduledLatency()) {<br>
+ if (tryLess(TryCand.SU->getHeight(), Cand.SU->getHeight(),<br>
+ TryCand, Cand, ConvergingScheduler::BotHeightReduce))<br>
+ return true;<br>
+ }<br>
+ if (tryGreater(TryCand.SU->getDepth(), Cand.SU->getDepth(),<br>
+ TryCand, Cand, ConvergingScheduler::BotPathReduce))<br>
+ return true;<br>
+ }<br>
+ return false;<br>
+}<br>
+<br>
/// Apply a set of heursitics to a new candidate. Heuristics are currently<br>
/// hierarchical. This may be more efficient than a graduated cost model because<br>
/// we don't need to evaluate all aspects of the model for each node in the<br>
@@ -2135,6 +2195,10 @@ void ConvergingScheduler::tryCandidate(S<br>
RegExcess))<br>
return;<br>
<br>
+ // For loops that are acyclic path limited, aggressively schedule for latency.<br>
+ if (Rem.IsAcyclicLatencyLimited && tryLatency(TryCand, Cand, Zone))<br>
+ return;<br>
+<br>
// Avoid increasing the max critical pressure in the scheduled region.<br>
if (tryPressure(TryCand.RPDelta.CriticalMax, Cand.RPDelta.CriticalMax,<br>
TryCand, Cand, RegCritical))<br>
@@ -2174,27 +2238,10 @@ void ConvergingScheduler::tryCandidate(S<br>
return;<br>
<br>
// Avoid serializing long latency dependence chains.<br>
- if (Cand.Policy.ReduceLatency) {<br>
- if (Zone.isTop()) {<br>
- if (Cand.SU->getDepth() > Zone.getScheduledLatency()) {<br>
- if (tryLess(TryCand.SU->getDepth(), Cand.SU->getDepth(),<br>
- TryCand, Cand, TopDepthReduce))<br>
- return;<br>
- }<br>
- if (tryGreater(TryCand.SU->getHeight(), Cand.SU->getHeight(),<br>
- TryCand, Cand, TopPathReduce))<br>
- return;<br>
- }<br>
- else {<br>
- if (Cand.SU->getHeight() > Zone.getScheduledLatency()) {<br>
- if (tryLess(TryCand.SU->getHeight(), Cand.SU->getHeight(),<br>
- TryCand, Cand, BotHeightReduce))<br>
- return;<br>
- }<br>
- if (tryGreater(TryCand.SU->getDepth(), Cand.SU->getDepth(),<br>
- TryCand, Cand, BotPathReduce))<br>
- return;<br>
- }<br>
+ // For acyclic path limited loops, latency was already checked above.<br>
+ if (Cand.Policy.ReduceLatency && !Rem.IsAcyclicLatencyLimited<br>
+ && tryLatency(TryCand, Cand, Zone)) {<br>
+ return;<br>
}<br>
<br>
// Prefer immediate defs/users of the last scheduled instruction. This is a<br>
<br>
Modified: llvm/trunk/lib/CodeGen/ScheduleDAGInstrs.cpp<br>
URL: <a href="http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/ScheduleDAGInstrs.cpp?rev=189120&r1=189119&r2=189120&view=diff" target="_blank">http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/ScheduleDAGInstrs.cpp?rev=189120&r1=189119&r2=189120&view=diff</a><br>
==============================================================================<br>
--- llvm/trunk/lib/CodeGen/ScheduleDAGInstrs.cpp (original)<br>
+++ llvm/trunk/lib/CodeGen/ScheduleDAGInstrs.cpp Fri Aug 23 12:48:43 2013<br>
@@ -36,6 +36,8 @@<br>
#include "llvm/Target/TargetMachine.h"<br>
#include "llvm/Target/TargetRegisterInfo.h"<br>
#include "llvm/Target/TargetSubtargetInfo.h"<br>
+#include &lt;queue&gt;<br>
+<br>
using namespace llvm;<br>
<br>
static cl::opt&lt;bool&gt; EnableAASchedMI("enable-aa-sched-mi", cl::Hidden,<br>
@@ -979,6 +981,65 @@ void ScheduleDAGInstrs::buildSchedGraph(<br>
PendingLoads.clear();<br>
}<br>
<br>
+/// Compute the max cyclic critical path through the DAG. For loops that span<br>
+/// basic blocks, MachineTraceMetrics should be used for this instead.<br>
+unsigned ScheduleDAGInstrs::computeCyclicCriticalPath() {<br>
+ // This only applies to single block loop.<br>
+ if (!BB->isSuccessor(BB))<br>
+ return 0;<br>
+<br>
+ unsigned MaxCyclicLatency = 0;<br>
+ // Visit each live out vreg def to find def/use pairs that cross iterations.<br>
+ for (SUnit::const_pred_iterator<br>
+ PI = ExitSU.Preds.begin(), PE = ExitSU.Preds.end(); PI != PE; ++PI) {<br>
+ MachineInstr *MI = PI->getSUnit()->getInstr();<br>
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {<br>
+ const MachineOperand &MO = MI->getOperand(i);<br>
+ if (!MO.isReg() || !MO.isDef())<br>
+ break;<br>
+ unsigned Reg = MO.getReg();<br>
+ if (!Reg || TRI->isPhysicalRegister(Reg))<br>
+ continue;<br>
+<br>
+ const LiveInterval &LI = LIS->getInterval(Reg);<br>
+ unsigned LiveOutHeight = PI->getSUnit()->getHeight();<br>
+ unsigned LiveOutDepth = PI->getSUnit()->getDepth() + PI->getLatency();<br>
+ // Visit all local users of the vreg def.<br>
+ for (VReg2UseMap::iterator<br>
+ UI = VRegUses.find(Reg); UI != VRegUses.end(); ++UI) {<br>
+ if (UI->SU == &ExitSU)<br>
+ continue;<br>
+<br>
+ // Only consider uses of the phi.<br>
+ LiveRangeQuery LRQ(LI, LIS->getInstructionIndex(UI->SU->getInstr()));<br>
+ if (!LRQ.valueIn()->isPHIDef())<br>
+ continue;<br>
+<br>
+ // Cheat a bit and assume that a path spanning two iterations is a<br>
+ // cycle, which could overestimate in strange cases. This allows cyclic<br>
+ // latency to be estimated as the minimum height or depth slack.<br>
+ unsigned CyclicLatency = 0;<br>
+ if (LiveOutDepth > UI->SU->getDepth())<br>
+ CyclicLatency = LiveOutDepth - UI->SU->getDepth();<br>
+ unsigned LiveInHeight = UI->SU->getHeight() + PI->getLatency();<br>
+ if (LiveInHeight > LiveOutHeight) {<br>
+ if (LiveInHeight - LiveOutHeight < CyclicLatency)<br>
+ CyclicLatency = LiveInHeight - LiveOutHeight;<br>
+ }<br>
+ else<br>
+ CyclicLatency = 0;<br>
+ DEBUG(dbgs() << "Cyclic Path: SU(" << PI->getSUnit()->NodeNum<br>
+ << ") -> SU(" << UI->SU->NodeNum << ") = "<br>
+ << CyclicLatency << "\n");<br>
+ if (CyclicLatency > MaxCyclicLatency)<br>
+ MaxCyclicLatency = CyclicLatency;<br>
+ }<br>
+ }<br>
+ }<br>
+ DEBUG(dbgs() << "Cyclic Critical Path: " << MaxCyclicLatency << "\n");<br>
+ return MaxCyclicLatency;<br>
+}<br>
+<br>
void ScheduleDAGInstrs::dumpNode(const SUnit *SU) const {<br>
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)<br>
SU->getInstr()->dump();<br>
<br>
<br>
_______________________________________________<br>
llvm-commits mailing list<br>
<a href="mailto:llvm-commits@cs.uiuc.edu">llvm-commits@cs.uiuc.edu</a><br>
<a href="http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits" target="_blank">http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits</a><br>
</blockquote></div><br></div>