<html><head><meta http-equiv="Content-Type" content="text/html charset=us-ascii"></head><body style="word-wrap: break-word; -webkit-nbsp-mode: space; -webkit-line-break: after-white-space;">On Apr 2, 2013, at 10:49 AM, Jakob Stoklund Olesen <<a href="mailto:stoklund@2pi.dk">stoklund@2pi.dk</a>> wrote:<br><div><br class="Apple-interchange-newline"><blockquote type="cite"><div style="letter-spacing: normal; orphans: auto; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; widows: auto; word-spacing: 0px; -webkit-text-stroke-width: 0px;">Author: stoklund<br>Date: Tue Apr  2 12:49:51 2013<br>New Revision: 178553<br><br>URL:<span class="Apple-converted-space"> </span><a href="http://llvm.org/viewvc/llvm-project?rev=178553&view=rev">http://llvm.org/viewvc/llvm-project?rev=178553&view=rev</a><br>Log:<br>Count processor resources individually in MachineTraceMetrics.<br><br>The new instruction scheduling models provide information about the<br>number of cycles consumed on each processor resource. 
This makes it<br>possible to estimate ILP more accurately than simply counting<br>instructions / issue width.<br><br>The functions getResourceDepth() and getResourceLength() now identify<br>the limiting processor resource, and return a cycle count based on that.<br><br>This gives more precise resource information, particularly in traces<br>that use one resource a lot more than others.<br><br>Modified:<br>   llvm/trunk/include/llvm/CodeGen/MachineTraceMetrics.h<br>   llvm/trunk/lib/CodeGen/MachineTraceMetrics.cpp<br><br>Modified: llvm/trunk/include/llvm/CodeGen/MachineTraceMetrics.h<br>URL:<span class="Apple-converted-space"> </span><a href="http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/CodeGen/MachineTraceMetrics.h?rev=178553&r1=178552&r2=178553&view=diff">http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/CodeGen/MachineTraceMetrics.h?rev=178553&r1=178552&r2=178553&view=diff</a><br>==============================================================================<br>--- llvm/trunk/include/llvm/CodeGen/MachineTraceMetrics.h (original)<br>+++ llvm/trunk/include/llvm/CodeGen/MachineTraceMetrics.h Tue Apr  2 12:49:51 2013<br>@@ -107,6 +107,13 @@ public:<br>  /// Get the fixed resource information about MBB. 
Compute it on demand.<br>  const FixedBlockInfo *getResources(const MachineBasicBlock*);<br><br>+  /// Get the scaled number of cycles used per processor resource in MBB.<br>+  /// This is an array with SchedModel.getNumProcResourceKinds() entries.<br>+  /// The getResources() function above must have been called first.<br>+  ///<br>+  /// These numbers have already been scaled by SchedModel.getResourceFactor().<br>+  ArrayRef<unsigned> getProcResourceCycles(unsigned MBBNum) const;<br>+<br>  /// A virtual register or regunit required by a basic block or its trace<br>  /// successors.<br>  struct LiveInReg {<br>@@ -284,6 +291,8 @@ public:<br>  class Ensemble {<br>    SmallVector<TraceBlockInfo, 4> BlockInfo;<br>    DenseMap<const MachineInstr*, InstrCycles> Cycles;<br>+    SmallVector<unsigned, 0> ProcResourceDepths;<br>+    SmallVector<unsigned, 0> ProcResourceHeights;<br>    friend class Trace;<br><br>    void computeTrace(const MachineBasicBlock*);<br>@@ -303,6 +312,8 @@ public:<br>    const MachineLoop *getLoopFor(const MachineBasicBlock*) const;<br>    const TraceBlockInfo *getDepthResources(const MachineBasicBlock*) const;<br>    const TraceBlockInfo *getHeightResources(const MachineBasicBlock*) const;<br>+    ArrayRef<unsigned> getProcResourceDepths(unsigned MBBNum) const;<br>+    ArrayRef<unsigned> getProcResourceHeights(unsigned MBBNum) const;<br><br>  public:<br>    virtual ~Ensemble();<br>@@ -343,8 +354,22 @@ private:<br>  // One entry per basic block, indexed by block number.<br>  SmallVector<FixedBlockInfo, 4> BlockInfo;<br><br>+  // Cycles consumed on each processor resource per block.<br>+  // The number of processor resource kinds is constant for a given subtarget,<br>+  // but it is not known at compile time. 
The number of cycles consumed by<br>+  // block B on processor resource R is at ProcResourceCycles[B*Kinds + R]<br>+  // where Kinds = SchedModel.getNumProcResourceKinds().<br>+  SmallVector<unsigned, 0> ProcResourceCycles;<br>+<br>  // One ensemble per strategy.<br>  Ensemble* Ensembles[TS_NumStrategies];<br>+<br>+  // Convert scaled resource usage to a cycle count that can be compared with<br>+  // latencies.<br>+  unsigned getCycles(unsigned Scaled) {<br>+    unsigned Factor = SchedModel.getLatencyFactor();<br>+    return (Scaled + Factor - 1) / Factor;<br>+  }<br>};<br><br>inline raw_ostream &operator<<(raw_ostream &OS,<br><br>Modified: llvm/trunk/lib/CodeGen/MachineTraceMetrics.cpp<br>URL:<span class="Apple-converted-space"> </span><a href="http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/MachineTraceMetrics.cpp?rev=178553&r1=178552&r2=178553&view=diff">http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/MachineTraceMetrics.cpp?rev=178553&r1=178552&r2=178553&view=diff</a><br>==============================================================================<br>--- llvm/trunk/lib/CodeGen/MachineTraceMetrics.cpp (original)<br>+++ llvm/trunk/lib/CodeGen/MachineTraceMetrics.cpp Tue Apr  2 12:49:51 2013<br>@@ -18,6 +18,7 @@<br>#include "llvm/CodeGen/Passes.h"<br>#include "llvm/MC/MCSubtargetInfo.h"<br>#include "llvm/Support/Debug.h"<br>+#include "llvm/Support/Format.h"<br>#include "llvm/Support/raw_ostream.h"<br>#include "llvm/Target/TargetInstrInfo.h"<br>#include "llvm/Target/TargetRegisterInfo.h"<br>@@ -57,6 +58,8 @@ bool MachineTraceMetrics::runOnMachineFu<br>    MF->getTarget().getSubtarget<TargetSubtargetInfo>();<br>  SchedModel.init(*ST.getSchedModel(), &ST, TII);<br>  BlockInfo.resize(MF->getNumBlockIDs());<br>+  ProcResourceCycles.resize(MF->getNumBlockIDs() *<br>+                            SchedModel.getNumProcResourceKinds());<br>  return false;<br>}<br><br>@@ -85,9 +88,13 @@ MachineTraceMetrics::getResources(const<br>    return FBI;<br><br>  
// Compute resource usage in the block.<br>-  // FIXME: Compute per-functional unit counts.<br>  FBI->HasCalls = false;<br>  unsigned InstrCount = 0;<br>+<br>+  // Add up per-processor resource cycles as well.<br>+  unsigned PRKinds = SchedModel.getNumProcResourceKinds();<br>+  SmallVector<unsigned, 32> PRCycles(PRKinds);<br>+<br>  for (MachineBasicBlock::const_iterator I = MBB->begin(), E = MBB->end();<br>       I != E; ++I) {<br>    const MachineInstr *MI = I;<br>@@ -96,11 +103,39 @@ MachineTraceMetrics::getResources(const<br>    ++InstrCount;<br>    if (MI->isCall())<br>      FBI->HasCalls = true;<br>+<br>+    // Count processor resources used.<br>+    const MCSchedClassDesc *SC = SchedModel.resolveSchedClass(MI);<br>+    if (!SC->isValid())<br>+      continue;<br>+<br>+    for (TargetSchedModel::ProcResIter<br>+         PI = SchedModel.getWriteProcResBegin(SC),<br>+         PE = SchedModel.getWriteProcResEnd(SC); PI != PE; ++PI) {<br>+      assert(PI->ProcResourceIdx < PRKinds && "Bad processor resource kind");<br>+      PRCycles[PI->ProcResourceIdx] += PI->Cycles;<br>+    }<br>  }<br>  FBI->InstrCount = InstrCount;<br>+<br>+  // Scale the resource cycles so they are comparable.<br>+  unsigned PROffset = MBB->getNumber() * PRKinds;<br>+  for (unsigned K = 0; K != PRKinds; ++K)<br>+    ProcResourceCycles[PROffset + K] =<br>+      PRCycles[K] * SchedModel.getResourceFactor(K);<br>+<br>  return FBI;<br>}<br><br>+ArrayRef<unsigned><br>+MachineTraceMetrics::getProcResourceCycles(unsigned MBBNum) const {<br>+  assert(BlockInfo[MBBNum].hasResources() &&<br>+         "getResources() must be called before getProcResourceCycles()");<br>+  unsigned PRKinds = SchedModel.getNumProcResourceKinds();<br>+  return ArrayRef<unsigned>(&ProcResourceCycles[MBBNum * PRKinds], PRKinds);<br>+}<br>+<br>+<br>//===----------------------------------------------------------------------===//<br>//                         Ensemble utility 
functions<br>//===----------------------------------------------------------------------===//<br>@@ -108,6 +143,9 @@ MachineTraceMetrics::getResources(const<br>MachineTraceMetrics::Ensemble::Ensemble(MachineTraceMetrics *ct)<br>  : MTM(*ct) {<br>  BlockInfo.resize(MTM.BlockInfo.size());<br>+  unsigned PRKinds = MTM.SchedModel.getNumProcResourceKinds();<br>+  ProcResourceDepths.resize(MTM.BlockInfo.size() * PRKinds);<br>+  ProcResourceHeights.resize(MTM.BlockInfo.size() * PRKinds);<br>}<br><br>// Virtual destructor serves as an anchor.<br>@@ -123,21 +161,32 @@ MachineTraceMetrics::Ensemble::getLoopFo<br>void MachineTraceMetrics::Ensemble::<br>computeDepthResources(const MachineBasicBlock *MBB) {<br>  TraceBlockInfo *TBI = &BlockInfo[MBB->getNumber()];<br>+  unsigned PRKinds = MTM.SchedModel.getNumProcResourceKinds();<br>+  unsigned PROffset = MBB->getNumber() * PRKinds;<br><br>  // Compute resources from trace above. The top block is simple.<br>  if (!TBI->Pred) {<br>    TBI->InstrDepth = 0;<br>    TBI->Head = MBB->getNumber();<br>+    std::fill(ProcResourceDepths.begin() + PROffset,<br>+              ProcResourceDepths.begin() + PROffset + PRKinds, 0);<br>    return;<br>  }<br><br>  // Compute from the block above. 
A post-order traversal ensures the<br>  // predecessor is always computed first.<br>-  TraceBlockInfo *PredTBI = &BlockInfo[TBI->Pred->getNumber()];<br>+  unsigned PredNum = TBI->Pred->getNumber();<br>+  TraceBlockInfo *PredTBI = &BlockInfo[PredNum];<br>  assert(PredTBI->hasValidDepth() && "Trace above has not been computed yet");<br>  const FixedBlockInfo *PredFBI = MTM.getResources(TBI->Pred);<br>  TBI->InstrDepth = PredTBI->InstrDepth + PredFBI->InstrCount;<br>  TBI->Head = PredTBI->Head;<br>+<br>+  // Compute per-resource depths.<br>+  ArrayRef<unsigned> PredPRDepths = getProcResourceDepths(PredNum);<br>+  ArrayRef<unsigned> PredPRCycles = MTM.getProcResourceCycles(PredNum);<br>+  for (unsigned K = 0; K != PRKinds; ++K)<br>+    ProcResourceDepths[PROffset + K] = PredPRDepths[K] + PredPRCycles[K];<br>}<br><br>// Update resource-related information in the TraceBlockInfo for MBB.<br>@@ -145,22 +194,33 @@ computeDepthResources(const MachineBasic<br>void MachineTraceMetrics::Ensemble::<br>computeHeightResources(const MachineBasicBlock *MBB) {<br>  TraceBlockInfo *TBI = &BlockInfo[MBB->getNumber()];<br>+  unsigned PRKinds = MTM.SchedModel.getNumProcResourceKinds();<br>+  unsigned PROffset = MBB->getNumber() * PRKinds;<br><br>  // Compute resources for the current block.<br>  TBI->InstrHeight = MTM.getResources(MBB)->InstrCount;<br>+  ArrayRef<unsigned> PRCycles = MTM.getProcResourceCycles(MBB->getNumber());<br><br>  // The trace tail is done.<br>  if (!TBI->Succ) {<br>    TBI->Tail = MBB->getNumber();<br>+    std::copy(PRCycles.begin(), PRCycles.end(),<br>+              ProcResourceHeights.begin() + PROffset);<br>    return;<br>  }<br><br>  // Compute from the block below. 
A post-order traversal ensures the<br>  // predecessor is always computed first.<br>-  TraceBlockInfo *SuccTBI = &BlockInfo[TBI->Succ->getNumber()];<br>+  unsigned SuccNum = TBI->Succ->getNumber();<br>+  TraceBlockInfo *SuccTBI = &BlockInfo[SuccNum];<br>  assert(SuccTBI->hasValidHeight() && "Trace below has not been computed yet");<br>  TBI->InstrHeight += SuccTBI->InstrHeight;<br>  TBI->Tail = SuccTBI->Tail;<br>+<br>+  // Compute per-resource heights.<br>+  ArrayRef<unsigned> SuccPRHeights = getProcResourceHeights(SuccNum);<br>+  for (unsigned K = 0; K != PRKinds; ++K)<br>+    ProcResourceHeights[PROffset + K] = SuccPRHeights[K] + PRCycles[K];<br>}<br><br>// Check if depth resources for MBB are valid and return the TBI.<br>@@ -181,6 +241,31 @@ getHeightResources(const MachineBasicBlo<br>  return TBI->hasValidHeight() ? TBI : 0;<br>}<br><br>+/// Get an array of processor resource depths for MBB. Indexed by processor<br>+/// resource kind, this array contains the scaled processor resources consumed<br>+/// by all blocks preceding MBB in its trace. It does not include instructions<br>+/// in MBB.<br>+///<br>+/// Compare TraceBlockInfo::InstrDepth.<br>+ArrayRef<unsigned><br>+MachineTraceMetrics::Ensemble::<br>+getProcResourceDepths(unsigned MBBNum) const {<br>+  unsigned PRKinds = MTM.SchedModel.getNumProcResourceKinds();<br>+  return ArrayRef<unsigned>(&ProcResourceDepths[MBBNum * PRKinds], PRKinds);<br>+}<br>+<br>+/// Get an array of processor resource heights for MBB. 
Indexed by processor<br>+/// resource kind, this array contains the scaled processor resources consumed<br>+/// by this block and all blocks following it in its trace.<br>+///<br>+/// Compare TraceBlockInfo::InstrHeight.<br>+ArrayRef<unsigned><br>+MachineTraceMetrics::Ensemble::<br>+getProcResourceHeights(unsigned MBBNum) const {<br>+  unsigned PRKinds = MTM.SchedModel.getNumProcResourceKinds();<br>+  return ArrayRef<unsigned>(&ProcResourceHeights[MBBNum * PRKinds], PRKinds);<br>+}<br>+<br>//===----------------------------------------------------------------------===//<br>//                         Trace Selection Strategies<br>//===----------------------------------------------------------------------===//<br>@@ -713,11 +798,24 @@ computeInstrDepths(const MachineBasicBlo<br>  SmallVector<DataDep, 8> Deps;<br>  while (!Stack.empty()) {<br>    MBB = Stack.pop_back_val();<br>-    DEBUG(dbgs() << "Depths for BB#" << MBB->getNumber() << ":\n");<br>+    DEBUG(dbgs() << "\nDepths for BB#" << MBB->getNumber() << ":\n");<br>    TraceBlockInfo &TBI = BlockInfo[MBB->getNumber()];<br>    TBI.HasValidInstrDepths = true;<br>    TBI.CriticalPath = 0;<br><br>+    // Print out resource depths here as well.<br>+    DEBUG({<br>+      dbgs() << format("%7u Instructions\n", TBI.InstrDepth);<br>+      ArrayRef<unsigned> PRDepths = getProcResourceDepths(MBB->getNumber());<br>+      for (unsigned K = 0; K != PRDepths.size(); ++K)<br>+        if (PRDepths[K]) {<br>+          unsigned Factor = MTM.SchedModel.getResourceFactor(K);<br>+          dbgs() << format("%6uc @ ", MTM.getCycles(PRDepths[K]))<br>+                 << MTM.SchedModel.getProcResource(K)->Name << " ("<br>+                 << PRDepths[K]/Factor << " ops x" << Factor << ")\n";<br>+        }<br>+    });<br>+<br>    // Also compute the critical path length through MBB when possible.<br>    if (TBI.HasValidInstrHeights)<br>      TBI.CriticalPath = computeCrossBlockCriticalPath(TBI);<br>@@ -928,6 +1026,18 @@ 
computeInstrHeights(const MachineBasicBl<br>    TBI.HasValidInstrHeights = true;<br>    TBI.CriticalPath = 0;<br><br>+    DEBUG({<br>+      dbgs() << format("%7u Instructions\n", TBI.InstrHeight);<br>+      ArrayRef<unsigned> PRHeights = getProcResourceHeights(MBB->getNumber());<br>+      for (unsigned K = 0; K != PRHeights.size(); ++K)<br>+        if (PRHeights[K]) {<br>+          unsigned Factor = MTM.SchedModel.getResourceFactor(K);<br>+          dbgs() << format("%6uc @ ", MTM.getCycles(PRHeights[K]))<br>+                 << MTM.SchedModel.getProcResource(K)->Name << " ("<br>+                 << PRHeights[K]/Factor << " ops x" << Factor << ")\n";<br>+        }<br>+    });<br>+<br>    // Get dependencies from PHIs in the trace successor.<br>    const MachineBasicBlock *Succ = TBI.Succ;<br>    // If MBB is the last block in the trace, and it has a back-edge to the<br>@@ -1058,27 +1168,52 @@ MachineTraceMetrics::Trace::getPHIDepth(<br>}<br><br>unsigned MachineTraceMetrics::Trace::getResourceDepth(bool Bottom) const {<br>-  // For now, we compute the resource depth from instruction count / issue<br>-  // width. 
Eventually, we should compute resource depth per functional unit<br>-  // and return the max.<br>+  // Find the limiting processor resource.<br>+  // Numbers have been pre-scaled to be comparable.<br>+  unsigned PRMax = 0;<br>+  ArrayRef<unsigned> PRDepths = TE.getProcResourceDepths(getBlockNum());<br>+  if (Bottom) {<br>+    ArrayRef<unsigned> PRCycles = TE.MTM.getProcResourceCycles(getBlockNum());<br>+    for (unsigned K = 0; K != PRDepths.size(); ++K)<br>+      PRMax = std::max(PRMax, PRDepths[K] + PRCycles[K]);<br>+  } else {<br>+    for (unsigned K = 0; K != PRDepths.size(); ++K)<br>+      PRMax = std::max(PRMax, PRDepths[K]);<br>+  }<br>+  // Convert to cycle count.<br>+  PRMax = TE.MTM.getCycles(PRMax);<br>+<br>  unsigned Instrs = TBI.InstrDepth;<br>  if (Bottom)<br>    Instrs += TE.MTM.BlockInfo[getBlockNum()].InstrCount;<br>  if (unsigned IW = TE.MTM.SchedModel.getIssueWidth())<br>    Instrs /= IW;<br>  // Assume issue width 1 without a schedule model.<br>-  return Instrs;<br>+  return std::max(Instrs, PRMax);<br>}<br><br>unsigned MachineTraceMetrics::Trace::<br>getResourceLength(ArrayRef<const MachineBasicBlock*> Extrablocks) const {<br>+  // Add up resources above and below the center block.<br>+  ArrayRef<unsigned> PRDepths = TE.getProcResourceDepths(getBlockNum());<br>+  ArrayRef<unsigned> PRHeights = TE.getProcResourceHeights(getBlockNum());<br>+  unsigned PRMax = 0;<br>+  for (unsigned K = 0; K != PRDepths.size(); ++K) {<br>+    unsigned PRCycles = PRDepths[K] + PRHeights[K];<br>+    for (unsigned I = 0; I != Extrablocks.size(); ++I)<br>+      PRCycles += TE.MTM.getProcResourceCycles(Extrablocks[I]->getNumber())[K];<br>+    PRMax = std::max(PRMax, PRCycles);<br>+  }<br>+  // Convert to cycle count.<br>+  PRMax = TE.MTM.getCycles(PRMax);<br>+<br>  unsigned Instrs = TBI.InstrDepth + TBI.InstrHeight;<br>  for (unsigned i = 0, e = Extrablocks.size(); i != e; ++i)<br>    Instrs += TE.MTM.getResources(Extrablocks[i])->InstrCount;<br>  if (unsigned IW = 
TE.MTM.SchedModel.getIssueWidth())<br>    Instrs /= IW;<br>  // Assume issue width 1 without a schedule model.<br>-  return Instrs;<br>+  return std::max(Instrs, PRMax);<br>}<br></div></blockquote></div><br><div>It's not clear to me when you account for the resources in the current block. Is this missing a call to getProcResourceCycles(getBlockNum())?</div><div><br></div><div>-Andy</div><div><br></div></body></html>