[llvm] r372337 - [MCA] Improved cost computation for loop carried dependencies in the bottleneck analysis.

Andrea Di Biagio via llvm-commits llvm-commits at lists.llvm.org
Thu Sep 19 09:05:11 PDT 2019


Author: adibiagio
Date: Thu Sep 19 09:05:11 2019
New Revision: 372337

URL: http://llvm.org/viewvc/llvm-project?rev=372337&view=rev
Log:
[MCA] Improved cost computation for loop carried dependencies in the bottleneck analysis.

This patch introduces a cut-off threshold for dependency edge frequences with
the goal of simplifying the critical sequence computation.  This patch also
removes the cost normalization for loop carried dependencies.  We didn't really
need to artificially amplify the cost of loop-carried dependencies since it is
already computed as the integral over time of the delay (in cycle).

In the absence of backend stalls there is no need for computing a critical
sequence. With this patch we early exit from the critical sequence computation
if no bottleneck was reported during the simulation.


Modified:
    llvm/trunk/test/tools/llvm-mca/X86/SkylakeClient/bottleneck-analysis.s
    llvm/trunk/tools/llvm-mca/Views/BottleneckAnalysis.cpp
    llvm/trunk/tools/llvm-mca/Views/BottleneckAnalysis.h

Modified: llvm/trunk/test/tools/llvm-mca/X86/SkylakeClient/bottleneck-analysis.s
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/tools/llvm-mca/X86/SkylakeClient/bottleneck-analysis.s?rev=372337&r1=372336&r2=372337&view=diff
==============================================================================
--- llvm/trunk/test/tools/llvm-mca/X86/SkylakeClient/bottleneck-analysis.s (original)
+++ llvm/trunk/test/tools/llvm-mca/X86/SkylakeClient/bottleneck-analysis.s Thu Sep 19 09:05:11 2019
@@ -67,18 +67,14 @@
 # CHECK-NEXT:  |      11.   vmovaps	%xmm7, %xmm4
 # CHECK-NEXT:  +----> 12.   vfmadd213ps	%xmm1, %xmm14, %xmm2      ## REGISTER dependency:  %xmm1
 # CHECK-NEXT:  |      13.   vmovaps	%xmm6, %xmm1
-# CHECK-NEXT:  |      14.   vfmadd213ps	%xmm2, %xmm15, %xmm3
-# CHECK-NEXT:  |      15.   vpermilps	$170, %xmm3, %xmm0
+# CHECK-NEXT:  +----> 14.   vfmadd213ps	%xmm2, %xmm15, %xmm3      ## REGISTER dependency:  %xmm2
+# CHECK-NEXT:  +----> 15.   vpermilps	$170, %xmm3, %xmm0                ## REGISTER dependency:  %xmm3
 # CHECK-NEXT:  |      16.   vmovups	%xmm3, (%rdx,%rax)
 # CHECK-NEXT:  |      17.   vpermilps	$255, %xmm3, %xmm2
 # CHECK-NEXT:  |      18.   addq	$16, %rax
 # CHECK-NEXT:  |      19.   decl	%ecx
-# CHECK-NEXT:  |      20.   vmovaps	%xmm0, %xmm3
-# CHECK-NEXT:  |      21.   jne	.LBB0_4
-# CHECK-NEXT:  |
-# CHECK-NEXT:  |    < loop carried >
-# CHECK-NEXT:  |
-# CHECK-NEXT:  +----> 2.    vmulps	-24(%rsp), %xmm7, %xmm8           ## RESOURCE interference:  SKLPort1 [ probability: 45% ]
+# CHECK-NEXT:  +----> 20.   vmovaps	%xmm0, %xmm3                      ## REGISTER dependency:  %xmm0
+# CHECK-NEXT:         21.   jne	.LBB0_4
 
 # CHECK:      Instruction Info:
 # CHECK-NEXT: [1]: #uOps

Modified: llvm/trunk/tools/llvm-mca/Views/BottleneckAnalysis.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/tools/llvm-mca/Views/BottleneckAnalysis.cpp?rev=372337&r1=372336&r2=372337&view=diff
==============================================================================
--- llvm/trunk/tools/llvm-mca/Views/BottleneckAnalysis.cpp (original)
+++ llvm/trunk/tools/llvm-mca/Views/BottleneckAnalysis.cpp Thu Sep 19 09:05:11 2019
@@ -165,10 +165,33 @@ void DependencyGraph::dumpDependencyEdge
            "Unsupported dependency type!");
     OS << " - RESOURCE MASK: " << DE.ResourceOrRegID;
   }
-  OS << " - CYCLES: " << DE.Cost << '\n';
+  OS << " - COST: " << DE.Cost << '\n';
 }
 #endif // NDEBUG
 
+void DependencyGraph::pruneEdges(unsigned Iterations) {
+  for (DGNode &N : Nodes) {
+    unsigned NumPruned = 0;
+    const unsigned Size = N.OutgoingEdges.size();
+    // Use a cut-off threshold to prune edges with a low frequency.
+    for (unsigned I = 0, E = Size; I < E; ++I) {
+      DependencyEdge &Edge = N.OutgoingEdges[I];
+      if (Edge.Frequency == Iterations)
+        continue;
+      double Factor = (double)Edge.Frequency / Iterations;
+      if (0.10 < Factor)
+        continue;
+      Nodes[Edge.ToIID].NumPredecessors--;
+      std::swap(Edge, N.OutgoingEdges[E - 1]);
+      --E;
+      ++NumPruned;
+    }
+
+    if (NumPruned)
+      N.OutgoingEdges.resize(Size - NumPruned);
+  }
+}
+
 void DependencyGraph::initializeRootSet(
     SmallVectorImpl<unsigned> &RootSet) const {
   for (unsigned I = 0, E = Nodes.size(); I < E; ++I) {
@@ -179,7 +202,7 @@ void DependencyGraph::initializeRootSet(
 }
 
 void DependencyGraph::propagateThroughEdges(
-    SmallVectorImpl<unsigned> &RootSet) {
+    SmallVectorImpl<unsigned> &RootSet, unsigned Iterations) {
   SmallVector<unsigned, 8> ToVisit;
 
   // A critical sequence is computed as the longest path from a node of the
@@ -189,6 +212,10 @@ void DependencyGraph::propagateThroughEd
   // Each node of the graph starts with an initial default cost of zero.  The
   // cost of a node is a measure of criticality: the higher the cost, the bigger
   // is the performance impact.
+  // For register and memory dependencies, the cost is a function of the write
+  // latency as well as the actual delay (in cycles) caused to users.
+  // For processor resource dependencies, the cost is a function of the resource
+  // pressure. Resource interferences with low frequency values are ignored.
   //
   // This algorithm is very similar to a (reverse) Dijkstra.  Every iteration of
   // the inner loop selects (i.e. visits) a node N from a set of `unvisited
@@ -277,6 +304,10 @@ static void printInstruction(formatted_r
 }
 
 void BottleneckAnalysis::printCriticalSequence(raw_ostream &OS) const {
+  // Early exit if no bottlenecks were found during the simulation.
+  if (!SeenStallCycles || !BPI.PressureIncreaseCycles)
+    return;
+
   SmallVector<const DependencyEdge *, 16> Seq;
   DG.getCriticalSequence(Seq);
   if (Seq.empty())
@@ -432,7 +463,6 @@ void BottleneckAnalysis::addRegisterDep(
   bool IsLoopCarried = From >= To;
   unsigned SourceSize = Source.size();
   if (IsLoopCarried) {
-    Cost *= Iterations / 2;
     DG.addRegisterDep(From, To + SourceSize, RegID, Cost);
     DG.addRegisterDep(From + SourceSize, To + (SourceSize * 2), RegID, Cost);
     return;
@@ -445,7 +475,6 @@ void BottleneckAnalysis::addMemoryDep(un
   bool IsLoopCarried = From >= To;
   unsigned SourceSize = Source.size();
   if (IsLoopCarried) {
-    Cost *= Iterations / 2;
     DG.addMemoryDep(From, To + SourceSize, Cost);
     DG.addMemoryDep(From + SourceSize, To + (SourceSize * 2), Cost);
     return;
@@ -458,7 +487,6 @@ void BottleneckAnalysis::addResourceDep(
   bool IsLoopCarried = From >= To;
   unsigned SourceSize = Source.size();
   if (IsLoopCarried) {
-    Cost *= Iterations / 2;
     DG.addResourceDep(From, To + SourceSize, Mask, Cost);
     DG.addResourceDep(From + SourceSize, To + (SourceSize * 2), Mask, Cost);
     return;
@@ -514,7 +542,7 @@ void BottleneckAnalysis::onEvent(const H
 
   // Check if this is the last simulated instruction.
   if (IID == ((Iterations * Source.size()) - 1))
-    DG.finalizeGraph();
+    DG.finalizeGraph(Iterations);
 }
 
 void BottleneckAnalysis::onEvent(const HWPressureEvent &Event) {

Modified: llvm/trunk/tools/llvm-mca/Views/BottleneckAnalysis.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/tools/llvm-mca/Views/BottleneckAnalysis.h?rev=372337&r1=372336&r2=372337&view=diff
==============================================================================
--- llvm/trunk/tools/llvm-mca/Views/BottleneckAnalysis.h (original)
+++ llvm/trunk/tools/llvm-mca/Views/BottleneckAnalysis.h Thu Sep 19 09:05:11 2019
@@ -236,8 +236,9 @@ class DependencyGraph {
   void addDependency(unsigned From, unsigned To,
                      DependencyEdge::Dependency &&DE);
 
+  void pruneEdges(unsigned Iterations);
   void initializeRootSet(SmallVectorImpl<unsigned> &RootSet) const;
-  void propagateThroughEdges(SmallVectorImpl<unsigned> &RootSet);
+  void propagateThroughEdges(SmallVectorImpl<unsigned> &RootSet, unsigned Iterations);
 
 #ifndef NDEBUG
   void dumpDependencyEdge(raw_ostream &OS, const DependencyEdge &DE,
@@ -263,10 +264,11 @@ public:
 
   // Called by the bottleneck analysis at the end of simulation to propagate
   // costs through the edges of the graph, and compute a critical path.
-  void finalizeGraph() {
+  void finalizeGraph(unsigned Iterations) {
     SmallVector<unsigned, 16> RootSet;
+    pruneEdges(Iterations);
     initializeRootSet(RootSet);
-    propagateThroughEdges(RootSet);
+    propagateThroughEdges(RootSet, Iterations);
   }
 
   // Returns a sequence of edges representing the critical sequence based on the




More information about the llvm-commits mailing list