[llvm] r296845 - CodeGen: BlockPlacement: Precompute layout for chains of triangles.

Kyle Butt via llvm-commits llvm-commits at lists.llvm.org
Thu Mar 2 17:00:23 PST 2017


Author: iteratee
Date: Thu Mar  2 19:00:22 2017
New Revision: 296845

URL: http://llvm.org/viewvc/llvm-project?rev=296845&view=rev
Log:
CodeGen: BlockPlacement: Precompute layout for chains of triangles.

For chains of triangles with small join blocks that can be tail duplicated, a
simple calculation of probabilities is insufficient. Tail duplication
can be profitable in 3 different ways for these cases:

1) The post-dominators marked 50% are actually taken 56% (This shrinks with
   longer chains)
2) The chains are statically correlated. Branch probabilities have a very
   U-shaped distribution.
   [http://nrs.harvard.edu/urn-3:HUL.InstRepos:24015805]
   If the branches in a chain are likely to be from the same side of the
   distribution as their predecessor, but are independent at runtime, this
   transformation is profitable. (Because the cost of being wrong is a small
   fixed cost, unlike the standard triangle layout where the cost of being
   wrong scales with the # of triangles.)
3) The chains are dynamically correlated. If the probability that a previous
   branch was taken positively influences whether the next branch will be
   taken
We believe that 2 and 3 are common enough to justify the small margin in 1.

The code pre-scans a function's CFG to identify this pattern and marks the edges
so that the standard layout algorithm can use the computed results.

Modified:
    llvm/trunk/lib/CodeGen/MachineBlockPlacement.cpp
    llvm/trunk/test/CodeGen/PowerPC/tail-dup-layout.ll
    llvm/trunk/test/CodeGen/X86/cmovcmov.ll

Modified: llvm/trunk/lib/CodeGen/MachineBlockPlacement.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/MachineBlockPlacement.cpp?rev=296845&r1=296844&r2=296845&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/MachineBlockPlacement.cpp (original)
+++ llvm/trunk/lib/CodeGen/MachineBlockPlacement.cpp Thu Mar  2 19:00:22 2017
@@ -50,6 +50,7 @@
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
 #include <algorithm>
+#include <forward_list>
 #include <functional>
 #include <utility>
 using namespace llvm;
@@ -143,6 +144,14 @@ static cl::opt<unsigned> TailDupPlacemen
     cl::init(2),
     cl::Hidden);
 
+// Heuristic for triangle chains.
+static cl::opt<unsigned> TriangleChainCount(
+    "triangle-chain-count",
+    cl::desc("Number of triangle-shaped-CFG's that need to be in a row for the "
+             "triangle tail duplication heuristic to kick in. 0 to disable."),
+    cl::init(3),
+    cl::Hidden);
+
 extern cl::opt<unsigned> StaticLikelyProb;
 extern cl::opt<unsigned> ProfileLikelyProb;
 
@@ -456,6 +465,9 @@ class MachineBlockPlacement : public Mac
   bool canTailDuplicateUnplacedPreds(
       const MachineBasicBlock *BB, MachineBasicBlock *Succ,
       const BlockChain &Chain, const BlockFilterSet *BlockFilter);
+  /// Find chains of triangles to tail-duplicate where a global analysis works,
+  /// but a local analysis would not find them.
+  void precomputeTriangleChains();
 
 public:
   static char ID; // Pass identification, replacement for typeid
@@ -1041,6 +1053,127 @@ bool MachineBlockPlacement::canTailDupli
   return true;
 }
 
+/// Find chains of triangles where we believe it would be profitable to
+/// tail-duplicate them all, but a local analysis would not find them.
+/// There are 3 ways this can be profitable:
+/// 1) The post-dominators marked 50% are actually taken 55% (This shrinks with
+///    longer chains)
+/// 2) The chains are statically correlated. Branch probabilities have a very
+///    U-shaped distribution.
+///    [http://nrs.harvard.edu/urn-3:HUL.InstRepos:24015805]
+///    If the branches in a chain are likely to be from the same side of the
+///    distribution as their predecessor, but are independent at runtime, this
+///    transformation is profitable. (Because the cost of being wrong is a small
+///    fixed cost, unlike the standard triangle layout where the cost of being
+///    wrong scales with the # of triangles.)
+/// 3) The chains are dynamically correlated. If the probability that a previous
+///    branch was taken positively influences whether the next branch will be
+///    taken
+/// We believe that 2 and 3 are common enough to justify the small margin in 1.
+void MachineBlockPlacement::precomputeTriangleChains() {
+  struct TriangleChain {
+    unsigned Count;
+    std::forward_list<MachineBasicBlock*> Edges;
+    TriangleChain(MachineBasicBlock* src, MachineBasicBlock *dst) {
+      Edges.push_front(src);
+      Edges.push_front(dst);
+      Count = 1;
+    }
+
+    void append(MachineBasicBlock *dst) {
+      assert(!Edges.empty() && Edges.front()->isSuccessor(dst) &&
+             "Attempting to append a block that is not a successor.");
+      Edges.push_front(dst);
+      ++Count;
+    }
+
+    MachineBasicBlock *getKey() {
+      return Edges.front();
+    }
+  };
+
+  if (TriangleChainCount == 0)
+    return;
+
+  DEBUG(dbgs() << "Pre-computing triangle chains.\n");
+  // Map from last block to the chain that contains it. This allows us to extend
+  // chains as we find new triangles.
+  DenseMap<const MachineBasicBlock *, TriangleChain> TriangleChainMap;
+  for (MachineBasicBlock &BB : *F) {
+    // If BB doesn't have 2 successors, it doesn't start a triangle.
+    if (BB.succ_size() != 2)
+      continue;
+    MachineBasicBlock *PDom = nullptr;
+    for (MachineBasicBlock *Succ : BB.successors()) {
+      if (!MPDT->dominates(Succ, &BB))
+        continue;
+      PDom = Succ;
+      break;
+    }
+    // If BB doesn't have a post-dominating successor, it doesn't form a
+    // triangle.
+    if (PDom == nullptr)
+      continue;
+    // If PDom has a hint that it is low probability, skip this triangle.
+    if (MBPI->getEdgeProbability(&BB, PDom) < BranchProbability(50, 100))
+      continue;
+    // If PDom isn't eligible for duplication, this isn't the kind of triangle
+    // we're looking for.
+    if (!shouldTailDuplicate(PDom))
+      continue;
+    bool CanTailDuplicate = true;
+    // If PDom can't tail-duplicate into it's non-BB predecessors, then this
+    // isn't the kind of triangle we're looking for.
+    for (MachineBasicBlock* Pred : PDom->predecessors()) {
+      if (Pred == &BB)
+        continue;
+      if (!TailDup.canTailDuplicate(PDom, Pred)) {
+        CanTailDuplicate = false;
+        break;
+      }
+    }
+    // If we can't tail-duplicate PDom to its predecessors, then skip this
+    // triangle.
+    if (!CanTailDuplicate)
+      continue;
+
+    // Now we have an interesting triangle. Insert it if it's not part of an
+    // existing chain
+    // Note: This cannot be replaced with a call insert() or emplace() because
+    // the find key is BB, but the insert/emplace key is PDom.
+    auto Found = TriangleChainMap.find(&BB);
+    // If it is, remove the chain from the map, grow it, and put it back in the
+    // map with the end as the new key.
+    if (Found != TriangleChainMap.end()) {
+      TriangleChain Chain = std::move(Found->second);
+      TriangleChainMap.erase(Found);
+      Chain.append(PDom);
+      TriangleChainMap.insert(std::make_pair(Chain.getKey(), std::move(Chain)));
+    } else {
+      auto InsertResult = TriangleChainMap.try_emplace(PDom, &BB, PDom);
+      assert (InsertResult.second && "Block seen twice.");
+      (void) InsertResult;
+    }
+  }
+
+  for (auto &ChainPair : TriangleChainMap) {
+    TriangleChain &Chain = ChainPair.second;
+    // Benchmarking has shown that due to branch correlation duplicating 2 or
+    // more triangles is profitable, despite the calculations assuming
+    // independence.
+    if (Chain.Count < TriangleChainCount)
+      continue;
+    MachineBasicBlock *dst = Chain.Edges.front();
+    Chain.Edges.pop_front();
+    for (MachineBasicBlock *src : Chain.Edges) {
+      DEBUG(dbgs() << "Marking edge: " << getBlockName(src) << "->" <<
+            getBlockName(dst) << " as pre-computed based on triangles.\n");
+      ComputedEdges[src] = { dst, true };
+      dst = src;
+    }
+  }
+}
+
 // When profile is not present, return the StaticLikelyProb.
 // When profile is available, we need to handle the triangle-shape CFG.
 static BranchProbability getLayoutSuccessorProbThreshold(
@@ -2504,6 +2637,7 @@ bool MachineBlockPlacement::runOnMachine
     if (MF.getFunction()->optForSize())
       TailDupSize = 1;
     TailDup.initMF(MF, MBPI, /* LayoutMode */ true, TailDupSize);
+    precomputeTriangleChains();
   }
 
   assert(BlockToChain.empty());

Modified: llvm/trunk/test/CodeGen/PowerPC/tail-dup-layout.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/PowerPC/tail-dup-layout.ll?rev=296845&r1=296844&r2=296845&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/PowerPC/tail-dup-layout.ll (original)
+++ llvm/trunk/test/CodeGen/PowerPC/tail-dup-layout.ll Thu Mar  2 19:00:22 2017
@@ -95,6 +95,87 @@ exit:
 }
 
 ; Intended layout:
+; The chain-of-triangles based duplicating produces the layout
+; test1
+; test2
+; test3
+; test4
+; optional1
+; optional2
+; optional3
+; optional4
+; exit
+; even for 50/50 branches.
+; Tail duplication puts test n+1 at the end of optional n
+; so optional1 includes a copy of test2 at the end, and branches
+; to test3 (at the top) or falls through to optional 2.
+; The CHECK statements check for the whole string of tests
+; and then check that the correct test has been duplicated into the end of
+; the optional blocks and that the optional blocks are in the correct order.
+;CHECK-LABEL: straight_test_50:
+; test1 may have been merged with entry
+;CHECK: mr [[TAGREG:[0-9]+]], 3
+;CHECK: andi. {{[0-9]+}}, [[TAGREG]], 1
+;CHECK-NEXT: bc 12, 1, .[[OPT1LABEL:[_0-9A-Za-z]+]]
+;CHECK-NEXT: # %test2
+;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30
+;CHECK-NEXT: bne 0, .[[OPT2LABEL:[_0-9A-Za-z]+]]
+;CHECK-NEXT: .[[TEST3LABEL:[_0-9A-Za-z]+]]: # %test3
+;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 29, 29
+;CHECK-NEXT: bne 0, .[[OPT3LABEL:[_0-9A-Za-z]+]]
+;CHECK-NEXT: .[[TEST4LABEL:[_0-9A-Za-z]+]]: # %test4
+;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 28, 28
+;CHECK-NEXT: bne 0, .[[OPT4LABEL:[_0-9A-Za-z]+]]
+;CHECK-NEXT: .[[EXITLABEL:[_0-9A-Za-z]+]]: # %exit
+;CHECK: blr
+;CHECK-NEXT: .[[OPT1LABEL]]:
+;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30
+;CHECK-NEXT: beq 0, .[[TEST3LABEL]]
+;CHECK-NEXT: .[[OPT2LABEL]]:
+;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 29, 29
+;CHECK-NEXT: beq 0, .[[TEST4LABEL]]
+;CHECK-NEXT: .[[OPT3LABEL]]:
+;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 28, 28
+;CHECK-NEXT: beq 0, .[[EXITLABEL]]
+;CHECK-NEXT: .[[OPT4LABEL]]:
+;CHECK: b .[[EXITLABEL]]
+
+define void @straight_test_50(i32 %tag) {
+entry:
+  br label %test1
+test1:
+  %tagbit1 = and i32 %tag, 1
+  %tagbit1eq0 = icmp eq i32 %tagbit1, 0
+  br i1 %tagbit1eq0, label %test2, label %optional1, !prof !2
+optional1:
+  call void @a()
+  br label %test2
+test2:
+  %tagbit2 = and i32 %tag, 2
+  %tagbit2eq0 = icmp eq i32 %tagbit2, 0
+  br i1 %tagbit2eq0, label %test3, label %optional2, !prof !2
+optional2:
+  call void @b()
+  br label %test3
+test3:
+  %tagbit3 = and i32 %tag, 4
+  %tagbit3eq0 = icmp eq i32 %tagbit3, 0
+  br i1 %tagbit3eq0, label %test4, label %optional3, !prof !2
+optional3:
+  call void @c()
+  br label %test4
+test4:
+  %tagbit4 = and i32 %tag, 8
+  %tagbit4eq0 = icmp eq i32 %tagbit4, 0
+  br i1 %tagbit4eq0, label %exit, label %optional4, !prof !1
+optional4:
+  call void @d()
+  br label %exit
+exit:
+  ret void
+}
+
+; Intended layout:
 ; The chain-based outlining produces the layout
 ; entry
 ; --- Begin loop ---

Modified: llvm/trunk/test/CodeGen/X86/cmovcmov.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/cmovcmov.ll?rev=296845&r1=296844&r2=296845&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/cmovcmov.ll (original)
+++ llvm/trunk/test/CodeGen/X86/cmovcmov.ll Thu Mar  2 19:00:22 2017
@@ -249,16 +249,23 @@ attributes #0 = { nounwind }
 ; CMOV-DAG: cmpl %edx, %esi
 ; CMOV-DAG: movb $20, %al
 ; CMOV-DAG: movb $20, %dl
-; CMOV:   jl [[BB0:.LBB[0-9_]+]]
+; CMOV:   jge [[BB2:.LBB[0-9_]+]]
+; CMOV:   jle [[BB3:.LBB[0-9_]+]]
+; CMOV: [[BB0:.LBB[0-9_]+]]
+; CMOV:   testl %edi, %edi
+; CMOV:   jne [[BB4:.LBB[0-9_]+]]
+; CMOV: [[BB1:.LBB[0-9_]+]]
+; CMOV:   movb %al, g8(%rip)
+; CMOV:   retq
+; CMOV: [[BB2]]:
 ; CMOV:   movl %ecx, %edx
-; CMOV: [[BB0]]:
-; CMOV:   jg [[BB1:.LBB[0-9_]+]]
+; CMOV:   jg [[BB0]]
+; CMOV: [[BB3]]:
 ; CMOV:   movl %edx, %eax
-; CMOV: [[BB1]]:
 ; CMOV:   testl %edi, %edi
-; CMOV:   je [[BB2:.LBB[0-9_]+]]
+; CMOV:   je [[BB1]]
+; CMOV: [[BB4]]:
 ; CMOV:   movl %edx, %eax
-; CMOV: [[BB2]]:
 ; CMOV:   movb %al, g8(%rip)
 ; CMOV:   retq
 define void @no_cascade_opt(i32 %v0, i32 %v1, i32 %v2, i32 %v3) {




More information about the llvm-commits mailing list