[llvm] 369d086 - [MBP] Partial tail duplication into hot predecessors

Guozhi Wei via llvm-commits llvm-commits at lists.llvm.org
Wed Feb 12 15:24:08 PST 2020


Author: Guozhi Wei
Date: 2020-02-12T15:22:33-08:00
New Revision: 369d086d784e0c8cc7add9823f319e95f95dbd81

URL: https://github.com/llvm/llvm-project/commit/369d086d784e0c8cc7add9823f319e95f95dbd81
DIFF: https://github.com/llvm/llvm-project/commit/369d086d784e0c8cc7add9823f319e95f95dbd81.diff

LOG: [MBP] Partial tail duplication into hot predecessors

Current tail duplication embedded in MBP duplicates a BB into all or none of its predecessors without too much cost analysis. So sometimes it is duplicated into cold predecessors, and in other cases it may miss the duplication into hot predecessors.

This patch improves tail duplication in 3 aspects:

  A successor can be duplicated into part of its predecessors.
  A more fine-grained benefit analysis, combined with 1, now a successor is duplicated into hot predecessors only.
  If a successor can't be duplicated into one predecessor, it doesn't impact the duplication into other predecessors.

Differential Revision: https://reviews.llvm.org/D73387

Added: 
    llvm/test/CodeGen/X86/partial-tail-dup.ll

Modified: 
    llvm/include/llvm/CodeGen/TailDuplicator.h
    llvm/lib/CodeGen/MachineBlockPlacement.cpp
    llvm/lib/CodeGen/TailDuplicator.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/CodeGen/TailDuplicator.h b/llvm/include/llvm/CodeGen/TailDuplicator.h
index 5b2de1c32072..0903b93e50eb 100644
--- a/llvm/include/llvm/CodeGen/TailDuplicator.h
+++ b/llvm/include/llvm/CodeGen/TailDuplicator.h
@@ -87,11 +87,13 @@ class TailDuplicator {
   /// of predecessors that received a copy of \p MBB.
   /// If \p RemovalCallback is non-null. It will be called before MBB is
   /// deleted.
+  /// If \p CandidatePtr is not null, duplicate into these blocks only.
   bool tailDuplicateAndUpdate(
       bool IsSimple, MachineBasicBlock *MBB,
       MachineBasicBlock *ForcedLayoutPred,
       SmallVectorImpl<MachineBasicBlock*> *DuplicatedPreds = nullptr,
-      function_ref<void(MachineBasicBlock *)> *RemovalCallback = nullptr);
+      function_ref<void(MachineBasicBlock *)> *RemovalCallback = nullptr,
+      SmallVectorImpl<MachineBasicBlock *> *CandidatePtr = nullptr);
 
 private:
   using RegSubRegPair = TargetInstrInfo::RegSubRegPair;
@@ -119,7 +121,8 @@ class TailDuplicator {
                      MachineBasicBlock *TailBB,
                      MachineBasicBlock *ForcedLayoutPred,
                      SmallVectorImpl<MachineBasicBlock *> &TDBBs,
-                     SmallVectorImpl<MachineInstr *> &Copies);
+                     SmallVectorImpl<MachineInstr *> &Copies,
+                     SmallVectorImpl<MachineBasicBlock *> *CandidatePtr);
   void appendCopies(MachineBasicBlock *MBB,
                  SmallVectorImpl<std::pair<unsigned,RegSubRegPair>> &CopyInfos,
                  SmallVectorImpl<MachineInstr *> &Copies);

diff  --git a/llvm/lib/CodeGen/MachineBlockPlacement.cpp b/llvm/lib/CodeGen/MachineBlockPlacement.cpp
index 06e2f46f892f..31491513bd96 100644
--- a/llvm/lib/CodeGen/MachineBlockPlacement.cpp
+++ b/llvm/lib/CodeGen/MachineBlockPlacement.cpp
@@ -374,6 +374,9 @@ class MachineBlockPlacement : public MachineFunctionPass {
   /// must be done inline.
   TailDuplicator TailDup;
 
+  /// Partial tail duplication threshold.
+  BlockFrequency DupThreshold;
+
   /// Allocator and owner of BlockChain structures.
   ///
   /// We build BlockChains lazily while processing the loop structure of
@@ -399,6 +402,10 @@ class MachineBlockPlacement : public MachineFunctionPass {
   SmallPtrSet<MachineBasicBlock *, 4> BlocksWithUnanalyzableExits;
 #endif
 
+  /// Scale the DupThreshold according to basic block size.
+  BlockFrequency scaleThreshold(MachineBasicBlock *BB);
+  void initDupThreshold();
+
   /// Decrease the UnscheduledPredecessors count for all blocks in chain, and
   /// if the count goes to 0, add them to the appropriate work list.
   void markChainSuccessors(
@@ -421,6 +428,11 @@ class MachineBlockPlacement : public MachineFunctionPass {
       const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
       const BlockChain &Chain, const BlockFilterSet *BlockFilter,
       BranchProbability SuccProb, BranchProbability HotProb);
+  bool isBestSuccessor(MachineBasicBlock *BB, MachineBasicBlock *Pred,
+                       BlockFilterSet *BlockFilter);
+  void findDuplicateCandidates(SmallVectorImpl<MachineBasicBlock *> &Candidates,
+                               MachineBasicBlock *BB,
+                               BlockFilterSet *BlockFilter);
   bool repeatedlyTailDuplicateBlock(
       MachineBasicBlock *BB, MachineBasicBlock *&LPred,
       const MachineBasicBlock *LoopHeaderBB,
@@ -1141,6 +1153,11 @@ bool MachineBlockPlacement::canTailDuplicateUnplacedPreds(
   if (NumDup == 0)
     return false;
 
+  // If profile information is available, findDuplicateCandidates can do more
+  // precise benefit analysis.
+  if (F->getFunction().hasProfileData())
+    return true;
+
   // This is mainly for function exit BB.
   // The integrated tail duplication is really designed for increasing
   // fallthrough from predecessors from Succ to its successors. We may need
@@ -1169,9 +1186,6 @@ bool MachineBlockPlacement::canTailDuplicateUnplacedPreds(
   //
   // A small number of extra duplication may not hurt too much. We need a better
   // heuristic to handle it.
-  //
-  // FIXME: we should selectively tail duplicate a BB into part of its
-  // predecessors.
   if ((NumDup > Succ->succ_size()) || !Duplicate)
     return false;
 
@@ -1799,11 +1813,11 @@ void MachineBlockPlacement::buildChain(
     // Placement may have changed tail duplication opportunities.
     // Check for that now.
     if (allowTailDupPlacement() && BestSucc && ShouldTailDup) {
-      // If the chosen successor was duplicated into all its predecessors,
-      // don't bother laying it out, just go round the loop again with BB as
-      // the chain end.
-      if (repeatedlyTailDuplicateBlock(BestSucc, BB, LoopHeaderBB, Chain,
-                                       BlockFilter, PrevUnplacedBlockIt))
+      repeatedlyTailDuplicateBlock(BestSucc, BB, LoopHeaderBB, Chain,
+                                       BlockFilter, PrevUnplacedBlockIt);
+      // If the chosen successor was duplicated into BB, don't bother laying
+      // it out, just go round the loop again with BB as the chain end.
+      if (!BB->isSuccessor(BestSucc))
         continue;
     }
 
@@ -3011,8 +3025,18 @@ bool MachineBlockPlacement::maybeTailDuplicateBlock(
 
   SmallVector<MachineBasicBlock *, 8> DuplicatedPreds;
   bool IsSimple = TailDup.isSimpleBB(BB);
-  TailDup.tailDuplicateAndUpdate(IsSimple, BB, LPred,
-                                 &DuplicatedPreds, &RemovalCallbackRef);
+  SmallVector<MachineBasicBlock *, 8> CandidatePreds;
+  SmallVectorImpl<MachineBasicBlock *> *CandidatePtr = nullptr;
+  if (F->getFunction().hasProfileData()) {
+    // We can do partial duplication with precise profile information.
+    findDuplicateCandidates(CandidatePreds, BB, BlockFilter);
+    if (CandidatePreds.size() == 0)
+      return false;
+    if (CandidatePreds.size() < BB->pred_size())
+      CandidatePtr = &CandidatePreds;
+  }
+  TailDup.tailDuplicateAndUpdate(IsSimple, BB, LPred, &DuplicatedPreds,
+                                 &RemovalCallbackRef, CandidatePtr);
 
   // Update UnscheduledPredecessors to reflect tail-duplication.
   DuplicatedToLPred = false;
@@ -3035,6 +3059,191 @@ bool MachineBlockPlacement::maybeTailDuplicateBlock(
   return Removed;
 }
 
+// Count the number of actual machine instructions.
+static uint64_t countMBBInstruction(MachineBasicBlock *MBB) {
+  uint64_t InstrCount = 0;
+  for (MachineInstr &MI : *MBB) {
+    if (!MI.isPHI() && !MI.isMetaInstruction())
+      InstrCount += 1;
+  }
+  return InstrCount;
+}
+
+// The size cost of duplication is the instruction size of the duplicated block.
+// So we should scale the threshold accordingly. But the instruction size is not
+// available on all targets, so we use the number of instructions instead.
+BlockFrequency MachineBlockPlacement::scaleThreshold(MachineBasicBlock *BB) {
+  return DupThreshold.getFrequency() * countMBBInstruction(BB);
+}
+
+// Returns true if BB is Pred's best successor.
+bool MachineBlockPlacement::isBestSuccessor(MachineBasicBlock *BB,
+                                            MachineBasicBlock *Pred,
+                                            BlockFilterSet *BlockFilter) {
+  if (BB == Pred)
+    return false;
+  if (BlockFilter && !BlockFilter->count(Pred))
+    return false;
+  BlockChain *PredChain = BlockToChain[Pred];
+  if (PredChain && (Pred != *std::prev(PredChain->end())))
+    return false;
+
+  // Find the successor with largest probability excluding BB.
+  BranchProbability BestProb = BranchProbability::getZero();
+  for (MachineBasicBlock *Succ : Pred->successors())
+    if (Succ != BB) {
+      if (BlockFilter && !BlockFilter->count(Succ))
+        continue;
+      BlockChain *SuccChain = BlockToChain[Succ];
+      if (SuccChain && (Succ != *SuccChain->begin()))
+        continue;
+      BranchProbability SuccProb = MBPI->getEdgeProbability(Pred, Succ);
+      if (SuccProb > BestProb)
+        BestProb = SuccProb;
+    }
+
+  BranchProbability BBProb = MBPI->getEdgeProbability(Pred, BB);
+  if (BBProb <= BestProb)
+    return false;
+
+  // Compute the number of reduced taken branches if Pred falls through to BB
+  // instead of another successor. Then compare it with threshold.
+  BlockFrequency PredFreq = MBFI->getBlockFreq(Pred);
+  BlockFrequency Gain = PredFreq * (BBProb - BestProb);
+  return Gain > scaleThreshold(BB);
+}
+
+// Find out the predecessors of BB and BB can be beneficially duplicated into
+// them.
+void MachineBlockPlacement::findDuplicateCandidates(
+    SmallVectorImpl<MachineBasicBlock *> &Candidates,
+    MachineBasicBlock *BB,
+    BlockFilterSet *BlockFilter) {
+  MachineBasicBlock *Fallthrough = nullptr;
+  BranchProbability DefaultBranchProb = BranchProbability::getZero();
+  BlockFrequency BBDupThreshold(scaleThreshold(BB));
+  SmallVector<MachineBasicBlock *, 8> Preds(BB->pred_begin(), BB->pred_end());
+  SmallVector<MachineBasicBlock *, 8> Succs(BB->succ_begin(), BB->succ_end());
+
+  // Sort for highest frequency.
+  auto CmpSucc = [&](MachineBasicBlock *A, MachineBasicBlock *B) {
+    return MBPI->getEdgeProbability(BB, A) > MBPI->getEdgeProbability(BB, B);
+  };
+  auto CmpPred = [&](MachineBasicBlock *A, MachineBasicBlock *B) {
+    return MBFI->getBlockFreq(A) > MBFI->getBlockFreq(B);
+  };
+  llvm::stable_sort(Succs, CmpSucc);
+  llvm::stable_sort(Preds, CmpPred);
+
+  auto SuccIt = Succs.begin();
+  if (SuccIt != Succs.end()) {
+    DefaultBranchProb = MBPI->getEdgeProbability(BB, *SuccIt).getCompl();
+  }
+
+  // For each predecessors of BB, compute the benefit of duplicating BB,
+  // if it is larger than the threshold, add it into Candidates.
+  //
+  // If we have following control flow.
+  //
+  //     PB1 PB2 PB3 PB4
+  //      \   |  /    /\
+  //       \  | /    /  \
+  //        \ |/    /    \
+  //         BB----/     OB
+  //         /\
+  //        /  \
+  //      SB1 SB2
+  //
+  // And it can be partially duplicated as
+  //
+  //   PB2+BB
+  //      |  PB1 PB3 PB4
+  //      |   |  /    /\
+  //      |   | /    /  \
+  //      |   |/    /    \
+  //      |  BB----/     OB
+  //      |\ /|
+  //      | X |
+  //      |/ \|
+  //     SB2 SB1
+  //
+  // The benefit of duplicating into a predecessor is defined as
+  //         Orig_taken_branch - Duplicated_taken_branch
+  //
+  // The Orig_taken_branch is computed with the assumption that predecessor
+  // jumps to BB and the most possible successor is laid out after BB.
+  //
+  // The Duplicated_taken_branch is computed with the assumption that BB is
+  // duplicated into PB, and one successor is layout after it (SB1 for PB1 and
+  // SB2 for PB2 in our case). If there is no available successor, the combined
+  // block jumps to all BB's successor, like PB3 in this example.
+  //
+  // If a predecessor has multiple successors, so BB can't be duplicated into
+  // it. But it can beneficially fall through to BB, and duplicate BB into other
+  // predecessors.
+  for (MachineBasicBlock *Pred : Preds) {
+    BlockFrequency PredFreq = MBFI->getBlockFreq(Pred);
+
+    if (!TailDup.canTailDuplicate(BB, Pred)) {
+      // BB can't be duplicated into Pred, but it is possible to be layout
+      // below Pred.
+      if (!Fallthrough && isBestSuccessor(BB, Pred, BlockFilter)) {
+        Fallthrough = Pred;
+        if (SuccIt != Succs.end())
+          SuccIt++;
+      }
+      continue;
+    }
+
+    BlockFrequency OrigCost = PredFreq + PredFreq * DefaultBranchProb;
+    BlockFrequency DupCost;
+    if (SuccIt == Succs.end()) {
+      // Jump to all successors;
+      if (Succs.size() > 0)
+        DupCost += PredFreq;
+    } else {
+      // Fallthrough to *SuccIt, jump to all other successors;
+      DupCost += PredFreq;
+      DupCost -= PredFreq * MBPI->getEdgeProbability(BB, *SuccIt);
+    }
+
+    assert(OrigCost >= DupCost);
+    OrigCost -= DupCost;
+    if (OrigCost > BBDupThreshold) {
+      Candidates.push_back(Pred);
+      if (SuccIt != Succs.end())
+        SuccIt++;
+    }
+  }
+
+  // No predecessors can optimally fallthrough to BB.
+  // So we can change one duplication into fallthrough.
+  if (!Fallthrough) {
+    if ((Candidates.size() < Preds.size()) && (Candidates.size() > 0)) {
+      Candidates[0] = Candidates.back();
+      Candidates.pop_back();
+    }
+  }
+}
+
+void MachineBlockPlacement::initDupThreshold() {
+  DupThreshold = 0;
+  if (!F->getFunction().hasProfileData())
+    return;
+
+  BlockFrequency MaxFreq = 0;
+  for (MachineBasicBlock &MBB : *F) {
+    BlockFrequency Freq = MBFI->getBlockFreq(&MBB);
+    if (Freq > MaxFreq)
+      MaxFreq = Freq;
+  }
+
+  // FIXME: we may use profile count instead of frequency,
+  // and need more fine tuning.
+  BranchProbability ThresholdProb(TailDupPlacementPenalty, 100);
+  DupThreshold = MaxFreq * ThresholdProb;
+}
+
 bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
   if (skipFunction(MF.getFunction()))
     return false;
@@ -3053,6 +3262,8 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
   MPDT = nullptr;
   PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
 
+  initDupThreshold();
+
   // Initialize PreferredLoopExit to nullptr here since it may never be set if
   // there are no MachineLoops.
   PreferredLoopExit = nullptr;

diff  --git a/llvm/lib/CodeGen/TailDuplicator.cpp b/llvm/lib/CodeGen/TailDuplicator.cpp
index 87db1647034b..d9518ff252bf 100644
--- a/llvm/lib/CodeGen/TailDuplicator.cpp
+++ b/llvm/lib/CodeGen/TailDuplicator.cpp
@@ -159,14 +159,16 @@ bool TailDuplicator::tailDuplicateAndUpdate(
     bool IsSimple, MachineBasicBlock *MBB,
     MachineBasicBlock *ForcedLayoutPred,
     SmallVectorImpl<MachineBasicBlock*> *DuplicatedPreds,
-    function_ref<void(MachineBasicBlock *)> *RemovalCallback) {
+    function_ref<void(MachineBasicBlock *)> *RemovalCallback,
+    SmallVectorImpl<MachineBasicBlock *> *CandidatePtr) {
   // Save the successors list.
   SmallSetVector<MachineBasicBlock *, 8> Succs(MBB->succ_begin(),
                                                MBB->succ_end());
 
   SmallVector<MachineBasicBlock *, 8> TDBBs;
   SmallVector<MachineInstr *, 16> Copies;
-  if (!tailDuplicate(IsSimple, MBB, ForcedLayoutPred, TDBBs, Copies))
+  if (!tailDuplicate(IsSimple, MBB, ForcedLayoutPred,
+                     TDBBs, Copies, CandidatePtr))
     return false;
 
   ++NumTails;
@@ -804,9 +806,10 @@ bool TailDuplicator::canTailDuplicate(MachineBasicBlock *TailBB,
 /// \p Copies            A vector of copy instructions inserted. Used later to
 ///                      walk all the inserted copies and remove redundant ones.
 bool TailDuplicator::tailDuplicate(bool IsSimple, MachineBasicBlock *TailBB,
-                                   MachineBasicBlock *ForcedLayoutPred,
-                                   SmallVectorImpl<MachineBasicBlock *> &TDBBs,
-                                   SmallVectorImpl<MachineInstr *> &Copies) {
+                          MachineBasicBlock *ForcedLayoutPred,
+                          SmallVectorImpl<MachineBasicBlock *> &TDBBs,
+                          SmallVectorImpl<MachineInstr *> &Copies,
+                          SmallVectorImpl<MachineBasicBlock *> *CandidatePtr) {
   LLVM_DEBUG(dbgs() << "\n*** Tail-duplicating " << printMBBReference(*TailBB)
                     << '\n');
 
@@ -820,8 +823,12 @@ bool TailDuplicator::tailDuplicate(bool IsSimple, MachineBasicBlock *TailBB,
   // block into them, if possible. Copying the list ahead of time also
   // avoids trouble with the predecessor list reallocating.
   bool Changed = false;
-  SmallSetVector<MachineBasicBlock *, 8> Preds(TailBB->pred_begin(),
-                                               TailBB->pred_end());
+  SmallSetVector<MachineBasicBlock *, 8> Preds;
+  if (CandidatePtr)
+    Preds.insert(CandidatePtr->begin(), CandidatePtr->end());
+  else
+    Preds.insert(TailBB->pred_begin(), TailBB->pred_end());
+
   for (MachineBasicBlock *PredBB : Preds) {
     assert(TailBB != PredBB &&
            "Single-block loop should have been rejected earlier!");
@@ -830,13 +837,17 @@ bool TailDuplicator::tailDuplicate(bool IsSimple, MachineBasicBlock *TailBB,
       continue;
 
     // Don't duplicate into a fall-through predecessor (at least for now).
-    bool IsLayoutSuccessor = false;
-    if (ForcedLayoutPred)
-      IsLayoutSuccessor = (ForcedLayoutPred == PredBB);
-    else if (PredBB->isLayoutSuccessor(TailBB) && PredBB->canFallThrough())
-      IsLayoutSuccessor = true;
-    if (IsLayoutSuccessor)
-      continue;
+    // If profile is available, findDuplicateCandidates can choose better
+    // fall-through predecessor.
+    if (!(MF->getFunction().hasProfileData() && LayoutMode)) {
+      bool IsLayoutSuccessor = false;
+      if (ForcedLayoutPred)
+        IsLayoutSuccessor = (ForcedLayoutPred == PredBB);
+      else if (PredBB->isLayoutSuccessor(TailBB) && PredBB->canFallThrough())
+        IsLayoutSuccessor = true;
+      if (IsLayoutSuccessor)
+        continue;
+    }
 
     LLVM_DEBUG(dbgs() << "\nTail-duplicating into PredBB: " << *PredBB
                       << "From Succ: " << *TailBB);

diff  --git a/llvm/test/CodeGen/X86/partial-tail-dup.ll b/llvm/test/CodeGen/X86/partial-tail-dup.ll
new file mode 100644
index 000000000000..b8d860ef671b
--- /dev/null
+++ b/llvm/test/CodeGen/X86/partial-tail-dup.ll
@@ -0,0 +1,187 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
+
+
+ at gvar = external global i32
+
+; dupbb has two predecessors, p1 and p2. p1 is hot, p2 is cold. So dupbb
+; should be placed after p1, and not duplicated into p2.
+;
+; CHECK-LABEL: test1
+; CHECK:       %p1
+; CHECK:       .LBB0_4: # %dupbb
+; CHECK:       %p2
+; CHECK:       jmp .LBB0_4
+
+define void @test1(i32* %p) !prof !1 {
+entry:
+  br label %header
+
+header:
+  %call = call zeroext i1 @a()
+  br i1 %call, label %p1, label %p2, !prof !2
+
+p1:
+  call void @b()
+  br label %dupbb
+
+p2:
+  call void @c()
+  br label %dupbb
+
+dupbb:
+  %cond = icmp eq i32* @gvar, %p
+  br i1 %cond, label %header, label %latch, !prof !3
+
+latch:
+  %call3 = call zeroext i1 @a()
+  br i1 %call3, label %header, label %end, !prof !2
+
+end:
+  ret void
+}
+
+
+; dupbb has four predecessors p1, p2, p3 and p4. p1 and p2 are hot, p3 and  p4
+; are cold. So dupbb should be placed after p1, duplicated into p2. p3 and p4
+; should jump to dupbb.
+;
+; CHECK-LABEL: test2
+; CHECK:       %p1
+; CHECK:       .LBB1_8: # %dupbb
+;
+; CHECK:       %p2
+; CHECK:       callq c
+; CHECK-NEXT:  cmpq
+; CHECK-NEXT:  je
+; CHECK-NEXT:  jmp
+;
+; CHECK:       %p3
+; CHECK:       jmp .LBB1_8
+; CHECK:       %p4
+; CHECK:       jmp .LBB1_8
+
+define void @test2(i32* %p) !prof !1 {
+entry:
+  br label %header
+
+header:
+  %call = call zeroext i1 @a()
+  br i1 %call, label %bb1, label %bb2, !prof !2
+
+bb1:
+  %call1 = call zeroext i1 @a()
+  br i1 %call1, label %p1, label %p2, !prof !4
+
+bb2:
+  %call2 = call zeroext i1 @a()
+  br i1 %call2, label %p3, label %p4, !prof !4
+
+p1:
+  call void @b()
+  br label %dupbb
+
+p2:
+  call void @c()
+  br label %dupbb
+
+p3:
+  call void @d()
+  br label %dupbb
+
+p4:
+  call void @e()
+  br label %dupbb
+
+dupbb:
+  %cond = icmp eq i32* @gvar, %p
+  br i1 %cond, label %bb3, label %bb4, !prof !4
+
+bb3:
+  call void @b()
+  br label %bb4
+
+bb4:
+  %call4 = call zeroext i1 @a()
+  br i1 %call4, label %header, label %latch, !prof !3
+
+latch:
+  %call3 = call zeroext i1 @a()
+  br i1 %call3, label %header, label %end, !prof !2
+
+end:
+  ret void
+}
+
+
+; dupbb has three predecessors p1, p2 and p3. p3 has two successors, so dupbb
+; can't be duplicated into p3, but it should not block it to be duplicated into
+; other predecessors.
+;
+; CHECK-LABEL: test3
+; CHECK:       %p1
+; CHECK:       .LBB2_6: # %dupbb
+;
+; CHECK:       %p2
+; CHECK:       callq c
+; CHECK:       cmpq
+; CHECK-NEXT:  je
+; CHECK-NEXT:  jmp
+;
+; CHECK:       %p3
+; CHECK:       jne .LBB2_6
+
+define void @test3(i32* %p) !prof !1 {
+entry:
+  br label %header
+
+header:
+  %call = call zeroext i1 @a()
+  br i1 %call, label %bb1, label %p3, !prof !2
+
+bb1:
+  %call1 = call zeroext i1 @a()
+  br i1 %call1, label %p1, label %p2, !prof !4
+
+p1:
+  call void @b()
+  br label %dupbb
+
+p2:
+  call void @c()
+  br label %dupbb
+
+p3:
+  %call2 = call zeroext i1 @a()
+  br i1 %call2, label %dupbb, label %bb4, !prof !4
+
+dupbb:
+  %cond = icmp eq i32* @gvar, %p
+  br i1 %cond, label %bb3, label %bb4, !prof !4
+
+bb3:
+  call void @b()
+  br label %bb4
+
+bb4:
+  %call4 = call zeroext i1 @a()
+  br i1 %call4, label %header, label %latch, !prof !3
+
+latch:
+  %call3 = call zeroext i1 @a()
+  br i1 %call3, label %header, label %end, !prof !2
+
+end:
+  ret void
+}
+
+declare zeroext i1 @a()
+declare void @b()
+declare void @c()
+declare void @d()
+declare void @e()
+declare void @f()
+
+!1 = !{!"function_entry_count", i64 1000}
+!2 = !{!"branch_weights", i32 100, i32 1}
+!3 = !{!"branch_weights", i32 1, i32 100}
+!4 = !{!"branch_weights", i32 60, i32 40}


        


More information about the llvm-commits mailing list