[llvm] [CodeLayout] Size-aware machine block placement (PR #109711)

via llvm-commits llvm-commits at lists.llvm.org
Mon Sep 23 13:06:00 PDT 2024


https://github.com/spupyrev created https://github.com/llvm/llvm-project/pull/109711

None

From 19211550fb847b4ec869de63047d34e6d85947d7 Mon Sep 17 00:00:00 2001
From: spupyrev <spupyrev at fb.com>
Date: Mon, 23 Sep 2024 13:02:43 -0700
Subject: [PATCH] [CodeLayout] Size-aware machine block placement

---
 llvm/lib/CodeGen/MachineBlockPlacement.cpp    | 290 +++++++++++-------
 .../X86/code_placement_ext_tsp_size.ll        | 134 ++++++++
 2 files changed, 309 insertions(+), 115 deletions(-)
 create mode 100644 llvm/test/CodeGen/X86/code_placement_ext_tsp_size.ll

diff --git a/llvm/lib/CodeGen/MachineBlockPlacement.cpp b/llvm/lib/CodeGen/MachineBlockPlacement.cpp
index be783bc4e29738..3ee10381eef240 100644
--- a/llvm/lib/CodeGen/MachineBlockPlacement.cpp
+++ b/llvm/lib/CodeGen/MachineBlockPlacement.cpp
@@ -219,6 +219,11 @@ static cl::opt<unsigned> ExtTspBlockPlacementMaxBlocks(
              "block placement."),
     cl::init(UINT_MAX), cl::Hidden);
 
+// Apply the ext-tsp algorithm minimizing the size of a binary.
+static cl::opt<bool>
+    ApplyExtTspForSize("apply-ext-tsp-for-size", cl::init(false), cl::Hidden,
+                       cl::desc("Use ext-tsp for size-aware block placement."));
+
 namespace llvm {
 extern cl::opt<bool> EnableExtTspBlockPlacement;
 extern cl::opt<bool> ApplyExtTspWithoutProfile;
@@ -405,6 +410,8 @@ class MachineBlockPlacement : public MachineFunctionPass {
 
   ProfileSummaryInfo *PSI = nullptr;
 
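+  /// Pass config used to query the optimization level and tail-merge setting.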
+  TargetPassConfig *PassConfig = nullptr;
+
   /// Duplicator used to duplicate tails during placement.
   ///
   /// Placement decisions can open up new tail duplication opportunities, but
@@ -415,6 +422,8 @@ class MachineBlockPlacement : public MachineFunctionPass {
   /// Partial tail duplication threshold.
   BlockFrequency DupThreshold;
 
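+  /// Tail duplication size threshold, in number of instructions.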
+  unsigned TailDupSize;
+
   /// True:  use block profile count to compute tail duplication cost.
   /// False: use block frequency to compute tail duplication cost.
   bool UseProfileCount = false;
@@ -459,7 +468,7 @@ class MachineBlockPlacement : public MachineFunctionPass {
 
   /// Scale the DupThreshold according to basic block size.
   BlockFrequency scaleThreshold(MachineBasicBlock *BB);
-  void initDupThreshold();
+  void initTailDupThreshold();
 
   /// Decrease the UnscheduledPredecessors count for all blocks in chain, and
   /// if the count goes to 0, add them to the appropriate work list.
@@ -591,7 +600,7 @@ class MachineBlockPlacement : public MachineFunctionPass {
   void precomputeTriangleChains();
 
   /// Apply a post-processing step optimizing block placement.
-  void applyExtTsp();
+  void applyExtTsp(bool OptForSize);
 
   /// Modify the existing block placement in the function and adjust all jumps.
   void assignBlockOrder(const std::vector<const MachineBasicBlock *> &NewOrder);
@@ -2951,12 +2960,16 @@ void MachineBlockPlacement::alignBlocks() {
   // exclusively on the loop info here so that we can align backedges in
   // unnatural CFGs and backedges that were introduced purely because of the
   // loop rotations done during this layout pass.
-  if (F->getFunction().hasMinSize() ||
-      (F->getFunction().hasOptSize() && !TLI->alignLoopsWithOptSize()))
-    return;
+  if (!AlignAllBlock && !AlignAllNonFallThruBlocks) {
+    if (F->getFunction().hasMinSize() ||
+        (F->getFunction().hasOptSize() && !TLI->alignLoopsWithOptSize()))
+      return;
+  }
+
   BlockChain &FunctionChain = *BlockToChain[&F->front()];
+  // Empty chain.
   if (FunctionChain.begin() == FunctionChain.end())
-    return; // Empty chain.
+    return;
 
   const BranchProbability ColdProb(1, 5); // 20%
   BlockFrequency EntryFreq = MBFI->getBlockFreq(&F->front());
@@ -3052,6 +3065,33 @@ void MachineBlockPlacement::alignBlocks() {
       DetermineMaxAlignmentPadding();
     }
   }
+
+  const bool HasMaxBytesOverride =
+      MaxBytesForAlignmentOverride.getNumOccurrences() > 0;
+
+  if (AlignAllBlock)
+    // Align all of the blocks in the function to a specific alignment.
+    for (MachineBasicBlock &MBB : *F) {
+      if (HasMaxBytesOverride)
+        MBB.setAlignment(Align(1ULL << AlignAllBlock),
+                         MaxBytesForAlignmentOverride);
+      else
+        MBB.setAlignment(Align(1ULL << AlignAllBlock));
+    }
+  else if (AlignAllNonFallThruBlocks) {
+    // Align all of the blocks that have no fall-through predecessors to a
+    // specific alignment.
+    for (auto MBI = std::next(F->begin()), MBE = F->end(); MBI != MBE; ++MBI) {
+      auto LayoutPred = std::prev(MBI);
+      if (!LayoutPred->isSuccessor(&*MBI)) {
+        if (HasMaxBytesOverride)
+          MBI->setAlignment(Align(1ULL << AlignAllNonFallThruBlocks),
+                            MaxBytesForAlignmentOverride);
+        else
+          MBI->setAlignment(Align(1ULL << AlignAllNonFallThruBlocks));
+      }
+    }
+  }
 }
 
 /// Tail duplicate \p BB into (some) predecessors if profitable, repeating if
@@ -3407,31 +3447,53 @@ void MachineBlockPlacement::findDuplicateCandidates(
   }
 }
 
-void MachineBlockPlacement::initDupThreshold() {
+void MachineBlockPlacement::initTailDupThreshold() {
   DupThreshold = BlockFrequency(0);
-  if (!F->getFunction().hasProfileData())
-    return;
+  if (F->getFunction().hasProfileData()) {
+    // We prefer to use the profile count.
+    uint64_t HotThreshold = PSI->getOrCompHotCountThreshold();
+    if (HotThreshold != UINT64_MAX) {
+      UseProfileCount = true;
+      DupThreshold =
+          BlockFrequency(HotThreshold * TailDupProfilePercentThreshold / 100);
+    } else {
+      // Profile count is not available; use block frequency instead.
+      BlockFrequency MaxFreq = BlockFrequency(0);
+      for (MachineBasicBlock &MBB : *F) {
+        BlockFrequency Freq = MBFI->getBlockFreq(&MBB);
+        if (Freq > MaxFreq)
+          MaxFreq = Freq;
+      }
 
-  // We prefer to use prifile count.
-  uint64_t HotThreshold = PSI->getOrCompHotCountThreshold();
-  if (HotThreshold != UINT64_MAX) {
-    UseProfileCount = true;
-    DupThreshold =
-        BlockFrequency(HotThreshold * TailDupProfilePercentThreshold / 100);
-    return;
+      BranchProbability ThresholdProb(TailDupPlacementPenalty, 100);
+      DupThreshold = BlockFrequency(MaxFreq * ThresholdProb);
+      UseProfileCount = false;
+    }
   }
 
-  // Profile count is not available, we can use block frequency instead.
-  BlockFrequency MaxFreq = BlockFrequency(0);
-  for (MachineBasicBlock &MBB : *F) {
-    BlockFrequency Freq = MBFI->getBlockFreq(&MBB);
-    if (Freq > MaxFreq)
-      MaxFreq = Freq;
+  TailDupSize = TailDupPlacementThreshold;
+  // If only the aggressive threshold is explicitly set, use it.
+  if (TailDupPlacementAggressiveThreshold.getNumOccurrences() != 0 &&
+      TailDupPlacementThreshold.getNumOccurrences() == 0)
+    TailDupSize = TailDupPlacementAggressiveThreshold;
+
+  // For aggressive optimization, we can adjust some thresholds to be less
+  // conservative.
+  if (PassConfig->getOptLevel() >= CodeGenOptLevel::Aggressive) {
+    // At O3 we should be more willing to copy blocks for tail duplication. This
+    // increases size pressure, so we only do it at O3
+    // Do this unless only the regular threshold is explicitly set.
+    if (TailDupPlacementThreshold.getNumOccurrences() == 0 ||
+        TailDupPlacementAggressiveThreshold.getNumOccurrences() != 0)
+      TailDupSize = TailDupPlacementAggressiveThreshold;
   }
 
-  BranchProbability ThresholdProb(TailDupPlacementPenalty, 100);
-  DupThreshold = BlockFrequency(MaxFreq * ThresholdProb);
-  UseProfileCount = false;
+  // If there's no threshold provided through options, query the target
+  // information for a threshold instead.
+  if (TailDupPlacementThreshold.getNumOccurrences() == 0 &&
+      (PassConfig->getOptLevel() < CodeGenOptLevel::Aggressive ||
+       TailDupPlacementAggressiveThreshold.getNumOccurrences() == 0))
+    TailDupSize = TII->getTailDuplicateSize(PassConfig->getOptLevel());
 }
 
 bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
@@ -3451,8 +3513,7 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
   TLI = MF.getSubtarget().getTargetLowering();
   MPDT = nullptr;
   PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
-
-  initDupThreshold();
+  PassConfig = &getAnalysis<TargetPassConfig>();
 
   // Initialize PreferredLoopExit to nullptr here since it may never be set if
   // there are no MachineLoops.
@@ -3463,54 +3524,43 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
   assert(ComputedEdges.empty() &&
          "Computed Edge map should be empty before starting placement.");
 
-  unsigned TailDupSize = TailDupPlacementThreshold;
-  // If only the aggressive threshold is explicitly set, use it.
-  if (TailDupPlacementAggressiveThreshold.getNumOccurrences() != 0 &&
-      TailDupPlacementThreshold.getNumOccurrences() == 0)
-    TailDupSize = TailDupPlacementAggressiveThreshold;
-
-  TargetPassConfig *PassConfig = &getAnalysis<TargetPassConfig>();
-  // For aggressive optimization, we can adjust some thresholds to be less
-  // conservative.
-  if (PassConfig->getOptLevel() >= CodeGenOptLevel::Aggressive) {
-    // At O3 we should be more willing to copy blocks for tail duplication. This
-    // increases size pressure, so we only do it at O3
-    // Do this unless only the regular threshold is explicitly set.
-    if (TailDupPlacementThreshold.getNumOccurrences() == 0 ||
-        TailDupPlacementAggressiveThreshold.getNumOccurrences() != 0)
-      TailDupSize = TailDupPlacementAggressiveThreshold;
-  }
-
-  // If there's no threshold provided through options, query the target
-  // information for a threshold instead.
-  if (TailDupPlacementThreshold.getNumOccurrences() == 0 &&
-      (PassConfig->getOptLevel() < CodeGenOptLevel::Aggressive ||
-       TailDupPlacementAggressiveThreshold.getNumOccurrences() == 0))
-    TailDupSize = TII->getTailDuplicateSize(PassConfig->getOptLevel());
-
+  // Initialize tail duplication thresholds.
+  initTailDupThreshold();
+
+  const bool OptForSize =
+      MF.getFunction().hasOptSize() ||
+      llvm::shouldOptimizeForSize(&MF, PSI, &MBFI->getMBFI());
+  // Using ext-tsp for size optimization is possible only when the function
+  // contains more than two basic blocks.
+  const bool UseExtTspForSize =
+      OptForSize && ApplyExtTspForSize && MF.size() >= 3;
+
+  // Apply tail duplication.
   if (allowTailDupPlacement()) {
     MPDT = &getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
-    bool OptForSize = MF.getFunction().hasOptSize() ||
-                      llvm::shouldOptimizeForSize(&MF, PSI, &MBFI->getMBFI());
     if (OptForSize)
       TailDupSize = 1;
-    bool PreRegAlloc = false;
+    const bool PreRegAlloc = false;
     TailDup.initMF(MF, PreRegAlloc, MBPI, MBFI.get(), PSI,
                    /* LayoutMode */ true, TailDupSize);
-    precomputeTriangleChains();
+    if (!UseExtTspForSize)
+      precomputeTriangleChains();
   }
 
-  buildCFGChains();
+  // Run the main block placement.
+  if (!UseExtTspForSize)
+    buildCFGChains();
 
   // Changing the layout can create new tail merging opportunities.
   // TailMerge can create jump into if branches that make CFG irreducible for
   // HW that requires structured CFG.
-  bool EnableTailMerge = !MF.getTarget().requiresStructuredCFG() &&
-                         PassConfig->getEnableTailMerge() &&
-                         BranchFoldPlacement;
+  const bool EnableTailMerge = !MF.getTarget().requiresStructuredCFG() &&
+                               PassConfig->getEnableTailMerge() &&
+                               BranchFoldPlacement && MF.size() > 3;
   // No tail merging opportunities if the block number is less than four.
-  if (MF.size() > 3 && EnableTailMerge) {
-    unsigned TailMergeSize = TailDupSize + 1;
+  if (EnableTailMerge) {
+    const unsigned TailMergeSize = TailDupSize + 1;
     BranchFolder BF(/*DefaultEnableTailMerge=*/true, /*CommonHoist=*/false,
                     *MBFI, *MBPI, PSI, TailMergeSize);
 
@@ -3527,15 +3577,20 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
     }
   }
 
-  // Apply a post-processing optimizing block placement.
-  if (MF.size() >= 3 && EnableExtTspBlockPlacement &&
-      (ApplyExtTspWithoutProfile || MF.getFunction().hasProfileData()) &&
-      MF.size() <= ExtTspBlockPlacementMaxBlocks) {
-    // Find a new placement and modify the layout of the blocks in the function.
-    applyExtTsp();
-
-    // Re-create CFG chain so that we can optimizeBranches and alignBlocks.
-    createCFGChainExtTsp();
+  // Apply a post-processing optimizing block placement:
+  // - find a new placement and modify the layout of the blocks in the function;
+  // - re-create CFG chains so that we can optimizeBranches and alignBlocks.
+  if (MF.size() >= 3) {
+    if (EnableExtTspBlockPlacement &&
+        (ApplyExtTspWithoutProfile || MF.getFunction().hasProfileData()) &&
+        MF.size() <= ExtTspBlockPlacementMaxBlocks) {
+      applyExtTsp(/*OptForSize=*/false);
+      createCFGChainExtTsp();
+    } else if (UseExtTspForSize) {
+      applyExtTsp(/*OptForSize=*/true);
+      createCFGChainExtTsp();
+    }
   }
 
   optimizeBranches();
@@ -3545,32 +3600,7 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
   ComputedEdges.clear();
   ChainAllocator.DestroyAll();
 
-  bool HasMaxBytesOverride =
-      MaxBytesForAlignmentOverride.getNumOccurrences() > 0;
-
-  if (AlignAllBlock)
-    // Align all of the blocks in the function to a specific alignment.
-    for (MachineBasicBlock &MBB : MF) {
-      if (HasMaxBytesOverride)
-        MBB.setAlignment(Align(1ULL << AlignAllBlock),
-                         MaxBytesForAlignmentOverride);
-      else
-        MBB.setAlignment(Align(1ULL << AlignAllBlock));
-    }
-  else if (AlignAllNonFallThruBlocks) {
-    // Align all of the blocks that have no fall-through predecessors to a
-    // specific alignment.
-    for (auto MBI = std::next(MF.begin()), MBE = MF.end(); MBI != MBE; ++MBI) {
-      auto LayoutPred = std::prev(MBI);
-      if (!LayoutPred->isSuccessor(&*MBI)) {
-        if (HasMaxBytesOverride)
-          MBI->setAlignment(Align(1ULL << AlignAllNonFallThruBlocks),
-                            MaxBytesForAlignmentOverride);
-        else
-          MBI->setAlignment(Align(1ULL << AlignAllNonFallThruBlocks));
-      }
-    }
-  }
+  // View the block layout of the function, if requested.
   if (ViewBlockLayoutWithBFI != GVDT_None &&
       (ViewBlockFreqFuncName.empty() ||
        F->getFunction().getName() == ViewBlockFreqFuncName)) {
@@ -3584,7 +3614,7 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
   return true;
 }
 
-void MachineBlockPlacement::applyExtTsp() {
+void MachineBlockPlacement::applyExtTsp(bool OptForSize) {
   // Prepare data; blocks are indexed by their index in the current ordering.
   DenseMap<const MachineBasicBlock *, uint64_t> BlockIndex;
   BlockIndex.reserve(F->size());
@@ -3596,13 +3626,15 @@ void MachineBlockPlacement::applyExtTsp() {
     CurrentBlockOrder.push_back(&MBB);
   }
 
-  auto BlockSizes = std::vector<uint64_t>(F->size());
-  auto BlockCounts = std::vector<uint64_t>(F->size());
+  std::vector<uint64_t> BlockCounts(F->size());
+  std::vector<uint64_t> BlockSizes(F->size());
   std::vector<codelayout::EdgeCount> JumpCounts;
+  SmallVector<MachineOperand, 4> Cond; // For analyzeBranch.
+  SmallVector<const MachineBasicBlock *, 4> Succs;
   for (MachineBasicBlock &MBB : *F) {
     // Getting the block frequency.
     BlockFrequency BlockFreq = MBFI->getBlockFreq(&MBB);
-    BlockCounts[BlockIndex[&MBB]] = BlockFreq.getFrequency();
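+    // When optimizing for size, ignore the profile and treat all blocks as
+    // equally hot by assigning a uniform count.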
+    BlockCounts[BlockIndex[&MBB]] = OptForSize ? 1 : BlockFreq.getFrequency();
     // Getting the block size:
     // - approximate the size of an instruction by 4 bytes, and
     // - ignore debug instructions.
@@ -3611,24 +3643,49 @@ void MachineBlockPlacement::applyExtTsp() {
     // not see a perf improvement with the exact block sizes.
     auto NonDbgInsts =
         instructionsWithoutDebug(MBB.instr_begin(), MBB.instr_end());
-    int NumInsts = std::distance(NonDbgInsts.begin(), NonDbgInsts.end());
+    size_t NumInsts = std::distance(NonDbgInsts.begin(), NonDbgInsts.end());
     BlockSizes[BlockIndex[&MBB]] = 4 * NumInsts;
     // Getting jump frequencies.
-    for (MachineBasicBlock *Succ : MBB.successors()) {
-      auto EP = MBPI->getEdgeProbability(&MBB, Succ);
-      BlockFrequency JumpFreq = BlockFreq * EP;
-      JumpCounts.push_back(
-          {BlockIndex[&MBB], BlockIndex[Succ], JumpFreq.getFrequency()});
+    if (!OptForSize) {
+      for (MachineBasicBlock *Succ : MBB.successors()) {
+        auto EP = MBPI->getEdgeProbability(&MBB, Succ);
+        BlockFrequency JumpFreq = BlockFreq * EP;
+        JumpCounts.push_back(
+            {BlockIndex[&MBB], BlockIndex[Succ], JumpFreq.getFrequency()});
+      }
+    } else {
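+      // When optimizing for size, model jumps via the analyzable branch
+      // structure and the layout fallthrough rather than profile frequencies.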
+      Cond.clear();
+      MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch.
+      if (TII->analyzeBranch(MBB, TBB, FBB, Cond))
+        continue;
+
+      const MachineBasicBlock *FTB = MBB.getFallThrough();
+
+      Succs.clear();
+      if (TBB && TBB != FTB)
+        Succs.push_back(TBB);
+      if (FBB && FBB != FTB)
+        Succs.push_back(FBB);
+      if (FTB)
+        Succs.push_back(FTB);
+      // The absolute magnitude of non-zero counts does not matter for the
+      // optimization; slightly prioritize jumps with a single successor, since
+      // the corresponding jump instruction will be removed from the binary.
+      const uint64_t Freq = Succs.size() == 1 ? 110 : 100;
+      for (const MachineBasicBlock *Succ : Succs) {
+        JumpCounts.push_back({BlockIndex[&MBB], BlockIndex[Succ], Freq});
+      }
     }
   }
 
   LLVM_DEBUG(dbgs() << "Applying ext-tsp layout for |V| = " << F->size()
                     << " with profile = " << F->getFunction().hasProfileData()
                     << " (" << F->getName().str() << ")"
                     << "\n");
-  LLVM_DEBUG(
-      dbgs() << format("  original  layout score: %0.2f\n",
-                       calcExtTspScore(BlockSizes, BlockCounts, JumpCounts)));
+
+  const double OrgScore = calcExtTspScore(BlockSizes, BlockCounts, JumpCounts);
+  LLVM_DEBUG(dbgs() << format("  original  layout score: %0.2f\n", OrgScore));
 
   // Run the layout algorithm.
   auto NewOrder = computeExtTspLayout(BlockSizes, BlockCounts, JumpCounts);
@@ -3637,12 +3694,15 @@ void MachineBlockPlacement::applyExtTsp() {
   for (uint64_t Node : NewOrder) {
     NewBlockOrder.push_back(CurrentBlockOrder[Node]);
   }
-  LLVM_DEBUG(dbgs() << format("  optimized layout score: %0.2f\n",
-                              calcExtTspScore(NewOrder, BlockSizes, BlockCounts,
-                                              JumpCounts)));
+  const double OptScore =
+      calcExtTspScore(NewOrder, BlockSizes, BlockCounts, JumpCounts);
+  LLVM_DEBUG(dbgs() << format("  optimized layout score: %0.2f\n", OptScore));
 
-  // Assign new block order.
-  assignBlockOrder(NewBlockOrder);
+  // If the optimization is unsuccessful, fall back to the original block order.
+  if (OptForSize && OrgScore > OptScore)
+    assignBlockOrder(CurrentBlockOrder);
+  else
+    assignBlockOrder(NewBlockOrder);
 }
 
 void MachineBlockPlacement::assignBlockOrder(
diff --git a/llvm/test/CodeGen/X86/code_placement_ext_tsp_size.ll b/llvm/test/CodeGen/X86/code_placement_ext_tsp_size.ll
new file mode 100644
index 00000000000000..acec469eff230c
--- /dev/null
+++ b/llvm/test/CodeGen/X86/code_placement_ext_tsp_size.ll
@@ -0,0 +1,134 @@
+; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux -apply-ext-tsp-for-size=1 < %s | FileCheck %s
+; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux -apply-ext-tsp-for-size=0 < %s | FileCheck %s -check-prefix=CHECK2
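+;
+; Both functions below are annotated with minsize. With -apply-ext-tsp-for-size=1
+; the blocks are laid out to minimize code size, while with the flag disabled
+; the default frequency-based placement is used (CHECK2).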
+
+define void @func1() #0 {
+;
+; +-----+
+; | b0  | -+
+; +-----+  |
+;   |      |
+;   | 10   |
+;   v      |
+; +-----+  |
+; | b1  |  | 10000
+; +-----+  |
+;   |      |
+;   | 10   |
+;   v      |
+; +-----+  |
+; | b2  | <+
+; +-----+
+;
+; CHECK-LABEL: func1:
+; CHECK: %b0
+; CHECK: %b1
+; CHECK: %b2
+;
+; CHECK2-LABEL: func1:
+; CHECK2: %b0
+; CHECK2: %b2
+; CHECK2: %b1
+
+b0:
+  %call = call zeroext i1 @a()
+  br i1 %call, label %b1, label %b2, !prof !1
+
+b1:
+  call void @d()
+  call void @d()
+  call void @d()
+  br label %b2
+
+b2:
+  call void @e()
+  ret void
+}
+
+define void @func_loop() #1 !prof !9 {
+; Test that the algorithm can rotate loops in the presence of profile data.
+;
+;                  +--------+
+;                  | entry  |
+;                  +--------+
+;                    |
+;                    | 1
+;                    v
+; +--------+  16   +--------+
+; | if.then| <---- | header | <+
+; +--------+       +--------+  |
+;   |                |         |
+;   |                | 160     |
+;   |                v         |
+;   |              +--------+  |
+;   |              | if.else|  | 175
+;   |              +--------+  |
+;   |                |         |
+;   |                | 160     |
+;   |                v         |
+;   |        16    +--------+  |
+;   +------------> | if.end | -+
+;                  +--------+
+;                    |
+;                    | 1
+;                    v
+;                  +--------+
+;                  |  end   |
+;                  +--------+
+;
+; CHECK-LABEL: func_loop:
+; CHECK: %entry
+; CHECK: %header
+; CHECK: %if.then
+; CHECK: %if.else
+; CHECK: %if.end
+; CHECK: %end
+;
+; CHECK2-LABEL: func_loop:
+; CHECK2: %entry
+; CHECK2: %header
+; CHECK2: %if.else
+; CHECK2: %if.end
+; CHECK2: %if.then
+; CHECK2: %end
+
+entry:
+  br label %header
+
+header:
+  call void @e()
+  %call = call zeroext i1 @a()
+  br i1 %call, label %if.then, label %if.else, !prof !10
+
+if.then:
+  call void @f()
+  br label %if.end
+
+if.else:
+  call void @g()
+  br label %if.end
+
+if.end:
+  call void @h()
+  %call2 = call zeroext i1 @a()
+  br i1 %call2, label %header, label %end
+
+end:
+  ret void
+}
+
+
+declare zeroext i1 @a()
+declare void @b()
+declare void @c()
+declare void @d()
+declare void @e()
+declare void @g()
+declare void @f()
+declare void @h()
+
+!1 = !{!"branch_weights", i32 10, i32 10000}
+!9 = !{!"function_entry_count", i64 1}
+!10 = !{!"branch_weights", i32 16, i32 160}
+
+attributes #0 = { minsize }
+attributes #1 = { minsize }



More information about the llvm-commits mailing list