[llvm] [CodeLayout] Size-aware machine block placement (PR #109711)

via llvm-commits llvm-commits at lists.llvm.org
Mon Sep 23 13:31:20 PDT 2024


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-backend-x86

Author: None (spupyrev)

<details>
<summary>Changes</summary>

This is an implementation of a new "size-aware" machine block placement. The
idea is to reorder blocks so that the number of fall-through jumps is maximized.
Note that profile data is ignored by this optimization, and it is applied only
to functions for which hasOptSize() is true.
This strategy has two benefits:
(i) it eliminates jump instructions and hence produces smaller binaries;
(ii) we avoid using profile data while reordering blocks, which yields more 
"uniform" functions, thus helping ICF and machine outliner/merger.

For large (mobile) apps, the size benefits of (i) and (ii) are roughly the same,
each providing up to 0.5% uncompressed and up to 1% compressed size savings on
top of the current solution.

The optimization is turned off by default.

---

Patch is 53.21 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/109711.diff


2 Files Affected:

- (modified) llvm/lib/CodeGen/MachineBlockPlacement.cpp (+354-314) 
- (added) llvm/test/CodeGen/X86/code_placement_ext_tsp_size.ll (+134) 


``````````diff
diff --git a/llvm/lib/CodeGen/MachineBlockPlacement.cpp b/llvm/lib/CodeGen/MachineBlockPlacement.cpp
index be783bc4e29738..3677818c8f08df 100644
--- a/llvm/lib/CodeGen/MachineBlockPlacement.cpp
+++ b/llvm/lib/CodeGen/MachineBlockPlacement.cpp
@@ -119,10 +119,10 @@ static cl::opt<unsigned> LoopToColdBlockRatio(
              "(frequency of block) is greater than this ratio"),
     cl::init(5), cl::Hidden);
 
-static cl::opt<bool> ForceLoopColdBlock(
-    "force-loop-cold-block",
-    cl::desc("Force outlining cold blocks from loops."),
-    cl::init(false), cl::Hidden);
+static cl::opt<bool>
+    ForceLoopColdBlock("force-loop-cold-block",
+                       cl::desc("Force outlining cold blocks from loops."),
+                       cl::init(false), cl::Hidden);
 
 static cl::opt<bool>
     PreciseRotationCost("precise-rotation-cost",
@@ -147,43 +147,43 @@ static cl::opt<unsigned> JumpInstCost("jump-inst-cost",
                                       cl::desc("Cost of jump instructions."),
                                       cl::init(1), cl::Hidden);
 static cl::opt<bool>
-TailDupPlacement("tail-dup-placement",
-              cl::desc("Perform tail duplication during placement. "
-                       "Creates more fallthrough opportunites in "
-                       "outline branches."),
-              cl::init(true), cl::Hidden);
+    TailDupPlacement("tail-dup-placement",
+                     cl::desc("Perform tail duplication during placement. "
+                              "Creates more fallthrough opportunites in "
+                              "outline branches."),
+                     cl::init(true), cl::Hidden);
 
 static cl::opt<bool>
-BranchFoldPlacement("branch-fold-placement",
-              cl::desc("Perform branch folding during placement. "
-                       "Reduces code size."),
-              cl::init(true), cl::Hidden);
+    BranchFoldPlacement("branch-fold-placement",
+                        cl::desc("Perform branch folding during placement. "
+                                 "Reduces code size."),
+                        cl::init(true), cl::Hidden);
 
 // Heuristic for tail duplication.
 static cl::opt<unsigned> TailDupPlacementThreshold(
     "tail-dup-placement-threshold",
     cl::desc("Instruction cutoff for tail duplication during layout. "
              "Tail merging during layout is forced to have a threshold "
-             "that won't conflict."), cl::init(2),
-    cl::Hidden);
+             "that won't conflict."),
+    cl::init(2), cl::Hidden);
 
 // Heuristic for aggressive tail duplication.
 static cl::opt<unsigned> TailDupPlacementAggressiveThreshold(
     "tail-dup-placement-aggressive-threshold",
     cl::desc("Instruction cutoff for aggressive tail duplication during "
              "layout. Used at -O3. Tail merging during layout is forced to "
-             "have a threshold that won't conflict."), cl::init(4),
-    cl::Hidden);
+             "have a threshold that won't conflict."),
+    cl::init(4), cl::Hidden);
 
 // Heuristic for tail duplication.
 static cl::opt<unsigned> TailDupPlacementPenalty(
     "tail-dup-placement-penalty",
-    cl::desc("Cost penalty for blocks that can avoid breaking CFG by copying. "
-             "Copying can increase fallthrough, but it also increases icache "
-             "pressure. This parameter controls the penalty to account for that. "
-             "Percent as integer."),
-    cl::init(2),
-    cl::Hidden);
+    cl::desc(
+        "Cost penalty for blocks that can avoid breaking CFG by copying. "
+        "Copying can increase fallthrough, but it also increases icache "
+        "pressure. This parameter controls the penalty to account for that. "
+        "Percent as integer."),
+    cl::init(2), cl::Hidden);
 
 // Heuristic for tail duplication if profile count is used in cost model.
 static cl::opt<unsigned> TailDupProfilePercentThreshold(
@@ -198,8 +198,7 @@ static cl::opt<unsigned> TriangleChainCount(
     "triangle-chain-count",
     cl::desc("Number of triangle-shaped-CFG's that need to be in a row for the "
              "triangle tail duplication heuristic to kick in. 0 to disable."),
-    cl::init(2),
-    cl::Hidden);
+    cl::init(2), cl::Hidden);
 
 // Use case: When block layout is visualized after MBP pass, the basic blocks
 // are labeled in layout order; meanwhile blocks could be numbered in a
@@ -219,6 +218,11 @@ static cl::opt<unsigned> ExtTspBlockPlacementMaxBlocks(
              "block placement."),
     cl::init(UINT_MAX), cl::Hidden);
 
+// Apply the ext-tsp algorithm minimizing the size of a binary.
+static cl::opt<bool>
+    ApplyExtTspForSize("apply-ext-tsp-for-size", cl::init(false), cl::Hidden,
+                       cl::desc("Use ext-tsp for size-aware block placement."));
+
 namespace llvm {
 extern cl::opt<bool> EnableExtTspBlockPlacement;
 extern cl::opt<bool> ApplyExtTspWithoutProfile;
@@ -292,8 +296,8 @@ class BlockChain {
   iterator end() { return Blocks.end(); }
   const_iterator end() const { return Blocks.end(); }
 
-  bool remove(MachineBasicBlock* BB) {
-    for(iterator i = begin(); i != end(); ++i) {
+  bool remove(MachineBasicBlock *BB) {
+    for (iterator i = begin(); i != end(); ++i) {
       if (*i == BB) {
         Blocks.erase(i);
         return true;
@@ -405,6 +409,8 @@ class MachineBlockPlacement : public MachineFunctionPass {
 
   ProfileSummaryInfo *PSI = nullptr;
 
+  TargetPassConfig *PassConfig = nullptr;
+
   /// Duplicator used to duplicate tails during placement.
   ///
   /// Placement decisions can open up new tail duplication opportunities, but
@@ -415,6 +421,8 @@ class MachineBlockPlacement : public MachineFunctionPass {
   /// Partial tail duplication threshold.
   BlockFrequency DupThreshold;
 
+  unsigned TailDupSize;
+
   /// True:  use block profile count to compute tail duplication cost.
   /// False: use block frequency to compute tail duplication cost.
   bool UseProfileCount = false;
@@ -459,26 +467,24 @@ class MachineBlockPlacement : public MachineFunctionPass {
 
   /// Scale the DupThreshold according to basic block size.
   BlockFrequency scaleThreshold(MachineBasicBlock *BB);
-  void initDupThreshold();
+  void initTailDupThreshold();
 
   /// Decrease the UnscheduledPredecessors count for all blocks in chain, and
   /// if the count goes to 0, add them to the appropriate work list.
-  void markChainSuccessors(
-      const BlockChain &Chain, const MachineBasicBlock *LoopHeaderBB,
-      const BlockFilterSet *BlockFilter = nullptr);
+  void markChainSuccessors(const BlockChain &Chain,
+                           const MachineBasicBlock *LoopHeaderBB,
+                           const BlockFilterSet *BlockFilter = nullptr);
 
   /// Decrease the UnscheduledPredecessors count for a single block, and
   /// if the count goes to 0, add them to the appropriate work list.
-  void markBlockSuccessors(
-      const BlockChain &Chain, const MachineBasicBlock *BB,
-      const MachineBasicBlock *LoopHeaderBB,
-      const BlockFilterSet *BlockFilter = nullptr);
+  void markBlockSuccessors(const BlockChain &Chain, const MachineBasicBlock *BB,
+                           const MachineBasicBlock *LoopHeaderBB,
+                           const BlockFilterSet *BlockFilter = nullptr);
 
   BranchProbability
-  collectViableSuccessors(
-      const MachineBasicBlock *BB, const BlockChain &Chain,
-      const BlockFilterSet *BlockFilter,
-      SmallVector<MachineBasicBlock *, 4> &Successors);
+  collectViableSuccessors(const MachineBasicBlock *BB, const BlockChain &Chain,
+                          const BlockFilterSet *BlockFilter,
+                          SmallVector<MachineBasicBlock *, 4> &Successors);
   bool isBestSuccessor(MachineBasicBlock *BB, MachineBasicBlock *Pred,
                        BlockFilterSet *BlockFilter);
   void findDuplicateCandidates(SmallVectorImpl<MachineBasicBlock *> &Candidates,
@@ -496,16 +502,19 @@ class MachineBlockPlacement : public MachineFunctionPass {
                           MachineFunction::iterator &PrevUnplacedBlockIt,
                           BlockFilterSet::iterator &PrevUnplacedBlockInFilterIt,
                           bool &DuplicatedToLPred);
-  bool hasBetterLayoutPredecessor(
-      const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
-      const BlockChain &SuccChain, BranchProbability SuccProb,
-      BranchProbability RealSuccProb, const BlockChain &Chain,
-      const BlockFilterSet *BlockFilter);
-  BlockAndTailDupResult selectBestSuccessor(
-      const MachineBasicBlock *BB, const BlockChain &Chain,
-      const BlockFilterSet *BlockFilter);
-  MachineBasicBlock *selectBestCandidateBlock(
-      const BlockChain &Chain, SmallVectorImpl<MachineBasicBlock *> &WorkList);
+  bool hasBetterLayoutPredecessor(const MachineBasicBlock *BB,
+                                  const MachineBasicBlock *Succ,
+                                  const BlockChain &SuccChain,
+                                  BranchProbability SuccProb,
+                                  BranchProbability RealSuccProb,
+                                  const BlockChain &Chain,
+                                  const BlockFilterSet *BlockFilter);
+  BlockAndTailDupResult selectBestSuccessor(const MachineBasicBlock *BB,
+                                            const BlockChain &Chain,
+                                            const BlockFilterSet *BlockFilter);
+  MachineBasicBlock *
+  selectBestCandidateBlock(const BlockChain &Chain,
+                           SmallVectorImpl<MachineBasicBlock *> &WorkList);
   MachineBasicBlock *
   getFirstUnplacedBlock(const BlockChain &PlacedChain,
                         MachineFunction::iterator &PrevUnplacedBlockIt);
@@ -536,20 +545,19 @@ class MachineBlockPlacement : public MachineFunctionPass {
                                   const MachineBasicBlock *ExitBB,
                                   const BlockFilterSet &LoopBlockSet);
   MachineBasicBlock *findBestLoopTopHelper(MachineBasicBlock *OldTop,
-      const MachineLoop &L, const BlockFilterSet &LoopBlockSet);
-  MachineBasicBlock *findBestLoopTop(
-      const MachineLoop &L, const BlockFilterSet &LoopBlockSet);
-  MachineBasicBlock *findBestLoopExit(
-      const MachineLoop &L, const BlockFilterSet &LoopBlockSet,
-      BlockFrequency &ExitFreq);
+                                           const MachineLoop &L,
+                                           const BlockFilterSet &LoopBlockSet);
+  MachineBasicBlock *findBestLoopTop(const MachineLoop &L,
+                                     const BlockFilterSet &LoopBlockSet);
+  MachineBasicBlock *findBestLoopExit(const MachineLoop &L,
+                                      const BlockFilterSet &LoopBlockSet,
+                                      BlockFrequency &ExitFreq);
   BlockFilterSet collectLoopBlockSet(const MachineLoop &L);
   void buildLoopChains(const MachineLoop &L);
-  void rotateLoop(
-      BlockChain &LoopChain, const MachineBasicBlock *ExitingBB,
-      BlockFrequency ExitFreq, const BlockFilterSet &LoopBlockSet);
-  void rotateLoopWithProfile(
-      BlockChain &LoopChain, const MachineLoop &L,
-      const BlockFilterSet &LoopBlockSet);
+  void rotateLoop(BlockChain &LoopChain, const MachineBasicBlock *ExitingBB,
+                  BlockFrequency ExitFreq, const BlockFilterSet &LoopBlockSet);
+  void rotateLoopWithProfile(BlockChain &LoopChain, const MachineLoop &L,
+                             const BlockFilterSet &LoopBlockSet);
   void buildCFGChains();
   void optimizeBranches();
   void alignBlocks();
@@ -558,10 +566,10 @@ class MachineBlockPlacement : public MachineFunctionPass {
   bool shouldTailDuplicate(MachineBasicBlock *BB);
   /// Check the edge frequencies to see if tail duplication will increase
   /// fallthroughs.
-  bool isProfitableToTailDup(
-    const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
-    BranchProbability QProb,
-    const BlockChain &Chain, const BlockFilterSet *BlockFilter);
+  bool isProfitableToTailDup(const MachineBasicBlock *BB,
+                             const MachineBasicBlock *Succ,
+                             BranchProbability QProb, const BlockChain &Chain,
+                             const BlockFilterSet *BlockFilter);
 
   /// Check for a trellis layout.
   bool isTrellis(const MachineBasicBlock *BB,
@@ -582,16 +590,17 @@ class MachineBlockPlacement : public MachineFunctionPass {
 
   /// Returns true if a block can tail duplicate into all unplaced
   /// predecessors. Filters based on loop.
-  bool canTailDuplicateUnplacedPreds(
-      const MachineBasicBlock *BB, MachineBasicBlock *Succ,
-      const BlockChain &Chain, const BlockFilterSet *BlockFilter);
+  bool canTailDuplicateUnplacedPreds(const MachineBasicBlock *BB,
+                                     MachineBasicBlock *Succ,
+                                     const BlockChain &Chain,
+                                     const BlockFilterSet *BlockFilter);
 
   /// Find chains of triangles to tail-duplicate where a global analysis works,
   /// but a local analysis would not find them.
   void precomputeTriangleChains();
 
   /// Apply a post-processing step optimizing block placement.
-  void applyExtTsp();
+  void applyExtTsp(bool OptForSize);
 
   /// Modify the existing block placement in the function and adjust all jumps.
   void assignBlockOrder(const std::vector<const MachineBasicBlock *> &NewOrder);
@@ -802,8 +811,8 @@ bool MachineBlockPlacement::shouldTailDuplicate(MachineBasicBlock *BB) {
 /// Compare 2 BlockFrequency's with a small penalty for \p A.
 /// In order to be conservative, we apply a X% penalty to account for
 /// increased icache pressure and static heuristics. For small frequencies
-/// we use only the numerators to improve accuracy. For simplicity, we assume the
-/// penalty is less than 100%
+/// we use only the numerators to improve accuracy. For simplicity, we assume
+/// the penalty is less than 100%
 /// TODO(iteratee): Use 64-bit fixed point edge frequencies everywhere.
 static bool greaterWithBias(BlockFrequency A, BlockFrequency B,
                             BlockFrequency EntryFreq) {
@@ -819,8 +828,8 @@ static bool greaterWithBias(BlockFrequency A, BlockFrequency B,
 /// considering duplication.
 bool MachineBlockPlacement::isProfitableToTailDup(
     const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
-    BranchProbability QProb,
-    const BlockChain &Chain, const BlockFilterSet *BlockFilter) {
+    BranchProbability QProb, const BlockChain &Chain,
+    const BlockFilterSet *BlockFilter) {
   // We need to do a probability calculation to make sure this is profitable.
   // First: does succ have a successor that post-dominates? This affects the
   // calculation. The 2 relevant cases are:
@@ -876,12 +885,12 @@ bool MachineBlockPlacement::isProfitableToTailDup(
   // from BB.
   auto SuccBestPred = BlockFrequency(0);
   for (MachineBasicBlock *SuccPred : Succ->predecessors()) {
-    if (SuccPred == Succ || SuccPred == BB
-        || BlockToChain[SuccPred] == &Chain
-        || (BlockFilter && !BlockFilter->count(SuccPred)))
+    if (SuccPred == Succ || SuccPred == BB ||
+        BlockToChain[SuccPred] == &Chain ||
+        (BlockFilter && !BlockFilter->count(SuccPred)))
       continue;
-    auto Freq = MBFI->getBlockFreq(SuccPred)
-        * MBPI->getEdgeProbability(SuccPred, Succ);
+    auto Freq =
+        MBFI->getBlockFreq(SuccPred) * MBPI->getEdgeProbability(SuccPred, Succ);
     if (Freq > SuccBestPred)
       SuccBestPred = Freq;
   }
@@ -1137,7 +1146,7 @@ MachineBlockPlacement::getBestTrellisSuccessor(
   }
   // We have already computed the optimal edge for the other side of the
   // trellis.
-  ComputedEdges[BestB.Src] = { BestB.Dest, false };
+  ComputedEdges[BestB.Src] = {BestB.Dest, false};
 
   auto TrellisSucc = BestA.Dest;
   LLVM_DEBUG(BranchProbability SuccProb = getAdjustedProbability(
@@ -1169,8 +1178,8 @@ bool MachineBlockPlacement::canTailDuplicateUnplacedPreds(
     // Make sure all unplaced and unfiltered predecessors can be
     // tail-duplicated into.
     // Skip any blocks that are already placed or not in this loop.
-    if (Pred == BB || (BlockFilter && !BlockFilter->count(Pred))
-        || (BlockToChain[Pred] == &Chain && !Succ->succ_empty()))
+    if (Pred == BB || (BlockFilter && !BlockFilter->count(Pred)) ||
+        (BlockToChain[Pred] == &Chain && !Succ->succ_empty()))
       continue;
     if (!TailDup.canTailDuplicate(Succ, Pred)) {
       if (Successors.size() > 1 && hasSameSuccessors(*Pred, Successors))
@@ -1289,9 +1298,7 @@ void MachineBlockPlacement::precomputeTriangleChains() {
 
     unsigned count() const { return Edges.size() - 1; }
 
-    MachineBasicBlock *getKey() const {
-      return Edges.back();
-    }
+    MachineBasicBlock *getKey() const { return Edges.back(); }
   };
 
   if (TriangleChainCount == 0)
@@ -1326,7 +1333,7 @@ void MachineBlockPlacement::precomputeTriangleChains() {
     bool CanTailDuplicate = true;
     // If PDom can't tail-duplicate into it's non-BB predecessors, then this
     // isn't the kind of triangle we're looking for.
-    for (MachineBasicBlock* Pred : PDom->predecessors()) {
+    for (MachineBasicBlock *Pred : PDom->predecessors()) {
       if (Pred == &BB)
         continue;
       if (!TailDup.canTailDuplicate(PDom, Pred)) {
@@ -1386,8 +1393,8 @@ void MachineBlockPlacement::precomputeTriangleChains() {
 
 // When profile is not present, return the StaticLikelyProb.
 // When profile is available, we need to handle the triangle-shape CFG.
-static BranchProbability getLayoutSuccessorProbThreshold(
-      const MachineBasicBlock *BB) {
+static BranchProbability
+getLayoutSuccessorProbThreshold(const MachineBasicBlock *BB) {
   if (!BB->getParent()->getFunction().hasProfileData())
     return BranchProbability(StaticLikelyProb, 100);
   if (BB->succ_size() == 2) {
@@ -1551,8 +1558,8 @@ bool MachineBlockPlacement::hasBetterLayoutPredecessor(
   for (MachineBasicBlock *Pred : Succ->predecessors()) {
     BlockChain *PredChain = BlockToChain[Pred];
     if (Pred == Succ || PredChain == &SuccChain ||
-        (BlockFilter && !BlockFilter->count(Pred)) ||
-        PredChain == &Chain || Pred != *std::prev(PredChain->end()) ||
+        (BlockFilter && !BlockFilter->count(Pred)) || PredChain == &Chain ||
+        Pred != *std::prev(PredChain->end()) ||
         // This check is redundant except for look ahead. This function is
         // called for lookahead by isProfitableToTailDup when BB hasn't been
         // placed yet.
@@ -1599,12 +1606,12 @@ bool MachineBlockPlacement::hasBetterLayoutPredecessor(
 /// \returns The best successor block found, or null if none are viable, along
 /// with a boolean indicating if tail duplication is necessary.
 MachineBlockPlacement::BlockAndTailDupResult
-MachineBlockPlacement::selectBestSuccessor(
-    const MachineBasicBlock *BB, const BlockChain &Chain,
-    const BlockFilterSet *BlockFilter) {
+MachineBlockPlacement::selectBestSuccessor(const MachineBasicBlock *BB,
+                                           const BlockChain &Chain,
+                                           const BlockFilterSet *BlockFilter) {
   const BranchProbability HotProb(StaticLikelyProb, 100);
 
-  BlockAndTailDupResult BestSucc = { nullptr, false };
+  BlockAndTailDupResult BestSucc = {nullptr, false};
   auto BestProb = BranchProbability::getZero();
 
   SmallVector<MachineBasicBlock *, 4> Successors;
@@ -1684,8 +1691,8 @@ MachineBlockPlacement::selectBestSuccessor(
     std::tie(DupProb, Succ) = Tup;
     if (DupProb < BestProb)
       break;
-    if (canTailDuplicateUnplacedPreds(BB, Succ, Chain, BlockFilter)
-        && (isProfitableToTailDup(BB, Succ, BestProb, Chain, BlockFilter))) {
+    if (canTailDuplicateUnplacedPreds(BB, Succ, Chain, BlockFilter) &&
+        (isProfitableToTailDup(BB, Succ, BestProb, Chain, BlockFilter))) {
       LLVM_DEBUG(dbgs() << "    Candidate: " << getBlockName(Succ)
                         << ", probability: " << DupProb
                         << " (Tail Duplicate)\n");
@@ -1822,8 +1829,7 @@ MachineBasicBlock *MachineBlockPlacement::getFirstUnplacedBlock(
 }
 
 void MachineBlockPlacement::fillWorkLists(
-    const MachineBasicBlock *MBB,
-    SmallPtrSetImpl<BlockChain *> &UpdatedPreds,
+    const MachineBasicBlock *MBB, SmallPtrSetImpl<BlockChain *> &UpdatedPreds,
     const BlockFilterSet *Block...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/109711


More information about the llvm-commits mailing list