[llvm] 9016f27 - [CodeLayout] Size-aware machine block placement (#109711)

Wed Oct 2 10:48:12 PDT 2024

Author: spupyrev
Date: 2024-10-02T10:48:08-07:00
New Revision: 9016f27c4253ac5c6282c30f0fe08ddd58522cdd

URL: https://github.com/llvm/llvm-project/commit/9016f27c4253ac5c6282c30f0fe08ddd58522cdd
DIFF: https://github.com/llvm/llvm-project/commit/9016f27c4253ac5c6282c30f0fe08ddd58522cdd.diff

LOG: [CodeLayout] Size-aware machine block placement (#109711)

This is an implementation of a new "size-aware" machine block placement.
The
idea is to reorder blocks so that the number of fall-through jumps is
maximized.
Observe that profile data is ignored for the optimization, and it is
applied only
for instances with hasOptSize()=true.
This strategy has two benefits:
(i) it eliminates jump instructions, which results in smaller text size;
(ii) we avoid using profile data while reordering blocks, which yields
more
"uniform" functions, thus helping ICF and machine outliner/merger.

For large (mobile) apps, the size benefits of (i) and (ii) are roughly
the same,
combined providing up to 0.5% uncompressed and up to 1% compressed
savings size
on top of the current solution.

The optimization is turned off by default.

Added: 
    llvm/test/CodeGen/X86/code_placement_ext_tsp_size.ll

Modified: 
    llvm/lib/CodeGen/MachineBlockPlacement.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/lib/CodeGen/MachineBlockPlacement.cpp b/llvm/lib/CodeGen/MachineBlockPlacement.cpp
index 7807875c06584c..c42e63202c3b5a 100644

--- a/llvm/lib/CodeGen/MachineBlockPlacement.cpp
+++ b/llvm/lib/CodeGen/MachineBlockPlacement.cpp
@@ -218,6 +218,11 @@ static cl::opt<unsigned> ExtTspBlockPlacementMaxBlocks(
              "block placement."),
     cl::init(UINT_MAX), cl::Hidden);
 
+// Apply the ext-tsp algorithm minimizing the size of a binary.
+static cl::opt<bool>
+    ApplyExtTspForSize("apply-ext-tsp-for-size", cl::init(false), cl::Hidden,
+                       cl::desc("Use ext-tsp for size-aware block placement."));
+
 namespace llvm {
 extern cl::opt<bool> EnableExtTspBlockPlacement;
 extern cl::opt<bool> ApplyExtTspWithoutProfile;
@@ -595,7 +600,7 @@ class MachineBlockPlacement : public MachineFunctionPass {
   void precomputeTriangleChains();
 
   /// Apply a post-processing step optimizing block placement.
-  void applyExtTsp();
+  void applyExtTsp(bool OptForSize);
 
   /// Modify the existing block placement in the function and adjust all jumps.
   void assignBlockOrder(const std::vector<const MachineBasicBlock *> &NewOrder);
@@ -3505,20 +3510,36 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
   // Initialize tail duplication thresholds.
   initTailDupThreshold();
 
+  const bool OptForSize =
+      MF.getFunction().hasOptSize() ||
+      llvm::shouldOptimizeForSize(&MF, PSI, &MBFI->getMBFI());
+  // Determine whether to use ext-tsp for perf/size optimization. The method
+  // is beneficial only for instances with at least 3 basic blocks and it can be
+  // disabled for huge functions (exceeding a certain size).
+  bool UseExtTspForPerf = false;
+  bool UseExtTspForSize = false;
+  if (3 <= MF.size() && MF.size() <= ExtTspBlockPlacementMaxBlocks) {
+    UseExtTspForPerf =
+        EnableExtTspBlockPlacement &&
+        (ApplyExtTspWithoutProfile || MF.getFunction().hasProfileData());
+    UseExtTspForSize = OptForSize && ApplyExtTspForSize;
+  }
+
   // Apply tail duplication.
   if (allowTailDupPlacement()) {
     MPDT = &getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
-    bool OptForSize = MF.getFunction().hasOptSize() ||
-                      llvm::shouldOptimizeForSize(&MF, PSI, &MBFI->getMBFI());
     if (OptForSize)
       TailDupSize = 1;
     const bool PreRegAlloc = false;
     TailDup.initMF(MF, PreRegAlloc, MBPI, MBFI.get(), PSI,
                    /* LayoutMode */ true, TailDupSize);
-    precomputeTriangleChains();
+    if (!UseExtTspForSize)
+      precomputeTriangleChains();
   }
 
-  buildCFGChains();
+  // Run the main block placement.
+  if (!UseExtTspForSize)
+    buildCFGChains();
 
   // Changing the layout can create new tail merging opportunities.
   // TailMerge can create jump into if branches that make CFG irreducible for
@@ -3545,14 +3566,14 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
     }
   }
 
-  // Apply a post-processing optimizing block placement.
-  if (MF.size() >= 3 && EnableExtTspBlockPlacement &&
-      (ApplyExtTspWithoutProfile || MF.getFunction().hasProfileData()) &&
-      MF.size() <= ExtTspBlockPlacementMaxBlocks) {
-    // Find a new placement and modify the layout of the blocks in the function.
-    applyExtTsp();
-
-    // Re-create CFG chain so that we can optimizeBranches and alignBlocks.
+  // Apply a post-processing optimizing block placement:
+  // - find a new placement and modify the layout of the blocks in the function;
+  // - re-create CFG chains so that we can optimizeBranches and alignBlocks.
+  if (UseExtTspForPerf || UseExtTspForSize) {
+    assert(
+        !(UseExtTspForPerf && UseExtTspForSize) &&
+        "UseExtTspForPerf and UseExtTspForSize can not be set simultaneosly");
+    applyExtTsp(/*OptForSize=*/UseExtTspForSize);
     createCFGChainExtTsp();
   }
 
@@ -3577,7 +3598,7 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
   return true;
 }
 
-void MachineBlockPlacement::applyExtTsp() {
+void MachineBlockPlacement::applyExtTsp(bool OptForSize) {
   // Prepare data; blocks are indexed by their index in the current ordering.
   DenseMap<const MachineBasicBlock *, uint64_t> BlockIndex;
   BlockIndex.reserve(F->size());
@@ -3589,13 +3610,15 @@ void MachineBlockPlacement::applyExtTsp() {
     CurrentBlockOrder.push_back(&MBB);
   }
 
-  auto BlockSizes = std::vector<uint64_t>(F->size());
-  auto BlockCounts = std::vector<uint64_t>(F->size());
-  std::vector<codelayout::EdgeCount> JumpCounts;
+  SmallVector<uint64_t, 0> BlockCounts(F->size());
+  SmallVector<uint64_t, 0> BlockSizes(F->size());
+  SmallVector<codelayout::EdgeCount, 0> JumpCounts;
+  SmallVector<MachineOperand, 4> Cond; // For analyzeBranch.
+  SmallVector<const MachineBasicBlock *, 4> Succs;
   for (MachineBasicBlock &MBB : *F) {
     // Getting the block frequency.
     BlockFrequency BlockFreq = MBFI->getBlockFreq(&MBB);
-    BlockCounts[BlockIndex[&MBB]] = BlockFreq.getFrequency();
+    BlockCounts[BlockIndex[&MBB]] = OptForSize ? 1 : BlockFreq.getFrequency();
     // Getting the block size:
     // - approximate the size of an instruction by 4 bytes, and
     // - ignore debug instructions.
@@ -3604,23 +3627,49 @@ void MachineBlockPlacement::applyExtTsp() {
     // not see a perf improvement with the exact block sizes.
     auto NonDbgInsts =
         instructionsWithoutDebug(MBB.instr_begin(), MBB.instr_end());
-    int NumInsts = std::distance(NonDbgInsts.begin(), NonDbgInsts.end());
+    size_t NumInsts = std::distance(NonDbgInsts.begin(), NonDbgInsts.end());
     BlockSizes[BlockIndex[&MBB]] = 4 * NumInsts;
+
     // Getting jump frequencies.
-    for (MachineBasicBlock *Succ : MBB.successors()) {
-      auto EP = MBPI->getEdgeProbability(&MBB, Succ);
-      BlockFrequency JumpFreq = BlockFreq * EP;
-      JumpCounts.push_back(
-          {BlockIndex[&MBB], BlockIndex[Succ], JumpFreq.getFrequency()});
+    if (OptForSize) {
+      Cond.clear();
+      MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch.
+      if (TII->analyzeBranch(MBB, TBB, FBB, Cond))
+        continue;
+
+      const MachineBasicBlock *FTB = MBB.getFallThrough();
+      // Succs is a collection of distinct destinations of the block reachable
+      // from MBB via a jump instruction; initialize the list using the three
+      // (non-necessarily distinct) blocks, FTB, TBB, and FBB.
+      Succs.clear();
+      if (TBB && TBB != FTB)
+        Succs.push_back(TBB);
+      if (FBB && FBB != FTB)
+        Succs.push_back(FBB);
+      if (FTB)
+        Succs.push_back(FTB);
+      // Absolute magnitude of non-zero counts does not matter for the
+      // optimization; prioritize slightly jumps with a single successor, since
+      // the corresponding jump instruction will be removed from the binary.
+      const uint64_t Freq = Succs.size() == 1 ? 110 : 100;
+      for (const MachineBasicBlock *Succ : Succs)
+        JumpCounts.push_back({BlockIndex[&MBB], BlockIndex[Succ], Freq});
+    } else {
+      for (MachineBasicBlock *Succ : MBB.successors()) {
+        auto EP = MBPI->getEdgeProbability(&MBB, Succ);
+        BlockFrequency JumpFreq = BlockFreq * EP;
+        JumpCounts.push_back(
+            {BlockIndex[&MBB], BlockIndex[Succ], JumpFreq.getFrequency()});
+      }
     }
   }
 
   LLVM_DEBUG(dbgs() << "Applying ext-tsp layout for |V| = " << F->size()
                     << " with profile = " << F->getFunction().hasProfileData()
-                    << " (" << F->getName().str() << ")"
-                    << "\n");
-  LLVM_DEBUG(dbgs() << format("  original  layout score: %0.2f\n",
-                              calcExtTspScore(BlockSizes, JumpCounts)));
+                    << " (" << F->getName() << ")" << "\n");
+
+  const double OrgScore = calcExtTspScore(BlockSizes, JumpCounts);
+  LLVM_DEBUG(dbgs() << format("  original  layout score: %0.2f\n", OrgScore));
 
   // Run the layout algorithm.
   auto NewOrder = computeExtTspLayout(BlockSizes, BlockCounts, JumpCounts);
@@ -3629,12 +3678,14 @@ void MachineBlockPlacement::applyExtTsp() {
   for (uint64_t Node : NewOrder) {
     NewBlockOrder.push_back(CurrentBlockOrder[Node]);
   }
-  LLVM_DEBUG(
-      dbgs() << format("  optimized layout score: %0.2f\n",
-                       calcExtTspScore(NewOrder, BlockSizes, JumpCounts)));
+  const double OptScore = calcExtTspScore(NewOrder, BlockSizes, JumpCounts);
+  LLVM_DEBUG(dbgs() << format("  optimized layout score: %0.2f\n", OptScore));
 
-  // Assign new block order.
-  assignBlockOrder(NewBlockOrder);
+  // If the optimization is unsuccessful, fall back to the original block order.
+  if (OptForSize && OrgScore > OptScore)
+    assignBlockOrder(CurrentBlockOrder);
+  else
+    assignBlockOrder(NewBlockOrder);
 }
 
 void MachineBlockPlacement::assignBlockOrder(

diff  --git a/llvm/test/CodeGen/X86/code_placement_ext_tsp_size.ll b/llvm/test/CodeGen/X86/code_placement_ext_tsp_size.ll
new file mode 100644
index 00000000000000..59eaf2586f1737
--- /dev/null
+++ b/llvm/test/CodeGen/X86/code_placement_ext_tsp_size.ll
@@ -0,0 +1,131 @@
+; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux -apply-ext-tsp-for-size=true  < %s | FileCheck %s -check-prefix=CHECK-PERF
+; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux -apply-ext-tsp-for-size=false < %s | FileCheck %s -check-prefix=CHECK-SIZE
+
+define void @func1() minsize {
+;
+; +-----+
+; | b0  | -+
+; +-----+  |
+;   |      |
+;   | 10   |
+;   v      |
+; +-----+  |
+; | b1  |  | 10000
+; +-----+  |
+;   |      |
+;   | 10   |
+;   v      |
+; +-----+  |
+; | b2  | <+
+; +-----+
+;
+; CHECK-PERF-LABEL: func1:
+; CHECK-PERF: %b0
+; CHECK-PERF: %b1
+; CHECK-PERF: %b2
+;
+; CHECK-SIZE-LABEL: func1:
+; CHECK-SIZE: %b0
+; CHECK-SIZE: %b2
+; CHECK-SIZE: %b1
+
+b0:
+  %call = call zeroext i1 @a()
+  br i1 %call, label %b1, label %b2, !prof !1
+
+b1:
+  call void @d()
+  call void @d()
+  call void @d()
+  br label %b2
+
+b2:
+  call void @e()
+  ret void
+}
+
+define void @func_loop() minsize !prof !9 {
+; Test that the algorithm can rotate loops in the presence of profile data.
+;
+;                  +--------+
+;                  | entry  |
+;                  +--------+
+;                    |
+;                    | 1
+;                    v
+; +--------+  16   +--------+
+; | if.then| <---- | header | <+
+; +--------+       +--------+  |
+;   |                |         |
+;   |                | 160     |
+;   |                v         |
+;   |              +--------+  |
+;   |              | if.else|  | 175
+;   |              +--------+  |
+;   |                |         |
+;   |                | 160     |
+;   |                v         |
+;   |        16    +--------+  |
+;   +------------> | if.end | -+
+;                  +--------+
+;                    |
+;                    | 1
+;                    v
+;                  +--------+
+;                  |  end   |
+;                  +--------+
+;
+; CHECK-PERF-LABEL: func_loop:
+; CHECK-PERF: %entry
+; CHECK-PERF: %header
+; CHECK-PERF: %if.then
+; CHECK-PERF: %if.else
+; CHECK-PERF: %if.end
+; CHECK-PERF: %end
+;
+; CHECK-SIZE-LABEL: func_loop:
+; CHECK-SIZE: %entry
+; CHECK-SIZE: %header
+; CHECK-SIZE: %if.else
+; CHECK-SIZE: %if.end
+; CHECK-SIZE: %if.then
+; CHECK-SIZE: %end
+
+entry:
+  br label %header
+
+header:
+  call void @e()
+  %call = call zeroext i1 @a()
+  br i1 %call, label %if.then, label %if.else, !prof !10
+
+if.then:
+  call void @f()
+  br label %if.end
+
+if.else:
+  call void @g()
+  br label %if.end
+
+if.end:
+  call void @h()
+  %call2 = call zeroext i1 @a()
+  br i1 %call2, label %header, label %end
+
+end:
+  ret void
+}
+
+
+declare zeroext i1 @a()
+declare void @b()
+declare void @c()
+declare void @d()
+declare void @e()
+declare void @g()
+declare void @f()
+declare void @h()
+
+!1 = !{!"branch_weights", i32 10, i32 10000}
+!9 = !{!"function_entry_count", i64 1}
+!10 = !{!"branch_weights", i32 16, i32 160}