[llvm] [BOLT][X86]Redirect never-taken jumps (PR #113923)

via llvm-commits llvm-commits at lists.llvm.org
Mon Oct 28 12:08:33 PDT 2024


https://github.com/ShatianWang updated https://github.com/llvm/llvm-project/pull/113923

>From 5e4642274f0360bbc0904feabace8c72c163d8b8 Mon Sep 17 00:00:00 2001
From: Shatian Wang <shatian at meta.com>
Date: Wed, 9 Oct 2024 14:45:50 -0700
Subject: [PATCH 1/2] [BOLT][X86]Redirect never-taken jumps

---
 bolt/include/bolt/Core/BinaryBasicBlock.h     |  14 +-
 .../bolt/Passes/RedirectNeverTakenJumps.h     |  58 ++
 bolt/lib/Core/BinaryBasicBlock.cpp            |  73 +++
 bolt/lib/Passes/CMakeLists.txt                |   1 +
 bolt/lib/Passes/RedirectNeverTakenJumps.cpp   | 503 ++++++++++++++++++
 bolt/lib/Rewrite/BinaryPassManager.cpp        |   9 +
 bolt/test/X86/redirect-never-taken-jumps.s    |  86 +++
 7 files changed, 742 insertions(+), 2 deletions(-)
 create mode 100644 bolt/include/bolt/Passes/RedirectNeverTakenJumps.h
 create mode 100644 bolt/lib/Passes/RedirectNeverTakenJumps.cpp
 create mode 100644 bolt/test/X86/redirect-never-taken-jumps.s

diff --git a/bolt/include/bolt/Core/BinaryBasicBlock.h b/bolt/include/bolt/Core/BinaryBasicBlock.h
index b4f31cf2bae6f6..df4c3f3a20f23f 100644
--- a/bolt/include/bolt/Core/BinaryBasicBlock.h
+++ b/bolt/include/bolt/Core/BinaryBasicBlock.h
@@ -789,13 +789,23 @@ class BinaryBasicBlock {
     return SplitInst;
   }
 
-  /// Split basic block at the instruction pointed to by II.
+  /// Split basic block at the instruction pointed to by II that is
+  /// not after any branch instructions in the basic block.
   /// All iterators pointing after II get invalidated.
   ///
   /// Return the new basic block that starts with the instruction
-  /// at the split point.
+  /// at the split point, which has been inserted at the end of the
+  /// current function.
   BinaryBasicBlock *splitAt(iterator II);
 
+  /// Split basic block in place at the instruction pointed to by II.
+  /// All iterators pointing after II get invalidated.
+  ///
+  /// Return the new basic block that starts with the instruction
+  /// at the split point, which has been inserted right after the
+  /// current basic block in the current function.
+  BinaryBasicBlock *splitInPlaceAt(iterator II);
+
   /// Set start offset of this basic block in the input binary.
   void setOffset(uint32_t Offset) { InputRange.first = Offset; };
 
diff --git a/bolt/include/bolt/Passes/RedirectNeverTakenJumps.h b/bolt/include/bolt/Passes/RedirectNeverTakenJumps.h
new file mode 100644
index 00000000000000..d85eb5fb0fdf35
--- /dev/null
+++ b/bolt/include/bolt/Passes/RedirectNeverTakenJumps.h
@@ -0,0 +1,58 @@
+//===- bolt/Passes/RedirectNeverTakenJumps.h - Code size reduction --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass reduces code size in X86 by redirecting never-taken jumps that take
+// 5 or 6 bytes to nearby jumps with the same jump target and compatible
+// condition codes. Doing each such redirection will save 3 or 4 bytes depending
+// on if the redirected jump is unconditional or conditional, since a short jump
+// takes only 2 bytes. The pass can be turned on with BOLT option
+// -redirect-never-taken-jumps.
+//
+// There are two modes for classifying "never-taken" jumps: aggressive and
+// conservative. The aggressive mode classifies any jump with zero execution
+// count as never-taken, and can be turned on with BOLT option
+// -aggressive-never-taken. The conservative mode is used by default and
+// accounts for potential errors in the input profile. It infers if a jump with
+// zero execution count is actually never-taken by checking the gap between the
+// inflow (resp. outflow) and block execution count for each basic block.
+// The conservativeness is controlled by BOLT option
+// -conservative-never-taken-threshold. The smaller the threshold, the more
+// conservative the classification is. In most realistic settings, the value
+// should exceed 1.0. The current default is 1.25.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef BOLT_PASSES_REDIRECT_NEVER_TAKEN_JUMPS_H
+#define BOLT_PASSES_REDIRECT_NEVER_TAKEN_JUMPS_H
+
+#include "bolt/Passes/BinaryPasses.h"
+#include <atomic>
+
+namespace llvm {
+namespace bolt {
+
+class RedirectNeverTakenJumps : public BinaryFunctionPass {
+private:
+  std::atomic<uint64_t> TotalHotSizeSavings{0ull};
+  std::atomic<uint64_t> TotalSizeSavings{0ull};
+
+public:
+  explicit RedirectNeverTakenJumps(const cl::opt<bool> &PrintPass)
+      : BinaryFunctionPass(PrintPass) {}
+
+  const char *getName() const override { return "redirect-never-taken-jumps"; }
+
+  Error runOnFunctions(BinaryContext &BC) override;
+
+  void performRedirections(BinaryFunction &Function);
+};
+
+} // namespace bolt
+} // namespace llvm
+
+#endif
diff --git a/bolt/lib/Core/BinaryBasicBlock.cpp b/bolt/lib/Core/BinaryBasicBlock.cpp
index 2a2192b79bb4bf..4e87865f6a210f 100644
--- a/bolt/lib/Core/BinaryBasicBlock.cpp
+++ b/bolt/lib/Core/BinaryBasicBlock.cpp
@@ -572,5 +572,78 @@ BinaryBasicBlock *BinaryBasicBlock::splitAt(iterator II) {
   return NewBlock;
 }
 
+BinaryBasicBlock *BinaryBasicBlock::splitInPlaceAt(iterator II) {
+  assert(II != end() && "expected iterator pointing to instruction");
+  if (II == begin())
+    return this;
+  const BinaryContext &BC = Function->getBinaryContext();
+  std::vector<std::unique_ptr<BinaryBasicBlock>> ToAdd;
+  ToAdd.emplace_back(getFunction()->createBasicBlock());
+  BinaryBasicBlock *BBNew = ToAdd.back().get();
+  uint64_t BBNewExecCount = 0;
+
+  // Find successors of the current block that needs to be moved.
+  BinaryBasicBlock *CondSuccessor = nullptr;
+  BinaryBasicBlock::BinaryBranchInfo CondSuccessorBI;
+  BinaryBasicBlock *UncondSuccessor = nullptr;
+  BinaryBasicBlock::BinaryBranchInfo UncondSuccessorBI;
+  auto I = end();
+  while (I != II) {
+    --I;
+    if (BC.MIB->isUnconditionalBranch(*I)) {
+      const MCSymbol *TargetSymbol = BC.MIB->getTargetSymbol(*I);
+      UncondSuccessor = getSuccessor(TargetSymbol, UncondSuccessorBI);
+    } else if (BC.MIB->isConditionalBranch(*I)) {
+      const MCSymbol *TargetSymbol = BC.MIB->getTargetSymbol(*I);
+      CondSuccessor = getSuccessor(TargetSymbol, CondSuccessorBI);
+    }
+  }
+
+  // Adjust successors of the current and the new blocks.
+  if (CondSuccessor != nullptr) {
+    BBNew->addSuccessor(CondSuccessor, CondSuccessorBI);
+    BBNewExecCount +=
+        CondSuccessorBI.Count != BinaryBasicBlock::COUNT_NO_PROFILE
+            ? CondSuccessorBI.Count
+            : 0;
+    removeSuccessor(CondSuccessor);
+  }
+  if (UncondSuccessor != nullptr) {
+    BBNew->addSuccessor(UncondSuccessor, UncondSuccessorBI);
+    BBNewExecCount +=
+        UncondSuccessorBI.Count != BinaryBasicBlock::COUNT_NO_PROFILE
+            ? UncondSuccessorBI.Count
+            : 0;
+    removeSuccessor(UncondSuccessor);
+  } else { // Fall through.
+    BinaryBasicBlock *NextBB =
+        Function->getLayout().getBasicBlockAfter(this, /*IgnoreSplits=*/false);
+    assert(NextBB);
+    if (getSuccessor(NextBB->getLabel())) {
+      const BinaryBasicBlock::BinaryBranchInfo &BI = getBranchInfo(*NextBB);
+      BBNew->addSuccessor(NextBB, BI);
+      BBNewExecCount +=
+          BI.Count != BinaryBasicBlock::COUNT_NO_PROFILE ? BI.Count : 0;
+      removeSuccessor(NextBB);
+    }
+  }
+  addSuccessor(BBNew, BBNewExecCount, 0);
+  BBNew->setExecutionCount(BBNewExecCount);
+
+  // Set correct CFI state for the new block.
+  BBNew->setCFIState(getCFIStateAtInstr(&*II));
+
+  // Move instructions over.
+  adjustNumPseudos(II, end(), -1);
+  BBNew->addInstructions(II, end());
+  Instructions.erase(II, end());
+
+  // Insert new block after the current block.
+  getFunction()->insertBasicBlocks(
+      this, std::move(ToAdd), /*UpdateLayout*/ true, /*UpdateCFIState*/ true,
+      /*RecomputeLandingPads*/ false);
+  return BBNew;
+}
+
 } // namespace bolt
 } // namespace llvm
diff --git a/bolt/lib/Passes/CMakeLists.txt b/bolt/lib/Passes/CMakeLists.txt
index 1c1273b3d2420d..1b64d3e1d0b9e9 100644
--- a/bolt/lib/Passes/CMakeLists.txt
+++ b/bolt/lib/Passes/CMakeLists.txt
@@ -27,6 +27,7 @@ add_llvm_library(LLVMBOLTPasses
   PettisAndHansen.cpp
   PLTCall.cpp
   ContinuityStats.cpp
+  RedirectNeverTakenJumps.cpp
   RegAnalysis.cpp
   RegReAssign.cpp
   ReorderAlgorithm.cpp
diff --git a/bolt/lib/Passes/RedirectNeverTakenJumps.cpp b/bolt/lib/Passes/RedirectNeverTakenJumps.cpp
new file mode 100644
index 00000000000000..418f055beff306
--- /dev/null
+++ b/bolt/lib/Passes/RedirectNeverTakenJumps.cpp
@@ -0,0 +1,503 @@
+//===- bolt/Passes/RedirectNeverTakenJumps.cpp - Code size reduction ------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements RedirectNeverTakenJumps class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "bolt/Passes/RedirectNeverTakenJumps.h"
+#include "bolt/Core/ParallelUtilities.h"
+
+using namespace llvm;
+using namespace bolt;
+
+namespace opts {
+extern cl::OptionCategory BoltOptCategory;
+
+static cl::opt<bool> RedirectNeverTakenJumps(
+    "redirect-never-taken-jumps",
+    cl::desc("Apply a heuristic to redirect never taken jumps in order to "
+             "reduce hot code size (X86 only)"),
+    cl::Hidden, cl::init(false), cl::cat(BoltOptCategory));
+
+static cl::opt<bool> AggressiveNeverTaken(
+    "aggressive-never-taken",
+    cl::desc("Classify all zero-execution-count jumps as never taken. This "
+             "option ignores the possibility of execution counts of hot jumps "
+             "being incorrectly set to 0 in the input profile"),
+    cl::ReallyHidden, cl::init(false), cl::cat(BoltOptCategory));
+
+static cl::opt<double> ConservativeNeverTakenThreshold(
+    "conservative-never-taken-threshold",
+    cl::desc(
+        "When aggressive-never-taken=0 (default), this value controls how "
+        "conservative the classification of never-taken jumps is. The smaller "
+        "the value the more conservative the classification. In most realistic "
+        "settings, the value should exceed 1.0. Default 1.25."),
+    cl::ZeroOrMore, cl::init(1.25), cl::ReallyHidden, cl::cat(BoltOptCategory));
+} // namespace opts
+
+namespace {
+/// A jump instruction in the binary.
+struct JumpT {
+  JumpT(const JumpT &) = delete;
+  JumpT(JumpT &&) = default;
+  JumpT &operator=(const JumpT &) = delete;
+  JumpT &operator=(JumpT &&) = default;
+
+  explicit JumpT(MCInst *Inst, unsigned CC, bool IsUnconditional,
+                 BinaryBasicBlock *OriginalTargetBB, uint64_t ExecutionCount,
+                 BinaryBasicBlock *HomeBB, uint64_t OriginalAddress,
+                 uint64_t OriginalInstrSize)
+      : Inst(Inst), CC(CC), IsUnconditional(IsUnconditional),
+        OriginalTargetBB(OriginalTargetBB), ExecutionCount(ExecutionCount),
+        HomeBB(HomeBB), OriginalAddress(OriginalAddress),
+        OriginalInstrSize(OriginalInstrSize) {}
+
+  MCInst *Inst;
+  unsigned CC;
+  bool IsUnconditional;
+  BinaryBasicBlock *OriginalTargetBB;
+  uint64_t ExecutionCount;
+  BinaryBasicBlock *HomeBB;
+  uint64_t OriginalAddress{0};
+  uint8_t OriginalInstrSize{0};
+
+  bool IsLongNeverTaken{false};
+  bool IsRedirectionTarget{false};
+  JumpT *RedirectionTarget{nullptr};
+  JumpT *UncondJumpInSameBlock{nullptr};
+};
+
+using Jumps = std::vector<std::unique_ptr<JumpT>>;
+using JumpPtrs = std::vector<JumpT *>;
+using FlowMapTy = std::unordered_map<const BinaryBasicBlock *, uint64_t>;
+using BlockToJumpsMapTy =
+    std::unordered_map<BinaryBasicBlock *, std::vector<JumpT *>>;
+
+/// Size of jump instructions in bytes in X86.
+static constexpr uint8_t ShortJumpSize = 2;
+static constexpr uint8_t LongUncondJumpSize = 5;
+static constexpr uint8_t LongCondJumpSize = 6;
+
+/// The longest distance for any short jump on X86.
+static constexpr uint8_t ShortJumpBits = 8;
+static constexpr uint8_t ShortestJumpSpan = 1ULL << (ShortJumpBits - 1);
+
+bool isLongJump(const uint64_t JumpStartAddr, const uint64_t JumpEndAddr,
+                const bool SameFragment) {
+  if (!SameFragment)
+    return true;
+  if (JumpEndAddr > JumpStartAddr)
+    return JumpEndAddr - JumpStartAddr > ShortestJumpSpan - 1;
+  else
+    return JumpStartAddr - JumpEndAddr > ShortestJumpSpan;
+}
+
+void createJumps(BinaryFunction &Function, FunctionFragment &Fragment,
+                 Jumps &JumpsInFunction, JumpPtrs &JumpsInFragment) {
+  const BinaryContext &BC = Function.getBinaryContext();
+
+  auto createJump = [&](MCInst *Branch, bool IsUnconditional,
+                        BinaryBasicBlock *SourceBB, BinaryBasicBlock *TargetBB,
+                        const uint8_t OffsetFromBlockEnd) {
+    const BinaryBasicBlock::BinaryBranchInfo &BI =
+        SourceBB->getBranchInfo(*TargetBB);
+    uint64_t ExecCount = 0;
+    if (BI.Count != BinaryBasicBlock::COUNT_NO_PROFILE)
+      ExecCount = BI.Count;
+
+    const uint64_t JumpEndAddr = TargetBB->getOutputStartAddress();
+    const uint64_t JumpStartAddr =
+        SourceBB->getOutputEndAddress() - OffsetFromBlockEnd;
+    const uint8_t LongJumpSize =
+        IsUnconditional ? LongUncondJumpSize : LongCondJumpSize;
+    const uint8_t JumpInstrSize =
+        isLongJump(JumpStartAddr, JumpEndAddr,
+                   SourceBB->getFragmentNum() == TargetBB->getFragmentNum())
+            ? LongJumpSize
+            : ShortJumpSize;
+    return std::unique_ptr<JumpT>(new JumpT(
+        Branch, BC.MIB->getCondCode(*Branch), IsUnconditional, TargetBB,
+        ExecCount, SourceBB, JumpStartAddr - JumpInstrSize, JumpInstrSize));
+  };
+
+  for (BinaryBasicBlock *BB : Fragment) {
+    const MCSymbol *TBB = nullptr;
+    const MCSymbol *FBB = nullptr;
+    MCInst *CondBranch = nullptr;
+    MCInst *UncondBranch = nullptr;
+    BinaryBasicBlock *CondSuccessor = nullptr;
+    BinaryBasicBlock *UncondSuccessor = nullptr;
+
+    if (BB->analyzeBranch(TBB, FBB, CondBranch, UncondBranch)) {
+      if (BB->succ_size() == 1) {
+        UncondSuccessor = BB->getSuccessor();
+        if (UncondBranch != nullptr) {
+          std::unique_ptr<JumpT> Jump =
+              createJump(UncondBranch, true, BB, UncondSuccessor, 0);
+          JumpsInFragment.push_back(Jump.get());
+          JumpsInFunction.push_back(std::move(Jump));
+        }
+      } else if (BB->succ_size() == 2) {
+        assert(CondBranch != nullptr);
+        CondSuccessor = BB->getConditionalSuccessor(true);
+        UncondSuccessor = BB->getConditionalSuccessor(false);
+        std::unique_ptr<JumpT> UncondJump = nullptr;
+        std::unique_ptr<JumpT> CondJump = nullptr;
+        uint8_t UncondJumpInstrSize = 0;
+        if (UncondBranch != nullptr) {
+          UncondJump = createJump(UncondBranch, true, BB, UncondSuccessor, 0);
+          UncondJumpInstrSize = UncondJump->OriginalInstrSize;
+        }
+        if (!BC.MIB->isDynamicBranch(*CondBranch)) {
+          CondJump = createJump(CondBranch, false, BB, CondSuccessor,
+                                UncondJumpInstrSize);
+          if (UncondJump != nullptr)
+            CondJump->UncondJumpInSameBlock = UncondJump.get();
+        }
+        if (CondJump != nullptr) {
+          JumpsInFragment.push_back(CondJump.get());
+          JumpsInFunction.push_back(std::move(CondJump));
+        }
+        if (UncondJump != nullptr) {
+          JumpsInFragment.push_back(UncondJump.get());
+          JumpsInFunction.push_back(std::move(UncondJump));
+        }
+      }
+    }
+  }
+}
+
+void identifyCandidates(BinaryFunction &Function, JumpPtrs &JumpsInFragment,
+                        BlockToJumpsMapTy &TargetsToJumps) {
+  // Identify jumps that are long and never taken.
+  // First check if each jump is long and have zero execution count.
+  auto isLongZeroCount = [&](const JumpT &Jump) {
+    return Jump.ExecutionCount == 0 && Jump.OriginalInstrSize > ShortJumpSize;
+    ;
+  };
+
+  BlockToJumpsMapTy SourcesToJumps;
+  for (JumpT *Jump : JumpsInFragment) {
+    Jump->IsLongNeverTaken = isLongZeroCount(*Jump);
+    assert(Jump->OriginalTargetBB != nullptr);
+    TargetsToJumps[Jump->OriginalTargetBB].push_back(Jump);
+    SourcesToJumps[Jump->HomeBB].push_back(Jump);
+  }
+
+  // Next identify zero-execution-count jumps that are unlikely to actually be
+  // never-taken by comparing the value of inflow (resp outflow) of each basic
+  // block with its block execution count.
+  FlowMapTy IncomingMap;
+  FlowMapTy OutgoingMap;
+  for (const BinaryBasicBlock &BB : Function) {
+    auto SuccBIIter = BB.branch_info_begin();
+    for (BinaryBasicBlock *Succ : BB.successors()) {
+      const uint64_t Count = SuccBIIter->Count;
+      if (Count == BinaryBasicBlock::COUNT_NO_PROFILE || Count == 0) {
+        ++SuccBIIter;
+        continue;
+      }
+      IncomingMap[Succ] += Count;
+      OutgoingMap[&BB] += Count;
+      ++SuccBIIter;
+    }
+  }
+
+  if (!opts::AggressiveNeverTaken) {
+    for (auto &TargetToJumps : TargetsToJumps) {
+      const BinaryBasicBlock *TargetBB = TargetToJumps.first;
+      if (TargetBB->getKnownExecutionCount() == 0)
+        continue;
+      const uint64_t IncomingCount = IncomingMap[TargetBB];
+      // If there is a noticeable gap between the incoming edge count and the BB
+      // execution count, then we don't want to trust the 0 execution count
+      // edges as actually 0 execution count.
+      if (IncomingCount * opts::ConservativeNeverTakenThreshold <
+          TargetBB->getKnownExecutionCount()) {
+        for (JumpT *Jump : TargetToJumps.second) {
+          Jump->IsLongNeverTaken = false;
+        }
+      }
+    }
+
+    for (auto &SourceToJumps : SourcesToJumps) {
+      const BinaryBasicBlock *SourceBB = SourceToJumps.first;
+      if (SourceBB->getKnownExecutionCount() == 0)
+        continue;
+      const uint64_t OutgoingCount = OutgoingMap[SourceBB];
+      // If there is a noticeable gap between the outgoing edge count and the BB
+      // execution count, then we don't want to trust the 0 execution count
+      // edges as actually 0 execution count.
+
+      if (OutgoingCount * opts::ConservativeNeverTakenThreshold <
+          SourceBB->getKnownExecutionCount()) {
+        for (JumpT *Jump : SourceToJumps.second) {
+          Jump->IsLongNeverTaken = false;
+        }
+      }
+    }
+  }
+}
+
+uint64_t makeRedirectionDecisions(BlockToJumpsMapTy &TargetsToJumps) {
+  uint64_t NumRedirected = 0;
+  for (auto &TargetToJumps : TargetsToJumps) {
+    std::vector<JumpT *> &Jumps = TargetToJumps.second;
+    if (Jumps.size() <= 1)
+      continue;
+    std::unordered_map<unsigned, JumpT *> MostRecentCondJumps;
+    JumpT *MostRecentUncondJump = nullptr;
+
+    // Round 1: redirect jumps to the closest candidate to its right.
+    for (auto JumpItr = Jumps.rbegin(); JumpItr != Jumps.rend(); ++JumpItr) {
+      JumpT *CurrJump = *JumpItr;
+      if (CurrJump->IsLongNeverTaken) {
+        // Check if we can redirect CurrJump to MostRecentUncondJump.
+        if (MostRecentUncondJump != nullptr) {
+          if (!isLongJump(CurrJump->OriginalAddress + ShortJumpSize,
+                          MostRecentUncondJump->OriginalAddress, true)) {
+            // Redirect CurrJump to MostRecentUncondJump if the latter is close
+            // enough.
+            CurrJump->RedirectionTarget = MostRecentUncondJump;
+            MostRecentUncondJump->IsRedirectionTarget = true;
+            NumRedirected++;
+          } else if (!CurrJump->IsUnconditional) {
+            // Otherwise, try to redirect CurrJump to the most recent
+            // conditional jump with the same conditional code.
+            JumpT *MostRecentCondJump = MostRecentCondJumps[CurrJump->CC];
+            if (MostRecentCondJump != nullptr &&
+                !isLongJump(CurrJump->OriginalAddress + ShortJumpSize,
+                            MostRecentCondJump->OriginalAddress, true)) {
+              CurrJump->RedirectionTarget = MostRecentCondJump;
+              MostRecentCondJump->IsRedirectionTarget = true;
+              NumRedirected++;
+            }
+          }
+        } else if (!CurrJump->IsUnconditional) {
+          // If MostRecentUncondJump does not exist and CurrJump is conditional,
+          // try to redirect CurrJump to the most recent conditional jump with
+          // the same conditional code
+          JumpT *MostRecentCondJump = MostRecentCondJumps[CurrJump->CC];
+          if (MostRecentCondJump != nullptr &&
+              !isLongJump(CurrJump->OriginalAddress + ShortJumpSize,
+                          MostRecentCondJump->OriginalAddress, true)) {
+            CurrJump->RedirectionTarget = MostRecentCondJump;
+            MostRecentCondJump->IsRedirectionTarget = true;
+            NumRedirected++;
+          }
+        }
+      }
+
+      // Update most recent jump by condition.
+      if (CurrJump->IsUnconditional)
+        MostRecentUncondJump = CurrJump;
+      else
+        MostRecentCondJumps[CurrJump->CC] = CurrJump;
+    }
+
+    // Round 2: redirect jumps to the closest candidate to its left while
+    // making shre there are no cyclic redirections.
+    MostRecentCondJumps.clear();
+    MostRecentUncondJump = nullptr;
+    for (auto JumpItr = Jumps.begin(); JumpItr != Jumps.end(); ++JumpItr) {
+      JumpT *CurrJump = *JumpItr;
+      if (CurrJump->IsLongNeverTaken) {
+        if (CurrJump->RedirectionTarget == nullptr) {
+          // Check if we can redirect CurrJump to MostRecentUncondJump.
+          if (MostRecentUncondJump != nullptr) {
+            if (!isLongJump(CurrJump->OriginalAddress + ShortJumpSize,
+                            MostRecentUncondJump->OriginalAddress, true)) {
+              // Redirect CurrJump to MostRecentUncondJump if the latter is
+              // close enough.
+              CurrJump->RedirectionTarget = MostRecentUncondJump;
+              MostRecentUncondJump->IsRedirectionTarget = true;
+              NumRedirected++;
+            } else if (!CurrJump->IsUnconditional) {
+              // Otherwise, try to redirect CurrJump to the most recent
+              // conditional jump with the same conditional code.
+              JumpT *MostRecentCondJump = MostRecentCondJumps[CurrJump->CC];
+              if (MostRecentCondJump != nullptr &&
+                  !isLongJump(CurrJump->OriginalAddress + ShortJumpSize,
+                              MostRecentCondJump->OriginalAddress, true)) {
+                CurrJump->RedirectionTarget = MostRecentCondJump;
+                MostRecentCondJump->IsRedirectionTarget = true;
+                NumRedirected++;
+              }
+            }
+          } else if (!CurrJump->IsUnconditional) {
+            // If MostRecentUncondJump does not exist and CurrJump is
+            // conditional, try to redirect CurrJump to the most recent
+            // conditional jump with the same conditional code
+            JumpT *MostRecentCondJump = MostRecentCondJumps[CurrJump->CC];
+            if (MostRecentCondJump != nullptr &&
+                !isLongJump(CurrJump->OriginalAddress + ShortJumpSize,
+                            MostRecentCondJump->OriginalAddress, true)) {
+              CurrJump->RedirectionTarget = MostRecentCondJump;
+              MostRecentCondJump->IsRedirectionTarget = true;
+              NumRedirected++;
+            }
+          }
+        } else {
+          // If CurrJump has already been redirected in round 1, then use
+          // continue to avoid updating MostRecentUncondJump or
+          // MostRecentCondJumps with CurrJump. This will disallow redirection
+          // to jumps that were redirected in round 1 and hence avoid cyclic
+          // redirections.
+          continue;
+        }
+      }
+
+      // Update most recent jump by condition.
+      if (CurrJump->IsUnconditional)
+        MostRecentUncondJump = CurrJump;
+      else
+        MostRecentCondJumps[CurrJump->CC] = CurrJump;
+    }
+  }
+  return NumRedirected;
+}
+
+void checkDecisionCorrectness(Jumps &JumpsInFunction) {
+  // Check correctness of redirection decisions.
+  for (const auto &Jump : JumpsInFunction) {
+    if (Jump->RedirectionTarget != nullptr) {
+      JumpT *CurrJump = Jump.get();
+      JumpT *NextJump = CurrJump->RedirectionTarget;
+      while (NextJump != nullptr) {
+        // No cyclic redirections.
+        assert(NextJump != Jump.get());
+        // Redirect either to unconditional jump or jump with the same
+        // conditional code.
+        assert(NextJump->CC == CurrJump->CC || NextJump->IsUnconditional);
+        CurrJump = NextJump;
+        NextJump = CurrJump->RedirectionTarget;
+      }
+      // Jump will eventually reach its original target.
+      assert(CurrJump->OriginalTargetBB == Jump->OriginalTargetBB);
+    }
+  }
+}
+
+void redirectJumps(BinaryFunction &Function, Jumps &JumpsInFunction) {
+  const BinaryContext &BC = Function.getBinaryContext();
+  // Helper function to split HomeBB at the JumpInst and return the new
+  // basic block that has JumpInst as its first instruction.
+  auto createJumpBlock = [&](JumpT *Jump) {
+    BinaryBasicBlock *HomeBB = Jump->HomeBB;
+    MCInst *Inst = Jump->Inst;
+
+    // Obtain iterator II pointing to Inst.
+    auto II = HomeBB->end();
+    while (&*II != Inst)
+      II--;
+    return HomeBB->splitInPlaceAt(II);
+  };
+
+  // Split basic blocks at jump instructions that are redirection targets.
+  for (auto JumpItr = JumpsInFunction.rbegin();
+       JumpItr != JumpsInFunction.rend(); ++JumpItr) {
+    JumpT *Jump = (*JumpItr).get();
+    if (!Jump->IsRedirectionTarget)
+      continue;
+    BinaryBasicBlock *NewBB = createJumpBlock(Jump);
+    Jump->HomeBB = NewBB;
+    // If the new block contains two instructions, then it means NewBB
+    // contains both a conditional jump (Jump) and an unconditional
+    // jump (Jump->UncondJumpInSameBlock). We also need to update
+    // the HomeBB of the latter.
+    if (NewBB->getNumNonPseudos() == 2) {
+      assert(Jump->UncondJumpInSameBlock != nullptr);
+      Jump->UncondJumpInSameBlock->HomeBB = NewBB;
+    }
+  }
+
+  // Check correctness of splitting.
+  for (const auto &Jump : JumpsInFunction) {
+    if (Jump->IsRedirectionTarget) {
+      MCInst FirstInst = *(Jump->HomeBB->begin());
+      assert(BC.MIB->getCondCode(FirstInst) == Jump->CC);
+      assert(BC.MIB->getTargetSymbol(FirstInst) ==
+             Jump->OriginalTargetBB->getLabel());
+    }
+  }
+
+  // Perform redirections.
+  for (const auto &Jump : JumpsInFunction) {
+    if (Jump->RedirectionTarget != nullptr) {
+      BinaryBasicBlock *HomeBB = Jump->HomeBB;
+      BinaryBasicBlock *OriginalTargetBB = Jump->OriginalTargetBB;
+      BinaryBasicBlock *NewTargetBB = Jump->RedirectionTarget->HomeBB;
+      HomeBB->replaceSuccessor(OriginalTargetBB, NewTargetBB, /*Count=*/0,
+                               /*MispredictedCount=*/0);
+    }
+  }
+}
+} // namespace
+
+void RedirectNeverTakenJumps::performRedirections(BinaryFunction &Function) {
+  BinaryContext &BC = Function.getBinaryContext();
+
+  // Populate BinaryBasicBlock::OutputAddressRange.
+  uint64_t OldHotSize = 0;
+  uint64_t OldColdSize = 0;
+  std::tie(OldHotSize, OldColdSize) =
+      BC.calculateEmittedSize(Function, /*FixBranches=*/true);
+
+  // Perform redirections.
+  Jumps JumpsInFunction;
+  uint64_t NumJumpsToRedirect = 0;
+  for (FunctionFragment &FF : Function.getLayout().fragments()) {
+    JumpPtrs JumpsInFragment;
+    BlockToJumpsMapTy TargetsToJumps;
+    createJumps(Function, FF, JumpsInFunction, JumpsInFragment);
+    identifyCandidates(Function, JumpsInFragment, TargetsToJumps);
+    NumJumpsToRedirect += makeRedirectionDecisions(TargetsToJumps);
+  }
+  if (NumJumpsToRedirect == 0)
+    return;
+
+  checkDecisionCorrectness(JumpsInFunction);
+  redirectJumps(Function, JumpsInFunction);
+
+  // Log size reduction.
+  const auto [NewHotSize, NewColdSize] =
+      BC.calculateEmittedSize(Function, /*FixBranches*/ true);
+
+  assert(NewHotSize <= OldHotSize);
+  assert(NewColdSize <= OldColdSize);
+
+  TotalSizeSavings += OldHotSize - NewHotSize + OldColdSize - NewColdSize;
+  if (Function.hasValidIndex())
+    TotalHotSizeSavings += OldHotSize - NewHotSize;
+  return;
+}
+
+Error RedirectNeverTakenJumps::runOnFunctions(BinaryContext &BC) {
+  if (!BC.isX86())
+    return Error::success();
+
+  if (opts::RedirectNeverTakenJumps) {
+    ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
+      return !shouldOptimize(BF);
+    };
+
+    ParallelUtilities::runOnEachFunction(
+        BC, ParallelUtilities::SchedulingPolicy::SP_BB_LINEAR,
+        [&](BinaryFunction &BF) { performRedirections(BF); }, SkipFunc,
+        "RedirectNeverTakenJumps", /*ForceSequential*/ false);
+
+    BC.outs() << format(
+        "BOLT-INFO: redirection of never-taken jumps saved %zu bytes hot "
+        "section code size and %zu bytes total code size\n",
+        TotalHotSizeSavings.load(), TotalSizeSavings.load());
+  }
+
+  return Error::success();
+}
diff --git a/bolt/lib/Rewrite/BinaryPassManager.cpp b/bolt/lib/Rewrite/BinaryPassManager.cpp
index b0906041833484..93f0ef482997c8 100644
--- a/bolt/lib/Rewrite/BinaryPassManager.cpp
+++ b/bolt/lib/Rewrite/BinaryPassManager.cpp
@@ -27,6 +27,7 @@
 #include "bolt/Passes/MCF.h"
 #include "bolt/Passes/PLTCall.h"
 #include "bolt/Passes/PatchEntries.h"
+#include "bolt/Passes/RedirectNeverTakenJumps.h"
 #include "bolt/Passes/RegReAssign.h"
 #include "bolt/Passes/ReorderData.h"
 #include "bolt/Passes/ReorderFunctions.h"
@@ -146,6 +147,11 @@ static cl::opt<bool>
                    cl::desc("print functions after peephole optimization"),
                    cl::Hidden, cl::cat(BoltOptCategory));
 
+static cl::opt<bool> PrintRedirectNeverTaken(
+    "print-redirect-never-taken",
+    cl::desc("print functions after redirecting never taken jumps"), cl::Hidden,
+    cl::cat(BoltOptCategory));
+
 static cl::opt<bool>
     PrintPLT("print-plt", cl::desc("print functions after PLT optimization"),
              cl::Hidden, cl::cat(BoltOptCategory));
@@ -462,6 +468,9 @@ Error BinaryFunctionPassManager::runAllPasses(BinaryContext &BC) {
   // splitting.
   Manager.registerPass(std::make_unique<SplitFunctions>(PrintSplit));
 
+  Manager.registerPass(
+      std::make_unique<RedirectNeverTakenJumps>(PrintRedirectNeverTaken));
+
   // Print final dyno stats right while CFG and instruction analysis are intact.
   Manager.registerPass(std::make_unique<DynoStatsPrintPass>(
                            "after all optimizations before SCTC and FOP"),
diff --git a/bolt/test/X86/redirect-never-taken-jumps.s b/bolt/test/X86/redirect-never-taken-jumps.s
new file mode 100644
index 00000000000000..5701383cc1ac55
--- /dev/null
+++ b/bolt/test/X86/redirect-never-taken-jumps.s
@@ -0,0 +1,86 @@
+## Test redirect-never-taken-jumps
+
+# RUN: llvm-mc --filetype=obj --triple x86_64-unknown-unknown %s -o %t.o
+# RUN: link_fdata %s %t.o %t.fdata1 FDATA1
+# RUN: link_fdata %s %t.o %t.fdata2 FDATA2
+# RUN: llvm-strip --strip-unneeded %t.o
+# RUN: %clang %cflags %t.o -o %t.exe -Wl,-q
+
+# RUN: llvm-bolt %t.exe -o %t.bolt --reorder-blocks=none --split-functions=1 \
+# RUN:         --redirect-never-taken-jumps --print-redirect-never-taken --data=%t.fdata1 \
+# RUN:     2>&1 | FileCheck --check-prefix=CHECK_REGULAR %s
+
+# RUN: llvm-bolt %t.exe -o %t.bolt --reorder-blocks=none --split-functions=1 \
+# RUN:         --redirect-never-taken-jumps --data=%t.fdata2 \
+# RUN:     2>&1 | FileCheck --check-prefix=CHECK_CONSERVATIVE_DEFAULT %s
+
+# RUN: llvm-bolt %t.exe -o %t.bolt --reorder-blocks=none --split-functions=1 \
+# RUN:         --redirect-never-taken-jumps --conservative-never-taken-threshold=2.0 --data=%t.fdata2 \
+# RUN:     2>&1 | FileCheck --check-prefix=CHECK_CONSERVATIVE_THRESHOLD %s
+
+# RUN: llvm-bolt %t.exe -o %t.bolt --reorder-blocks=none --split-functions=1 \
+# RUN:         --redirect-never-taken-jumps --aggressive-never-taken --data=%t.fdata2 \
+# RUN:     2>&1 | FileCheck --check-prefix=CHECK_AGGRRESSIVE %s
+
+# CHECK_REGULAR: redirection of never-taken jumps saved 4 bytes hot section code size and 12 bytes total code size
+# CHECK_REGULAR: .LBB00 (1 instructions, align : 1)
+# CHECK_REGULAR: Successors: .LBB2 (mispreds: 0, count: 0), .Ltmp2 (mispreds: 0, count: 20)
+# CHECK_REGULAR: .Ltmp2 (1 instructions, align : 1)
+# CHECK_REGULAR: Successors: .LBB2 (mispreds: 0, count: 20)
+# CHECK_REGULAR: .LBB2 (1 instructions, align : 1)
+# CHECK_REGULAR: Successors: .Ltmp0 (mispreds: 0, count: 0), .LFT0 (mispreds: 0, count: 20)
+# CHECK_REGULAR: .Ltmp3 (1 instructions, align : 1)
+# CHECK_REGULAR: HOT-COLD SPLIT POINT
+# CHECK_REGULAR: .Ltmp0 (2 instructions, align : 1)
+# CHECK_REGULAR: Successors: .LBB1 (mispreds: 0, count: 0), .LBB00 (mispreds: 0, count: 0)
+# CHECk_REGULAR: .Ltmp4 (1 instructions, align : 1)
+# CHECK_REGULAR: Successors: .LBB1 (mispreds: 0, count: 0)
+# CHECk_REGULAR: .LBB1 (2 instructions, align : 1)
+# CHECk_REGULAR: Successors: .LBB0 (mispreds: 0, count: 0), .Ltmp2 (mispreds: 0, count: 0)
+# CHECk_REGULAR: .Ltmp1 (1 instructions, align : 1)
+# CHECK_REGULAR: Successors: .Ltmp4 (mispreds: 0, count: 0), .LBB0 (mispreds: 0, count: 0)
+# CHECk_REGULAR: .LBB0 (1 instructions, align : 1)
+# CHECK_REGULAR: Successors: .Ltmp3 (mispreds: 0, count: 0)
+
+# CHECK_CONSERVATIVE_DEFAULT: redirection of never-taken jumps saved 4 bytes hot section code size and 8 bytes total code size
+# CHECK_CONSERVATIVE_THRESHOLD: redirection of never-taken jumps saved 4 bytes hot section code size and 12 bytes total code size
+# CHECK_AGGRRESSIVE: redirection of never-taken jumps saved 4 bytes hot section code size and 12 bytes total code size
+
+
+        .globl  main
+        .type   main, @function
+main:
+LBB00:
+        ja Ltmp0
+# FDATA1: 1 main #LBB00# 1 main #Ltmp0# 0 0
+# FDATA1: 1 main #LBB00# 1 main #Ltmp2# 0 20
+# FDATA2: 1 main #LBB00# 1 main #Ltmp0# 0 0
+# FDATA2: 1 main #LBB00# 1 main #Ltmp2# 0 10
+Ltmp2:
+        cmpl $0,%eax
+Ltmp2Jcc:
+        ja Ltmp0
+# FDATA1: 1 main #Ltmp2Jcc# 1 main #Ltmp0# 0 0
+# FDATA1: 1 main #Ltmp2Jcc# 1 main #LFT0# 0 20
+# FDATA2: 1 main #Ltmp2Jcc# 1 main #Ltmp0# 0 0
+# FDATA2: 1 main #Ltmp2Jcc# 1 main #LFT0# 0 20
+LFT0:
+        ja Ltmp1
+# FDATA1: 1 main #LFT0# 1 main #Ltmp1# 0 0
+# FDATA1: 1 main #LFT0# 1 main #Ltmp3# 0 20
+# FDATA2: 1 main #LFT0# 1 main #Ltmp1# 0 0
+# FDATA2: 1 main #LFT0# 1 main #Ltmp3# 0 20
+Ltmp3:
+        ret
+Ltmp0:
+        jae Ltmp2
+        jmp LBB00
+Ltmp4:
+        cmpl $0,%eax
+        jae Ltmp2
+        jmp Ltmp3
+Ltmp1:
+        je Ltmp4
+        jmp Ltmp3
+.LLmain_end:
+        .size   main, .LLmain_end-main

>From 17f47b2d3cb61d935a0f83d5bc54dc98b36a231e Mon Sep 17 00:00:00 2001
From: Shatian Wang <shatian at meta.com>
Date: Mon, 28 Oct 2024 12:07:43 -0700
Subject: [PATCH 2/2] fixup! [BOLT][X86]Redirect never-taken jumps

---
 bolt/lib/Passes/RedirectNeverTakenJumps.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/bolt/lib/Passes/RedirectNeverTakenJumps.cpp b/bolt/lib/Passes/RedirectNeverTakenJumps.cpp
index 418f055beff306..65bdc6637bbca8 100644
--- a/bolt/lib/Passes/RedirectNeverTakenJumps.cpp
+++ b/bolt/lib/Passes/RedirectNeverTakenJumps.cpp
@@ -112,9 +112,9 @@ void createJumps(BinaryFunction &Function, FunctionFragment &Fragment,
     if (BI.Count != BinaryBasicBlock::COUNT_NO_PROFILE)
       ExecCount = BI.Count;
 
-    const uint64_t JumpEndAddr = TargetBB->getOutputStartAddress();
+    const uint64_t JumpEndAddr = TargetBB->getOutputAddressRange().first;
     const uint64_t JumpStartAddr =
-        SourceBB->getOutputEndAddress() - OffsetFromBlockEnd;
+        SourceBB->getOutputAddressRange().second - OffsetFromBlockEnd;
     const uint8_t LongJumpSize =
         IsUnconditional ? LongUncondJumpSize : LongCondJumpSize;
     const uint8_t JumpInstrSize =



More information about the llvm-commits mailing list