[llvm] [BOLT][X86] Redirect never-taken jumps (PR #113923)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Oct 28 12:08:33 PDT 2024
https://github.com/ShatianWang updated https://github.com/llvm/llvm-project/pull/113923
From 5e4642274f0360bbc0904feabace8c72c163d8b8 Mon Sep 17 00:00:00 2001
From: Shatian Wang <shatian at meta.com>
Date: Wed, 9 Oct 2024 14:45:50 -0700
Subject: [PATCH 1/2] [BOLT][X86] Redirect never-taken jumps
---
bolt/include/bolt/Core/BinaryBasicBlock.h | 14 +-
.../bolt/Passes/RedirectNeverTakenJumps.h | 58 ++
bolt/lib/Core/BinaryBasicBlock.cpp | 73 +++
bolt/lib/Passes/CMakeLists.txt | 1 +
bolt/lib/Passes/RedirectNeverTakenJumps.cpp | 503 ++++++++++++++++++
bolt/lib/Rewrite/BinaryPassManager.cpp | 9 +
bolt/test/X86/redirect-never-taken-jumps.s | 86 +++
7 files changed, 742 insertions(+), 2 deletions(-)
create mode 100644 bolt/include/bolt/Passes/RedirectNeverTakenJumps.h
create mode 100644 bolt/lib/Passes/RedirectNeverTakenJumps.cpp
create mode 100644 bolt/test/X86/redirect-never-taken-jumps.s
diff --git a/bolt/include/bolt/Core/BinaryBasicBlock.h b/bolt/include/bolt/Core/BinaryBasicBlock.h
index b4f31cf2bae6f6..df4c3f3a20f23f 100644
--- a/bolt/include/bolt/Core/BinaryBasicBlock.h
+++ b/bolt/include/bolt/Core/BinaryBasicBlock.h
@@ -789,13 +789,23 @@ class BinaryBasicBlock {
return SplitInst;
}
- /// Split basic block at the instruction pointed to by II.
+  /// Split basic block at the instruction pointed to by II, which must not
+  /// follow any branch instruction in the basic block.
/// All iterators pointing after II get invalidated.
///
/// Return the new basic block that starts with the instruction
- /// at the split point.
+ /// at the split point, which has been inserted at the end of the
+ /// current function.
BinaryBasicBlock *splitAt(iterator II);
+ /// Split basic block in place at the instruction pointed to by II.
+ /// All iterators pointing after II get invalidated.
+ ///
+ /// Return the new basic block that starts with the instruction
+ /// at the split point, which has been inserted right after the
+ /// current basic block in the current function.
+ BinaryBasicBlock *splitInPlaceAt(iterator II);
+
/// Set start offset of this basic block in the input binary.
void setOffset(uint32_t Offset) { InputRange.first = Offset; };
diff --git a/bolt/include/bolt/Passes/RedirectNeverTakenJumps.h b/bolt/include/bolt/Passes/RedirectNeverTakenJumps.h
new file mode 100644
index 00000000000000..d85eb5fb0fdf35
--- /dev/null
+++ b/bolt/include/bolt/Passes/RedirectNeverTakenJumps.h
@@ -0,0 +1,58 @@
+//===- bolt/Passes/RedirectNeverTakenJumps.h - Code size reduction --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass reduces code size on X86 by redirecting never-taken jumps that
+// take 5 or 6 bytes to nearby jumps with the same jump target and compatible
+// condition codes. Each such redirection saves 3 or 4 bytes, depending on
+// whether the redirected jump is unconditional or conditional, since a short
+// jump takes only 2 bytes. The pass can be turned on with the BOLT option
+// -redirect-never-taken-jumps.
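+//
+// For example (with illustrative labels), a never-taken 6-byte conditional
+// jump whose target is out of short-jump range can be redirected to a nearby
+// unconditional jump with the same target:
+//
+//   before redirection:             after redirection:
+//     jne .Ltarget   # 6 bytes        jne .Lnear     # 2 bytes
+//     ...                             ...
+//   .Lnear:                         .Lnear:
+//     jmp .Ltarget   # 5 bytes        jmp .Ltarget   # 5 bytes
+//
+// If the redirected jump were ever taken, control would still reach .Ltarget
+// through the jump at .Lnear, so behavior is preserved.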
+//
+// There are two modes for classifying "never-taken" jumps: aggressive and
+// conservative. The aggressive mode classifies any jump with zero execution
+// count as never-taken, and can be turned on with the BOLT option
+// -aggressive-never-taken. The conservative mode is used by default and
+// accounts for potential errors in the input profile. It infers whether a
+// jump with zero execution count is actually never-taken by checking the gap
+// between the inflow (resp. outflow) and the block execution count of each
+// basic block. The conservativeness is controlled by the BOLT option
+// -conservative-never-taken-threshold. The smaller the threshold, the more
+// conservative the classification is. In most realistic settings, the value
+// should exceed 1.0. The current default is 1.25.
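+//
+// For instance, with the default threshold of 1.25, consider a block with
+// execution count 100 whose known incoming edge counts sum to only 70: since
+// 70 * 1.25 < 100, the profile around this block is treated as unreliable,
+// and its zero-count incoming jumps are not classified as never-taken.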
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef BOLT_PASSES_REDIRECT_NEVER_TAKEN_JUMPS_H
+#define BOLT_PASSES_REDIRECT_NEVER_TAKEN_JUMPS_H
+
+#include "bolt/Passes/BinaryPasses.h"
+#include <atomic>
+
+namespace llvm {
+namespace bolt {
+
+class RedirectNeverTakenJumps : public BinaryFunctionPass {
+private:
+ std::atomic<uint64_t> TotalHotSizeSavings{0ull};
+ std::atomic<uint64_t> TotalSizeSavings{0ull};
+
+public:
+ explicit RedirectNeverTakenJumps(const cl::opt<bool> &PrintPass)
+ : BinaryFunctionPass(PrintPass) {}
+
+ const char *getName() const override { return "redirect-never-taken-jumps"; }
+
+ Error runOnFunctions(BinaryContext &BC) override;
+
+ void performRedirections(BinaryFunction &Function);
+};
+
+} // namespace bolt
+} // namespace llvm
+
+#endif
diff --git a/bolt/lib/Core/BinaryBasicBlock.cpp b/bolt/lib/Core/BinaryBasicBlock.cpp
index 2a2192b79bb4bf..4e87865f6a210f 100644
--- a/bolt/lib/Core/BinaryBasicBlock.cpp
+++ b/bolt/lib/Core/BinaryBasicBlock.cpp
@@ -572,5 +572,78 @@ BinaryBasicBlock *BinaryBasicBlock::splitAt(iterator II) {
return NewBlock;
}
+BinaryBasicBlock *BinaryBasicBlock::splitInPlaceAt(iterator II) {
+ assert(II != end() && "expected iterator pointing to instruction");
+ if (II == begin())
+ return this;
+ const BinaryContext &BC = Function->getBinaryContext();
+ std::vector<std::unique_ptr<BinaryBasicBlock>> ToAdd;
+ ToAdd.emplace_back(getFunction()->createBasicBlock());
+ BinaryBasicBlock *BBNew = ToAdd.back().get();
+ uint64_t BBNewExecCount = 0;
+
+  // Find successors of the current block that need to be moved.
+ BinaryBasicBlock *CondSuccessor = nullptr;
+ BinaryBasicBlock::BinaryBranchInfo CondSuccessorBI;
+ BinaryBasicBlock *UncondSuccessor = nullptr;
+ BinaryBasicBlock::BinaryBranchInfo UncondSuccessorBI;
+ auto I = end();
+ while (I != II) {
+ --I;
+ if (BC.MIB->isUnconditionalBranch(*I)) {
+ const MCSymbol *TargetSymbol = BC.MIB->getTargetSymbol(*I);
+ UncondSuccessor = getSuccessor(TargetSymbol, UncondSuccessorBI);
+ } else if (BC.MIB->isConditionalBranch(*I)) {
+ const MCSymbol *TargetSymbol = BC.MIB->getTargetSymbol(*I);
+ CondSuccessor = getSuccessor(TargetSymbol, CondSuccessorBI);
+ }
+ }
+
+ // Adjust successors of the current and the new blocks.
+ if (CondSuccessor != nullptr) {
+ BBNew->addSuccessor(CondSuccessor, CondSuccessorBI);
+ BBNewExecCount +=
+ CondSuccessorBI.Count != BinaryBasicBlock::COUNT_NO_PROFILE
+ ? CondSuccessorBI.Count
+ : 0;
+ removeSuccessor(CondSuccessor);
+ }
+ if (UncondSuccessor != nullptr) {
+ BBNew->addSuccessor(UncondSuccessor, UncondSuccessorBI);
+ BBNewExecCount +=
+ UncondSuccessorBI.Count != BinaryBasicBlock::COUNT_NO_PROFILE
+ ? UncondSuccessorBI.Count
+ : 0;
+ removeSuccessor(UncondSuccessor);
+ } else { // Fall through.
+ BinaryBasicBlock *NextBB =
+ Function->getLayout().getBasicBlockAfter(this, /*IgnoreSplits=*/false);
+ assert(NextBB);
+ if (getSuccessor(NextBB->getLabel())) {
+ const BinaryBasicBlock::BinaryBranchInfo &BI = getBranchInfo(*NextBB);
+ BBNew->addSuccessor(NextBB, BI);
+ BBNewExecCount +=
+ BI.Count != BinaryBasicBlock::COUNT_NO_PROFILE ? BI.Count : 0;
+ removeSuccessor(NextBB);
+ }
+ }
+ addSuccessor(BBNew, BBNewExecCount, 0);
+ BBNew->setExecutionCount(BBNewExecCount);
+
+ // Set correct CFI state for the new block.
+ BBNew->setCFIState(getCFIStateAtInstr(&*II));
+
+ // Move instructions over.
+ adjustNumPseudos(II, end(), -1);
+ BBNew->addInstructions(II, end());
+ Instructions.erase(II, end());
+
+ // Insert new block after the current block.
+ getFunction()->insertBasicBlocks(
+ this, std::move(ToAdd), /*UpdateLayout*/ true, /*UpdateCFIState*/ true,
+ /*RecomputeLandingPads*/ false);
+ return BBNew;
+}
+
} // namespace bolt
} // namespace llvm
diff --git a/bolt/lib/Passes/CMakeLists.txt b/bolt/lib/Passes/CMakeLists.txt
index 1c1273b3d2420d..1b64d3e1d0b9e9 100644
--- a/bolt/lib/Passes/CMakeLists.txt
+++ b/bolt/lib/Passes/CMakeLists.txt
@@ -27,6 +27,7 @@ add_llvm_library(LLVMBOLTPasses
PettisAndHansen.cpp
PLTCall.cpp
ContinuityStats.cpp
+ RedirectNeverTakenJumps.cpp
RegAnalysis.cpp
RegReAssign.cpp
ReorderAlgorithm.cpp
diff --git a/bolt/lib/Passes/RedirectNeverTakenJumps.cpp b/bolt/lib/Passes/RedirectNeverTakenJumps.cpp
new file mode 100644
index 00000000000000..418f055beff306
--- /dev/null
+++ b/bolt/lib/Passes/RedirectNeverTakenJumps.cpp
@@ -0,0 +1,503 @@
+//===- bolt/Passes/RedirectNeverTakenJumps.cpp - Code size reduction ------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements RedirectNeverTakenJumps class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "bolt/Passes/RedirectNeverTakenJumps.h"
+#include "bolt/Core/ParallelUtilities.h"
+
+using namespace llvm;
+using namespace bolt;
+
+namespace opts {
+extern cl::OptionCategory BoltOptCategory;
+
+static cl::opt<bool> RedirectNeverTakenJumps(
+ "redirect-never-taken-jumps",
+ cl::desc("Apply a heuristic to redirect never taken jumps in order to "
+ "reduce hot code size (X86 only)"),
+ cl::Hidden, cl::init(false), cl::cat(BoltOptCategory));
+
+static cl::opt<bool> AggressiveNeverTaken(
+ "aggressive-never-taken",
+ cl::desc("Classify all zero-execution-count jumps as never taken. This "
+ "option ignores the possibility of execution counts of hot jumps "
+ "being incorrectly set to 0 in the input profile"),
+ cl::ReallyHidden, cl::init(false), cl::cat(BoltOptCategory));
+
+static cl::opt<double> ConservativeNeverTakenThreshold(
+ "conservative-never-taken-threshold",
+ cl::desc(
+ "When aggressive-never-taken=0 (default), this value controls how "
+ "conservative the classification of never-taken jumps is. The smaller "
+ "the value the more conservative the classification. In most realistic "
+ "settings, the value should exceed 1.0. Default 1.25."),
+ cl::ZeroOrMore, cl::init(1.25), cl::ReallyHidden, cl::cat(BoltOptCategory));
+} // namespace opts
+
+namespace {
+/// A jump instruction in the binary.
+struct JumpT {
+ JumpT(const JumpT &) = delete;
+ JumpT(JumpT &&) = default;
+ JumpT &operator=(const JumpT &) = delete;
+ JumpT &operator=(JumpT &&) = default;
+
+ explicit JumpT(MCInst *Inst, unsigned CC, bool IsUnconditional,
+ BinaryBasicBlock *OriginalTargetBB, uint64_t ExecutionCount,
+ BinaryBasicBlock *HomeBB, uint64_t OriginalAddress,
+ uint64_t OriginalInstrSize)
+ : Inst(Inst), CC(CC), IsUnconditional(IsUnconditional),
+ OriginalTargetBB(OriginalTargetBB), ExecutionCount(ExecutionCount),
+ HomeBB(HomeBB), OriginalAddress(OriginalAddress),
+ OriginalInstrSize(OriginalInstrSize) {}
+
+ MCInst *Inst;
+ unsigned CC;
+ bool IsUnconditional;
+ BinaryBasicBlock *OriginalTargetBB;
+ uint64_t ExecutionCount;
+ BinaryBasicBlock *HomeBB;
+ uint64_t OriginalAddress{0};
+ uint8_t OriginalInstrSize{0};
+
+ bool IsLongNeverTaken{false};
+ bool IsRedirectionTarget{false};
+ JumpT *RedirectionTarget{nullptr};
+ JumpT *UncondJumpInSameBlock{nullptr};
+};
+
+using Jumps = std::vector<std::unique_ptr<JumpT>>;
+using JumpPtrs = std::vector<JumpT *>;
+using FlowMapTy = std::unordered_map<const BinaryBasicBlock *, uint64_t>;
+using BlockToJumpsMapTy =
+ std::unordered_map<BinaryBasicBlock *, std::vector<JumpT *>>;
+
+/// Sizes of jump instructions in bytes on X86.
+static constexpr uint8_t ShortJumpSize = 2;
+static constexpr uint8_t LongUncondJumpSize = 5;
+static constexpr uint8_t LongCondJumpSize = 6;
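+// (For reference, on X86-64 these correspond to the short forms EB rel8 and
+// Jcc rel8 at 2 bytes, and the near forms E9 rel32 at 5 bytes and
+// 0F 8x rel32 at 6 bytes.)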
+
+/// The longest distance for any short jump on X86.
+static constexpr uint8_t ShortJumpBits = 8;
+static constexpr uint8_t ShortestJumpSpan = 1ULL << (ShortJumpBits - 1);
+
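+/// Return true if the jump cannot be encoded as a 2-byte short jump. A short
+/// jump's signed 8-bit displacement is relative to the address right after
+/// the jump instruction, so with JumpStartAddr being that address, the
+/// reachable targets are [JumpStartAddr - 128, JumpStartAddr + 127]. Jumps
+/// across fragments are always long.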
+bool isLongJump(const uint64_t JumpStartAddr, const uint64_t JumpEndAddr,
+ const bool SameFragment) {
+ if (!SameFragment)
+ return true;
+ if (JumpEndAddr > JumpStartAddr)
+ return JumpEndAddr - JumpStartAddr > ShortestJumpSpan - 1;
+ else
+ return JumpStartAddr - JumpEndAddr > ShortestJumpSpan;
+}
+
+void createJumps(BinaryFunction &Function, FunctionFragment &Fragment,
+ Jumps &JumpsInFunction, JumpPtrs &JumpsInFragment) {
+ const BinaryContext &BC = Function.getBinaryContext();
+
+ auto createJump = [&](MCInst *Branch, bool IsUnconditional,
+ BinaryBasicBlock *SourceBB, BinaryBasicBlock *TargetBB,
+ const uint8_t OffsetFromBlockEnd) {
+ const BinaryBasicBlock::BinaryBranchInfo &BI =
+ SourceBB->getBranchInfo(*TargetBB);
+ uint64_t ExecCount = 0;
+ if (BI.Count != BinaryBasicBlock::COUNT_NO_PROFILE)
+ ExecCount = BI.Count;
+
+ const uint64_t JumpEndAddr = TargetBB->getOutputStartAddress();
+ const uint64_t JumpStartAddr =
+ SourceBB->getOutputEndAddress() - OffsetFromBlockEnd;
+ const uint8_t LongJumpSize =
+ IsUnconditional ? LongUncondJumpSize : LongCondJumpSize;
+ const uint8_t JumpInstrSize =
+ isLongJump(JumpStartAddr, JumpEndAddr,
+ SourceBB->getFragmentNum() == TargetBB->getFragmentNum())
+ ? LongJumpSize
+ : ShortJumpSize;
+ return std::unique_ptr<JumpT>(new JumpT(
+ Branch, BC.MIB->getCondCode(*Branch), IsUnconditional, TargetBB,
+ ExecCount, SourceBB, JumpStartAddr - JumpInstrSize, JumpInstrSize));
+ };
+
+ for (BinaryBasicBlock *BB : Fragment) {
+ const MCSymbol *TBB = nullptr;
+ const MCSymbol *FBB = nullptr;
+ MCInst *CondBranch = nullptr;
+ MCInst *UncondBranch = nullptr;
+ BinaryBasicBlock *CondSuccessor = nullptr;
+ BinaryBasicBlock *UncondSuccessor = nullptr;
+
+ if (BB->analyzeBranch(TBB, FBB, CondBranch, UncondBranch)) {
+ if (BB->succ_size() == 1) {
+ UncondSuccessor = BB->getSuccessor();
+ if (UncondBranch != nullptr) {
+ std::unique_ptr<JumpT> Jump =
+ createJump(UncondBranch, true, BB, UncondSuccessor, 0);
+ JumpsInFragment.push_back(Jump.get());
+ JumpsInFunction.push_back(std::move(Jump));
+ }
+ } else if (BB->succ_size() == 2) {
+ assert(CondBranch != nullptr);
+ CondSuccessor = BB->getConditionalSuccessor(true);
+ UncondSuccessor = BB->getConditionalSuccessor(false);
+ std::unique_ptr<JumpT> UncondJump = nullptr;
+ std::unique_ptr<JumpT> CondJump = nullptr;
+ uint8_t UncondJumpInstrSize = 0;
+ if (UncondBranch != nullptr) {
+ UncondJump = createJump(UncondBranch, true, BB, UncondSuccessor, 0);
+ UncondJumpInstrSize = UncondJump->OriginalInstrSize;
+ }
+ if (!BC.MIB->isDynamicBranch(*CondBranch)) {
+ CondJump = createJump(CondBranch, false, BB, CondSuccessor,
+ UncondJumpInstrSize);
+ if (UncondJump != nullptr)
+ CondJump->UncondJumpInSameBlock = UncondJump.get();
+ }
+ if (CondJump != nullptr) {
+ JumpsInFragment.push_back(CondJump.get());
+ JumpsInFunction.push_back(std::move(CondJump));
+ }
+ if (UncondJump != nullptr) {
+ JumpsInFragment.push_back(UncondJump.get());
+ JumpsInFunction.push_back(std::move(UncondJump));
+ }
+ }
+ }
+ }
+}
+
+void identifyCandidates(BinaryFunction &Function, JumpPtrs &JumpsInFragment,
+ BlockToJumpsMapTy &TargetsToJumps) {
+ // Identify jumps that are long and never taken.
+  // First check if each jump is long and has zero execution count.
+  auto isLongZeroCount = [&](const JumpT &Jump) {
+    return Jump.ExecutionCount == 0 && Jump.OriginalInstrSize > ShortJumpSize;
+  };
+
+ BlockToJumpsMapTy SourcesToJumps;
+ for (JumpT *Jump : JumpsInFragment) {
+ Jump->IsLongNeverTaken = isLongZeroCount(*Jump);
+ assert(Jump->OriginalTargetBB != nullptr);
+ TargetsToJumps[Jump->OriginalTargetBB].push_back(Jump);
+ SourcesToJumps[Jump->HomeBB].push_back(Jump);
+ }
+
+  // Next identify zero-execution-count jumps that are unlikely to actually be
+  // never-taken by comparing the inflow (resp. outflow) of each basic block
+  // with its execution count.
+ FlowMapTy IncomingMap;
+ FlowMapTy OutgoingMap;
+ for (const BinaryBasicBlock &BB : Function) {
+ auto SuccBIIter = BB.branch_info_begin();
+ for (BinaryBasicBlock *Succ : BB.successors()) {
+ const uint64_t Count = SuccBIIter->Count;
+ if (Count == BinaryBasicBlock::COUNT_NO_PROFILE || Count == 0) {
+ ++SuccBIIter;
+ continue;
+ }
+ IncomingMap[Succ] += Count;
+ OutgoingMap[&BB] += Count;
+ ++SuccBIIter;
+ }
+ }
+
+ if (!opts::AggressiveNeverTaken) {
+ for (auto &TargetToJumps : TargetsToJumps) {
+ const BinaryBasicBlock *TargetBB = TargetToJumps.first;
+ if (TargetBB->getKnownExecutionCount() == 0)
+ continue;
+ const uint64_t IncomingCount = IncomingMap[TargetBB];
+      // If there is a noticeable gap between the incoming edge count and the
+      // BB execution count, then we don't trust that the zero-count edges are
+      // actually never taken.
+ if (IncomingCount * opts::ConservativeNeverTakenThreshold <
+ TargetBB->getKnownExecutionCount()) {
+ for (JumpT *Jump : TargetToJumps.second) {
+ Jump->IsLongNeverTaken = false;
+ }
+ }
+ }
+
+ for (auto &SourceToJumps : SourcesToJumps) {
+ const BinaryBasicBlock *SourceBB = SourceToJumps.first;
+ if (SourceBB->getKnownExecutionCount() == 0)
+ continue;
+ const uint64_t OutgoingCount = OutgoingMap[SourceBB];
+      // If there is a noticeable gap between the outgoing edge count and the
+      // BB execution count, then we don't trust that the zero-count edges are
+      // actually never taken.
+ if (OutgoingCount * opts::ConservativeNeverTakenThreshold <
+ SourceBB->getKnownExecutionCount()) {
+ for (JumpT *Jump : SourceToJumps.second) {
+ Jump->IsLongNeverTaken = false;
+ }
+ }
+ }
+ }
+}
+
+uint64_t makeRedirectionDecisions(BlockToJumpsMapTy &TargetsToJumps) {
+ uint64_t NumRedirected = 0;
+ for (auto &TargetToJumps : TargetsToJumps) {
+ std::vector<JumpT *> &Jumps = TargetToJumps.second;
+ if (Jumps.size() <= 1)
+ continue;
+ std::unordered_map<unsigned, JumpT *> MostRecentCondJumps;
+ JumpT *MostRecentUncondJump = nullptr;
+
+    // Round 1: redirect each jump to the closest candidate to its right.
+ for (auto JumpItr = Jumps.rbegin(); JumpItr != Jumps.rend(); ++JumpItr) {
+ JumpT *CurrJump = *JumpItr;
+ if (CurrJump->IsLongNeverTaken) {
+ // Check if we can redirect CurrJump to MostRecentUncondJump.
+ if (MostRecentUncondJump != nullptr) {
+ if (!isLongJump(CurrJump->OriginalAddress + ShortJumpSize,
+ MostRecentUncondJump->OriginalAddress, true)) {
+ // Redirect CurrJump to MostRecentUncondJump if the latter is close
+ // enough.
+ CurrJump->RedirectionTarget = MostRecentUncondJump;
+ MostRecentUncondJump->IsRedirectionTarget = true;
+ NumRedirected++;
+ } else if (!CurrJump->IsUnconditional) {
+ // Otherwise, try to redirect CurrJump to the most recent
+ // conditional jump with the same conditional code.
+ JumpT *MostRecentCondJump = MostRecentCondJumps[CurrJump->CC];
+ if (MostRecentCondJump != nullptr &&
+ !isLongJump(CurrJump->OriginalAddress + ShortJumpSize,
+ MostRecentCondJump->OriginalAddress, true)) {
+ CurrJump->RedirectionTarget = MostRecentCondJump;
+ MostRecentCondJump->IsRedirectionTarget = true;
+ NumRedirected++;
+ }
+ }
+ } else if (!CurrJump->IsUnconditional) {
+        // If MostRecentUncondJump does not exist and CurrJump is conditional,
+        // try to redirect CurrJump to the most recent conditional jump with
+        // the same condition code.
+ JumpT *MostRecentCondJump = MostRecentCondJumps[CurrJump->CC];
+ if (MostRecentCondJump != nullptr &&
+ !isLongJump(CurrJump->OriginalAddress + ShortJumpSize,
+ MostRecentCondJump->OriginalAddress, true)) {
+ CurrJump->RedirectionTarget = MostRecentCondJump;
+ MostRecentCondJump->IsRedirectionTarget = true;
+ NumRedirected++;
+ }
+ }
+ }
+
+ // Update most recent jump by condition.
+ if (CurrJump->IsUnconditional)
+ MostRecentUncondJump = CurrJump;
+ else
+ MostRecentCondJumps[CurrJump->CC] = CurrJump;
+ }
+
+    // Round 2: redirect each remaining jump to the closest candidate to its
+    // left while making sure there are no cyclic redirections.
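+    // For example, if never-taken jumps A and B (A to the left of B) share a
+    // target, round 1 may redirect A to B; skipping already redirected jumps
+    // below keeps round 2 from redirecting B back to A, which would form a
+    // loop that never reaches the original target.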
+ MostRecentCondJumps.clear();
+ MostRecentUncondJump = nullptr;
+ for (auto JumpItr = Jumps.begin(); JumpItr != Jumps.end(); ++JumpItr) {
+ JumpT *CurrJump = *JumpItr;
+ if (CurrJump->IsLongNeverTaken) {
+ if (CurrJump->RedirectionTarget == nullptr) {
+ // Check if we can redirect CurrJump to MostRecentUncondJump.
+ if (MostRecentUncondJump != nullptr) {
+ if (!isLongJump(CurrJump->OriginalAddress + ShortJumpSize,
+ MostRecentUncondJump->OriginalAddress, true)) {
+ // Redirect CurrJump to MostRecentUncondJump if the latter is
+ // close enough.
+ CurrJump->RedirectionTarget = MostRecentUncondJump;
+ MostRecentUncondJump->IsRedirectionTarget = true;
+ NumRedirected++;
+ } else if (!CurrJump->IsUnconditional) {
+            // Otherwise, try to redirect CurrJump to the most recent
+            // conditional jump with the same condition code.
+ JumpT *MostRecentCondJump = MostRecentCondJumps[CurrJump->CC];
+ if (MostRecentCondJump != nullptr &&
+ !isLongJump(CurrJump->OriginalAddress + ShortJumpSize,
+ MostRecentCondJump->OriginalAddress, true)) {
+ CurrJump->RedirectionTarget = MostRecentCondJump;
+ MostRecentCondJump->IsRedirectionTarget = true;
+ NumRedirected++;
+ }
+ }
+ } else if (!CurrJump->IsUnconditional) {
+          // If MostRecentUncondJump does not exist and CurrJump is
+          // conditional, try to redirect CurrJump to the most recent
+          // conditional jump with the same condition code.
+ JumpT *MostRecentCondJump = MostRecentCondJumps[CurrJump->CC];
+ if (MostRecentCondJump != nullptr &&
+ !isLongJump(CurrJump->OriginalAddress + ShortJumpSize,
+ MostRecentCondJump->OriginalAddress, true)) {
+ CurrJump->RedirectionTarget = MostRecentCondJump;
+ MostRecentCondJump->IsRedirectionTarget = true;
+ NumRedirected++;
+ }
+ }
+ } else {
+        // If CurrJump has already been redirected in round 1, skip it so that
+        // MostRecentUncondJump and MostRecentCondJumps are not updated with
+        // CurrJump. This disallows redirecting to jumps that were themselves
+        // redirected in round 1 and hence avoids cyclic redirections.
+ continue;
+ }
+ }
+
+ // Update most recent jump by condition.
+ if (CurrJump->IsUnconditional)
+ MostRecentUncondJump = CurrJump;
+ else
+ MostRecentCondJumps[CurrJump->CC] = CurrJump;
+ }
+ }
+ return NumRedirected;
+}
+
+void checkDecisionCorrectness(Jumps &JumpsInFunction) {
+ // Check correctness of redirection decisions.
+ for (const auto &Jump : JumpsInFunction) {
+ if (Jump->RedirectionTarget != nullptr) {
+ JumpT *CurrJump = Jump.get();
+ JumpT *NextJump = CurrJump->RedirectionTarget;
+ while (NextJump != nullptr) {
+ // No cyclic redirections.
+ assert(NextJump != Jump.get());
+ // Redirect either to unconditional jump or jump with the same
+ // conditional code.
+ assert(NextJump->CC == CurrJump->CC || NextJump->IsUnconditional);
+ CurrJump = NextJump;
+ NextJump = CurrJump->RedirectionTarget;
+ }
+ // Jump will eventually reach its original target.
+ assert(CurrJump->OriginalTargetBB == Jump->OriginalTargetBB);
+ }
+ }
+}
+
+void redirectJumps(BinaryFunction &Function, Jumps &JumpsInFunction) {
+ const BinaryContext &BC = Function.getBinaryContext();
+  // Helper function to split Jump->HomeBB at Jump->Inst and return the new
+  // basic block that has the jump instruction as its first instruction.
+ auto createJumpBlock = [&](JumpT *Jump) {
+ BinaryBasicBlock *HomeBB = Jump->HomeBB;
+ MCInst *Inst = Jump->Inst;
+
+    // Obtain iterator II pointing to Inst by scanning backward from the last
+    // instruction (Inst is guaranteed to be in HomeBB, so the scan never
+    // walks past begin()).
+    auto II = std::prev(HomeBB->end());
+    while (&*II != Inst)
+      --II;
+ return HomeBB->splitInPlaceAt(II);
+ };
+
+ // Split basic blocks at jump instructions that are redirection targets.
+ for (auto JumpItr = JumpsInFunction.rbegin();
+ JumpItr != JumpsInFunction.rend(); ++JumpItr) {
+ JumpT *Jump = (*JumpItr).get();
+ if (!Jump->IsRedirectionTarget)
+ continue;
+ BinaryBasicBlock *NewBB = createJumpBlock(Jump);
+ Jump->HomeBB = NewBB;
+    // If NewBB contains two instructions, it holds both a conditional jump
+    // (Jump) and an unconditional jump (Jump->UncondJumpInSameBlock); the
+    // HomeBB of the latter needs updating as well.
+ if (NewBB->getNumNonPseudos() == 2) {
+ assert(Jump->UncondJumpInSameBlock != nullptr);
+ Jump->UncondJumpInSameBlock->HomeBB = NewBB;
+ }
+ }
+
+ // Check correctness of splitting.
+ for (const auto &Jump : JumpsInFunction) {
+ if (Jump->IsRedirectionTarget) {
+ MCInst FirstInst = *(Jump->HomeBB->begin());
+ assert(BC.MIB->getCondCode(FirstInst) == Jump->CC);
+ assert(BC.MIB->getTargetSymbol(FirstInst) ==
+ Jump->OriginalTargetBB->getLabel());
+ }
+ }
+
+ // Perform redirections.
+ for (const auto &Jump : JumpsInFunction) {
+ if (Jump->RedirectionTarget != nullptr) {
+ BinaryBasicBlock *HomeBB = Jump->HomeBB;
+ BinaryBasicBlock *OriginalTargetBB = Jump->OriginalTargetBB;
+ BinaryBasicBlock *NewTargetBB = Jump->RedirectionTarget->HomeBB;
+ HomeBB->replaceSuccessor(OriginalTargetBB, NewTargetBB, /*Count=*/0,
+ /*MispredictedCount=*/0);
+ }
+ }
+}
+} // namespace
+
+void RedirectNeverTakenJumps::performRedirections(BinaryFunction &Function) {
+ BinaryContext &BC = Function.getBinaryContext();
+
+ // Populate BinaryBasicBlock::OutputAddressRange.
+ uint64_t OldHotSize = 0;
+ uint64_t OldColdSize = 0;
+ std::tie(OldHotSize, OldColdSize) =
+ BC.calculateEmittedSize(Function, /*FixBranches=*/true);
+
+ // Perform redirections.
+ Jumps JumpsInFunction;
+ uint64_t NumJumpsToRedirect = 0;
+ for (FunctionFragment &FF : Function.getLayout().fragments()) {
+ JumpPtrs JumpsInFragment;
+ BlockToJumpsMapTy TargetsToJumps;
+ createJumps(Function, FF, JumpsInFunction, JumpsInFragment);
+ identifyCandidates(Function, JumpsInFragment, TargetsToJumps);
+ NumJumpsToRedirect += makeRedirectionDecisions(TargetsToJumps);
+ }
+ if (NumJumpsToRedirect == 0)
+ return;
+
+ checkDecisionCorrectness(JumpsInFunction);
+ redirectJumps(Function, JumpsInFunction);
+
+ // Log size reduction.
+ const auto [NewHotSize, NewColdSize] =
+ BC.calculateEmittedSize(Function, /*FixBranches*/ true);
+
+ assert(NewHotSize <= OldHotSize);
+ assert(NewColdSize <= OldColdSize);
+
+ TotalSizeSavings += OldHotSize - NewHotSize + OldColdSize - NewColdSize;
+ if (Function.hasValidIndex())
+ TotalHotSizeSavings += OldHotSize - NewHotSize;
+}
+
+Error RedirectNeverTakenJumps::runOnFunctions(BinaryContext &BC) {
+ if (!BC.isX86())
+ return Error::success();
+
+ if (opts::RedirectNeverTakenJumps) {
+ ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
+ return !shouldOptimize(BF);
+ };
+
+ ParallelUtilities::runOnEachFunction(
+ BC, ParallelUtilities::SchedulingPolicy::SP_BB_LINEAR,
+ [&](BinaryFunction &BF) { performRedirections(BF); }, SkipFunc,
+ "RedirectNeverTakenJumps", /*ForceSequential*/ false);
+
+ BC.outs() << format(
+ "BOLT-INFO: redirection of never-taken jumps saved %zu bytes hot "
+ "section code size and %zu bytes total code size\n",
+ TotalHotSizeSavings.load(), TotalSizeSavings.load());
+ }
+
+ return Error::success();
+}
diff --git a/bolt/lib/Rewrite/BinaryPassManager.cpp b/bolt/lib/Rewrite/BinaryPassManager.cpp
index b0906041833484..93f0ef482997c8 100644
--- a/bolt/lib/Rewrite/BinaryPassManager.cpp
+++ b/bolt/lib/Rewrite/BinaryPassManager.cpp
@@ -27,6 +27,7 @@
#include "bolt/Passes/MCF.h"
#include "bolt/Passes/PLTCall.h"
#include "bolt/Passes/PatchEntries.h"
+#include "bolt/Passes/RedirectNeverTakenJumps.h"
#include "bolt/Passes/RegReAssign.h"
#include "bolt/Passes/ReorderData.h"
#include "bolt/Passes/ReorderFunctions.h"
@@ -146,6 +147,11 @@ static cl::opt<bool>
cl::desc("print functions after peephole optimization"),
cl::Hidden, cl::cat(BoltOptCategory));
+static cl::opt<bool> PrintRedirectNeverTaken(
+ "print-redirect-never-taken",
+ cl::desc("print functions after redirecting never taken jumps"), cl::Hidden,
+ cl::cat(BoltOptCategory));
+
static cl::opt<bool>
PrintPLT("print-plt", cl::desc("print functions after PLT optimization"),
cl::Hidden, cl::cat(BoltOptCategory));
@@ -462,6 +468,9 @@ Error BinaryFunctionPassManager::runAllPasses(BinaryContext &BC) {
// splitting.
Manager.registerPass(std::make_unique<SplitFunctions>(PrintSplit));
+ Manager.registerPass(
+ std::make_unique<RedirectNeverTakenJumps>(PrintRedirectNeverTaken));
+
// Print final dyno stats right while CFG and instruction analysis are intact.
Manager.registerPass(std::make_unique<DynoStatsPrintPass>(
"after all optimizations before SCTC and FOP"),
diff --git a/bolt/test/X86/redirect-never-taken-jumps.s b/bolt/test/X86/redirect-never-taken-jumps.s
new file mode 100644
index 00000000000000..5701383cc1ac55
--- /dev/null
+++ b/bolt/test/X86/redirect-never-taken-jumps.s
@@ -0,0 +1,86 @@
+## Test redirect-never-taken-jumps
+
+# RUN: llvm-mc --filetype=obj --triple x86_64-unknown-unknown %s -o %t.o
+# RUN: link_fdata %s %t.o %t.fdata1 FDATA1
+# RUN: link_fdata %s %t.o %t.fdata2 FDATA2
+# RUN: llvm-strip --strip-unneeded %t.o
+# RUN: %clang %cflags %t.o -o %t.exe -Wl,-q
+
+# RUN: llvm-bolt %t.exe -o %t.bolt --reorder-blocks=none --split-functions=1 \
+# RUN: --redirect-never-taken-jumps --print-redirect-never-taken --data=%t.fdata1 \
+# RUN: 2>&1 | FileCheck --check-prefix=CHECK_REGULAR %s
+
+# RUN: llvm-bolt %t.exe -o %t.bolt --reorder-blocks=none --split-functions=1 \
+# RUN: --redirect-never-taken-jumps --data=%t.fdata2 \
+# RUN: 2>&1 | FileCheck --check-prefix=CHECK_CONSERVATIVE_DEFAULT %s
+
+# RUN: llvm-bolt %t.exe -o %t.bolt --reorder-blocks=none --split-functions=1 \
+# RUN: --redirect-never-taken-jumps --conservative-never-taken-threshold=2.0 --data=%t.fdata2 \
+# RUN: 2>&1 | FileCheck --check-prefix=CHECK_CONSERVATIVE_THRESHOLD %s
+
+# RUN: llvm-bolt %t.exe -o %t.bolt --reorder-blocks=none --split-functions=1 \
+# RUN: --redirect-never-taken-jumps --aggressive-never-taken --data=%t.fdata2 \
+# RUN:   2>&1 | FileCheck --check-prefix=CHECK_AGGRESSIVE %s
+
+# CHECK_REGULAR: redirection of never-taken jumps saved 4 bytes hot section code size and 12 bytes total code size
+# CHECK_REGULAR: .LBB00 (1 instructions, align : 1)
+# CHECK_REGULAR: Successors: .LBB2 (mispreds: 0, count: 0), .Ltmp2 (mispreds: 0, count: 20)
+# CHECK_REGULAR: .Ltmp2 (1 instructions, align : 1)
+# CHECK_REGULAR: Successors: .LBB2 (mispreds: 0, count: 20)
+# CHECK_REGULAR: .LBB2 (1 instructions, align : 1)
+# CHECK_REGULAR: Successors: .Ltmp0 (mispreds: 0, count: 0), .LFT0 (mispreds: 0, count: 20)
+# CHECK_REGULAR: .Ltmp3 (1 instructions, align : 1)
+# CHECK_REGULAR: HOT-COLD SPLIT POINT
+# CHECK_REGULAR: .Ltmp0 (2 instructions, align : 1)
+# CHECK_REGULAR: Successors: .LBB1 (mispreds: 0, count: 0), .LBB00 (mispreds: 0, count: 0)
+# CHECK_REGULAR: .Ltmp4 (1 instructions, align : 1)
+# CHECK_REGULAR: Successors: .LBB1 (mispreds: 0, count: 0)
+# CHECK_REGULAR: .LBB1 (2 instructions, align : 1)
+# CHECK_REGULAR: Successors: .LBB0 (mispreds: 0, count: 0), .Ltmp2 (mispreds: 0, count: 0)
+# CHECK_REGULAR: .Ltmp1 (1 instructions, align : 1)
+# CHECK_REGULAR: Successors: .Ltmp4 (mispreds: 0, count: 0), .LBB0 (mispreds: 0, count: 0)
+# CHECK_REGULAR: .LBB0 (1 instructions, align : 1)
+# CHECK_REGULAR: Successors: .Ltmp3 (mispreds: 0, count: 0)
+
+# CHECK_CONSERVATIVE_DEFAULT: redirection of never-taken jumps saved 4 bytes hot section code size and 8 bytes total code size
+# CHECK_CONSERVATIVE_THRESHOLD: redirection of never-taken jumps saved 4 bytes hot section code size and 12 bytes total code size
+# CHECK_AGGRESSIVE: redirection of never-taken jumps saved 4 bytes hot section code size and 12 bytes total code size
+
+
+ .globl main
+ .type main, @function
+main:
+LBB00:
+ ja Ltmp0
+# FDATA1: 1 main #LBB00# 1 main #Ltmp0# 0 0
+# FDATA1: 1 main #LBB00# 1 main #Ltmp2# 0 20
+# FDATA2: 1 main #LBB00# 1 main #Ltmp0# 0 0
+# FDATA2: 1 main #LBB00# 1 main #Ltmp2# 0 10
+Ltmp2:
+ cmpl $0,%eax
+Ltmp2Jcc:
+ ja Ltmp0
+# FDATA1: 1 main #Ltmp2Jcc# 1 main #Ltmp0# 0 0
+# FDATA1: 1 main #Ltmp2Jcc# 1 main #LFT0# 0 20
+# FDATA2: 1 main #Ltmp2Jcc# 1 main #Ltmp0# 0 0
+# FDATA2: 1 main #Ltmp2Jcc# 1 main #LFT0# 0 20
+LFT0:
+ ja Ltmp1
+# FDATA1: 1 main #LFT0# 1 main #Ltmp1# 0 0
+# FDATA1: 1 main #LFT0# 1 main #Ltmp3# 0 20
+# FDATA2: 1 main #LFT0# 1 main #Ltmp1# 0 0
+# FDATA2: 1 main #LFT0# 1 main #Ltmp3# 0 20
+Ltmp3:
+ ret
+Ltmp0:
+ jae Ltmp2
+ jmp LBB00
+Ltmp4:
+ cmpl $0,%eax
+ jae Ltmp2
+ jmp Ltmp3
+Ltmp1:
+ je Ltmp4
+ jmp Ltmp3
+.LLmain_end:
+ .size main, .LLmain_end-main
From 17f47b2d3cb61d935a0f83d5bc54dc98b36a231e Mon Sep 17 00:00:00 2001
From: Shatian Wang <shatian at meta.com>
Date: Mon, 28 Oct 2024 12:07:43 -0700
Subject: [PATCH 2/2] fixup! [BOLT][X86] Redirect never-taken jumps
---
bolt/lib/Passes/RedirectNeverTakenJumps.cpp | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/bolt/lib/Passes/RedirectNeverTakenJumps.cpp b/bolt/lib/Passes/RedirectNeverTakenJumps.cpp
index 418f055beff306..65bdc6637bbca8 100644
--- a/bolt/lib/Passes/RedirectNeverTakenJumps.cpp
+++ b/bolt/lib/Passes/RedirectNeverTakenJumps.cpp
@@ -112,9 +112,9 @@ void createJumps(BinaryFunction &Function, FunctionFragment &Fragment,
if (BI.Count != BinaryBasicBlock::COUNT_NO_PROFILE)
ExecCount = BI.Count;
- const uint64_t JumpEndAddr = TargetBB->getOutputStartAddress();
+ const uint64_t JumpEndAddr = TargetBB->getOutputAddressRange().first;
const uint64_t JumpStartAddr =
- SourceBB->getOutputEndAddress() - OffsetFromBlockEnd;
+ SourceBB->getOutputAddressRange().second - OffsetFromBlockEnd;
const uint8_t LongJumpSize =
IsUnconditional ? LongUncondJumpSize : LongCondJumpSize;
const uint8_t JumpInstrSize =