[llvm] [BOLT][AArch64] Add call relaxation pass (PR #173952)
Maksim Panchenko via llvm-commits
llvm-commits at lists.llvm.org
Sun Jan 11 12:54:45 PST 2026
================
@@ -944,13 +953,306 @@ void LongJmpPass::relaxLocalBranches(BinaryFunction &BF) {
}
}
+void LongJmpPass::relaxCalls(BinaryContext &BC) {
+ // Operate on a copy of binary functions. We are going to manually insert new
+ // thunks and update the list.
+ BinaryFunctionListType OutputFunctions = BC.getOutputBinaryFunctions();
+
+ // Conservatively estimate emitted function size. Assume the worst case
+ // alignment.
+ auto estimateFunctionSize = [&](const BinaryFunction &BF) -> uint64_t {
+ if (!BC.shouldEmit(BF))
+ return 0;
+ uint64_t Size = BF.estimateSize();
+ if (BF.hasValidIndex())
+ Size += BF.getAlignment();
+
+ if (BF.hasIslandsInfo()) {
+ Size += BF.getConstantIslandAlignment();
+ Size += BF.estimateConstantIslandSize();
+ }
+
+ return Size;
+ };
+
+ // Map every function to its direct callees. Note that this is different from
+ // the regular call graph as here we completely ignore indirect calls.
+ uint64_t EstimatedSize = 0;
+ DenseMap<BinaryFunction *, std::set<const MCSymbol *>> CallMap;
+ for (BinaryFunction *BF : OutputFunctions) {
+ if (!BC.shouldEmit(*BF) || BF->isPatch())
+ continue;
+
+ EstimatedSize += estimateFunctionSize(*BF);
+
+ for (const BinaryBasicBlock &BB : *BF) {
+ for (const MCInst &Inst : BB) {
+ if (!BC.MIB->isCall(Inst) || BC.MIB->isIndirectCall(Inst) ||
+ BC.MIB->isIndirectBranch(Inst))
+ continue;
+ const MCSymbol *TargetSymbol = BC.MIB->getTargetSymbol(Inst);
+ assert(TargetSymbol);
+
+ // Ignore internal calls that use basic block labels as a destination.
+ if (!BC.getFunctionForSymbol(TargetSymbol))
+ continue;
+
+ CallMap[BF].insert(TargetSymbol);
+ }
+ }
+ }
+
+ LLVM_DEBUG(dbgs() << "LongJmp: estimated code size : " << EstimatedSize
+ << '\n');
+
+ // Build clusters in the order the functions will appear in the output.
+ std::vector<FunctionCluster> Clusters;
+ for (size_t Index = 0, NumFuncs = OutputFunctions.size(); Index < NumFuncs;
+ ++Index) {
+ const size_t BFIndex =
+ opts::HotFunctionsAtEnd ? NumFuncs - Index - 1 : Index;
+ BinaryFunction *BF = OutputFunctions[BFIndex];
+ if (!BC.shouldEmit(*BF) || BF->isPatch())
+ continue;
+
+ const uint64_t BFSize = estimateFunctionSize(*BF);
+ if (Clusters.empty() || Clusters.back().Size + BFSize > MaxClusterSize) {
+ Clusters.emplace_back(FunctionCluster());
+ Clusters.back().FirstFunctionIndex = BFIndex;
+ }
+
+ FunctionCluster &FC = Clusters.back();
+ FC.Functions.insert(BF);
+
+ // When a function is added to the cluster, we have to remove all of its
+ // symbols from the cluster callee list. These include alternative symbols
+ // (e.g. after ICF) and secondary entry point symbols.
+ for (const MCSymbol *Symbol : BF->getSymbols()) {
+ auto It = FC.Callees.find(Symbol);
+ if (It != FC.Callees.end())
+ FC.Callees.erase(It);
+ }
+ BF->forEachEntryPoint(
+ [&FC](uint64_t Offset, const MCSymbol *EntrySymbol) -> bool {
+ auto It = FC.Callees.find(EntrySymbol);
+ if (It != FC.Callees.end())
+ FC.Callees.erase(It);
+ return true;
+ });
+
+ // Update cluster callee list with added function callees.
+ for (const MCSymbol *CalleeSymbol : CallMap[BF]) {
+ BinaryFunction *Callee = BC.getFunctionForSymbol(CalleeSymbol);
+ if (!FC.Functions.count(Callee)) {
+ FC.Callees.insert(CalleeSymbol);
+ }
+ }
+
+ FC.Size += BFSize;
+ FC.LastFunctionIndex = BFIndex;
+ }
+
+ if (opts::HotFunctionsAtEnd) {
+ std::reverse(Clusters.begin(), Clusters.end());
+ llvm::for_each(Clusters, [](FunctionCluster &FC) {
+ std::swap(FC.LastFunctionIndex, FC.FirstFunctionIndex);
+ });
+ }
+
+ if (Clusters.empty())
+ return;
+
+ // Print cluster stats.
+ BC.outs() << "BOLT-INFO: built " << Clusters.size()
+ << " function cluster(s)\n";
+ uint64_t ClusterIndex = 0;
+ for (const FunctionCluster &FC : Clusters) {
+ BC.outs() << "BOLT-INFO: cluster: " << ClusterIndex++ << '\n'
+ << "BOLT-INFO: " << FC.Functions.size() << " function(s)\n"
+ << "BOLT-INFO: " << FC.Callees.size() << " callee(s)\n"
+ << "BOLT-INFO: " << FC.Size << " estimated bytes\n";
+ }
+
+ if (opts::RelaxPLT) {
+ // Populate one of the clusters with PLT functions based on the proximity of
+ // the PLT section to avoid unneeded thunk redirection.
+ const size_t PLTClusterNum = opts::UseOldText ? Clusters.size() - 1 : 0;
+ auto &PLTCluster = Clusters[PLTClusterNum];
+ for (BinaryFunction &BF :
+ llvm::make_second_range(BC.getBinaryFunctions())) {
+ if (BF.isPLTFunction()) {
+ PLTCluster.Functions.insert(&BF);
+ auto It = PLTCluster.Callees.find(BF.getSymbol());
+ if (It != PLTCluster.Callees.end())
+ PLTCluster.Callees.erase(It);
+ }
+ }
+ }
+
+ // Create a thunk with +-128MB span.
+ size_t NumShortThunks = 0;
+ auto createShortThunk = [&](const MCSymbol *TargetSymbol) {
+ ++NumShortThunks;
+ BinaryFunction *ThunkBF = BC.createThunkBinaryFunction(
+ "__AArch64Thunk_" + TargetSymbol->getName().str());
+ MCInst Inst;
+ BC.MIB->createTailCall(Inst, TargetSymbol, BC.Ctx.get());
+ ThunkBF->addBasicBlock()->addInstruction(Inst);
+
+ return ThunkBF;
+ };
+
+ // Create a thunk with +-4GB span.
+ size_t NumLongThunks = 0;
+ auto createLongThunk = [&](const MCSymbol *TargetSymbol) {
+ ++NumLongThunks;
+ BinaryFunction *ThunkBF = BC.createThunkBinaryFunction(
+ "__AArch64ADRPThunk_" + TargetSymbol->getName().str());
+ InstructionListType Instructions;
+ BC.MIB->createLongTailCall(Instructions, TargetSymbol, BC.Ctx.get());
+ ThunkBF->addBasicBlock()->addInstructions(Instructions);
+
+ return ThunkBF;
+ };
+
+ for (unsigned ClusterNum = 0; ClusterNum < Clusters.size(); ++ClusterNum) {
+ FunctionCluster &FC = Clusters[ClusterNum];
+ SmallVector<const MCSymbol *, 16> Callees(FC.Callees.begin(),
+ FC.Callees.end());
+
+ // Generate thunks in deterministic order.
+ llvm::sort(Callees, [&BC](const MCSymbol *A, const MCSymbol *B) {
+ uint64_t EntryA;
+ uint64_t EntryB;
+ BinaryFunction *BFA = BC.getFunctionForSymbol(A, &EntryA);
+ BinaryFunction *BFB = BC.getFunctionForSymbol(B, &EntryB);
+ if (BFA == BFB) {
+ if (EntryA != EntryB)
+ return EntryA < EntryB;
+
+ // Use lexicographical order for ICF'ed symbols.
+ return A->getName() < B->getName();
+ }
+ return compareBinaryFunctionByIndex(BFA, BFB);
+ });
+
+ // Return index of adjacent cluster containing the function.
+ auto getAdjClusterWithFunction =
+ [&](const BinaryFunction *BF) -> std::optional<unsigned> {
+ if (ClusterNum > 0 && Clusters[ClusterNum - 1].Functions.count(BF))
+ return ClusterNum - 1;
+ if (ClusterNum + 1 < Clusters.size() &&
+ Clusters[ClusterNum + 1].Functions.count(BF))
+ return ClusterNum + 1;
+ return std::nullopt;
+ };
+
+ const FunctionCluster *PrevCluster =
+ ClusterNum ? &Clusters[ClusterNum - 1] : nullptr;
+
+ // Create short thunks for callees in adjacent clusters and long thunks
+ // for callees outside.
+ for (const MCSymbol *Callee : Callees) {
+ if (FC.Thunks.count(Callee))
+ continue;
+
+ BinaryFunction *Thunk = 0;
+ std::optional<unsigned> AdjCluster =
+ getAdjClusterWithFunction(BC.getFunctionForSymbol(Callee));
+ if (AdjCluster) {
+ Thunk = createShortThunk(Callee);
+ } else {
+ // Previous cluster may already have a long thunk that can be reused.
+ if (PrevCluster) {
+ auto It = PrevCluster->Thunks.find(Callee);
+ // Reuse only if previous cluster hosts this thunk.
+ if (It != PrevCluster->Thunks.end() &&
+ llvm::is_contained(PrevCluster->ThunkList, It->second)) {
+ FC.Thunks[Callee] = It->second;
+ continue;
+ }
+ }
+ Thunk = createLongThunk(Callee);
+ }
+
+ // The cluster that will host this thunk. If the current cluster is the
+ // last one, try to use the previous one. Matters when we want to have hot
+ // functions at higher addresses under HotFunctionsAtEnd.
+ FunctionCluster *ThunkCluster = &Clusters[ClusterNum];
+ if ((AdjCluster && *AdjCluster == ClusterNum - 1) ||
+ (ClusterNum && ClusterNum == Clusters.size() - 1))
+ ThunkCluster = &Clusters[ClusterNum - 1];
+ ThunkCluster->ThunkList.push_back(Thunk);
+
+ // Register thunks for all symbols associated with the function.
+ uint64_t EntryID = 0;
+ const BinaryFunction *BF = BC.getFunctionForSymbol(Callee, &EntryID);
+ if (EntryID != 0) {
+ FC.Thunks[Callee] = Thunk;
+ } else {
+ for (const MCSymbol *Symbol : BF->getSymbols()) {
+ FC.Thunks[Symbol] = Thunk;
+ }
+ }
+ }
+ }
+
+ if (NumShortThunks)
+ BC.outs() << "BOLT-INFO: " << NumShortThunks << " short thunks created\n";
+
+ if (NumLongThunks)
+ BC.outs() << "BOLT-INFO: " << NumLongThunks << " long thunks created\n";
+
+ // Replace callees with thunks.
+ for (FunctionCluster &FC : Clusters) {
+ for (BinaryFunction *BF : FC.Functions) {
+ if (!CallMap.count(BF))
+ continue;
+
+ for (BinaryBasicBlock &BB : *BF) {
+ for (MCInst &Inst : BB) {
+ if (!BC.MIB->isCall(Inst) || BC.MIB->isIndirectCall(Inst) ||
+ BC.MIB->isIndirectBranch(Inst))
+ continue;
+ const MCSymbol *TargetSymbol = BC.MIB->getTargetSymbol(Inst);
+ assert(TargetSymbol);
+
+ auto It = FC.Thunks.find(TargetSymbol);
+ if (It != FC.Thunks.end())
+ BC.MIB->replaceBranchTarget(Inst, It->second->getSymbol(),
+ BC.Ctx.get());
+ }
+ }
+ }
+ }
+
+ // Add thunks to the function list and assign a section name matching the
+ // function they follow.
+ for (const FunctionCluster &FC : llvm::reverse(Clusters)) {
+ std::string SectionName =
+ OutputFunctions[FC.LastFunctionIndex]->getCodeSectionName().str().str();
+ for (BinaryFunction *Thunk : FC.ThunkList) {
+ Thunk->setCodeSectionName(SectionName);
+ }
+
+ OutputFunctions.insert(
+ std::next(OutputFunctions.begin(), FC.LastFunctionIndex + 1),
+ FC.ThunkList.begin(), FC.ThunkList.end());
+ }
+
+ LLVM_DEBUG(dbgs() << "\nFunction layout with thunks:\n";
+ for (const auto *BF : OutputFunctions) { dbgs() << *BF << '\n'; });
+
+ BC.updateOutputBinaryFunctions(std::move(OutputFunctions));
+}
+
Error LongJmpPass::runOnFunctions(BinaryContext &BC) {
- assert((opts::CompactCodeModel ||
+ assert((opts::CompactCodeModel || opts::ExperimentalRelaxation ||
opts::SplitStrategy != opts::SplitFunctionsStrategy::CDSplit) &&
"LongJmp cannot work with functions split in more than two fragments");
- if (opts::CompactCodeModel) {
+ if (opts::CompactCodeModel || opts::ExperimentalRelaxation) {
----------------
maksfb wrote:
The call relaxation pass relies on the compact code model to perform local branch relaxation, i.e., the condition is correct and so is the message. Let me add this dependency explicitly.
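
For illustration only (the final change may differ, and the flag spellings below are assumed), one way to spell out the dependency is to imply the compact code model when the experimental relaxation is requested, e.g. at the top of `LongJmpPass::runOnFunctions()`:

```cpp
// Sketch, not the committed change: imply -compact-code-model when the
// experimental call relaxation is enabled, since the local branch relaxation
// path depends on it. Assumes the existing opts:: declarations are in scope.
if (opts::ExperimentalRelaxation && !opts::CompactCodeModel) {
  BC.outs() << "BOLT-INFO: call relaxation implies the compact code model\n";
  opts::CompactCodeModel = true;
}
```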
https://github.com/llvm/llvm-project/pull/173952