[llvm] [BOLT][AArch64] Add call relaxation pass (PR #173952)

Maksim Panchenko via llvm-commits llvm-commits at lists.llvm.org
Sun Jan 11 13:02:46 PST 2026


================
@@ -944,13 +953,306 @@ void LongJmpPass::relaxLocalBranches(BinaryFunction &BF) {
   }
 }
 
+void LongJmpPass::relaxCalls(BinaryContext &BC) {
+  // Operate on a copy of binary functions. We are going to manually insert new
+  // thunks and update the list.
+  BinaryFunctionListType OutputFunctions = BC.getOutputBinaryFunctions();
+
+  // Conservatively estimate emitted function size. Assume the worst case
+  // alignment.
+  auto estimateFunctionSize = [&](const BinaryFunction &BF) -> uint64_t {
+    if (!BC.shouldEmit(BF))
+      return 0;
+    uint64_t Size = BF.estimateSize();
+    if (BF.hasValidIndex())
+      Size += BF.getAlignment();
+
+    if (BF.hasIslandsInfo()) {
+      Size += BF.getConstantIslandAlignment();
+      Size += BF.estimateConstantIslandSize();
+    }
+
+    return Size;
+  };
+
+  // Map every function to its direct callees. Note that this is different from
+  // the regular call graph as here we completely ignore indirect calls.
+  uint64_t EstimatedSize = 0;
+  DenseMap<BinaryFunction *, std::set<const MCSymbol *>> CallMap;
+  for (BinaryFunction *BF : OutputFunctions) {
+    if (!BC.shouldEmit(*BF) || BF->isPatch())
+      continue;
+
+    EstimatedSize += estimateFunctionSize(*BF);
+
+    for (const BinaryBasicBlock &BB : *BF) {
+      for (const MCInst &Inst : BB) {
+        if (!BC.MIB->isCall(Inst) || BC.MIB->isIndirectCall(Inst) ||
+            BC.MIB->isIndirectBranch(Inst))
+          continue;
+        const MCSymbol *TargetSymbol = BC.MIB->getTargetSymbol(Inst);
+        assert(TargetSymbol);
+
+        // Ignore internal calls that use basic block labels as a destination.
+        if (!BC.getFunctionForSymbol(TargetSymbol))
+          continue;
+
+        CallMap[BF].insert(TargetSymbol);
+      }
+    }
+  }
+
+  LLVM_DEBUG(dbgs() << "LongJmp: estimated code size : " << EstimatedSize
+                    << '\n');
+
+  // Build clusters in the order the functions will appear in the output.
+  std::vector<FunctionCluster> Clusters;
+  for (size_t Index = 0, NumFuncs = OutputFunctions.size(); Index < NumFuncs;
+       ++Index) {
+    const size_t BFIndex =
+        opts::HotFunctionsAtEnd ? NumFuncs - Index - 1 : Index;
+    BinaryFunction *BF = OutputFunctions[BFIndex];
+    if (!BC.shouldEmit(*BF) || BF->isPatch())
+      continue;
+
+    const uint64_t BFSize = estimateFunctionSize(*BF);
+    if (Clusters.empty() || Clusters.back().Size + BFSize > MaxClusterSize) {
+      Clusters.emplace_back(FunctionCluster());
+      Clusters.back().FirstFunctionIndex = BFIndex;
+    }
+
+    FunctionCluster &FC = Clusters.back();
+    FC.Functions.insert(BF);
+
+    // When a function is added to the cluster, we have to remove all of its
+    // symbols from the cluster callee list. These include alternative symbols
+    // (e.g. after ICF) and secondary entry point symbols.
+    for (const MCSymbol *Symbol : BF->getSymbols()) {
+      auto It = FC.Callees.find(Symbol);
+      if (It != FC.Callees.end())
+        FC.Callees.erase(It);
+    }
+    BF->forEachEntryPoint(
+        [&FC](uint64_t Offset, const MCSymbol *EntrySymbol) -> bool {
+          auto It = FC.Callees.find(EntrySymbol);
+          if (It != FC.Callees.end())
+            FC.Callees.erase(It);
+          return true;
+        });
+
+    // Update cluster callee list with added function callees.
+    for (const MCSymbol *CalleeSymbol : CallMap[BF]) {
+      BinaryFunction *Callee = BC.getFunctionForSymbol(CalleeSymbol);
+      if (!FC.Functions.count(Callee)) {
+        FC.Callees.insert(CalleeSymbol);
+      }
+    }
+
+    FC.Size += BFSize;
+    FC.LastFunctionIndex = BFIndex;
+  }
+
+  if (opts::HotFunctionsAtEnd) {
+    std::reverse(Clusters.begin(), Clusters.end());
+    llvm::for_each(Clusters, [](FunctionCluster &FC) {
+      std::swap(FC.LastFunctionIndex, FC.FirstFunctionIndex);
+    });
+  }
+
+  if (Clusters.empty())
+    return;
+
+  // Print cluster stats.
+  BC.outs() << "BOLT-INFO: built " << Clusters.size()
+            << " function cluster(s)\n";
+  uint64_t ClusterIndex = 0;
+  for (const FunctionCluster &FC : Clusters) {
+    BC.outs() << "BOLT-INFO: cluster: " << ClusterIndex++ << '\n'
+              << "BOLT-INFO:   " << FC.Functions.size() << " function(s)\n"
+              << "BOLT-INFO:   " << FC.Callees.size() << " callee(s)\n"
+              << "BOLT-INFO:   " << FC.Size << " estimated bytes\n";
+  }
+
+  if (opts::RelaxPLT) {
+    // Populate one of the clusters with PLT functions based on the proximity of
+    // the PLT section to avoid unneeded thunk redirection.
+    const size_t PLTClusterNum = opts::UseOldText ? Clusters.size() - 1 : 0;
+    auto &PLTCluster = Clusters[PLTClusterNum];
+    for (BinaryFunction &BF :
+         llvm::make_second_range(BC.getBinaryFunctions())) {
+      if (BF.isPLTFunction()) {
+        PLTCluster.Functions.insert(&BF);
+        auto It = PLTCluster.Callees.find(BF.getSymbol());
+        if (It != PLTCluster.Callees.end())
+          PLTCluster.Callees.erase(It);
+      }
+    }
+  }
+
+  // Create a thunk with +-128MB span.
+  size_t NumShortThunks = 0;
+  auto createShortThunk = [&](const MCSymbol *TargetSymbol) {
+    ++NumShortThunks;
+    BinaryFunction *ThunkBF = BC.createThunkBinaryFunction(
+        "__AArch64Thunk_" + TargetSymbol->getName().str());
+    MCInst Inst;
+    BC.MIB->createTailCall(Inst, TargetSymbol, BC.Ctx.get());
+    ThunkBF->addBasicBlock()->addInstruction(Inst);
+
+    return ThunkBF;
+  };
+
+  // Create a thunk with +-4GB span.
+  size_t NumLongThunks = 0;
+  auto createLongThunk = [&](const MCSymbol *TargetSymbol) {
+    ++NumLongThunks;
+    BinaryFunction *ThunkBF = BC.createThunkBinaryFunction(
+        "__AArch64ADRPThunk_" + TargetSymbol->getName().str());
+    InstructionListType Instructions;
+    BC.MIB->createLongTailCall(Instructions, TargetSymbol, BC.Ctx.get());
+    ThunkBF->addBasicBlock()->addInstructions(Instructions);
+
+    return ThunkBF;
+  };
+
+  for (unsigned ClusterNum = 0; ClusterNum < Clusters.size(); ++ClusterNum) {
+    FunctionCluster &FC = Clusters[ClusterNum];
+    SmallVector<const MCSymbol *, 16> Callees(FC.Callees.begin(),
+                                              FC.Callees.end());
+
+    // Generate thunks in deterministic order.
+    llvm::sort(Callees, [&BC](const MCSymbol *A, const MCSymbol *B) {
+      uint64_t EntryA;
+      uint64_t EntryB;
+      BinaryFunction *BFA = BC.getFunctionForSymbol(A, &EntryA);
+      BinaryFunction *BFB = BC.getFunctionForSymbol(B, &EntryB);
+      if (BFA == BFB) {
+        if (EntryA != EntryB)
+          return EntryA < EntryB;
+
+        // Use lexicographical order for ICF'ed symbols.
+        return A->getName() < B->getName();
+      }
+      return compareBinaryFunctionByIndex(BFA, BFB);
+    });
+
+    // Return index of adjacent cluster containing the function.
+    auto getAdjClusterWithFunction =
+        [&](const BinaryFunction *BF) -> std::optional<unsigned> {
+      if (ClusterNum > 0 && Clusters[ClusterNum - 1].Functions.count(BF))
+        return ClusterNum - 1;
+      if (ClusterNum + 1 < Clusters.size() &&
+          Clusters[ClusterNum + 1].Functions.count(BF))
+        return ClusterNum + 1;
+      return std::nullopt;
+    };
+
+    const FunctionCluster *PrevCluster =
+        ClusterNum ? &Clusters[ClusterNum - 1] : nullptr;
+
+    // Create short thunks for callees in adjacent clusters and long thunks
+    // for callees outside.
+    for (const MCSymbol *Callee : Callees) {
+      if (FC.Thunks.count(Callee))
+        continue;
+
+      BinaryFunction *Thunk = 0;
+      std::optional<unsigned> AdjCluster =
+          getAdjClusterWithFunction(BC.getFunctionForSymbol(Callee));
+      if (AdjCluster) {
+        Thunk = createShortThunk(Callee);
+      } else {
+        // Previous cluster may already have a long thunk that can be reused.
+        if (PrevCluster) {
+          auto It = PrevCluster->Thunks.find(Callee);
+          // Reuse only if previous cluster hosts this thunk.
+          if (It != PrevCluster->Thunks.end() &&
+              llvm::is_contained(PrevCluster->ThunkList, It->second)) {
+            FC.Thunks[Callee] = It->second;
+            continue;
+          }
+        }
+        Thunk = createLongThunk(Callee);
+      }
+
+      // The cluster that will host this thunk. If the current cluster is the
+      // last one, try to use the previous one. Matters when we want to have hot
+      // functions at higher addresses under HotFunctionsAtEnd.
+      FunctionCluster *ThunkCluster = &Clusters[ClusterNum];
+      if ((AdjCluster && *AdjCluster == ClusterNum - 1) ||
+          (ClusterNum && ClusterNum == Clusters.size() - 1))
+        ThunkCluster = &Clusters[ClusterNum - 1];
+      ThunkCluster->ThunkList.push_back(Thunk);
+
+      // Register thunks for all symbols associated with the function.
+      uint64_t EntryID = 0;
+      const BinaryFunction *BF = BC.getFunctionForSymbol(Callee, &EntryID);
+      if (EntryID != 0) {
+        FC.Thunks[Callee] = Thunk;
+      } else {
+        for (const MCSymbol *Symbol : BF->getSymbols()) {
+          FC.Thunks[Symbol] = Thunk;
+        }
+      }
+    }
+  }
+
+  if (NumShortThunks)
----------------
maksfb wrote:

Will you find it useful? Normally, no thunks are created when all of the code fits into a single cluster, so such information becomes redundant.

https://github.com/llvm/llvm-project/pull/173952


More information about the llvm-commits mailing list