[clang] [lld] [llvm] [ThinLTO][Split] Split module for parallel compilation in backend (1/N) (PR #198702)
via cfe-commits
cfe-commits at lists.llvm.org
Fri Jun 12 01:55:54 PDT 2026
https://github.com/mmjjpp updated https://github.com/llvm/llvm-project/pull/198702
>From d9fc0bd6f1d5767f87eb515a665e76f6d00d1ea4 Mon Sep 17 00:00:00 2001
From: maojiaping <maojiaping1 at huawei.com>
Date: Wed, 20 May 2026 11:22:30 +0800
Subject: [PATCH 1/7] [ThinLTO][Split] Split module for parallel compilation in
backend
Add an interface for splitting a module by callgraph. The interface is
called in the ThinLTO backend phase: the module is split into N
partitions (MParts), and opt and codegen are performed on the MParts
in parallel to implement parallel compilation in the ThinLTO
backend.
---
.../llvm/Transforms/Utils/SplitModuleCG.h | 34 ++
llvm/lib/LTO/LTOBackend.cpp | 292 +++++++++++++++++-
llvm/lib/Transforms/Utils/CMakeLists.txt | 1 +
llvm/lib/Transforms/Utils/SplitModuleCG.cpp | 26 ++
4 files changed, 336 insertions(+), 17 deletions(-)
create mode 100644 llvm/include/llvm/Transforms/Utils/SplitModuleCG.h
create mode 100644 llvm/lib/Transforms/Utils/SplitModuleCG.cpp
diff --git a/llvm/include/llvm/Transforms/Utils/SplitModuleCG.h b/llvm/include/llvm/Transforms/Utils/SplitModuleCG.h
new file mode 100644
index 0000000000000..e60c4e931d40c
--- /dev/null
+++ b/llvm/include/llvm/Transforms/Utils/SplitModuleCG.h
@@ -0,0 +1,34 @@
+#ifndef LLVM_TRANSFORMS_UTILS_SPLITMODULECG_H
+#define LLVM_TRANSFORMS_UTILS_SPLITMODULECG_H
+
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/Analysis/ModuleSummaryAnalysis.h"
+#include "llvm/LTO/Config.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+
+namespace llvm {
+/// Splits the module M into N linkable partitions. The function ModuleCallback
+/// is called N times passing each individual partition as the MPart argument.
+class SplitModuleCG {
+public:
+ using ModuleCreationCallback =
+ function_ref<void(std::unique_ptr<Module> MPart, unsigned PartitionId)>;
+ SplitModuleCG(Module &M,
+ const ModuleSummaryIndex &CombinedIndex,
+ unsigned LimitPartition = 0);
+ void SplitModule(ModuleCreationCallback ModuleCallback,
+ const llvm::lto::Config &C);
+
+ unsigned getPartitionNum() { return N; }
+
+ private:
+ unsigned N;
+ Module &M;
+ CallGraph CG;
+ DenseSet<const Function *> EntryFuncs;
+};
+
+} // end namespace llvm
+
+#endif // LLVM_TRANSFORMS_UTILS_SPLITMODULECG_H
diff --git a/llvm/lib/LTO/LTOBackend.cpp b/llvm/lib/LTO/LTOBackend.cpp
index 73697a9d0d446..11200ade0e8c0 100644
--- a/llvm/lib/LTO/LTOBackend.cpp
+++ b/llvm/lib/LTO/LTOBackend.cpp
@@ -34,8 +34,10 @@
#include "llvm/Plugins/PassPlugin.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/FileUtilities.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Path.h"
+#include "llvm/Support/Program.h"
#include "llvm/Support/ThreadPool.h"
#include "llvm/Support/ToolOutputFile.h"
#include "llvm/Support/VirtualFileSystem.h"
@@ -45,6 +47,8 @@
#include "llvm/Transforms/IPO/WholeProgramDevirt.h"
#include "llvm/Transforms/Utils/FunctionImportUtils.h"
#include "llvm/Transforms/Utils/SplitModule.h"
+#include "llvm/Transforms/Utils/SplitModuleCG.h"
+#include <filesystem>
#include <optional>
using namespace llvm;
@@ -80,6 +84,23 @@ static cl::list<std::string>
"path matches this for -save-temps options"),
cl::CommaSeparated, cl::Hidden);
+static cl::opt<unsigned> ThinLTOSplitModuleSizeThreshold(
+ "thinlto-split-module-size-threshold", cl::Hidden, cl::init(500),
+ cl::desc("Only split a module in the thinlto backend if its size "
+ "(number of IR instructions) reaches this threshold."));
+
+static cl::opt<float> ThinLTOSplitModuleSizeRateThreshold(
+ "thinlto-split-module-size-rate-threshold", cl::Hidden, cl::init(0.5),
+ cl::desc("Whether to split in thinlto backend based on the ratio of "
+ "(callgraph size)/(module size)"));
+
+static cl::opt<unsigned> ThinLTOSplitPartitions(
+ "thinlto-split-partitions", cl::Hidden, cl::init(0),
+ cl::desc("Control split to how many partitions in thinlto backend."));
+
+static cl::opt<bool> ThinLTOSplit("thinlto-split", cl::init(false),
+ cl::desc("Enable split module in thinlto backend."));
+
namespace llvm {
extern cl::opt<bool> NoPGOWarnMismatch;
}
@@ -124,12 +145,19 @@ Error Config::addSaveTemps(std::string OutputFileName, bool UseInputModulePath,
if (LinkerHook && !LinkerHook(Task, M))
return false;
+ auto extract_filename = [](const std::string &path) -> std::string {
+ std::filesystem::path fs_path(path);
+ return fs_path.filename().string();
+ };
+
std::string PathPrefix;
// If this is the combined module (not a ThinLTO backend compile) or the
// user hasn't requested using the input module's path, emit to a file
// named from the provided OutputFileName with the Task ID appended.
if (M.getModuleIdentifier() == "ld-temp.o" || !UseInputModulePath) {
PathPrefix = OutputFileName;
+ if (ThinLTOSplit)
+ PathPrefix += extract_filename(M.getSourceFileName()) + ".";
if (Task != (unsigned)-1)
PathPrefix += utostr(Task) + ".";
} else
@@ -513,6 +541,212 @@ static void codegen(const Config &Conf, TargetMachine *TM,
report_fatal_error(std::move(Err));
}
+static unsigned calFunctionSize(const llvm::Function &F) {
+ unsigned NumInsts = 0; // Instructions counted so far across F's blocks.
+ for (const llvm::BasicBlock &Block : F)
+ NumInsts += std::distance(Block.begin(), Block.end());
+ return NumInsts;
+}
+
+static unsigned calModuleSize(const llvm::Module &M) {
+ unsigned Total = 0; // Sum of calFunctionSize() over every function in M.
+ for (const llvm::Function &Fn : M)
+ Total += calFunctionSize(Fn);
+ return Total;
+}
+
+static bool canDoSplitModule(const llvm::Module &M) {
+ // A module qualifies for splitting once its size reaches the value of
+ // -thinlto-split-module-size-threshold; smaller modules are rejected.
+ return calModuleSize(M) >= ThinLTOSplitModuleSizeThreshold;
+}
+
+static bool HasLargeCG(Module &Mod, const ModuleSummaryIndex &CombinedIndex) {
+ // TODO: Check whether the module has large callgraphs. ThinLTO parallel
+ // compilation only brings benefits when multiple callgraphs are split.
+ return true; // Placeholder: Mod and CombinedIndex are not inspected yet.
+}
+
+struct TaskIdAllocator {
+ using TaskId = unsigned;
+
+ // The top bit of a TaskId acts as a namespace marker:
+ // - ids used by the original ThinLTO backend tasks are expected to leave
+ // it clear;
+ // - ids handed out by alloc() always have it set.
+ // Hence the two kinds of id can never be equal.
+ static constexpr TaskId tag() {
+ return TaskId{1} << (std::numeric_limits<TaskId>::digits - 1);
+ }
+
+ // Next sequence number to hand out; its own top bit has to stay clear.
+ std::atomic<TaskId> seq{0};
+
+ // Returns a fresh id for one split partition, formed as `tag() | seq`.
+ TaskId alloc() {
+ const TaskId Next = seq.fetch_add(1, std::memory_order_relaxed);
+
+ // A sequence number that carries the tag bit means the counter has run
+ // into the partition namespace, i.e. too many partitions were requested.
+ if ((Next & tag()) != 0)
+ report_fatal_error("Partition TaskId overflow: seq reached the tag bit.");
+
+ return Next | tag();
+ }
+
+ // Tells whether an id carries the partition tag (sanity checks/debugging).
+ static bool isPartition(TaskId id) { return (tag() & id) != 0; }
+};
+
+// Global allocator shared by all split partitions.
+static TaskIdAllocator gSplitTaskIds;
+
+// Splits Mod into partitions, runs opt (if DoOpt) and codegen on each one,
+// then links the partition objects into one object written via AddStream.
+static bool splitOptAndCodeGenThin(unsigned task, const Config &C,
+ TargetMachine *TM, AddStreamFn AddStream,
+ unsigned ParallelCodeGenParallelismLevel,
+ Module &Mod,
+ const ModuleSummaryIndex &CombinedIndex,
+ const std::vector<uint8_t> &CmdArgs,
+ bool DoOpt, AddStreamFn IRAddStream,
+ ArrayRef<StringRef> &BitcodeLibFuncs) {
+ unsigned ThreadCount = 0;
+ const Target *T = &TM->getTarget();
+
+ static std::mutex PrintMutex;
+
+ SplitModuleCG SplitModuleCG(Mod, CombinedIndex, ParallelCodeGenParallelismLevel);
+ ParallelCodeGenParallelismLevel = SplitModuleCG.getPartitionNum();
+
+ std::vector<std::string> TempObjectFiles(ParallelCodeGenParallelismLevel);
+ std::vector<llvm::FileRemover> TempFileRemovers(ParallelCodeGenParallelismLevel);
+
+ const auto HandleModulePartition = [&](std::unique_ptr<Module> MPart,
+ unsigned PartitionId) {
+ unsigned CurrentThreadId, UniqueTaskId;
+ {
+ std::lock_guard<std::mutex> Lock(PrintMutex);
+ CurrentThreadId = ThreadCount++;
+
+ // In distributed ThinLTO, `task` may be a sentinel (e.g. -1 cast to
+ // unsigned), which becomes UINT_MAX and naturally has MSB==1. Treat it
+ // as "no base task id" and don't enforce the namespace check on it.
+ //
+ // We do not rely on the incoming `task` for partition uniqueness: split
+ // partitions get a dedicated UniqueTaskId allocated below.
+ assert((task == std::numeric_limits<unsigned>::max() ||
+ !TaskIdAllocator::isPartition(task)) &&
+ "Original ThinLTO TaskId unexpectedly overlaps the partition "
+ "namespace");
+ UniqueTaskId = gSplitTaskIds.alloc();
+ }
+
+ std::unique_ptr<TargetMachine> ThreadTM = createTargetMachine(C, T, *MPart);
+
+ if (DoOpt) {
+ if (!opt(C, ThreadTM.get(), UniqueTaskId, *MPart, /*IsThinLTO=*/true,
+ /*ExportSummary=*/nullptr, /*ImportSummary=*/&CombinedIndex,
+ CmdArgs, BitcodeLibFuncs))
+ report_fatal_error("Failed to gen opt for split mod in thread.");
+
+ // Save the current module before the first codegen round.
+ // Note that the second codegen round runs only `codegen()` without
+ // running `opt()`. We're not reaching here as it's bailed out earlier
+ // with `CodeGenOnly` which has been set in `SecondRoundThinBackend`.
+ if (IRAddStream)
+ cgdata::saveModuleForTwoRounds(*MPart, task + CurrentThreadId,
+ IRAddStream);
+ }
+
+ auto splitStream = [&](unsigned task, const Twine &moduleName)
+ -> Expected<std::unique_ptr<CachedFileStream>> {
+ int FD;
+ SmallString<128> TempFilename;
+ if (std::error_code EC = sys::fs::createTemporaryFile(
+ "thinlto-split", "o", FD, TempFilename))
+ return errorCodeToError(EC);
+
+ TempObjectFiles[PartitionId] = std::string(TempFilename.str());
+ TempFileRemovers[PartitionId].setFile(TempObjectFiles[PartitionId]);
+
+ auto OS = std::make_unique<raw_fd_ostream>(
+ FD, true, /*CloseOnDestruct*/true);
+
+ auto Stream = std::make_unique<CachedFileStream>(
+ std::move(OS), std::string(TempFilename.str()));
+
+ return std::move(Stream);
+ };
+
+ codegen(C, ThreadTM.get(), splitStream, UniqueTaskId, *MPart,
+ CombinedIndex);
+ };
+
+ SplitModuleCG.SplitModule(HandleModulePartition, C);
+
+ // Use ld.lld to combine the partitions into an object.
+ llvm::erase(TempObjectFiles, std::string()); // Slots no partition filled.
+ if (TempObjectFiles.empty()) {
+ llvm::errs() << "TempObjectFiles.empty()\n";
+ return true;
+ }
+
+ auto FinalStream = AddStream(task, Mod.getModuleIdentifier());
+ if (!FinalStream)
+ report_fatal_error("Failed to open final output stream");
+
+ int MergedFD;
+ SmallString<128> MergedFilename;
+ if (sys::fs::createTemporaryFile("thinlto-merged", "o", MergedFD,
+ MergedFilename))
+ report_fatal_error("Failed to create merged temp file.");
+ llvm::FileRemover MergedFileRemover(MergedFilename);
+ sys::fs::closeFile(MergedFD);
+
+ std::vector<StringRef> Args;
+ std::string LinkerPath = "";
+ if (auto Path = sys::findProgramByName("ld.lld"))
+ LinkerPath = *Path;
+ else if (auto Path = sys::findProgramByName("ld"))
+ LinkerPath = *Path;
+
+ if (LinkerPath.empty())
+ report_fatal_error("Cannot find linker (ld or ld.lld) to merge partitions.");
+
+ Args.push_back(LinkerPath);
+ Args.push_back("-r");
+ Args.push_back("-o");
+ Args.push_back(MergedFilename);
+
+ for (const auto &File : TempObjectFiles)
+ Args.push_back(File);
+
+ std::string ErrMsg;
+ int Result = sys::ExecuteAndWait(LinkerPath, Args, /*Env=*/std::nullopt,
+ /*Redirects=*/{}, /*SecondsToWait=*/0,
+ /*MemoryLimit=*/0, &ErrMsg);
+
+ if (Result != 0) {
+ errs() << "Linker failed: " << ErrMsg << "\n";
+ report_fatal_error("Failed to merge split objects.");
+ }
+
+ {
+ std::unique_ptr<CachedFileStream> &FinalFileStream = *FinalStream;
+ auto BufferOrErr = MemoryBuffer::getFile(MergedFilename);
+ if (!BufferOrErr)
+ report_fatal_error("Failed to read merged object.");
+
+ FinalFileStream->OS->write(BufferOrErr.get()->getBufferStart(),
+ BufferOrErr.get()->getBufferSize());
+ if (Error Err = FinalFileStream->commit())
+ report_fatal_error(Twine("Failed to commit final file stream: ") +
+ toString(std::move(Err)));
+ }
+ return true; // Failures above are reported through report_fatal_error.
+}
+
static void splitCodeGen(const Config &C, TargetMachine *TM,
AddStreamFn AddStream,
unsigned ParallelCodeGenParallelismLevel, Module &Mod,
@@ -677,11 +911,28 @@ Error lto::thinBackend(const Config &Conf, unsigned Task, AddStreamFn AddStream,
// the module, if applicable.
Mod.setPartialSampleProfileRatio(CombinedIndex);
+ bool ProfitableToSplit = true;
+ if (ThinLTOSplit) {
+ if (!canDoSplitModule(Mod) || !HasLargeCG(Mod, CombinedIndex)) {
+ ProfitableToSplit = false;
+ LLVM_DEBUG(dbgs() << "warning: thinlto split not enable for module: "
+ << Mod.getName());
+ } else {
+ LLVM_DEBUG(dbgs() << "thinlto: split codegen for module: "
+ << Mod.getName());
+ }
+ }
+
LLVM_DEBUG(dbgs() << "Running ThinLTO\n");
if (CodeGenOnly) {
- // If CodeGenOnly is set, we only perform code generation and skip
- // optimization. This value may differ from Conf.CodeGenOnly.
- codegen(Conf, TM.get(), AddStream, Task, Mod, CombinedIndex);
+ if (ThinLTOSplit && ProfitableToSplit)
+ splitOptAndCodeGenThin(Task, Conf, TM.get(), AddStream,
+ ThinLTOSplitPartitions, Mod, CombinedIndex,
+ CmdArgs, false, IRAddStream, BitcodeLibFuncs);
+ else
+ // If CodeGenOnly is set, we only perform code generation and skip
+ // optimization. This value may differ from Conf.CodeGenOnly.
+ codegen(Conf, TM.get(), AddStream, Task, Mod, CombinedIndex);
return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile));
}
@@ -691,20 +942,27 @@ Error lto::thinBackend(const Config &Conf, unsigned Task, AddStreamFn AddStream,
auto OptimizeAndCodegen =
[&](Module &Mod, TargetMachine *TM,
LLVMRemarkFileHandle DiagnosticOutputFile) {
- // Perform optimization and code generation for ThinLTO.
- if (!opt(Conf, TM, Task, Mod, /*IsThinLTO=*/true,
- /*ExportSummary=*/nullptr, /*ImportSummary=*/&CombinedIndex,
- CmdArgs, BitcodeLibFuncs))
- return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile));
-
- // Save the current module before the first codegen round.
- // Note that the second codegen round runs only `codegen()` without
- // running `opt()`. We're not reaching here as it's bailed out earlier
- // with `CodeGenOnly` which has been set in `SecondRoundThinBackend`.
- if (IRAddStream)
- cgdata::saveModuleForTwoRounds(Mod, Task, IRAddStream);
-
- codegen(Conf, TM, AddStream, Task, Mod, CombinedIndex);
+ if (ThinLTOSplit && ProfitableToSplit) {
+ if (!splitOptAndCodeGenThin(
+ Task, Conf, TM, AddStream, ThinLTOSplitPartitions, Mod,
+ CombinedIndex, CmdArgs, true, IRAddStream, BitcodeLibFuncs))
+ return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile));
+ } else {
+ // Perform optimization and code generation for ThinLTO.
+ if (!opt(Conf, TM, Task, Mod, /*IsThinLTO=*/true,
+ /*ExportSummary=*/nullptr, /*ImportSummary=*/&CombinedIndex,
+ CmdArgs, BitcodeLibFuncs))
+ return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile));
+
+ // Save the current module before the first codegen round.
+ // Note that the second codegen round runs only `codegen()` without
+ // running `opt()`. We're not reaching here as it's bailed out earlier
+ // with `CodeGenOnly` which has been set in `SecondRoundThinBackend`.
+ if (IRAddStream)
+ cgdata::saveModuleForTwoRounds(Mod, Task, IRAddStream);
+
+ codegen(Conf, TM, AddStream, Task, Mod, CombinedIndex);
+ }
return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile));
};
diff --git a/llvm/lib/Transforms/Utils/CMakeLists.txt b/llvm/lib/Transforms/Utils/CMakeLists.txt
index 8fe0476ab1a32..01b44ae2cfa29 100644
--- a/llvm/lib/Transforms/Utils/CMakeLists.txt
+++ b/llvm/lib/Transforms/Utils/CMakeLists.txt
@@ -89,6 +89,7 @@ add_llvm_component_library(LLVMTransformUtils
SizeOpts.cpp
SplitModule.cpp
SplitModuleByCategory.cpp
+ SplitModuleCG.cpp
StripNonLineTableDebugInfo.cpp
SymbolRewriter.cpp
UnifyFunctionExitNodes.cpp
diff --git a/llvm/lib/Transforms/Utils/SplitModuleCG.cpp b/llvm/lib/Transforms/Utils/SplitModuleCG.cpp
new file mode 100644
index 0000000000000..9f57cb3ed566e
--- /dev/null
+++ b/llvm/lib/Transforms/Utils/SplitModuleCG.cpp
@@ -0,0 +1,26 @@
+#include "llvm/Transforms/Utils/SplitModuleCG.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "split-module-CG"
+
+void SplitModuleCG::SplitModule(ModuleCreationCallback ModuleCallback,
+ const llvm::lto::Config &C) {
+ // TODO: 1. Process the linkage of the GlobalValue; 2. Allocate the callgraph
+ // to N partitions; 3.Invoke the cloneModule API to copy the N partitions to
+ // obtain MParts.
+
+}
+
+SplitModuleCG::SplitModuleCG(Module &M,
+ const ModuleSummaryIndex &CombinedIndex,
+ unsigned LimitPartition)
+ : M(M), CG(M), N(LimitPartition) {
+ // TODO: The module is split based on the callgraph, and EntryFuncs stores
+ // the root function of each callgraph.
+
+ if (N == 0 || N > EntryFuncs.size()) {
+ N = EntryFuncs.size();
+ }
+ N = N == 0 ? 1 : N;
+}
>From b80904856475a12f4c6a010c1730324406d7b595 Mon Sep 17 00:00:00 2001
From: maojiaping <maojiaping1 at huawei.com>
Date: Wed, 20 May 2026 15:27:29 +0800
Subject: [PATCH 2/7] [Thinlto][Split] Add callgraph-based module
splitting(SplitModuleCG)
Add a new SplitModuleCG utility that partitions a module into multiple
parts using function callgraph traversal and cost-based load balancing.
This is intended for use in ThinLTO to parallelize code generation by
splitting the module while preserving function call dependencies.
Key features:
- Build a simplified callgraph to track function calls and roots
- Calculate function costs based on IR instruction count
- Partition functions with balanced cost distribution
- Externalize local symbols and rename promoted symbols to avoid
conflicts
- Clone module partitions and emit them in parallel
---
.../llvm/Transforms/Utils/SplitModuleCG.h | 182 ++++++++-
llvm/lib/LTO/LTOBackend.cpp | 10 +
llvm/lib/Transforms/Utils/SplitModuleCG.cpp | 367 +++++++++++++++++-
3 files changed, 552 insertions(+), 7 deletions(-)
diff --git a/llvm/include/llvm/Transforms/Utils/SplitModuleCG.h b/llvm/include/llvm/Transforms/Utils/SplitModuleCG.h
index e60c4e931d40c..956a1ea8030fe 100644
--- a/llvm/include/llvm/Transforms/Utils/SplitModuleCG.h
+++ b/llvm/include/llvm/Transforms/Utils/SplitModuleCG.h
@@ -1,6 +1,7 @@
#ifndef LLVM_TRANSFORMS_UTILS_SPLITMODULECG_H
#define LLVM_TRANSFORMS_UTILS_SPLITMODULECG_H
+#include "llvm/ADT/StringSet.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/ModuleSummaryAnalysis.h"
#include "llvm/LTO/Config.h"
@@ -8,6 +9,169 @@
#include "llvm/ADT/DenseSet.h"
namespace llvm {
+
+class SimplifyCallGraph;
+class SimplifyCallGraphNode;
+
+using CostType = InstructionCost::CostType;
+
+class SimplifyCallGraph {
+ using FunctionMapTy =
+ std::map<const Function *, std::unique_ptr<SimplifyCallGraphNode>>;
+
+ /// A map from \c Function* to the \c SimplifyCallGraphNode it owns.
+ FunctionMapTy FunctionMap;
+
+public:
+ explicit SimplifyCallGraph(CallGraph &CG,
+ const ModuleSummaryIndex &CombinedIndex,
+ Module &M)
+ : CG(CG), M(M) {
+ createSimplifyCallGraph(CombinedIndex);
+ }
+ ~SimplifyCallGraph() {};
+
+ using iterator = FunctionMapTy::iterator;
+ using const_iterator = FunctionMapTy::const_iterator;
+
+ /// Iterators over the (function, node) entries of the call graph.
+ inline iterator begin() { return FunctionMap.begin(); }
+ inline iterator end() { return FunctionMap.end(); }
+ inline const_iterator begin() const { return FunctionMap.begin(); }
+ inline const_iterator end() const { return FunctionMap.end(); }
+
+ /// Returns the node for \p F; asserts that \p F is in the graph.
+ inline const SimplifyCallGraphNode *operator[](const Function *F) const {
+ const_iterator I = FunctionMap.find(F);
+ assert(I != FunctionMap.end() && "Function not in callgraph!");
+ return I->second.get();
+ }
+
+ /// Returns the node for \p F; asserts that \p F is in the graph.
+ inline SimplifyCallGraphNode *operator[](const Function *F) {
+ const_iterator I = FunctionMap.find(F);
+ assert(I != FunctionMap.end() && "Function not in callgraph!");
+ return I->second.get();
+ }
+
+ /// Same lookup as operator[]; asserts that \p F is in the graph.
+ inline const SimplifyCallGraphNode *at(const Function *F) const {
+ const_iterator I = FunctionMap.find(F);
+ assert(I != FunctionMap.end() && "Function not in callgraph!");
+ return I->second.get();
+ }
+
+ /// Same lookup as operator[]; asserts that \p F is in the graph.
+ inline SimplifyCallGraphNode *at(const Function *F) {
+ const_iterator I = FunctionMap.find(F);
+ assert(I != FunctionMap.end() && "Function not in callgraph!");
+ return I->second.get();
+ }
+
+ void createSimplifyCallGraph(const ModuleSummaryIndex &CombinedIndex);
+ void print();
+ SimplifyCallGraphNode *getOrInsertFunction(const Function *F);
+
+private:
+ CallGraph &CG;
+ Module &M;
+};
+
+class SimplifyCallGraphNode {
+public:
+ using CalledFunctionsSet = DenseSet<SimplifyCallGraphNode *>;
+ inline SimplifyCallGraphNode(SimplifyCallGraph *SCG, Function *F)
+ : SCG(SCG), F(F) {}
+
+ SimplifyCallGraphNode(const SimplifyCallGraphNode &) = delete;
+ SimplifyCallGraphNode &operator=(const SimplifyCallGraphNode &) = delete;
+
+ ~SimplifyCallGraphNode() {}
+
+ Function *getFunction() const { return F; }
+
+ unsigned getNumReferences() const { return NumReferences; }
+
+ using iterator = DenseSet<SimplifyCallGraphNode *>::iterator;
+ using const_iterator = DenseSet<SimplifyCallGraphNode *>::const_iterator;
+
+ inline iterator begin() { return CalledFunctions.begin(); }
+ inline iterator end() { return CalledFunctions.end(); }
+ inline const_iterator begin() const { return CalledFunctions.begin(); }
+ inline const_iterator end() const { return CalledFunctions.end(); }
+ inline size_t count(SimplifyCallGraphNode * SCGNode) { return CalledFunctions.count(SCGNode); }
+ inline bool empty() const { return CalledFunctions.empty(); }
+ inline unsigned size() const { return (unsigned)CalledFunctions.size(); }
+
+ void addCalledFunction(SimplifyCallGraphNode *Called) {
+ auto [It, Inserted] = CalledFunctions.insert(Called);
+ if (Inserted) // Count the reference only when the edge is new.
+ Called->AddRef();
+ }
+
+ void removeCalledFunction(SimplifyCallGraphNode *Called) {
+ auto NumRemoved = CalledFunctions.erase(Called);
+ if (NumRemoved > 0) // Only drop a reference for an edge that existed.
+ Called->DropRef();
+ }
+
+private:
+ friend class SimplifyCallGraph;
+
+ SimplifyCallGraph *SCG;
+ Function *F;
+
+ DenseSet<SimplifyCallGraphNode *> CalledFunctions; // Distinct callee nodes.
+ unsigned NumReferences = 0; // Nodes that currently list this one as callee.
+
+ void DropRef() { --NumReferences; }
+ void AddRef() { ++NumReferences; }
+};
+
+/// Collects into \p Fns every defined function that is transitively reachable
+/// from \p F through the callee edges of \p SCG; \p F itself is never added.
+/// Declared `inline` (not `static`) because the definition lives in a header
+/// that is included from several translation units.
+inline void addAllDependencies(SimplifyCallGraph &SCG, const Function &F,
+ DenseSet<const Function *> &Fns) {
+ assert(!F.isDeclaration());
+ SmallVector<const Function *> WorkList({&F});
+
+ while (!WorkList.empty()) {
+ const auto &CurFn = *WorkList.pop_back_val();
+ assert(!CurFn.isDeclaration());
+
+ // Visit every callee recorded for CurFn in the simplified callgraph. A
+ // callee is queued only the first time it is added to Fns, so each
+ // function is expanded at most once even when the graph has cycles.
+ for (auto &SCGNode : *SCG.at(&CurFn)) {
+ auto *Callee = SCGNode->getFunction();
+ if (!Callee || Callee->isDeclaration() || Callee == &F)
+ continue;
+ if (Fns.insert(Callee).second)
+ WorkList.push_back(Callee);
+ }
+ }
+}
+
+struct FunctionWithDependencies {
+ FunctionWithDependencies(SimplifyCallGraph &SCG,
+ const DenseMap<const Function *, CostType> &FnCosts,
+ const Function *F)
+ : F(F) {
+ addAllDependencies(SCG, *F, Dependencies);
+
+ TotalCost = FnCosts.at(F); // The root's own cost comes first.
+ for (const auto *Dep : Dependencies) {
+ TotalCost += FnCosts.lookup(Dep);
+ }
+ }
+
+ const Function *F = nullptr; // Root function of this entry.
+ DenseSet<const Function *> Dependencies; // Filled by addAllDependencies.
+ CostType TotalCost = 0; // Cost of F plus the cost of every dependency.
+};
+
/// Splits the module M into N linkable partitions. The function ModuleCallback
/// is called N times passing each individual partition as the MPart argument.
class SplitModuleCG {
@@ -21,12 +185,28 @@ class SplitModuleCG {
const llvm::lto::Config &C);
unsigned getPartitionNum() { return N; }
+ StringSet<> &getOriginalExternals() { return OriginalExternals; }
+ StringMap<std::string> &getPromotedRenames() { return PromotedRenames; }
- private:
+private:
unsigned N;
Module &M;
CallGraph CG;
+ std::unique_ptr<SimplifyCallGraph> SCG;
+ CostType ModuleCost;
DenseSet<const Function *> EntryFuncs;
+ StringSet<> OriginalExternals;
+ StringMap<std::string> PromotedRenames;
+ DenseMap<const Function *, bool> externalFunction;
+ DenseMap<const Function *, CostType> FuncsCosts;
+ SmallVector<FunctionWithDependencies> FWDWorkList;
+
+ void calculateFunctionCosts();
+ std::vector<DenseSet<const Function *>> doPartitioning();
+ void dealWithMpart(
+ Module &MPart, unsigned I,
+ function_ref<bool(const GlobalValue *)> NeedsConservativeImport);
+ void createWorkList();
};
} // end namespace llvm
diff --git a/llvm/lib/LTO/LTOBackend.cpp b/llvm/lib/LTO/LTOBackend.cpp
index 11200ade0e8c0..aa1213e5e6af1 100644
--- a/llvm/lib/LTO/LTOBackend.cpp
+++ b/llvm/lib/LTO/LTOBackend.cpp
@@ -658,6 +658,16 @@ static bool splitOptAndCodeGenThin(unsigned task, const Config &C,
cgdata::saveModuleForTwoRounds(*MPart, task + CurrentThreadId,
IRAddStream);
}
+
+ // Rename the GlobalValues whose internal is changed to external. That's
+ // can avoid duplicate symbols.
+ auto PromotedRenames = SplitModuleCG.getPromotedRenames();
+ for (auto &GV : MPart->global_values()) {
+ if (auto It = PromotedRenames.find(GV.getName());
+ It != PromotedRenames.end()) {
+ GV.setName(It->second);
+ }
+ }
auto splitStream = [&](unsigned task, const Twine &moduleName)
-> Expected<std::unique_ptr<CachedFileStream>> {
diff --git a/llvm/lib/Transforms/Utils/SplitModuleCG.cpp b/llvm/lib/Transforms/Utils/SplitModuleCG.cpp
index 9f57cb3ed566e..debdddfb79041 100644
--- a/llvm/lib/Transforms/Utils/SplitModuleCG.cpp
+++ b/llvm/lib/Transforms/Utils/SplitModuleCG.cpp
@@ -1,26 +1,381 @@
#include "llvm/Transforms/Utils/SplitModuleCG.h"
-
+#include "llvm/Bitcode/BitcodeReader.h"
+#include "llvm/Bitcode/BitcodeWriter.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/MD5.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include <thread>
using namespace llvm;
#define DEBUG_TYPE "split-module-CG"
+namespace {
+
+static cl::opt<bool> enablePrintSimplifyCallGraph(
+ "enable-print-simplify-callgraph", cl::Hidden, cl::init(false),
+ cl::desc("print SimplifyCallGraph"));
+
+using PartitionID = unsigned; // Index of a partition.
+
+static void externalize(GlobalValue *GV) {
+ if (GV->hasLocalLinkage()) { // Local symbols become hidden externals.
+ GV->setLinkage(GlobalValue::ExternalLinkage);
+ GV->setVisibility(GlobalValue::HiddenVisibility);
+ }
+
+ // Unnamed entities must be named consistently between modules. setName will
+ // give a distinct name to each such entity.
+ if (!GV->hasName())
+ GV->setName("__llvmsplit_unnamed");
+}
+
+} // namespace
+
+/// Distributes the roots in FWDWorkList over N partitions.
+///
+/// Roots are taken in worklist order and each one, together with its
+/// dependencies, is added to the partition whose accumulated cost is
+/// currently the lowest.
+/// \returns one set of functions per partition (N sets in total).
+std::vector<DenseSet<const Function *>> SplitModuleCG::doPartitioning() {
+ LLVM_DEBUG(dbgs() << "\n--Partitioning Starts--\n");
+ // One (initially empty) function set per requested partition.
+ std::vector<DenseSet<const Function *>> Result(N);
+ if (N == 0)
+ return Result;
+
+ // Orders the queue by decreasing cost so the cheapest partition sits at the
+ // back. Among equally expensive partitions the larger ID ends up nearer the
+ // back and is therefore filled first; P0 is used last because it may have
+ // other stuff added later.
+ auto CostDescending = [](const std::pair<PartitionID, CostType> &LHS,
+ const std::pair<PartitionID, CostType> &RHS) {
+ if (LHS.second != RHS.second)
+ return LHS.second > RHS.second;
+ return LHS.first < RHS.first;
+ };
+
+ std::vector<std::pair<PartitionID, CostType>> Queue;
+ for (PartitionID ID = 0; ID != N; ++ID)
+ Queue.emplace_back(ID, 0);
+
+ // Places a root together with its dependencies into partition PID and
+ // refreshes that partition's entry in the queue.
+ const auto Place = [&](PartitionID PID, const FunctionWithDependencies &FWD) {
+ DenseSet<const Function *> &Members = Result[PID];
+ Members.insert(FWD.F);
+ Members.insert(FWD.Dependencies.begin(), FWD.Dependencies.end());
+
+ // Recompute the partition's cost from scratch. The scan runs backwards
+ // because in the common case the partition is at the end.
+ for (auto &[QueuePID, Cost] : reverse(Queue)) {
+ if (QueuePID != PID)
+ continue;
+ Cost = 0;
+ for (const Function *Fn : Members)
+ Cost += FuncsCosts.at(Fn);
+ }
+
+ sort(Queue, CostDescending);
+ };
+
+ // Greedy balancing: each root goes to the currently cheapest partition.
+ for (const FunctionWithDependencies &Root : FWDWorkList)
+ Place(Queue.back().first, Root);
+
+ return Result;
+}
+
+/// Fills FuncsCosts with the instruction count of every defined function in
+/// M and accumulates the sum of those counts in ModuleCost.
+void SplitModuleCG::calculateFunctionCosts() {
+ ModuleCost = 0;
+ for (Function &Def : M) {
+ if (Def.isDeclaration())
+ continue;
+ CostType NumInsts = 0;
+ for (const BasicBlock &Block : Def)
+ NumInsts += std::distance(Block.begin(), Block.end());
+ assert(NumInsts != 0);
+ // Record the per-function cost, then fold it into the module total.
+ FuncsCosts[&Def] = NumInsts;
+ assert((ModuleCost + NumInsts) >= ModuleCost && "Overflow!");
+ ModuleCost += NumInsts;
+ }
+}
+
+/// Post-processes partition \p MPart (index \p I, used for debug output):
+/// records renames for promoted symbols, erases unused globals for which
+/// \p NeedsConservativeImport holds and, for functions tracked in
+/// externalFunction, makes every definition after the first one seen
+/// available_externally.
+void SplitModuleCG::dealWithMpart(Module &MPart, unsigned I,
+ function_ref<bool(const GlobalValue *)> NeedsConservativeImport) {
+ // The rename suffix only depends on M's identifier: hash it just once.
+ MD5 Hash;
+ Hash.update(M.getModuleIdentifier());
+ MD5::MD5Result Result;
+ Hash.final(Result);
+ SmallString<32> HashStr;
+ MD5::stringifyResult(Result, HashStr);
+ const StringRef Suffix = HashStr.str().substr(0, 8);
+ // A symbol that is non-local now but not listed in OriginalExternals is
+ // given a new name (original name + "." + Suffix), recorded only once.
+ auto checkPromoted = [&](const GlobalValue &GV) {
+ if (GV.hasLocalLinkage() || OriginalExternals.contains(GV.getName()) ||
+ PromotedRenames.count(GV.getName()))
+ return;
+ PromotedRenames[GV.getName()] = (GV.getName() + "." + Suffix).str();
+ };
+
+ for (const auto &GV : MPart.global_values())
+ checkPromoted(GV);
+ // Clean-up conservatively imported GVs without any users.
+ for (auto &GV : make_early_inc_range(MPart.globals())) {
+ if (NeedsConservativeImport(&GV) && GV.use_empty())
+ GV.eraseFromParent();
+ }
+
+ for (auto &func : MPart.functions()) {
+ auto Fn = M.getFunction(func.getName());
+ if (externalFunction.count(Fn) && !func.isDeclaration()) {
+ if (!externalFunction[Fn]) {
+ func.setLinkage(GlobalValue::AvailableExternallyLinkage);
+ func.setComdat(nullptr);
+ } else {
+ externalFunction[Fn] = false;
+ }
+ }
+ }
+
+ LLVM_DEBUG(dbgs() << MPart.getModuleIdentifier() << " : \n");
+ for (auto &F : MPart) {
+ if (!F.isDeclaration())
+ LLVM_DEBUG(dbgs() << " [Function: ] " << I << " " << F.getName() << " "
+ << F.getLinkage() << "\n");
+ }
+}
+
+/// Builds FWDWorkList: one entry per root function, each with its transitive
+/// dependencies, sorted by decreasing total cost.
+void SplitModuleCG::createWorkList() {
+ // First, find all the entry functions with an in-degree of 0
+ // (i.e., those that are not called by any function).
+ for (auto &NodePair : *SCG) {
+ SimplifyCallGraphNode *SCGNode = NodePair.second.get();
+ Function *F = SCGNode->getFunction();
+ if (F && SCGNode->getNumReferences() == 0)
+ EntryFuncs.insert(F);
+ }
+
+ // Second, find all the dependencies of each entry function.
+ for (auto *F : EntryFuncs)
+ FWDWorkList.emplace_back(*SCG, FuncsCosts, F);
+
+ // Third, find all the functions that are not in the worklist.
+ DenseSet<const Function *> SeenFunctions;
+ for (const auto &FWD : FWDWorkList) {
+ SeenFunctions.insert(FWD.F);
+ SeenFunctions.insert(FWD.Dependencies.begin(), FWD.Dependencies.end());
+ }
+ for (auto &F : M) {
+ // This function may be part of a cycle and is therefore not a dependency
+ // of any root; it is treated as a root function here.
+ if (!F.isDeclaration() && !SeenFunctions.count(&F)) {
+ FWDWorkList.emplace_back(*SCG, FuncsCosts, &F);
+ auto &FWD = FWDWorkList.back();
+ EntryFuncs.insert(&F);
+ SeenFunctions.insert(FWD.F);
+ SeenFunctions.insert(FWD.Dependencies.begin(), FWD.Dependencies.end());
+ }
+ }
+
+ // Sort the worklist so the most expensive roots are seen first.
+ sort(FWDWorkList, [&](auto &A, auto &B) {
+ // Sort by total cost, and if the total cost is identical, sort
+ // alphabetically
+ if (A.TotalCost == B.TotalCost)
+ return A.F->getName() < B.F->getName();
+ return A.TotalCost > B.TotalCost;
+ });
+
+ LLVM_DEBUG(dbgs() << "Number of callgraphs to be allocated: "
+ << FWDWorkList.size() << " Module cost: "
+ << ModuleCost << "\n");
+ LLVM_DEBUG(dbgs() << "callgraphs: \n");
+ // Iterate by reference: a copy would duplicate each dependency set.
+ for (const auto &FWD : FWDWorkList) {
+ LLVM_DEBUG(dbgs() << "[root] " << FWD.F->getName() << " (totalCost:"
+ << FWD.TotalCost << "; root function cost: "
+ << FuncsCosts[FWD.F] << "; has dependency: "
+ << FWD.Dependencies.size() << "\n");
+ }
+}
+
void SplitModuleCG::SplitModule(ModuleCreationCallback ModuleCallback,
const llvm::lto::Config &C) {
- // TODO: 1. Process the linkage of the GlobalValue; 2. Allocate the callgraph
- // to N partitions; 3.Invoke the cloneModule API to copy the N partitions to
- // obtain MParts.
+ for (Function &F : M) {
+ if (F.hasLocalLinkage() && F.hasOneUse() && !F.hasAddressTaken())
+ continue;
+ externalize(&F);
+ if (!F.isDeclaration() &&
+ (F.hasExternalLinkage() || !F.isDefinitionExact()))
+ externalFunction[&F] = true;
+ }
+ for (GlobalVariable &GV : M.globals())
+ externalize(&GV);
+ for (GlobalAlias &GA : M.aliases())
+ externalize(&GA);
+ for (GlobalIFunc &GI : M.ifuncs())
+ externalize(&GI);
+ // TODO: Consider optimizing the alias, replacing the determined alias with
+ // the determined aliasee.
+
+ // Assign callgraphs into N partitions.
+ auto Partitions = doPartitioning();
+ assert(Partitions.size() == N);
+
+ // Local GVs are conservatively imported into every module as potential
+ // dependencies, and then cleaned up afterwards.
+ const auto NeedsConservativeImport = [&](const GlobalValue *GV) {
+ // Only private/internal global variables qualify: they are imported into
+ // every module and cleaned up afterwards.
+ const auto *Var = dyn_cast<GlobalVariable>(GV);
+ return Var && Var->hasLocalLinkage();
+ };
+
+ auto ShouldCloneDefinition = [&](unsigned I, const GlobalValue *GV) {
+ const auto &FnsInPart = Partitions[I];
+
+ // Functions go in their assigned partition.
+ if (const auto *newFn = dyn_cast<Function>(GV)) {
+ const auto *Fn = M.getFunction(newFn->getName());
+ return FnsInPart.contains(Fn);
+ }
+ if (NeedsConservativeImport(GV))
+ return true;
+ // Everything else goes in the first partition.
+ return I == 0;
+ };
+
+ // TODO: In the future, consider also running CloneModule in parallel to
+ // reduce compilation time.
+ std::vector<std::thread> Threads;
+ Threads.reserve(N);
+ std::vector<std::unique_ptr<Module>> MPartInCtxs;
+ MPartInCtxs.resize(N);
+ for (unsigned I = 0; I < N; ++I) {
+ ValueToValueMapTy VMap;
+ std::unique_ptr<Module> MPart(
+ CloneModule(M, VMap, [&](const GlobalValue *GV) {
+ return ShouldCloneDefinition(I, GV);
+ }));
+
+ dealWithMpart(*MPart, I, NeedsConservativeImport);
+
+ // CloneModule is not run on multiple threads here, so the partition is
+ // additionally round-tripped through bitcode into a new context for its
+ // worker thread, to avoid data races.
+ SmallString<0> BC;
+ raw_svector_ostream BCOS(BC);
+ WriteBitcodeToFile(*MPart, BCOS);
+ MPart.reset();
+ Threads.emplace_back([&, I](SmallString<0> BC) {
+ llvm::lto::LTOLLVMContext Ctx(C);
+ Expected<std::unique_ptr<Module>> MOrErr = parseBitcodeFile(
+ MemoryBufferRef(BC.str(), "ld-temp.o"), Ctx);
+ BC = SmallString<0>();
+ if (!MOrErr)
+ report_fatal_error("Failed to read bitcode");
+ ModuleCallback(std::move(MOrErr.get()), I);
+ }, std::move(BC));
+ }
+ for (auto &T : Threads)
+ T.join();
}
SplitModuleCG::SplitModuleCG(Module &M,
const ModuleSummaryIndex &CombinedIndex,
unsigned LimitPartition)
: M(M), CG(M), N(LimitPartition) {
- // TODO: The module is split based on the callgraph, and EntryFuncs stores
- // the root function of each callgraph.
+ // Track existing non-local symbols. This ensures that when we promote
+ // internal symbols to external for partitioning, we can handle renaming
+ // and avoid conflicts.
+ for (const auto &GV : M.global_values())
+ if (!GV.hasLocalLinkage())
+ OriginalExternals.insert(GV.getName());
+
+ calculateFunctionCosts();
+
+ // Construct a simplified call graph to facilitate worklist generation.
+ SCG = std::make_unique<SimplifyCallGraph>(CG, CombinedIndex, M);
+ // TODO: When the SCG is established, the special cases of comdat and
+ // initarray need to be considered.
+
+ // Populate the worklist with root functions and their transitive
+ // dependencies. This worklist serves as the foundation for the
+ // subsequent module partitioning.
+ createWorkList();
if (N == 0 || N > EntryFuncs.size()) {
N = EntryFuncs.size();
}
N = N == 0 ? 1 : N;
}
+
+void SimplifyCallGraph::createSimplifyCallGraph(
+ const ModuleSummaryIndex &CombinedIndex) {
+ for (auto &NodePair : CG) {
+ CallGraphNode *CGNode = NodePair.second.get();
+ Function *F = CGNode->getFunction();
+ if (!F || F->isDeclaration())
+ continue;
+
+ SimplifyCallGraphNode *SCGNode = getOrInsertFunction(F);
+
+ // TODO: Trace indirect call usage for the current function.
+
+ for (const auto &CGNodeItem : *CGNode) {
+ Function *Called = CGNodeItem.second->getFunction();
+ if (!Called) {
+ // TODO: Deal with indirect calls.
+ // 1. Check if the instruction has a callees metadata.
+ // 2. Check if this is an indirect call with profile data.
+ // 3. Check if this is an alias to a function.
+ }
+ if (!Called || Called->isDeclaration())
+ continue;
+ SCGNode->addCalledFunction(getOrInsertFunction(Called));
+ }
+ }
+
+ if (enablePrintSimplifyCallGraph)
+ print();
+}
+
+
+/// Dump each node and its callees to dbgs() when this debug type is enabled.
+void SimplifyCallGraph::print() {
+ LLVM_DEBUG({
+ for (auto &SCGItem : FunctionMap) {
+ dbgs() << "Call graph node for function: '" << SCGItem.first->getName()
+ << "' #uses=" << SCGItem.second->getNumReferences() << "\n";
+ for (const auto &callee : *SCGItem.second)
+ dbgs() << " Calls function : '" << callee->getFunction()->getName()
+ << "'\n";
+ }
+ });
+}
+
+/// Return the node for \p F, creating and caching it on first request.
+SimplifyCallGraphNode *
+SimplifyCallGraph::getOrInsertFunction(const Function *F) {
+ auto &Slot = FunctionMap[F];
+ // An empty slot means F has not been seen yet: build its node in place.
+ if (!Slot)
+ Slot = std::make_unique<SimplifyCallGraphNode>(this,
+ const_cast<Function *>(F));
+ return Slot.get();
+}
>From 88db8d4e7fbcadc73e1c48c23bb8781b2c21df4f Mon Sep 17 00:00:00 2001
From: maojiaping <maojiaping1 at huawei.com>
Date: Wed, 20 May 2026 15:57:13 +0800
Subject: [PATCH 3/7] [llvm-split][SplitModuleCG] Add support for SplitModuleCG
Add a new command line option --enable-split-module-CG to llvm-split
tool for testing the SplitModuleCG utility.
This change:
- Adds the --enable-split-module-CG flag.
- Wires up the SplitModuleCG interface in llvm-split.
---
.../SplitModuleCG/split-promoted-rename.ll | 41 +++++++++++++++++++
.../SplitModuleCG/function-with-ring.ll | 36 ++++++++++++++++
.../llvm-split/SplitModuleCG/function.ll | 35 ++++++++++++++++
.../llvm-split/SplitModuleCG/partition-cap.ll | 10 +++++
.../SplitModuleCG/single-partition.ll | 13 ++++++
.../tools/llvm-split/SplitModuleCG/unnamed.ll | 8 ++++
llvm/tools/llvm-split/llvm-split.cpp | 36 ++++++++++++++++
7 files changed, 179 insertions(+)
create mode 100644 llvm/test/Transforms/SplitModuleCG/split-promoted-rename.ll
create mode 100644 llvm/test/tools/llvm-split/SplitModuleCG/function-with-ring.ll
create mode 100644 llvm/test/tools/llvm-split/SplitModuleCG/function.ll
create mode 100644 llvm/test/tools/llvm-split/SplitModuleCG/partition-cap.ll
create mode 100644 llvm/test/tools/llvm-split/SplitModuleCG/single-partition.ll
create mode 100644 llvm/test/tools/llvm-split/SplitModuleCG/unnamed.ll
diff --git a/llvm/test/Transforms/SplitModuleCG/split-promoted-rename.ll b/llvm/test/Transforms/SplitModuleCG/split-promoted-rename.ll
new file mode 100644
index 0000000000000..6c51141a9ad85
--- /dev/null
+++ b/llvm/test/Transforms/SplitModuleCG/split-promoted-rename.ll
@@ -0,0 +1,41 @@
+; Test that internal symbols promoted during module splitting are consistently
+; renamed with an MD5 suffix across all partitions.
+;
+; RUN: opt -module-summary %s -o %t.bc
+; RUN: llvm-lto2 run %t.bc -o %t \
+; RUN: -thinlto-split=true \
+; RUN: -thinlto-split-partitions=2 -thinlto-split-module-size-threshold=0 \
+; RUN: -r=%t.bc,caller_a,px \
+; RUN: -r=%t.bc,caller_b,px
+; RUN: llvm-nm %t.1 | FileCheck %s
+
+; CHECK-DAG: T caller_a
+; CHECK-DAG: T caller_b
+; CHECK: T {{.*promoted_internal[._][0-9a-f]+.*}}
+; CHECK-NOT: T promoted_internal{{$}}
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; @promoted_internal is internal. SplitModuleCG::dealWithMpart's checkPromoted
+; records it in PromotedRenames. splitOptAndCodeGenThin applies the rename
+; after opt via:
+; for (auto &GV : MPart->global_values())
+; if (auto It = PromotedRenames.find(GV.getName()); ...)
+; GV.setName(It->second);
+define internal void @promoted_internal() {
+entry:
+ ret void
+}
+
+define void @caller_a() {
+entry:
+ call void @promoted_internal()
+ ret void
+}
+
+define void @caller_b() {
+entry:
+ call void @promoted_internal()
+ ret void
+}
diff --git a/llvm/test/tools/llvm-split/SplitModuleCG/function-with-ring.ll b/llvm/test/tools/llvm-split/SplitModuleCG/function-with-ring.ll
new file mode 100644
index 0000000000000..f2fc8c03c922a
--- /dev/null
+++ b/llvm/test/tools/llvm-split/SplitModuleCG/function-with-ring.ll
@@ -0,0 +1,36 @@
+; RUN: llvm-split -enable-split-module-CG=true -j2 -o %t %s
+; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s
+; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s
+
+; CHECK0-DAG: declare void @foo()
+; CHECK0-DAG: define void @bar()
+; CHECK0-DAG: declare void @call_foo()
+; CHECK0-DAG: define void @call_bar()
+
+; CHECK1-DAG: define void @foo()
+; CHECK1-DAG: declare void @bar()
+; CHECK1-DAG: define void @call_foo()
+; CHECK1-DAG: declare void @call_bar()
+
+define void @foo() {
+entry:
+ call void @call_foo()
+ ret void
+}
+
+define void @bar() {
+entry:
+ ret void
+}
+
+define void @call_foo() {
+entry:
+ call void @foo()
+ ret void
+}
+
+define void @call_bar() {
+entry:
+ call void @bar()
+ ret void
+}
diff --git a/llvm/test/tools/llvm-split/SplitModuleCG/function.ll b/llvm/test/tools/llvm-split/SplitModuleCG/function.ll
new file mode 100644
index 0000000000000..ddf5bb5c3dff3
--- /dev/null
+++ b/llvm/test/tools/llvm-split/SplitModuleCG/function.ll
@@ -0,0 +1,35 @@
+; RUN: llvm-split -enable-split-module-CG=true -j2 -o %t %s
+; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s
+; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s
+
+; CHECK0-DAG: declare dso_local void @foo()
+; CHECK0-DAG: define void @bar()
+; CHECK0-DAG: declare void @func_a()
+; CHECK0-DAG: define void @func_b()
+; CHECK1-DAG: define internal void @foo()
+; CHECK1-DAG: define available_externally void @bar()
+; CHECK1-DAG: define void @func_a()
+; CHECK1-DAG: declare void @func_b()
+
+define internal void @foo() {
+entry:
+ ret void
+}
+
+define void @bar() {
+entry:
+ ret void
+}
+
+define void @func_a() {
+entry:
+ call void @foo()
+ call void @bar()
+ ret void
+}
+
+define void @func_b() {
+entry:
+ call void @bar()
+ ret void
+}
diff --git a/llvm/test/tools/llvm-split/SplitModuleCG/partition-cap.ll b/llvm/test/tools/llvm-split/SplitModuleCG/partition-cap.ll
new file mode 100644
index 0000000000000..5c3ced3e682af
--- /dev/null
+++ b/llvm/test/tools/llvm-split/SplitModuleCG/partition-cap.ll
@@ -0,0 +1,10 @@
+; RUN: llvm-split -enable-split-module-CG=true -j10 -o %t %s
+; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s
+; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s
+; should only produce 2 output files (N capped to EntryFuncs.size()=2)
+
+; CHECK0: define void @foo()
+; CHECK1: define void @bar()
+
+define void @foo() { ret void }
+define void @bar() { ret void }
diff --git a/llvm/test/tools/llvm-split/SplitModuleCG/single-partition.ll b/llvm/test/tools/llvm-split/SplitModuleCG/single-partition.ll
new file mode 100644
index 0000000000000..fdfdf910a3498
--- /dev/null
+++ b/llvm/test/tools/llvm-split/SplitModuleCG/single-partition.ll
@@ -0,0 +1,13 @@
+; RUN: llvm-split -enable-split-module-CG=true -j1 -o %t %s
+; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s
+
+; CHECK0: define void @foo()
+; CHECK0: define void @bar()
+
+define void @foo() {
+ call void @bar()
+ ret void
+}
+define void @bar() {
+ ret void
+}
diff --git a/llvm/test/tools/llvm-split/SplitModuleCG/unnamed.ll b/llvm/test/tools/llvm-split/SplitModuleCG/unnamed.ll
new file mode 100644
index 0000000000000..73f7079669c55
--- /dev/null
+++ b/llvm/test/tools/llvm-split/SplitModuleCG/unnamed.ll
@@ -0,0 +1,8 @@
+; RUN: llvm-split -enable-split-module-CG=true -j2 -o %t %s
+; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s
+
+; CHECK0-DAG: define hidden void @__llvmsplit_unnamed()
+
+define internal void @0() {
+ ret void
+}
\ No newline at end of file
diff --git a/llvm/tools/llvm-split/llvm-split.cpp b/llvm/tools/llvm-split/llvm-split.cpp
index 4cc4fd945fc53..4156222855617 100644
--- a/llvm/tools/llvm-split/llvm-split.cpp
+++ b/llvm/tools/llvm-split/llvm-split.cpp
@@ -18,8 +18,10 @@
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/PassInstrumentation.h"
#include "llvm/IR/PassManager.h"
+#include "llvm/IR/ModuleSummaryIndex.h"
#include "llvm/IR/Verifier.h"
#include "llvm/IRReader/IRReader.h"
+#include "llvm/LTO/Config.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/FileSystem.h"
@@ -35,6 +37,7 @@
#include "llvm/Transforms/IPO/GlobalDCE.h"
#include "llvm/Transforms/Utils/SplitModule.h"
#include "llvm/Transforms/Utils/SplitModuleByCategory.h"
+#include "llvm/Transforms/Utils/SplitModuleCG.h"
using namespace llvm;
@@ -76,6 +79,10 @@ static cl::opt<std::string>
static cl::opt<std::string>
MCPU("mcpu", cl::desc("Target CPU, ignored if --mtriple is not used"),
cl::value_desc("cpu"), cl::cat(SplitCategory));
+
+static cl::opt<bool>
+ EnableSplitModuleCG("enable-split-module-CG", cl::Prefix, cl::init(false),
+ cl::desc("Split module using call graph"), cl::cat(SplitCategory));
enum class SplitByCategoryType {
SBCT_ByAttribute,
@@ -327,6 +334,35 @@ int main(int argc, char **argv) {
"splitModule implementation\n";
}
+ if (EnableSplitModuleCG) {
+ const auto HandleModulePartCG = [&](std::unique_ptr<Module> MPart, unsigned I) {
+ std::error_code EC;
+ std::unique_ptr<ToolOutputFile> Out(
+ new ToolOutputFile(OutputFilename + utostr(I), EC, sys::fs::OF_None));
+ if (EC) {
+ errs() << EC.message() << '\n';
+ exit(1);
+ }
+
+ if (verifyModule(*MPart, &errs())) {
+ errs() << "Broken module!\n";
+ exit(1);
+ }
+
+ WriteBitcodeToFile(*MPart, Out->os());
+
+ // Declare success.
+ Out->keep();
+ };
+
+ llvm::lto::Config Config;
+ ModuleSummaryIndex CombinedIndex(false);
+ SplitModuleCG SplitModuleCG(*M, CombinedIndex, NumOutputs);
+ SplitModuleCG.SplitModule(HandleModulePartCG, Config);
+ return 0;
+ }
+
SplitModule(*M, NumOutputs, HandleModulePart, PreserveLocals, RoundRobin);
return 0;
}
+
>From 065dd31ca935fdb3b36a6272f2b4095e64195fd2 Mon Sep 17 00:00:00 2001
From: maojiaping <maojiaping1 at huawei.com>
Date: Fri, 29 May 2026 14:56:19 +0800
Subject: [PATCH 4/7] [Thinlto][SplitModuleCG] Fix Windows compile of closeFile
Remove the unused 'MergedFD' by calling the sys::fs::createTemporaryFile overload that takes no "ResultFD", which also drops the closeFile call.
---
llvm/lib/LTO/LTOBackend.cpp | 5 +----
1 file changed, 1 insertion(+), 4 deletions(-)
diff --git a/llvm/lib/LTO/LTOBackend.cpp b/llvm/lib/LTO/LTOBackend.cpp
index aa1213e5e6af1..ef78f1fa8ac3a 100644
--- a/llvm/lib/LTO/LTOBackend.cpp
+++ b/llvm/lib/LTO/LTOBackend.cpp
@@ -705,13 +705,10 @@ static bool splitOptAndCodeGenThin(unsigned task, const Config &C,
if (!FinalStream)
report_fatal_error("Failed to open final output stream");
- int MergedFD;
SmallString<128> MergedFilename;
- if (sys::fs::createTemporaryFile("thinlto-merged", "o", MergedFD,
- MergedFilename))
+ if (sys::fs::createTemporaryFile("thinlto-merged", "o", MergedFilename))
report_fatal_error("Failed to create merged temp file.");
llvm::FileRemover MergedFileRemover(MergedFilename);
- sys::fs::closeFile(MergedFD);
std::vector<StringRef> Args;
std::string LinkerPath = "";
>From 5334e7269d4c9fd00c5a3e31c213b84feeae480d Mon Sep 17 00:00:00 2001
From: JiangNing <jiangninghx at foxmail.com>
Date: Sat, 23 May 2026 17:30:55 +0800
Subject: [PATCH 5/7] [ThinLTO][Driver] Move split-codegen linker invocation
out of LTOBackend
This patch fixes a layering violation by moving the partition merging
step (`ld -r`) out of `LTOBackend` and into the Clang Driver.
- `BackendUtil` now outputs an `@rsp` file containing the split partitions.
- `ThinLTOMergeJobAction` is added to the Driver to invoke `ld.lld -r` using the `@rsp` file.
- `AcceptsMultipleOutputsPerTask` is added to `lto::Config` to protect unaware LTO clients from data races.
---
.../clang/Basic/DiagnosticDriverKinds.td | 2 +
clang/include/clang/Driver/Action.h | 14 +-
clang/include/clang/Driver/CommonArgs.h | 13 +
clang/include/clang/Driver/Job.h | 26 ++
clang/include/clang/Driver/Tool.h | 1 +
clang/include/clang/Driver/ToolChain.h | 2 +
.../include/clang/Frontend/FrontendOptions.h | 3 +
clang/include/clang/Options/Options.td | 3 +
clang/lib/CodeGen/BackendUtil.cpp | 108 +++++++-
clang/lib/Driver/Action.cpp | 7 +
clang/lib/Driver/Driver.cpp | 24 +-
clang/lib/Driver/Job.cpp | 44 +++
clang/lib/Driver/ToolChain.cpp | 15 ++
clang/lib/Driver/ToolChains/Clang.cpp | 11 +
clang/lib/Driver/ToolChains/CommonArgs.cpp | 32 +++
clang/lib/Driver/ToolChains/Gnu.cpp | 41 +++
clang/lib/Driver/ToolChains/Gnu.h | 1 +
.../thinlto-split/split-output-list-dwo.ll | 169 ++++++++++++
.../thinlto-split/split-output-list.ll | 75 ++++++
.../thinlto-split}/split-promoted-rename.ll | 24 +-
.../thinlto-split-merge-realistic.ll | 251 ++++++++++++++++++
clang/test/Driver/thinlto-split-merge.c | 64 +++++
lld/ELF/LTO.cpp | 18 +-
llvm/include/llvm/LTO/Config.h | 17 ++
llvm/include/llvm/LTO/LTO.h | 5 +
llvm/include/llvm/LTO/LTOBackend.h | 8 +
llvm/lib/LTO/LTO.cpp | 81 +++++-
llvm/lib/LTO/LTOBackend.cpp | 219 ++++++---------
llvm/lib/Transforms/Utils/SplitModuleCG.cpp | 6 +
29 files changed, 1124 insertions(+), 160 deletions(-)
create mode 100644 clang/test/CodeGen/thinlto-split/split-output-list-dwo.ll
create mode 100644 clang/test/CodeGen/thinlto-split/split-output-list.ll
rename {llvm/test/Transforms/SplitModuleCG => clang/test/CodeGen/thinlto-split}/split-promoted-rename.ll (58%)
create mode 100644 clang/test/CodeGen/thinlto-split/thinlto-split-merge-realistic.ll
create mode 100644 clang/test/Driver/thinlto-split-merge.c
diff --git a/clang/include/clang/Basic/DiagnosticDriverKinds.td b/clang/include/clang/Basic/DiagnosticDriverKinds.td
index 020014dabacfd..1d2f7e5832a01 100644
--- a/clang/include/clang/Basic/DiagnosticDriverKinds.td
+++ b/clang/include/clang/Basic/DiagnosticDriverKinds.td
@@ -181,6 +181,8 @@ def warn_openmp_spec_incomplete : Warning<
InGroup<ExperimentalOption>;
def err_drv_invalid_thread_model_for_target : Error<
"invalid thread model '%0' in '%1' for this target">;
+def err_drv_lto_split_requires_lld : Error<
+ "cannot find 'ld.lld' required for ThinLTO split codegen at '%0'">;
def err_drv_invalid_linker_name : Error<
"invalid linker name in argument '%0'">;
def err_drv_invalid_rtlib_name : Error<
diff --git a/clang/include/clang/Driver/Action.h b/clang/include/clang/Driver/Action.h
index 67937b00f6bcf..b4e789bf15102 100644
--- a/clang/include/clang/Driver/Action.h
+++ b/clang/include/clang/Driver/Action.h
@@ -77,9 +77,10 @@ class Action {
BinaryAnalyzeJobClass,
BinaryTranslatorJobClass,
ObjcopyJobClass,
+ ThinLTOMergeJobClass,
JobClassFirst = PreprocessJobClass,
- JobClassLast = ObjcopyJobClass
+ JobClassLast = ThinLTOMergeJobClass
};
// The offloading kind determines if this action is binded to a particular
@@ -519,6 +520,17 @@ class LinkJobAction : public JobAction {
}
};
+class ThinLTOMergeJobAction : public JobAction {
+ void anchor() override;
+
+public:
+ ThinLTOMergeJobAction(ActionList &Inputs, types::ID Type);
+
+ static bool classof(const Action *A) {
+ return A->getKind() == ThinLTOMergeJobClass;
+ }
+};
+
class LipoJobAction : public JobAction {
void anchor() override;
diff --git a/clang/include/clang/Driver/CommonArgs.h b/clang/include/clang/Driver/CommonArgs.h
index 0af1b89425227..e69a54b54eefe 100644
--- a/clang/include/clang/Driver/CommonArgs.h
+++ b/clang/include/clang/Driver/CommonArgs.h
@@ -158,6 +158,19 @@ bool isObjCAutoRefCount(const llvm::opt::ArgList &Args);
llvm::StringRef getLTOParallelism(const llvm::opt::ArgList &Args,
const Driver &D);
+bool isThinLTOSplitEnabled(const llvm::opt::ArgList &Args);
+
+/// Response-file path listing the partition objects for cc1 output \p Output.
+/// Written by cc1 (-thinlto-split-output-list) and read by ThinLTOMergeJobAction
+/// (`ld.lld -r @<file>`); shared so both agree on the name.
+std::string getThinLTOSplitResponseFile(llvm::StringRef Output);
+
+/// Single gating predicate (shared by the cc1 flag and the merge action) for
+/// whether the driver splits a distributed ThinLTO compile and merges it with
+/// `ld.lld -r`.
+bool isThinLTOSplitMergeEnabled(const ToolChain &TC,
+ const llvm::opt::ArgList &Args);
+
bool areOptimizationsEnabled(const llvm::opt::ArgList &Args);
bool isUseSeparateSections(const llvm::Triple &Triple);
diff --git a/clang/include/clang/Driver/Job.h b/clang/include/clang/Driver/Job.h
index 116254f79ae6f..b8e550fc19968 100644
--- a/clang/include/clang/Driver/Job.h
+++ b/clang/include/clang/Driver/Job.h
@@ -264,6 +264,32 @@ class CC1Command : public Command {
void setEnvironment(llvm::ArrayRef<const char *> NewEnvironment) override;
};
+/// Merges the per-partition objects from ThinLTO split codegen into one
+/// relocatable object (`ld.lld -r`). The partition count is only known at
+/// codegen time, so after the merge this reads the response file to remove
+/// them (unless -save-temps); the response file is a normal Compilation temp.
+class ThinLTOMergeCommand : public Command {
+ /// Response file listing the partition objects to merge.
+ std::string SplitOutputList;
+
+ /// Whether to remove the partition objects after a successful merge (false
+ /// under -save-temps).
+ bool CleanupSplitOutputs;
+
+ void cleanupSplitOutputs() const;
+
+public:
+ ThinLTOMergeCommand(const Action &Source, const Tool &Creator,
+ ResponseFileSupport ResponseSupport,
+ const char *Executable,
+ const llvm::opt::ArgStringList &Arguments,
+ ArrayRef<InputInfo> Inputs, ArrayRef<InputInfo> Outputs,
+ StringRef SplitOutputList, bool CleanupSplitOutputs);
+
+ int Execute(ArrayRef<std::optional<StringRef>> Redirects, std::string *ErrMsg,
+ bool *ExecutionFailed) const override;
+};
+
/// JobList - A sequence of jobs to perform.
class JobList {
public:
diff --git a/clang/include/clang/Driver/Tool.h b/clang/include/clang/Driver/Tool.h
index 42cf99a4a9703..e09583c13f42d 100644
--- a/clang/include/clang/Driver/Tool.h
+++ b/clang/include/clang/Driver/Tool.h
@@ -56,6 +56,7 @@ class Tool {
virtual bool canEmitIR() const { return false; }
virtual bool hasIntegratedCPP() const = 0;
virtual bool isLinkJob() const { return false; }
+ virtual bool canConstructThinLTOMergeJob() const { return false; }
virtual bool isDsymutilJob() const { return false; }
/// Does this tool have "good" standardized diagnostics, or should the
diff --git a/clang/include/clang/Driver/ToolChain.h b/clang/include/clang/Driver/ToolChain.h
index 684ef52d8532b..048ae490f05e9 100644
--- a/clang/include/clang/Driver/ToolChain.h
+++ b/clang/include/clang/Driver/ToolChain.h
@@ -429,6 +429,8 @@ class ToolChain {
/// a compiler other than Clang.
virtual Tool *SelectTool(const JobAction &JA) const;
+ bool canConstructThinLTOMergeJob() const;
+
// Helper methods
std::string GetFilePath(const char *Name) const;
diff --git a/clang/include/clang/Frontend/FrontendOptions.h b/clang/include/clang/Frontend/FrontendOptions.h
index 2f75fba566dfb..4fb2c41bedb1e 100644
--- a/clang/include/clang/Frontend/FrontendOptions.h
+++ b/clang/include/clang/Frontend/FrontendOptions.h
@@ -445,6 +445,9 @@ class FrontendOptions {
/// The output file, if any.
std::string OutputFile;
+ /// Response file listing objects emitted by ThinLTO split codegen.
+ std::string ThinLTOSplitOutputList;
+
/// If given, the new suffix for fix-it rewritten files.
std::string FixItSuffix;
diff --git a/clang/include/clang/Options/Options.td b/clang/include/clang/Options/Options.td
index 6fc8806ba683c..88250c5944954 100644
--- a/clang/include/clang/Options/Options.td
+++ b/clang/include/clang/Options/Options.td
@@ -3413,6 +3413,9 @@ def flto_jobs_EQ : Joined<["-"], "flto-jobs=">,
def fthinlto_index_EQ : Joined<["-"], "fthinlto-index=">,
Visibility<[ClangOption, CLOption, CC1Option]>, Group<f_Group>,
HelpText<"Perform ThinLTO importing using provided function summary index">;
+def thinlto_split_output_list_EQ : Joined<["-"], "thinlto-split-output-list=">,
+ Visibility<[CC1Option]>, Flags<[HelpHidden]>,
+ MarshallingInfoString<FrontendOpts<"ThinLTOSplitOutputList">>;
def fthin_link_bitcode_EQ : Joined<["-"], "fthin-link-bitcode=">,
Visibility<[ClangOption, CLOption, CC1Option]>, Group<f_Group>,
HelpText<"Write minimized bitcode to <file> for the ThinLTO thin link only">,
diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp
index a46a25c4492f2..e198f08882804 100644
--- a/clang/lib/CodeGen/BackendUtil.cpp
+++ b/clang/lib/CodeGen/BackendUtil.cpp
@@ -17,6 +17,7 @@
#include "clang/Frontend/Utils.h"
#include "clang/Lex/HeaderSearchOptions.h"
#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/RuntimeLibcallInfo.h"
@@ -54,6 +55,7 @@
#include "llvm/Support/TimeProfiler.h"
#include "llvm/Support/Timer.h"
#include "llvm/Support/ToolOutputFile.h"
+#include "llvm/Support/Path.h"
#include "llvm/Support/VirtualFileSystem.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
@@ -92,7 +94,9 @@
#include "llvm/Transforms/Utils/Debugify.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"
#include <limits>
+#include <map>
#include <memory>
+#include <mutex>
#include <optional>
using namespace clang;
using namespace llvm;
@@ -1307,6 +1311,29 @@ void EmitAssemblyHelper::emitAssembly(BackendAction Action,
DwoOS->keep();
}
+static std::string getThinLTOSplitOutputFile(const FrontendOptions &Opts,
+ size_t Task) {
+ return (Twine(Opts.OutputFile) + ".thinlto-split." + Twine(Task) + ".o")
+ .str();
+}
+
+static bool writeThinLTOSplitOutputList(DiagnosticsEngine &Diags,
+ StringRef OutputList,
+ ArrayRef<std::string> Outputs) {
+ std::error_code EC;
+ raw_fd_ostream OS(OutputList, EC, sys::fs::OF_Text);
+ if (EC) {
+ Diags.Report(diag::err_fe_unable_to_open_output)
+ << OutputList << EC.message();
+ return false;
+ }
+ for (StringRef Output : Outputs) {
+ sys::printArg(OS, Output, /*Quote=*/true);
+ OS << '\n';
+ }
+ return true;
+}
+
static void
runThinLTOBackend(CompilerInstance &CI, ModuleSummaryIndex *CombinedIndex,
llvm::Module *M, std::unique_ptr<raw_pwrite_stream> OS,
@@ -1329,11 +1356,35 @@ runThinLTOBackend(CompilerInstance &CI, ModuleSummaryIndex *CombinedIndex,
if (!lto::initImportList(*M, *CombinedIndex, ImportList))
return;
- auto AddStream = [&](size_t Task, const Twine &ModuleName) {
+ const std::string &SplitOutputList =
+ CI.getFrontendOpts().ThinLTOSplitOutputList;
+ std::map<size_t, std::string> SplitOutputMap;
+ std::mutex SplitOutputFilesMutex;
+
+ auto AddStream = [&](size_t Task, const Twine &/*ModuleName*/)
+ -> Expected<std::unique_ptr<CachedFileStream>> {
+ if (!SplitOutputList.empty()) {
+ std::unique_ptr<raw_pwrite_stream> OutputOS;
+ std::string OutputPath =
+ getThinLTOSplitOutputFile(CI.getFrontendOpts(), Task);
+ {
+ std::lock_guard<std::mutex> Lock(SplitOutputFilesMutex);
+ SplitOutputMap[Task] = OutputPath;
+ }
+
+ std::error_code EC;
+ OutputOS =
+ std::make_unique<raw_fd_ostream>(OutputPath, EC, sys::fs::OF_None);
+ if (EC)
+ return errorCodeToError(EC);
+ return std::make_unique<CachedFileStream>(std::move(OutputOS),
+ OutputPath);
+ }
return std::make_unique<CachedFileStream>(std::move(OS),
CGOpts.ObjectFilenameForDebug);
};
lto::Config Conf;
+ Conf.AcceptsMultipleOutputsPerTask = !SplitOutputList.empty();
if (CGOpts.SaveTempsFilePrefix != "") {
if (Error E = Conf.addSaveTemps(CGOpts.SaveTempsFilePrefix + ".",
/* UseInputModulePath */ false)) {
@@ -1384,6 +1435,16 @@ runThinLTOBackend(CompilerInstance &CI, ModuleSummaryIndex *CombinedIndex,
Conf.RemarksFormat = CGOpts.OptRecordFormat;
Conf.SplitDwarfFile = CGOpts.SplitDwarfFile;
Conf.SplitDwarfOutput = CGOpts.SplitDwarfOutput;
+ // Split partitions need distinct .dwo files for both split and single
+ // fission modes.
+ if (!SplitOutputList.empty() && !CGOpts.SplitDwarfFile.empty()) {
+ SmallString<128> DwoStem(CGOpts.SplitDwarfOutput.empty()
+ ? CGOpts.SplitDwarfFile
+ : CGOpts.SplitDwarfOutput);
+ if (llvm::sys::path::extension(DwoStem) == ".dwo")
+ llvm::sys::path::replace_extension(DwoStem, "");
+ Conf.SplitDwarfOutputStem = std::string(DwoStem);
+ }
for (auto &Plugin : CI.getPassPlugins())
Conf.LoadedPassPlugins.push_back(Plugin.get());
switch (Action) {
@@ -1423,6 +1484,14 @@ runThinLTOBackend(CompilerInstance &CI, ModuleSummaryIndex *CombinedIndex,
errs() << "Error running ThinLTO backend: " << EIB.message() << '\n';
});
}
+
+ if (!SplitOutputList.empty()) {
+ SmallVector<std::string, 0> Outputs;
+ for (const auto &Pair : SplitOutputMap)
+ Outputs.push_back(Pair.second);
+ if (!writeThinLTOSplitOutputList(Diags, SplitOutputList, Outputs))
+ return;
+ }
}
void clang::emitBackendOutput(CompilerInstance &CI, CodeGenOptions &CGOpts,
@@ -1434,6 +1503,14 @@ void clang::emitBackendOutput(CompilerInstance &CI, CodeGenOptions &CGOpts,
llvm::TimeTraceScope TimeScope("Backend");
DiagnosticsEngine &Diags = CI.getDiagnostics();
+ // When split codegen is active, cc1's -o stream should never receive
+ // partition content. The driver owns the final -o via ld.lld -r, and each
+ // partition is written to its own .thinlto-split.N.o file by the AddStream
+ // callback in runThinLTOBackend. Replace the original OS with a null stream
+ // to avoid opening or writing the user-specified -o path at all.
+ if (!CI.getFrontendOpts().ThinLTOSplitOutputList.empty())
+ OS = std::make_unique<raw_null_ostream>();
+
std::unique_ptr<llvm::Module> EmptyModule;
if (!CGOpts.ThinLTOIndexFile.empty()) {
// FIXME(sandboxing): Figure out how to support distributed indexing.
@@ -1474,9 +1551,38 @@ void clang::emitBackendOutput(CompilerInstance &CI, CodeGenOptions &CGOpts,
}
}
+ // When split codegen is active, the skip/fallback path must emit its object
+ // to the partition-0 filename instead of the original -o. The original OS
+ // was replaced with a null stream above.
+ if (!CI.getFrontendOpts().ThinLTOSplitOutputList.empty()) {
+ std::string FallbackOutputPath =
+ getThinLTOSplitOutputFile(CI.getFrontendOpts(), 0);
+ std::error_code EC;
+ auto FallbackOS = std::make_unique<raw_fd_ostream>(FallbackOutputPath, EC,
+ sys::fs::OF_None);
+ if (EC) {
+ Diags.Report(diag::err_fe_unable_to_open_output)
+ << FallbackOutputPath << EC.message();
+ return;
+ }
+ OS = std::move(FallbackOS);
+ }
+
EmitAssemblyHelper AsmHelper(CI, CGOpts, M, VFS);
AsmHelper.emitAssembly(Action, std::move(OS), BC);
+ if (!CI.getFrontendOpts().ThinLTOSplitOutputList.empty()) {
+ // If distributed ThinLTO indexing skips this backend, runThinLTOBackend is
+ // bypassed. Keep the driver merge action valid by listing the object
+ // emitted above. The real split path writes only partition objects.
+ std::string FallbackOutputPath =
+ getThinLTOSplitOutputFile(CI.getFrontendOpts(), 0);
+ if (!writeThinLTOSplitOutputList(
+ Diags, CI.getFrontendOpts().ThinLTOSplitOutputList,
+ ArrayRef<std::string>(&FallbackOutputPath, 1)))
+ return;
+ }
+
// Verify clang's TargetInfo DataLayout against the LLVM TargetMachine's
// DataLayout.
if (AsmHelper.TM) {
diff --git a/clang/lib/Driver/Action.cpp b/clang/lib/Driver/Action.cpp
index 72a42a6f957ee..0d036c34f2be4 100644
--- a/clang/lib/Driver/Action.cpp
+++ b/clang/lib/Driver/Action.cpp
@@ -54,6 +54,8 @@ const char *Action::getClassName(ActionClass AC) {
return "binary-translator";
case ObjcopyJobClass:
return "objcopy";
+ case ThinLTOMergeJobClass:
+ return "thinlto-merger";
}
llvm_unreachable("invalid class");
@@ -402,6 +404,11 @@ void LinkJobAction::anchor() {}
LinkJobAction::LinkJobAction(ActionList &Inputs, types::ID Type)
: JobAction(LinkJobClass, Inputs, Type) {}
+void ThinLTOMergeJobAction::anchor() {}
+
+ThinLTOMergeJobAction::ThinLTOMergeJobAction(ActionList &Inputs, types::ID Type)
+ : JobAction(ThinLTOMergeJobClass, Inputs, Type) {}
+
void LipoJobAction::anchor() {}
LipoJobAction::LipoJobAction(ActionList &Inputs, types::ID Type)
diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index 4a968a4ce5cc0..139f508cfc1a9 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -59,6 +59,7 @@
#include "clang/Basic/Version.h"
#include "clang/Config/config.h"
#include "clang/Driver/Action.h"
+#include "clang/Driver/CommonArgs.h"
#include "clang/Driver/Compilation.h"
#include "clang/Driver/InputInfo.h"
#include "clang/Driver/Job.h"
@@ -4623,8 +4624,18 @@ void Driver::BuildActions(Compilation &C, DerivedArgList &Args,
}
// If we ended with something, add to the output list.
- if (Current)
+ if (Current) {
+ // ThinLTO split codegen has cc1 emit one object per partition; append a
+ // merge action (`ld.lld -r`) to recombine them. Gating must match the cc1
+ // side in Clang::ConstructJob; both use isThinLTOSplitMergeEnabled.
+ if (Current->getType() == types::TY_Object &&
+ tools::isThinLTOSplitMergeEnabled(C.getDefaultToolChain(), Args)) {
+ ActionList Inputs;
+ Inputs.push_back(Current);
+ Current = C.MakeAction<ThinLTOMergeJobAction>(Inputs, types::TY_Object);
+ }
Actions.push_back(Current);
+ }
// Add any top level actions generated for offloading.
if (!UseNewOffloadingDriver)
@@ -5721,7 +5732,16 @@ class ToolSelector final {
/// Return true if an assemble action can be collapsed.
bool canCollapseAssembleAction() const {
- return TC.useIntegratedAs() && !SaveTemps &&
+ // ThinLTO split codegen requires multiple native object outputs per task
+ // (AcceptsMultipleOutputsPerTask), which the assembly emission path (-S)
+ // cannot provide. When -save-temps would normally prevent collapsing the
+ // assemble step, still collapse it for the ELF ThinLTO split backend case
+ // so cc1 emits objects directly rather than going through assembly.
+ bool SaveTempsBlock = SaveTemps &&
+ !(C.getArgs().hasArg(options::OPT_fthinlto_index_EQ) &&
+ TC.getTriple().isOSBinFormatELF() &&
+ tools::isThinLTOSplitEnabled(C.getArgs()));
+ return TC.useIntegratedAs() && !SaveTempsBlock &&
!C.getArgs().hasArg(options::OPT_via_file_asm) &&
!C.getArgs().hasArg(options::OPT__SLASH_FA) &&
!C.getArgs().hasArg(options::OPT__SLASH_Fa) &&
diff --git a/clang/lib/Driver/Job.cpp b/clang/lib/Driver/Job.cpp
index da7a1f2e07e90..7af950764054b 100644
--- a/clang/lib/Driver/Job.cpp
+++ b/clang/lib/Driver/Job.cpp
@@ -19,9 +19,12 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSet.h"
#include "llvm/ADT/StringSwitch.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/CrashRecoveryContext.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/IOSandbox.h"
+#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/PrettyStackTrace.h"
#include "llvm/Support/Program.h"
@@ -452,6 +455,47 @@ void CC1Command::setEnvironment(llvm::ArrayRef<const char *> NewEnvironment) {
"The CC1Command doesn't support changing the environment vars!");
}
+// Builds the underlying Command from the forwarded arguments and records two
+// extra pieces of state: the path of the split-output response file
+// (SplitOutputList) and whether the files it lists should be removed after a
+// successful Execute() (CleanupSplitOutputs).
+ThinLTOMergeCommand::ThinLTOMergeCommand(
+ const Action &Source, const Tool &Creator,
+ ResponseFileSupport ResponseSupport, const char *Executable,
+ const llvm::opt::ArgStringList &Arguments, ArrayRef<InputInfo> Inputs,
+ ArrayRef<InputInfo> Outputs, StringRef SplitOutputList,
+ bool CleanupSplitOutputs)
+ : Command(Source, Creator, ResponseSupport, Executable, Arguments, Inputs,
+ Outputs),
+ SplitOutputList(SplitOutputList),
+ CleanupSplitOutputs(CleanupSplitOutputs) {}
+
+// Deletes every file named in the SplitOutputList response file. This is
+// best-effort: an unreadable list or a failed removal is not reported.
+void ThinLTOMergeCommand::cleanupSplitOutputs() const {
+ // Remove the partition objects listed in the response file. Per-partition
+ // .dwo files (split DWARF) are deliberately kept: they are final debug output
+ // referenced by the merged object's skeleton CUs.
+ llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> MBOrErr =
+ llvm::MemoryBuffer::getFile(SplitOutputList, /*IsText=*/true,
+ /*RequiresNullTerminator=*/false);
+ // If the list cannot be read, return without removing anything and without
+ // emitting a diagnostic.
+ if (!MBOrErr)
+ return;
+
+ // Split the list with GNU command-line tokenization rules. The tokens are
+ // stored through Saver, which is backed by Alloc for the rest of this
+ // function.
+ llvm::BumpPtrAllocator Alloc;
+ llvm::StringSaver Saver(Alloc);
+ SmallVector<const char *, 16> OutputFiles;
+ llvm::cl::TokenizeGNUCommandLine((*MBOrErr)->getBuffer(), Saver, OutputFiles);
+ // The result of each remove() call is not checked.
+ for (const char *OutputFile : OutputFiles)
+ llvm::sys::fs::remove(OutputFile);
+}
+
+// Runs the merge command through Command::Execute and returns its result
+// unchanged. The partition objects are removed afterwards only when cleanup
+// was requested, the command reported exit code 0, and the caller's
+// ExecutionFailed flag (if one was supplied) is not set. On any failure the
+// inputs stay on disk so the failing `ld.lld -r` can be re-run or inspected.
+int ThinLTOMergeCommand::Execute(ArrayRef<std::optional<StringRef>> Redirects,
+                                 std::string *ErrMsg,
+                                 bool *ExecutionFailed) const {
+  const int ExitCode = Command::Execute(Redirects, ErrMsg, ExecutionFailed);
+
+  if (!CleanupSplitOutputs)
+    return ExitCode;
+  // A set ExecutionFailed flag means the process was never launched properly.
+  if (ExecutionFailed && *ExecutionFailed)
+    return ExitCode;
+  if (ExitCode != 0)
+    return ExitCode;
+
+  cleanupSplitOutputs();
+  return ExitCode;
+}
+
void JobList::Print(raw_ostream &OS, const char *Terminator, bool Quote,
CrashReportInfo *CrashInfo) const {
for (const auto &Job : *this)
diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp
index 511eb3757456b..583944b193b11 100644
--- a/clang/lib/Driver/ToolChain.cpp
+++ b/clang/lib/Driver/ToolChain.cpp
@@ -777,6 +777,7 @@ Tool *ToolChain::getTool(Action::ActionClass AC) const {
case Action::BinaryAnalyzeJobClass:
case Action::BinaryTranslatorJobClass:
case Action::ObjcopyJobClass:
+ case Action::ThinLTOMergeJobClass:
llvm_unreachable("Invalid tool kind.");
case Action::CompileJobClass:
@@ -1222,9 +1223,23 @@ Tool *ToolChain::SelectTool(const JobAction &JA) const {
if (AC == Action::AssembleJobClass && useIntegratedAs() &&
!getTriple().isOSAIX())
return getClangAs();
+ // ThinLTOMergeJobAction is only generated for ELF targets (see
+ // Driver::BuildActions). Assert here to catch accidental routing to
+ // non-ELF toolchains that do not implement the merge logic.
+ if (AC == Action::ThinLTOMergeJobClass) {
+ assert(getTriple().isOSBinFormatELF() &&
+ "ThinLTOMergeJobAction should only be generated for ELF targets");
+ return getLink();
+ }
return getTool(AC);
}
+// Reports whether the tool this toolchain selects for a link job says it can
+// construct the ThinLTO merge job. The LinkJobAction (no inputs, TY_Image) is
+// used only as the argument to SelectTool(); the answer comes from the
+// selected tool's own canConstructThinLTOMergeJob().
+bool ToolChain::canConstructThinLTOMergeJob() const {
+ ActionList Inputs;
+ LinkJobAction JA(Inputs, types::TY_Image);
+ return SelectTool(JA)->canConstructThinLTOMergeJob();
+}
+
std::string ToolChain::GetFilePath(const char *Name) const {
return D.GetFilePath(Name, *this);
}
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 05e1f6db80a11..a10cce907ce47 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -5399,6 +5399,17 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
if (!types::isLLVMIR(Input.getType()))
D.Diag(diag::err_drv_arg_requires_bitcode_input) << A->getAsString(Args);
Args.AddLastArg(CmdArgs, options::OPT_fthinlto_index_EQ);
+
+ // For ThinLTO split codegen, have cc1 write each call-graph partition to a
+ // separate object listed in a response file; the driver's
+ // ThinLTOMergeJobAction merges them with `ld.lld -r`. Gate on the default
+ // toolchain (as Driver::BuildActions does) so this flag and the merge action
+ // stay in lockstep. See isThinLTOSplitMergeEnabled.
+ if (Output.isFilename() && Output.getType() == types::TY_Object &&
+ isThinLTOSplitMergeEnabled(C.getDefaultToolChain(), Args))
+ CmdArgs.push_back(Args.MakeArgString(
+ Twine("-thinlto-split-output-list=") +
+ getThinLTOSplitResponseFile(Output.getFilename())));
}
if (Triple.isPPC())
diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp
index 6f0ac7d5159c1..e52d8212fd496 100644
--- a/clang/lib/Driver/ToolChains/CommonArgs.cpp
+++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp
@@ -941,6 +941,38 @@ llvm::StringRef tools::getLTOParallelism(const ArgList &Args, const Driver &D) {
return LtoJobsArg->getValue();
}
+/// Returns true when the `thinlto-split` backend option, passed via `-mllvm`,
+/// is enabled. The last occurrence wins; the option is off when absent.
+///
+/// The driver has to reach the same verdict as the option parser that cc1
+/// later runs on the same `-mllvm` values, otherwise cc1 could split while the
+/// driver emits no output list or merge action. Hence the option is accepted
+/// with either a `-` or `--` prefix and with every boolean spelling the
+/// cl::opt<bool> parser accepts (empty, true/TRUE/True/1, false/FALSE/False/0).
+bool tools::isThinLTOSplitEnabled(const ArgList &Args) {
+  bool Enabled = false;
+  for (const Arg *A : Args.filtered(options::OPT_mllvm)) {
+    for (StringRef V : A->getValues()) {
+      // Option names may be introduced by one or two dashes.
+      if (!V.consume_front("--") && !V.consume_front("-"))
+        continue;
+      if (!V.consume_front("thinlto-split"))
+        continue;
+      // Skip other options that merely share the prefix, such as
+      // thinlto-split-partitions.
+      if (!V.empty() && !V.consume_front("="))
+        continue;
+      if (V.empty() || V == "true" || V == "TRUE" || V == "True" || V == "1")
+        Enabled = true;
+      else if (V == "false" || V == "FALSE" || V == "False" || V == "0")
+        Enabled = false;
+      // Any other value is not a valid boolean; leave the state untouched and
+      // let the backend option parser diagnose it.
+    }
+  }
+  return Enabled;
+}
+
+/// Returns \p Output with ".thinlto-split.rsp" appended.
+std::string tools::getThinLTOSplitResponseFile(StringRef Output) {
+  std::string ResponseFile = Output.str();
+  ResponseFile += ".thinlto-split.rsp";
+  return ResponseFile;
+}
+
+/// Single gating predicate shared by the cc1 `-thinlto-split-output-list` flag
+/// and the driver's merge action.
+///
+/// Driver-mediated split applies to the distributed backend compile only: cc1
+/// (-fthinlto-index) emits one object per partition and the driver merges them
+/// with `ld.lld -r`. It is gated on -c because the merged object must be the
+/// final output; without -c the object feeds a link step and there is no merge
+/// action (link-time splitting is then handled inside the linker itself).
+bool tools::isThinLTOSplitMergeEnabled(const ToolChain &TC,
+                                       const ArgList &Args) {
+  // The checks run in a fixed order and stop at the first failure.
+  if (!isThinLTOSplitEnabled(Args))
+    return false;
+  if (!Args.hasArg(options::OPT_fthinlto_index_EQ))
+    return false;
+  if (!Args.hasArg(options::OPT_c))
+    return false;
+  if (!TC.getTriple().isOSBinFormatELF())
+    return false;
+  return TC.canConstructThinLTOMergeJob();
+}
+
// PS4/PS5 uses -ffunction-sections and -fdata-sections by default.
bool tools::isUseSeparateSections(const llvm::Triple &Triple) {
return Triple.isPS();
diff --git a/clang/lib/Driver/ToolChains/Gnu.cpp b/clang/lib/Driver/ToolChains/Gnu.cpp
index 131dd725c7289..72c2e93f9f824 100644
--- a/clang/lib/Driver/ToolChains/Gnu.cpp
+++ b/clang/lib/Driver/ToolChains/Gnu.cpp
@@ -27,6 +27,7 @@
#include "llvm/ADT/Twine.h"
#include "llvm/Option/ArgList.h"
#include "llvm/Support/CodeGen.h"
+#include "llvm/Support/FileSystem.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/VirtualFileSystem.h"
#include "llvm/TargetParser/RISCVISAInfo.h"
@@ -280,6 +281,46 @@ void tools::gnutools::Linker::ConstructJob(Compilation &C, const JobAction &JA,
const auto &ToolChain = static_cast<const Generic_ELF &>(getToolChain());
const Driver &D = ToolChain.getDriver();
+ // ThinLTOMergeJobAction is ELF-only. See ToolChain::SelectTool for the
+ // routing assertion and Driver::BuildActions for the ELF pre-condition.
+ if (isa<ThinLTOMergeJobAction>(JA)) {
+ ArgStringList CmdArgs;
+ const char *BaseInput = nullptr;
+ for (const auto &II : Inputs) {
+ if (II.isFilename()) {
+ BaseInput = II.getFilename();
+ break;
+ }
+ }
+ assert(BaseInput && "ThinLTO merge job requires an input file");
+
+ // Response file cc1 wrote the partition objects to (shared helper keeps the
+ // name in sync with Clang::ConstructJob). Register it as a Compilation
+ // temporary so it is cleaned up normally (honoring -save-temps).
+ const char *ResponseFile =
+ Args.MakeArgString(tools::getThinLTOSplitResponseFile(BaseInput));
+ C.addTempFile(ResponseFile);
+
+ CmdArgs.push_back("-r");
+ CmdArgs.push_back("-o");
+ CmdArgs.push_back(Output.getFilename());
+ CmdArgs.push_back(Args.MakeArgString(Twine("@") + ResponseFile));
+
+ // Use clang's normal program lookup so -B and configured program paths can
+ // select the matching ld.lld.
+ std::string LLDPath = ToolChain.GetProgramPath("ld.lld");
+ if (!llvm::sys::fs::can_execute(LLDPath)) {
+ D.Diag(clang::diag::err_drv_lto_split_requires_lld) << LLDPath;
+ return;
+ }
+ const char *Exec = Args.MakeArgString(LLDPath);
+
+ C.addCommand(std::make_unique<ThinLTOMergeCommand>(
+ JA, *this, ResponseFileSupport::AtFileCurCP(), Exec, CmdArgs, Inputs,
+ Output, ResponseFile, !D.isSaveTempsEnabled()));
+ return;
+ }
+
const llvm::Triple &Triple = getToolChain().getEffectiveTriple();
const llvm::Triple::ArchType Arch = ToolChain.getArch();
diff --git a/clang/lib/Driver/ToolChains/Gnu.h b/clang/lib/Driver/ToolChains/Gnu.h
index 5fe143b4aa035..5aaccebe1cfde 100644
--- a/clang/lib/Driver/ToolChains/Gnu.h
+++ b/clang/lib/Driver/ToolChains/Gnu.h
@@ -58,6 +58,7 @@ class LLVM_LIBRARY_VISIBILITY Linker : public Tool {
bool hasIntegratedCPP() const override { return false; }
bool isLinkJob() const override { return true; }
+ bool canConstructThinLTOMergeJob() const override { return true; }
void ConstructJob(Compilation &C, const JobAction &JA,
const InputInfo &Output, const InputInfoList &Inputs,
diff --git a/clang/test/CodeGen/thinlto-split/split-output-list-dwo.ll b/clang/test/CodeGen/thinlto-split/split-output-list-dwo.ll
new file mode 100644
index 0000000000000..e3064684e542a
--- /dev/null
+++ b/clang/test/CodeGen/thinlto-split/split-output-list-dwo.ll
@@ -0,0 +1,169 @@
+; REQUIRES: aarch64-registered-target
+
+; End-to-end test for ThinLTO split codegen + gsplit-dwarf.
+;
+; Verifies that when -gsplit-dwarf is combined with ThinLTO split codegen:
+; 1. Each partition generates its own .dwo file named <stem>.thinlto-split.<Task>.dwo
+; 2. .o and .dwo partition numbering is consistent
+; 3. Each partition's skeleton CU references the correct per-partition .dwo via DW_AT_dwo_name
+; 4. DWO_id in the skeleton CU matches the compile unit in the corresponding .dwo
+; 5. lld -r merge produces a final output.o whose skeleton CUs still reference valid .dwo files
+; 6. No 4294967295.dwo is generated
+; 7. No shared single .dwo file is generated (no output.dwo without partition suffix)
+; 8. Both -gsplit-dwarf=split and -gsplit-dwarf=single produce per-partition .dwo
+
+; --- Step 1: Generate ThinLTO bitcode with debug info ---
+; The IR module must have !dbg metadata to produce DWARF output.
+; RUN: opt -thinlto-bc -thinlto-split-lto-unit -o %t.o %s
+
+; --- Step 2: Generate distributed ThinLTO index ---
+; RUN: llvm-lto2 run -thinlto-distributed-indexes %t.o \
+; RUN: -o %t.index \
+; RUN: -r=%t.o,caller_a,px \
+; RUN: -r=%t.o,caller_b,px
+
+; --- Step 3: -gsplit-dwarf=split path (both -split-dwarf-file and -split-dwarf-output) ---
+; RUN: %clang_cc1 -triple aarch64-unknown-linux-gnu \
+; RUN: -emit-obj -fthinlto-index=%t.o.thinlto.bc \
+; RUN: -thinlto-split-output-list=%t.split.rsp \
+; RUN: -split-dwarf-file %t.split.o.dwo \
+; RUN: -split-dwarf-output %t.split.o.dwo \
+; RUN: -o %t.split.o -x ir %t.o \
+; RUN: -debug-info-kind=constructor -dwarf-version=5 \
+; RUN: -mllvm -thinlto-split=true \
+; RUN: -mllvm -thinlto-split-partitions=2 \
+; RUN: -mllvm -thinlto-split-module-size-threshold=0 \
+; RUN: -mllvm -thinlto-split-module-size-rate-threshold=2.0
+
+; --- Step 3a: Verify per-partition .dwo files exist ---
+; RUN: ls %t.split.o.thinlto-split.0.dwo
+; RUN: ls %t.split.o.thinlto-split.1.dwo
+
+; --- Step 3b: Verify NO single shared .dwo ---
+; RUN: not ls %t.split.o.dwo 2>/dev/null
+
+; --- Step 3c: Verify no 4294967295.dwo ---
+; RUN: not ls %t.split.o.thinlto-split.4294967295.dwo 2>/dev/null
+
+; --- Step 3d: Verify .dwo files are valid ELF ---
+; RUN: llvm-readobj -h %t.split.o.thinlto-split.0.dwo | FileCheck %s --check-prefix=DWO0-ELF
+; DWO0-ELF: Type: Relocatable
+
+; RUN: llvm-readobj -h %t.split.o.thinlto-split.1.dwo | FileCheck %s --check-prefix=DWO1-ELF
+; DWO1-ELF: Type: Relocatable
+
+; --- Step 3e: Verify skeleton CU DW_AT_dwo_name ---
+; RUN: llvm-dwarfdump -debug-info %t.split.o.thinlto-split.0.o | FileCheck %s --check-prefix=SKELETON0
+; SKELETON0: DW_TAG_skeleton_unit
+; SKELETON0: DW_AT_dwo_name{{.*}}thinlto-split.0.dwo
+
+; RUN: llvm-dwarfdump -debug-info %t.split.o.thinlto-split.1.o | FileCheck %s --check-prefix=SKELETON1
+; SKELETON1: DW_TAG_skeleton_unit
+; SKELETON1: DW_AT_dwo_name{{.*}}thinlto-split.1.dwo
+
+; --- Step 3f: Verify DWO_id is present in both skeleton CU and .dwo ---
+; FileCheck variables cannot span separate RUN lines, so we verify presence
+; and format rather than exact value equality across files. End-to-end shell
+; testing confirms DWO_id values match between skeleton CU and .dwo.
+; RUN: llvm-dwarfdump -debug-info %t.split.o.thinlto-split.0.o | FileCheck %s --check-prefix=DWO_ID0
+; DWO_ID0: DWO_id = 0x{{[0-9a-f]+}}
+
+; RUN: llvm-dwarfdump -debug-info %t.split.o.thinlto-split.0.dwo | FileCheck %s --check-prefix=DWO_DWO0
+; DWO_DWO0: DWO_id = 0x{{[0-9a-f]+}}
+
+; RUN: llvm-dwarfdump -debug-info %t.split.o.thinlto-split.1.o | FileCheck %s --check-prefix=DWO_ID1
+; DWO_ID1: DWO_id = 0x{{[0-9a-f]+}}
+
+; RUN: llvm-dwarfdump -debug-info %t.split.o.thinlto-split.1.dwo | FileCheck %s --check-prefix=DWO_DWO1
+; DWO_DWO1: DWO_id = 0x{{[0-9a-f]+}}
+
+; --- Step 4: Verify lld -r merge preserves .dwo references ---
+; RUN: ld.lld -r -o %t.merged.o %t.split.o.thinlto-split.0.o %t.split.o.thinlto-split.1.o
+
+; RUN: llvm-readobj -h %t.merged.o | FileCheck %s --check-prefix=MERGED-ELF
+; MERGED-ELF: Type: Relocatable
+
+; RUN: llvm-dwarfdump -debug-info %t.merged.o | FileCheck %s --check-prefix=MERGED-DWO
+; MERGED-DWO: DW_TAG_skeleton_unit
+; MERGED-DWO: DW_AT_dwo_name{{.*}}thinlto-split.0.dwo
+; MERGED-DWO: DW_TAG_skeleton_unit
+; MERGED-DWO: DW_AT_dwo_name{{.*}}thinlto-split.1.dwo
+
+; --- Step 5: -gsplit-dwarf=single path (only -split-dwarf-file, no -split-dwarf-output) ---
+; -gsplit-dwarf=single tells cc1 to embed .dwo content into the .o file and
+; not write a separate .dwo file. However, when ThinLTO split is active,
+; per-partition .dwo naming must still be used so each partition's skeleton CU
+; can reference its own .dwo. This tests that SplitDwarfOutputStem is derived
+; from SplitDwarfFile when SplitDwarfOutput is absent.
+
+; RUN: %clang_cc1 -triple aarch64-unknown-linux-gnu \
+; RUN: -emit-obj -fthinlto-index=%t.o.thinlto.bc \
+; RUN: -thinlto-split-output-list=%t.single.rsp \
+; RUN: -split-dwarf-file %t.single.o.dwo \
+; RUN: -o %t.single.o -x ir %t.o \
+; RUN: -debug-info-kind=constructor -dwarf-version=5 \
+; RUN: -mllvm -thinlto-split=true \
+; RUN: -mllvm -thinlto-split-partitions=2 \
+; RUN: -mllvm -thinlto-split-module-size-threshold=0 \
+; RUN: -mllvm -thinlto-split-module-size-rate-threshold=2.0
+
+; --- Step 5a: Verify per-partition .dwo files exist even without -split-dwarf-output ---
+; RUN: ls %t.single.o.thinlto-split.0.dwo
+; RUN: ls %t.single.o.thinlto-split.1.dwo
+
+; --- Step 5b: Verify skeleton CU DW_AT_dwo_name ---
+; RUN: llvm-dwarfdump -debug-info %t.single.o.thinlto-split.0.o | FileCheck %s --check-prefix=SINGLE0
+; SINGLE0: DW_TAG_skeleton_unit
+; SINGLE0: DW_AT_dwo_name{{.*}}thinlto-split.0.dwo
+
+; RUN: llvm-dwarfdump -debug-info %t.single.o.thinlto-split.1.o | FileCheck %s --check-prefix=SINGLE1
+; SINGLE1: DW_TAG_skeleton_unit
+; SINGLE1: DW_AT_dwo_name{{.*}}thinlto-split.1.dwo
+
+; --- Step 6: Verify non-split path with gsplit-dwarf still works ---
+; RUN: %clang_cc1 -triple aarch64-unknown-linux-gnu \
+; RUN: -emit-obj -fthinlto-index=%t.o.thinlto.bc \
+; RUN: -split-dwarf-file %t.nosplit.o.dwo \
+; RUN: -split-dwarf-output %t.nosplit.o.dwo \
+; RUN: -o %t.nosplit.o -x ir %t.o \
+; RUN: -debug-info-kind=constructor -dwarf-version=5 \
+; RUN: -mllvm -thinlto-split=false
+
+; RUN: ls %t.nosplit.o.dwo
+; RUN: not ls %t.nosplit.o.thinlto-split.0.dwo 2>/dev/null
+
+; --- IR source module with debug metadata ---
+target triple = "aarch64-unknown-linux-gnu"
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "test", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None)
+!1 = !DIFile(filename: "source.c", directory: "/test")
+!2 = !DISubroutineType(types: !{})
+!3 = !{i32 2, !"Dwarf Version", i32 5}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+
+define internal void @shared() !dbg !5 {
+entry:
+ ret void
+}
+
+define void @caller_a() !dbg !7 {
+entry:
+ call void @shared(), !dbg !9
+ ret void
+}
+
+define void @caller_b() !dbg !10 {
+entry:
+ call void @shared(), !dbg !12
+ ret void
+}
+
+!5 = distinct !DISubprogram(name: "shared", scope: !1, file: !1, line: 1, type: !2, spFlags: DISPFlagDefinition, unit: !0)
+!7 = distinct !DISubprogram(name: "caller_a", scope: !1, file: !1, line: 4, type: !2, spFlags: DISPFlagDefinition, unit: !0)
+!9 = !DILocation(line: 5, column: 3, scope: !7)
+!10 = distinct !DISubprogram(name: "caller_b", scope: !1, file: !1, line: 8, type: !2, spFlags: DISPFlagDefinition, unit: !0)
+!12 = !DILocation(line: 9, column: 3, scope: !10)
\ No newline at end of file
diff --git a/clang/test/CodeGen/thinlto-split/split-output-list.ll b/clang/test/CodeGen/thinlto-split/split-output-list.ll
new file mode 100644
index 0000000000000..88abd8dd87019
--- /dev/null
+++ b/clang/test/CodeGen/thinlto-split/split-output-list.ll
@@ -0,0 +1,75 @@
+; REQUIRES: aarch64-registered-target
+
+; RUN: opt -thinlto-bc -o %t.o %s
+; RUN: llvm-lto2 run -thinlto-distributed-indexes %t.o \
+; RUN: -o %t.index \
+; RUN: -r=%t.o,caller_a,px \
+; RUN: -r=%t.o,caller_b,px
+
+; RUN: %clang_cc1 -triple aarch64-unknown-linux-gnu \
+; RUN: -emit-obj -fthinlto-index=%t.o.thinlto.bc \
+; RUN: -o %t.split.o -x ir %t.o \
+; RUN: -mllvm -thinlto-split=true \
+; RUN: -mllvm -thinlto-split-partitions=2 \
+; RUN: -mllvm -thinlto-split-module-size-threshold=0 \
+; RUN: -mllvm -thinlto-split-module-size-rate-threshold=2.0 \
+; RUN: -thinlto-split-output-list=%t.split.rsp
+; RUN: FileCheck %s --check-prefix=SPLIT-RSP --input-file=%t.split.rsp
+; RUN: llvm-nm %t.split.o.thinlto-split.0.o | FileCheck %s --check-prefix=NM0
+; RUN: llvm-nm %t.split.o.thinlto-split.1.o | FileCheck %s --check-prefix=NM1
+
+; RUN: %clang_cc1 -triple aarch64-unknown-linux-gnu \
+; RUN: -emit-obj -fthinlto-index=%t.o.thinlto.bc \
+; RUN: -o %t.skip.o -x ir %t.o \
+; RUN: -mllvm -thinlto-split=true \
+; RUN: -mllvm -thinlto-split-partitions=2 \
+; RUN: -thinlto-split-output-list=%t.skip.rsp
+; RUN: FileCheck %s --check-prefix=SKIP-RSP --input-file=%t.skip.rsp
+
+; SPLIT-RSP: {{.*}}.split.o.thinlto-split.0.o
+; SPLIT-RSP-NEXT: {{.*}}.split.o.thinlto-split.1.o
+; SPLIT-RSP-NOT: {{.*}}.split.o{{$}}
+; SPLIT-RSP-NOT: thinlto-split.2.o
+; SPLIT-RSP-NOT: {{.*}}.merged.o
+; SPLIT-RSP-NOT: 4294967295
+
+; SKIP-RSP: {{.*}}.skip.o.thinlto-split.0.o
+; SKIP-RSP-NOT: {{.*}}.skip.o{{$}}
+; SKIP-RSP-NOT: thinlto-split.1.o
+; SKIP-RSP-NOT: 4294967295
+
+; Verify that the user-specified -o path is empty (0 bytes) when split codegen
+; is active. In the split path, cc1 replaces the original output stream with a
+; null stream so no object content lands in the -o file. The driver's ld.lld -r
+; merge produces the final -o content separately.
+; RUN: wc -c %t.split.o | FileCheck %s --check-prefix=EMPTY-SPLIT
+; EMPTY-SPLIT: 0
+; RUN: wc -c %t.skip.o | FileCheck %s --check-prefix=EMPTY-SKIP
+; EMPTY-SKIP: 0
+
+; NM0: T caller_b
+; NM0: T {{.*shared[._][0-9a-f]+.*}}
+; NM0-NOT: T shared{{$}}
+
+; NM1: T caller_a
+; NM1: U {{.*shared[._][0-9a-f]+.*}}
+; NM1-NOT: T shared{{$}}
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define internal void @shared() {
+entry:
+ ret void
+}
+
+define void @caller_a() {
+entry:
+ call void @shared()
+ ret void
+}
+
+define void @caller_b() {
+entry:
+ call void @shared()
+ ret void
+}
diff --git a/llvm/test/Transforms/SplitModuleCG/split-promoted-rename.ll b/clang/test/CodeGen/thinlto-split/split-promoted-rename.ll
similarity index 58%
rename from llvm/test/Transforms/SplitModuleCG/split-promoted-rename.ll
rename to clang/test/CodeGen/thinlto-split/split-promoted-rename.ll
index 6c51141a9ad85..2cf3aa41a9c75 100644
--- a/llvm/test/Transforms/SplitModuleCG/split-promoted-rename.ll
+++ b/clang/test/CodeGen/thinlto-split/split-promoted-rename.ll
@@ -1,13 +1,21 @@
+; REQUIRES: aarch64-registered-target
; Test that internal symbols promoted during module splitting are consistently
; renamed with an MD5 suffix across all partitions.
;
-; RUN: opt -module-summary %s -o %t.bc
-; RUN: llvm-lto2 run %t.bc -o %t \
-; RUN: -thinlto-split=true \
-; RUN: -thinlto-split-partitions=2 -thinlto-split-module-size-threshold=0 \
-; RUN: -r=%t.bc,caller_a,px \
-; RUN: -r=%t.bc,caller_b,px
-; RUN: llvm-nm %t.1 | FileCheck %s
+; RUN: opt -thinlto-bc -o %t.o %s
+; RUN: llvm-lto2 run -thinlto-distributed-indexes %t.o \
+; RUN: -o %t.index \
+; RUN: -r=%t.o,caller_a,px \
+; RUN: -r=%t.o,caller_b,px
+
+; RUN: %clang_cc1 -triple aarch64-unknown-linux-gnu \
+; RUN: -emit-obj -fthinlto-index=%t.o.thinlto.bc \
+; RUN: -o %t.split.o -x ir %t.o \
+; RUN: -mllvm -thinlto-split=true \
+; RUN: -mllvm -thinlto-split-partitions=1 \
+; RUN: -mllvm -thinlto-split-module-size-threshold=0 \
+; RUN: -thinlto-split-output-list=%t.split.rsp
+; RUN: llvm-nm %t.split.o.thinlto-split.0.o | FileCheck %s
; CHECK-DAG: T caller_a
; CHECK-DAG: T caller_b
@@ -15,7 +23,7 @@
; CHECK-NOT: T promoted_internal{{$}}
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
+target triple = "aarch64-unknown-linux-gnu"
; @promoted_internal is internal. SplitModuleCG::dealWithMpart's checkPromoted
; records it in PromotedRenames. splitOptAndCodeGenThin applies the rename
diff --git a/clang/test/CodeGen/thinlto-split/thinlto-split-merge-realistic.ll b/clang/test/CodeGen/thinlto-split/thinlto-split-merge-realistic.ll
new file mode 100644
index 0000000000000..c024e4c37f2fb
--- /dev/null
+++ b/clang/test/CodeGen/thinlto-split/thinlto-split-merge-realistic.ll
@@ -0,0 +1,251 @@
+; REQUIRES: aarch64-registered-target
+
+; End-to-end test for ThinLTO split + Driver-mediated lld merge.
+;
+; This test simulates a realistic distributed ThinLTO backend scenario:
+; 1. Generate ThinLTO bitcode from an IR module containing multiple functions,
+; global variables, global constructors (init_array), internal functions
+; referenced across partitions, comdat/weak symbols.
+; 2. Generate the distributed ThinLTO index.
+; 3. Invoke clang Driver to compile with -fthinlto-index=... and -thinlto-split,
+; producing partition objects and lld -r merging them into a single output.o.
+; 4. Validate the merged output.o is a valid ELF relocatable object with
+; expected symbols, sections, and correct RSP ordering.
+
+; --- Step 1: Generate ThinLTO bitcode ---
+; RUN: opt -thinlto-bc -thinlto-split-lto-unit -o %t.o %s
+
+; --- Step 2: Generate distributed ThinLTO index ---
+; RUN: llvm-lto2 run -thinlto-distributed-indexes %t.o \
+; RUN: -o %t.index \
+; RUN: -r=%t.o,func_a,px \
+; RUN: -r=%t.o,func_b,px \
+; RUN: -r=%t.o,func_c,px \
+; RUN: -r=%t.o,func_d,px \
+; RUN: -r=%t.o,func_e,px \
+; RUN: -r=%t.o,weak_func,px \
+; RUN: -r=%t.o,g_global,px \
+; RUN: -r=%t.o,g_ctor_data,px \
+; RUN: -r=%t.o,comdat_var,px
+
+; --- Step 3: clang_cc1 split path — verify partition objects and RSP ---
+; RUN: %clang_cc1 -triple aarch64-unknown-linux-gnu \
+; RUN: -emit-obj -fthinlto-index=%t.o.thinlto.bc \
+; RUN: -thinlto-split-output-list=%t.cc1.rsp \
+; RUN: -o %t.cc1.o -x ir %t.o \
+; RUN: -mllvm -thinlto-split=true \
+; RUN: -mllvm -thinlto-split-partitions=2 \
+; RUN: -mllvm -thinlto-split-module-size-threshold=0 \
+; RUN: -mllvm -thinlto-split-module-size-rate-threshold=2.0
+
+; Verify RSP contains partition objects in order 0, 1
+; RUN: FileCheck %s --check-prefix=RSP-RSP --input-file=%t.cc1.rsp
+; RSP-RSP: {{.*\.thinlto-split\.0\.o}}
+; RSP-RSP-NEXT: {{.*\.thinlto-split\.1\.o}}
+; RSP should NOT contain the bare -o output
+; RSP-RSP-NOT: {{\.cc1\.o$}}
+; RSP should NOT contain uint32_max (4294967295)
+; RSP-RSP-NOT: 4294967295
+
+; Verify partition objects are valid ELF
+; RUN: llvm-readobj -h %t.cc1.o.thinlto-split.0.o | FileCheck %s --check-prefix=PART0-ELF
+; PART0-ELF: Type: Relocatable
+
+; RUN: llvm-readobj -h %t.cc1.o.thinlto-split.1.o | FileCheck %s --check-prefix=PART1-ELF
+; PART1-ELF: Type: Relocatable
+
+; Verify inter-partition symbol promotion: shared_helper should have hash suffix
+; RUN: llvm-nm %t.cc1.o.thinlto-split.0.o | FileCheck %s --check-prefix=NM0
+; NM0-DAG: T {{shared_helper[._]}}
+; NM0-DAG: T func_
+; NM0-DAG: D g_global
+; NM0-DAG: V comdat_var
+
+; RUN: llvm-nm %t.cc1.o.thinlto-split.1.o | FileCheck %s --check-prefix=NM1
+; NM1-DAG: U {{shared_helper[._]}}
+; NM1-DAG: T func_
+
+; Verify the cc1 -o path exists but is empty (split path writes partition objects instead)
+; RUN: wc -c %t.cc1.o | FileCheck %s --check-prefix=EMPTY-O
+; EMPTY-O: 0
+
+; --- Step 4: clang Driver split path — verify full end-to-end ---
+; The Driver must: (a) invoke cc1 with -thinlto-split-output-list, (b) invoke
+; sibling ld.lld -r to merge partition objects, (c) produce a single valid output.o
+
+; First, verify the Driver generates correct command lines (-###)
+; RUN: %clang -### -target aarch64-unknown-linux-gnu \
+; RUN: -B%S/Inputs/lld \
+; RUN: -c -fthinlto-index=%t.o.thinlto.bc -x ir %t.o -o %t.driver.o \
+; RUN: -mllvm -thinlto-split=true \
+; RUN: -mllvm -thinlto-split-partitions=2 \
+; RUN: -mllvm -thinlto-split-module-size-threshold=0 \
+; RUN: -mllvm -thinlto-split-module-size-rate-threshold=2.0 2>&1 | FileCheck %s --check-prefix=DRIVER
+
+; DRIVER: "-cc1"
+; DRIVER-SAME: "-fthinlto-index={{.*}}.thinlto.bc"
+; DRIVER-SAME: "-thinlto-split-output-list=[[RSP:[^"]+\.thinlto-split\.rsp]]"
+; DRIVER: "{{.*}}ld.lld" "-r" "-o" "{{.*}}driver.o" "@[[RSP]]"
+
+; Non-split path should NOT have -thinlto-split-output-list or ld.lld
+; RUN: %clang -### -target aarch64-unknown-linux-gnu \
+; RUN: -B%S/Inputs/lld \
+; RUN: -c -fthinlto-index=%t.o.thinlto.bc -x ir %t.o -o %t.nosplit_driver.o \
+; RUN: -mllvm -thinlto-split=false 2>&1 | FileCheck %s --check-prefix=NOSPLIT-DRIVER
+
+; NOSPLIT-DRIVER: "-cc1"
+; NOSPLIT-DRIVER-NOT: thinlto-split-output-list
+; NOSPLIT-DRIVER-NOT: ld.lld
+
+; --- Step 4b: Verify -save-temps + ThinLTO split ---
+; -save-temps normally prevents collapsing the assemble step, but ThinLTO split
+; must still emit objects directly (-emit-obj) because the assembly path cannot
+; produce multiple partition outputs (AcceptsMultipleOutputsPerTask).
+
+; RUN: %clang -### -target aarch64-unknown-linux-gnu \
+; RUN: -B%S/Inputs/lld \
+; RUN: -save-temps -c -fthinlto-index=%t.o.thinlto.bc -x ir %t.o -o %t.save.o \
+; RUN: -mllvm -thinlto-split=true \
+; RUN: -mllvm -thinlto-split-partitions=2 \
+; RUN: -mllvm -thinlto-split-module-size-threshold=0 \
+; RUN: -mllvm -thinlto-split-module-size-rate-threshold=2.0 2>&1 | FileCheck %s --check-prefix=SAVE-TEMPS
+
+; cc1 must use -emit-obj (not -S) even with -save-temps
+; SAVE-TEMPS: "-cc1"
+; SAVE-TEMPS-SAME: "-emit-obj"
+; SAVE-TEMPS-SAME: "-thinlto-split-output-list=[[SAVE_RSP:[^"]+\.thinlto-split\.rsp]]"
+; SAVE-TEMPS: ld.lld{{.*}}-r{{.*}}-o{{.*}}@[[SAVE_RSP]]
+
+; Verify ordinary -save-temps without split still uses -S (not collapsed)
+; RUN: %clang -### -target aarch64-unknown-linux-gnu \
+; RUN: -save-temps -c -fthinlto-index=%t.o.thinlto.bc -x ir %t.o -o %t.save_nosplit.o \
+; RUN: -mllvm -thinlto-split=false 2>&1 | FileCheck %s --check-prefix=SAVE-TEMPS-NOSPLIT
+
+; SAVE-TEMPS-NOSPLIT: "-cc1"
+; SAVE-TEMPS-NOSPLIT-SAME: "-S"
+; SAVE-TEMPS-NOSPLIT-NOT: thinlto-split-output-list
+
+; --- Step 5: Verify merged output.o is valid ---
+; Use clang_cc1 + ld.lld directly to verify the merge produces a valid ELF.
+; (We cannot use %clang Driver directly because it needs ld.lld in PATH,
+; and lit test environments may not guarantee that. So we verify the merge
+; result by manually running ld.lld on the partition objects.)
+
+; RUN: ld.lld -r -o %t.merged.o %t.cc1.o.thinlto-split.0.o %t.cc1.o.thinlto-split.1.o
+
+; merged.o must be a valid ELF relocatable object
+; RUN: llvm-readobj -h %t.merged.o | FileCheck %s --check-prefix=MERGED-ELF
+; MERGED-ELF: Type: Relocatable
+
+; merged.o must contain .init_array section (from global constructor)
+; RUN: llvm-readobj -S %t.merged.o | FileCheck %s --check-prefix=MERGED-SECTIONS
+; MERGED-SECTIONS: Name: .init_array
+; MERGED-SECTIONS: Name: .group
+
+; merged.o must contain symbols from BOTH partitions
+; RUN: llvm-nm %t.merged.o | FileCheck %s --check-prefix=MERGED-NM
+; MERGED-NM-DAG: T func_a
+; MERGED-NM-DAG: T func_b
+; MERGED-NM-DAG: T func_c
+; MERGED-NM-DAG: T func_d
+; MERGED-NM-DAG: T func_e
+; MERGED-NM-DAG: W weak_func
+; MERGED-NM-DAG: {{D|B}} g_global
+; MERGED-NM-DAG: {{D|B}} g_ctor_data
+; MERGED-NM-DAG: {{V|v}} comdat_var
+; Internal symbols promoted across partitions
+; MERGED-NM-DAG: T {{.*shared_helper[._]}}
+; MERGED-NM-DAG: T {{.*ctor_init[._]}}
+
+; --- Step 6: Verify non-split path produces correct output ---
+; RUN: %clang_cc1 -triple aarch64-unknown-linux-gnu \
+; RUN: -emit-obj -fthinlto-index=%t.o.thinlto.bc \
+; RUN: -o %t.nosplit.o -x ir %t.o
+
+; RUN: llvm-readobj -h %t.nosplit.o | FileCheck %s --check-prefix=NOSPLIT-ELF
+; NOSPLIT-ELF: Type: Relocatable
+
+; RUN: llvm-nm %t.nosplit.o | FileCheck %s --check-prefix=NOSPLIT-NM
+; NOSPLIT-NM-DAG: T func_a
+; NOSPLIT-NM-DAG: T func_b
+; NOSPLIT-NM-DAG: t shared_helper
+; NOSPLIT-NM-DAG: t ctor_init
+
+; Verify no partition objects leaked in non-split path
+; RUN: not ls %t.nosplit.o.thinlto-split.0.o 2>/dev/null
+; RUN: not ls %t.nosplit.o.thinlto-split.1.o 2>/dev/null
+
+; --- IR source module ---
+; Realistic module with features that stress the split+merge path:
+; - Internal function shared_helper referenced by multiple roots (promoted across partitions)
+; - Global constructor (init_array) with ctor_init in .text.startup
+; - Comdat group with weak_odr variable
+; - Weak function
+; - Multiple root functions to force 2+ partitions
+
+target triple = "aarch64-unknown-linux-gnu"
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+
+@g_global = global i32 42
+@g_ctor_data = global i32 0
+
+@llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [
+ { i32, ptr, ptr } { i32 65535, ptr @ctor_init, ptr @g_ctor_data }
+]
+
+$comdat_grp = comdat any
+@comdat_var = weak_odr global i32 10, comdat($comdat_grp)
+
+define internal void @shared_helper() {
+entry:
+ store volatile i32 1, ptr @g_global, align 4
+ ret void
+}
+
+define weak void @weak_func() {
+entry:
+ ret void
+}
+
+define internal void @ctor_init() section ".text.startup" {
+entry:
+ store i32 100, ptr @g_ctor_data, align 4
+ ret void
+}
+
+define void @func_a() {
+entry:
+ call void @shared_helper()
+ call void @weak_func()
+ %val = load i32, ptr @comdat_var, align 4
+ %sum = add i32 %val, 1
+ store i32 %sum, ptr @g_global, align 4
+ ret void
+}
+
+define void @func_b() {
+entry:
+ call void @shared_helper()
+ store volatile i32 2, ptr @g_global, align 4
+ ret void
+}
+
+define void @func_c() {
+entry:
+ call void @shared_helper()
+ ret void
+}
+
+define void @func_d() {
+entry:
+ %v = load i32, ptr @g_global, align 4
+ %r = add i32 %v, 10
+ store i32 %r, ptr @g_global, align 4
+ call void @shared_helper()
+ ret void
+}
+
+define void @func_e() {
+entry:
+ call void @weak_func()
+ ret void
+}
\ No newline at end of file
diff --git a/clang/test/Driver/thinlto-split-merge.c b/clang/test/Driver/thinlto-split-merge.c
new file mode 100644
index 0000000000000..eb062754729ee
--- /dev/null
+++ b/clang/test/Driver/thinlto-split-merge.c
@@ -0,0 +1,64 @@
+// RUN: %clang -### -target aarch64-unknown-linux-gnu -B%S/Inputs/lld \
+// RUN: -c -fthinlto-index=foo.thinlto.bc -x ir %s -o foo.o \
+// RUN: -mllvm -thinlto-split=true \
+// RUN: -mllvm -thinlto-split-partitions=2 2>&1 | FileCheck %s --check-prefix=MERGE
+// RUN: %clang -### -target aarch64-unknown-linux-gnu -B%S/Inputs/lld \
+// RUN: -c -fthinlto-index=foo.thinlto.bc -x ir %s -o foo.o \
+// RUN: -mllvm -thinlto-split=true 2>&1 | FileCheck %s --check-prefix=LLD
+// RUN: %clang -### -target aarch64-unknown-linux-gnu -B%S/Inputs/lld \
+// RUN: -c -fthinlto-index=foo.thinlto.bc -x ir %s -o foo.o \
+// RUN: -mllvm -thinlto-split=false 2>&1 | FileCheck %s --check-prefix=NOSPLIT
+// RUN: %clang -### -target aarch64-unknown-linux-gnu -B%S/Inputs/lld \
+// RUN: -save-temps -c -fthinlto-index=foo.thinlto.bc -x ir %s -o foo.o \
+// RUN: -mllvm -thinlto-split=true 2>&1 | FileCheck %s --check-prefix=SAVE-TEMPS
+// RUN: rm -rf %t.empty
+// RUN: mkdir -p %t.empty
+// RUN: not env PATH= %clang -### -ccc-install-dir %t.empty \
+// RUN: -target aarch64-unknown-linux-gnu \
+// RUN: -c -fthinlto-index=foo.thinlto.bc -x ir %s -o foo.o \
+// RUN: -mllvm -thinlto-split=true 2>&1 | FileCheck %s --check-prefix=MISSING-LLD
+// RUN: %clang -### -target x86_64-unknown-freebsd \
+// RUN: -c -fthinlto-index=foo.thinlto.bc -x ir %s -o foo.o \
+// RUN: -mllvm -thinlto-split=true 2>&1 | FileCheck %s --check-prefix=FREEBSD
+// RUN: %clang -### -target x86_64-unknown-fuchsia \
+// RUN: -c -fthinlto-index=foo.thinlto.bc -x ir %s -o foo.o \
+// RUN: -mllvm -thinlto-split=true 2>&1 | FileCheck %s --check-prefix=FUCHSIA
+// RUN: %clang -### -target x86_64-none-elf \
+// RUN: -c -fthinlto-index=foo.thinlto.bc -x ir %s -o foo.o \
+// RUN: -mllvm -thinlto-split=true 2>&1 | FileCheck %s --check-prefix=BAREMETAL
+
+// MERGE: "-cc1"
+// MERGE-SAME: "-fthinlto-index=foo.thinlto.bc"
+// MERGE-SAME: "-thinlto-split-output-list=[[RSP:[^"]+\.thinlto-split\.rsp]]"
+// MERGE-SAME: "-o" "[[TEMP_O:[^"]+\.o]]"
+// MERGE: "{{.*}}/Inputs/lld/ld.lld" "-r" "-o" "foo.o" "@[[RSP]]"
+
+// LLD: "-cc1"
+// LLD-SAME: "-thinlto-split-output-list=[[LLD_RSP:[^"]+\.thinlto-split\.rsp]]"
+// LLD: "{{.*}}/Inputs/lld/ld.lld" "-r" "-o" "foo.o" "@[[LLD_RSP]]"
+
+// NOSPLIT: "-cc1"
+// NOSPLIT-NOT: thinlto-split-output-list
+// NOSPLIT-NOT: ld.lld
+
+// SAVE-TEMPS: "-cc1"
+// SAVE-TEMPS-SAME: "-emit-obj"
+// SAVE-TEMPS-SAME: "-thinlto-split-output-list=[[SAVE_RSP:[^"]+\.thinlto-split\.rsp]]"
+// SAVE-TEMPS: "{{.*}}/Inputs/lld/ld.lld" "-r" "-o" "foo.o" "@[[SAVE_RSP]]"
+
+// MISSING-LLD: error: cannot find 'ld.lld' required for ThinLTO split codegen
+
+// FREEBSD: "-cc1"
+// FREEBSD-SAME: "-fthinlto-index=foo.thinlto.bc"
+// FREEBSD-NOT: thinlto-split-output-list
+// FREEBSD-NOT: "-r"
+
+// FUCHSIA: "-cc1"
+// FUCHSIA-SAME: "-fthinlto-index=foo.thinlto.bc"
+// FUCHSIA-NOT: thinlto-split-output-list
+// FUCHSIA-NOT: "-r"
+
+// BAREMETAL: "-cc1"
+// BAREMETAL-SAME: "-fthinlto-index=foo.thinlto.bc"
+// BAREMETAL-NOT: thinlto-split-output-list
+// BAREMETAL-NOT: "-r"
diff --git a/lld/ELF/LTO.cpp b/lld/ELF/LTO.cpp
index 27be8859a3f65..a222f01502cd3 100644
--- a/lld/ELF/LTO.cpp
+++ b/lld/ELF/LTO.cpp
@@ -22,6 +22,7 @@
#include "llvm/DTLTO/DTLTO.h"
#include "llvm/LTO/Config.h"
#include "llvm/LTO/LTO.h"
+#include "llvm/LTO/LTOBackend.h"
#include "llvm/Support/Caching.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/MemoryBuffer.h"
@@ -173,6 +174,17 @@ static lto::Config createConfig(Ctx &ctx) {
checkError(ctx.e, c.addSaveTemps(ctx.arg.outputFile.str() + ".",
/*UseInputModulePath*/ true,
ctx.arg.saveTempsArgs));
+
+ // With ThinLTO split codegen the in-process backend emits one object per
+ // call-graph partition, which lld consumes directly. Opt in to multiple
+ // outputs per task and the expanded task-id layout (T*Stride+p) so partitions
+ // from different modules occupy distinct slots. Index-only and DTLTO do not
+ // run codegen in lld, so they keep the defaults.
+ if (lto::isThinLTOSplitEnabled() && !ctx.arg.thinLTOIndexOnly &&
+ ctx.arg.dtltoDistributor.empty()) {
+ c.AcceptsMultipleOutputsPerTask = true;
+ c.UseExpandedThinLTOSplitTaskIds = true;
+ }
return c;
}
@@ -382,8 +394,12 @@ SmallVector<std::unique_ptr<InputFile>, 0> BitcodeCompiler::compile() {
if (!ctx.arg.ltoObjPath.empty()) {
saveBuffer(buf[0].second, ctx.arg.ltoObjPath);
+ // With ThinLTO split codegen the task-id space is sparse (only a few of the
+ // Stride slots per module are used), so skip empty slots instead of writing
+ // a zero-length file for every gap.
for (unsigned i = 1; i != maxTasks; ++i)
- saveBuffer(buf[i].second, ctx.arg.ltoObjPath + Twine(i));
+ if (!buf[i].second.empty())
+ saveBuffer(buf[i].second, ctx.arg.ltoObjPath + Twine(i));
}
bool savePrelink = ctx.arg.saveTempsArgs.contains("prelink");
diff --git a/llvm/include/llvm/LTO/Config.h b/llvm/include/llvm/LTO/Config.h
index 2aeb902bcfccf..8cc72ae5a7ea0 100644
--- a/llvm/include/llvm/LTO/Config.h
+++ b/llvm/include/llvm/LTO/Config.h
@@ -105,6 +105,20 @@ struct Config {
/// distinguished.
mutable bool Dtlto = 0;
+ /// True if the client can receive multiple native objects per logical
+ /// ThinLTO task (required to consume ThinLTO split codegen; sized via
+ /// LTO::getMaxTasks()). Clients that leave it false hard-error on a split.
+ bool AcceptsMultipleOutputsPerTask = false;
+
+ /// Report ThinLTO split partition `p` of task `T` as id
+ /// `T * ThinLTOSplitTaskIdStride + p` (disjoint ranges per module). Set by
+ /// in-process clients like lld; the distributed backend leaves it false.
+ bool UseExpandedThinLTOSplitTaskIds = false;
+
+ /// Max partition objects per task under UseExpandedThinLTOSplitTaskIds. Zero
+ /// lets LTO::getMaxTasks() pick a value; must be non-zero once backends run.
+ unsigned ThinLTOSplitTaskIdStride = 0;
+
/// Allows non-imported definitions to get the potentially more constraining
/// visibility from the prevailing definition. FromPrevailing is the default
/// because it works for many binary formats. ELF can use the more optimized
@@ -152,6 +166,9 @@ struct Config {
/// all .dwo files will be written to the same path. Not used in skeleton CU.
std::string SplitDwarfOutput;
+ /// Stem for per-partition ThinLTO split .dwo files.
+ std::string SplitDwarfOutputStem;
+
/// Optimization remarks file path.
std::string RemarksFilename;
diff --git a/llvm/include/llvm/LTO/LTO.h b/llvm/include/llvm/LTO/LTO.h
index b7f904aebaa66..0b42009e58eb0 100644
--- a/llvm/include/llvm/LTO/LTO.h
+++ b/llvm/include/llvm/LTO/LTO.h
@@ -666,6 +666,11 @@ class LTO {
Error checkPartiallySplit();
+ /// ThinLTO split task-id stride (see Config::ThinLTOSplitTaskIdStride);
+ /// cached so getMaxTasks() and runThinLTO() agree. Returns 1 when off.
+ unsigned getThinLTOSplitTaskIdStride() const;
+ mutable std::optional<unsigned> ThinLTOSplitTaskIdStrideCache;
+
mutable bool CalledGetMaxTasks = false;
// LTO mode when using Unified LTO.
diff --git a/llvm/include/llvm/LTO/LTOBackend.h b/llvm/include/llvm/LTO/LTOBackend.h
index 4bb38529ec754..08551445d5381 100644
--- a/llvm/include/llvm/LTO/LTOBackend.h
+++ b/llvm/include/llvm/LTO/LTOBackend.h
@@ -34,6 +34,14 @@ class Target;
namespace lto {
+/// Returns true if ThinLTO split codegen (`-thinlto-split`) is requested, in
+/// which case one logical task may emit several native objects.
+LLVM_ABI bool isThinLTOSplitEnabled();
+
+/// Upper bound on partition objects per task when split is active: the
+/// configured `-thinlto-split-partitions`, or 0 (no static bound) / 1 (off).
+LLVM_ABI unsigned getThinLTOSplitMaxPartitions();
+
/// Runs middle-end LTO optimizations on \p Mod.
LLVM_ABI bool opt(const Config &Conf, TargetMachine *TM, unsigned Task,
Module &Mod, bool IsThinLTO,
diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp
index a811cddfb2348..e18ddee1c7efd 100644
--- a/llvm/lib/LTO/LTO.cpp
+++ b/llvm/lib/LTO/LTO.cpp
@@ -63,6 +63,7 @@
#include "llvm/Transforms/Utils/FunctionImportUtils.h"
#include "llvm/Transforms/Utils/SplitModule.h"
+#include <limits>
#include <optional>
#include <set>
@@ -108,6 +109,14 @@ void LTO::emitRemark(OptimizationRemark &Remark) {
static cl::opt<bool>
DumpThinCGSCCs("dump-thin-cg-sccs", cl::init(false), cl::Hidden,
cl::desc("Dump the SCCs in the ThinLTO index's callgraph"));
+
+/// Caps the auto-selected ThinLTO split partitions per module (when
+/// -thinlto-split-partitions is 0), bounding the task-id stride and the
+/// client's getMaxTasks()-sized output table.
+static cl::opt<unsigned> MaxAutoThinLTOSplitPartitions(
+ "max-auto-thinlto-split-partitions", cl::init(32), cl::Hidden,
+ cl::desc("Cap on auto-selected ThinLTO split partitions per module"));
+
namespace llvm {
extern cl::opt<bool> CodeGenDataThinLTOTwoRounds;
extern cl::opt<bool> ForceImportAll;
@@ -1264,11 +1273,51 @@ LTO::addThinLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms,
return Res;
}
+// Number of task-id slots reserved per logical task. Returns 1 when expanded
+// split task ids are off; otherwise the configured partition count, or (when
+// that is 0) the largest per-module defined-summary count limited by
+// -max-auto-thinlto-split-partitions. The result is cached and never 0.
+unsigned LTO::getThinLTOSplitTaskIdStride() const {
+  if (!Conf.UseExpandedThinLTOSplitTaskIds)
+    return 1;
+  if (ThinLTOSplitTaskIdStrideCache)
+    return *ThinLTOSplitTaskIdStrideCache;
+
+  // The stride is a hard per-task budget: splitOptAndCodeGenThin clamps the
+  // splitter to it, so the partition count can never exceed it and the task-id
+  // layout T*Stride+p stays collision-free. Prefer the configured value so
+  // getMaxTasks() (called before run()) and runThinLTO() agree on the stride.
+  unsigned Stride = getThinLTOSplitMaxPartitions();
+  if (Stride == 0) {
+    // No explicit count: size from the largest module's defined-symbol count,
+    // capped so a huge module cannot blow up the task-id space.
+    DenseMap<StringRef, GVSummaryMapTy> ModuleToDefinedGVSummaries(
+        ThinLTO.ModuleMap.size());
+    ThinLTO.CombinedIndex.collectDefinedGVSummariesPerModule(
+        ModuleToDefinedGVSummaries);
+    // Track the maximum in size_t and narrow only after the cap is applied, so
+    // an oversized count is capped instead of being truncated first.
+    size_t LargestModule = 1;
+    for (const auto &Mod : ModuleToDefinedGVSummaries)
+      LargestModule = std::max<size_t>(LargestModule, Mod.second.size());
+    // A cap of 0 would produce a zero stride: getMaxTasks() multiplies by the
+    // stride and would return 0, and the backend rejects a zero stride with a
+    // fatal error. Treat such a cap as 1 (one slot per task).
+    const unsigned Cap = std::max<unsigned>(1, MaxAutoThinLTOSplitPartitions);
+    Stride = unsigned(std::min<size_t>(LargestModule, Cap));
+  }
+  ThinLTOSplitTaskIdStrideCache = Stride;
+  return Stride;
+}
+}
+
unsigned LTO::getMaxTasks() const {
CalledGetMaxTasks = true;
- auto ModuleCount = ThinLTO.ModulesToCompile ? ThinLTO.ModulesToCompile->size()
- : ThinLTO.ModuleMap.size();
- return RegularLTO.ParallelCodeGenParallelismLevel + ModuleCount;
+ uint64_t ModuleCount = ThinLTO.ModulesToCompile
+ ? ThinLTO.ModulesToCompile->size()
+ : ThinLTO.ModuleMap.size();
+ // Split codegen reports partition `p` of task `T` as id `T*Stride+p`, whose
+ // max is `(Parallel+ModuleCount)*Stride - 1`; size the table for that. With
+ // Stride == 1 (no expansion) this is the usual `Parallel + ModuleCount`.
+ uint64_t Stride = getThinLTOSplitTaskIdStride();
+ uint64_t MaxTasks =
+ (uint64_t(RegularLTO.ParallelCodeGenParallelismLevel) + ModuleCount) *
+ Stride;
+ // Refuse to truncate the unsigned task id (would under-size the client table
+ // and let a later partition write out of bounds).
+ if (MaxTasks > std::numeric_limits<unsigned>::max())
+ report_fatal_error("ThinLTO split codegen task id space overflow; reduce "
+ "-thinlto-split-partitions or the number of inputs.");
+ return unsigned(MaxTasks);
}
// If only some of the modules were split, we cannot correctly handle
@@ -1654,9 +1703,12 @@ class InProcessThinBackend : public CGThinBackend {
if (!Cache.isValid() || !CombinedIndex.modulePaths().count(ModuleID) ||
all_of(CombinedIndex.getModuleHash(ModuleID),
- [](uint32_t V) { return V == 0; }))
+ [](uint32_t V) { return V == 0; }) ||
+ Conf.AcceptsMultipleOutputsPerTask)
// Cache disabled or no entry for this module in the combined index or
- // no module hash.
+ // no module hash. Also bypass when a task may emit several objects
+ // (ThinLTO split): the cache stores exactly one object per task, so split
+ // partitions would collide on one entry; run uncached instead.
return RunThinBackend(AddStream);
// The module may be cached, this helps handling it.
@@ -1774,9 +1826,12 @@ class FirstRoundThinBackend : public InProcessThinBackend {
"Both caches for CG and IR should have matching availability");
if (!CGCache.isValid() || !CombinedIndex.modulePaths().count(ModuleID) ||
all_of(CombinedIndex.getModuleHash(ModuleID),
- [](uint32_t V) { return V == 0; }))
+ [](uint32_t V) { return V == 0; }) ||
+ Conf.AcceptsMultipleOutputsPerTask)
// Cache disabled or no entry for this module in the combined index or
- // no module hash.
+ // no module hash. Also bypass the cache for ThinLTO split codegen (one
+ // logical task may emit multiple partition objects; see the comment in
+ // InProcessThinBackend::runThinLTOBackendThread).
return RunThinBackend(CGAddStream, IRAddStream);
// Get CGKey for caching object in CGCache.
@@ -1860,9 +1915,12 @@ class SecondRoundThinBackend : public InProcessThinBackend {
};
if (!Cache.isValid() || !CombinedIndex.modulePaths().count(ModuleID) ||
all_of(CombinedIndex.getModuleHash(ModuleID),
- [](uint32_t V) { return V == 0; }))
+ [](uint32_t V) { return V == 0; }) ||
+ Conf.AcceptsMultipleOutputsPerTask)
// Cache disabled or no entry for this module in the combined index or
- // no module hash.
+ // no module hash. Also bypass the cache for ThinLTO split codegen (one
+ // logical task may emit multiple partition objects; see the comment in
+ // InProcessThinBackend::runThinLTOBackendThread).
return RunThinBackend(AddStream);
// Get Key for caching the final object file in Cache with the combined
@@ -2066,6 +2124,11 @@ Error LTO::runThinLTO(AddStreamFn AddStream, FileCache Cache,
if (!ModuleToDefinedGVSummaries.count(Mod.first))
ModuleToDefinedGVSummaries.try_emplace(Mod.first);
+ // Lock in the (cached) stride from getMaxTasks() so the client's output table
+ // and the task ids the backends produce stay consistent.
+ if (Conf.UseExpandedThinLTOSplitTaskIds && Conf.ThinLTOSplitTaskIdStride == 0)
+ Conf.ThinLTOSplitTaskIdStride = getThinLTOSplitTaskIdStride();
+
FunctionImporter::ImportListsTy ImportLists(ThinLTO.ModuleMap.size());
DenseMap<StringRef, FunctionImporter::ExportSetTy> ExportLists(
ThinLTO.ModuleMap.size());
diff --git a/llvm/lib/LTO/LTOBackend.cpp b/llvm/lib/LTO/LTOBackend.cpp
index ef78f1fa8ac3a..c65187c838bb6 100644
--- a/llvm/lib/LTO/LTOBackend.cpp
+++ b/llvm/lib/LTO/LTOBackend.cpp
@@ -34,10 +34,8 @@
#include "llvm/Plugins/PassPlugin.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/FileSystem.h"
-#include "llvm/Support/FileUtilities.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Path.h"
-#include "llvm/Support/Program.h"
#include "llvm/Support/ThreadPool.h"
#include "llvm/Support/ToolOutputFile.h"
#include "llvm/Support/VirtualFileSystem.h"
@@ -49,6 +47,7 @@
#include "llvm/Transforms/Utils/SplitModule.h"
#include "llvm/Transforms/Utils/SplitModuleCG.h"
#include <filesystem>
+#include <limits>
#include <optional>
using namespace llvm;
@@ -105,6 +104,41 @@ namespace llvm {
extern cl::opt<bool> NoPGOWarnMismatch;
}
+bool lto::isThinLTOSplitEnabled() {
+  // Report the current value of the ThinLTOSplit flag to LTO clients.
+  return ThinLTOSplit;
+}
+
+unsigned lto::getThinLTOSplitMaxPartitions() {
+  // With split on, hand back ThinLTOSplitPartitions unchanged. A value of 0
+  // means the count is decided per module from the call graph, so the client
+  // gets no static upper bound in that case.
+  if (ThinLTOSplit)
+    return ThinLTOSplitPartitions;
+  // Split off: exactly one object per task.
+  return 1;
+}
+
+// Maps (LogicalTask, PartitionId) to the task id passed on to opt/codegen.
+// With expanded ids the result is LogicalTask * stride + PartitionId, with
+// fatal errors for a zero stride, a partition id outside the stride, or a
+// result that does not fit in `unsigned`. Without expanded ids the result is
+// PartitionId for clients that accept multiple outputs per task, and
+// LogicalTask otherwise.
+static unsigned getThinLTOOutputTask(const Config &C, unsigned LogicalTask,
+                                     unsigned PartitionId) {
+  if (C.UseExpandedThinLTOSplitTaskIds) {
+    const unsigned Stride = C.ThinLTOSplitTaskIdStride;
+    if (Stride == 0)
+      report_fatal_error(
+          "ThinLTO split codegen expanded task ids require a non-zero stride.");
+    if (PartitionId >= Stride)
+      report_fatal_error(
+          "ThinLTO split codegen produced more partitions than the task id "
+          "stride allows.");
+
+    // Compute in 64 bits so an overflow is detected rather than wrapped.
+    const uint64_t Expanded = uint64_t(LogicalTask) * Stride + PartitionId;
+    if (Expanded > std::numeric_limits<unsigned>::max())
+      report_fatal_error("ThinLTO split codegen task id overflow.");
+    return unsigned(Expanded);
+  }
+
+  if (C.AcceptsMultipleOutputsPerTask)
+    return PartitionId;
+  return LogicalTask;
+}
+
+// Task id for a module that is emitted as a single object: it is reported as
+// partition 0 of its logical task.
+static unsigned getThinLTOSingleOutputTask(const Config &C,
+                                           unsigned LogicalTask) {
+  constexpr unsigned OnlyPartition = 0;
+  return getThinLTOOutputTask(C, LogicalTask, OnlyPartition);
+}
+
[[noreturn]] static void reportOpenError(StringRef Path, Twine Msg) {
errs() << "failed to open " << Path << ": " << Msg << '\n';
errs().flush();
@@ -486,6 +520,17 @@ static void codegen(const Config &Conf, TargetMachine *TM,
DwoFile = Conf.DwoDir;
sys::path::append(DwoFile, std::to_string(Task) + ".dwo");
TM->Options.MCOptions.SplitDwarfFile = std::string(DwoFile);
+ } else if (!Conf.SplitDwarfOutputStem.empty()) {
+ DwoFile = (Twine(Conf.SplitDwarfOutputStem) + ".thinlto-split." +
+ Twine(Task) + ".dwo").str();
+ // Ensure the parent directory exists (same directory as the .o output).
+ SmallString<128> DwoParent(sys::path::parent_path(DwoFile));
+ if (!DwoParent.empty()) {
+ if (auto EC = llvm::sys::fs::create_directories(DwoParent))
+ report_fatal_error(Twine("Failed to create directory ") + DwoParent +
+ ": " + EC.message());
+ }
+ TM->Options.MCOptions.SplitDwarfFile = std::string(DwoFile);
} else
TM->Options.MCOptions.SplitDwarfFile = Conf.SplitDwarfFile;
@@ -567,40 +612,6 @@ static bool HasLargeCG(Module &Mod, const ModuleSummaryIndex &CombinedIndex) {
return true;
}
-struct TaskIdAllocator {
- using TaskId = unsigned;
-
- // Use the most significant bit (MSB) as a namespace tag.
- // - Original ThinLTO backend tasks are expected to have MSB == 0.
- // - Split partitions allocated by this allocator always have MSB == 1.
- // This guarantees the two ID spaces never overlap.
- static constexpr TaskId tag() {
- return TaskId{1} << (std::numeric_limits<TaskId>::digits - 1);
- }
-
- // Monotonic sequence counter for split partitions (MSB must remain 0 here).
- std::atomic<TaskId> seq{0};
-
- // Allocate a globally unique TaskId for a split partition.
- // The returned ID is `tag() | seq`, so it lives in the MSB==1 namespace.
- TaskId alloc() {
- TaskId v = seq.fetch_add(1, std::memory_order_relaxed);
-
- // If the counter ever reaches the MSB, we'd overlap namespaces.
- // This indicates an overflow / too many partitions.
- if (v & tag())
- report_fatal_error("Partition TaskId overflow: seq reached the tag bit.");
-
- return tag() | v;
- }
-
- // Helper for sanity checks / debugging.
- static bool isPartition(TaskId id) { return (id & tag()) != 0; }
-};
-
-// Global allocator shared by all split partitions.
-static TaskIdAllocator gSplitTaskIds;
-
static bool splitOptAndCodeGenThin(unsigned task, const Config &C,
TargetMachine *TM, AddStreamFn AddStream,
unsigned ParallelCodeGenParallelismLevel,
@@ -614,37 +625,44 @@ static bool splitOptAndCodeGenThin(unsigned task, const Config &C,
static std::mutex PrintMutex;
- SplitModuleCG SplitModuleCG(Mod, CombinedIndex, ParallelCodeGenParallelismLevel);
- ParallelCodeGenParallelismLevel = SplitModuleCG.getPartitionNum();
-
- std::vector<std::string> TempObjectFiles(ParallelCodeGenParallelismLevel);
- std::vector<llvm::FileRemover> TempFileRemovers(ParallelCodeGenParallelismLevel);
+ // Clamp the splitter to the per-task stride budget that getMaxTasks() used to
+ // size the client's output table, so the partition count can never exceed it.
+ // Needed for the "auto" case (no -thinlto-split-partitions), where the
+ // splitter would otherwise pick one partition per call-graph root.
+ unsigned PartitionLimit = ParallelCodeGenParallelismLevel;
+ if (C.UseExpandedThinLTOSplitTaskIds && C.ThinLTOSplitTaskIdStride != 0 &&
+ (PartitionLimit == 0 || PartitionLimit > C.ThinLTOSplitTaskIdStride))
+ PartitionLimit = C.ThinLTOSplitTaskIdStride;
+
+ SplitModuleCG SplitModuleCG(Mod, CombinedIndex, PartitionLimit);
+ unsigned PartitionCount = SplitModuleCG.getPartitionNum();
+ if (!C.AcceptsMultipleOutputsPerTask && PartitionCount > 1)
+ report_fatal_error(
+ "The current LTO client does not support ThinLTO split codegen.");
+ if (C.UseExpandedThinLTOSplitTaskIds) {
+ if (C.ThinLTOSplitTaskIdStride == 0)
+ report_fatal_error(
+ "ThinLTO split codegen expanded task ids require a non-zero stride.");
+ if (PartitionCount > C.ThinLTOSplitTaskIdStride)
+ report_fatal_error(
+ "ThinLTO split codegen produced more partitions than the task id "
+ "stride allows.");
+ }
+ ParallelCodeGenParallelismLevel = PartitionCount;
const auto HandleModulePartition = [&](std::unique_ptr<Module> MPart,
unsigned PartitionId) {
- unsigned CurrentThreadId, UniqueTaskId;
+ unsigned CurrentThreadId;
{
std::lock_guard<std::mutex> Lock(PrintMutex);
CurrentThreadId = ThreadCount++;
-
- // In distributed ThinLTO, `task` may be a sentinel (e.g. -1 cast to
- // unsigned), which becomes UINT_MAX and naturally has MSB==1. Treat it
- // as "no base task id" and don't enforce the namespace check on it.
- //
- // We do not rely on the incoming `task` for partition uniqueness: split
- // partitions get a dedicated UniqueTaskId allocated below.
- if (task != std::numeric_limits<unsigned>::max()) {
- assert(!TaskIdAllocator::isPartition(task) &&
- "Original ThinLTO TaskId unexpectedly overlaps the partition "
- "namespace");
- }
- UniqueTaskId = gSplitTaskIds.alloc();
}
+ unsigned PartitionTask = getThinLTOOutputTask(C, task, PartitionId);
std::unique_ptr<TargetMachine> ThreadTM = createTargetMachine(C, T, *MPart);
if (DoOpt) {
- if (!opt(C, ThreadTM.get(), UniqueTaskId, *MPart, /*IsThinLTO=*/true,
+ if (!opt(C, ThreadTM.get(), PartitionTask, *MPart, /*IsThinLTO=*/true,
/*ExportSummary=*/nullptr, /*ImportSummary=*/&CombinedIndex,
CmdArgs, BitcodeLibFuncs)) {
report_fatal_error("Failed to gen opt for split mod in thread.");
@@ -655,10 +673,10 @@ static bool splitOptAndCodeGenThin(unsigned task, const Config &C,
// running `opt()`. We're not reaching here as it's bailed out earlier
// with `CodeGenOnly` which has been set in `SecondRoundThinBackend`.
if (IRAddStream)
- cgdata::saveModuleForTwoRounds(*MPart, task + CurrentThreadId,
+ cgdata::saveModuleForTwoRounds(*MPart, PartitionTask,
IRAddStream);
}
-
+
// Rename the GlobalValues whose internal is changed to external. That's
// can avoid duplicate symbols.
auto PromotedRenames = SplitModuleCG.getPromotedRenames();
@@ -669,88 +687,11 @@ static bool splitOptAndCodeGenThin(unsigned task, const Config &C,
}
}
- auto splitStream = [&](unsigned task, const Twine &moduleName)
- -> Expected<std::unique_ptr<CachedFileStream>> {
- int FD;
- SmallString<128> TempFilename;
- if (std::error_code EC = sys::fs::createTemporaryFile(
- "thinlto-split", "o", FD, TempFilename))
- return errorCodeToError(EC);
-
- TempObjectFiles[PartitionId] = std::string(TempFilename.str());
- TempFileRemovers[PartitionId].setFile(TempObjectFiles[PartitionId]);
-
- auto OS = std::make_unique<raw_fd_ostream>(
- FD, true, /*CloseOnDestruct*/true);
-
- auto Stream = std::make_unique<CachedFileStream>(
- std::move(OS), std::string(TempFilename.str()));
-
- return std::move(Stream);
- };
-
- codegen(C, ThreadTM.get(), splitStream, UniqueTaskId, *MPart,
- CombinedIndex);
+ codegen(C, ThreadTM.get(), AddStream, PartitionTask, *MPart, CombinedIndex);
};
SplitModuleCG.SplitModule(HandleModulePartition, C);
- // Use ld.lld to combine the partitions into a object.
- if (TempObjectFiles.empty()) {
- llvm::errs() << "TempObjectFiles.empty()\n";
- return true;
- }
-
- auto FinalStream = AddStream(task, Mod.getModuleIdentifier());
- if (!FinalStream)
- report_fatal_error("Failed to open final output stream");
-
- SmallString<128> MergedFilename;
- if (sys::fs::createTemporaryFile("thinlto-merged", "o", MergedFilename))
- report_fatal_error("Failed to create merged temp file.");
- llvm::FileRemover MergedFileRemover(MergedFilename);
-
- std::vector<StringRef> Args;
- std::string LinkerPath = "";
- if (auto Path = sys::findProgramByName("ld.lld"))
- LinkerPath = *Path;
- else if (auto Path = sys::findProgramByName("ld"))
- LinkerPath = *Path;
-
- if (LinkerPath.empty())
- report_fatal_error("Cannot find linkeer (ld or ld.lld) to merge partitions.");
-
- Args.push_back(LinkerPath);
- Args.push_back("-r");
- Args.push_back("-o");
- Args.push_back(MergedFilename);
-
- for (const auto &File : TempObjectFiles)
- Args.push_back(File);
-
- std::string ErrMsg;
- int Result = sys::ExecuteAndWait(LinkerPath, Args, /*Env=*/std::nullopt,
- /*Redirects=*/{}, /*SecondsToWait=*/0,
- /*MemoryLimit=*/0, &ErrMsg);
-
- if (Result != 0) {
- errs() << "Linker failed: " << ErrMsg << "\n";
- report_fatal_error("Failed to merge split objects.");
- }
-
- {
- std::unique_ptr<CachedFileStream> &FinalFileStream = *FinalStream;
- auto BufferOrErr = MemoryBuffer::getFile(MergedFilename);
- if (!BufferOrErr)
- report_fatal_error("Failed to read merged object.");
-
- FinalFileStream->OS->write(BufferOrErr.get()->getBufferStart(),
- BufferOrErr.get()->getBufferSize());
- if (Error Err = FinalFileStream->commit()) {
- report_fatal_error(Twine("Failed to commit final file stream: ") +
- toString(std::move(Err)));
- }
- }
return true;
}
@@ -939,7 +880,8 @@ Error lto::thinBackend(const Config &Conf, unsigned Task, AddStreamFn AddStream,
else
// If CodeGenOnly is set, we only perform code generation and skip
// optimization. This value may differ from Conf.CodeGenOnly.
- codegen(Conf, TM.get(), AddStream, Task, Mod, CombinedIndex);
+ codegen(Conf, TM.get(), AddStream, getThinLTOSingleOutputTask(Conf, Task),
+ Mod, CombinedIndex);
return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile));
}
@@ -968,7 +910,8 @@ Error lto::thinBackend(const Config &Conf, unsigned Task, AddStreamFn AddStream,
if (IRAddStream)
cgdata::saveModuleForTwoRounds(Mod, Task, IRAddStream);
- codegen(Conf, TM, AddStream, Task, Mod, CombinedIndex);
+ codegen(Conf, TM, AddStream, getThinLTOSingleOutputTask(Conf, Task),
+ Mod, CombinedIndex);
}
return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile));
};
diff --git a/llvm/lib/Transforms/Utils/SplitModuleCG.cpp b/llvm/lib/Transforms/Utils/SplitModuleCG.cpp
index debdddfb79041..ae345924e4074 100644
--- a/llvm/lib/Transforms/Utils/SplitModuleCG.cpp
+++ b/llvm/lib/Transforms/Utils/SplitModuleCG.cpp
@@ -32,6 +32,11 @@ static void externalize(GlobalValue *GV) {
GV->setName("__llvmsplit_unnamed");
}
+// Clears the subprogram attachment of every function that is only a
+// declaration in MPart; functions with a body are left untouched.
+static void dealWithDeclareDebugInfo(Module &MPart) {
+  for (Function &Fn : MPart) {
+    if (!Fn.isDeclaration())
+      continue;
+    Fn.setSubprogram(nullptr);
+  }
+}
} // namespace
std::vector<DenseSet<const Function *>> SplitModuleCG::doPartitioning() {
@@ -109,6 +114,7 @@ void SplitModuleCG::calculateFunctionCosts() {
void SplitModuleCG::dealWithMpart(Module &MPart, unsigned I,
function_ref<bool(const GlobalValue *)> NeedsConservativeImport) {
+ dealWithDeclareDebugInfo(MPart);
// collect symbols to rename
auto checkPromoted = [&](const GlobalValue &GV) {
// now is external (not local), but not in external set.
>From b155e8b5fa9b687a2765a0179bfe6a09424f631c Mon Sep 17 00:00:00 2001
From: maojiaping <maojiaping1 at huawei.com>
Date: Thu, 11 Jun 2026 10:15:49 +0800
Subject: [PATCH 6/7] [LTO][SplitModuleCG] Enable split module by callgraph for
FullLTO
- Rename ThinLTOSplit to LTOSplitByCG for clarity
- Add IsThinLTO parameter to splitOptAndCodeGenThin with default true
- Enable splitOptAndCodeGenThin for FullLTO via else if branch
- Adapt partition task calculation for FullLTO non-ThinLTO mode
---
clang/lib/Driver/ToolChains/CommonArgs.cpp | 4 +-
.../split-module-by-cg-fulllto.c | 27 ++++++++
.../thinlto-split/split-output-list-dwo.ll | 6 +-
.../thinlto-split/split-output-list.ll | 4 +-
.../thinlto-split/split-promoted-rename.ll | 2 +-
.../thinlto-split-merge-realistic.ll | 10 +--
clang/test/Driver/thinlto-split-merge.c | 16 ++---
llvm/lib/LTO/LTOBackend.cpp | 67 ++++++++++---------
8 files changed, 82 insertions(+), 54 deletions(-)
create mode 100644 clang/test/CodeGen/thinlto-split/split-module-by-cg-fulllto.c
diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp
index e52d8212fd496..23177ba29b734 100644
--- a/clang/lib/Driver/ToolChains/CommonArgs.cpp
+++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp
@@ -946,9 +946,9 @@ bool tools::isThinLTOSplitEnabled(const ArgList &Args) {
for (const Arg *A : Args.filtered(options::OPT_mllvm)) {
for (size_t I = 0, E = A->getNumValues(); I != E; ++I) {
StringRef V = A->getValue(I);
- if (V == "-thinlto-split" || V == "-thinlto-split=true")
+ if (V == "-lto-split-by-callgraph" || V == "-lto-split-by-callgraph=true")
Enabled = true;
- else if (V == "-thinlto-split=false")
+ else if (V == "-lto-split-by-callgraph=false")
Enabled = false;
}
}
diff --git a/clang/test/CodeGen/thinlto-split/split-module-by-cg-fulllto.c b/clang/test/CodeGen/thinlto-split/split-module-by-cg-fulllto.c
new file mode 100644
index 0000000000000..c24b0ddbeac24
--- /dev/null
+++ b/clang/test/CodeGen/thinlto-split/split-module-by-cg-fulllto.c
@@ -0,0 +1,27 @@
+// REQUIRES: aarch64-registered-target
+// Test that FullLTO with callgraph-based module splitting generates
+// multiple partitions with consistent symbol renaming.
+
+// RUN: %clang -flto=full -fuse-ld=lld -shared \
+// RUN: -o %t.o %s \
+// RUN: -Wl,-mllvm,-lto-split-by-callgraph=true \
+// RUN: -Wl,--lto-partitions=2 \
+// RUN: -Wl,--save-temps=prelink
+// RUN: llvm-nm %t.o.lto.o | FileCheck %s --check-prefix=CHECK0
+// RUN: llvm-nm %t.o.lto.1.o | FileCheck %s --check-prefix=CHECK1
+
+// CHECK0-DAG: T caller_b
+// CHECK0-DAG: T promoted_internal
+
+// CHECK1-DAG: T caller_a
+// CHECK1-DAG: U promoted_internal
+
+static void promoted_internal(void) {}
+
+void caller_a(void) {
+ promoted_internal();
+}
+
+void caller_b(void) {
+ promoted_internal();
+}
\ No newline at end of file
diff --git a/clang/test/CodeGen/thinlto-split/split-output-list-dwo.ll b/clang/test/CodeGen/thinlto-split/split-output-list-dwo.ll
index e3064684e542a..2af7ceb72bbca 100644
--- a/clang/test/CodeGen/thinlto-split/split-output-list-dwo.ll
+++ b/clang/test/CodeGen/thinlto-split/split-output-list-dwo.ll
@@ -30,7 +30,7 @@
; RUN: -split-dwarf-output %t.split.o.dwo \
; RUN: -o %t.split.o -x ir %t.o \
; RUN: -debug-info-kind=constructor -dwarf-version=5 \
-; RUN: -mllvm -thinlto-split=true \
+; RUN: -mllvm -lto-split-by-callgraph=true \
; RUN: -mllvm -thinlto-split-partitions=2 \
; RUN: -mllvm -thinlto-split-module-size-threshold=0 \
; RUN: -mllvm -thinlto-split-module-size-rate-threshold=2.0
@@ -102,7 +102,7 @@
; RUN: -split-dwarf-file %t.single.o.dwo \
; RUN: -o %t.single.o -x ir %t.o \
; RUN: -debug-info-kind=constructor -dwarf-version=5 \
-; RUN: -mllvm -thinlto-split=true \
+; RUN: -mllvm -lto-split-by-callgraph=true \
; RUN: -mllvm -thinlto-split-partitions=2 \
; RUN: -mllvm -thinlto-split-module-size-threshold=0 \
; RUN: -mllvm -thinlto-split-module-size-rate-threshold=2.0
@@ -127,7 +127,7 @@
; RUN: -split-dwarf-output %t.nosplit.o.dwo \
; RUN: -o %t.nosplit.o -x ir %t.o \
; RUN: -debug-info-kind=constructor -dwarf-version=5 \
-; RUN: -mllvm -thinlto-split=false
+; RUN: -mllvm -lto-split-by-callgraph=false
; RUN: ls %t.nosplit.o.dwo
; RUN: not ls %t.nosplit.o.thinlto-split.0.dwo 2>/dev/null
diff --git a/clang/test/CodeGen/thinlto-split/split-output-list.ll b/clang/test/CodeGen/thinlto-split/split-output-list.ll
index 88abd8dd87019..a9bbe074ef345 100644
--- a/clang/test/CodeGen/thinlto-split/split-output-list.ll
+++ b/clang/test/CodeGen/thinlto-split/split-output-list.ll
@@ -9,7 +9,7 @@
; RUN: %clang_cc1 -triple aarch64-unknown-linux-gnu \
; RUN: -emit-obj -fthinlto-index=%t.o.thinlto.bc \
; RUN: -o %t.split.o -x ir %t.o \
-; RUN: -mllvm -thinlto-split=true \
+; RUN: -mllvm -lto-split-by-callgraph=true \
; RUN: -mllvm -thinlto-split-partitions=2 \
; RUN: -mllvm -thinlto-split-module-size-threshold=0 \
; RUN: -mllvm -thinlto-split-module-size-rate-threshold=2.0 \
@@ -21,7 +21,7 @@
; RUN: %clang_cc1 -triple aarch64-unknown-linux-gnu \
; RUN: -emit-obj -fthinlto-index=%t.o.thinlto.bc \
; RUN: -o %t.skip.o -x ir %t.o \
-; RUN: -mllvm -thinlto-split=true \
+; RUN: -mllvm -lto-split-by-callgraph=true \
; RUN: -mllvm -thinlto-split-partitions=2 \
; RUN: -thinlto-split-output-list=%t.skip.rsp
; RUN: FileCheck %s --check-prefix=SKIP-RSP --input-file=%t.skip.rsp
diff --git a/clang/test/CodeGen/thinlto-split/split-promoted-rename.ll b/clang/test/CodeGen/thinlto-split/split-promoted-rename.ll
index 2cf3aa41a9c75..19274c7c7ee72 100644
--- a/clang/test/CodeGen/thinlto-split/split-promoted-rename.ll
+++ b/clang/test/CodeGen/thinlto-split/split-promoted-rename.ll
@@ -11,7 +11,7 @@
; RUN: %clang_cc1 -triple aarch64-unknown-linux-gnu \
; RUN: -emit-obj -fthinlto-index=%t.o.thinlto.bc \
; RUN: -o %t.split.o -x ir %t.o \
-; RUN: -mllvm -thinlto-split=true \
+; RUN: -mllvm -lto-split-by-callgraph=true \
; RUN: -mllvm -thinlto-split-partitions=1 \
; RUN: -mllvm -thinlto-split-module-size-threshold=0 \
; RUN: -thinlto-split-output-list=%t.split.rsp
diff --git a/clang/test/CodeGen/thinlto-split/thinlto-split-merge-realistic.ll b/clang/test/CodeGen/thinlto-split/thinlto-split-merge-realistic.ll
index c024e4c37f2fb..94c0beee69e23 100644
--- a/clang/test/CodeGen/thinlto-split/thinlto-split-merge-realistic.ll
+++ b/clang/test/CodeGen/thinlto-split/thinlto-split-merge-realistic.ll
@@ -33,7 +33,7 @@
; RUN: -emit-obj -fthinlto-index=%t.o.thinlto.bc \
; RUN: -thinlto-split-output-list=%t.cc1.rsp \
; RUN: -o %t.cc1.o -x ir %t.o \
-; RUN: -mllvm -thinlto-split=true \
+; RUN: -mllvm -lto-split-by-callgraph=true \
; RUN: -mllvm -thinlto-split-partitions=2 \
; RUN: -mllvm -thinlto-split-module-size-threshold=0 \
; RUN: -mllvm -thinlto-split-module-size-rate-threshold=2.0
@@ -76,7 +76,7 @@
; RUN: %clang -### -target aarch64-unknown-linux-gnu \
; RUN: -B%S/Inputs/lld \
; RUN: -c -fthinlto-index=%t.o.thinlto.bc -x ir %t.o -o %t.driver.o \
-; RUN: -mllvm -thinlto-split=true \
+; RUN: -mllvm -lto-split-by-callgraph=true \
; RUN: -mllvm -thinlto-split-partitions=2 \
; RUN: -mllvm -thinlto-split-module-size-threshold=0 \
; RUN: -mllvm -thinlto-split-module-size-rate-threshold=2.0 2>&1 | FileCheck %s --check-prefix=DRIVER
@@ -90,7 +90,7 @@
; RUN: %clang -### -target aarch64-unknown-linux-gnu \
; RUN: -B%S/Inputs/lld \
; RUN: -c -fthinlto-index=%t.o.thinlto.bc -x ir %t.o -o %t.nosplit_driver.o \
-; RUN: -mllvm -thinlto-split=false 2>&1 | FileCheck %s --check-prefix=NOSPLIT-DRIVER
+; RUN: -mllvm -lto-split-by-callgraph=false 2>&1 | FileCheck %s --check-prefix=NOSPLIT-DRIVER
; NOSPLIT-DRIVER: "-cc1"
; NOSPLIT-DRIVER-NOT: thinlto-split-output-list
@@ -104,7 +104,7 @@
; RUN: %clang -### -target aarch64-unknown-linux-gnu \
; RUN: -B%S/Inputs/lld \
; RUN: -save-temps -c -fthinlto-index=%t.o.thinlto.bc -x ir %t.o -o %t.save.o \
-; RUN: -mllvm -thinlto-split=true \
+; RUN: -mllvm -lto-split-by-callgraph=true \
; RUN: -mllvm -thinlto-split-partitions=2 \
; RUN: -mllvm -thinlto-split-module-size-threshold=0 \
; RUN: -mllvm -thinlto-split-module-size-rate-threshold=2.0 2>&1 | FileCheck %s --check-prefix=SAVE-TEMPS
@@ -118,7 +118,7 @@
; Verify ordinary -save-temps without split still uses -S (not collapsed)
; RUN: %clang -### -target aarch64-unknown-linux-gnu \
; RUN: -save-temps -c -fthinlto-index=%t.o.thinlto.bc -x ir %t.o -o %t.save_nosplit.o \
-; RUN: -mllvm -thinlto-split=false 2>&1 | FileCheck %s --check-prefix=SAVE-TEMPS-NOSPLIT
+; RUN: -mllvm -lto-split-by-callgraph=false 2>&1 | FileCheck %s --check-prefix=SAVE-TEMPS-NOSPLIT
; SAVE-TEMPS-NOSPLIT: "-cc1"
; SAVE-TEMPS-NOSPLIT-SAME: "-S"
diff --git a/clang/test/Driver/thinlto-split-merge.c b/clang/test/Driver/thinlto-split-merge.c
index eb062754729ee..68a0b292964c6 100644
--- a/clang/test/Driver/thinlto-split-merge.c
+++ b/clang/test/Driver/thinlto-split-merge.c
@@ -1,31 +1,31 @@
// RUN: %clang -### -target aarch64-unknown-linux-gnu -B%S/Inputs/lld \
// RUN: -c -fthinlto-index=foo.thinlto.bc -x ir %s -o foo.o \
-// RUN: -mllvm -thinlto-split=true \
+// RUN: -mllvm -lto-split-by-callgraph=true \
// RUN: -mllvm -thinlto-split-partitions=2 2>&1 | FileCheck %s --check-prefix=MERGE
// RUN: %clang -### -target aarch64-unknown-linux-gnu -B%S/Inputs/lld \
// RUN: -c -fthinlto-index=foo.thinlto.bc -x ir %s -o foo.o \
-// RUN: -mllvm -thinlto-split=true 2>&1 | FileCheck %s --check-prefix=LLD
+// RUN: -mllvm -lto-split-by-callgraph=true 2>&1 | FileCheck %s --check-prefix=LLD
// RUN: %clang -### -target aarch64-unknown-linux-gnu -B%S/Inputs/lld \
// RUN: -c -fthinlto-index=foo.thinlto.bc -x ir %s -o foo.o \
-// RUN: -mllvm -thinlto-split=false 2>&1 | FileCheck %s --check-prefix=NOSPLIT
+// RUN: -mllvm -lto-split-by-callgraph=false 2>&1 | FileCheck %s --check-prefix=NOSPLIT
// RUN: %clang -### -target aarch64-unknown-linux-gnu -B%S/Inputs/lld \
// RUN: -save-temps -c -fthinlto-index=foo.thinlto.bc -x ir %s -o foo.o \
-// RUN: -mllvm -thinlto-split=true 2>&1 | FileCheck %s --check-prefix=SAVE-TEMPS
+// RUN: -mllvm -lto-split-by-callgraph=true 2>&1 | FileCheck %s --check-prefix=SAVE-TEMPS
// RUN: rm -rf %t.empty
// RUN: mkdir -p %t.empty
// RUN: not env PATH= %clang -### -ccc-install-dir %t.empty \
// RUN: -target aarch64-unknown-linux-gnu \
// RUN: -c -fthinlto-index=foo.thinlto.bc -x ir %s -o foo.o \
-// RUN: -mllvm -thinlto-split=true 2>&1 | FileCheck %s --check-prefix=MISSING-LLD
+// RUN: -mllvm -lto-split-by-callgraph=true 2>&1 | FileCheck %s --check-prefix=MISSING-LLD
// RUN: %clang -### -target x86_64-unknown-freebsd \
// RUN: -c -fthinlto-index=foo.thinlto.bc -x ir %s -o foo.o \
-// RUN: -mllvm -thinlto-split=true 2>&1 | FileCheck %s --check-prefix=FREEBSD
+// RUN: -mllvm -lto-split-by-callgraph=true 2>&1 | FileCheck %s --check-prefix=FREEBSD
// RUN: %clang -### -target x86_64-unknown-fuchsia \
// RUN: -c -fthinlto-index=foo.thinlto.bc -x ir %s -o foo.o \
-// RUN: -mllvm -thinlto-split=true 2>&1 | FileCheck %s --check-prefix=FUCHSIA
+// RUN: -mllvm -lto-split-by-callgraph=true 2>&1 | FileCheck %s --check-prefix=FUCHSIA
// RUN: %clang -### -target x86_64-none-elf \
// RUN: -c -fthinlto-index=foo.thinlto.bc -x ir %s -o foo.o \
-// RUN: -mllvm -thinlto-split=true 2>&1 | FileCheck %s --check-prefix=BAREMETAL
+// RUN: -mllvm -lto-split-by-callgraph=true 2>&1 | FileCheck %s --check-prefix=BAREMETAL
// MERGE: "-cc1"
// MERGE-SAME: "-fthinlto-index=foo.thinlto.bc"
diff --git a/llvm/lib/LTO/LTOBackend.cpp b/llvm/lib/LTO/LTOBackend.cpp
index c65187c838bb6..71ea0c5125755 100644
--- a/llvm/lib/LTO/LTOBackend.cpp
+++ b/llvm/lib/LTO/LTOBackend.cpp
@@ -93,25 +93,25 @@ static cl::opt<float> ThinLTOSplitModuleSizeRateThreshold(
cl::desc("Whether to split in thinlto backend based on the ratio of "
"(callgraph size)/(module size)"));
-static cl::opt<unsigned> ThinLTOSplitPartitions(
+static cl::opt<unsigned> LTOSplitPartitions(
"thinlto-split-partitions", cl::Hidden, cl::init(0),
cl::desc("Control split to how many partitions in thinlto backend."));
-static cl::opt<bool> ThinLTOSplit("thinlto-split", cl::init(false),
+static cl::opt<bool> LTOSplitByCG("lto-split-by-callgraph", cl::init(false),
cl::desc("Enable split module in thinlto backend."));
namespace llvm {
extern cl::opt<bool> NoPGOWarnMismatch;
}
-bool lto::isThinLTOSplitEnabled() { return ThinLTOSplit; }
+bool lto::isThinLTOSplitEnabled() { return LTOSplitByCG; }
unsigned lto::getThinLTOSplitMaxPartitions() {
- if (!ThinLTOSplit)
+ if (!LTOSplitByCG)
return 1;
- // ThinLTOSplitPartitions==0 means "decide per module from the call graph",
+ // LTOSplitPartitions==0 means "decide per module from the call graph",
// so there is no static upper bound the client can rely on.
- return ThinLTOSplitPartitions;
+ return LTOSplitPartitions;
}
static unsigned getThinLTOOutputTask(const Config &C, unsigned LogicalTask,
@@ -190,7 +190,7 @@ Error Config::addSaveTemps(std::string OutputFileName, bool UseInputModulePath,
// named from the provided OutputFileName with the Task ID appended.
if (M.getModuleIdentifier() == "ld-temp.o" || !UseInputModulePath) {
PathPrefix = OutputFileName;
- if (ThinLTOSplit)
+ if (LTOSplitByCG)
PathPrefix += extract_filename(M.getSourceFileName()) + ".";
if (Task != (unsigned)-1)
PathPrefix += utostr(Task) + ".";
@@ -619,27 +619,26 @@ static bool splitOptAndCodeGenThin(unsigned task, const Config &C,
const ModuleSummaryIndex &CombinedIndex,
const std::vector<uint8_t> &CmdArgs,
bool DoOpt, AddStreamFn IRAddStream,
- ArrayRef<StringRef> &BitcodeLibFuncs) {
+ ArrayRef<StringRef> &BitcodeLibFuncs,
+ bool IsThinLTO = true) {
unsigned ThreadCount = 0;
const Target *T = &TM->getTarget();
-
- static std::mutex PrintMutex;
-
// Clamp the splitter to the per-task stride budget that getMaxTasks() used to
// size the client's output table, so the partition count can never exceed it.
// Needed for the "auto" case (no -thinlto-split-partitions), where the
// splitter would otherwise pick one partition per call-graph root.
unsigned PartitionLimit = ParallelCodeGenParallelismLevel;
- if (C.UseExpandedThinLTOSplitTaskIds && C.ThinLTOSplitTaskIdStride != 0 &&
+ if (IsThinLTO &&
+ C.UseExpandedThinLTOSplitTaskIds && C.ThinLTOSplitTaskIdStride != 0 &&
(PartitionLimit == 0 || PartitionLimit > C.ThinLTOSplitTaskIdStride))
PartitionLimit = C.ThinLTOSplitTaskIdStride;
SplitModuleCG SplitModuleCG(Mod, CombinedIndex, PartitionLimit);
unsigned PartitionCount = SplitModuleCG.getPartitionNum();
- if (!C.AcceptsMultipleOutputsPerTask && PartitionCount > 1)
+ if (IsThinLTO && !C.AcceptsMultipleOutputsPerTask && PartitionCount > 1)
report_fatal_error(
"The current LTO client does not support ThinLTO split codegen.");
- if (C.UseExpandedThinLTOSplitTaskIds) {
+ if (IsThinLTO && C.UseExpandedThinLTOSplitTaskIds) {
if (C.ThinLTOSplitTaskIdStride == 0)
report_fatal_error(
"ThinLTO split codegen expanded task ids require a non-zero stride.");
@@ -652,13 +651,8 @@ static bool splitOptAndCodeGenThin(unsigned task, const Config &C,
const auto HandleModulePartition = [&](std::unique_ptr<Module> MPart,
unsigned PartitionId) {
- unsigned CurrentThreadId;
- {
- std::lock_guard<std::mutex> Lock(PrintMutex);
- CurrentThreadId = ThreadCount++;
- }
-
- unsigned PartitionTask = getThinLTOOutputTask(C, task, PartitionId);
+ unsigned PartitionTask = IsThinLTO ?
+ getThinLTOOutputTask(C, task, PartitionId) : PartitionId;
std::unique_ptr<TargetMachine> ThreadTM = createTargetMachine(C, T, *MPart);
if (DoOpt) {
@@ -677,13 +671,15 @@ static bool splitOptAndCodeGenThin(unsigned task, const Config &C,
IRAddStream);
}
- // Rename the GlobalValues whose internal is changed to external. That's
- // can avoid duplicate symbols.
- auto PromotedRenames = SplitModuleCG.getPromotedRenames();
- for (auto &GV : MPart->global_values()) {
- if (auto It = PromotedRenames.find(GV.getName());
- It != PromotedRenames.end()) {
- GV.setName(It->second);
+ if (IsThinLTO) {
+ // Rename the GlobalValues whose linkage was promoted from internal to
+ // external. This avoids duplicate symbols in ThinLTO.
+ auto PromotedRenames = SplitModuleCG.getPromotedRenames();
+ for (auto &GV : MPart->global_values()) {
+ if (auto It = PromotedRenames.find(GV.getName());
+ It != PromotedRenames.end()) {
+ GV.setName(It->second);
+ }
}
}
@@ -796,6 +792,11 @@ Error lto::backend(const Config &C, AddStreamFn AddStream,
if (ParallelCodeGenParallelismLevel == 1) {
codegen(C, TM.get(), AddStream, 0, Mod, CombinedIndex);
+ } else if (LTOSplitByCG) {
+ splitOptAndCodeGenThin(/*Task*/0, C, TM.get(), AddStream,
+ ParallelCodeGenParallelismLevel, Mod, CombinedIndex,
+ /*CmdArgs*/ std::vector<uint8_t>(), /*DoOpt*/false,
+ AddStreamFn(), BitcodeLibFuncs, false);
} else {
splitCodeGen(C, TM.get(), AddStream, ParallelCodeGenParallelismLevel, Mod,
CombinedIndex);
@@ -860,7 +861,7 @@ Error lto::thinBackend(const Config &Conf, unsigned Task, AddStreamFn AddStream,
Mod.setPartialSampleProfileRatio(CombinedIndex);
bool ProfitableToSplit = true;
- if (ThinLTOSplit) {
+ if (LTOSplitByCG) {
if (!canDoSplitModule(Mod) || !HasLargeCG(Mod, CombinedIndex)) {
ProfitableToSplit = false;
LLVM_DEBUG(dbgs() << "warning: thinlto split not enable for module: "
@@ -873,9 +874,9 @@ Error lto::thinBackend(const Config &Conf, unsigned Task, AddStreamFn AddStream,
LLVM_DEBUG(dbgs() << "Running ThinLTO\n");
if (CodeGenOnly) {
- if (ThinLTOSplit && ProfitableToSplit)
+ if (LTOSplitByCG && ProfitableToSplit)
splitOptAndCodeGenThin(Task, Conf, TM.get(), AddStream,
- ThinLTOSplitPartitions, Mod, CombinedIndex,
+ LTOSplitPartitions, Mod, CombinedIndex,
CmdArgs, false, IRAddStream, BitcodeLibFuncs);
else
// If CodeGenOnly is set, we only perform code generation and skip
@@ -891,9 +892,9 @@ Error lto::thinBackend(const Config &Conf, unsigned Task, AddStreamFn AddStream,
auto OptimizeAndCodegen =
[&](Module &Mod, TargetMachine *TM,
LLVMRemarkFileHandle DiagnosticOutputFile) {
- if (ThinLTOSplit && ProfitableToSplit) {
+ if (LTOSplitByCG && ProfitableToSplit) {
if (!splitOptAndCodeGenThin(
- Task, Conf, TM, AddStream, ThinLTOSplitPartitions, Mod,
+ Task, Conf, TM, AddStream, LTOSplitPartitions, Mod,
CombinedIndex, CmdArgs, true, IRAddStream, BitcodeLibFuncs))
return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile));
} else {
>From 7e46545d750b787464b2831281c738d8daa89206 Mon Sep 17 00:00:00 2001
From: maojiaping <maojiaping1 at huawei.com>
Date: Fri, 12 Jun 2026 15:17:07 +0800
Subject: [PATCH 7/7] [SplitModuleCG] Fix warning errors
- Remove unused variable.
- Fix constructor initialization order to match class
declaration order (N, M, CG).
---
llvm/include/llvm/Transforms/Utils/SplitModuleCG.h | 6 ++----
llvm/lib/LTO/LTOBackend.cpp | 1 -
llvm/lib/Transforms/Utils/SplitModuleCG.cpp | 2 +-
3 files changed, 3 insertions(+), 6 deletions(-)
diff --git a/llvm/include/llvm/Transforms/Utils/SplitModuleCG.h b/llvm/include/llvm/Transforms/Utils/SplitModuleCG.h
index 956a1ea8030fe..9836376b94a82 100644
--- a/llvm/include/llvm/Transforms/Utils/SplitModuleCG.h
+++ b/llvm/include/llvm/Transforms/Utils/SplitModuleCG.h
@@ -26,7 +26,7 @@ class SimplifyCallGraph {
explicit SimplifyCallGraph(CallGraph &CG,
const ModuleSummaryIndex &CombinedIndex,
Module &M)
- : CG(CG), M(M) {
+ : CG(CG) {
createSimplifyCallGraph(CombinedIndex);
}
~SimplifyCallGraph() {};
@@ -74,14 +74,13 @@ class SimplifyCallGraph {
private:
CallGraph &CG;
- Module &M;
};
class SimplifyCallGraphNode {
public:
using CalledFunctionsSet = DenseSet<SimplifyCallGraphNode *>;
inline SimplifyCallGraphNode(SimplifyCallGraph *SCG, Function *F)
- : SCG(SCG), F(F) {}
+ : F(F) {}
SimplifyCallGraphNode(const SimplifyCallGraphNode &) = delete;
SimplifyCallGraphNode &operator=(const SimplifyCallGraphNode &) = delete;
@@ -118,7 +117,6 @@ class SimplifyCallGraphNode {
private:
friend class SimplifyCallGraph;
- SimplifyCallGraph *SCG;
Function *F;
DenseSet<SimplifyCallGraphNode *> CalledFunctions;
diff --git a/llvm/lib/LTO/LTOBackend.cpp b/llvm/lib/LTO/LTOBackend.cpp
index 71ea0c5125755..97c4018f13a64 100644
--- a/llvm/lib/LTO/LTOBackend.cpp
+++ b/llvm/lib/LTO/LTOBackend.cpp
@@ -621,7 +621,6 @@ static bool splitOptAndCodeGenThin(unsigned task, const Config &C,
bool DoOpt, AddStreamFn IRAddStream,
ArrayRef<StringRef> &BitcodeLibFuncs,
bool IsThinLTO = true) {
- unsigned ThreadCount = 0;
const Target *T = &TM->getTarget();
// Clamp the splitter to the per-task stride budget that getMaxTasks() used to
// size the client's output table, so the partition count can never exceed it.
diff --git a/llvm/lib/Transforms/Utils/SplitModuleCG.cpp b/llvm/lib/Transforms/Utils/SplitModuleCG.cpp
index ae345924e4074..e2ec77bf61bd4 100644
--- a/llvm/lib/Transforms/Utils/SplitModuleCG.cpp
+++ b/llvm/lib/Transforms/Utils/SplitModuleCG.cpp
@@ -305,7 +305,7 @@ void SplitModuleCG::SplitModule(ModuleCreationCallback ModuleCallback,
SplitModuleCG::SplitModuleCG(Module &M,
const ModuleSummaryIndex &CombinedIndex,
unsigned LimitPartition)
- : M(M), CG(M), N(LimitPartition) {
+ : N(LimitPartition), M(M), CG(M) {
// Track existing non-local symbols. This ensures that when we promote
// internal symbols to external for partitioning, we can handle renaming
// and avoid conflicts.
More information about the cfe-commits
mailing list