[clang] [lld] [llvm] [ThinLTO][Split] Split module for parallel compilation in backend (1/N) (PR #198702)

via cfe-commits cfe-commits at lists.llvm.org
Fri Jun 12 00:22:00 PDT 2026


https://github.com/mmjjpp updated https://github.com/llvm/llvm-project/pull/198702

>From d9fc0bd6f1d5767f87eb515a665e76f6d00d1ea4 Mon Sep 17 00:00:00 2001
From: maojiaping <maojiaping1 at huawei.com>
Date: Wed, 20 May 2026 11:22:30 +0800
Subject: [PATCH 1/7] [ThinLTO][Split] Split module for parallel compilation in
 backend

Add an interface for splitting a module by its callgraph. The interface
is called in the ThinLTO backend phase: the module is split into N
partitions (MParts), and opt and codegen are performed on the MParts in
parallel to implement parallel compilation in the ThinLTO backend.
---
 .../llvm/Transforms/Utils/SplitModuleCG.h     |  34 ++
 llvm/lib/LTO/LTOBackend.cpp                   | 292 +++++++++++++++++-
 llvm/lib/Transforms/Utils/CMakeLists.txt      |   1 +
 llvm/lib/Transforms/Utils/SplitModuleCG.cpp   |  26 ++
 4 files changed, 336 insertions(+), 17 deletions(-)
 create mode 100644 llvm/include/llvm/Transforms/Utils/SplitModuleCG.h
 create mode 100644 llvm/lib/Transforms/Utils/SplitModuleCG.cpp

diff --git a/llvm/include/llvm/Transforms/Utils/SplitModuleCG.h b/llvm/include/llvm/Transforms/Utils/SplitModuleCG.h
new file mode 100644
index 0000000000000..e60c4e931d40c
--- /dev/null
+++ b/llvm/include/llvm/Transforms/Utils/SplitModuleCG.h
@@ -0,0 +1,34 @@
+#ifndef LLVM_TRANSFORMS_UTILS_SPLITMODULECG_H
+#define LLVM_TRANSFORMS_UTILS_SPLITMODULECG_H
+
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/Analysis/ModuleSummaryAnalysis.h"
+#include "llvm/LTO/Config.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+
+namespace llvm {
+/// Splits the module M into N linkable partitions. The function ModuleCallback
+/// is called N times passing each individual partition as the MPart argument.
+class SplitModuleCG {
+public:
+  using ModuleCreationCallback =
+      function_ref<void(std::unique_ptr<Module> MPart, unsigned PartitionId)>;
+  SplitModuleCG(Module &M,
+                const ModuleSummaryIndex &CombinedIndex,
+                unsigned LimitPartition = 0);
+  void SplitModule(ModuleCreationCallback ModuleCallback,
+                   const llvm::lto::Config &C);
+
+  unsigned getPartitionNum() { return N; }
+
+  private:
+  unsigned N;
+  Module &M;
+  CallGraph CG;
+  DenseSet<const Function *> EntryFuncs;
+};
+
+} // end namespace llvm
+
+#endif // LLVM_TRANSFORMS_UTILS_SPLITMODULECG_H
diff --git a/llvm/lib/LTO/LTOBackend.cpp b/llvm/lib/LTO/LTOBackend.cpp
index 73697a9d0d446..11200ade0e8c0 100644
--- a/llvm/lib/LTO/LTOBackend.cpp
+++ b/llvm/lib/LTO/LTOBackend.cpp
@@ -34,8 +34,10 @@
 #include "llvm/Plugins/PassPlugin.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/FileSystem.h"
+#include "llvm/Support/FileUtilities.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/Path.h"
+#include "llvm/Support/Program.h"
 #include "llvm/Support/ThreadPool.h"
 #include "llvm/Support/ToolOutputFile.h"
 #include "llvm/Support/VirtualFileSystem.h"
@@ -45,6 +47,8 @@
 #include "llvm/Transforms/IPO/WholeProgramDevirt.h"
 #include "llvm/Transforms/Utils/FunctionImportUtils.h"
 #include "llvm/Transforms/Utils/SplitModule.h"
+#include "llvm/Transforms/Utils/SplitModuleCG.h"
+#include <filesystem>
 #include <optional>
 
 using namespace llvm;
@@ -80,6 +84,23 @@ static cl::list<std::string>
                              "path matches this for -save-temps options"),
                     cl::CommaSeparated, cl::Hidden);
 
+static cl::opt<unsigned> ThinLTOSplitModuleSizeThreshold(
+    "thinlto-split-module-size-threshold", cl::Hidden, cl::init(500),
+    cl::desc("Control the amount of whether split in thinlto backend"
+             "accroding to the size of a module."));
+
+static cl::opt<float> ThinLTOSplitModuleSizeRateThreshold(
+    "thinlto-split-module-size-rate-threshold", cl::Hidden, cl::init(0.5),
+    cl::desc("Whether to split in thinlto backend based on the ratio of "
+             "(callgraph size)/(module size)"));
+
+static cl::opt<unsigned> ThinLTOSplitPartitions(
+    "thinlto-split-partitions", cl::Hidden, cl::init(0),
+    cl::desc("Control split to how many partitions in thinlto backend."));
+
+static cl::opt<bool> ThinLTOSplit("thinlto-split", cl::init(false),
+			   cl::desc("Enable split module in thinlto backend."));
+
 namespace llvm {
 extern cl::opt<bool> NoPGOWarnMismatch;
 }
@@ -124,12 +145,19 @@ Error Config::addSaveTemps(std::string OutputFileName, bool UseInputModulePath,
       if (LinkerHook && !LinkerHook(Task, M))
         return false;
 
+      auto extract_filename = [](const std::string &path) -> std::string {
+        std::filesystem::path fs_path(path);
+        return fs_path.filename().string();
+      };
+
       std::string PathPrefix;
       // If this is the combined module (not a ThinLTO backend compile) or the
       // user hasn't requested using the input module's path, emit to a file
       // named from the provided OutputFileName with the Task ID appended.
       if (M.getModuleIdentifier() == "ld-temp.o" || !UseInputModulePath) {
         PathPrefix = OutputFileName;
+        if (ThinLTOSplit)
+          PathPrefix += extract_filename(M.getSourceFileName()) + ".";
         if (Task != (unsigned)-1)
           PathPrefix += utostr(Task) + ".";
       } else
@@ -513,6 +541,212 @@ static void codegen(const Config &Conf, TargetMachine *TM,
     report_fatal_error(std::move(Err));
 }
 
+static unsigned calFunctionSize(const llvm::Function &F) {
+  unsigned size = 0;
+  for (const auto &BB : F)
+    size += std::distance(BB.begin(), BB.end());
+  return size;
+}
+
+static unsigned calModuleSize(const llvm::Module &M) {
+  unsigned size = 0;
+  for (const auto &F : M)
+    size += calFunctionSize(F);
+  return size;
+}
+
+static bool canDoSplitModule(const llvm::Module &M) {
+  if (calModuleSize(M) < ThinLTOSplitModuleSizeThreshold)
+    return false;
+  return true;
+}
+
+static bool HasLargeCG(Module &Mod, const ModuleSummaryIndex &CombinedIndex) {
+  // TODO: Check whether there has large callgraphs. When multiple callgraphs
+  // are split, thinlto parallel compilation can bring benefits.
+  return true;
+}
+
+struct TaskIdAllocator {
+  using TaskId = unsigned;
+
+  // Use the most significant bit (MSB) as a namespace tag.
+  // - Original ThinLTO backend tasks are expected to have MSB == 0.
+  // - Split partitions allocated by this allocator always have MSB == 1.
+  // This guarantees the two ID spaces never overlap.
+  static constexpr TaskId tag() {
+    return TaskId{1} << (std::numeric_limits<TaskId>::digits - 1);
+  }
+
+  // Monotonic sequence counter for split partitions (MSB must remain 0 here).
+  std::atomic<TaskId> seq{0};
+
+  // Allocate a globally unique TaskId for a split partition.
+  // The returned ID is `tag() | seq`, so it lives in the MSB==1 namespace.
+  TaskId alloc() {
+    TaskId v = seq.fetch_add(1, std::memory_order_relaxed);
+
+    // If the counter ever reaches the MSB, we'd overlap namespaces.
+    // This indicates an overflow / too many partitions.
+    if (v & tag())
+      report_fatal_error("Partition TaskId overflow: seq reached the tag bit.");
+
+    return tag() | v;
+  }
+
+  // Helper for sanity checks / debugging.
+  static bool isPartition(TaskId id) { return (id & tag()) != 0; }
+};
+
+// Global allocator shared by all split partitions.
+static TaskIdAllocator gSplitTaskIds;
+
+static bool splitOptAndCodeGenThin(unsigned task, const Config &C,
+                                   TargetMachine *TM, AddStreamFn AddStream,
+                                   unsigned ParallelCodeGenParallelismLevel,
+                                   Module &Mod,
+                                   const ModuleSummaryIndex &CombinedIndex,
+                                   const std::vector<uint8_t> &CmdArgs,
+                                   bool DoOpt, AddStreamFn IRAddStream,
+                                   ArrayRef<StringRef> &BitcodeLibFuncs) {
+  unsigned ThreadCount = 0;
+  const Target *T = &TM->getTarget();
+
+  static std::mutex PrintMutex;
+
+  SplitModuleCG SplitModuleCG(Mod, CombinedIndex, ParallelCodeGenParallelismLevel);
+  ParallelCodeGenParallelismLevel = SplitModuleCG.getPartitionNum();
+
+  std::vector<std::string> TempObjectFiles(ParallelCodeGenParallelismLevel);
+  std::vector<llvm::FileRemover> TempFileRemovers(ParallelCodeGenParallelismLevel);
+
+  const auto HandleModulePartition = [&](std::unique_ptr<Module> MPart,
+                                         unsigned PartitionId) {
+    unsigned CurrentThreadId, UniqueTaskId;
+    {
+      std::lock_guard<std::mutex> Lock(PrintMutex);
+      CurrentThreadId = ThreadCount++;
+
+      // In distributed ThinLTO, `task` may be a sentinel (e.g. -1 cast to
+      // unsigned), which becomes UINT_MAX and naturally has MSB==1. Treat it
+      // as "no base task id" and don't enforce the namespace check on it.
+      //
+      // We do not rely on the incoming `task` for partition uniqueness: split
+      // partitions get a dedicated UniqueTaskId allocated below.
+      if (task != std::numeric_limits<unsigned>::max()) {
+        assert(!TaskIdAllocator::isPartition(task) &&
+               "Original ThinLTO TaskId unexpectedly overlaps the partition "
+               "namespace");
+      }
+      UniqueTaskId = gSplitTaskIds.alloc();
+    }
+
+    std::unique_ptr<TargetMachine> ThreadTM = createTargetMachine(C, T, *MPart);
+
+    if (DoOpt) {
+      if (!opt(C, ThreadTM.get(), UniqueTaskId, *MPart, /*IsThinLTO=*/true,
+               /*ExportSummary=*/nullptr, /*ImportSummary=*/&CombinedIndex,
+               CmdArgs, BitcodeLibFuncs)) {
+        report_fatal_error("Failed to gen opt for split mod in thread.");
+      }
+
+      // Save the current module before the first codegen round.
+      // Note that the second codegen round runs only `codegen()` without
+      // running `opt()`. We're not reaching here as it's bailed out earlier
+      // with `CodeGenOnly` which has been set in `SecondRoundThinBackend`.
+      if (IRAddStream)
+        cgdata::saveModuleForTwoRounds(*MPart, task + CurrentThreadId,
+                                       IRAddStream);
+    }
+
+    auto splitStream = [&](unsigned task, const Twine &moduleName)
+        -> Expected<std::unique_ptr<CachedFileStream>> {
+      int FD;
+      SmallString<128> TempFilename;
+      if (std::error_code EC = sys::fs::createTemporaryFile(
+              "thinlto-split", "o", FD, TempFilename))
+        return errorCodeToError(EC);
+
+      TempObjectFiles[PartitionId] = std::string(TempFilename.str());
+      TempFileRemovers[PartitionId].setFile(TempObjectFiles[PartitionId]);
+
+      auto OS = std::make_unique<raw_fd_ostream>(
+          FD, true, /*CloseOnDestruct*/true);
+
+      auto Stream = std::make_unique<CachedFileStream>(
+          std::move(OS), std::string(TempFilename.str()));
+
+      return std::move(Stream);
+    };
+
+    codegen(C, ThreadTM.get(), splitStream, UniqueTaskId, *MPart,
+            CombinedIndex);
+  };
+
+  SplitModuleCG.SplitModule(HandleModulePartition, C);
+
+  // Use ld.lld to combine the partitions into a object.
+  if (TempObjectFiles.empty()) {
+    llvm::errs() << "TempObjectFiles.empty()\n";
+    return true;
+  }
+
+  auto FinalStream = AddStream(task, Mod.getModuleIdentifier());
+  if (!FinalStream)
+    report_fatal_error("Failed to open final output stream");
+
+  int MergedFD;
+  SmallString<128> MergedFilename;
+  if (sys::fs::createTemporaryFile("thinlto-merged", "o", MergedFD,
+                                   MergedFilename))
+    report_fatal_error("Failed to create merged temp file.");
+  llvm::FileRemover MergedFileRemover(MergedFilename);
+  sys::fs::closeFile(MergedFD);
+
+  std::vector<StringRef> Args;
+  std::string LinkerPath = "";
+  if (auto Path = sys::findProgramByName("ld.lld"))
+    LinkerPath = *Path;
+  else if (auto Path = sys::findProgramByName("ld"))
+    LinkerPath = *Path;
+
+  if (LinkerPath.empty())
+    report_fatal_error("Cannot find linkeer (ld or ld.lld) to merge partitions.");
+
+  Args.push_back(LinkerPath);
+  Args.push_back("-r");
+  Args.push_back("-o");
+  Args.push_back(MergedFilename);
+
+  for (const auto &File : TempObjectFiles)
+    Args.push_back(File);
+
+  std::string ErrMsg;
+  int Result = sys::ExecuteAndWait(LinkerPath, Args, /*Env=*/std::nullopt,
+                                   /*Redirects=*/{}, /*SecondsToWait=*/0,
+                                   /*MemoryLimit=*/0, &ErrMsg);
+
+  if (Result != 0) {
+    errs() << "Linker failed: " << ErrMsg << "\n";
+    report_fatal_error("Failed to merge split objects.");
+  }
+
+  {
+    std::unique_ptr<CachedFileStream> &FinalFileStream = *FinalStream;
+    auto BufferOrErr = MemoryBuffer::getFile(MergedFilename);
+    if (!BufferOrErr)
+      report_fatal_error("Failed to read merged object.");
+
+    FinalFileStream->OS->write(BufferOrErr.get()->getBufferStart(),
+                               BufferOrErr.get()->getBufferSize());
+    if (Error Err = FinalFileStream->commit()) {
+      report_fatal_error(Twine("Failed to commit final file stream: ") +
+                         toString(std::move(Err)));
+    }
+  }
+  return true;
+}
+
 static void splitCodeGen(const Config &C, TargetMachine *TM,
                          AddStreamFn AddStream,
                          unsigned ParallelCodeGenParallelismLevel, Module &Mod,
@@ -677,11 +911,28 @@ Error lto::thinBackend(const Config &Conf, unsigned Task, AddStreamFn AddStream,
   // the module, if applicable.
   Mod.setPartialSampleProfileRatio(CombinedIndex);
 
+  bool ProfitableToSplit = true;
+  if (ThinLTOSplit) {
+    if (!canDoSplitModule(Mod) || !HasLargeCG(Mod, CombinedIndex)) {
+      ProfitableToSplit = false;
+      LLVM_DEBUG(dbgs() << "warning: thinlto split not enable for module: "
+                        << Mod.getName());
+    } else {
+      LLVM_DEBUG(dbgs() << "thinlto: split codegen for module: "
+                        << Mod.getName());
+    }
+  }
+
   LLVM_DEBUG(dbgs() << "Running ThinLTO\n");
   if (CodeGenOnly) {
-    // If CodeGenOnly is set, we only perform code generation and skip
-    // optimization. This value may differ from Conf.CodeGenOnly.
-    codegen(Conf, TM.get(), AddStream, Task, Mod, CombinedIndex);
+    if (ThinLTOSplit && ProfitableToSplit)
+      splitOptAndCodeGenThin(Task, Conf, TM.get(), AddStream,
+                             ThinLTOSplitPartitions, Mod, CombinedIndex,
+                             CmdArgs, false, IRAddStream, BitcodeLibFuncs);
+    else
+      // If CodeGenOnly is set, we only perform code generation and skip
+      // optimization. This value may differ from Conf.CodeGenOnly.
+      codegen(Conf, TM.get(), AddStream, Task, Mod, CombinedIndex);
     return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile));
   }
 
@@ -691,20 +942,27 @@ Error lto::thinBackend(const Config &Conf, unsigned Task, AddStreamFn AddStream,
   auto OptimizeAndCodegen =
       [&](Module &Mod, TargetMachine *TM,
           LLVMRemarkFileHandle DiagnosticOutputFile) {
-        // Perform optimization and code generation for ThinLTO.
-        if (!opt(Conf, TM, Task, Mod, /*IsThinLTO=*/true,
-                 /*ExportSummary=*/nullptr, /*ImportSummary=*/&CombinedIndex,
-                 CmdArgs, BitcodeLibFuncs))
-          return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile));
-
-        // Save the current module before the first codegen round.
-        // Note that the second codegen round runs only `codegen()` without
-        // running `opt()`. We're not reaching here as it's bailed out earlier
-        // with `CodeGenOnly` which has been set in `SecondRoundThinBackend`.
-        if (IRAddStream)
-          cgdata::saveModuleForTwoRounds(Mod, Task, IRAddStream);
-
-        codegen(Conf, TM, AddStream, Task, Mod, CombinedIndex);
+        if (ThinLTOSplit && ProfitableToSplit) {
+          if (!splitOptAndCodeGenThin(
+                  Task, Conf, TM, AddStream, ThinLTOSplitPartitions, Mod,
+                  CombinedIndex, CmdArgs, true, IRAddStream, BitcodeLibFuncs))
+            return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile));
+        } else {
+          // Perform optimization and code generation for ThinLTO.
+          if (!opt(Conf, TM, Task, Mod, /*IsThinLTO=*/true,
+                  /*ExportSummary=*/nullptr, /*ImportSummary=*/&CombinedIndex,
+                  CmdArgs, BitcodeLibFuncs))
+            return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile));
+
+          // Save the current module before the first codegen round.
+          // Note that the second codegen round runs only `codegen()` without
+          // running `opt()`. We're not reaching here as it's bailed out earlier
+          // with `CodeGenOnly` which has been set in `SecondRoundThinBackend`.
+          if (IRAddStream)
+            cgdata::saveModuleForTwoRounds(Mod, Task, IRAddStream);
+
+          codegen(Conf, TM, AddStream, Task, Mod, CombinedIndex);
+        }
         return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile));
       };
 
diff --git a/llvm/lib/Transforms/Utils/CMakeLists.txt b/llvm/lib/Transforms/Utils/CMakeLists.txt
index 8fe0476ab1a32..01b44ae2cfa29 100644
--- a/llvm/lib/Transforms/Utils/CMakeLists.txt
+++ b/llvm/lib/Transforms/Utils/CMakeLists.txt
@@ -89,6 +89,7 @@ add_llvm_component_library(LLVMTransformUtils
   SizeOpts.cpp
   SplitModule.cpp
   SplitModuleByCategory.cpp
+  SplitModuleCG.cpp
   StripNonLineTableDebugInfo.cpp
   SymbolRewriter.cpp
   UnifyFunctionExitNodes.cpp
diff --git a/llvm/lib/Transforms/Utils/SplitModuleCG.cpp b/llvm/lib/Transforms/Utils/SplitModuleCG.cpp
new file mode 100644
index 0000000000000..9f57cb3ed566e
--- /dev/null
+++ b/llvm/lib/Transforms/Utils/SplitModuleCG.cpp
@@ -0,0 +1,26 @@
+#include "llvm/Transforms/Utils/SplitModuleCG.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "split-module-CG"
+
+void SplitModuleCG::SplitModule(ModuleCreationCallback ModuleCallback,
+                                const llvm::lto::Config &C) {
+  // TODO: 1. Process the linkage of the GlobalValue; 2. Allocate the callgraph
+  // to N partitions; 3.Invoke the cloneModule API to copy the N partitions to
+  // obtain MParts.
+
+}
+
+SplitModuleCG::SplitModuleCG(Module &M,
+                             const ModuleSummaryIndex &CombinedIndex,
+                             unsigned LimitPartition)
+    : M(M), CG(M), N(LimitPartition) {
+  // TODO: The module is split based on the callgraph, and EntryFuncs stores
+  // the root function of each callgraph.
+
+  if (N == 0 || N > EntryFuncs.size()) {
+    N = EntryFuncs.size();
+  }
+  N = N == 0 ? 1 : N;
+}

>From b80904856475a12f4c6a010c1730324406d7b595 Mon Sep 17 00:00:00 2001
From: maojiaping <maojiaping1 at huawei.com>
Date: Wed, 20 May 2026 15:27:29 +0800
Subject: [PATCH 2/7] [ThinLTO][Split] Add callgraph-based module
 splitting (SplitModuleCG)

Add a new SplitModuleCG class that partitions a module into multiple
parts using function callgraph traversal and cost-based load balancing.
It is intended for use in ThinLTO to parallelize code generation by
splitting the module while preserving function call dependencies.

Key features:
- Build a simplified callgraph to track function calls and roots
- Calculate function costs based on IR instruction count
- Partition functions with balanced cost distribution
- Externalize local symbols and rename promoted symbols to avoid
  conflicts
- Clone module partitions and emit them in parallel
---
 .../llvm/Transforms/Utils/SplitModuleCG.h     | 182 ++++++++-
 llvm/lib/LTO/LTOBackend.cpp                   |  10 +
 llvm/lib/Transforms/Utils/SplitModuleCG.cpp   | 367 +++++++++++++++++-
 3 files changed, 552 insertions(+), 7 deletions(-)

diff --git a/llvm/include/llvm/Transforms/Utils/SplitModuleCG.h b/llvm/include/llvm/Transforms/Utils/SplitModuleCG.h
index e60c4e931d40c..956a1ea8030fe 100644
--- a/llvm/include/llvm/Transforms/Utils/SplitModuleCG.h
+++ b/llvm/include/llvm/Transforms/Utils/SplitModuleCG.h
@@ -1,6 +1,7 @@
 #ifndef LLVM_TRANSFORMS_UTILS_SPLITMODULECG_H
 #define LLVM_TRANSFORMS_UTILS_SPLITMODULECG_H
 
+#include "llvm/ADT/StringSet.h"
 #include "llvm/Analysis/CallGraph.h"
 #include "llvm/Analysis/ModuleSummaryAnalysis.h"
 #include "llvm/LTO/Config.h"
@@ -8,6 +9,169 @@
 #include "llvm/ADT/DenseSet.h"
 
 namespace llvm {
+
+class SimplifyCallGraph;
+class SimplifyCallGraphNode;
+
+using CostType = InstructionCost::CostType;
+
+class SimplifyCallGraph {
+  using FunctionMapTy =
+      std::map<const Function *, std::unique_ptr<SimplifyCallGraphNode>>;
+
+  /// A map from \c Function* to \c SimplifyCallGraphNode*.
+  FunctionMapTy FunctionMap;
+
+public:
+  explicit SimplifyCallGraph(CallGraph &CG,
+                             const ModuleSummaryIndex &CombinedIndex,
+                             Module &M)
+      : CG(CG), M(M) {
+    createSimplifyCallGraph(CombinedIndex);
+  }
+  ~SimplifyCallGraph() {};
+
+  using iterator = FunctionMapTy::iterator;
+  using const_iterator = FunctionMapTy::const_iterator;
+
+  /// Returns the module the call graph corresponds to.
+  inline iterator begin() { return FunctionMap.begin(); }
+  inline iterator end() { return FunctionMap.end(); }
+  inline const_iterator begin() const { return FunctionMap.begin(); }
+  inline const_iterator end() const { return FunctionMap.end(); }
+
+  /// Returns the call graph node for the provided function.
+  inline const SimplifyCallGraphNode *operator[](const Function *F) const {
+    const_iterator I = FunctionMap.find(F);
+    assert(I != FunctionMap.end() && "Function not in callgraph!");
+    return I->second.get();
+  }
+
+  /// Returns the call graph node for the provided function.
+  inline SimplifyCallGraphNode *operator[](const Function *F) {
+    const_iterator I = FunctionMap.find(F);
+    assert(I != FunctionMap.end() && "Function not in callgraph!");
+    return I->second.get(); 
+  }
+
+  /// Returns the call graph node for the provided function.
+  inline const SimplifyCallGraphNode *at(const Function *F) const {
+    const_iterator I = FunctionMap.find(F);
+    assert(I != FunctionMap.end() && "Function not in callgraph!");
+    return I->second.get();
+  }
+
+  /// Returns the call graph node for the provided function.
+  inline SimplifyCallGraphNode *at(const Function *F) {
+    const_iterator I = FunctionMap.find(F);
+    assert(I != FunctionMap.end() && "Function not in callgraph!");
+    return I->second.get();
+  }
+
+  void createSimplifyCallGraph(const ModuleSummaryIndex &CombinedIndex);
+  void print();
+  SimplifyCallGraphNode *getOrInsertFunction(const Function *F);
+
+private:
+  CallGraph &CG;
+  Module &M;
+};
+
+class SimplifyCallGraphNode {
+public:
+  using CalledFunctionsSet = DenseSet<SimplifyCallGraphNode *>;
+  inline SimplifyCallGraphNode(SimplifyCallGraph *SCG, Function *F)
+      : SCG(SCG), F(F) {}
+
+  SimplifyCallGraphNode(const SimplifyCallGraphNode &) = delete;
+  SimplifyCallGraphNode &operator=(const SimplifyCallGraphNode &) = delete;
+
+  ~SimplifyCallGraphNode() {}
+
+  Function *getFunction() const { return F; }
+
+  unsigned getNumReferences() const { return NumReferences; }
+
+  using iterator = DenseSet<SimplifyCallGraphNode *>::iterator;
+  using const_iterator = DenseSet<SimplifyCallGraphNode *>::const_iterator;
+
+  inline iterator begin() { return CalledFunctions.begin(); }
+  inline iterator end() { return CalledFunctions.end(); }
+  inline const_iterator begin() const { return CalledFunctions.begin(); }
+  inline const_iterator end() const { return CalledFunctions.end(); }
+  inline size_t count(SimplifyCallGraphNode * SCGNode) { return CalledFunctions.count(SCGNode); }
+  inline bool empty() const { return CalledFunctions.empty(); }
+  inline unsigned size() const { return (unsigned)CalledFunctions.size(); }
+
+  void addCalledFunction(SimplifyCallGraphNode *Called) {
+    auto [It, Inserted] = CalledFunctions.insert(Called);
+    if (Inserted)
+      Called->AddRef();
+  }
+
+  void removeCalledFunction(SimplifyCallGraphNode *Called) {
+    auto NumRemoved = CalledFunctions.erase(Called);
+    if (NumRemoved > 0)
+      Called->DropRef();
+  }
+
+private:
+  friend class SimplifyCallGraph;
+
+  SimplifyCallGraph *SCG;
+  Function *F;
+
+  DenseSet<SimplifyCallGraphNode *> CalledFunctions;
+  unsigned NumReferences = 0;
+
+  void DropRef() { --NumReferences; }
+  void AddRef() { ++NumReferences; }
+};
+
+static void addAllDependencies(SimplifyCallGraph &SCG, const Function &F,
+                               DenseSet<const Function *> &Fns) {
+  assert(!F.isDeclaration());
+  SmallVector<const Function *> WorkList({&F});
+
+  while (!WorkList.empty()) {
+    const auto &CurFn = *WorkList.pop_back_val();
+    assert(!CurFn.isDeclaration());
+
+    // Scan for an indirect call. If such a call is found, we have to
+    // conservatively assume this can call all non-entrypoint functions in 
+    // the module.
+    for (auto &SCGNode : *SCG.at(&CurFn)) {
+      auto *Callee = SCGNode->getFunction();
+      if (!Callee || Callee->isDeclaration())
+        continue;
+      if (Callee != &F)
+      {
+        auto [It, Inserted] = Fns.insert(Callee);
+        if (Inserted)
+          WorkList.push_back(Callee);
+      }
+    }
+  }
+}
+
+struct FunctionWithDependencies {
+  FunctionWithDependencies(SimplifyCallGraph &SCG,
+                           const DenseMap<const Function *, CostType> &FnCosts,
+                           const Function *F)
+      : F(F) {
+    addAllDependencies(SCG, *F, Dependencies);
+
+    TotalCost = FnCosts.at(F);
+    for (const auto *Dep : Dependencies) {
+      TotalCost += FnCosts.lookup(Dep);
+    }
+  }
+
+  const Function *F = nullptr;
+  DenseSet<const Function *> Dependencies;
+  CostType TotalCost = 0;
+};
+
 /// Splits the module M into N linkable partitions. The function ModuleCallback
 /// is called N times passing each individual partition as the MPart argument.
 class SplitModuleCG {
@@ -21,12 +185,28 @@ class SplitModuleCG {
                    const llvm::lto::Config &C);
 
   unsigned getPartitionNum() { return N; }
+  StringSet<> &getOriginalExternals() { return OriginalExternals; }
+  StringMap<std::string> &getPromotedRenames() { return PromotedRenames; }
 
-  private:
+private:
   unsigned N;
   Module &M;
   CallGraph CG;
+  std::unique_ptr<SimplifyCallGraph> SCG;
+  CostType ModuleCost;
   DenseSet<const Function *> EntryFuncs;
+  StringSet<> OriginalExternals;
+  StringMap<std::string> PromotedRenames;
+  DenseMap<const Function *, bool> externalFunction;
+  DenseMap<const Function *, CostType> FuncsCosts;
+  SmallVector<FunctionWithDependencies> FWDWorkList;
+
+  void calculateFunctionCosts();
+  std::vector<DenseSet<const Function *>> doPartitioning();
+  void dealWithMpart(
+      Module &MPart, unsigned I,
+      function_ref<bool(const GlobalValue *)> NeedsConservativeImport);
+  void createWorkList();
 };
 
 } // end namespace llvm
diff --git a/llvm/lib/LTO/LTOBackend.cpp b/llvm/lib/LTO/LTOBackend.cpp
index 11200ade0e8c0..aa1213e5e6af1 100644
--- a/llvm/lib/LTO/LTOBackend.cpp
+++ b/llvm/lib/LTO/LTOBackend.cpp
@@ -658,6 +658,16 @@ static bool splitOptAndCodeGenThin(unsigned task, const Config &C,
         cgdata::saveModuleForTwoRounds(*MPart, task + CurrentThreadId,
                                        IRAddStream);
     }
+    
+    // Rename the GlobalValues whose internal is changed to external. That's
+    // can avoid duplicate symbols.
+    auto PromotedRenames = SplitModuleCG.getPromotedRenames();
+    for (auto &GV : MPart->global_values()) {
+      if (auto It = PromotedRenames.find(GV.getName());
+          It != PromotedRenames.end()) {
+        GV.setName(It->second);
+      }
+    }
 
     auto splitStream = [&](unsigned task, const Twine &moduleName)
         -> Expected<std::unique_ptr<CachedFileStream>> {
diff --git a/llvm/lib/Transforms/Utils/SplitModuleCG.cpp b/llvm/lib/Transforms/Utils/SplitModuleCG.cpp
index 9f57cb3ed566e..debdddfb79041 100644
--- a/llvm/lib/Transforms/Utils/SplitModuleCG.cpp
+++ b/llvm/lib/Transforms/Utils/SplitModuleCG.cpp
@@ -1,26 +1,381 @@
 #include "llvm/Transforms/Utils/SplitModuleCG.h"
-
+#include "llvm/Bitcode/BitcodeReader.h"
+#include "llvm/Bitcode/BitcodeWriter.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/MD5.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include <thread>
 using namespace llvm;
 
 #define DEBUG_TYPE "split-module-CG"
 
+namespace {
+
+static cl::opt<bool> enablePrintSimplifyCallGraph(
+    "enable-print-simplify-callgraph", cl::Hidden, cl::init(false),
+    cl::desc("print SimplifyCallGraph"));
+
+using PartitionID = unsigned;
+
+static void externalize(GlobalValue *GV) {
+  if (GV->hasLocalLinkage()) {
+    GV->setLinkage(GlobalValue::ExternalLinkage);
+    GV->setVisibility(GlobalValue::HiddenVisibility);
+  }
+
+  // Unnamed entities must be named consistently between modules. setName will
+  // give a distinct name to each such entity.
+  if (!GV->hasName())
+    GV->setName("__llvmsplit_unnamed");
+}
+
+} // namespace
+
+std::vector<DenseSet<const Function *>> SplitModuleCG::doPartitioning() {
+  LLVM_DEBUG(dbgs() << "\n--Partitioning Starts--\n");
+  // Distributes every worklist entry (a root function plus its dependencies)
+  // over N partitions, always picking the currently cheapest partition.
+  std::vector<DenseSet<const Function *>> Partitions(N);
+  if (N == 0)
+    return Partitions;
+
+  auto ComparePartitions = [](const std::pair<PartitionID, CostType> &a,
+                              const std::pair<PartitionID, CostType> &b) {
+    // When two partitions have the same cost, assign to the one with the
+    // biggest ID first. This allows us to put things in P0 last, because P0 may
+    // have other stuff added later.
+    if (a.second == b.second)
+      return a.first < b.first;
+    return a.second > b.second;
+  };
+
+  std::vector<std::pair<PartitionID, CostType>> BalancingQueue;
+  for (unsigned I = 0; I < N; ++I)
+    BalancingQueue.emplace_back(I, 0);
+
+  // Helper function to handle assigning a function to a partition. This takes
+  // care of updating the balancing queue.
+  const auto AssignToPartition = [&](PartitionID PID,
+                                     const FunctionWithDependencies &FWD) {
+    // Only functions that are new to the partition add to its cost; a
+    // dependency shared with an earlier root is already accounted for.
+    auto &FnsInPart = Partitions[PID];
+    CostType AddedCost = 0;
+    if (FnsInPart.insert(FWD.F).second)
+      AddedCost += FuncsCosts.at(FWD.F);
+    for (const Function *Dep : FWD.Dependencies)
+      if (FnsInPart.insert(Dep).second)
+        AddedCost += FuncsCosts.at(Dep);
+
+    // Update the balancing queue. We scan backwards because in the common case
+    // the partition is at the end.
+    for (auto &[QueuePID, Cost] : reverse(BalancingQueue)) {
+      if (QueuePID == PID) {
+        Cost += AddedCost;
+        break;
+      }
+    }
+
+    sort(BalancingQueue, ComparePartitions);
+  };
+
+  // Normal "load-balancing": assign to the partition with least pressure.
+  for (const auto &CurFn : FWDWorkList)
+    AssignToPartition(BalancingQueue.back().first, CurFn);
+
+  return Partitions;
+}
+
+void SplitModuleCG::calculateFunctionCosts() {
+  // The cost of a function is the number of instructions in its basic
+  // blocks; ModuleCost is the sum over all defined functions.
+  ModuleCost = 0;
+  for (auto &Fn : M) {
+    if (Fn.isDeclaration())
+      continue;
+
+    CostType FnCost = 0;
+    for (const auto &BB : Fn)
+      FnCost += BB.size();
+    assert(FnCost != 0);
+    FuncsCosts[&Fn] = FnCost;
+    assert((ModuleCost + FnCost) >= ModuleCost && "Overflow!");
+    ModuleCost += FnCost;
+  }
+}
+
+void SplitModuleCG::dealWithMpart(
+    Module &MPart, unsigned I,
+    function_ref<bool(const GlobalValue *)> NeedsConservativeImport) {
+  // The rename suffix depends only on M's identifier, so hash it just once.
+  MD5 Hash;
+  Hash.update(M.getModuleIdentifier());
+  MD5::MD5Result Result;
+  Hash.final(Result);
+  SmallString<32> HashStr;
+  MD5::stringifyResult(Result, HashStr);
+  const StringRef HashSuffix = HashStr.str().substr(0, 8);
+
+  // Collect symbols to rename: non-local now, but not in OriginalExternals.
+  for (const auto &GV : MPart.global_values()) {
+    const StringRef Name = GV.getName();
+    if (GV.hasLocalLinkage() || OriginalExternals.contains(Name) ||
+        PromotedRenames.count(Name))
+      continue;
+    PromotedRenames[Name] = (Name + "." + HashSuffix).str();
+  }
+
+  // Clean-up conservatively imported GVs without any users.
+  for (auto &GV : make_early_inc_range(MPart.globals()))
+    if (NeedsConservativeImport(&GV) && GV.use_empty())
+      GV.eraseFromParent();
+
+  // The first partition that defines a function tracked in externalFunction
+  // keeps the definition as is; later copies become available_externally.
+  for (Function &Func : MPart.functions()) {
+    if (Func.isDeclaration())
+      continue;
+    auto It = externalFunction.find(M.getFunction(Func.getName()));
+    if (It == externalFunction.end())
+      continue;
+    if (It->second) {
+      It->second = false;
+      continue;
+    }
+    Func.setLinkage(GlobalValue::AvailableExternallyLinkage);
+    Func.setComdat(nullptr);
+  }
+
+  LLVM_DEBUG({
+    dbgs() << MPart.getModuleIdentifier() << "  : \n";
+    for (auto &F : MPart)
+      if (!F.isDeclaration())
+        dbgs() << "   [Function: ] " << I << "  " << F.getName() << " "
+               << F.getLinkage() << "\n";
+  });
+}
+
+void SplitModuleCG::createWorkList() {
+  // First, find all the entry functions with an in-degree of 0
+  // (i.e., those that are not called by any function).
+  for (auto &NodePair : *SCG) {
+    SimplifyCallGraphNode *SCGNode = NodePair.second.get();
+    Function *F = SCGNode->getFunction();
+    if (F && SCGNode->getNumReferences() == 0)
+      EntryFuncs.insert(F);
+  }
+
+  // Second, find all the dependencies of each entry function.
+  for (auto *F : EntryFuncs)
+    FWDWorkList.emplace_back(*SCG, FuncsCosts, F);
+
+  // Third, find all the functions that are not in the worklist.
+  DenseSet<const Function *> SeenFunctions;
+  for (const auto &FWD : FWDWorkList) {
+    SeenFunctions.insert(FWD.F);
+    SeenFunctions.insert(FWD.Dependencies.begin(), FWD.Dependencies.end());
+  }
+  for (auto &F : M) {
+    // A defined function that is still unseen (e.g. one that only sits on a
+    // call cycle) is not a dependency of any root, so it becomes a root here.
+    if (F.isDeclaration() || SeenFunctions.count(&F))
+      continue;
+    FWDWorkList.emplace_back(*SCG, FuncsCosts, &F);
+    const auto &FWD = FWDWorkList.back();
+    EntryFuncs.insert(&F);
+    SeenFunctions.insert(FWD.F);
+    SeenFunctions.insert(FWD.Dependencies.begin(), FWD.Dependencies.end());
+  }
+
+  // Sort the worklist so the most expensive roots are seen first.
+  sort(FWDWorkList, [](const auto &A, const auto &B) {
+    // Sort by total cost, and if the total cost is identical, sort
+    // alphabetically
+    if (A.TotalCost == B.TotalCost)
+      return A.F->getName() < B.F->getName();
+    return A.TotalCost > B.TotalCost;
+  });
+
+  // Keep the whole dump inside LLVM_DEBUG so that builds without debug
+  // output neither walk nor copy the worklist.
+  LLVM_DEBUG({
+    dbgs() << "Number of callgraphs to be allocated: " << FWDWorkList.size()
+           << "   Module cost: " << ModuleCost << "\n";
+    dbgs() << "callgraphs: \n";
+    for (const auto &FWD : FWDWorkList)
+      dbgs() << "[root] " << FWD.F->getName()
+             << " (totalCost:" << FWD.TotalCost
+             << ";   root function cost: " << FuncsCosts.lookup(FWD.F)
+             << ";   has dependency: " << FWD.Dependencies.size() << "\n";
+  });
+}
+
 void SplitModuleCG::SplitModule(ModuleCreationCallback ModuleCallback,
                                 const llvm::lto::Config &C) {
-  // TODO: 1. Process the linkage of the GlobalValue; 2. Allocate the callgraph
-  // to N partitions; 3.Invoke the cloneModule API to copy the N partitions to
-  // obtain MParts.
+  for (Function &F : M) {
+    if (F.hasLocalLinkage() && F.hasOneUse() && !F.hasAddressTaken())
+      continue;
+    externalize(&F);
+    if (!F.isDeclaration() &&
+        (F.hasExternalLinkage() || !F.isDefinitionExact()))
+      externalFunction[&F] = true;
+  }
+  for (GlobalVariable &GV : M.globals())
+    externalize(&GV);
+  for (GlobalAlias &GA : M.aliases())
+    externalize(&GA);
+  for (GlobalIFunc &GI : M.ifuncs())
+    externalize(&GI);
 
+  // TODO: Consider optimizing the alias, replacing the determined alias with
+  // the determined aliasee.
+
+  // Assign callgraphs into N partitions.
+  auto Partitions = doPartitioning();
+  assert(Partitions.size() == N);
+
+  // Local GVs need to be conservatively imported into every module, and then
+  // cleaned up afterwards.
+  const auto NeedsConservativeImport = [&](const GlobalValue *GV) {
+    // NOTE(review): every global variable of M was externalized above, so this
+    // predicate looks like it can no longer return true here -- confirm.
+    const auto *Var = dyn_cast<GlobalVariable>(GV);
+    return Var && Var->hasLocalLinkage();
+  };
+
+  auto ShouldCloneDefinition = [&](unsigned I, const GlobalValue *GV) {
+    const auto &FnsInPart = Partitions[I];
+
+    // Functions go in their assigned partition.
+    if (const auto *newFn = dyn_cast<Function>(GV)) {
+      const auto *Fn = M.getFunction(newFn->getName());
+      return FnsInPart.contains(Fn);
+    }
+    if (NeedsConservativeImport(GV))
+      return true;
+    // Everything else goes in the first partition.
+    return I == 0;
+  };
+
+  // TODO: In the future, consider running CloneModule in parallel as well to
+  // reduce compilation time.
+  std::vector<std::thread> Threads;
+  Threads.reserve(N);
+  for (unsigned I = 0; I < N; ++I) {
+    ValueToValueMapTy VMap;
+    std::unique_ptr<Module> MPart(
+        CloneModule(M, VMap, [&](const GlobalValue *GV) {
+          return ShouldCloneDefinition(I, GV);
+        }));
+
+    dealWithMpart(*MPart, I, NeedsConservativeImport);
+
+    // The modules are not cloned in multiple threads, so each partition is
+    // additionally round-tripped through bitcode into a new context to avoid
+    // data races between the threads that process the partitions.
+    SmallString<0> BC;
+    raw_svector_ostream BCOS(BC);
+    WriteBitcodeToFile(*MPart, BCOS);
+    MPart.reset();
+    Threads.emplace_back([&, I](SmallString<0> BC) {
+      llvm::lto::LTOLLVMContext Ctx(C);
+      Expected<std::unique_ptr<Module>> MOrErr = parseBitcodeFile(
+          MemoryBufferRef(BC.str(), "ld-temp.o"), Ctx);
+      // Release the serialized bitcode before running the callback.
+      BC = SmallString<0>();
+      if (!MOrErr)
+        report_fatal_error(Twine("Failed to read bitcode: ") +
+                           toString(MOrErr.takeError()));
+      ModuleCallback(std::move(*MOrErr), I);
+    }, std::move(BC));
+  }
+  for (auto &T : Threads)
+    T.join();
 }
 
 SplitModuleCG::SplitModuleCG(Module &M,
                              const ModuleSummaryIndex &CombinedIndex,
                              unsigned LimitPartition)
     : M(M), CG(M), N(LimitPartition) {
-  // TODO: The module is split based on the callgraph, and EntryFuncs stores
-  // the root function of each callgraph.
+  // Track existing non-local symbols. This ensures that when we promote
+  // internal symbols to external for partitioning, we can handle renaming
+  // and avoid conflicts.
+  for (const auto &GV : M.global_values())
+    if (!GV.hasLocalLinkage())
+      OriginalExternals.insert(GV.getName());
+
+  calculateFunctionCosts();
+
+  // Construct a simplified call graph to facilitate worklist generation.
+  SCG = std::make_unique<SimplifyCallGraph>(CG, CombinedIndex, M);
+  // TODO: When the SCG is established, the special cases of comdat and
+  // initarray need to be considered.
+
+  // Populate the worklist with root functions and their transitive
+  // dependencies. This worklist serves as the foundation for the
+  // subsequent module partitioning.
+  createWorkList();
 
   if (N == 0 || N > EntryFuncs.size()) {
     N = EntryFuncs.size();
   }
   N = N == 0 ? 1 : N;
 }
+
+void SimplifyCallGraph::createSimplifyCallGraph(
+    const ModuleSummaryIndex &CombinedIndex) {
+  for (auto &NodePair : CG) {
+    CallGraphNode *CGNode = NodePair.second.get();
+    Function *F = CGNode->getFunction();
+    if (!F || F->isDeclaration())
+      continue;
+
+    SimplifyCallGraphNode *SCGNode = getOrInsertFunction(F);
+
+    // TODO: Trace indirect call usage for the current function.
+
+    for (const auto &CGNodeItem : *CGNode) {
+      Function *Called = CGNodeItem.second->getFunction();
+      if (!Called) {
+        // TODO: Deal with indirect calls.
+        // 1. Check if the instruction has a callees metadata.
+        // 2. Check if this is an indirect call with profile data.
+        // 3. Check if this is an alias to a function.
+      }
+      if (!Called || Called->isDeclaration())
+        continue;
+      SCGNode->addCalledFunction(getOrInsertFunction(Called));
+    }
+  }
+
+  if (enablePrintSimplifyCallGraph)
+    print();
+}
+
+
+void SimplifyCallGraph::print() {
+  // Dump every node with its reference count, followed by its callees.
+  LLVM_DEBUG({
+    for (auto &SCGItem : FunctionMap) {
+      dbgs() << "Call graph node for function: '" << SCGItem.first->getName()
+             << "' #uses=" << SCGItem.second->getNumReferences() << "\n";
+      for (const auto &callee : *SCGItem.second)
+        dbgs() << "          Calls function : '"
+               << callee->getFunction()->getName() << " '\n";
+    }
+  });
+}
+
+SimplifyCallGraphNode *
+SimplifyCallGraph::getOrInsertFunction(const Function *F) {
+  // operator[] yields the existing node for F, or an empty slot that is
+  // filled with a freshly created node below.
+  auto &Slot = FunctionMap[F];
+  if (!Slot)
+    Slot = std::make_unique<SimplifyCallGraphNode>(
+        this, const_cast<Function *>(F));
+  return Slot.get();
+}

>From 88db8d4e7fbcadc73e1c48c23bb8781b2c21df4f Mon Sep 17 00:00:00 2001
From: maojiaping <maojiaping1 at huawei.com>
Date: Wed, 20 May 2026 15:57:13 +0800
Subject: [PATCH 3/7] [llvm-split][SplitModuleCG] Add support for SplitModuleCG

Add a new command line option --enable-split-module-CG to llvm-split
tool for testing the SplitModuleCG utility.

The change:
- Adds --enable-split-module-CG flag
- Wire up the SplitModuleCG interface in llvm-split
---
 .../SplitModuleCG/split-promoted-rename.ll    | 41 +++++++++++++++++++
 .../SplitModuleCG/function-with-ring.ll       | 36 ++++++++++++++++
 .../llvm-split/SplitModuleCG/function.ll      | 35 ++++++++++++++++
 .../llvm-split/SplitModuleCG/partition-cap.ll | 10 +++++
 .../SplitModuleCG/single-partition.ll         | 13 ++++++
 .../tools/llvm-split/SplitModuleCG/unnamed.ll |  8 ++++
 llvm/tools/llvm-split/llvm-split.cpp          | 36 ++++++++++++++++
 7 files changed, 179 insertions(+)
 create mode 100644 llvm/test/Transforms/SplitModuleCG/split-promoted-rename.ll
 create mode 100644 llvm/test/tools/llvm-split/SplitModuleCG/function-with-ring.ll
 create mode 100644 llvm/test/tools/llvm-split/SplitModuleCG/function.ll
 create mode 100644 llvm/test/tools/llvm-split/SplitModuleCG/partition-cap.ll
 create mode 100644 llvm/test/tools/llvm-split/SplitModuleCG/single-partition.ll
 create mode 100644 llvm/test/tools/llvm-split/SplitModuleCG/unnamed.ll

diff --git a/llvm/test/Transforms/SplitModuleCG/split-promoted-rename.ll b/llvm/test/Transforms/SplitModuleCG/split-promoted-rename.ll
new file mode 100644
index 0000000000000..6c51141a9ad85
--- /dev/null
+++ b/llvm/test/Transforms/SplitModuleCG/split-promoted-rename.ll
@@ -0,0 +1,41 @@
+; Test that internal symbols promoted during module splitting are consistently
+; renamed with an MD5 suffix across all partitions.
+;
+; RUN: opt -module-summary %s -o %t.bc
+; RUN: llvm-lto2 run %t.bc -o %t \
+; RUN:   -thinlto-split=true \
+; RUN:   -thinlto-split-partitions=2 -thinlto-split-module-size-threshold=0 \
+; RUN:   -r=%t.bc,caller_a,px \
+; RUN:   -r=%t.bc,caller_b,px
+; RUN: llvm-nm %t.1 | FileCheck %s
+
+; CHECK-DAG: T caller_a
+; CHECK-DAG: T caller_b
+; CHECK:     T {{.*promoted_internal[._][0-9a-f]+.*}}
+; CHECK-NOT: T promoted_internal{{$}}
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; @promoted_internal is internal. SplitModuleCG::dealWithMpart's checkPromoted
+; records it in PromotedRenames. splitOptAndCodeGenThin applies the rename
+; after opt via:
+;   for (auto &GV : MPart->global_values())
+;     if (auto It = PromotedRenames.find(GV.getName()); ...)
+;       GV.setName(It->second);
+define internal void @promoted_internal() {
+entry:
+  ret void
+}
+
+define void @caller_a() {
+entry:
+  call void @promoted_internal()
+  ret void
+}
+
+define void @caller_b() {
+entry:
+  call void @promoted_internal()
+  ret void
+}
diff --git a/llvm/test/tools/llvm-split/SplitModuleCG/function-with-ring.ll b/llvm/test/tools/llvm-split/SplitModuleCG/function-with-ring.ll
new file mode 100644
index 0000000000000..f2fc8c03c922a
--- /dev/null
+++ b/llvm/test/tools/llvm-split/SplitModuleCG/function-with-ring.ll
@@ -0,0 +1,36 @@
+; RUN: llvm-split -enable-split-module-CG=true -j2 -o %t %s
+; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s
+; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s
+
+; CHECK0-DAG: declare void @foo()
+; CHECK0-DAG: define void @bar()
+; CHECK0-DAG: declare void @call_foo()
+; CHECK0-DAG: define void @call_bar()
+
+; CHECK1-DAG: define void @foo()
+; CHECK1-DAG: declare void @bar()
+; CHECK1-DAG: define void @call_foo()
+; CHECK1-DAG: declare void @call_bar()
+
+define void @foo() {
+entry:
+  call void @call_foo()
+  ret void
+}
+
+define void @bar() {
+entry:
+  ret void
+}
+
+define void @call_foo() {
+entry:
+  call void @foo()
+  ret void
+}
+
+define void @call_bar() {
+entry:
+  call void @bar()
+  ret void
+}
diff --git a/llvm/test/tools/llvm-split/SplitModuleCG/function.ll b/llvm/test/tools/llvm-split/SplitModuleCG/function.ll
new file mode 100644
index 0000000000000..ddf5bb5c3dff3
--- /dev/null
+++ b/llvm/test/tools/llvm-split/SplitModuleCG/function.ll
@@ -0,0 +1,35 @@
+; RUN: llvm-split -enable-split-module-CG=true -j2 -o %t %s
+; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s
+; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s
+
+; CHECK0-DAG: declare dso_local void @foo()
+; CHECK0-DAG: define void @bar()
+; CHECK0-DAG: declare void @func_a()
+; CHECK0-DAG: define void @func_b()
+; CHECK1-DAG: define internal void @foo()
+; CHECK1-DAG: define available_externally void @bar()
+; CHECK1-DAG: define void @func_a()
+; CHECK1-DAG: declare void @func_b()
+
+define internal void @foo() {
+entry:
+  ret void
+}
+
+define void @bar() {
+entry:
+  ret void
+}
+
+define void @func_a() {
+entry:
+  call void @foo()
+  call void @bar()
+  ret void
+}
+
+define void @func_b() {
+entry:
+  call void @bar()
+  ret void
+}
diff --git a/llvm/test/tools/llvm-split/SplitModuleCG/partition-cap.ll b/llvm/test/tools/llvm-split/SplitModuleCG/partition-cap.ll
new file mode 100644
index 0000000000000..5c3ced3e682af
--- /dev/null
+++ b/llvm/test/tools/llvm-split/SplitModuleCG/partition-cap.ll
@@ -0,0 +1,10 @@
+; RUN: llvm-split -enable-split-module-CG=true -j10 -o %t %s
+; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s
+; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s
+; should only produce 2 output files (N capped to EntryFuncs.size()=2)
+
+; CHECK0: define void @foo()
+; CHECK1: define void @bar()
+
+define void @foo() { ret void }
+define void @bar() { ret void }
diff --git a/llvm/test/tools/llvm-split/SplitModuleCG/single-partition.ll b/llvm/test/tools/llvm-split/SplitModuleCG/single-partition.ll
new file mode 100644
index 0000000000000..fdfdf910a3498
--- /dev/null
+++ b/llvm/test/tools/llvm-split/SplitModuleCG/single-partition.ll
@@ -0,0 +1,13 @@
+; RUN: llvm-split -enable-split-module-CG=true -j1 -o %t %s
+; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s
+
+; CHECK0: define void @foo()
+; CHECK0: define void @bar()
+
+define void @foo() {
+  call void @bar()
+  ret void
+}
+define void @bar() {
+  ret void
+}
diff --git a/llvm/test/tools/llvm-split/SplitModuleCG/unnamed.ll b/llvm/test/tools/llvm-split/SplitModuleCG/unnamed.ll
new file mode 100644
index 0000000000000..73f7079669c55
--- /dev/null
+++ b/llvm/test/tools/llvm-split/SplitModuleCG/unnamed.ll
@@ -0,0 +1,8 @@
+; RUN: llvm-split -enable-split-module-CG=true -j2 -o %t %s
+; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s
+
+; CHECK0-DAG: define hidden void @__llvmsplit_unnamed()
+
+define internal void @0() {
+  ret void
+}
\ No newline at end of file
diff --git a/llvm/tools/llvm-split/llvm-split.cpp b/llvm/tools/llvm-split/llvm-split.cpp
index 4cc4fd945fc53..4156222855617 100644
--- a/llvm/tools/llvm-split/llvm-split.cpp
+++ b/llvm/tools/llvm-split/llvm-split.cpp
@@ -18,8 +18,10 @@
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/PassInstrumentation.h"
 #include "llvm/IR/PassManager.h"
+#include "llvm/IR/ModuleSummaryIndex.h"
 #include "llvm/IR/Verifier.h"
 #include "llvm/IRReader/IRReader.h"
+#include "llvm/LTO/Config.h"
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/FileSystem.h"
@@ -35,6 +37,7 @@
 #include "llvm/Transforms/IPO/GlobalDCE.h"
 #include "llvm/Transforms/Utils/SplitModule.h"
 #include "llvm/Transforms/Utils/SplitModuleByCategory.h"
+#include "llvm/Transforms/Utils/SplitModuleCG.h"
 
 using namespace llvm;
 
@@ -76,6 +79,10 @@ static cl::opt<std::string>
 static cl::opt<std::string>
     MCPU("mcpu", cl::desc("Target CPU, ignored if --mtriple is not used"),
          cl::value_desc("cpu"), cl::cat(SplitCategory));
+         
+static cl::opt<bool>
+    EnableSplitModuleCG("enable-split-module-CG", cl::Prefix, cl::init(false),
+     cl::desc("Split module using call graph"), cl::cat(SplitCategory));
 
 enum class SplitByCategoryType {
   SBCT_ByAttribute,
@@ -327,6 +334,35 @@ int main(int argc, char **argv) {
               "splitModule implementation\n";
   }
 
+  if (EnableSplitModuleCG) {
+    const auto HandleModulePartCG = [&](std::unique_ptr<Module> MPart, unsigned I) {
+      std::error_code EC;
+      std::unique_ptr<ToolOutputFile> Out(
+          new ToolOutputFile(OutputFilename + utostr(I), EC, sys::fs::OF_None));
+      if (EC) {
+        errs() << EC.message() << '\n';
+        exit(1);
+      }
+
+      if (verifyModule(*MPart, &errs())) {
+        errs() << "Broken module!\n";
+        exit(1);
+      }
+
+      WriteBitcodeToFile(*MPart, Out->os());
+
+      // Declare success.
+      Out->keep();
+    };
+
+    llvm::lto::Config Config;
+    ModuleSummaryIndex CombinedIndex(false);
+    SplitModuleCG SplitModuleCG(*M, CombinedIndex, NumOutputs);
+    SplitModuleCG.SplitModule(HandleModulePartCG, Config);
+    return 0;
+  }
+
   SplitModule(*M, NumOutputs, HandleModulePart, PreserveLocals, RoundRobin);
   return 0;
 }
+

>From 065dd31ca935fdb3b36a6272f2b4095e64195fd2 Mon Sep 17 00:00:00 2001
From: maojiaping <maojiaping1 at huawei.com>
Date: Fri, 29 May 2026 14:56:19 +0800
Subject: [PATCH 4/7] [Thinlto][SplitModuleCG] Fix Windows compile of closeFile

Remove the unused 'MergedFD' by calling the createTemporaryFile overload that takes no "ResultFD" parameter.
---
 llvm/lib/LTO/LTOBackend.cpp | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/llvm/lib/LTO/LTOBackend.cpp b/llvm/lib/LTO/LTOBackend.cpp
index aa1213e5e6af1..ef78f1fa8ac3a 100644
--- a/llvm/lib/LTO/LTOBackend.cpp
+++ b/llvm/lib/LTO/LTOBackend.cpp
@@ -705,13 +705,10 @@ static bool splitOptAndCodeGenThin(unsigned task, const Config &C,
   if (!FinalStream)
     report_fatal_error("Failed to open final output stream");
 
-  int MergedFD;
   SmallString<128> MergedFilename;
-  if (sys::fs::createTemporaryFile("thinlto-merged", "o", MergedFD,
-                                   MergedFilename))
+  if (sys::fs::createTemporaryFile("thinlto-merged", "o", MergedFilename))
     report_fatal_error("Failed to create merged temp file.");
   llvm::FileRemover MergedFileRemover(MergedFilename);
-  sys::fs::closeFile(MergedFD);
 
   std::vector<StringRef> Args;
   std::string LinkerPath = "";

>From 5334e7269d4c9fd00c5a3e31c213b84feeae480d Mon Sep 17 00:00:00 2001
From: JiangNing <jiangninghx at foxmail.com>
Date: Sat, 23 May 2026 17:30:55 +0800
Subject: [PATCH 5/7] [ThinLTO][Driver] Move split-codegen linker invocation
 out of LTOBackend

This patch fixes a layering violation by moving the partition merging
step (`ld -r`) out of `LTOBackend` and into the Clang Driver.

- `BackendUtil` now outputs an `@rsp` file containing the split partitions.
- `ThinLTOMergeJobAction` is added to the Driver to invoke `ld.lld -r` using the `@rsp` file.
- `AcceptsMultipleOutputsPerTask` is added to `lto::Config` to protect unaware LTO clients from data races.
---
 .../clang/Basic/DiagnosticDriverKinds.td      |   2 +
 clang/include/clang/Driver/Action.h           |  14 +-
 clang/include/clang/Driver/CommonArgs.h       |  13 +
 clang/include/clang/Driver/Job.h              |  26 ++
 clang/include/clang/Driver/Tool.h             |   1 +
 clang/include/clang/Driver/ToolChain.h        |   2 +
 .../include/clang/Frontend/FrontendOptions.h  |   3 +
 clang/include/clang/Options/Options.td        |   3 +
 clang/lib/CodeGen/BackendUtil.cpp             | 108 +++++++-
 clang/lib/Driver/Action.cpp                   |   7 +
 clang/lib/Driver/Driver.cpp                   |  24 +-
 clang/lib/Driver/Job.cpp                      |  44 +++
 clang/lib/Driver/ToolChain.cpp                |  15 ++
 clang/lib/Driver/ToolChains/Clang.cpp         |  11 +
 clang/lib/Driver/ToolChains/CommonArgs.cpp    |  32 +++
 clang/lib/Driver/ToolChains/Gnu.cpp           |  41 +++
 clang/lib/Driver/ToolChains/Gnu.h             |   1 +
 .../thinlto-split/split-output-list-dwo.ll    | 169 ++++++++++++
 .../thinlto-split/split-output-list.ll        |  75 ++++++
 .../thinlto-split}/split-promoted-rename.ll   |  24 +-
 .../thinlto-split-merge-realistic.ll          | 251 ++++++++++++++++++
 clang/test/Driver/thinlto-split-merge.c       |  64 +++++
 lld/ELF/LTO.cpp                               |  18 +-
 llvm/include/llvm/LTO/Config.h                |  17 ++
 llvm/include/llvm/LTO/LTO.h                   |   5 +
 llvm/include/llvm/LTO/LTOBackend.h            |   8 +
 llvm/lib/LTO/LTO.cpp                          |  81 +++++-
 llvm/lib/LTO/LTOBackend.cpp                   | 219 ++++++---------
 llvm/lib/Transforms/Utils/SplitModuleCG.cpp   |   6 +
 29 files changed, 1124 insertions(+), 160 deletions(-)
 create mode 100644 clang/test/CodeGen/thinlto-split/split-output-list-dwo.ll
 create mode 100644 clang/test/CodeGen/thinlto-split/split-output-list.ll
 rename {llvm/test/Transforms/SplitModuleCG => clang/test/CodeGen/thinlto-split}/split-promoted-rename.ll (58%)
 create mode 100644 clang/test/CodeGen/thinlto-split/thinlto-split-merge-realistic.ll
 create mode 100644 clang/test/Driver/thinlto-split-merge.c

diff --git a/clang/include/clang/Basic/DiagnosticDriverKinds.td b/clang/include/clang/Basic/DiagnosticDriverKinds.td
index 020014dabacfd..1d2f7e5832a01 100644
--- a/clang/include/clang/Basic/DiagnosticDriverKinds.td
+++ b/clang/include/clang/Basic/DiagnosticDriverKinds.td
@@ -181,6 +181,8 @@ def warn_openmp_spec_incomplete : Warning<
   InGroup<ExperimentalOption>;
 def err_drv_invalid_thread_model_for_target : Error<
   "invalid thread model '%0' in '%1' for this target">;
+def err_drv_lto_split_requires_lld : Error<
+  "cannot find 'ld.lld' required for ThinLTO split codegen at '%0'">;
 def err_drv_invalid_linker_name : Error<
   "invalid linker name in argument '%0'">;
 def err_drv_invalid_rtlib_name : Error<
diff --git a/clang/include/clang/Driver/Action.h b/clang/include/clang/Driver/Action.h
index 67937b00f6bcf..b4e789bf15102 100644
--- a/clang/include/clang/Driver/Action.h
+++ b/clang/include/clang/Driver/Action.h
@@ -77,9 +77,10 @@ class Action {
     BinaryAnalyzeJobClass,
     BinaryTranslatorJobClass,
     ObjcopyJobClass,
+    ThinLTOMergeJobClass,
 
     JobClassFirst = PreprocessJobClass,
-    JobClassLast = ObjcopyJobClass
+    JobClassLast = ThinLTOMergeJobClass
   };
 
   // The offloading kind determines if this action is binded to a particular
@@ -519,6 +520,17 @@ class LinkJobAction : public JobAction {
   }
 };
 
+class ThinLTOMergeJobAction : public JobAction {
+  void anchor() override;
+
+public:
+  ThinLTOMergeJobAction(ActionList &Inputs, types::ID Type);
+
+  static bool classof(const Action *A) {
+    return A->getKind() == ThinLTOMergeJobClass;
+  }
+};
+
 class LipoJobAction : public JobAction {
   void anchor() override;
 
diff --git a/clang/include/clang/Driver/CommonArgs.h b/clang/include/clang/Driver/CommonArgs.h
index 0af1b89425227..e69a54b54eefe 100644
--- a/clang/include/clang/Driver/CommonArgs.h
+++ b/clang/include/clang/Driver/CommonArgs.h
@@ -158,6 +158,19 @@ bool isObjCAutoRefCount(const llvm::opt::ArgList &Args);
 llvm::StringRef getLTOParallelism(const llvm::opt::ArgList &Args,
                                   const Driver &D);
 
+bool isThinLTOSplitEnabled(const llvm::opt::ArgList &Args);
+
+/// Response-file path listing the partition objects for cc1 output \p Output.
+/// Written by cc1 (-thinlto-split-output-list) and read by ThinLTOMergeJobAction
+/// (`ld.lld -r @<file>`); shared so both agree on the name.
+std::string getThinLTOSplitResponseFile(llvm::StringRef Output);
+
+/// Single gating predicate (shared by the cc1 flag and the merge action) for
+/// whether the driver splits a distributed ThinLTO compile and merges it with
+/// `ld.lld -r`.
+bool isThinLTOSplitMergeEnabled(const ToolChain &TC,
+                                const llvm::opt::ArgList &Args);
+
 bool areOptimizationsEnabled(const llvm::opt::ArgList &Args);
 
 bool isUseSeparateSections(const llvm::Triple &Triple);
diff --git a/clang/include/clang/Driver/Job.h b/clang/include/clang/Driver/Job.h
index 116254f79ae6f..b8e550fc19968 100644
--- a/clang/include/clang/Driver/Job.h
+++ b/clang/include/clang/Driver/Job.h
@@ -264,6 +264,32 @@ class CC1Command : public Command {
   void setEnvironment(llvm::ArrayRef<const char *> NewEnvironment) override;
 };
 
+/// Merges the per-partition objects from ThinLTO split codegen into one
+/// relocatable object (`ld.lld -r`). The partition count is only known at
+/// codegen time, so after the merge this reads the response file to remove
+/// them (unless -save-temps); the response file is a normal Compilation temp.
+class ThinLTOMergeCommand : public Command {
+  /// Response file listing the partition objects to merge.
+  std::string SplitOutputList;
+
+  /// Whether to remove the partition objects after a successful merge (false
+  /// under -save-temps).
+  bool CleanupSplitOutputs;
+
+  void cleanupSplitOutputs() const;
+
+public:
+  ThinLTOMergeCommand(const Action &Source, const Tool &Creator,
+                      ResponseFileSupport ResponseSupport,
+                      const char *Executable,
+                      const llvm::opt::ArgStringList &Arguments,
+                      ArrayRef<InputInfo> Inputs, ArrayRef<InputInfo> Outputs,
+                      StringRef SplitOutputList, bool CleanupSplitOutputs);
+
+  int Execute(ArrayRef<std::optional<StringRef>> Redirects, std::string *ErrMsg,
+              bool *ExecutionFailed) const override;
+};
+
 /// JobList - A sequence of jobs to perform.
 class JobList {
 public:
diff --git a/clang/include/clang/Driver/Tool.h b/clang/include/clang/Driver/Tool.h
index 42cf99a4a9703..e09583c13f42d 100644
--- a/clang/include/clang/Driver/Tool.h
+++ b/clang/include/clang/Driver/Tool.h
@@ -56,6 +56,7 @@ class Tool {
   virtual bool canEmitIR() const { return false; }
   virtual bool hasIntegratedCPP() const = 0;
   virtual bool isLinkJob() const { return false; }
+  virtual bool canConstructThinLTOMergeJob() const { return false; }
   virtual bool isDsymutilJob() const { return false; }
 
   /// Does this tool have "good" standardized diagnostics, or should the
diff --git a/clang/include/clang/Driver/ToolChain.h b/clang/include/clang/Driver/ToolChain.h
index 684ef52d8532b..048ae490f05e9 100644
--- a/clang/include/clang/Driver/ToolChain.h
+++ b/clang/include/clang/Driver/ToolChain.h
@@ -429,6 +429,8 @@ class ToolChain {
   /// a compiler other than Clang.
   virtual Tool *SelectTool(const JobAction &JA) const;
 
+  bool canConstructThinLTOMergeJob() const;
+
   // Helper methods
 
   std::string GetFilePath(const char *Name) const;
diff --git a/clang/include/clang/Frontend/FrontendOptions.h b/clang/include/clang/Frontend/FrontendOptions.h
index 2f75fba566dfb..4fb2c41bedb1e 100644
--- a/clang/include/clang/Frontend/FrontendOptions.h
+++ b/clang/include/clang/Frontend/FrontendOptions.h
@@ -445,6 +445,9 @@ class FrontendOptions {
   /// The output file, if any.
   std::string OutputFile;
 
+  /// Response file listing objects emitted by ThinLTO split codegen.
+  std::string ThinLTOSplitOutputList;
+
   /// If given, the new suffix for fix-it rewritten files.
   std::string FixItSuffix;
 
diff --git a/clang/include/clang/Options/Options.td b/clang/include/clang/Options/Options.td
index 6fc8806ba683c..88250c5944954 100644
--- a/clang/include/clang/Options/Options.td
+++ b/clang/include/clang/Options/Options.td
@@ -3413,6 +3413,9 @@ def flto_jobs_EQ : Joined<["-"], "flto-jobs=">,
 def fthinlto_index_EQ : Joined<["-"], "fthinlto-index=">,
   Visibility<[ClangOption, CLOption, CC1Option]>, Group<f_Group>,
   HelpText<"Perform ThinLTO importing using provided function summary index">;
+def thinlto_split_output_list_EQ : Joined<["-"], "thinlto-split-output-list=">,
+  Visibility<[CC1Option]>, Flags<[HelpHidden]>,
+  MarshallingInfoString<FrontendOpts<"ThinLTOSplitOutputList">>;
 def fthin_link_bitcode_EQ : Joined<["-"], "fthin-link-bitcode=">,
   Visibility<[ClangOption, CLOption, CC1Option]>, Group<f_Group>,
   HelpText<"Write minimized bitcode to <file> for the ThinLTO thin link only">,
diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp
index a46a25c4492f2..e198f08882804 100644
--- a/clang/lib/CodeGen/BackendUtil.cpp
+++ b/clang/lib/CodeGen/BackendUtil.cpp
@@ -17,6 +17,7 @@
 #include "clang/Frontend/Utils.h"
 #include "clang/Lex/HeaderSearchOptions.h"
 #include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/Analysis/GlobalsModRef.h"
 #include "llvm/Analysis/RuntimeLibcallInfo.h"
@@ -54,6 +55,7 @@
 #include "llvm/Support/TimeProfiler.h"
 #include "llvm/Support/Timer.h"
 #include "llvm/Support/ToolOutputFile.h"
+#include "llvm/Support/Path.h"
 #include "llvm/Support/VirtualFileSystem.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetMachine.h"
@@ -92,7 +94,9 @@
 #include "llvm/Transforms/Utils/Debugify.h"
 #include "llvm/Transforms/Utils/ModuleUtils.h"
 #include <limits>
+#include <map>
 #include <memory>
+#include <mutex>
 #include <optional>
 using namespace clang;
 using namespace llvm;
@@ -1307,6 +1311,29 @@ void EmitAssemblyHelper::emitAssembly(BackendAction Action,
     DwoOS->keep();
 }
 
+/// Returns "<OutputFile>.thinlto-split.<Task>.o", the object path for \p Task.
+static std::string getThinLTOSplitOutputFile(const FrontendOptions &Opts,
+                                             size_t Task) {
+  return Opts.OutputFile + ".thinlto-split." + std::to_string(Task) + ".o";
+}
+
+static bool writeThinLTOSplitOutputList(DiagnosticsEngine &Diags,
+                                        StringRef OutputList,
+                                        ArrayRef<std::string> Outputs) {
+  std::error_code EC;
+  raw_fd_ostream OS(OutputList, EC, sys::fs::OF_Text);
+  if (EC) { // Could not open the list file: diagnose and report failure.
+    Diags.Report(diag::err_fe_unable_to_open_output)
+        << OutputList << EC.message();
+    return false;
+  }
+  for (StringRef Output : Outputs) { // One quoted path per line.
+    sys::printArg(OS, Output, /*Quote=*/true);
+    OS << '\n';
+  }
+  return true; // NOTE(review): write errors on OS are not checked here.
+}
+
 static void
 runThinLTOBackend(CompilerInstance &CI, ModuleSummaryIndex *CombinedIndex,
                   llvm::Module *M, std::unique_ptr<raw_pwrite_stream> OS,
@@ -1329,11 +1356,35 @@ runThinLTOBackend(CompilerInstance &CI, ModuleSummaryIndex *CombinedIndex,
   if (!lto::initImportList(*M, *CombinedIndex, ImportList))
     return;
 
-  auto AddStream = [&](size_t Task, const Twine &ModuleName) {
+  const std::string &SplitOutputList =
+      CI.getFrontendOpts().ThinLTOSplitOutputList;
+  std::map<size_t, std::string> SplitOutputMap;
+  std::mutex SplitOutputFilesMutex;
+
+  auto AddStream = [&](size_t Task, const Twine &/*ModuleName*/)
+      -> Expected<std::unique_ptr<CachedFileStream>> {
+    if (!SplitOutputList.empty()) {
+      std::unique_ptr<raw_pwrite_stream> OutputOS;
+      std::string OutputPath =
+          getThinLTOSplitOutputFile(CI.getFrontendOpts(), Task);
+      {
+        std::lock_guard<std::mutex> Lock(SplitOutputFilesMutex);
+        SplitOutputMap[Task] = OutputPath;
+      }
+
+      std::error_code EC;
+      OutputOS =
+          std::make_unique<raw_fd_ostream>(OutputPath, EC, sys::fs::OF_None);
+      if (EC)
+        return errorCodeToError(EC);
+      return std::make_unique<CachedFileStream>(std::move(OutputOS),
+                                                OutputPath);
+    }
     return std::make_unique<CachedFileStream>(std::move(OS),
                                               CGOpts.ObjectFilenameForDebug);
   };
   lto::Config Conf;
+  Conf.AcceptsMultipleOutputsPerTask = !SplitOutputList.empty();
   if (CGOpts.SaveTempsFilePrefix != "") {
     if (Error E = Conf.addSaveTemps(CGOpts.SaveTempsFilePrefix + ".",
                                     /* UseInputModulePath */ false)) {
@@ -1384,6 +1435,16 @@ runThinLTOBackend(CompilerInstance &CI, ModuleSummaryIndex *CombinedIndex,
   Conf.RemarksFormat = CGOpts.OptRecordFormat;
   Conf.SplitDwarfFile = CGOpts.SplitDwarfFile;
   Conf.SplitDwarfOutput = CGOpts.SplitDwarfOutput;
+  // Split partitions need distinct .dwo files for both split and single
+  // fission modes.
+  if (!SplitOutputList.empty() && !CGOpts.SplitDwarfFile.empty()) {
+    SmallString<128> DwoStem(CGOpts.SplitDwarfOutput.empty()
+                                ? CGOpts.SplitDwarfFile
+                                : CGOpts.SplitDwarfOutput);
+    if (llvm::sys::path::extension(DwoStem) == ".dwo")
+      llvm::sys::path::replace_extension(DwoStem, "");
+    Conf.SplitDwarfOutputStem = std::string(DwoStem);
+  }
   for (auto &Plugin : CI.getPassPlugins())
     Conf.LoadedPassPlugins.push_back(Plugin.get());
   switch (Action) {
@@ -1423,6 +1484,14 @@ runThinLTOBackend(CompilerInstance &CI, ModuleSummaryIndex *CombinedIndex,
       errs() << "Error running ThinLTO backend: " << EIB.message() << '\n';
     });
   }
+
+  if (!SplitOutputList.empty()) {
+    SmallVector<std::string, 0> Outputs;
+    for (const auto &Pair : SplitOutputMap)
+      Outputs.push_back(Pair.second);
+    if (!writeThinLTOSplitOutputList(Diags, SplitOutputList, Outputs))
+      return;
+  }
 }
 
 void clang::emitBackendOutput(CompilerInstance &CI, CodeGenOptions &CGOpts,
@@ -1434,6 +1503,14 @@ void clang::emitBackendOutput(CompilerInstance &CI, CodeGenOptions &CGOpts,
   llvm::TimeTraceScope TimeScope("Backend");
   DiagnosticsEngine &Diags = CI.getDiagnostics();
 
+  // When split codegen is active, cc1's -o stream should never receive
+  // partition content. The driver owns the final -o via ld.lld -r, and each
+  // partition is written to its own .thinlto-split.N.o file by the AddStream
+  // callback in runThinLTOBackend. Replace the original OS with a null stream
+  // to avoid opening or writing the user-specified -o path at all.
+  if (!CI.getFrontendOpts().ThinLTOSplitOutputList.empty())
+    OS = std::make_unique<raw_null_ostream>();
+
   std::unique_ptr<llvm::Module> EmptyModule;
   if (!CGOpts.ThinLTOIndexFile.empty()) {
     // FIXME(sandboxing): Figure out how to support distributed indexing.
@@ -1474,9 +1551,38 @@ void clang::emitBackendOutput(CompilerInstance &CI, CodeGenOptions &CGOpts,
     }
   }
 
+  // When split codegen is active, the skip/fallback path must emit its object
+  // to the partition-0 filename instead of the original -o. The original OS
+  // was replaced with a null stream above.
+  if (!CI.getFrontendOpts().ThinLTOSplitOutputList.empty()) {
+    std::string FallbackOutputPath =
+        getThinLTOSplitOutputFile(CI.getFrontendOpts(), 0);
+    std::error_code EC;
+    auto FallbackOS = std::make_unique<raw_fd_ostream>(FallbackOutputPath, EC,
+                                                       sys::fs::OF_None);
+    if (EC) {
+      Diags.Report(diag::err_fe_unable_to_open_output)
+          << FallbackOutputPath << EC.message();
+      return;
+    }
+    OS = std::move(FallbackOS);
+  }
+
   EmitAssemblyHelper AsmHelper(CI, CGOpts, M, VFS);
   AsmHelper.emitAssembly(Action, std::move(OS), BC);
 
+  if (!CI.getFrontendOpts().ThinLTOSplitOutputList.empty()) {
+    // If distributed ThinLTO indexing skips this backend, runThinLTOBackend is
+    // bypassed. Keep the driver merge action valid by listing the object
+    // emitted above. The real split path writes only partition objects.
+    std::string FallbackOutputPath =
+        getThinLTOSplitOutputFile(CI.getFrontendOpts(), 0);
+    if (!writeThinLTOSplitOutputList(
+            Diags, CI.getFrontendOpts().ThinLTOSplitOutputList,
+            ArrayRef<std::string>(&FallbackOutputPath, 1)))
+      return;
+  }
+
   // Verify clang's TargetInfo DataLayout against the LLVM TargetMachine's
   // DataLayout.
   if (AsmHelper.TM) {
diff --git a/clang/lib/Driver/Action.cpp b/clang/lib/Driver/Action.cpp
index 72a42a6f957ee..0d036c34f2be4 100644
--- a/clang/lib/Driver/Action.cpp
+++ b/clang/lib/Driver/Action.cpp
@@ -54,6 +54,8 @@ const char *Action::getClassName(ActionClass AC) {
     return "binary-translator";
   case ObjcopyJobClass:
     return "objcopy";
+  case ThinLTOMergeJobClass:
+    return "thinlto-merger";
   }
 
   llvm_unreachable("invalid class");
@@ -402,6 +404,11 @@ void LinkJobAction::anchor() {}
 LinkJobAction::LinkJobAction(ActionList &Inputs, types::ID Type)
     : JobAction(LinkJobClass, Inputs, Type) {}
 
+void ThinLTOMergeJobAction::anchor() {} // Empty, like the sibling anchors.
+
+ThinLTOMergeJobAction::ThinLTOMergeJobAction(ActionList &Inputs, types::ID Type)
+    : JobAction(ThinLTOMergeJobClass, Inputs, Type) {}
+
 void LipoJobAction::anchor() {}
 
 LipoJobAction::LipoJobAction(ActionList &Inputs, types::ID Type)
diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index 4a968a4ce5cc0..139f508cfc1a9 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -59,6 +59,7 @@
 #include "clang/Basic/Version.h"
 #include "clang/Config/config.h"
 #include "clang/Driver/Action.h"
+#include "clang/Driver/CommonArgs.h"
 #include "clang/Driver/Compilation.h"
 #include "clang/Driver/InputInfo.h"
 #include "clang/Driver/Job.h"
@@ -4623,8 +4624,18 @@ void Driver::BuildActions(Compilation &C, DerivedArgList &Args,
     }
 
     // If we ended with something, add to the output list.
-    if (Current)
+    if (Current) {
+      // ThinLTO split codegen has cc1 emit one object per partition; append a
+      // merge action (`ld.lld -r`) to recombine them. Gating must match the cc1
+      // side in Clang::ConstructJob; both use isThinLTOSplitMergeEnabled.
+      if (Current->getType() == types::TY_Object &&
+          tools::isThinLTOSplitMergeEnabled(C.getDefaultToolChain(), Args)) {
+        ActionList Inputs;
+        Inputs.push_back(Current);
+        Current = C.MakeAction<ThinLTOMergeJobAction>(Inputs, types::TY_Object);
+      }
       Actions.push_back(Current);
+    }
 
     // Add any top level actions generated for offloading.
     if (!UseNewOffloadingDriver)
@@ -5721,7 +5732,16 @@ class ToolSelector final {
 
   /// Return true if an assemble action can be collapsed.
   bool canCollapseAssembleAction() const {
-    return TC.useIntegratedAs() && !SaveTemps &&
+    // ThinLTO split codegen requires multiple native object outputs per task
+    // (AcceptsMultipleOutputsPerTask), which the assembly emission path (-S)
+    // cannot provide. When -save-temps would normally prevent collapsing the
+    // assemble step, still collapse it for the ELF ThinLTO split backend case
+    // so cc1 emits objects directly rather than going through assembly.
+    bool SaveTempsBlock = SaveTemps &&
+      !(C.getArgs().hasArg(options::OPT_fthinlto_index_EQ) &&
+        TC.getTriple().isOSBinFormatELF() &&
+        tools::isThinLTOSplitEnabled(C.getArgs()));
+    return TC.useIntegratedAs() && !SaveTempsBlock &&
            !C.getArgs().hasArg(options::OPT_via_file_asm) &&
            !C.getArgs().hasArg(options::OPT__SLASH_FA) &&
            !C.getArgs().hasArg(options::OPT__SLASH_Fa) &&
diff --git a/clang/lib/Driver/Job.cpp b/clang/lib/Driver/Job.cpp
index da7a1f2e07e90..7af950764054b 100644
--- a/clang/lib/Driver/Job.cpp
+++ b/clang/lib/Driver/Job.cpp
@@ -19,9 +19,12 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/StringSet.h"
 #include "llvm/ADT/StringSwitch.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/CrashRecoveryContext.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/IOSandbox.h"
+#include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/Path.h"
 #include "llvm/Support/PrettyStackTrace.h"
 #include "llvm/Support/Program.h"
@@ -452,6 +455,47 @@ void CC1Command::setEnvironment(llvm::ArrayRef<const char *> NewEnvironment) {
       "The CC1Command doesn't support changing the environment vars!");
 }
 
+ThinLTOMergeCommand::ThinLTOMergeCommand(
+    const Action &Source, const Tool &Creator,
+    ResponseFileSupport ResponseSupport, const char *Executable,
+    const llvm::opt::ArgStringList &Arguments, ArrayRef<InputInfo> Inputs,
+    ArrayRef<InputInfo> Outputs, StringRef SplitOutputList,
+    bool CleanupSplitOutputs)
+    : Command(Source, Creator, ResponseSupport, Executable, Arguments, Inputs,
+              Outputs), // The base class runs the job itself.
+      SplitOutputList(SplitOutputList.str()), // Own a copy of the path.
+      CleanupSplitOutputs(CleanupSplitOutputs) {}
+
+void ThinLTOMergeCommand::cleanupSplitOutputs() const {
+  // Best-effort cleanup: re-read the list cc1 wrote and delete each object it
+  // names. Per-partition .dwo files are not in the list and are kept; they are
+  // the final debug output that the merged object's skeleton CUs refer to.
+  auto ListOrErr = llvm::MemoryBuffer::getFile(
+      SplitOutputList, /*IsText=*/true, /*RequiresNullTerminator=*/false);
+  if (!ListOrErr)
+    return; // Nothing to do if the list cannot be read.
+
+  llvm::BumpPtrAllocator Arena;
+  llvm::StringSaver Strings(Arena);
+  SmallVector<const char *, 16> Paths;
+  llvm::cl::TokenizeGNUCommandLine(ListOrErr.get()->getBuffer(), Strings,
+                                   Paths);
+  for (const char *Path : Paths)
+    llvm::sys::fs::remove(Path);
+}
+
+int ThinLTOMergeCommand::Execute(ArrayRef<std::optional<StringRef>> Redirects,
+                                 std::string *ErrMsg,
+                                 bool *ExecutionFailed) const {
+  const int ExitCode = Command::Execute(Redirects, ErrMsg, ExecutionFailed);
+  // Delete the partition inputs only after a fully successful merge; on any
+  // failure they stay so the failing `ld.lld -r` can be re-run or inspected.
+  const bool FailedToLaunch = ExecutionFailed && *ExecutionFailed;
+  if (CleanupSplitOutputs && ExitCode == 0 && !FailedToLaunch)
+    cleanupSplitOutputs();
+  return ExitCode;
+}
+
 void JobList::Print(raw_ostream &OS, const char *Terminator, bool Quote,
                     CrashReportInfo *CrashInfo) const {
   for (const auto &Job : *this)
diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp
index 511eb3757456b..583944b193b11 100644
--- a/clang/lib/Driver/ToolChain.cpp
+++ b/clang/lib/Driver/ToolChain.cpp
@@ -777,6 +777,7 @@ Tool *ToolChain::getTool(Action::ActionClass AC) const {
   case Action::BinaryAnalyzeJobClass:
   case Action::BinaryTranslatorJobClass:
   case Action::ObjcopyJobClass:
+  case Action::ThinLTOMergeJobClass:
     llvm_unreachable("Invalid tool kind.");
 
   case Action::CompileJobClass:
@@ -1222,9 +1223,23 @@ Tool *ToolChain::SelectTool(const JobAction &JA) const {
   if (AC == Action::AssembleJobClass && useIntegratedAs() &&
       !getTriple().isOSAIX())
     return getClangAs();
+  // ThinLTOMergeJobAction is only generated for ELF targets (see
+  // Driver::BuildActions). Assert here to catch accidental routing to
+  // non-ELF toolchains that do not implement the merge logic.
+  if (AC == Action::ThinLTOMergeJobClass) {
+    assert(getTriple().isOSBinFormatELF() &&
+           "ThinLTOMergeJobAction should only be generated for ELF targets");
+    return getLink();
+  }
   return getTool(AC);
 }
 
+bool ToolChain::canConstructThinLTOMergeJob() const {
+  ActionList NoInputs; // Probe with a throwaway, input-less link action.
+  const LinkJobAction ProbeJA(NoInputs, types::TY_Image);
+  return SelectTool(ProbeJA)->canConstructThinLTOMergeJob(); // Ask that tool.
+}
+
 std::string ToolChain::GetFilePath(const char *Name) const {
   return D.GetFilePath(Name, *this);
 }
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 05e1f6db80a11..a10cce907ce47 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -5399,6 +5399,17 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
     if (!types::isLLVMIR(Input.getType()))
       D.Diag(diag::err_drv_arg_requires_bitcode_input) << A->getAsString(Args);
     Args.AddLastArg(CmdArgs, options::OPT_fthinlto_index_EQ);
+
+    // For ThinLTO split codegen, have cc1 write each call-graph partition to a
+    // separate object listed in a response file; the driver's
+    // ThinLTOMergeJobAction merges them with `ld.lld -r`. Gate on the default
+    // toolchain (as Driver::BuildActions does) so this flag and the merge action
+    // stay in lockstep. See isThinLTOSplitMergeEnabled.
+    if (Output.isFilename() && Output.getType() == types::TY_Object &&
+        isThinLTOSplitMergeEnabled(C.getDefaultToolChain(), Args))
+      CmdArgs.push_back(Args.MakeArgString(
+          Twine("-thinlto-split-output-list=") +
+          getThinLTOSplitResponseFile(Output.getFilename())));
   }
 
   if (Triple.isPPC())
diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp
index 6f0ac7d5159c1..e52d8212fd496 100644
--- a/clang/lib/Driver/ToolChains/CommonArgs.cpp
+++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp
@@ -941,6 +941,38 @@ llvm::StringRef tools::getLTOParallelism(const ArgList &Args, const Driver &D) {
   return LtoJobsArg->getValue();
 }
 
+/// Returns whether `-mllvm -thinlto-split[=true|false]` is on; last one wins.
+// NOTE(review): "=1"/"=0" spellings are not recognized here; confirm intended.
+bool tools::isThinLTOSplitEnabled(const ArgList &Args) {
+  bool SplitOn = false; // Result when the option never appears.
+  for (const Arg *MLLVMArg : Args.filtered(options::OPT_mllvm))
+    for (StringRef Val : MLLVMArg->getValues()) {
+      if (Val == "-thinlto-split=false")
+        SplitOn = false;
+      else if (Val == "-thinlto-split" || Val == "-thinlto-split=true")
+        SplitOn = true;
+    }
+  return SplitOn;
+}
+
+std::string tools::getThinLTOSplitResponseFile(StringRef Output) {
+  return (Output + ".thinlto-split.rsp").str(); // Output path + fixed suffix.
+}
+
+bool tools::isThinLTOSplitMergeEnabled(const ToolChain &TC,
+                                       const ArgList &Args) {
+  // Single gate shared by the cc1 output-list flag and the driver merge action.
+  // It covers only the distributed backend compile (-fthinlto-index) under -c:
+  // the merged object must be the final output, and without -c the object
+  // feeds a link step for which no merge action is created.
+  if (!isThinLTOSplitEnabled(Args) ||
+      !Args.hasArg(options::OPT_fthinlto_index_EQ) ||
+      !Args.hasArg(options::OPT_c))
+    return false;
+  // Consult the toolchain only once the argument checks have passed.
+  return TC.getTriple().isOSBinFormatELF() && TC.canConstructThinLTOMergeJob();
+}
+
 // PS4/PS5 uses -ffunction-sections and -fdata-sections by default.
 bool tools::isUseSeparateSections(const llvm::Triple &Triple) {
   return Triple.isPS();
diff --git a/clang/lib/Driver/ToolChains/Gnu.cpp b/clang/lib/Driver/ToolChains/Gnu.cpp
index 131dd725c7289..72c2e93f9f824 100644
--- a/clang/lib/Driver/ToolChains/Gnu.cpp
+++ b/clang/lib/Driver/ToolChains/Gnu.cpp
@@ -27,6 +27,7 @@
 #include "llvm/ADT/Twine.h"
 #include "llvm/Option/ArgList.h"
 #include "llvm/Support/CodeGen.h"
+#include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Path.h"
 #include "llvm/Support/VirtualFileSystem.h"
 #include "llvm/TargetParser/RISCVISAInfo.h"
@@ -280,6 +281,46 @@ void tools::gnutools::Linker::ConstructJob(Compilation &C, const JobAction &JA,
   const auto &ToolChain = static_cast<const Generic_ELF &>(getToolChain());
   const Driver &D = ToolChain.getDriver();
 
+  // ThinLTOMergeJobAction is ELF-only. See ToolChain::SelectTool for the
+  // routing assertion and Driver::BuildActions for the ELF pre-condition.
+  if (isa<ThinLTOMergeJobAction>(JA)) {
+    ArgStringList CmdArgs;
+    const char *BaseInput = nullptr;
+    for (const auto &II : Inputs) {
+      if (II.isFilename()) {
+        BaseInput = II.getFilename();
+        break;
+      }
+    }
+    assert(BaseInput && "ThinLTO merge job requires an input file");
+
+    // Response file where cc1 listed the partition objects (shared helper keeps
+    // the name in sync with Clang::ConstructJob). Register it as a Compilation
+    // temporary so it is cleaned up normally (honoring -save-temps).
+    const char *ResponseFile =
+        Args.MakeArgString(tools::getThinLTOSplitResponseFile(BaseInput));
+    C.addTempFile(ResponseFile);
+
+    CmdArgs.push_back("-r");
+    CmdArgs.push_back("-o");
+    CmdArgs.push_back(Output.getFilename());
+    CmdArgs.push_back(Args.MakeArgString(Twine("@") + ResponseFile));
+
+    // Use clang's normal program lookup so -B and configured program paths can
+    // select the matching ld.lld.
+    std::string LLDPath = ToolChain.GetProgramPath("ld.lld");
+    if (!llvm::sys::fs::can_execute(LLDPath)) {
+      D.Diag(clang::diag::err_drv_lto_split_requires_lld) << LLDPath;
+      return;
+    }
+    const char *Exec = Args.MakeArgString(LLDPath);
+
+    C.addCommand(std::make_unique<ThinLTOMergeCommand>(
+        JA, *this, ResponseFileSupport::AtFileCurCP(), Exec, CmdArgs, Inputs,
+        Output, ResponseFile, !D.isSaveTempsEnabled()));
+    return;
+  }
+
   const llvm::Triple &Triple = getToolChain().getEffectiveTriple();
 
   const llvm::Triple::ArchType Arch = ToolChain.getArch();
diff --git a/clang/lib/Driver/ToolChains/Gnu.h b/clang/lib/Driver/ToolChains/Gnu.h
index 5fe143b4aa035..5aaccebe1cfde 100644
--- a/clang/lib/Driver/ToolChains/Gnu.h
+++ b/clang/lib/Driver/ToolChains/Gnu.h
@@ -58,6 +58,7 @@ class LLVM_LIBRARY_VISIBILITY Linker : public Tool {
 
   bool hasIntegratedCPP() const override { return false; }
   bool isLinkJob() const override { return true; }
+  bool canConstructThinLTOMergeJob() const override { return true; }
 
   void ConstructJob(Compilation &C, const JobAction &JA,
                     const InputInfo &Output, const InputInfoList &Inputs,
diff --git a/clang/test/CodeGen/thinlto-split/split-output-list-dwo.ll b/clang/test/CodeGen/thinlto-split/split-output-list-dwo.ll
new file mode 100644
index 0000000000000..e3064684e542a
--- /dev/null
+++ b/clang/test/CodeGen/thinlto-split/split-output-list-dwo.ll
@@ -0,0 +1,169 @@
+; REQUIRES: aarch64-registered-target
+
+; End-to-end test for ThinLTO split codegen + gsplit-dwarf.
+;
+; Verifies that when -gsplit-dwarf is combined with ThinLTO split codegen:
+;   1. Each partition generates its own .dwo file named <stem>.thinlto-split.<Task>.dwo
+;   2. .o and .dwo partition numbering is consistent
+;   3. Each partition's skeleton CU references the correct per-partition .dwo via DW_AT_dwo_name
+;   4. DWO_id is present in both the skeleton CU and the corresponding .dwo (values are not compared)
+;   5. lld -r merge produces a final output.o whose skeleton CUs still reference valid .dwo files
+;   6. No 4294967295.dwo is generated
+;   7. No shared single .dwo file is generated (no output.dwo without partition suffix)
+;   8. Both -gsplit-dwarf=split and -gsplit-dwarf=single produce per-partition .dwo
+
+; --- Step 1: Generate ThinLTO bitcode with debug info ---
+; The IR module must have !dbg metadata to produce DWARF output.
+; RUN: opt -thinlto-bc -thinlto-split-lto-unit -o %t.o %s
+
+; --- Step 2: Generate distributed ThinLTO index ---
+; RUN: llvm-lto2 run -thinlto-distributed-indexes %t.o \
+; RUN:   -o %t.index \
+; RUN:   -r=%t.o,caller_a,px \
+; RUN:   -r=%t.o,caller_b,px
+
+; --- Step 3: -gsplit-dwarf=split path (both -split-dwarf-file and -split-dwarf-output) ---
+; RUN: %clang_cc1 -triple aarch64-unknown-linux-gnu \
+; RUN:   -emit-obj -fthinlto-index=%t.o.thinlto.bc \
+; RUN:   -thinlto-split-output-list=%t.split.rsp \
+; RUN:   -split-dwarf-file %t.split.o.dwo \
+; RUN:   -split-dwarf-output %t.split.o.dwo \
+; RUN:   -o %t.split.o -x ir %t.o \
+; RUN:   -debug-info-kind=constructor -dwarf-version=5 \
+; RUN:   -mllvm -thinlto-split=true \
+; RUN:   -mllvm -thinlto-split-partitions=2 \
+; RUN:   -mllvm -thinlto-split-module-size-threshold=0 \
+; RUN:   -mllvm -thinlto-split-module-size-rate-threshold=2.0
+
+; --- Step 3a: Verify per-partition .dwo files exist ---
+; RUN: ls %t.split.o.thinlto-split.0.dwo
+; RUN: ls %t.split.o.thinlto-split.1.dwo
+
+; --- Step 3b: Verify NO single shared .dwo ---
+; RUN: not ls %t.split.o.dwo 2>/dev/null
+
+; --- Step 3c: Verify no 4294967295.dwo ---
+; RUN: not ls %t.split.o.thinlto-split.4294967295.dwo 2>/dev/null
+
+; --- Step 3d: Verify .dwo files are valid ELF ---
+; RUN: llvm-readobj -h %t.split.o.thinlto-split.0.dwo | FileCheck %s --check-prefix=DWO0-ELF
+; DWO0-ELF: Type: Relocatable
+
+; RUN: llvm-readobj -h %t.split.o.thinlto-split.1.dwo | FileCheck %s --check-prefix=DWO1-ELF
+; DWO1-ELF: Type: Relocatable
+
+; --- Step 3e: Verify skeleton CU DW_AT_dwo_name ---
+; RUN: llvm-dwarfdump -debug-info %t.split.o.thinlto-split.0.o | FileCheck %s --check-prefix=SKELETON0
+; SKELETON0: DW_TAG_skeleton_unit
+; SKELETON0: DW_AT_dwo_name{{.*}}thinlto-split.0.dwo
+
+; RUN: llvm-dwarfdump -debug-info %t.split.o.thinlto-split.1.o | FileCheck %s --check-prefix=SKELETON1
+; SKELETON1: DW_TAG_skeleton_unit
+; SKELETON1: DW_AT_dwo_name{{.*}}thinlto-split.1.dwo
+
+; --- Step 3f: Verify DWO_id is present in both skeleton CU and .dwo ---
+; FileCheck variables cannot span separate RUN lines, so we verify presence
+; and format rather than exact value equality across files. End-to-end shell
+; testing confirms DWO_id values match between skeleton CU and .dwo.
+; RUN: llvm-dwarfdump -debug-info %t.split.o.thinlto-split.0.o | FileCheck %s --check-prefix=DWO_ID0
+; DWO_ID0: DWO_id = 0x{{[0-9a-f]+}}
+
+; RUN: llvm-dwarfdump -debug-info %t.split.o.thinlto-split.0.dwo | FileCheck %s --check-prefix=DWO_DWO0
+; DWO_DWO0: DWO_id = 0x{{[0-9a-f]+}}
+
+; RUN: llvm-dwarfdump -debug-info %t.split.o.thinlto-split.1.o | FileCheck %s --check-prefix=DWO_ID1
+; DWO_ID1: DWO_id = 0x{{[0-9a-f]+}}
+
+; RUN: llvm-dwarfdump -debug-info %t.split.o.thinlto-split.1.dwo | FileCheck %s --check-prefix=DWO_DWO1
+; DWO_DWO1: DWO_id = 0x{{[0-9a-f]+}}
+
+; --- Step 4: Verify lld -r merge preserves .dwo references ---
+; RUN: ld.lld -r -o %t.merged.o %t.split.o.thinlto-split.0.o %t.split.o.thinlto-split.1.o
+
+; RUN: llvm-readobj -h %t.merged.o | FileCheck %s --check-prefix=MERGED-ELF
+; MERGED-ELF: Type: Relocatable
+
+; RUN: llvm-dwarfdump -debug-info %t.merged.o | FileCheck %s --check-prefix=MERGED-DWO
+; MERGED-DWO: DW_TAG_skeleton_unit
+; MERGED-DWO: DW_AT_dwo_name{{.*}}thinlto-split.0.dwo
+; MERGED-DWO: DW_TAG_skeleton_unit
+; MERGED-DWO: DW_AT_dwo_name{{.*}}thinlto-split.1.dwo
+
+; --- Step 5: -gsplit-dwarf=single path (only -split-dwarf-file, no -split-dwarf-output) ---
+; -gsplit-dwarf=single tells cc1 to embed .dwo content into the .o file and
+; not write a separate .dwo file. However, when ThinLTO split is active,
+; per-partition .dwo naming must still be used so each partition's skeleton CU
+; can reference its own .dwo. This tests that SplitDwarfOutputStem is derived
+; from SplitDwarfFile when SplitDwarfOutput is absent.
+
+; RUN: %clang_cc1 -triple aarch64-unknown-linux-gnu \
+; RUN:   -emit-obj -fthinlto-index=%t.o.thinlto.bc \
+; RUN:   -thinlto-split-output-list=%t.single.rsp \
+; RUN:   -split-dwarf-file %t.single.o.dwo \
+; RUN:   -o %t.single.o -x ir %t.o \
+; RUN:   -debug-info-kind=constructor -dwarf-version=5 \
+; RUN:   -mllvm -thinlto-split=true \
+; RUN:   -mllvm -thinlto-split-partitions=2 \
+; RUN:   -mllvm -thinlto-split-module-size-threshold=0 \
+; RUN:   -mllvm -thinlto-split-module-size-rate-threshold=2.0
+
+; --- Step 5a: Verify per-partition .dwo files exist even without -split-dwarf-output ---
+; RUN: ls %t.single.o.thinlto-split.0.dwo
+; RUN: ls %t.single.o.thinlto-split.1.dwo
+
+; --- Step 5b: Verify skeleton CU DW_AT_dwo_name ---
+; RUN: llvm-dwarfdump -debug-info %t.single.o.thinlto-split.0.o | FileCheck %s --check-prefix=SINGLE0
+; SINGLE0: DW_TAG_skeleton_unit
+; SINGLE0: DW_AT_dwo_name{{.*}}thinlto-split.0.dwo
+
+; RUN: llvm-dwarfdump -debug-info %t.single.o.thinlto-split.1.o | FileCheck %s --check-prefix=SINGLE1
+; SINGLE1: DW_TAG_skeleton_unit
+; SINGLE1: DW_AT_dwo_name{{.*}}thinlto-split.1.dwo
+
+; --- Step 6: Verify non-split path with gsplit-dwarf still works ---
+; RUN: %clang_cc1 -triple aarch64-unknown-linux-gnu \
+; RUN:   -emit-obj -fthinlto-index=%t.o.thinlto.bc \
+; RUN:   -split-dwarf-file %t.nosplit.o.dwo \
+; RUN:   -split-dwarf-output %t.nosplit.o.dwo \
+; RUN:   -o %t.nosplit.o -x ir %t.o \
+; RUN:   -debug-info-kind=constructor -dwarf-version=5 \
+; RUN:   -mllvm -thinlto-split=false
+
+; RUN: ls %t.nosplit.o.dwo
+; RUN: not ls %t.nosplit.o.thinlto-split.0.dwo 2>/dev/null
+
+; --- IR source module with debug metadata ---
+target triple = "aarch64-unknown-linux-gnu"
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "test", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None)
+!1 = !DIFile(filename: "source.c", directory: "/test")
+!2 = !DISubroutineType(types: !{})
+!3 = !{i32 2, !"Dwarf Version", i32 5}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+
+define internal void @shared() !dbg !5 {
+entry:
+  ret void
+}
+
+define void @caller_a() !dbg !7 {
+entry:
+  call void @shared(), !dbg !9
+  ret void
+}
+
+define void @caller_b() !dbg !10 {
+entry:
+  call void @shared(), !dbg !12
+  ret void
+}
+
+!5 = distinct !DISubprogram(name: "shared", scope: !1, file: !1, line: 1, type: !2, spFlags: DISPFlagDefinition, unit: !0)
+!7 = distinct !DISubprogram(name: "caller_a", scope: !1, file: !1, line: 4, type: !2, spFlags: DISPFlagDefinition, unit: !0)
+!9 = !DILocation(line: 5, column: 3, scope: !7)
+!10 = distinct !DISubprogram(name: "caller_b", scope: !1, file: !1, line: 8, type: !2, spFlags: DISPFlagDefinition, unit: !0)
+!12 = !DILocation(line: 9, column: 3, scope: !10)
\ No newline at end of file
diff --git a/clang/test/CodeGen/thinlto-split/split-output-list.ll b/clang/test/CodeGen/thinlto-split/split-output-list.ll
new file mode 100644
index 0000000000000..88abd8dd87019
--- /dev/null
+++ b/clang/test/CodeGen/thinlto-split/split-output-list.ll
@@ -0,0 +1,75 @@
+; REQUIRES: aarch64-registered-target
+
+; RUN: opt -thinlto-bc -o %t.o %s
+; RUN: llvm-lto2 run -thinlto-distributed-indexes %t.o \
+; RUN:   -o %t.index \
+; RUN:   -r=%t.o,caller_a,px \
+; RUN:   -r=%t.o,caller_b,px
+
+; RUN: %clang_cc1 -triple aarch64-unknown-linux-gnu \
+; RUN:   -emit-obj -fthinlto-index=%t.o.thinlto.bc \
+; RUN:   -o %t.split.o -x ir %t.o \
+; RUN:   -mllvm -thinlto-split=true \
+; RUN:   -mllvm -thinlto-split-partitions=2 \
+; RUN:   -mllvm -thinlto-split-module-size-threshold=0 \
+; RUN:   -mllvm -thinlto-split-module-size-rate-threshold=2.0 \
+; RUN:   -thinlto-split-output-list=%t.split.rsp
+; RUN: FileCheck %s --check-prefix=SPLIT-RSP --input-file=%t.split.rsp
+; RUN: llvm-nm %t.split.o.thinlto-split.0.o | FileCheck %s --check-prefix=NM0
+; RUN: llvm-nm %t.split.o.thinlto-split.1.o | FileCheck %s --check-prefix=NM1
+
+; RUN: %clang_cc1 -triple aarch64-unknown-linux-gnu \
+; RUN:   -emit-obj -fthinlto-index=%t.o.thinlto.bc \
+; RUN:   -o %t.skip.o -x ir %t.o \
+; RUN:   -mllvm -thinlto-split=true \
+; RUN:   -mllvm -thinlto-split-partitions=2 \
+; RUN:   -thinlto-split-output-list=%t.skip.rsp
+; RUN: FileCheck %s --check-prefix=SKIP-RSP --input-file=%t.skip.rsp
+
+; SPLIT-RSP: {{.*}}.split.o.thinlto-split.0.o
+; SPLIT-RSP-NEXT: {{.*}}.split.o.thinlto-split.1.o
+; SPLIT-RSP-NOT: {{.*}}.split.o{{$}}
+; SPLIT-RSP-NOT: thinlto-split.2.o
+; SPLIT-RSP-NOT: {{.*}}.merged.o
+; SPLIT-RSP-NOT: 4294967295
+
+; SKIP-RSP: {{.*}}.skip.o.thinlto-split.0.o
+; SKIP-RSP-NOT: {{.*}}.skip.o{{$}}
+; SKIP-RSP-NOT: thinlto-split.1.o
+; SKIP-RSP-NOT: 4294967295
+
+; Verify that the user-specified -o path is empty (0 bytes) when split codegen
+; is active. In the split path, cc1 replaces the original output stream with a
+; null stream so no object content lands in the -o file. The driver's ld.lld -r
+; merge produces the final -o content separately.
+; RUN: wc -c %t.split.o | FileCheck %s --check-prefix=EMPTY-SPLIT
+; EMPTY-SPLIT: 0
+; RUN: wc -c %t.skip.o | FileCheck %s --check-prefix=EMPTY-SKIP
+; EMPTY-SKIP: 0
+
+; NM0: T caller_b
+; NM0: T {{.*shared[._][0-9a-f]+.*}}
+; NM0-NOT: T shared{{$}}
+
+; NM1: T caller_a
+; NM1: U {{.*shared[._][0-9a-f]+.*}}
+; NM1-NOT: T shared{{$}}
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define internal void @shared() {
+entry:
+  ret void
+}
+
+define void @caller_a() {
+entry:
+  call void @shared()
+  ret void
+}
+
+define void @caller_b() {
+entry:
+  call void @shared()
+  ret void
+}
diff --git a/llvm/test/Transforms/SplitModuleCG/split-promoted-rename.ll b/clang/test/CodeGen/thinlto-split/split-promoted-rename.ll
similarity index 58%
rename from llvm/test/Transforms/SplitModuleCG/split-promoted-rename.ll
rename to clang/test/CodeGen/thinlto-split/split-promoted-rename.ll
index 6c51141a9ad85..2cf3aa41a9c75 100644
--- a/llvm/test/Transforms/SplitModuleCG/split-promoted-rename.ll
+++ b/clang/test/CodeGen/thinlto-split/split-promoted-rename.ll
@@ -1,13 +1,21 @@
+; REQUIRES: aarch64-registered-target
 ; Test that internal symbols promoted during module splitting are consistently
 ; renamed with an MD5 suffix across all partitions.
 ;
-; RUN: opt -module-summary %s -o %t.bc
-; RUN: llvm-lto2 run %t.bc -o %t \
-; RUN:   -thinlto-split=true \
-; RUN:   -thinlto-split-partitions=2 -thinlto-split-module-size-threshold=0 \
-; RUN:   -r=%t.bc,caller_a,px \
-; RUN:   -r=%t.bc,caller_b,px
-; RUN: llvm-nm %t.1 | FileCheck %s
+; RUN: opt -thinlto-bc -o %t.o %s
+; RUN: llvm-lto2 run -thinlto-distributed-indexes %t.o \
+; RUN:   -o %t.index \
+; RUN:   -r=%t.o,caller_a,px \
+; RUN:   -r=%t.o,caller_b,px
+
+; RUN: %clang_cc1 -triple aarch64-unknown-linux-gnu \
+; RUN:   -emit-obj -fthinlto-index=%t.o.thinlto.bc \
+; RUN:   -o %t.split.o -x ir %t.o \
+; RUN:   -mllvm -thinlto-split=true \
+; RUN:   -mllvm -thinlto-split-partitions=1 \
+; RUN:   -mllvm -thinlto-split-module-size-threshold=0 \
+; RUN:   -thinlto-split-output-list=%t.split.rsp
+; RUN: llvm-nm %t.split.o.thinlto-split.0.o | FileCheck %s
 
 ; CHECK-DAG: T caller_a
 ; CHECK-DAG: T caller_b
@@ -15,7 +23,7 @@
 ; CHECK-NOT: T promoted_internal{{$}}
 
 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
+target triple = "aarch64-unknown-linux-gnu"
 
 ; @promoted_internal is internal. SplitModuleCG::dealWithMpart's checkPromoted
 ; records it in PromotedRenames. splitOptAndCodeGenThin applies the rename
diff --git a/clang/test/CodeGen/thinlto-split/thinlto-split-merge-realistic.ll b/clang/test/CodeGen/thinlto-split/thinlto-split-merge-realistic.ll
new file mode 100644
index 0000000000000..c024e4c37f2fb
--- /dev/null
+++ b/clang/test/CodeGen/thinlto-split/thinlto-split-merge-realistic.ll
@@ -0,0 +1,251 @@
+; REQUIRES: aarch64-registered-target
+
+; End-to-end test for ThinLTO split + Driver-mediated lld merge.
+;
+; This test simulates a realistic distributed ThinLTO backend scenario:
+;   1. Generate ThinLTO bitcode from an IR module containing multiple functions,
+;      global variables, global constructors (init_array), internal functions
+;      referenced across partitions, comdat/weak symbols.
+;   2. Generate the distributed ThinLTO index.
+;   3. Invoke clang Driver to compile with -fthinlto-index=... and -thinlto-split,
+;      producing partition objects and lld -r merging them into a single output.o.
+;   4. Validate the merged output.o is a valid ELF relocatable object with
+;      expected symbols, sections, and correct RSP ordering.
+
+; --- Step 1: Generate ThinLTO bitcode ---
+; RUN: opt -thinlto-bc -thinlto-split-lto-unit -o %t.o %s
+
+; --- Step 2: Generate distributed ThinLTO index ---
+; RUN: llvm-lto2 run -thinlto-distributed-indexes %t.o \
+; RUN:   -o %t.index \
+; RUN:   -r=%t.o,func_a,px \
+; RUN:   -r=%t.o,func_b,px \
+; RUN:   -r=%t.o,func_c,px \
+; RUN:   -r=%t.o,func_d,px \
+; RUN:   -r=%t.o,func_e,px \
+; RUN:   -r=%t.o,weak_func,px \
+; RUN:   -r=%t.o,g_global,px \
+; RUN:   -r=%t.o,g_ctor_data,px \
+; RUN:   -r=%t.o,comdat_var,px
+
+; --- Step 3: clang_cc1 split path — verify partition objects and RSP ---
+; RUN: %clang_cc1 -triple aarch64-unknown-linux-gnu \
+; RUN:   -emit-obj -fthinlto-index=%t.o.thinlto.bc \
+; RUN:   -thinlto-split-output-list=%t.cc1.rsp \
+; RUN:   -o %t.cc1.o -x ir %t.o \
+; RUN:   -mllvm -thinlto-split=true \
+; RUN:   -mllvm -thinlto-split-partitions=2 \
+; RUN:   -mllvm -thinlto-split-module-size-threshold=0 \
+; RUN:   -mllvm -thinlto-split-module-size-rate-threshold=2.0
+
+; Verify RSP lists partition objects 0, 1 in order, and contains neither the
+; bare -o output nor a uint32_max (4294967295) task id.
+; RUN: FileCheck %s --check-prefix=RSP-RSP --input-file=%t.cc1.rsp
+; RSP-RSP: {{.*\.thinlto-split\.0\.o}}
+; RSP-RSP-NEXT: {{.*\.thinlto-split\.1\.o}}
+; RSP-RSP-NOT: {{\.cc1\.o$}}
+; RSP-RSP-NOT: 4294967295
+
+; Verify partition objects are valid ELF
+; RUN: llvm-readobj -h %t.cc1.o.thinlto-split.0.o | FileCheck %s --check-prefix=PART0-ELF
+; PART0-ELF: Type: Relocatable
+
+; RUN: llvm-readobj -h %t.cc1.o.thinlto-split.1.o | FileCheck %s --check-prefix=PART1-ELF
+; PART1-ELF: Type: Relocatable
+
+; Verify inter-partition symbol promotion: shared_helper should have hash suffix
+; RUN: llvm-nm %t.cc1.o.thinlto-split.0.o | FileCheck %s --check-prefix=NM0
+; NM0-DAG: T {{shared_helper[._]}}
+; NM0-DAG: T func_
+; NM0-DAG: D g_global
+; NM0-DAG: V comdat_var
+
+; RUN: llvm-nm %t.cc1.o.thinlto-split.1.o | FileCheck %s --check-prefix=NM1
+; NM1-DAG: U {{shared_helper[._]}}
+; NM1-DAG: T func_
+
+; Verify the cc1 -o path exists but is empty (split path writes partition objects instead)
+; RUN: wc -c %t.cc1.o | FileCheck %s --check-prefix=EMPTY-O
+; EMPTY-O: 0
+
+; --- Step 4: clang Driver split path — verify full end-to-end ---
+; The Driver must: (a) invoke cc1 with -thinlto-split-output-list, (b) invoke
+; sibling ld.lld -r to merge partition objects, (c) produce a single valid output.o
+
+; First, verify the Driver generates correct command lines (-###)
+; RUN: %clang -### -target aarch64-unknown-linux-gnu \
+; RUN:   -B%S/Inputs/lld \
+; RUN:   -c -fthinlto-index=%t.o.thinlto.bc -x ir %t.o -o %t.driver.o \
+; RUN:   -mllvm -thinlto-split=true \
+; RUN:   -mllvm -thinlto-split-partitions=2 \
+; RUN:   -mllvm -thinlto-split-module-size-threshold=0 \
+; RUN:   -mllvm -thinlto-split-module-size-rate-threshold=2.0 2>&1 | FileCheck %s --check-prefix=DRIVER
+
+; DRIVER: "-cc1"
+; DRIVER-SAME: "-fthinlto-index={{.*}}.thinlto.bc"
+; DRIVER-SAME: "-thinlto-split-output-list=[[RSP:[^"]+\.thinlto-split\.rsp]]"
+; DRIVER: "{{.*}}ld.lld" "-r" "-o" "{{.*}}driver.o" "@[[RSP]]"
+
+; Non-split path should NOT have -thinlto-split-output-list or ld.lld
+; RUN: %clang -### -target aarch64-unknown-linux-gnu \
+; RUN:   -B%S/Inputs/lld \
+; RUN:   -c -fthinlto-index=%t.o.thinlto.bc -x ir %t.o -o %t.nosplit_driver.o \
+; RUN:   -mllvm -thinlto-split=false 2>&1 | FileCheck %s --check-prefix=NOSPLIT-DRIVER
+
+; NOSPLIT-DRIVER: "-cc1"
+; NOSPLIT-DRIVER-NOT: thinlto-split-output-list
+; NOSPLIT-DRIVER-NOT: ld.lld
+
+; --- Step 4b: Verify -save-temps + ThinLTO split ---
+; -save-temps normally prevents collapsing the assemble step, but ThinLTO split
+; must still emit objects directly (-emit-obj) because the assembly path cannot
+; produce multiple partition outputs (AcceptsMultipleOutputsPerTask).
+
+; RUN: %clang -### -target aarch64-unknown-linux-gnu \
+; RUN:   -B%S/Inputs/lld \
+; RUN:   -save-temps -c -fthinlto-index=%t.o.thinlto.bc -x ir %t.o -o %t.save.o \
+; RUN:   -mllvm -thinlto-split=true \
+; RUN:   -mllvm -thinlto-split-partitions=2 \
+; RUN:   -mllvm -thinlto-split-module-size-threshold=0 \
+; RUN:   -mllvm -thinlto-split-module-size-rate-threshold=2.0 2>&1 | FileCheck %s --check-prefix=SAVE-TEMPS
+
+; cc1 must use -emit-obj (not -S) even with -save-temps
+; SAVE-TEMPS: "-cc1"
+; SAVE-TEMPS-SAME: "-emit-obj"
+; SAVE-TEMPS-SAME: "-thinlto-split-output-list=[[SAVE_RSP:[^"]+\.thinlto-split\.rsp]]"
+; SAVE-TEMPS: ld.lld{{.*}}-r{{.*}}-o{{.*}}@[[SAVE_RSP]]
+
+; Verify ordinary -save-temps without split still uses -S (not collapsed)
+; RUN: %clang -### -target aarch64-unknown-linux-gnu \
+; RUN:   -save-temps -c -fthinlto-index=%t.o.thinlto.bc -x ir %t.o -o %t.save_nosplit.o \
+; RUN:   -mllvm -thinlto-split=false 2>&1 | FileCheck %s --check-prefix=SAVE-TEMPS-NOSPLIT
+
+; SAVE-TEMPS-NOSPLIT: "-cc1"
+; SAVE-TEMPS-NOSPLIT-SAME: "-S"
+; SAVE-TEMPS-NOSPLIT-NOT: thinlto-split-output-list
+
+; --- Step 5: Verify merged output.o is valid ---
+; Use clang_cc1 + ld.lld directly to verify the merge produces a valid ELF.
+; (We do not execute the %clang Driver here because it must locate ld.lld
+;  itself, which lit test environments may not guarantee. Instead we merge the
+;  partition objects by hand; note this step still needs ld.lld on lit's PATH.)
+
+; RUN: ld.lld -r -o %t.merged.o %t.cc1.o.thinlto-split.0.o %t.cc1.o.thinlto-split.1.o
+
+; merged.o must be a valid ELF relocatable object
+; RUN: llvm-readobj -h %t.merged.o | FileCheck %s --check-prefix=MERGED-ELF
+; MERGED-ELF: Type: Relocatable
+
+; merged.o must contain .init_array section (from global constructor)
+; RUN: llvm-readobj -S %t.merged.o | FileCheck %s --check-prefix=MERGED-SECTIONS
+; MERGED-SECTIONS: Name: .init_array
+; MERGED-SECTIONS: Name: .group
+
+; merged.o must contain symbols from BOTH partitions
+; RUN: llvm-nm %t.merged.o | FileCheck %s --check-prefix=MERGED-NM
+; MERGED-NM-DAG: T func_a
+; MERGED-NM-DAG: T func_b
+; MERGED-NM-DAG: T func_c
+; MERGED-NM-DAG: T func_d
+; MERGED-NM-DAG: T func_e
+; MERGED-NM-DAG: W weak_func
+; MERGED-NM-DAG: {{D|B}} g_global
+; MERGED-NM-DAG: {{D|B}} g_ctor_data
+; MERGED-NM-DAG: {{V|v}} comdat_var
+; Internal symbols promoted across partitions
+; MERGED-NM-DAG: T {{.*shared_helper[._]}}
+; MERGED-NM-DAG: T {{.*ctor_init[._]}}
+
+; --- Step 6: Verify non-split path produces correct output ---
+; RUN: %clang_cc1 -triple aarch64-unknown-linux-gnu \
+; RUN:   -emit-obj -fthinlto-index=%t.o.thinlto.bc \
+; RUN:   -o %t.nosplit.o -x ir %t.o
+
+; RUN: llvm-readobj -h %t.nosplit.o | FileCheck %s --check-prefix=NOSPLIT-ELF
+; NOSPLIT-ELF: Type: Relocatable
+
+; RUN: llvm-nm %t.nosplit.o | FileCheck %s --check-prefix=NOSPLIT-NM
+; NOSPLIT-NM-DAG: T func_a
+; NOSPLIT-NM-DAG: T func_b
+; NOSPLIT-NM-DAG: t shared_helper
+; NOSPLIT-NM-DAG: t ctor_init
+
+; Verify no partition objects leaked in non-split path
+; RUN: not ls %t.nosplit.o.thinlto-split.0.o 2>/dev/null
+; RUN: not ls %t.nosplit.o.thinlto-split.1.o 2>/dev/null
+
+; --- IR source module ---
+; Realistic module with features that stress the split+merge path:
+; - Internal function shared_helper referenced by multiple roots (promoted across partitions)
+; - Global constructor (init_array) with ctor_init in .text.startup
+; - Comdat group with weak_odr variable
+; - Weak function
+; - Multiple root functions to force 2+ partitions
+
+target triple = "aarch64-unknown-linux-gnu"
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+
+@g_global = global i32 42
+@g_ctor_data = global i32 0
+
+@llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [
+  { i32, ptr, ptr } { i32 65535, ptr @ctor_init, ptr @g_ctor_data }
+]
+
+$comdat_grp = comdat any
+@comdat_var = weak_odr global i32 10, comdat($comdat_grp)
+
+define internal void @shared_helper() {
+entry:
+  store volatile i32 1, ptr @g_global, align 4
+  ret void
+}
+
+define weak void @weak_func() {
+entry:
+  ret void
+}
+
+define internal void @ctor_init() section ".text.startup" {
+entry:
+  store i32 100, ptr @g_ctor_data, align 4
+  ret void
+}
+
+define void @func_a() {
+entry:
+  call void @shared_helper()
+  call void @weak_func()
+  %val = load i32, ptr @comdat_var, align 4
+  %sum = add i32 %val, 1
+  store i32 %sum, ptr @g_global, align 4
+  ret void
+}
+
+define void @func_b() {
+entry:
+  call void @shared_helper()
+  store volatile i32 2, ptr @g_global, align 4
+  ret void
+}
+
+define void @func_c() {
+entry:
+  call void @shared_helper()
+  ret void
+}
+
+define void @func_d() {
+entry:
+  %v = load i32, ptr @g_global, align 4
+  %r = add i32 %v, 10
+  store i32 %r, ptr @g_global, align 4
+  call void @shared_helper()
+  ret void
+}
+
+define void @func_e() {
+entry:
+  call void @weak_func()
+  ret void
+}
\ No newline at end of file
diff --git a/clang/test/Driver/thinlto-split-merge.c b/clang/test/Driver/thinlto-split-merge.c
new file mode 100644
index 0000000000000..eb062754729ee
--- /dev/null
+++ b/clang/test/Driver/thinlto-split-merge.c
@@ -0,0 +1,64 @@
+// RUN: %clang -### -target aarch64-unknown-linux-gnu -B%S/Inputs/lld \
+// RUN:   -c -fthinlto-index=foo.thinlto.bc -x ir %s -o foo.o \
+// RUN:   -mllvm -thinlto-split=true \
+// RUN:   -mllvm -thinlto-split-partitions=2 2>&1 | FileCheck %s --check-prefix=MERGE
+// RUN: %clang -### -target aarch64-unknown-linux-gnu -B%S/Inputs/lld \
+// RUN:   -c -fthinlto-index=foo.thinlto.bc -x ir %s -o foo.o \
+// RUN:   -mllvm -thinlto-split=true 2>&1 | FileCheck %s --check-prefix=LLD
+// RUN: %clang -### -target aarch64-unknown-linux-gnu -B%S/Inputs/lld \
+// RUN:   -c -fthinlto-index=foo.thinlto.bc -x ir %s -o foo.o \
+// RUN:   -mllvm -thinlto-split=false 2>&1 | FileCheck %s --check-prefix=NOSPLIT
+// RUN: %clang -### -target aarch64-unknown-linux-gnu -B%S/Inputs/lld \
+// RUN:   -save-temps -c -fthinlto-index=foo.thinlto.bc -x ir %s -o foo.o \
+// RUN:   -mllvm -thinlto-split=true 2>&1 | FileCheck %s --check-prefix=SAVE-TEMPS
+// RUN: rm -rf %t.empty
+// RUN: mkdir -p %t.empty
+// RUN: not env PATH= %clang -### -ccc-install-dir %t.empty \
+// RUN:   -target aarch64-unknown-linux-gnu \
+// RUN:   -c -fthinlto-index=foo.thinlto.bc -x ir %s -o foo.o \
+// RUN:   -mllvm -thinlto-split=true 2>&1 | FileCheck %s --check-prefix=MISSING-LLD
+// RUN: %clang -### -target x86_64-unknown-freebsd \
+// RUN:   -c -fthinlto-index=foo.thinlto.bc -x ir %s -o foo.o \
+// RUN:   -mllvm -thinlto-split=true 2>&1 | FileCheck %s --check-prefix=FREEBSD
+// RUN: %clang -### -target x86_64-unknown-fuchsia \
+// RUN:   -c -fthinlto-index=foo.thinlto.bc -x ir %s -o foo.o \
+// RUN:   -mllvm -thinlto-split=true 2>&1 | FileCheck %s --check-prefix=FUCHSIA
+// RUN: %clang -### -target x86_64-none-elf \
+// RUN:   -c -fthinlto-index=foo.thinlto.bc -x ir %s -o foo.o \
+// RUN:   -mllvm -thinlto-split=true 2>&1 | FileCheck %s --check-prefix=BAREMETAL
+
+// MERGE: "-cc1"
+// MERGE-SAME: "-fthinlto-index=foo.thinlto.bc"
+// MERGE-SAME: "-thinlto-split-output-list=[[RSP:[^"]+\.thinlto-split\.rsp]]"
+// MERGE-SAME: "-o" "[[TEMP_O:[^"]+\.o]]"
+// MERGE: "{{.*}}/Inputs/lld/ld.lld" "-r" "-o" "foo.o" "@[[RSP]]"
+
+// LLD: "-cc1"
+// LLD-SAME: "-thinlto-split-output-list=[[LLD_RSP:[^"]+\.thinlto-split\.rsp]]"
+// LLD: "{{.*}}/Inputs/lld/ld.lld" "-r" "-o" "foo.o" "@[[LLD_RSP]]"
+
+// NOSPLIT: "-cc1"
+// NOSPLIT-NOT: thinlto-split-output-list
+// NOSPLIT-NOT: ld.lld
+
+// SAVE-TEMPS: "-cc1"
+// SAVE-TEMPS-SAME: "-emit-obj"
+// SAVE-TEMPS-SAME: "-thinlto-split-output-list=[[SAVE_RSP:[^"]+\.thinlto-split\.rsp]]"
+// SAVE-TEMPS: "{{.*}}/Inputs/lld/ld.lld" "-r" "-o" "foo.o" "@[[SAVE_RSP]]"
+
+// MISSING-LLD: error: cannot find 'ld.lld' required for ThinLTO split codegen
+
+// FREEBSD: "-cc1"
+// FREEBSD-SAME: "-fthinlto-index=foo.thinlto.bc"
+// FREEBSD-NOT: thinlto-split-output-list
+// FREEBSD-NOT: "-r"
+
+// FUCHSIA: "-cc1"
+// FUCHSIA-SAME: "-fthinlto-index=foo.thinlto.bc"
+// FUCHSIA-NOT: thinlto-split-output-list
+// FUCHSIA-NOT: "-r"
+
+// BAREMETAL: "-cc1"
+// BAREMETAL-SAME: "-fthinlto-index=foo.thinlto.bc"
+// BAREMETAL-NOT: thinlto-split-output-list
+// BAREMETAL-NOT: "-r"
diff --git a/lld/ELF/LTO.cpp b/lld/ELF/LTO.cpp
index 27be8859a3f65..a222f01502cd3 100644
--- a/lld/ELF/LTO.cpp
+++ b/lld/ELF/LTO.cpp
@@ -22,6 +22,7 @@
 #include "llvm/DTLTO/DTLTO.h"
 #include "llvm/LTO/Config.h"
 #include "llvm/LTO/LTO.h"
+#include "llvm/LTO/LTOBackend.h"
 #include "llvm/Support/Caching.h"
 #include "llvm/Support/CodeGen.h"
 #include "llvm/Support/MemoryBuffer.h"
@@ -173,6 +174,17 @@ static lto::Config createConfig(Ctx &ctx) {
     checkError(ctx.e, c.addSaveTemps(ctx.arg.outputFile.str() + ".",
                                      /*UseInputModulePath*/ true,
                                      ctx.arg.saveTempsArgs));
+
+  // With ThinLTO split codegen the in-process backend emits one object per
+  // call-graph partition, which lld consumes directly. Opt in to multiple
+  // outputs per task and the expanded task-id layout (T*Stride+p) so partitions
+  // from different modules occupy distinct slots. Index-only and DTLTO do not
+  // run codegen in lld, so they keep the defaults.
+  if (lto::isThinLTOSplitEnabled() && !ctx.arg.thinLTOIndexOnly &&
+      ctx.arg.dtltoDistributor.empty()) {
+    c.AcceptsMultipleOutputsPerTask = true;
+    c.UseExpandedThinLTOSplitTaskIds = true;
+  }
   return c;
 }
 
@@ -382,8 +394,12 @@ SmallVector<std::unique_ptr<InputFile>, 0> BitcodeCompiler::compile() {
 
   if (!ctx.arg.ltoObjPath.empty()) {
     saveBuffer(buf[0].second, ctx.arg.ltoObjPath);
+    // With ThinLTO split codegen the task-id space is sparse (only a few of the
+    // Stride slots per module are used), so skip empty slots instead of writing
+    // a zero-length file for every gap.
     for (unsigned i = 1; i != maxTasks; ++i)
-      saveBuffer(buf[i].second, ctx.arg.ltoObjPath + Twine(i));
+      if (!buf[i].second.empty())
+        saveBuffer(buf[i].second, ctx.arg.ltoObjPath + Twine(i));
   }
 
   bool savePrelink = ctx.arg.saveTempsArgs.contains("prelink");
diff --git a/llvm/include/llvm/LTO/Config.h b/llvm/include/llvm/LTO/Config.h
index 2aeb902bcfccf..8cc72ae5a7ea0 100644
--- a/llvm/include/llvm/LTO/Config.h
+++ b/llvm/include/llvm/LTO/Config.h
@@ -105,6 +105,20 @@ struct Config {
   /// distinguished.
   mutable bool Dtlto = 0;
 
+  /// True if the client can receive multiple native objects per logical
+  /// ThinLTO task (required to consume ThinLTO split codegen; sized via
+  /// LTO::getMaxTasks()). Clients that leave it false hard-error on a split.
+  bool AcceptsMultipleOutputsPerTask = false;
+
+  /// Report ThinLTO split partition `p` of task `T` as id
+  /// `T * ThinLTOSplitTaskIdStride + p` (disjoint ranges per module). Set by
+  /// in-process clients like lld; the distributed backend leaves it false.
+  bool UseExpandedThinLTOSplitTaskIds = false;
+
+  /// Max partition objects per task under UseExpandedThinLTOSplitTaskIds. Zero
+  /// lets LTO::getMaxTasks() pick a value; must be non-zero once backends run.
+  unsigned ThinLTOSplitTaskIdStride = 0;
+
   /// Allows non-imported definitions to get the potentially more constraining
   /// visibility from the prevailing definition. FromPrevailing is the default
   /// because it works for many binary formats. ELF can use the more optimized
@@ -152,6 +166,9 @@ struct Config {
   /// all .dwo files will be written to the same path. Not used in skeleton CU.
   std::string SplitDwarfOutput;
 
+  /// Stem for per-partition ThinLTO split .dwo files.
+  std::string SplitDwarfOutputStem;
+
   /// Optimization remarks file path.
   std::string RemarksFilename;
 
diff --git a/llvm/include/llvm/LTO/LTO.h b/llvm/include/llvm/LTO/LTO.h
index b7f904aebaa66..0b42009e58eb0 100644
--- a/llvm/include/llvm/LTO/LTO.h
+++ b/llvm/include/llvm/LTO/LTO.h
@@ -666,6 +666,11 @@ class LTO {
 
   Error checkPartiallySplit();
 
+  /// ThinLTO split task-id stride (see Config::ThinLTOSplitTaskIdStride);
+  /// cached so getMaxTasks() and runThinLTO() agree. Returns 1 when off.
+  unsigned getThinLTOSplitTaskIdStride() const;
+  mutable std::optional<unsigned> ThinLTOSplitTaskIdStrideCache;
+
   mutable bool CalledGetMaxTasks = false;
 
   // LTO mode when using Unified LTO.
diff --git a/llvm/include/llvm/LTO/LTOBackend.h b/llvm/include/llvm/LTO/LTOBackend.h
index 4bb38529ec754..08551445d5381 100644
--- a/llvm/include/llvm/LTO/LTOBackend.h
+++ b/llvm/include/llvm/LTO/LTOBackend.h
@@ -34,6 +34,14 @@ class Target;
 
 namespace lto {
 
+/// Returns true if ThinLTO split codegen (`-thinlto-split`) is requested, in
+/// which case one logical task may emit several native objects.
+LLVM_ABI bool isThinLTOSplitEnabled();
+
+/// Upper bound on partition objects per task: 1 when split is off; otherwise
+/// the configured `-thinlto-split-partitions`, where 0 means no static bound.
+LLVM_ABI unsigned getThinLTOSplitMaxPartitions();
+
 /// Runs middle-end LTO optimizations on \p Mod.
 LLVM_ABI bool opt(const Config &Conf, TargetMachine *TM, unsigned Task,
                   Module &Mod, bool IsThinLTO,
diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp
index a811cddfb2348..e18ddee1c7efd 100644
--- a/llvm/lib/LTO/LTO.cpp
+++ b/llvm/lib/LTO/LTO.cpp
@@ -63,6 +63,7 @@
 #include "llvm/Transforms/Utils/FunctionImportUtils.h"
 #include "llvm/Transforms/Utils/SplitModule.h"
 
+#include <limits>
 #include <optional>
 #include <set>
 
@@ -108,6 +109,14 @@ void LTO::emitRemark(OptimizationRemark &Remark) {
 static cl::opt<bool>
     DumpThinCGSCCs("dump-thin-cg-sccs", cl::init(false), cl::Hidden,
                    cl::desc("Dump the SCCs in the ThinLTO index's callgraph"));
+
+/// Caps the auto-selected ThinLTO split partitions per module (when
+/// -thinlto-split-partitions is 0), bounding the task-id stride and the
+/// client's getMaxTasks()-sized output table.
+static cl::opt<unsigned> MaxAutoThinLTOSplitPartitions(
+    "max-auto-thinlto-split-partitions", cl::init(32), cl::Hidden,
+    cl::desc("Cap on auto-selected ThinLTO split partitions per module"));
+
 namespace llvm {
 extern cl::opt<bool> CodeGenDataThinLTOTwoRounds;
 extern cl::opt<bool> ForceImportAll;
@@ -1264,11 +1273,51 @@ LTO::addThinLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms,
   return Res;
 }
 
+unsigned LTO::getThinLTOSplitTaskIdStride() const {
+  if (!Conf.UseExpandedThinLTOSplitTaskIds)
+    return 1;
+  if (ThinLTOSplitTaskIdStrideCache)
+    return *ThinLTOSplitTaskIdStrideCache;
+
+  // The stride is a hard per-task budget: splitOptAndCodeGenThin clamps the
+  // splitter to it, so the partition count can never exceed it and the task-id
+  // layout T*Stride+p stays collision-free. Prefer the configured value so
+  // getMaxTasks() (called before run()) and runThinLTO() agree on the stride.
+  unsigned Stride = getThinLTOSplitMaxPartitions();
+  if (Stride == 0) {
+    // No explicit count: size from the largest module's defined-symbol count,
+    // capped so a huge module cannot blow up the task-id space.
+    DenseMap<StringRef, GVSummaryMapTy> ModuleToDefinedGVSummaries(
+        ThinLTO.ModuleMap.size());
+    ThinLTO.CombinedIndex.collectDefinedGVSummariesPerModule(
+        ModuleToDefinedGVSummaries);
+    Stride = 1;
+    for (const auto &Mod : ModuleToDefinedGVSummaries)
+      Stride = std::max<unsigned>(Stride, Mod.second.size());
+    Stride = std::min<unsigned>(Stride, MaxAutoThinLTOSplitPartitions);
+  }
+  ThinLTOSplitTaskIdStrideCache = Stride;
+  return Stride;
+}
+
 unsigned LTO::getMaxTasks() const {
   CalledGetMaxTasks = true;
-  auto ModuleCount = ThinLTO.ModulesToCompile ? ThinLTO.ModulesToCompile->size()
-                                              : ThinLTO.ModuleMap.size();
-  return RegularLTO.ParallelCodeGenParallelismLevel + ModuleCount;
+  uint64_t ModuleCount = ThinLTO.ModulesToCompile
+                             ? ThinLTO.ModulesToCompile->size()
+                             : ThinLTO.ModuleMap.size();
+  // Split codegen reports partition `p` of task `T` as id `T*Stride+p`, whose
+  // max is `(Parallel+ModuleCount)*Stride - 1`; size the table for that. With
+  // Stride == 1 (no expansion) this is the usual `Parallel + ModuleCount`.
+  uint64_t Stride = getThinLTOSplitTaskIdStride();
+  uint64_t MaxTasks =
+      (uint64_t(RegularLTO.ParallelCodeGenParallelismLevel) + ModuleCount) *
+      Stride;
+  // Refuse to truncate the unsigned task id (would under-size the client table
+  // and let a later partition write out of bounds).
+  if (MaxTasks > std::numeric_limits<unsigned>::max())
+    report_fatal_error("ThinLTO split codegen task id space overflow; reduce "
+                       "-thinlto-split-partitions or the number of inputs.");
+  return unsigned(MaxTasks);
 }
 
 // If only some of the modules were split, we cannot correctly handle
@@ -1654,9 +1703,12 @@ class InProcessThinBackend : public CGThinBackend {
 
     if (!Cache.isValid() || !CombinedIndex.modulePaths().count(ModuleID) ||
         all_of(CombinedIndex.getModuleHash(ModuleID),
-               [](uint32_t V) { return V == 0; }))
+               [](uint32_t V) { return V == 0; }) ||
+        Conf.AcceptsMultipleOutputsPerTask)
       // Cache disabled or no entry for this module in the combined index or
-      // no module hash.
+      // no module hash. Also bypass when a task may emit several objects
+      // (ThinLTO split): the cache stores exactly one object per task, so split
+      // partitions would collide on one entry; run uncached instead.
       return RunThinBackend(AddStream);
 
     // The module may be cached, this helps handling it.
@@ -1774,9 +1826,12 @@ class FirstRoundThinBackend : public InProcessThinBackend {
            "Both caches for CG and IR should have matching availability");
     if (!CGCache.isValid() || !CombinedIndex.modulePaths().count(ModuleID) ||
         all_of(CombinedIndex.getModuleHash(ModuleID),
-               [](uint32_t V) { return V == 0; }))
+               [](uint32_t V) { return V == 0; }) ||
+        Conf.AcceptsMultipleOutputsPerTask)
       // Cache disabled or no entry for this module in the combined index or
-      // no module hash.
+      // no module hash. Also bypass the cache for ThinLTO split codegen (one
+      // logical task may emit multiple partition objects; see the comment in
+      // InProcessThinBackend::runThinLTOBackendThread).
       return RunThinBackend(CGAddStream, IRAddStream);
 
     // Get CGKey for caching object in CGCache.
@@ -1860,9 +1915,12 @@ class SecondRoundThinBackend : public InProcessThinBackend {
     };
     if (!Cache.isValid() || !CombinedIndex.modulePaths().count(ModuleID) ||
         all_of(CombinedIndex.getModuleHash(ModuleID),
-               [](uint32_t V) { return V == 0; }))
+               [](uint32_t V) { return V == 0; }) ||
+        Conf.AcceptsMultipleOutputsPerTask)
       // Cache disabled or no entry for this module in the combined index or
-      // no module hash.
+      // no module hash. Also bypass the cache for ThinLTO split codegen (one
+      // logical task may emit multiple partition objects; see the comment in
+      // InProcessThinBackend::runThinLTOBackendThread).
       return RunThinBackend(AddStream);
 
     // Get Key for caching the final object file in Cache with the combined
@@ -2066,6 +2124,11 @@ Error LTO::runThinLTO(AddStreamFn AddStream, FileCache Cache,
     if (!ModuleToDefinedGVSummaries.count(Mod.first))
       ModuleToDefinedGVSummaries.try_emplace(Mod.first);
 
+  // Lock in the (cached) stride from getMaxTasks() so the client's output table
+  // and the task ids the backends produce stay consistent.
+  if (Conf.UseExpandedThinLTOSplitTaskIds && Conf.ThinLTOSplitTaskIdStride == 0)
+    Conf.ThinLTOSplitTaskIdStride = getThinLTOSplitTaskIdStride();
+
   FunctionImporter::ImportListsTy ImportLists(ThinLTO.ModuleMap.size());
   DenseMap<StringRef, FunctionImporter::ExportSetTy> ExportLists(
       ThinLTO.ModuleMap.size());
diff --git a/llvm/lib/LTO/LTOBackend.cpp b/llvm/lib/LTO/LTOBackend.cpp
index ef78f1fa8ac3a..c65187c838bb6 100644
--- a/llvm/lib/LTO/LTOBackend.cpp
+++ b/llvm/lib/LTO/LTOBackend.cpp
@@ -34,10 +34,8 @@
 #include "llvm/Plugins/PassPlugin.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/FileSystem.h"
-#include "llvm/Support/FileUtilities.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/Path.h"
-#include "llvm/Support/Program.h"
 #include "llvm/Support/ThreadPool.h"
 #include "llvm/Support/ToolOutputFile.h"
 #include "llvm/Support/VirtualFileSystem.h"
@@ -49,6 +47,7 @@
 #include "llvm/Transforms/Utils/SplitModule.h"
 #include "llvm/Transforms/Utils/SplitModuleCG.h"
 #include <filesystem>
+#include <limits>
 #include <optional>
 
 using namespace llvm;
@@ -105,6 +104,41 @@ namespace llvm {
 extern cl::opt<bool> NoPGOWarnMismatch;
 }
 
+bool lto::isThinLTOSplitEnabled() { return ThinLTOSplit; }
+
+unsigned lto::getThinLTOSplitMaxPartitions() {
+  if (!ThinLTOSplit)
+    return 1;
+  // ThinLTOSplitPartitions==0 means "decide per module from the call graph",
+  // so there is no static upper bound the client can rely on.
+  return ThinLTOSplitPartitions;
+}
+
+static unsigned getThinLTOOutputTask(const Config &C, unsigned LogicalTask,
+                                     unsigned PartitionId) {
+  if (!C.UseExpandedThinLTOSplitTaskIds)
+    return C.AcceptsMultipleOutputsPerTask ? PartitionId : LogicalTask;
+
+  if (C.ThinLTOSplitTaskIdStride == 0)
+    report_fatal_error(
+        "ThinLTO split codegen expanded task ids require a non-zero stride.");
+  if (PartitionId >= C.ThinLTOSplitTaskIdStride)
+    report_fatal_error(
+        "ThinLTO split codegen produced more partitions than the task id "
+        "stride allows.");
+
+  uint64_t PhysicalTask =
+      uint64_t(LogicalTask) * C.ThinLTOSplitTaskIdStride + PartitionId;
+  if (PhysicalTask > std::numeric_limits<unsigned>::max())
+    report_fatal_error("ThinLTO split codegen task id overflow.");
+  return unsigned(PhysicalTask);
+}
+
+static unsigned getThinLTOSingleOutputTask(const Config &C,
+                                           unsigned LogicalTask) {
+  return getThinLTOOutputTask(C, LogicalTask, /*PartitionId=*/0);
+}
+
 [[noreturn]] static void reportOpenError(StringRef Path, Twine Msg) {
   errs() << "failed to open " << Path << ": " << Msg << '\n';
   errs().flush();
@@ -486,6 +520,17 @@ static void codegen(const Config &Conf, TargetMachine *TM,
     DwoFile = Conf.DwoDir;
     sys::path::append(DwoFile, std::to_string(Task) + ".dwo");
     TM->Options.MCOptions.SplitDwarfFile = std::string(DwoFile);
+  } else if (!Conf.SplitDwarfOutputStem.empty()) {
+    DwoFile = (Twine(Conf.SplitDwarfOutputStem) + ".thinlto-split." +
+               Twine(Task) + ".dwo").str();
+    // Ensure the parent directory exists (same directory as the .o output).
+    SmallString<128> DwoParent(sys::path::parent_path(DwoFile));
+    if (!DwoParent.empty()) {
+      if (auto EC = llvm::sys::fs::create_directories(DwoParent))
+        report_fatal_error(Twine("Failed to create directory ") + DwoParent +
+                           ": " + EC.message());
+    }
+    TM->Options.MCOptions.SplitDwarfFile = std::string(DwoFile);
   } else
     TM->Options.MCOptions.SplitDwarfFile = Conf.SplitDwarfFile;
 
@@ -567,40 +612,6 @@ static bool HasLargeCG(Module &Mod, const ModuleSummaryIndex &CombinedIndex) {
   return true;
 }
 
-struct TaskIdAllocator {
-  using TaskId = unsigned;
-
-  // Use the most significant bit (MSB) as a namespace tag.
-  // - Original ThinLTO backend tasks are expected to have MSB == 0.
-  // - Split partitions allocated by this allocator always have MSB == 1.
-  // This guarantees the two ID spaces never overlap.
-  static constexpr TaskId tag() {
-    return TaskId{1} << (std::numeric_limits<TaskId>::digits - 1);
-  }
-
-  // Monotonic sequence counter for split partitions (MSB must remain 0 here).
-  std::atomic<TaskId> seq{0};
-
-  // Allocate a globally unique TaskId for a split partition.
-  // The returned ID is `tag() | seq`, so it lives in the MSB==1 namespace.
-  TaskId alloc() {
-    TaskId v = seq.fetch_add(1, std::memory_order_relaxed);
-
-    // If the counter ever reaches the MSB, we'd overlap namespaces.
-    // This indicates an overflow / too many partitions.
-    if (v & tag())
-      report_fatal_error("Partition TaskId overflow: seq reached the tag bit.");
-
-    return tag() | v;
-  }
-
-  // Helper for sanity checks / debugging.
-  static bool isPartition(TaskId id) { return (id & tag()) != 0; }
-};
-
-// Global allocator shared by all split partitions.
-static TaskIdAllocator gSplitTaskIds;
-
 static bool splitOptAndCodeGenThin(unsigned task, const Config &C,
                                    TargetMachine *TM, AddStreamFn AddStream,
                                    unsigned ParallelCodeGenParallelismLevel,
@@ -614,37 +625,44 @@ static bool splitOptAndCodeGenThin(unsigned task, const Config &C,
 
   static std::mutex PrintMutex;
 
-  SplitModuleCG SplitModuleCG(Mod, CombinedIndex, ParallelCodeGenParallelismLevel);
-  ParallelCodeGenParallelismLevel = SplitModuleCG.getPartitionNum();
-
-  std::vector<std::string> TempObjectFiles(ParallelCodeGenParallelismLevel);
-  std::vector<llvm::FileRemover> TempFileRemovers(ParallelCodeGenParallelismLevel);
+  // Clamp the splitter to the per-task stride budget that getMaxTasks() used to
+  // size the client's output table, so the partition count can never exceed it.
+  // Needed for the "auto" case (no -thinlto-split-partitions), where the
+  // splitter would otherwise pick one partition per call-graph root.
+  unsigned PartitionLimit = ParallelCodeGenParallelismLevel;
+  if (C.UseExpandedThinLTOSplitTaskIds && C.ThinLTOSplitTaskIdStride != 0 &&
+      (PartitionLimit == 0 || PartitionLimit > C.ThinLTOSplitTaskIdStride))
+    PartitionLimit = C.ThinLTOSplitTaskIdStride;
+
+  SplitModuleCG SplitModuleCG(Mod, CombinedIndex, PartitionLimit);
+  unsigned PartitionCount = SplitModuleCG.getPartitionNum();
+  if (!C.AcceptsMultipleOutputsPerTask && PartitionCount > 1)
+    report_fatal_error(
+        "The current LTO client does not support ThinLTO split codegen.");
+  if (C.UseExpandedThinLTOSplitTaskIds) {
+    if (C.ThinLTOSplitTaskIdStride == 0)
+      report_fatal_error(
+          "ThinLTO split codegen expanded task ids require a non-zero stride.");
+    if (PartitionCount > C.ThinLTOSplitTaskIdStride)
+      report_fatal_error(
+          "ThinLTO split codegen produced more partitions than the task id "
+          "stride allows.");
+  }
+  ParallelCodeGenParallelismLevel = PartitionCount;
 
   const auto HandleModulePartition = [&](std::unique_ptr<Module> MPart,
                                          unsigned PartitionId) {
-    unsigned CurrentThreadId, UniqueTaskId;
+    unsigned CurrentThreadId;
     {
       std::lock_guard<std::mutex> Lock(PrintMutex);
       CurrentThreadId = ThreadCount++;
-
-      // In distributed ThinLTO, `task` may be a sentinel (e.g. -1 cast to
-      // unsigned), which becomes UINT_MAX and naturally has MSB==1. Treat it
-      // as "no base task id" and don't enforce the namespace check on it.
-      //
-      // We do not rely on the incoming `task` for partition uniqueness: split
-      // partitions get a dedicated UniqueTaskId allocated below.
-      if (task != std::numeric_limits<unsigned>::max()) {
-        assert(!TaskIdAllocator::isPartition(task) &&
-               "Original ThinLTO TaskId unexpectedly overlaps the partition "
-               "namespace");
-      }
-      UniqueTaskId = gSplitTaskIds.alloc();
     }
 
+    unsigned PartitionTask = getThinLTOOutputTask(C, task, PartitionId);
     std::unique_ptr<TargetMachine> ThreadTM = createTargetMachine(C, T, *MPart);
 
     if (DoOpt) {
-      if (!opt(C, ThreadTM.get(), UniqueTaskId, *MPart, /*IsThinLTO=*/true,
+      if (!opt(C, ThreadTM.get(), PartitionTask, *MPart, /*IsThinLTO=*/true,
                /*ExportSummary=*/nullptr, /*ImportSummary=*/&CombinedIndex,
                CmdArgs, BitcodeLibFuncs)) {
         report_fatal_error("Failed to gen opt for split mod in thread.");
@@ -655,10 +673,10 @@ static bool splitOptAndCodeGenThin(unsigned task, const Config &C,
       // running `opt()`. We're not reaching here as it's bailed out earlier
       // with `CodeGenOnly` which has been set in `SecondRoundThinBackend`.
       if (IRAddStream)
-        cgdata::saveModuleForTwoRounds(*MPart, task + CurrentThreadId,
+        cgdata::saveModuleForTwoRounds(*MPart, PartitionTask,
                                        IRAddStream);
     }
-    
+
     // Rename the GlobalValues whose internal is changed to external. That's
     // can avoid duplicate symbols.
     auto PromotedRenames = SplitModuleCG.getPromotedRenames();
@@ -669,88 +687,11 @@ static bool splitOptAndCodeGenThin(unsigned task, const Config &C,
       }
     }
 
-    auto splitStream = [&](unsigned task, const Twine &moduleName)
-        -> Expected<std::unique_ptr<CachedFileStream>> {
-      int FD;
-      SmallString<128> TempFilename;
-      if (std::error_code EC = sys::fs::createTemporaryFile(
-              "thinlto-split", "o", FD, TempFilename))
-        return errorCodeToError(EC);
-
-      TempObjectFiles[PartitionId] = std::string(TempFilename.str());
-      TempFileRemovers[PartitionId].setFile(TempObjectFiles[PartitionId]);
-
-      auto OS = std::make_unique<raw_fd_ostream>(
-          FD, true, /*CloseOnDestruct*/true);
-
-      auto Stream = std::make_unique<CachedFileStream>(
-          std::move(OS), std::string(TempFilename.str()));
-
-      return std::move(Stream);
-    };
-
-    codegen(C, ThreadTM.get(), splitStream, UniqueTaskId, *MPart,
-            CombinedIndex);
+    codegen(C, ThreadTM.get(), AddStream, PartitionTask, *MPart, CombinedIndex);
   };
 
   SplitModuleCG.SplitModule(HandleModulePartition, C);
 
-  // Use ld.lld to combine the partitions into a object.
-  if (TempObjectFiles.empty()) {
-    llvm::errs() << "TempObjectFiles.empty()\n";
-    return true;
-  }
-
-  auto FinalStream = AddStream(task, Mod.getModuleIdentifier());
-  if (!FinalStream)
-    report_fatal_error("Failed to open final output stream");
-
-  SmallString<128> MergedFilename;
-  if (sys::fs::createTemporaryFile("thinlto-merged", "o", MergedFilename))
-    report_fatal_error("Failed to create merged temp file.");
-  llvm::FileRemover MergedFileRemover(MergedFilename);
-
-  std::vector<StringRef> Args;
-  std::string LinkerPath = "";
-  if (auto Path = sys::findProgramByName("ld.lld"))
-    LinkerPath = *Path;
-  else if (auto Path = sys::findProgramByName("ld"))
-    LinkerPath = *Path;
-
-  if (LinkerPath.empty())
-    report_fatal_error("Cannot find linkeer (ld or ld.lld) to merge partitions.");
-
-  Args.push_back(LinkerPath);
-  Args.push_back("-r");
-  Args.push_back("-o");
-  Args.push_back(MergedFilename);
-
-  for (const auto &File : TempObjectFiles)
-    Args.push_back(File);
-
-  std::string ErrMsg;
-  int Result = sys::ExecuteAndWait(LinkerPath, Args, /*Env=*/std::nullopt,
-                                   /*Redirects=*/{}, /*SecondsToWait=*/0,
-                                   /*MemoryLimit=*/0, &ErrMsg);
-
-  if (Result != 0) {
-    errs() << "Linker failed: " << ErrMsg << "\n";
-    report_fatal_error("Failed to merge split objects.");
-  }
-
-  {
-    std::unique_ptr<CachedFileStream> &FinalFileStream = *FinalStream;
-    auto BufferOrErr = MemoryBuffer::getFile(MergedFilename);
-    if (!BufferOrErr)
-      report_fatal_error("Failed to read merged object.");
-
-    FinalFileStream->OS->write(BufferOrErr.get()->getBufferStart(),
-                               BufferOrErr.get()->getBufferSize());
-    if (Error Err = FinalFileStream->commit()) {
-      report_fatal_error(Twine("Failed to commit final file stream: ") +
-                         toString(std::move(Err)));
-    }
-  }
   return true;
 }
 
@@ -939,7 +880,8 @@ Error lto::thinBackend(const Config &Conf, unsigned Task, AddStreamFn AddStream,
     else
       // If CodeGenOnly is set, we only perform code generation and skip
       // optimization. This value may differ from Conf.CodeGenOnly.
-      codegen(Conf, TM.get(), AddStream, Task, Mod, CombinedIndex);
+      codegen(Conf, TM.get(), AddStream, getThinLTOSingleOutputTask(Conf, Task),
+              Mod, CombinedIndex);
     return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile));
   }
 
@@ -968,7 +910,8 @@ Error lto::thinBackend(const Config &Conf, unsigned Task, AddStreamFn AddStream,
           if (IRAddStream)
             cgdata::saveModuleForTwoRounds(Mod, Task, IRAddStream);
 
-          codegen(Conf, TM, AddStream, Task, Mod, CombinedIndex);
+          codegen(Conf, TM, AddStream, getThinLTOSingleOutputTask(Conf, Task),
+                  Mod, CombinedIndex);
         }
         return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile));
       };
diff --git a/llvm/lib/Transforms/Utils/SplitModuleCG.cpp b/llvm/lib/Transforms/Utils/SplitModuleCG.cpp
index debdddfb79041..ae345924e4074 100644
--- a/llvm/lib/Transforms/Utils/SplitModuleCG.cpp
+++ b/llvm/lib/Transforms/Utils/SplitModuleCG.cpp
@@ -32,6 +32,11 @@ static void externalize(GlobalValue *GV) {
     GV->setName("__llvmsplit_unnamed");
 }
 
+static void dealWithDeclareDebugInfo(Module &MPart) {
+  for (Function &F : MPart)
+    if (F.isDeclaration())
+      F.setSubprogram(nullptr);
+}
 } // namespace
 
 std::vector<DenseSet<const Function *>> SplitModuleCG::doPartitioning() {
@@ -109,6 +114,7 @@ void SplitModuleCG::calculateFunctionCosts() {
 
 void SplitModuleCG::dealWithMpart(Module &MPart, unsigned I,
                                   function_ref<bool(const GlobalValue *)> NeedsConservativeImport) {
+  dealWithDeclareDebugInfo(MPart);
   // collect symbols to rename
   auto checkPromoted = [&](const GlobalValue &GV) {
     // now is external (not local), but not in external set.

>From b155e8b5fa9b687a2765a0179bfe6a09424f631c Mon Sep 17 00:00:00 2001
From: maojiaping <maojiaping1 at huawei.com>
Date: Thu, 11 Jun 2026 10:15:49 +0800
Subject: [PATCH 6/7] [LTO][SplitModuleCG] Enable split module by callgraph for
 FullLTO

- Rename ThinLTOSplit to LTOSplitByCG for clarity
- Add IsThinLTO parameter to splitOptAndCodeGenThin with default true
- Enable splitOptAndCodeGenThin for FullLTO via else if branch
- Adapt partition task calculation for FullLTO non-ThinLTO mode
---
 clang/lib/Driver/ToolChains/CommonArgs.cpp    |  4 +-
 .../split-module-by-cg-fulllto.c              | 27 ++++++++
 .../thinlto-split/split-output-list-dwo.ll    |  6 +-
 .../thinlto-split/split-output-list.ll        |  4 +-
 .../thinlto-split/split-promoted-rename.ll    |  2 +-
 .../thinlto-split-merge-realistic.ll          | 10 +--
 clang/test/Driver/thinlto-split-merge.c       | 16 ++---
 llvm/lib/LTO/LTOBackend.cpp                   | 67 ++++++++++---------
 8 files changed, 82 insertions(+), 54 deletions(-)
 create mode 100644 clang/test/CodeGen/thinlto-split/split-module-by-cg-fulllto.c

diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp
index e52d8212fd496..23177ba29b734 100644
--- a/clang/lib/Driver/ToolChains/CommonArgs.cpp
+++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp
@@ -946,9 +946,9 @@ bool tools::isThinLTOSplitEnabled(const ArgList &Args) {
   for (const Arg *A : Args.filtered(options::OPT_mllvm)) {
     for (size_t I = 0, E = A->getNumValues(); I != E; ++I) {
       StringRef V = A->getValue(I);
-      if (V == "-thinlto-split" || V == "-thinlto-split=true")
+      if (V == "-lto-split-by-callgraph" || V == "-lto-split-by-callgraph=true")
         Enabled = true;
-      else if (V == "-thinlto-split=false")
+      else if (V == "-lto-split-by-callgraph=false")
         Enabled = false;
     }
   }
diff --git a/clang/test/CodeGen/thinlto-split/split-module-by-cg-fulllto.c b/clang/test/CodeGen/thinlto-split/split-module-by-cg-fulllto.c
new file mode 100644
index 0000000000000..c24b0ddbeac24
--- /dev/null
+++ b/clang/test/CodeGen/thinlto-split/split-module-by-cg-fulllto.c
@@ -0,0 +1,27 @@
+// REQUIRES: aarch64-registered-target
+// Test that FullLTO with callgraph-based module splitting generates
+// multiple partitions with consistent symbol renaming.
+
+// RUN: %clang -flto=full -fuse-ld=lld -shared \
+// RUN:   -o %t.o %s \
+// RUN:   -Wl,-mllvm,-lto-split-by-callgraph=true \
+// RUN:   -Wl,--lto-partitions=2 \
+// RUN:   -Wl,--save-temps=prelink
+// RUN: llvm-nm %t.o.lto.o | FileCheck %s --check-prefix=CHECK0
+// RUN: llvm-nm %t.o.lto.1.o | FileCheck %s --check-prefix=CHECK1
+
+// CHECK0-DAG: T caller_b
+// CHECK0-DAG: T promoted_internal
+
+// CHECK1-DAG: T caller_a
+// CHECK1-DAG: U promoted_internal
+
+static void promoted_internal(void) {}
+
+void caller_a(void) {
+    promoted_internal();
+}
+
+void caller_b(void) {
+    promoted_internal();
+}
\ No newline at end of file
diff --git a/clang/test/CodeGen/thinlto-split/split-output-list-dwo.ll b/clang/test/CodeGen/thinlto-split/split-output-list-dwo.ll
index e3064684e542a..2af7ceb72bbca 100644
--- a/clang/test/CodeGen/thinlto-split/split-output-list-dwo.ll
+++ b/clang/test/CodeGen/thinlto-split/split-output-list-dwo.ll
@@ -30,7 +30,7 @@
 ; RUN:   -split-dwarf-output %t.split.o.dwo \
 ; RUN:   -o %t.split.o -x ir %t.o \
 ; RUN:   -debug-info-kind=constructor -dwarf-version=5 \
-; RUN:   -mllvm -thinlto-split=true \
+; RUN:   -mllvm -lto-split-by-callgraph=true \
 ; RUN:   -mllvm -thinlto-split-partitions=2 \
 ; RUN:   -mllvm -thinlto-split-module-size-threshold=0 \
 ; RUN:   -mllvm -thinlto-split-module-size-rate-threshold=2.0
@@ -102,7 +102,7 @@
 ; RUN:   -split-dwarf-file %t.single.o.dwo \
 ; RUN:   -o %t.single.o -x ir %t.o \
 ; RUN:   -debug-info-kind=constructor -dwarf-version=5 \
-; RUN:   -mllvm -thinlto-split=true \
+; RUN:   -mllvm -lto-split-by-callgraph=true \
 ; RUN:   -mllvm -thinlto-split-partitions=2 \
 ; RUN:   -mllvm -thinlto-split-module-size-threshold=0 \
 ; RUN:   -mllvm -thinlto-split-module-size-rate-threshold=2.0
@@ -127,7 +127,7 @@
 ; RUN:   -split-dwarf-output %t.nosplit.o.dwo \
 ; RUN:   -o %t.nosplit.o -x ir %t.o \
 ; RUN:   -debug-info-kind=constructor -dwarf-version=5 \
-; RUN:   -mllvm -thinlto-split=false
+; RUN:   -mllvm -lto-split-by-callgraph=false
 
 ; RUN: ls %t.nosplit.o.dwo
 ; RUN: not ls %t.nosplit.o.thinlto-split.0.dwo 2>/dev/null
diff --git a/clang/test/CodeGen/thinlto-split/split-output-list.ll b/clang/test/CodeGen/thinlto-split/split-output-list.ll
index 88abd8dd87019..a9bbe074ef345 100644
--- a/clang/test/CodeGen/thinlto-split/split-output-list.ll
+++ b/clang/test/CodeGen/thinlto-split/split-output-list.ll
@@ -9,7 +9,7 @@
 ; RUN: %clang_cc1 -triple aarch64-unknown-linux-gnu \
 ; RUN:   -emit-obj -fthinlto-index=%t.o.thinlto.bc \
 ; RUN:   -o %t.split.o -x ir %t.o \
-; RUN:   -mllvm -thinlto-split=true \
+; RUN:   -mllvm -lto-split-by-callgraph=true \
 ; RUN:   -mllvm -thinlto-split-partitions=2 \
 ; RUN:   -mllvm -thinlto-split-module-size-threshold=0 \
 ; RUN:   -mllvm -thinlto-split-module-size-rate-threshold=2.0 \
@@ -21,7 +21,7 @@
 ; RUN: %clang_cc1 -triple aarch64-unknown-linux-gnu \
 ; RUN:   -emit-obj -fthinlto-index=%t.o.thinlto.bc \
 ; RUN:   -o %t.skip.o -x ir %t.o \
-; RUN:   -mllvm -thinlto-split=true \
+; RUN:   -mllvm -lto-split-by-callgraph=true \
 ; RUN:   -mllvm -thinlto-split-partitions=2 \
 ; RUN:   -thinlto-split-output-list=%t.skip.rsp
 ; RUN: FileCheck %s --check-prefix=SKIP-RSP --input-file=%t.skip.rsp
diff --git a/clang/test/CodeGen/thinlto-split/split-promoted-rename.ll b/clang/test/CodeGen/thinlto-split/split-promoted-rename.ll
index 2cf3aa41a9c75..19274c7c7ee72 100644
--- a/clang/test/CodeGen/thinlto-split/split-promoted-rename.ll
+++ b/clang/test/CodeGen/thinlto-split/split-promoted-rename.ll
@@ -11,7 +11,7 @@
 ; RUN: %clang_cc1 -triple aarch64-unknown-linux-gnu \
 ; RUN:   -emit-obj -fthinlto-index=%t.o.thinlto.bc \
 ; RUN:   -o %t.split.o -x ir %t.o \
-; RUN:   -mllvm -thinlto-split=true \
+; RUN:   -mllvm -lto-split-by-callgraph=true \
 ; RUN:   -mllvm -thinlto-split-partitions=1 \
 ; RUN:   -mllvm -thinlto-split-module-size-threshold=0 \
 ; RUN:   -thinlto-split-output-list=%t.split.rsp
diff --git a/clang/test/CodeGen/thinlto-split/thinlto-split-merge-realistic.ll b/clang/test/CodeGen/thinlto-split/thinlto-split-merge-realistic.ll
index c024e4c37f2fb..94c0beee69e23 100644
--- a/clang/test/CodeGen/thinlto-split/thinlto-split-merge-realistic.ll
+++ b/clang/test/CodeGen/thinlto-split/thinlto-split-merge-realistic.ll
@@ -33,7 +33,7 @@
 ; RUN:   -emit-obj -fthinlto-index=%t.o.thinlto.bc \
 ; RUN:   -thinlto-split-output-list=%t.cc1.rsp \
 ; RUN:   -o %t.cc1.o -x ir %t.o \
-; RUN:   -mllvm -thinlto-split=true \
+; RUN:   -mllvm -lto-split-by-callgraph=true \
 ; RUN:   -mllvm -thinlto-split-partitions=2 \
 ; RUN:   -mllvm -thinlto-split-module-size-threshold=0 \
 ; RUN:   -mllvm -thinlto-split-module-size-rate-threshold=2.0
@@ -76,7 +76,7 @@
 ; RUN: %clang -### -target aarch64-unknown-linux-gnu \
 ; RUN:   -B%S/Inputs/lld \
 ; RUN:   -c -fthinlto-index=%t.o.thinlto.bc -x ir %t.o -o %t.driver.o \
-; RUN:   -mllvm -thinlto-split=true \
+; RUN:   -mllvm -lto-split-by-callgraph=true \
 ; RUN:   -mllvm -thinlto-split-partitions=2 \
 ; RUN:   -mllvm -thinlto-split-module-size-threshold=0 \
 ; RUN:   -mllvm -thinlto-split-module-size-rate-threshold=2.0 2>&1 | FileCheck %s --check-prefix=DRIVER
@@ -90,7 +90,7 @@
 ; RUN: %clang -### -target aarch64-unknown-linux-gnu \
 ; RUN:   -B%S/Inputs/lld \
 ; RUN:   -c -fthinlto-index=%t.o.thinlto.bc -x ir %t.o -o %t.nosplit_driver.o \
-; RUN:   -mllvm -thinlto-split=false 2>&1 | FileCheck %s --check-prefix=NOSPLIT-DRIVER
+; RUN:   -mllvm -lto-split-by-callgraph=false 2>&1 | FileCheck %s --check-prefix=NOSPLIT-DRIVER
 
 ; NOSPLIT-DRIVER: "-cc1"
 ; NOSPLIT-DRIVER-NOT: thinlto-split-output-list
@@ -104,7 +104,7 @@
 ; RUN: %clang -### -target aarch64-unknown-linux-gnu \
 ; RUN:   -B%S/Inputs/lld \
 ; RUN:   -save-temps -c -fthinlto-index=%t.o.thinlto.bc -x ir %t.o -o %t.save.o \
-; RUN:   -mllvm -thinlto-split=true \
+; RUN:   -mllvm -lto-split-by-callgraph=true \
 ; RUN:   -mllvm -thinlto-split-partitions=2 \
 ; RUN:   -mllvm -thinlto-split-module-size-threshold=0 \
 ; RUN:   -mllvm -thinlto-split-module-size-rate-threshold=2.0 2>&1 | FileCheck %s --check-prefix=SAVE-TEMPS
@@ -118,7 +118,7 @@
 ; Verify ordinary -save-temps without split still uses -S (not collapsed)
 ; RUN: %clang -### -target aarch64-unknown-linux-gnu \
 ; RUN:   -save-temps -c -fthinlto-index=%t.o.thinlto.bc -x ir %t.o -o %t.save_nosplit.o \
-; RUN:   -mllvm -thinlto-split=false 2>&1 | FileCheck %s --check-prefix=SAVE-TEMPS-NOSPLIT
+; RUN:   -mllvm -lto-split-by-callgraph=false 2>&1 | FileCheck %s --check-prefix=SAVE-TEMPS-NOSPLIT
 
 ; SAVE-TEMPS-NOSPLIT: "-cc1"
 ; SAVE-TEMPS-NOSPLIT-SAME: "-S"
diff --git a/clang/test/Driver/thinlto-split-merge.c b/clang/test/Driver/thinlto-split-merge.c
index eb062754729ee..68a0b292964c6 100644
--- a/clang/test/Driver/thinlto-split-merge.c
+++ b/clang/test/Driver/thinlto-split-merge.c
@@ -1,31 +1,31 @@
 // RUN: %clang -### -target aarch64-unknown-linux-gnu -B%S/Inputs/lld \
 // RUN:   -c -fthinlto-index=foo.thinlto.bc -x ir %s -o foo.o \
-// RUN:   -mllvm -thinlto-split=true \
+// RUN:   -mllvm -lto-split-by-callgraph=true \
 // RUN:   -mllvm -thinlto-split-partitions=2 2>&1 | FileCheck %s --check-prefix=MERGE
 // RUN: %clang -### -target aarch64-unknown-linux-gnu -B%S/Inputs/lld \
 // RUN:   -c -fthinlto-index=foo.thinlto.bc -x ir %s -o foo.o \
-// RUN:   -mllvm -thinlto-split=true 2>&1 | FileCheck %s --check-prefix=LLD
+// RUN:   -mllvm -lto-split-by-callgraph=true 2>&1 | FileCheck %s --check-prefix=LLD
 // RUN: %clang -### -target aarch64-unknown-linux-gnu -B%S/Inputs/lld \
 // RUN:   -c -fthinlto-index=foo.thinlto.bc -x ir %s -o foo.o \
-// RUN:   -mllvm -thinlto-split=false 2>&1 | FileCheck %s --check-prefix=NOSPLIT
+// RUN:   -mllvm -lto-split-by-callgraph=false 2>&1 | FileCheck %s --check-prefix=NOSPLIT
 // RUN: %clang -### -target aarch64-unknown-linux-gnu -B%S/Inputs/lld \
 // RUN:   -save-temps -c -fthinlto-index=foo.thinlto.bc -x ir %s -o foo.o \
-// RUN:   -mllvm -thinlto-split=true 2>&1 | FileCheck %s --check-prefix=SAVE-TEMPS
+// RUN:   -mllvm -lto-split-by-callgraph=true 2>&1 | FileCheck %s --check-prefix=SAVE-TEMPS
 // RUN: rm -rf %t.empty
 // RUN: mkdir -p %t.empty
 // RUN: not env PATH= %clang -### -ccc-install-dir %t.empty \
 // RUN:   -target aarch64-unknown-linux-gnu \
 // RUN:   -c -fthinlto-index=foo.thinlto.bc -x ir %s -o foo.o \
-// RUN:   -mllvm -thinlto-split=true 2>&1 | FileCheck %s --check-prefix=MISSING-LLD
+// RUN:   -mllvm -lto-split-by-callgraph=true 2>&1 | FileCheck %s --check-prefix=MISSING-LLD
 // RUN: %clang -### -target x86_64-unknown-freebsd \
 // RUN:   -c -fthinlto-index=foo.thinlto.bc -x ir %s -o foo.o \
-// RUN:   -mllvm -thinlto-split=true 2>&1 | FileCheck %s --check-prefix=FREEBSD
+// RUN:   -mllvm -lto-split-by-callgraph=true 2>&1 | FileCheck %s --check-prefix=FREEBSD
 // RUN: %clang -### -target x86_64-unknown-fuchsia \
 // RUN:   -c -fthinlto-index=foo.thinlto.bc -x ir %s -o foo.o \
-// RUN:   -mllvm -thinlto-split=true 2>&1 | FileCheck %s --check-prefix=FUCHSIA
+// RUN:   -mllvm -lto-split-by-callgraph=true 2>&1 | FileCheck %s --check-prefix=FUCHSIA
 // RUN: %clang -### -target x86_64-none-elf \
 // RUN:   -c -fthinlto-index=foo.thinlto.bc -x ir %s -o foo.o \
-// RUN:   -mllvm -thinlto-split=true 2>&1 | FileCheck %s --check-prefix=BAREMETAL
+// RUN:   -mllvm -lto-split-by-callgraph=true 2>&1 | FileCheck %s --check-prefix=BAREMETAL
 
 // MERGE: "-cc1"
 // MERGE-SAME: "-fthinlto-index=foo.thinlto.bc"
diff --git a/llvm/lib/LTO/LTOBackend.cpp b/llvm/lib/LTO/LTOBackend.cpp
index c65187c838bb6..71ea0c5125755 100644
--- a/llvm/lib/LTO/LTOBackend.cpp
+++ b/llvm/lib/LTO/LTOBackend.cpp
@@ -93,25 +93,25 @@ static cl::opt<float> ThinLTOSplitModuleSizeRateThreshold(
     cl::desc("Whether to split in thinlto backend based on the ratio of "
              "(callgraph size)/(module size)"));
 
-static cl::opt<unsigned> ThinLTOSplitPartitions(
+static cl::opt<unsigned> LTOSplitPartitions(
     "thinlto-split-partitions", cl::Hidden, cl::init(0),
     cl::desc("Control split to how many partitions in thinlto backend."));
 
-static cl::opt<bool> ThinLTOSplit("thinlto-split", cl::init(false),
+static cl::opt<bool> LTOSplitByCG("lto-split-by-callgraph", cl::init(false),
 			   cl::desc("Enable split module in thinlto backend."));
 
 namespace llvm {
 extern cl::opt<bool> NoPGOWarnMismatch;
 }
 
-bool lto::isThinLTOSplitEnabled() { return ThinLTOSplit; }
+bool lto::isThinLTOSplitEnabled() { return LTOSplitByCG; }
 
 unsigned lto::getThinLTOSplitMaxPartitions() {
-  if (!ThinLTOSplit)
+  if (!LTOSplitByCG)
     return 1;
-  // ThinLTOSplitPartitions==0 means "decide per module from the call graph",
+  // LTOSplitPartitions==0 means "decide per module from the call graph",
   // so there is no static upper bound the client can rely on.
-  return ThinLTOSplitPartitions;
+  return LTOSplitPartitions;
 }
 
 static unsigned getThinLTOOutputTask(const Config &C, unsigned LogicalTask,
@@ -190,7 +190,7 @@ Error Config::addSaveTemps(std::string OutputFileName, bool UseInputModulePath,
       // named from the provided OutputFileName with the Task ID appended.
       if (M.getModuleIdentifier() == "ld-temp.o" || !UseInputModulePath) {
         PathPrefix = OutputFileName;
-        if (ThinLTOSplit)
+        if (LTOSplitByCG)
           PathPrefix += extract_filename(M.getSourceFileName()) + ".";
         if (Task != (unsigned)-1)
           PathPrefix += utostr(Task) + ".";
@@ -619,27 +619,26 @@ static bool splitOptAndCodeGenThin(unsigned task, const Config &C,
                                    const ModuleSummaryIndex &CombinedIndex,
                                    const std::vector<uint8_t> &CmdArgs,
                                    bool DoOpt, AddStreamFn IRAddStream,
-                                   ArrayRef<StringRef> &BitcodeLibFuncs) {
+                                   ArrayRef<StringRef> &BitcodeLibFuncs,
+                                   bool IsThinLTO = true) {
   unsigned ThreadCount = 0;
   const Target *T = &TM->getTarget();
-
-  static std::mutex PrintMutex;
-
   // Clamp the splitter to the per-task stride budget that getMaxTasks() used to
   // size the client's output table, so the partition count can never exceed it.
   // Needed for the "auto" case (no -thinlto-split-partitions), where the
   // splitter would otherwise pick one partition per call-graph root.
   unsigned PartitionLimit = ParallelCodeGenParallelismLevel;
-  if (C.UseExpandedThinLTOSplitTaskIds && C.ThinLTOSplitTaskIdStride != 0 &&
+  if (IsThinLTO &&
+      C.UseExpandedThinLTOSplitTaskIds && C.ThinLTOSplitTaskIdStride != 0 &&
       (PartitionLimit == 0 || PartitionLimit > C.ThinLTOSplitTaskIdStride))
     PartitionLimit = C.ThinLTOSplitTaskIdStride;
 
   SplitModuleCG SplitModuleCG(Mod, CombinedIndex, PartitionLimit);
   unsigned PartitionCount = SplitModuleCG.getPartitionNum();
-  if (!C.AcceptsMultipleOutputsPerTask && PartitionCount > 1)
+  if (IsThinLTO && !C.AcceptsMultipleOutputsPerTask && PartitionCount > 1)
     report_fatal_error(
         "The current LTO client does not support ThinLTO split codegen.");
-  if (C.UseExpandedThinLTOSplitTaskIds) {
+  if (IsThinLTO && C.UseExpandedThinLTOSplitTaskIds) {
     if (C.ThinLTOSplitTaskIdStride == 0)
       report_fatal_error(
           "ThinLTO split codegen expanded task ids require a non-zero stride.");
@@ -652,13 +651,8 @@ static bool splitOptAndCodeGenThin(unsigned task, const Config &C,
 
   const auto HandleModulePartition = [&](std::unique_ptr<Module> MPart,
                                          unsigned PartitionId) {
-    unsigned CurrentThreadId;
-    {
-      std::lock_guard<std::mutex> Lock(PrintMutex);
-      CurrentThreadId = ThreadCount++;
-    }
-
-    unsigned PartitionTask = getThinLTOOutputTask(C, task, PartitionId);
+    unsigned PartitionTask = IsThinLTO ?
+        getThinLTOOutputTask(C, task, PartitionId) : PartitionId;
     std::unique_ptr<TargetMachine> ThreadTM = createTargetMachine(C, T, *MPart);
 
     if (DoOpt) {
@@ -677,13 +671,15 @@ static bool splitOptAndCodeGenThin(unsigned task, const Config &C,
                                        IRAddStream);
     }
 
-    // Rename the GlobalValues whose internal is changed to external. That's
-    // can avoid duplicate symbols.
-    auto PromotedRenames = SplitModuleCG.getPromotedRenames();
-    for (auto &GV : MPart->global_values()) {
-      if (auto It = PromotedRenames.find(GV.getName());
-          It != PromotedRenames.end()) {
-        GV.setName(It->second);
+    if (IsThinLTO) {
+      // Rename the GlobalValues whose linkage was changed from internal to
+      // external. This avoids duplicate symbols in ThinLTO.
+      auto PromotedRenames = SplitModuleCG.getPromotedRenames();
+      for (auto &GV : MPart->global_values()) {
+        if (auto It = PromotedRenames.find(GV.getName());
+            It != PromotedRenames.end()) {
+          GV.setName(It->second);
+        }
       }
     }
 
@@ -796,6 +792,11 @@ Error lto::backend(const Config &C, AddStreamFn AddStream,
 
   if (ParallelCodeGenParallelismLevel == 1) {
     codegen(C, TM.get(), AddStream, 0, Mod, CombinedIndex);
+  } else if (LTOSplitByCG) {
+    splitOptAndCodeGenThin(/*Task*/0, C, TM.get(), AddStream,
+                           ParallelCodeGenParallelismLevel, Mod, CombinedIndex,
+                           /*CmdArgs*/ std::vector<uint8_t>(), /*DoOpt*/false,
+                            AddStreamFn(), BitcodeLibFuncs, false);
   } else {
     splitCodeGen(C, TM.get(), AddStream, ParallelCodeGenParallelismLevel, Mod,
                  CombinedIndex);
@@ -860,7 +861,7 @@ Error lto::thinBackend(const Config &Conf, unsigned Task, AddStreamFn AddStream,
   Mod.setPartialSampleProfileRatio(CombinedIndex);
 
   bool ProfitableToSplit = true;
-  if (ThinLTOSplit) {
+  if (LTOSplitByCG) {
     if (!canDoSplitModule(Mod) || !HasLargeCG(Mod, CombinedIndex)) {
       ProfitableToSplit = false;
       LLVM_DEBUG(dbgs() << "warning: thinlto split not enable for module: "
@@ -873,9 +874,9 @@ Error lto::thinBackend(const Config &Conf, unsigned Task, AddStreamFn AddStream,
 
   LLVM_DEBUG(dbgs() << "Running ThinLTO\n");
   if (CodeGenOnly) {
-    if (ThinLTOSplit && ProfitableToSplit)
+    if (LTOSplitByCG && ProfitableToSplit)
       splitOptAndCodeGenThin(Task, Conf, TM.get(), AddStream,
-                             ThinLTOSplitPartitions, Mod, CombinedIndex,
+                             LTOSplitPartitions, Mod, CombinedIndex,
                              CmdArgs, false, IRAddStream, BitcodeLibFuncs);
     else
       // If CodeGenOnly is set, we only perform code generation and skip
@@ -891,9 +892,9 @@ Error lto::thinBackend(const Config &Conf, unsigned Task, AddStreamFn AddStream,
   auto OptimizeAndCodegen =
       [&](Module &Mod, TargetMachine *TM,
           LLVMRemarkFileHandle DiagnosticOutputFile) {
-        if (ThinLTOSplit && ProfitableToSplit) {
+        if (LTOSplitByCG && ProfitableToSplit) {
           if (!splitOptAndCodeGenThin(
-                  Task, Conf, TM, AddStream, ThinLTOSplitPartitions, Mod,
+                  Task, Conf, TM, AddStream, LTOSplitPartitions, Mod,
                   CombinedIndex, CmdArgs, true, IRAddStream, BitcodeLibFuncs))
             return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile));
         } else {

>From 71c600ffb4c7220fc653805b625789261d36748b Mon Sep 17 00:00:00 2001
From: maojiaping <maojiaping1 at huawei.com>
Date: Fri, 12 Jun 2026 15:17:07 +0800
Subject: [PATCH 7/7] [ThinLTO][SplitModuleCG] Fix initialization order in
 constructor(NFC)

Fix -Wreorder-ctor error by matching member initializer order to
declaration order in the class (N, M, CG).
---
 llvm/lib/Transforms/Utils/SplitModuleCG.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Utils/SplitModuleCG.cpp b/llvm/lib/Transforms/Utils/SplitModuleCG.cpp
index ae345924e4074..e2ec77bf61bd4 100644
--- a/llvm/lib/Transforms/Utils/SplitModuleCG.cpp
+++ b/llvm/lib/Transforms/Utils/SplitModuleCG.cpp
@@ -305,7 +305,7 @@ void SplitModuleCG::SplitModule(ModuleCreationCallback ModuleCallback,
 SplitModuleCG::SplitModuleCG(Module &M,
                              const ModuleSummaryIndex &CombinedIndex,
                              unsigned LimitPartition)
-    : M(M), CG(M), N(LimitPartition) {
+    : N(LimitPartition), M(M), CG(M) {
   // Track existing non-local symbols. This ensures that when we promote
   // internal symbols to external for partitioning, we can handle renaming
   // and avoid conflicts.



More information about the cfe-commits mailing list