[llvm] [DTLTO][LLVM] Integrated Distributed ThinLTO (DTLTO) (PR #127749)
Katya Romanova via llvm-commits
llvm-commits at lists.llvm.org
Sun Mar 16 02:09:50 PDT 2025
================
@@ -2142,3 +2196,324 @@ std::vector<int> lto::generateModulesOrdering(ArrayRef<BitcodeModule *> R) {
});
return ModulesOrdering;
}
+
+namespace {
+// For this out-of-process backend, no codegen is done when invoked for each
+// task. Instead, we generate the information required (e.g., the summary
+// index shard, import list, etc.) for the codegen to be performed
+// externally. This backend's `wait` function then invokes an external
+// distributor process to perform the backend compilations.
+class OutOfProcessThinBackend : public CGThinBackend {
+ using SString = SmallString<128>;
+
+ AddBufferFn AddBuffer;
+
+ BumpPtrAllocator Alloc;
+ StringSaver Saver{Alloc};
+
+ SString LinkerOutputFile;
+ SString DistributorPath;
+ bool SaveTemps;
+
+ SmallVector<StringRef, 0> CodegenOptions;
+ DenseSet<StringRef> AdditionalInputs;
+
+ // Information specific to an individual backend compilation job.
+ struct Job {
+ unsigned Task;
+ StringRef ModuleID;
+ StringRef Triple;
+ StringRef NativeObjectPath;
+ StringRef SummaryIndexPath;
+ ImportsFilesContainer ImportFiles;
+ };
+ // The set of backend compilation jobs.
+ SmallVector<Job> Jobs;
+
+ // A unique string to identify the current link.
+ SmallString<8> UID;
+
+ // The first ReservedTasks entries in the task range are used for Full LTO.
+ unsigned ReservedTasks;
+
+public:
+ OutOfProcessThinBackend(
+ const Config &Conf, ModuleSummaryIndex &CombinedIndex,
+ ThreadPoolStrategy ThinLTOParallelism,
+ const DenseMap<StringRef, GVSummaryMapTy> &ModuleToDefinedGVSummaries,
+ AddStreamFn AddStream, AddBufferFn AddBuffer,
+ lto::IndexWriteCallback OnWrite, bool ShouldEmitIndexFiles,
+ bool ShouldEmitImportsFiles, StringRef LinkerOutputFile,
+ StringRef Distributor, bool SaveTemps)
+ : CGThinBackend(Conf, CombinedIndex, ModuleToDefinedGVSummaries, OnWrite,
+ ShouldEmitIndexFiles, ShouldEmitImportsFiles,
+ ThinLTOParallelism),
+ AddBuffer(std::move(AddBuffer)), LinkerOutputFile(LinkerOutputFile),
+ DistributorPath(Distributor), SaveTemps(SaveTemps) {}
+
+ virtual void setup(unsigned MaxTasks, unsigned ReservedTasks) override {
+ UID = itostr(sys::Process::getProcessId());
+ Jobs.resize((size_t)MaxTasks);
+ this->ReservedTasks = ReservedTasks;
+ }
+
+ Error start(
+ unsigned Task, BitcodeModule BM,
+ const FunctionImporter::ImportMapTy &ImportList,
+ const FunctionImporter::ExportSetTy &ExportList,
+ const std::map<GlobalValue::GUID, GlobalValue::LinkageTypes> &ResolvedODR,
+ MapVector<StringRef, BitcodeModule> &ModuleMap,
+ DenseMap<StringRef, std::string> &ModuleTriples) override {
+
+ StringRef ModulePath = BM.getModuleIdentifier();
+
+ SString ObjFilePath = sys::path::parent_path(LinkerOutputFile);
+ sys::path::append(ObjFilePath, sys::path::stem(ModulePath) + "." +
+ itostr(Task) + "." + UID + ".native.o");
+
+ Job &J = Jobs[Task - ReservedTasks];
+ J = {Task,
+ ModulePath,
+ ModuleTriples[ModulePath],
+ Saver.save(ObjFilePath.str()),
+ Saver.save(ObjFilePath.str() + ".thinlto.bc"),
+ {}};
+
+ assert(ModuleToDefinedGVSummaries.count(ModulePath));
+ BackendThreadPool.async(
+ [=](Job &J, const FunctionImporter::ImportMapTy &ImportList) {
+ if (LLVM_ENABLE_THREADS && Conf.TimeTraceEnabled)
+ timeTraceProfilerInitialize(Conf.TimeTraceGranularity,
+ "thin backend");
+ if (auto E = emitFiles(ImportList, J.ModuleID, J.SummaryIndexPath,
+ J.ModuleID.str(), J.ImportFiles)) {
+ std::unique_lock<std::mutex> L(ErrMu);
+ if (Err)
+ Err = joinErrors(std::move(*Err), std::move(E));
+ else
+ Err = std::move(E);
+ }
+ if (LLVM_ENABLE_THREADS && Conf.TimeTraceEnabled)
+ timeTraceProfilerFinishThread();
+ },
+ std::ref(J), std::ref(ImportList));
+
+ return Error::success();
+ }
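+ // Illustrative example (values are hypothetical, not produced by this
+ // patch): for a module "dir/foo.o" handled as Task 5, with ReservedTasks
+ // == 2 and a UID of "1234", the job is stored at Jobs[3] and the paths
+ // constructed in start() are:
+ //   NativeObjectPath: <linker output dir>/foo.5.1234.native.o
+ //   SummaryIndexPath: <linker output dir>/foo.5.1234.native.o.thinlto.bc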
+
+ // Derive a set of Clang options that will be shared/common for all DTLTO
+ // backend compilations. We are intentionally minimal here as these options
+ // must remain synchronized with the behavior of Clang. DTLTO does not support
+ // all the features available with in-process LTO. More features are expected
+ // to be added over time. Users can specify Clang options directly if a
+ // feature is not supported. Note that explicitly specified options that imply
+ // additional input or output file dependencies must be communicated to the
+ // distribution system, potentially by setting extra options on the
+ // distributor program.
+ // TODO: If this strategy of deriving options proves insufficient, alternative
+ // approaches should be considered, such as:
+ // - A serialization/deserialization format for LTO configuration.
+ // - Modifying LLD to be the tool that performs the backend compilations.
+ void buildCommonRemoteCompilerOptions() {
+ const lto::Config &C = Conf;
+ auto &Ops = CodegenOptions;
+ llvm::Triple TT{Jobs.front().Triple};
+
+ Ops.push_back(Saver.save("-O" + Twine(C.OptLevel)));
+
+ if (C.Options.EmitAddrsig)
+ Ops.push_back("-faddrsig");
+ if (C.Options.FunctionSections)
+ Ops.push_back("-ffunction-sections");
+ if (C.Options.DataSections)
+ Ops.push_back("-fdata-sections");
+
+ if (C.RelocModel == Reloc::PIC_)
+ // Clang doesn't have -fpic for all triples.
+ if (!TT.isOSBinFormatCOFF())
+ Ops.push_back("-fpic");
+
+ // Turn on/off warnings about a profile CFG mismatch (default: on);
+ // controlled by --lto-pgo-warn-mismatch.
+ if (!C.PGOWarnMismatch) {
+ Ops.push_back("-mllvm");
+ Ops.push_back("-no-pgo-warn-mismatch");
+ }
+
+ // Enable sample-based profile-guided optimizations. The sample profile
+ // file path is given by --lto-sample-profile=<value>.
+ if (!C.SampleProfile.empty()) {
+ Ops.push_back(
+ Saver.save("-fprofile-sample-use=" + Twine(C.SampleProfile)));
+ AdditionalInputs.insert(C.SampleProfile);
+ }
+
+ // We don't know which of these options will be used by Clang.
+ Ops.push_back("-Wno-unused-command-line-argument");
+
+ // Forward any supplied options.
+ if (!ThinLTORemoteCompilerArgs.empty())
+ for (auto &a : ThinLTORemoteCompilerArgs)
+ Ops.push_back(a);
+ }
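+ // As a sketch (assuming an ELF target with -O2, function/data sections,
+ // and PIC enabled, and no extra user-supplied options), the common option
+ // set built above would be roughly:
+ //   -O2 -ffunction-sections -fdata-sections -fpic
+ //   -Wno-unused-command-line-argument
+ // with "-mllvm -no-pgo-warn-mismatch" and/or "-fprofile-sample-use=<file>"
+ // appended when the corresponding Config fields are set.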
+
+ // Generates a JSON file for the distributor that describes the backend
+ // compilations.
+ bool emitDistributorJson(StringRef DistributorJson) {
+ using json::Array;
+ std::error_code EC;
+ raw_fd_ostream OS(DistributorJson, EC);
+ if (EC)
+ return false;
+
+ json::OStream JOS(OS);
+ JOS.object([&]() {
+ // Information common to all jobs. Note that we use a custom syntax for
+ // referencing, by index, into the per-job input and output file arrays.
+ JOS.attributeObject("common", [&]() {
+ JOS.attribute("linker_output", LinkerOutputFile);
+
+ // Common command line template.
+ JOS.attributeArray("args", [&]() {
+ JOS.value(ThinLTORemoteCompiler);
+
+ // Reference to Job::NativeObjectPath.
+ JOS.value("-o");
+ JOS.value(Array{"primary_output", 0});
+
+ JOS.value("-c");
+
+ JOS.value("-x");
+ JOS.value("ir");
+
+ // Reference to Job::ModuleID.
+ JOS.value(Array{"primary_input", 0});
+
+ // Reference to Job::SummaryIndexPath.
+ JOS.value(Array{"summary_index", "-fthinlto-index=", 0});
+ JOS.value(Saver.save("--target=" + Twine(Jobs.front().Triple)));
+
+ for (const auto &A : CodegenOptions)
+ JOS.value(A);
+ });
+ });
+ JOS.attributeArray("jobs", [&]() {
+ for (const auto &J : Jobs) {
+ assert(J.Task != 0);
+ JOS.object([&]() {
+ JOS.attribute("primary_input", Array{J.ModuleID});
+ JOS.attribute("summary_index", Array{J.SummaryIndexPath});
+ JOS.attribute("primary_output", Array{J.NativeObjectPath});
+
+ // Add the bitcode files from which imports will be made. These do
+ // not appear on the command line but are recorded in the summary
+ // index shard.
+ JOS.attribute("imports", Array(J.ImportFiles));
+
+ // Add any input files that are common to every invocation. These
+ // filenames are duplicated in the command line template and in each
+ // per-job "additional_inputs" array. However, this small amount of
+ // duplication makes the schema simpler.
+ JOS.attribute("additional_inputs", Array(AdditionalInputs));
+ });
+ }
+ });
+ });
+
+ return true;
+ }
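+ // A minimal sketch of the emitted JSON (filenames and options below are
+ // hypothetical):
+ //   {
+ //     "common": {
+ //       "linker_output": "a.out",
+ //       "args": ["<remote-compiler>", "-o", ["primary_output", 0], "-c",
+ //                "-x", "ir", ["primary_input", 0],
+ //                ["summary_index", "-fthinlto-index=", 0],
+ //                "--target=x86_64-unknown-linux-gnu", "-O2"]
+ //     },
+ //     "jobs": [
+ //       {
+ //         "primary_input": ["foo.o"],
+ //         "summary_index": ["foo.1.1234.native.o.thinlto.bc"],
+ //         "primary_output": ["foo.1.1234.native.o"],
+ //         "imports": ["bar.o"],
+ //         "additional_inputs": []
+ //       }
+ //     ]
+ //   }
+ // Each ["<array name>", ..., <index>] entry in "args" uses the custom
+ // reference syntax noted above: an index into the named per-job file
+ // array.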
+
+ void removeFile(StringRef FileName) {
+ std::error_code EC = sys::fs::remove(FileName, true);
+ if (EC && EC != std::make_error_code(std::errc::no_such_file_or_directory))
+ errs() << "warning: could not remove the file '" << FileName
+ << "': " << EC.message() << "\n";
+ }
+
+ Error wait() override {
+ // Wait for the information on the required backend compilations to be
+ // gathered.
+ BackendThreadPool.wait();
+ if (Err)
+ return std::move(*Err);
+
+ auto CleanPerJobFiles = llvm::make_scope_exit([&] {
+ if (!SaveTemps)
+ for (auto &Job : Jobs) {
+ removeFile(Job.NativeObjectPath);
+ if (!ShouldEmitIndexFiles)
+ removeFile(Job.SummaryIndexPath);
+ }
+ });
+
+ const StringRef BCError = "DTLTO backend compilation: ";
+
+ // TODO: If we move to using an optimisation tool that does not require an
----------------
romanova-ekaterina wrote:
> Due to issues like this, where the DTLTO backend has quite different requirements from the existing backends, my colleagues have been experimenting with not having DTLTO run as a ThinLTO backend at all. Instead, for DTLTO, LLD would call separate code to run the thin-link, generate the required information for the backend compilations, perform the backend compilations, and insert the resulting native object files into the link. Hopefully that approach will be ready to show soon for you (and other reviewers) to contrast with the current one.
The “No backend DTLTO” branch is ready. Please have a look and let us know what you think. https://github.com/romanova-ekaterina/llvm-project/pull/new/kromanova/main/integrated-DTLTO-no-backend
This comment https://github.com/llvm/llvm-project/pull/127749#issuecomment-2727266591 has more details about the differences between “Out of process (DTLTO) backend” branch and “No backend” DTLTO branch.
https://github.com/llvm/llvm-project/pull/127749
More information about the llvm-commits
mailing list