[llvm] [DTLTO][LLVM] Integrated Distributed ThinLTO (DTLTO) (PR #127749)
Steven Wu via llvm-commits
llvm-commits at lists.llvm.org
Thu Mar 6 13:50:32 PST 2025
================
@@ -2142,3 +2195,319 @@ std::vector<int> lto::generateModulesOrdering(ArrayRef<BitcodeModule *> R) {
});
return ModulesOrdering;
}
+
+namespace {
+// For this out-of-process backend no codegen is done when invoked for each
+// task. Instead we generate the required information (e.g. the summary index
+// shard, import list, etc..) to allow for the codegen to be performed
+// externally . This backend's `wait` function then invokes an external
+// distributor process to do backend compilations.
+class OutOfProcessThinBackend : public CGThinBackend {
+ using SString = SmallString<128>;
+
+ AddBufferFn AddBuffer;
+
+ BumpPtrAllocator Alloc;
+ StringSaver Saver{Alloc};
+
+ SString LinkerOutputFile;
+ SString DistributorPath;
+ bool SaveTemps;
+
+ SmallVector<StringRef, 0> CodegenOptions;
+ DenseSet<StringRef> AdditionalInputs;
+
+ // Information specific to individual backend compilation job.
+ struct Job {
+ unsigned Task;
+ StringRef ModuleID;
+ StringRef Triple;
+ StringRef NativeObjectPath;
+ StringRef SummaryIndexPath;
+ ImportsFilesContainer ImportFiles;
+ };
+ // The set of backend compilations jobs.
+ SmallVector<Job> Jobs;
+
+ // A unique string to identify the current link.
+ SmallString<8> UID;
+
+ // The first ReservedTasks entries in the task range are used for Full LTO.
+ unsigned ReservedTasks;
+
+public:
+ OutOfProcessThinBackend(
+ const Config &Conf, ModuleSummaryIndex &CombinedIndex,
+ ThreadPoolStrategy ThinLTOParallelism,
+ const DenseMap<StringRef, GVSummaryMapTy> &ModuleToDefinedGVSummaries,
+ AddStreamFn AddStream, AddBufferFn AddBuffer,
+ lto::IndexWriteCallback OnWrite, bool ShouldEmitIndexFiles,
+ bool ShouldEmitImportsFiles, StringRef LinkerOutputFile,
+ StringRef Distributor, bool SaveTemps)
+ : CGThinBackend(Conf, CombinedIndex, ModuleToDefinedGVSummaries, OnWrite,
+ ShouldEmitIndexFiles, ShouldEmitImportsFiles,
+ ThinLTOParallelism),
+ AddBuffer(std::move(AddBuffer)), LinkerOutputFile(LinkerOutputFile),
+ DistributorPath(Distributor), SaveTemps(SaveTemps) {}
+
+ virtual void setup(unsigned MaxTasks, unsigned ReservedTasks) override {
+ UID = itostr(sys::Process::getProcessId());
+ Jobs.resize((size_t)MaxTasks);
+ this->ReservedTasks = ReservedTasks;
+ }
+
+ Error start(
+ unsigned Task, BitcodeModule BM,
+ const FunctionImporter::ImportMapTy &ImportList,
+ const FunctionImporter::ExportSetTy &ExportList,
+ const std::map<GlobalValue::GUID, GlobalValue::LinkageTypes> &ResolvedODR,
+ MapVector<StringRef, BitcodeModule> &ModuleMap,
+ DenseMap<StringRef, std::string> &ModuleTriples) override {
+
+ StringRef ModulePath = BM.getModuleIdentifier();
+
+ SString ObjFilePath = sys::path::parent_path(LinkerOutputFile);
+ sys::path::append(ObjFilePath, sys::path::stem(ModulePath) + "." +
+ itostr(Task) + "." + UID + ".native.o");
+
+ Job &J = Jobs[Task - ReservedTasks];
+ J = {Task,
+ ModulePath,
+ ModuleTriples[ModulePath],
+ Saver.save(ObjFilePath.str()),
+ Saver.save(ObjFilePath.str() + ".thinlto.bc"),
+ {}};
+
+ assert(ModuleToDefinedGVSummaries.count(ModulePath));
+ BackendThreadPool.async(
+ [=](Job &J, const FunctionImporter::ImportMapTy &ImportList) {
+ if (auto E = emitFiles(ImportList, J.ModuleID, J.SummaryIndexPath,
+ J.ModuleID.str(), J.ImportFiles)) {
+ std::unique_lock<std::mutex> L(ErrMu);
+ if (Err)
+ Err = joinErrors(std::move(*Err), std::move(E));
+ else
+ Err = std::move(E);
+ }
+ },
+ std::ref(J), std::ref(ImportList));
+
+ return Error::success();
+ }
+
+ // Derive a set of Clang options that will be shared/common for all DTLTO
+ // backend compilations. We are intentionally minimal here as these options
+ // must remain synchronized with the behavior of Clang. DTLTO does not support
+ // all the features available with in-process LTO. More features are expected
+ // to be added over time. Users can specify Clang options directly if a
+ // feature is not supported. Note that explicitly specified options that imply
+ // additional input or output file dependencies must be communicated to the
+ // distribution system, potentially by setting extra options on the
+ // distributor program.
+ // TODO: If this strategy of deriving options proves insufficient, alternative
+ // approaches should be considered, such as:
+ // - A serialization/deserialization format for LTO configuration.
+ // - Modifying LLD to be the tool that performs the backend compilations.
+ void buildCommonRemoteCompilerOptions() {
+ const lto::Config &C = Conf;
+ auto &Ops = CodegenOptions;
+ llvm::Triple TT{Jobs.front().Triple};
+
+ Ops.push_back(Saver.save("-O" + Twine(C.OptLevel)));
+
+ if (C.Options.EmitAddrsig)
+ Ops.push_back("-faddrsig");
+ if (C.Options.FunctionSections)
+ Ops.push_back("-ffunction-sections");
+ if (C.Options.DataSections)
+ Ops.push_back("-fdata-sections");
+
+ if (C.RelocModel == Reloc::PIC_)
+ // Clang doesn't have -fpic for all triples.
+ if (!TT.isOSBinFormatCOFF())
+ Ops.push_back("-fpic");
+
+ // Turn on/off warnings about profile cfg mismatch (default on)
+ // --lto-pgo-warn-mismatch.
+ if (!C.PGOWarnMismatch) {
+ Ops.push_back("-mllvm");
+ Ops.push_back("-no-pgo-warn-mismatch");
+ }
+
+ // Enable sample-based profile guided optimizations.
+ // Sample profile file path --lto-sample-profile=<value>.
+ if (!C.SampleProfile.empty()) {
+ Ops.push_back(
+ Saver.save("-fprofile-sample-use=" + Twine(C.SampleProfile)));
+ AdditionalInputs.insert(C.SampleProfile);
+ }
+
+ // We don't know which of options will be used by Clang.
+ Ops.push_back("-Wno-unused-command-line-argument");
+
+ // Forward any supplied options.
+ if (!ThinLTORemoteCompilerArgs.empty())
+ for (auto &a : ThinLTORemoteCompilerArgs)
+ Ops.push_back(a);
+ }
+
+ // Generates a JSON file describing the backend compilations, for the
+ // distributor.
+ bool emitDistributorJson(StringRef DistributorJson) {
+ using json::Array;
+ std::error_code EC;
+ raw_fd_ostream OS(DistributorJson, EC);
+ if (EC)
+ return false;
+
+ json::OStream JOS(OS);
+ JOS.object([&]() {
+ // Information common to all jobs note that we use a custom syntax for
+ // referencing by index into the job input and output file arrays.
+ JOS.attributeObject("common", [&]() {
+ JOS.attribute("linker_output", LinkerOutputFile);
+
+ // Common command line template.
+ JOS.attributeArray("args", [&]() {
+ JOS.value(ThinLTORemoteCompiler);
+
+ // Reference to Job::NativeObjectPath.
+ JOS.value("-o");
+ JOS.value(Array{"primary_output", 0});
+
+ JOS.value("-c");
+
+ JOS.value("-x");
+ JOS.value("ir");
+
+ // Reference to Job::ModuleID.
+ JOS.value(Array{"primary_input", 0});
+
+ // Reference to Job::SummaryIndexPath.
+ JOS.value(Array{"summary_index", "-fthinlto-index=", 0});
+ JOS.value(Saver.save("--target=" + Twine(Jobs.front().Triple)));
+
+ for (const auto &A : CodegenOptions)
+ JOS.value(A);
+ });
+ });
+ JOS.attributeArray("jobs", [&]() {
+ for (const auto &J : Jobs) {
+ assert(J.Task != 0);
+ JOS.object([&]() {
+ JOS.attribute("primary_input", Array{J.ModuleID});
+ JOS.attribute("summary_index", Array{J.SummaryIndexPath});
+ JOS.attribute("primary_output", Array{J.NativeObjectPath});
+
+ // Add the bitcode files from which imports will be made. These do
+ // not appear on the command line but are recorded in the summary
+ // index shard.
+ JOS.attribute("imports", Array(J.ImportFiles));
+
+ // Add any input files that are common to each invocation. These
+ // filenames are duplicated in the command line template and in
+ // each of the per job "inputs" array. However, this small amount
+ // of duplication makes the schema simpler.
+ JOS.attribute("additional_inputs", Array(AdditionalInputs));
+ });
+ }
+ });
+ });
+
+ return true;
+ }
+
+ void removeFile(StringRef FileName) {
+ std::error_code EC = sys::fs::remove(FileName, true);
+ if (EC && EC != std::make_error_code(std::errc::no_such_file_or_directory))
+ errs() << "warning: could not remove the file '" << FileName
+ << "': " << EC.message() << "\n";
+ }
+
+ Error wait() override {
+ // Wait for the information on the required backend compilations to be
+ // gathered.
+ BackendThreadPool.wait();
+ if (Err)
+ return std::move(*Err);
+
+ auto CleanPerJobFiles = llvm::make_scope_exit([&] {
+ if (!SaveTemps)
+ for (auto &Job : Jobs) {
+ removeFile(Job.NativeObjectPath);
+ if (!ShouldEmitIndexFiles)
+ removeFile(Job.SummaryIndexPath);
+ }
+ });
+
+ const StringRef BCError = "DTLTO backend compilation: ";
+
+ // TODO: If we move to using an optimisation tool that does not require an
+ // explicit triple to be passed then the triple handling can be removed
+ // entirely.
+ if (!llvm::all_of(Jobs, [&](const auto &Job) {
+ return Job.Triple == Jobs.front().Triple;
+ }))
+ return make_error<StringError>(BCError + "all triples must be consistent",
+ inconvertibleErrorCode());
+
+ buildCommonRemoteCompilerOptions();
+
+ SString JsonFile = sys::path::parent_path(LinkerOutputFile);
+ sys::path::append(JsonFile, sys::path::stem(LinkerOutputFile) + "." + UID +
+ ".dist-file.json");
+ if (!emitDistributorJson(JsonFile))
+ return make_error<StringError>(
+ BCError + "failed to generate distributor JSON script: " + JsonFile,
+ inconvertibleErrorCode());
+ auto CleanJson = llvm::make_scope_exit([&] {
+ if (!SaveTemps)
+ removeFile(JsonFile);
+ });
+
+ SmallVector<StringRef, 3> Args = {DistributorPath};
+ llvm::append_range(Args, AdditionalThinLTODistributorArgs);
+ Args.push_back(JsonFile);
+ std::string ErrMsg;
+ if (sys::ExecuteAndWait(Args[0], Args,
+ /*Env=*/std::nullopt, /*Redirects=*/{},
+ /*SecondsToWait=*/0, /*MemoryLimit=*/0, &ErrMsg)) {
+ return make_error<StringError>(
+ BCError + "distributor execution failed" +
+ (!ErrMsg.empty() ? ": " + ErrMsg + Twine(".") : Twine(".")),
+ inconvertibleErrorCode());
+ }
+
+ for (auto &Job : Jobs) {
----------------
cachemeifyoucan wrote:
Maybe using the thread pool again here to read file in parallel?
Also why specifically pass `AddBuffer` function all the way here since `AddStream` pretty much does the same thing?
https://github.com/llvm/llvm-project/pull/127749
More information about the llvm-commits
mailing list