[llvm] [AMDGPU] Graph-based Module Splitting Rewrite (PR #104763)
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Tue Aug 20 10:33:40 PDT 2024
================
@@ -651,138 +1317,176 @@ static void splitAMDGPUModule(
if (!NoExternalizeGlobals) {
for (auto &GV : M.globals()) {
if (GV.hasLocalLinkage())
- SML << "[externalize] GV " << GV.getName() << '\n';
+ LLVM_DEBUG(dbgs() << "[externalize] GV " << GV.getName() << '\n');
externalize(GV);
}
}
// Start by calculating the cost of every function in the module, as well as
// the module's overall cost.
- DenseMap<const Function *, CostType> FnCosts;
- const CostType ModuleCost = calculateFunctionCosts(SML, GetTTI, M, FnCosts);
-
- // First, gather ever kernel into the worklist.
- SmallVector<FunctionWithDependencies> WorkList;
- for (auto &Fn : M) {
- if (isEntryPoint(&Fn) && !Fn.isDeclaration())
- WorkList.emplace_back(SML, CG, FnCosts, &Fn);
+ FunctionsCostMap FnCosts;
+ const CostType ModuleCost = calculateFunctionCosts(GetTTI, M, FnCosts);
+
+ // Build the SplitGraph, which represents the module's functions and models
+ // their dependencies accurately.
+ SplitGraph SG(M, FnCosts, ModuleCost);
+ SG.buildGraph(CG);
+
+ if (SG.empty()) {
+ LLVM_DEBUG(
+ dbgs()
+ << "[!] no nodes in graph, input is empty - no splitting possible\n");
+ ModuleCallback(cloneAll(M));
+ return;
}
- // Then, find missing functions that need to be considered as additional
- // roots. These can't be called in theory, but in practice we still have to
- // handle them to avoid linker errors.
- {
- DenseSet<const Function *> SeenFunctions;
- for (const auto &FWD : WorkList) {
- SeenFunctions.insert(FWD.Fn);
- SeenFunctions.insert(FWD.Dependencies.begin(), FWD.Dependencies.end());
+ LLVM_DEBUG({
+ dbgs() << "[graph] nodes:\n";
+ for (const SplitGraph::Node *N : SG.nodes()) {
+ dbgs() << " - [" << N->getID() << "]: " << N->getName() << " "
+ << (N->isGraphEntryPoint() ? "(entry)" : "") << "\n";
}
+ });
- for (auto &Fn : M) {
- // If this function is not part of any kernel's dependencies and isn't
- // directly called, consider it as a root.
- if (!Fn.isDeclaration() && !isEntryPoint(&Fn) &&
- !SeenFunctions.count(&Fn) && !hasDirectCaller(Fn)) {
- WorkList.emplace_back(SML, CG, FnCosts, &Fn);
- }
- }
- }
+ writeDOTGraph(SG);
- // Sort the worklist so the most expensive roots are seen first.
- sort(WorkList, [&](auto &A, auto &B) {
- // Sort by total cost, and if the total cost is identical, sort
- // alphabetically.
- if (A.TotalCost == B.TotalCost)
- return A.Fn->getName() < B.Fn->getName();
- return A.TotalCost > B.TotalCost;
- });
+ LLVM_DEBUG(dbgs() << "[search] testing splitting strategies\n");
- if (SML) {
- SML << "Worklist\n";
- for (const auto &FWD : WorkList) {
- SML << "[root] " << getName(*FWD.Fn) << " (totalCost:" << FWD.TotalCost
- << " indirect:" << FWD.HasIndirectCall
- << " hasNonDuplicatableDep:" << FWD.HasNonDuplicatableDependecy
- << ")\n";
- // Sort function names before printing to ensure determinism.
- SmallVector<std::string> SortedDepNames;
- SortedDepNames.reserve(FWD.Dependencies.size());
- for (const auto *Dep : FWD.Dependencies)
- SortedDepNames.push_back(getName(*Dep));
- sort(SortedDepNames);
-
- for (const auto &Name : SortedDepNames)
- SML << " [dependency] " << Name << '\n';
- }
+ std::optional<SplitProposal> Proposal;
+ const auto EvaluateProposal = [&](SplitProposal SP) {
+ if (!Proposal)
+ Proposal = std::move(SP);
+ else
+ evaluateProposal(*Proposal, std::move(SP));
+ };
+
+ // TODO: It would be very easy to create new strategies by just adding a base
+ // class to RecursiveSearchSplitting and abstracting it away.
+ RecursiveSearchSplitting(SG, NumParts, EvaluateProposal).run();
+ LLVM_DEBUG(if (Proposal) dbgs() << "[search done] selected proposal: "
+ << Proposal->getName() << "\n";);
+
+ if (!Proposal) {
+ LLVM_DEBUG(dbgs() << "[!] no proposal made, no splitting possible!\n");
+ ModuleCallback(cloneAll(M));
+ return;
}
- // This performs all of the partitioning work.
- auto Partitions = doPartitioning(SML, M, N, ModuleCost, FnCosts, WorkList);
- assert(Partitions.size() == N);
+ LLVM_DEBUG(Proposal->print(dbgs()););
- // If we didn't externalize GVs, then local GVs need to be conservatively
- // imported into every module (including their initializers), and then cleaned
- // up afterwards.
- const auto NeedsConservativeImport = [&](const GlobalValue *GV) {
- // We conservatively import private/internal GVs into every module and clean
- // them up afterwards.
- const auto *Var = dyn_cast<GlobalVariable>(GV);
- return Var && Var->hasLocalLinkage();
- };
+ std::optional<raw_fd_ostream> SummariesOS;
+ if (!PartitionSummariesOutput.empty()) {
+ std::error_code EC;
+ SummariesOS.emplace(PartitionSummariesOutput, EC);
+ if (EC)
+ errs() << "[" DEBUG_TYPE "]: cannot open '" << PartitionSummariesOutput
+ << "' - Partition summaries will not be printed\n";
+ }
+
+ for (unsigned PID = 0; PID < NumParts; ++PID) {
+ SplitModuleTimer SMT2("modules_creation",
+ "creating modules for each partition");
+ LLVM_DEBUG(dbgs() << "[split] creating new modules\n");
- SML << "Creating " << N << " modules...\n";
- unsigned TotalFnImpls = 0;
- for (unsigned I = 0; I < N; ++I) {
- const auto &FnsInPart = Partitions[I];
+ DenseSet<const Function *> FnsInPart;
+ for (unsigned NodeID : (*Proposal)[PID].set_bits())
+ FnsInPart.insert(&SG.getNode(NodeID).getFunction());
ValueToValueMapTy VMap;
+ CostType PartCost = 0;
std::unique_ptr<Module> MPart(
CloneModule(M, VMap, [&](const GlobalValue *GV) {
// Functions go in their assigned partition.
- if (const auto *Fn = dyn_cast<Function>(GV))
- return FnsInPart.contains(Fn);
-
- if (NeedsConservativeImport(GV))
- return true;
+ if (const auto *Fn = dyn_cast<Function>(GV)) {
+ if (FnsInPart.contains(Fn)) {
+ PartCost += SG.getCost(*Fn);
+ return true;
+ }
+ return false;
+ }
// Everything else goes in the first partition.
- return I == 0;
+ return needsConservativeImport(GV) || PID == 0;
}));
+ // FIXME: Aliases aren't seen often, and their handling isn't perfect so
+ // bugs are possible.
+
// Clean-up conservatively imported GVs without any users.
- for (auto &GV : make_early_inc_range(MPart->globals())) {
- if (NeedsConservativeImport(&GV) && GV.use_empty())
+ for (auto &GV : make_early_inc_range(MPart->global_values())) {
+ if (needsConservativeImport(&GV) && GV.use_empty())
GV.eraseFromParent();
}
- unsigned NumAllFns = 0, NumKernels = 0;
- for (auto &Cur : *MPart) {
- if (!Cur.isDeclaration()) {
- ++NumAllFns;
- if (isEntryPoint(&Cur))
- ++NumKernels;
- }
- }
- TotalFnImpls += NumAllFns;
- SML << " - Module " << I << " with " << NumAllFns << " functions ("
- << NumKernels << " kernels)\n";
+ if (SummariesOS)
+ printPartitionSummary(*SummariesOS, PID, *MPart, PartCost, ModuleCost);
+
+ LLVM_DEBUG(
+ printPartitionSummary(dbgs(), PID, *MPart, PartCost, ModuleCost));
+
ModuleCallback(std::move(MPart));
}
-
- SML << TotalFnImpls << " function definitions across all modules ("
- << format("%0.2f", (float(TotalFnImpls) / FnCosts.size()) * 100)
- << "% of original module)\n";
}
} // namespace
PreservedAnalyses AMDGPUSplitModulePass::run(Module &M,
ModuleAnalysisManager &MAM) {
+ SplitModuleTimer SMT(
+ "total", "total pass runtime (incl. potentially waiting for lockfile)");
+
FunctionAnalysisManager &FAM =
MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
const auto TTIGetter = [&FAM](Function &F) -> const TargetTransformInfo & {
return FAM.getResult<TargetIRAnalysis>(F);
};
- splitAMDGPUModule(TTIGetter, M, N, ModuleCallback);
- // We don't change the original module.
- return PreservedAnalyses::all();
+
+ bool Done = false;
+#ifndef NDEBUG
+ if (UseLockFile) {
+ SmallString<128> LockFilePath;
+ sys::path::system_temp_directory(/*ErasedOnReboot=*/true, LockFilePath);
+ sys::path::append(LockFilePath, "amdgpu-split-module-debug");
+ LLVM_DEBUG(dbgs() << DEBUG_TYPE " using lockfile '" << LockFilePath
+ << "'\n");
+
+ while (true) {
+ llvm::LockFileManager Locked(LockFilePath.str());
+ switch (Locked) {
+ case LockFileManager::LFS_Error:
+ errs() << "[amdgpu-split-module] unable to acquire lockfile, debug "
+ "output may be mangled by other processes\n";
+ Locked.unsafeRemoveLockFile();
+ break;
+ case LockFileManager::LFS_Owned:
+ break;
+ case LockFileManager::LFS_Shared: {
+ switch (Locked.waitForUnlock()) {
+ case LockFileManager::Res_Success:
+ break;
+ case LockFileManager::Res_OwnerDied:
+ continue; // try again to get the lock.
+ case LockFileManager::Res_Timeout:
+ errs() << "[amdgpu-split-module] unable to acquire lockfile, debug "
+ "output may be mangled by other processes\n";
----------------
arsenm wrote:
It's not about the number of times it is printed; the point is that the compiler, when used as a library, should never spam the console.
https://github.com/llvm/llvm-project/pull/104763
More information about the llvm-commits
mailing list