[llvm] [AMDGPU][LTO] Introduce AMDGPUCloneModuleLDS (PR #89683)
Pierre van Houtryve via llvm-commits
llvm-commits at lists.llvm.org
Wed Apr 24 01:47:49 PDT 2024
================
@@ -0,0 +1,139 @@
+//===-- AMDGPUCloneModuleLDSPass.cpp ------------------------------*- C++ -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// The purpose of this pass is to ensure that the combined module contains
+// as many LDS global variables as there are kernels that (indirectly) access
+// them. As LDS variables behave like C++ static variables, it is important that
+// each partition contains a unique copy of the variable on a per kernel basis.
+// This representation also prepares the combined module to eliminate
+// cross-module dependencies of LDS variables.
+//
+// This pass operates as follows:
+// 1. Firstly, traverse the call graph from each kernel to determine the number
+// of kernels calling each device function.
+// 2. For each LDS global variable GV, determine the function F that defines it.
+// Collect its caller functions. Clone F and GV, and finally insert a
+// call/invoke instruction in each caller function.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/Passes/PassBuilder.h"
+#include "llvm/Support/ScopedPrinter.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-clone-module-lds"
+
+/// Hidden command-line limit on the number of clones made of a function.
+/// Defaults to 16. When set to 1, AMDGPUCloneModuleLDSPass::run returns
+/// immediately with all analyses preserved, before inspecting the module.
+static cl::opt<unsigned int> MaxCountForClonedFunctions(
+ "clone-lds-functions-max-count", cl::init(16), cl::Hidden,
+ cl::desc("Specify a limit to the number of clones of a function"));
+
+/// Find a function that references \p GV through an instruction.
+///
+/// Walks the users of \p GV with a worklist. Users that are not instructions
+/// but are Operators (e.g. constant expressions wrapping \p GV) are looked
+/// through by queueing their own users. The search stops at the first
+/// Instruction reached, so if \p GV is referenced from several functions only
+/// one of them is reported.
+///
+/// \param GV The global variable in question
+/// \return The function containing the first instruction found to use \p GV,
+/// or nullptr if no instruction user is reachable. Callers must check for
+/// nullptr before dereferencing the result.
+static Function *getFunctionDefiningGV(GlobalVariable &GV) {
+  SmallVector<User *> Worklist(GV.users());
+  while (!Worklist.empty()) {
+    User *U = Worklist.pop_back_val();
+    if (auto *Inst = dyn_cast<Instruction>(U))
+      return Inst->getFunction();
+    // Not an instruction: keep searching through the users of this operator,
+    // which may eventually lead to an instruction that uses GV indirectly.
+    if (auto *Op = dyn_cast<Operator>(U))
+      append_range(Worklist, Op->users());
+  }
+  return nullptr;
+}
+
+PreservedAnalyses AMDGPUCloneModuleLDSPass::run(Module &M,
+ ModuleAnalysisManager &AM) {
+ if (MaxCountForClonedFunctions.getValue() == 1)
+ return PreservedAnalyses::all();
+
+ bool Changed = false;
+ auto &CG = AM.getResult<CallGraphAnalysis>(M);
+
+ // For each function in the call graph, determine the number
+ // of ancestor-caller kernels.
+ DenseMap<Function *, unsigned int> KernelRefsToFuncs;
+ for (auto &Fn : M) {
+ if (Fn.getCallingConv() != CallingConv::AMDGPU_KERNEL)
+ continue;
+ for (auto I = df_begin(&CG), E = df_end(&CG); I != E; ++I)
+ if (auto *F = I->getFunction())
+ KernelRefsToFuncs[F]++;
+ }
+
+ DenseMap<GlobalVariable *, Function *> GVToFnMap;
+ for (auto &GV : M.globals()) {
+ if (GVToFnMap.contains(&GV) ||
+ GV.getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS ||
+ !GV.hasInitializer())
+ continue;
+
+ auto *OldF = getFunctionDefiningGV(GV);
+ GVToFnMap.insert({&GV, OldF});
+ LLVM_DEBUG(dbgs() << "Found LDS " << GV.getName() << " used in function "
+ << OldF->getName() << '\n');
+
+ // Collect all call instructions to OldF
+ SmallVector<Instruction *> InstsCallingOldF;
+ for (auto &I : OldF->uses())
+ if (auto *CI = dyn_cast<CallBase>(I.getUser()))
+ InstsCallingOldF.push_back(CI);
----------------
Pierre-vh wrote:
```suggestion
for (auto &I : OldF->uses()) {
if (auto *CI = dyn_cast<CallBase>(I.getUser()))
InstsCallingOldF.push_back(CI);
}
```
https://github.com/llvm/llvm-project/pull/89683
More information about the llvm-commits
mailing list