[llvm] [AMDGPU] Introduce "amdgpu-sw-lower-lds" pass to lower LDS accesses to use device global memory. (PR #87265)

Fri Apr 19 04:10:41 PDT 2024

================
@@ -65,6 +68,219 @@ bool isLDSVariableToLower(const GlobalVariable &GV) {
   return true;
 }
 
+bool eliminateConstantExprUsesOfLDSFromAllInstructions(Module &M) {
+  // Constants are uniqued within LLVM. A ConstantExpr referring to a LDS
+  // global may have uses from multiple different functions as a result.
+  // This pass specialises LDS variables with respect to the kernel that
+  // allocates them.
+
+  // This is semantically equivalent to (the unimplemented as slow):
+  // for (auto &F : M.functions())
+  //   for (auto &BB : F)
+  //     for (auto &I : BB)
+  //       for (Use &Op : I.operands())
+  //         if (constantExprUsesLDS(Op))
+  //           replaceConstantExprInFunction(I, Op);
+
+  SmallVector<Constant *> LDSGlobals;
+  for (auto &GV : M.globals())
+    if (AMDGPU::isLDSVariableToLower(GV))
+      LDSGlobals.push_back(&GV);
+  return convertUsersOfConstantsToInstructions(LDSGlobals);
+}
+
+void getUsesOfLDSByFunction(CallGraph const &CG, Module &M,
+                            FunctionVariableMap &kernels,
+                            FunctionVariableMap &functions) {
+  // Get uses from the current function, excluding uses by called functions
+  // Two output variables to avoid walking the globals list twice
+  for (auto &GV : M.globals()) {
+    if (!AMDGPU::isLDSVariableToLower(GV)) {
+      continue;
+    }
+    for (User *V : GV.users()) {
+      if (auto *I = dyn_cast<Instruction>(V)) {
+        Function *F = I->getFunction();
+        if (isKernelLDS(F)) {
+          kernels[F].insert(&GV);
+        } else {
+          functions[F].insert(&GV);
+        }
+      }
+    }
+  }
+}
+
+bool isKernelLDS(const Function *F) {
+  // Some weirdness here. AMDGPU::isKernelCC does not call into
+  // AMDGPU::isKernel with the calling conv, it instead calls into
+  // isModuleEntryFunction which returns true for more calling conventions
+  // than AMDGPU::isKernel does. There's a FIXME on AMDGPU::isKernel.
+  // There's also a test that checks that the LDS lowering does not hit on
+  // a graphics shader, denoted amdgpu_ps, so stay with the limited case.
+  // Putting LDS in the name of the function to draw attention to this.
+  return AMDGPU::isKernel(F->getCallingConv());
+}
+
+LDSUsesInfoTy getTransitiveUsesOfLDS(CallGraph const &CG, Module &M) {
+
+  FunctionVariableMap direct_map_kernel;
+  FunctionVariableMap direct_map_function;
+  getUsesOfLDSByFunction(CG, M, direct_map_kernel, direct_map_function);
+
+  // Collect variables that are used by functions whose address has escaped
+  DenseSet<GlobalVariable *> VariablesReachableThroughFunctionPointer;
+  for (Function &F : M.functions()) {
+    if (!isKernelLDS(&F))
+      if (F.hasAddressTaken(nullptr,
+                            /* IgnoreCallbackUses */ false,
+                            /* IgnoreAssumeLikeCalls */ false,
+                            /* IgnoreLLVMUsed */ true,
+                            /* IgnoreArcAttachedCall */ false)) {
+        set_union(VariablesReachableThroughFunctionPointer,
+                  direct_map_function[&F]);
+      }
+  }
+
+  auto functionMakesUnknownCall = [&](const Function *F) -> bool {
+    assert(!F->isDeclaration());
+    for (const CallGraphNode::CallRecord &R : *CG[F]) {
+      if (!R.second->getFunction()) {
+        return true;
+      }
+    }
+    return false;
+  };
+
+  // Work out which variables are reachable through function calls
+  FunctionVariableMap transitive_map_function = direct_map_function;
+
+  // If the function makes any unknown call, assume the worst case that it can
+  // access all variables accessed by functions whose address escaped
+  for (Function &F : M.functions()) {
+    if (!F.isDeclaration() && functionMakesUnknownCall(&F)) {
+      if (!isKernelLDS(&F)) {
+        set_union(transitive_map_function[&F],
+                  VariablesReachableThroughFunctionPointer);
+      }
+    }
+  }
+
+  // Direct implementation of collecting all variables reachable from each
+  // function
+  for (Function &Func : M.functions()) {
+    if (Func.isDeclaration() || isKernelLDS(&Func))
+      continue;
+
+    DenseSet<Function *> seen; // catches cycles
+    SmallVector<Function *, 4> wip{&Func};
+
+    while (!wip.empty()) {
+      Function *F = wip.pop_back_val();
+
+      // Can accelerate this by referring to transitive map for functions that
+      // have already been computed, with more care than this
+      set_union(transitive_map_function[&Func], direct_map_function[F]);
+
+      for (const CallGraphNode::CallRecord &R : *CG[F]) {
+        Function *ith = R.second->getFunction();
----------------
skc7 wrote:

Updated pre-requisite PR #88002 which has AMDGPUmemoryUtils changes. Changes here will be removed once #88002 gets merged.

https://github.com/llvm/llvm-project/pull/87265