[llvm] 2c5f470 - [AMDGPU] Move LDS utilities from amdgpu-lower-module-lds pass to AMDGPUMemoryUtils (#88002)

Thu May 9 22:19:53 PDT 2024

Author: Chaitanya
Date: 2024-05-10T10:49:48+05:30
New Revision: 2c5f470da6ab313f7c6a1aa53fb40dbcbde338f1

URL: https://github.com/llvm/llvm-project/commit/2c5f470da6ab313f7c6a1aa53fb40dbcbde338f1
DIFF: https://github.com/llvm/llvm-project/commit/2c5f470da6ab313f7c6a1aa53fb40dbcbde338f1.diff

LOG: [AMDGPU] Move LDS utilities from amdgpu-lower-module-lds pass to AMDGPUMemoryUtils (#88002)

This moves some of the utility methods from amdgpu-lower-module-lds pass to AMDGPUMemoryUtils.

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
    llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp
    llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
index c8bf9dd39e389..2c7163a775372 100644

--- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -212,6 +212,7 @@
 #define DEBUG_TYPE "amdgpu-lower-module-lds"
 
 using namespace llvm;
+using namespace AMDGPU;
 
 namespace {
 
@@ -234,17 +235,6 @@ cl::opt<LoweringKind> LoweringKindLoc(
         clEnumValN(LoweringKind::hybrid, "hybrid",
                    "Lower via mixture of above strategies")));
 
-bool isKernelLDS(const Function *F) {
-  // Some weirdness here. AMDGPU::isKernelCC does not call into
-  // AMDGPU::isKernel with the calling conv, it instead calls into
-  // isModuleEntryFunction which returns true for more calling conventions
-  // than AMDGPU::isKernel does. There's a FIXME on AMDGPU::isKernel.
-  // There's also a test that checks that the LDS lowering does not hit on
-  // a graphics shader, denoted amdgpu_ps, so stay with the limited case.
-  // Putting LDS in the name of the function to draw attention to this.
-  return AMDGPU::isKernel(F->getCallingConv());
-}
-
 template <typename T> std::vector<T> sortByName(std::vector<T> &&V) {
   llvm::sort(V.begin(), V.end(), [](const auto *L, const auto *R) {
     return L->getName() < R->getName();
@@ -305,183 +295,9 @@ class AMDGPULowerModuleLDS {
         Decl, {}, {OperandBundleDefT<Value *>("ExplicitUse", UseInstance)});
   }
 
-  static bool eliminateConstantExprUsesOfLDSFromAllInstructions(Module &M) {
-    // Constants are uniqued within LLVM. A ConstantExpr referring to a LDS
-    // global may have uses from multiple 
diff erent functions as a result.
-    // This pass specialises LDS variables with respect to the kernel that
-    // allocates them.
-
-    // This is semantically equivalent to (the unimplemented as slow):
-    // for (auto &F : M.functions())
-    //   for (auto &BB : F)
-    //     for (auto &I : BB)
-    //       for (Use &Op : I.operands())
-    //         if (constantExprUsesLDS(Op))
-    //           replaceConstantExprInFunction(I, Op);
-
-    SmallVector<Constant *> LDSGlobals;
-    for (auto &GV : M.globals())
-      if (AMDGPU::isLDSVariableToLower(GV))
-        LDSGlobals.push_back(&GV);
-
-    return convertUsersOfConstantsToInstructions(LDSGlobals);
-  }
-
 public:
   AMDGPULowerModuleLDS(const AMDGPUTargetMachine &TM_) : TM(TM_) {}
 
-  using FunctionVariableMap = DenseMap<Function *, DenseSet<GlobalVariable *>>;
-
-  using VariableFunctionMap = DenseMap<GlobalVariable *, DenseSet<Function *>>;
-
-  static void getUsesOfLDSByFunction(CallGraph const &CG, Module &M,
-                                     FunctionVariableMap &kernels,
-                                     FunctionVariableMap &functions) {
-
-    // Get uses from the current function, excluding uses by called functions
-    // Two output variables to avoid walking the globals list twice
-    for (auto &GV : M.globals()) {
-      if (!AMDGPU::isLDSVariableToLower(GV)) {
-        continue;
-      }
-
-      for (User *V : GV.users()) {
-        if (auto *I = dyn_cast<Instruction>(V)) {
-          Function *F = I->getFunction();
-          if (isKernelLDS(F)) {
-            kernels[F].insert(&GV);
-          } else {
-            functions[F].insert(&GV);
-          }
-        }
-      }
-    }
-  }
-
-  struct LDSUsesInfoTy {
-    FunctionVariableMap direct_access;
-    FunctionVariableMap indirect_access;
-  };
-
-  static LDSUsesInfoTy getTransitiveUsesOfLDS(CallGraph const &CG, Module &M) {
-
-    FunctionVariableMap direct_map_kernel;
-    FunctionVariableMap direct_map_function;
-    getUsesOfLDSByFunction(CG, M, direct_map_kernel, direct_map_function);
-
-    // Collect variables that are used by functions whose address has escaped
-    DenseSet<GlobalVariable *> VariablesReachableThroughFunctionPointer;
-    for (Function &F : M.functions()) {
-      if (!isKernelLDS(&F))
-        if (F.hasAddressTaken(nullptr,
-                              /* IgnoreCallbackUses */ false,
-                              /* IgnoreAssumeLikeCalls */ false,
-                              /* IgnoreLLVMUsed */ true,
-                              /* IgnoreArcAttachedCall */ false)) {
-          set_union(VariablesReachableThroughFunctionPointer,
-                    direct_map_function[&F]);
-        }
-    }
-
-    auto functionMakesUnknownCall = [&](const Function *F) -> bool {
-      assert(!F->isDeclaration());
-      for (const CallGraphNode::CallRecord &R : *CG[F]) {
-        if (!R.second->getFunction()) {
-          return true;
-        }
-      }
-      return false;
-    };
-
-    // Work out which variables are reachable through function calls
-    FunctionVariableMap transitive_map_function = direct_map_function;
-
-    // If the function makes any unknown call, assume the worst case that it can
-    // access all variables accessed by functions whose address escaped
-    for (Function &F : M.functions()) {
-      if (!F.isDeclaration() && functionMakesUnknownCall(&F)) {
-        if (!isKernelLDS(&F)) {
-          set_union(transitive_map_function[&F],
-                    VariablesReachableThroughFunctionPointer);
-        }
-      }
-    }
-
-    // Direct implementation of collecting all variables reachable from each
-    // function
-    for (Function &Func : M.functions()) {
-      if (Func.isDeclaration() || isKernelLDS(&Func))
-        continue;
-
-      DenseSet<Function *> seen; // catches cycles
-      SmallVector<Function *, 4> wip{&Func};
-
-      while (!wip.empty()) {
-        Function *F = wip.pop_back_val();
-
-        // Can accelerate this by referring to transitive map for functions that
-        // have already been computed, with more care than this
-        set_union(transitive_map_function[&Func], direct_map_function[F]);
-
-        for (const CallGraphNode::CallRecord &R : *CG[F]) {
-          Function *ith = R.second->getFunction();
-          if (ith) {
-            if (!seen.contains(ith)) {
-              seen.insert(ith);
-              wip.push_back(ith);
-            }
-          }
-        }
-      }
-    }
-
-    // direct_map_kernel lists which variables are used by the kernel
-    // find the variables which are used through a function call
-    FunctionVariableMap indirect_map_kernel;
-
-    for (Function &Func : M.functions()) {
-      if (Func.isDeclaration() || !isKernelLDS(&Func))
-        continue;
-
-      for (const CallGraphNode::CallRecord &R : *CG[&Func]) {
-        Function *ith = R.second->getFunction();
-        if (ith) {
-          set_union(indirect_map_kernel[&Func], transitive_map_function[ith]);
-        } else {
-          set_union(indirect_map_kernel[&Func],
-                    VariablesReachableThroughFunctionPointer);
-        }
-      }
-    }
-
-    // Verify that we fall into one of 2 cases:
-    //    - All variables are absolute: this is a re-run of the pass
-    //      so we don't have anything to do.
-    //    - No variables are absolute.
-    std::optional<bool> HasAbsoluteGVs;
-    for (auto &Map : {direct_map_kernel, indirect_map_kernel}) {
-      for (auto &[Fn, GVs] : Map) {
-        for (auto *GV : GVs) {
-          bool IsAbsolute = GV->isAbsoluteSymbolRef();
-          if (HasAbsoluteGVs.has_value()) {
-            if (*HasAbsoluteGVs != IsAbsolute) {
-              report_fatal_error(
-                  "Module cannot mix absolute and non-absolute LDS GVs");
-            }
-          } else
-            HasAbsoluteGVs = IsAbsolute;
-        }
-      }
-    }
-
-    // If we only had absolute GVs, we have nothing to do, return an empty
-    // result.
-    if (HasAbsoluteGVs && *HasAbsoluteGVs)
-      return {FunctionVariableMap(), FunctionVariableMap()};
-
-    return {std::move(direct_map_kernel), std::move(indirect_map_kernel)};
-  }
-
   struct LDSVariableReplacement {
     GlobalVariable *SGV = nullptr;
     DenseMap<GlobalVariable *, Constant *> LDSVarsToConstantGEP;

diff  --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp
index 79c359a575545..239e0ee705729 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp
@@ -9,13 +9,16 @@
 #include "AMDGPUMemoryUtils.h"
 #include "AMDGPU.h"
 #include "AMDGPUBaseInfo.h"
+#include "llvm/ADT/SetOperations.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/CallGraph.h"
 #include "llvm/Analysis/MemorySSA.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/IR/Operator.h"
 #include "llvm/IR/ReplaceConstant.h"
 
 #define DEBUG_TYPE "amdgpu-memory-utils"
@@ -26,7 +29,7 @@ namespace llvm {
 
 namespace AMDGPU {
 
-Align getAlign(DataLayout const &DL, const GlobalVariable *GV) {
+Align getAlign(const DataLayout &DL, const GlobalVariable *GV) {
   return DL.getValueOrABITypeAlignment(GV->getPointerAlignment(DL),
                                        GV->getValueType());
 }
@@ -61,6 +64,216 @@ bool isLDSVariableToLower(const GlobalVariable &GV) {
   return true;
 }
 
+bool eliminateConstantExprUsesOfLDSFromAllInstructions(Module &M) {
+  // Constants are uniqued within LLVM. A ConstantExpr referring to a LDS
+  // global may have uses from multiple 
diff erent functions as a result.
+  // This pass specialises LDS variables with respect to the kernel that
+  // allocates them.
+
+  // This is semantically equivalent to (the unimplemented as slow):
+  // for (auto &F : M.functions())
+  //   for (auto &BB : F)
+  //     for (auto &I : BB)
+  //       for (Use &Op : I.operands())
+  //         if (constantExprUsesLDS(Op))
+  //           replaceConstantExprInFunction(I, Op);
+
+  SmallVector<Constant *> LDSGlobals;
+  for (auto &GV : M.globals())
+    if (AMDGPU::isLDSVariableToLower(GV))
+      LDSGlobals.push_back(&GV);
+  return convertUsersOfConstantsToInstructions(LDSGlobals);
+}
+
+void getUsesOfLDSByFunction(const CallGraph &CG, Module &M,
+                            FunctionVariableMap &kernels,
+                            FunctionVariableMap &Functions) {
+  // Get uses from the current function, excluding uses by called Functions
+  // Two output variables to avoid walking the globals list twice
+  for (auto &GV : M.globals()) {
+    if (!AMDGPU::isLDSVariableToLower(GV))
+      continue;
+    for (User *V : GV.users()) {
+      if (auto *I = dyn_cast<Instruction>(V)) {
+        Function *F = I->getFunction();
+        if (isKernelLDS(F))
+          kernels[F].insert(&GV);
+        else
+          Functions[F].insert(&GV);
+      }
+    }
+  }
+}
+
+bool isKernelLDS(const Function *F) {
+  // Some weirdness here. AMDGPU::isKernelCC does not call into
+  // AMDGPU::isKernel with the calling conv, it instead calls into
+  // isModuleEntryFunction which returns true for more calling conventions
+  // than AMDGPU::isKernel does. There's a FIXME on AMDGPU::isKernel.
+  // There's also a test that checks that the LDS lowering does not hit on
+  // a graphics shader, denoted amdgpu_ps, so stay with the limited case.
+  // Putting LDS in the name of the function to draw attention to this.
+  return AMDGPU::isKernel(F->getCallingConv());
+}
+
+LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M) {
+
+  FunctionVariableMap DirectMapKernel;
+  FunctionVariableMap DirectMapFunction;
+  getUsesOfLDSByFunction(CG, M, DirectMapKernel, DirectMapFunction);
+
+  // Collect variables that are used by functions whose address has escaped
+  DenseSet<GlobalVariable *> VariablesReachableThroughFunctionPointer;
+  for (Function &F : M.functions()) {
+    if (!isKernelLDS(&F))
+      if (F.hasAddressTaken(nullptr,
+                            /* IgnoreCallbackUses */ false,
+                            /* IgnoreAssumeLikeCalls */ false,
+                            /* IgnoreLLVMUsed */ true,
+                            /* IgnoreArcAttachedCall */ false)) {
+        set_union(VariablesReachableThroughFunctionPointer,
+                  DirectMapFunction[&F]);
+      }
+  }
+
+  auto FunctionMakesUnknownCall = [&](const Function *F) -> bool {
+    assert(!F->isDeclaration());
+    for (const CallGraphNode::CallRecord &R : *CG[F]) {
+      if (!R.second->getFunction())
+        return true;
+    }
+    return false;
+  };
+
+  // Work out which variables are reachable through function calls
+  FunctionVariableMap TransitiveMapFunction = DirectMapFunction;
+
+  // If the function makes any unknown call, assume the worst case that it can
+  // access all variables accessed by functions whose address escaped
+  for (Function &F : M.functions()) {
+    if (!F.isDeclaration() && FunctionMakesUnknownCall(&F)) {
+      if (!isKernelLDS(&F)) {
+        set_union(TransitiveMapFunction[&F],
+                  VariablesReachableThroughFunctionPointer);
+      }
+    }
+  }
+
+  // Direct implementation of collecting all variables reachable from each
+  // function
+  for (Function &Func : M.functions()) {
+    if (Func.isDeclaration() || isKernelLDS(&Func))
+      continue;
+
+    DenseSet<Function *> seen; // catches cycles
+    SmallVector<Function *, 4> wip = {&Func};
+
+    while (!wip.empty()) {
+      Function *F = wip.pop_back_val();
+
+      // Can accelerate this by referring to transitive map for functions that
+      // have already been computed, with more care than this
+      set_union(TransitiveMapFunction[&Func], DirectMapFunction[F]);
+
+      for (const CallGraphNode::CallRecord &R : *CG[F]) {
+        Function *Ith = R.second->getFunction();
+        if (Ith) {
+          if (!seen.contains(Ith)) {
+            seen.insert(Ith);
+            wip.push_back(Ith);
+          }
+        }
+      }
+    }
+  }
+
+  // DirectMapKernel lists which variables are used by the kernel
+  // find the variables which are used through a function call
+  FunctionVariableMap IndirectMapKernel;
+
+  for (Function &Func : M.functions()) {
+    if (Func.isDeclaration() || !isKernelLDS(&Func))
+      continue;
+
+    for (const CallGraphNode::CallRecord &R : *CG[&Func]) {
+      Function *Ith = R.second->getFunction();
+      if (Ith) {
+        set_union(IndirectMapKernel[&Func], TransitiveMapFunction[Ith]);
+      } else {
+        set_union(IndirectMapKernel[&Func],
+                  VariablesReachableThroughFunctionPointer);
+      }
+    }
+  }
+
+  // Verify that we fall into one of 2 cases:
+  //    - All variables are absolute: this is a re-run of the pass
+  //      so we don't have anything to do.
+  //    - No variables are absolute.
+  std::optional<bool> HasAbsoluteGVs;
+  for (auto &Map : {DirectMapKernel, IndirectMapKernel}) {
+    for (auto &[Fn, GVs] : Map) {
+      for (auto *GV : GVs) {
+        bool IsAbsolute = GV->isAbsoluteSymbolRef();
+        if (HasAbsoluteGVs.has_value()) {
+          if (*HasAbsoluteGVs != IsAbsolute) {
+            report_fatal_error(
+                "Module cannot mix absolute and non-absolute LDS GVs");
+          }
+        } else
+          HasAbsoluteGVs = IsAbsolute;
+      }
+    }
+  }
+
+  // If we only had absolute GVs, we have nothing to do, return an empty
+  // result.
+  if (HasAbsoluteGVs && *HasAbsoluteGVs)
+    return {FunctionVariableMap(), FunctionVariableMap()};
+
+  return {std::move(DirectMapKernel), std::move(IndirectMapKernel)};
+}
+
+void removeFnAttrFromReachable(CallGraph &CG, Function *KernelRoot,
+                               StringRef FnAttr) {
+  KernelRoot->removeFnAttr(FnAttr);
+
+  SmallVector<Function *> WorkList = {CG[KernelRoot]->getFunction()};
+  SmallPtrSet<Function *, 8> Visited;
+  bool SeenUnknownCall = false;
+
+  while (!WorkList.empty()) {
+    Function *F = WorkList.pop_back_val();
+
+    for (auto &CallRecord : *CG[F]) {
+      if (!CallRecord.second)
+        continue;
+
+      Function *Callee = CallRecord.second->getFunction();
+      if (!Callee) {
+        if (!SeenUnknownCall) {
+          SeenUnknownCall = true;
+
+          // If we see any indirect calls, assume nothing about potential
+          // targets.
+          // TODO: This could be refined to possible LDS global users.
+          for (auto &ExternalCallRecord : *CG.getExternalCallingNode()) {
+            Function *PotentialCallee =
+                ExternalCallRecord.second->getFunction();
+            assert(PotentialCallee);
+            if (!isKernelLDS(PotentialCallee))
+              PotentialCallee->removeFnAttr(FnAttr);
+          }
+        }
+      } else {
+        Callee->removeFnAttr(FnAttr);
+        if (Visited.insert(Callee).second)
+          WorkList.push_back(Callee);
+      }
+    }
+  }
+}
+
 bool isReallyAClobber(const Value *Ptr, MemoryDef *Def, AAResults *AA) {
   Instruction *DefInst = Def->getMemoryInst();
 

diff  --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h
index e42b27f8e09e1..4d3ad328e1310 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h
@@ -9,6 +9,9 @@
 #ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUMEMORYUTILS_H
 #define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUMEMORYUTILS_H
 
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+
 namespace llvm {
 
 struct Align;
@@ -19,14 +22,40 @@ class LoadInst;
 class MemoryDef;
 class MemorySSA;
 class Value;
+class Function;
+class CallGraph;
+class Module;
 
 namespace AMDGPU {
 
-Align getAlign(DataLayout const &DL, const GlobalVariable *GV);
+using FunctionVariableMap = DenseMap<Function *, DenseSet<GlobalVariable *>>;
+using VariableFunctionMap = DenseMap<GlobalVariable *, DenseSet<Function *>>;
+
+Align getAlign(const DataLayout &DL, const GlobalVariable *GV);
 
 bool isDynamicLDS(const GlobalVariable &GV);
 bool isLDSVariableToLower(const GlobalVariable &GV);
 
+struct LDSUsesInfoTy {
+  FunctionVariableMap direct_access;
+  FunctionVariableMap indirect_access;
+};
+
+bool eliminateConstantExprUsesOfLDSFromAllInstructions(Module &M);
+
+void getUsesOfLDSByFunction(const CallGraph &CG, Module &M,
+                            FunctionVariableMap &kernels,
+                            FunctionVariableMap &functions);
+
+bool isKernelLDS(const Function *F);
+
+LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M);
+
+/// Strip FnAttr attribute from any functions where we may have
+/// introduced its use.
+void removeFnAttrFromReachable(CallGraph &CG, Function *KernelRoot,
+                               StringRef FnAttr);
+
 /// Given a \p Def clobbering a load from \p Ptr according to the MSSA check
 /// if this is actually a memory update or an artificial clobber to facilitate
 /// ordering constraints.