[llvm] [AMDGPU] Introduce "amdgpu-sw-lower-lds" pass to lower LDS accesses to use device global memory. (PR #87265)

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Thu May 9 08:40:45 PDT 2024


================
@@ -0,0 +1,876 @@
+//===-- AMDGPUSwLowerLDS.cpp -----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass lowers the local data store, LDS, uses in kernel and non-kernel
+// functions in module with dynamically allocated device global memory.
+//
+// Replacement of Kernel LDS accesses:
+//    For a kernel, LDS access can be static or dynamic which are direct
+//    (accessed within kernel) and indirect (accessed through non-kernels).
+//    A device global memory equal to size of all these LDS globals will be
+//    allocated. At the prologue of the kernel, a single work-item from the
+//    work-group, does a "malloc" and stores the pointer of the allocation in
+//    new LDS global that will be created for the kernel. This will be called
+//    "SW LDS" in this pass.
+//    Each LDS access corresponds to an offset in the allocated memory.
+//    All static LDS accesses will be allocated first and then dynamic LDS
+//    will occupy the device global memory.
+//    To store the offsets corresponding to all LDS accesses, another global
+//    variable is created which will be called "SW LDS metadata" in this pass.
+//    - SW LDS Global:
+//        It is LDS global of ptr type with name
+//        "llvm.amdgcn.sw.lds.<kernel-name>".
+//    - Metadata Global:
+//        It is of struct type, with n members. n equals the number of LDS
+//        globals accessed by the kernel(direct and indirect). Each member of
+//        struct is another struct of type {i32, i32, i32}. First member
+//        corresponds to offset, second member corresponds to size of LDS global
+//        being replaced and third represents the total aligned size. It will
+//        have name "llvm.amdgcn.sw.lds.<kernel-name>.md". This global will have
+//        an intializer with static LDS related offsets and sizes initialized.
+//        But for dynamic LDS related entries, offsets will be intialized to
+//        previous static LDS allocation end offset. Sizes for them will be zero
+//        initially. These dynamic LDS offset and size values will be updated
+//        with in the kernel, since kernel can read the dynamic LDS size
+//        allocation done at runtime with query to "hidden_dynamic_lds_size"
+//        hidden kernel argument.
+//
+//    LDS accesses within the kernel will be replaced by "gep" ptr to
+//    corresponding offset into allocated device global memory for the kernel.
+//    At the epilogue of kernel, allocated memory would be made free by the same
+//    single work-item.
+//
+// Replacement of non-kernel LDS accesses:
+//    Multiple kernels can access the same non-kernel function.
+//    All the kernels accessing LDS through non-kernels are sorted and
+//    assigned a kernel-id. All the LDS globals accessed by non-kernels
+//    are sorted. This information is used to build two tables:
+//    - Base table:
+//        Base table will have single row, with elements of the row
+//        placed as per kernel ID. Each element in the row corresponds
+//        to addresss of "SW LDS" variable created for
+//        that kernel.
+//    - Offset table:
+//        Offset table will have multiple rows and columns.
+//        Rows are assumed to be from 0 to (n-1). n is total number
+//        of kernels accessing the LDS through non-kernels.
+//        Each row will have m elements. m is the total number of
+//        unique LDS globals accessed by all non-kernels.
+//        Each element in the row correspond to the address of
+//        the replacement of LDS global done by that particular kernel.
+//    A LDS variable in non-kernel will be replaced based on the information
+//    from base and offset tables. Based on kernel-id query, address of "SW
+//    LDS" for that corresponding kernel is obtained from base table.
+//    The Offset into the base "SW LDS" is obtained from
+//    corresponding element in offset table. With this information, replacement
+//    value is obtained.
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "Utils/AMDGPUMemoryUtils.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/SetOperations.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/ReplaceConstant.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+
+#include <algorithm>
+
+#define DEBUG_TYPE "amdgpu-sw-lower-lds"
+
+using namespace llvm;
+using namespace AMDGPU;
+
+namespace {
+
+using DomTreeCallback = function_ref<DominatorTree *(Function &F)>;
+
+struct LDSAccessTypeInfo {
+  SetVector<GlobalVariable *> StaticLDSGlobals;
+  SetVector<GlobalVariable *> DynamicLDSGlobals;
+};
+
+// Struct to hold all the Metadata required for a kernel
+// to replace a LDS global uses with corresponding offset
+// in to device global memory.
+struct KernelLDSParameters {
+  GlobalVariable *SwLDS = nullptr;
+  GlobalVariable *SwLDSMetadata = nullptr;
+  LDSAccessTypeInfo DirectAccess;
+  LDSAccessTypeInfo IndirectAccess;
+  DenseMap<GlobalVariable *, SmallVector<uint32_t, 3>>
+      LDSToReplacementIndicesMap;
+};
+
+// Struct to store infor for creation of offset table
+// for all the non-kernel LDS accesses.
+struct NonKernelLDSParameters {
+  GlobalVariable *LDSBaseTable{nullptr};
+  GlobalVariable *LDSOffsetTable{nullptr};
+  SetVector<Function *> OrderedKernels;
+  SetVector<GlobalVariable *> OrdereLDSGlobals;
+};
+
+class AMDGPUSwLowerLDS {
+public:
+  AMDGPUSwLowerLDS(Module &Mod, DomTreeCallback Callback)
+      : M(Mod), IRB(M.getContext()), DTCallback(Callback) {}
+  bool run();
+  void getUsesOfLDSByNonKernels(const CallGraph &CG,
+                                FunctionVariableMap &Functions);
+  SetVector<Function *>
+  getOrderedIndirectLDSAccessingKernels(SetVector<Function *> &&Kernels);
+  SetVector<GlobalVariable *>
+  getOrderedNonKernelAllLDSGlobals(SetVector<GlobalVariable *> &&Variables);
+  void populateSwLDSGlobal(Function *Func);
+  void populateSwMetadataGlobal(Function *Func);
+  void populateLDSToReplacementIndicesMap(Function *Func);
+  void replaceKernelLDSAccesses(Function *Func);
+  void lowerKernelLDSAccesses(Function *Func, DomTreeUpdater &DTU);
+  void buildNonKernelLDSOffsetTable(NonKernelLDSParameters &NKLDSParams);
+  void buildNonKernelLDSBaseTable(NonKernelLDSParameters &NKLDSParams);
+  Constant *
+  getAddressesOfVariablesInKernel(Function *Func,
+                                  SetVector<GlobalVariable *> &Variables);
+  void lowerNonKernelLDSAccesses(Function *Func,
+                                 SetVector<GlobalVariable *> &LDSGlobals,
+                                 NonKernelLDSParameters &NKLDSParams);
+
+private:
+  Module &M;
+  IRBuilder<> IRB;
+  DomTreeCallback DTCallback;
+  DenseMap<Function *, KernelLDSParameters> KernelToLDSParametersMap;
+};
+
+template <typename T> SetVector<T> sortByName(std::vector<T> &&V) {
+  // Sort the vector of globals or Functions based on their name.
+  // Returns a SetVector of globals/Functions.
+  sort(V, [](const auto *L, const auto *R) {
+    return L->getName() < R->getName();
+  });
+  return {SetVector<T>(V.begin(), V.end())};
+}
+
+SetVector<GlobalVariable *> AMDGPUSwLowerLDS::getOrderedNonKernelAllLDSGlobals(
+    SetVector<GlobalVariable *> &&Variables) {
+  // Sort all the non-kernel LDS accesses based on their name.
+  return sortByName(
+      std::vector<GlobalVariable *>(Variables.begin(), Variables.end()));
+}
+
+SetVector<Function *> AMDGPUSwLowerLDS::getOrderedIndirectLDSAccessingKernels(
+    SetVector<Function *> &&Kernels) {
+  // Sort the non-kernels accessing LDS based on their name.
+  // Also assign a kernel ID metadata based on the sorted order.
+  LLVMContext &Ctx = M.getContext();
+  if (Kernels.size() > UINT32_MAX) {
+    // 32 bit keeps it in one SGPR. > 2**32 kernels won't fit on the GPU
+    report_fatal_error("Unimplemented SW LDS lowering for > 2**32 kernels");
+  }
+  SetVector<Function *> OrderedKernels =
+      sortByName(std::vector<Function *>(Kernels.begin(), Kernels.end()));
+  for (size_t i = 0; i < Kernels.size(); i++) {
+    Metadata *AttrMDArgs[1] = {
+        ConstantAsMetadata::get(IRB.getInt32(i)),
+    };
+    Function *Func = OrderedKernels[i];
+    Func->setMetadata("llvm.amdgcn.lds.kernel.id",
+                      MDNode::get(Ctx, AttrMDArgs));
+    auto &LDSParams = KernelToLDSParametersMap[Func];
+  }
+  return std::move(OrderedKernels);
+}
+
+void AMDGPUSwLowerLDS::getUsesOfLDSByNonKernels(
+    const CallGraph &CG, FunctionVariableMap &functions) {
+  // Get uses from the current function, excluding uses by called functions
+  // Two output variables to avoid walking the globals list twice
+  for (auto &GV : M.globals()) {
+    if (!AMDGPU::isLDSVariableToLower(GV))
+      continue;
+
+    if (GV.isAbsoluteSymbolRef()) {
+      report_fatal_error(
+          "LDS variables with absolute addresses are unimplemented.");
+    }
+
+    for (User *V : GV.users()) {
+      if (auto *I = dyn_cast<Instruction>(V)) {
+        Function *F = I->getFunction();
+        if (!isKernelLDS(F))
+          functions[F].insert(&GV);
+      }
+    }
+  }
+}
+
+void AMDGPUSwLowerLDS::populateSwLDSGlobal(Function *Func) {
+  // Create new LDS global required for each kernel to store
+  // device global memory pointer.
+  auto &LDSParams = KernelToLDSParametersMap[Func];
+  // create new global pointer variable
+  LDSParams.SwLDS = new GlobalVariable(
+      M, IRB.getPtrTy(), false, GlobalValue::InternalLinkage,
+      PoisonValue::get(IRB.getPtrTy()), "llvm.amdgcn.sw.lds." + Func->getName(),
+      nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS, false);
+  return;
+}
+
+void AMDGPUSwLowerLDS::populateSwMetadataGlobal(Function *Func) {
+  // Create new metadata global for every kernel and initialize the
+  // start offsets and sizes corresponding to each LDS accesses.
+  auto &LDSParams = KernelToLDSParametersMap[Func];
+  auto &Ctx = M.getContext();
+  auto &DL = M.getDataLayout();
+  std::vector<Type *> Items;
+  Type *Int32Ty = IRB.getInt32Ty();
+  std::vector<Constant *> Initializers;
+  Align MaxAlignment(1);
+  auto UpdateMaxAlignment = [&MaxAlignment, &DL](GlobalVariable *GV) {
+    Align GVAlign = AMDGPU::getAlign(DL, GV);
+    MaxAlignment = std::max(MaxAlignment, GVAlign);
+  };
+
+  for (GlobalVariable *GV : LDSParams.DirectAccess.StaticLDSGlobals)
+    UpdateMaxAlignment(GV);
+
+  for (GlobalVariable *GV : LDSParams.DirectAccess.DynamicLDSGlobals)
+    UpdateMaxAlignment(GV);
+
+  for (GlobalVariable *GV : LDSParams.IndirectAccess.StaticLDSGlobals)
+    UpdateMaxAlignment(GV);
+
+  for (GlobalVariable *GV : LDSParams.IndirectAccess.DynamicLDSGlobals)
+    UpdateMaxAlignment(GV);
+
+  //{StartOffset, AlignedSizeInBytes}
+  SmallString<128> MDItemStr;
+  raw_svector_ostream MDItemOS(MDItemStr);
+  MDItemOS << "llvm.amdgcn.sw.lds." << Func->getName().str() << ".md.item";
+
+  StructType *LDSItemTy =
+      StructType::create(Ctx, {Int32Ty, Int32Ty, Int32Ty}, MDItemOS.str());
+  uint32_t MallocSize = 0;
+  auto buildInitializerForSwLDSMD =
+      [&](SetVector<GlobalVariable *> &LDSGlobals) {
+        for (auto &GV : LDSGlobals) {
+          Type *Ty = GV->getValueType();
+          const uint64_t SizeInBytes = DL.getTypeAllocSize(Ty);
+          Items.push_back(LDSItemTy);
+          Constant *ItemStartOffset = ConstantInt::get(Int32Ty, MallocSize);
+          Constant *SizeInBytesConst = ConstantInt::get(Int32Ty, SizeInBytes);
+          uint64_t AlignedSize = alignTo(SizeInBytes, MaxAlignment);
+          Constant *AlignedSizeInBytesConst =
+              ConstantInt::get(Int32Ty, AlignedSize);
+          MallocSize += AlignedSize;
+          Constant *InitItem =
+              ConstantStruct::get(LDSItemTy, {ItemStartOffset, SizeInBytesConst,
+                                              AlignedSizeInBytesConst});
+          Initializers.push_back(InitItem);
+        }
+      };
+
+  buildInitializerForSwLDSMD(LDSParams.DirectAccess.StaticLDSGlobals);
+  buildInitializerForSwLDSMD(LDSParams.IndirectAccess.StaticLDSGlobals);
+  buildInitializerForSwLDSMD(LDSParams.DirectAccess.DynamicLDSGlobals);
+  buildInitializerForSwLDSMD(LDSParams.IndirectAccess.DynamicLDSGlobals);
+
+  SmallString<128> MDTypeStr;
+  raw_svector_ostream MDTypeOS(MDTypeStr);
+  MDTypeOS << "llvm.amdgcn.sw.lds." << Func->getName().str() << ".md.type";
+
+  StructType *MetadataStructType =
+      StructType::create(Ctx, Items, MDTypeOS.str());
+  SmallString<128> MDStr;
+  raw_svector_ostream MDOS(MDStr);
+  MDOS << "llvm.amdgcn.sw.lds." << Func->getName().str() << ".md";
+  LDSParams.SwLDSMetadata = new GlobalVariable(
+      M, MetadataStructType, false, GlobalValue::InternalLinkage,
+      PoisonValue::get(MetadataStructType), MDOS.str(), nullptr,
+      GlobalValue::NotThreadLocal, AMDGPUAS::GLOBAL_ADDRESS, false);
+  Constant *data = ConstantStruct::get(MetadataStructType, Initializers);
+  LDSParams.SwLDSMetadata->setInitializer(data);
+  assert(LDSParams.SwLDS);
+  // Set the alignment to MaxAlignment for SwLDS.
+  LDSParams.SwLDS->setAlignment(MaxAlignment);
+  GlobalValue::SanitizerMetadata MD;
+  MD.NoAddress = true;
+  LDSParams.SwLDSMetadata->setSanitizerMetadata(MD);
+  return;
+}
+
+void AMDGPUSwLowerLDS::populateLDSToReplacementIndicesMap(Function *Func) {
+  // Fill the corresponding LDS replacement indices for each LDS access
+  // related to this kernel.
+  auto &LDSParams = KernelToLDSParametersMap[Func];
+  auto PopulateIndices = [&](SetVector<GlobalVariable *> &LDSGlobals,
+                             uint32_t &Idx) {
+    for (auto &GV : LDSGlobals) {
+      LDSParams.LDSToReplacementIndicesMap[GV] = {0, Idx, 0};
+      ++Idx;
+    }
+  };
+  uint32_t Idx = 0;
+  PopulateIndices(LDSParams.DirectAccess.StaticLDSGlobals, Idx);
+  PopulateIndices(LDSParams.IndirectAccess.StaticLDSGlobals, Idx);
+  PopulateIndices(LDSParams.DirectAccess.DynamicLDSGlobals, Idx);
+  PopulateIndices(LDSParams.IndirectAccess.DynamicLDSGlobals, Idx);
+  return;
+}
+
+static void replacesUsesOfGlobalInFunction(Function *Func, GlobalVariable *GV,
+                                           Value *Replacement) {
+  // Replace all uses of LDS global in this Function with a Replacement.
+  auto ReplaceUsesLambda = [Func](const Use &U) -> bool {
+    auto *V = U.getUser();
+    if (auto *Inst = dyn_cast<Instruction>(V)) {
+      auto *Func1 = Inst->getParent()->getParent();
+      if (Func == Func1)
+        return true;
+    }
+    return false;
+  };
+  GV->replaceUsesWithIf(Replacement, ReplaceUsesLambda);
+}
+
+void AMDGPUSwLowerLDS::replaceKernelLDSAccesses(Function *Func) {
+  auto &LDSParams = KernelToLDSParametersMap[Func];
+  GlobalVariable *SwLDS = LDSParams.SwLDS;
+  assert(SwLDS);
+  GlobalVariable *SwLDSMetadata = LDSParams.SwLDSMetadata;
+  assert(SwLDSMetadata);
+  StructType *SwLDSMetadataStructType =
+      cast<StructType>(SwLDSMetadata->getValueType());
+  Type *Int32Ty = IRB.getInt32Ty();
+
+  auto &IndirectAccess = LDSParams.IndirectAccess;
+  auto &DirectAccess = LDSParams.DirectAccess;
+  // Replace all uses of LDS global in this Function with a Replacement.
+  auto ReplaceLDSGlobalUses = [&](SetVector<GlobalVariable *> &LDSGlobals) {
+    for (auto &GV : LDSGlobals) {
+      // Do not generate instructions if LDS access is in non-kernel
+      // i.e indirect-access.
+      if ((IndirectAccess.StaticLDSGlobals.contains(GV) ||
+           IndirectAccess.DynamicLDSGlobals.contains(GV)) &&
+          (!DirectAccess.StaticLDSGlobals.contains(GV) &&
+           !DirectAccess.DynamicLDSGlobals.contains(GV)))
+        continue;
+      auto &Indices = LDSParams.LDSToReplacementIndicesMap[GV];
+      assert(Indices.size() == 3);
+      uint32_t Idx0 = Indices[0];
+      uint32_t Idx1 = Indices[1];
+      uint32_t Idx2 = Indices[2];
+      Constant *GEPIdx[] = {ConstantInt::get(Int32Ty, Idx0),
+                            ConstantInt::get(Int32Ty, Idx1),
+                            ConstantInt::get(Int32Ty, Idx2)};
+      Constant *GEP = ConstantExpr::getGetElementPtr(
+          SwLDSMetadataStructType, SwLDSMetadata, GEPIdx, true);
+      Value *Load = IRB.CreateLoad(Int32Ty, GEP);
+      Value *BasePlusOffset =
+          IRB.CreateInBoundsGEP(IRB.getInt8Ty(), SwLDS, {Load});
+      LLVM_DEBUG(dbgs() << "Sw LDS Lowering, Replacing LDS "
+                        << GV->getName().str());
----------------
arsenm wrote:

```suggestion
                        << GV->getName());
```

https://github.com/llvm/llvm-project/pull/87265


More information about the llvm-commits mailing list