[llvm] [AMDGPU] Introduce "amdgpu-sw-lower-lds" pass to lower LDS accesses to use device global memory. (PR #87265)

Pierre van Houtryve via llvm-commits llvm-commits at lists.llvm.org
Thu Apr 18 03:59:01 PDT 2024


================
@@ -0,0 +1,877 @@
+//===-- AMDGPUSwLowerLDS.cpp -----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass lowers the local data store, LDS, uses in kernel and non-kernel
+// functions in module with dynamically allocated device global memory.
+//
+// Replacement of Kernel LDS accesses:
+//    For a kernel, LDS access can be static or dynamic which are direct
+//    (accessed within kernel) and indirect (accessed through non-kernels).
+//    A device global memory equal to size of all these LDS globals will be
+//    allocated. At the prologue of the kernel, a single work-item from the
+//    work-group, does a "malloc" and stores the pointer of the allocation in
+//    new LDS global that will be created for the kernel. This will be called
+//    "SW LDS" in this pass.
+//    Each LDS access corresponds to an offset in the allocated memory.
+//    All static LDS accesses will be allocated first and then dynamic LDS
+//    will occupy the device global memory.
+//    To store the offsets corresponding to all LDS accesses, another global
+//    variable is created which will be called "SW LDS metadata" in this pass.
+//    - SW LDS Global:
+//        It is LDS global of ptr type with name
+//        "llvm.amdgcn.sw.lds.<kernel-name>".
+//    - Metadata Global:
+//        It is of struct type, with n members. n equals the number of LDS
+//        globals accessed by the kernel(direct and indirect). Each member of
+//        struct is another struct of type {i32, i32}. First member corresponds
+//        to offset, second member corresponds to aligned size of LDS global
+//        being replaced. It will have name
+//        "llvm.amdgcn.sw.lds.<kernel-name>.md". This global will have an
+//        intializer with static LDS related offsets and sizes initialized. But
+//        for dynamic LDS related entries, offsets will be intialized to
+//        previous static LDS allocation end offset. Sizes for them will be zero
+//        initially. These dynamic LDS offset and size values will be updated
+//        with in the kernel, since kernel can read the dynamic LDS size
+//        allocation done at runtime with query to "hidden_dynamic_lds_size"
+//        hidden kernel argument.
+//
+//    LDS accesses within the kernel will be replaced by "gep" ptr to
+//    corresponding offset into allocated device global memory for the kernel.
+//    At the epilogue of kernel, allocated memory would be made free by the same
+//    single work-item.
+//
+// Replacement of non-kernel LDS accesses:
+//    Multiple kernels can access the same non-kernel function.
+//    All the kernels accessing LDS through non-kernels are sorted and
+//    assigned a kernel-id. All the LDS globals accessed by non-kernels
+//    are sorted. This information is used to build two tables:
+//    - Base table:
+//        Base table will have single row, with elements of the row
+//        placed as per kernel ID. Each element in the row corresponds
+//        to addresss of "SW LDS" variable created for
+//        that kernel.
+//    - Offset table:
+//        Offset table will have multiple rows and columns.
+//        Rows are assumed to be from 0 to (n-1). n is total number
+//        of kernels accessing the LDS through non-kernels.
+//        Each row will have m elements. m is the total number of
+//        unique LDS globals accessed by all non-kernels.
+//        Each element in the row correspond to the address of
+//        the replacement of LDS global done by that particular kernel.
+//    A LDS variable in non-kernel will be replaced based on the information
+//    from base and offset tables. Based on kernel-id query, address of "SW
+//    LDS" for that corresponding kernel is obtained from base table.
+//    The Offset into the base "SW LDS" is obtained from
+//    corresponding element in offset table. With this information, replacement
+//    value is obtained.
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "Utils/AMDGPUMemoryUtils.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/SetOperations.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/ReplaceConstant.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+
+#include <algorithm>
+
+#define DEBUG_TYPE "amdgpu-sw-lower-lds"
+
+using namespace llvm;
+using namespace AMDGPU;
+
+namespace {
+
+using DomTreeCallback = function_ref<DominatorTree *(Function &F)>;
+
+struct LDSAccessTypeInfo {
+  SetVector<GlobalVariable *> StaticLDSGlobals;
+  SetVector<GlobalVariable *> DynamicLDSGlobals;
+};
+
+// Struct to hold all the Metadata required for a kernel
+// to replace a LDS global uses with corresponding offset
+// in to device global memory.
+struct KernelLDSParameters {
+  GlobalVariable *SwLDS{nullptr};
+  GlobalVariable *SwLDSMetadata{nullptr};
+  LDSAccessTypeInfo DirectAccess;
+  LDSAccessTypeInfo IndirectAccess;
+  DenseMap<GlobalVariable *, SmallVector<uint32_t, 3>>
+      LDSToReplacementIndicesMap;
+  int32_t KernelId{-1};
+  uint32_t MallocSize{0};
+};
+
+// Struct to store infor for creation of offset table
+// for all the non-kernel LDS accesses.
+struct NonKernelLDSParameters {
+  GlobalVariable *LDSBaseTable{nullptr};
+  GlobalVariable *LDSOffsetTable{nullptr};
+  SetVector<Function *> OrderedKernels;
+  SetVector<GlobalVariable *> OrdereLDSGlobals;
+};
+
+class AMDGPUSwLowerLDS {
+public:
+  AMDGPUSwLowerLDS(Module &mod, DomTreeCallback Callback)
----------------
Pierre-vh wrote:

```suggestion
  AMDGPUSwLowerLDS(Module &Mod, DomTreeCallback Callback)
```
CamelCase

https://github.com/llvm/llvm-project/pull/87265


More information about the llvm-commits mailing list