[llvm] [AMDGPU] Introduce "amdgpu-sw-lower-lds" pass to lower LDS accesses to use device global memory. (PR #87265)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Apr 19 04:57:19 PDT 2024
https://github.com/skc7 updated https://github.com/llvm/llvm-project/pull/87265
From 3f53c1096c16ba0393d983c154af337ec0b23576 Mon Sep 17 00:00:00 2001
From: skc7 <Krishna.Sankisa at amd.com>
Date: Thu, 7 Mar 2024 12:40:41 +0530
Subject: [PATCH 01/11] [AMDGPU] Enable amdgpu-sw-lower-lds pass to lower LDS
accesses to use device global memory
---
llvm/lib/Target/AMDGPU/AMDGPU.h | 9 +
.../AMDGPU/AMDGPULowerModuleLDSPass.cpp | 186 +---
llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def | 1 +
llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp | 865 ++++++++++++++++++
.../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 6 +
llvm/lib/Target/AMDGPU/CMakeLists.txt | 1 +
.../Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp | 176 ++++
.../Target/AMDGPU/Utils/AMDGPUMemoryUtils.h | 24 +
...pu-sw-lower-lds-dynamic-indirect-access.ll | 99 ++
.../amdgpu-sw-lower-lds-dynamic-lds-test.ll | 57 ++
...ds-multi-static-dynamic-indirect-access.ll | 192 ++++
...gpu-sw-lower-lds-multiple-blocks-return.ll | 79 ++
...ower-lds-static-dynamic-indirect-access.ll | 101 ++
...pu-sw-lower-lds-static-dynamic-lds-test.ll | 88 ++
...s-static-indirect-access-function-param.ll | 61 ++
...lower-lds-static-indirect-access-nested.ll | 212 +++++
...gpu-sw-lower-lds-static-indirect-access.ll | 84 ++
.../amdgpu-sw-lower-lds-static-lds-test.ll | 58 ++
18 files changed, 2114 insertions(+), 185 deletions(-)
create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp
create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-dynamic-indirect-access.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-dynamic-lds-test.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-multi-static-dynamic-indirect-access.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-multiple-blocks-return.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-dynamic-indirect-access.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-dynamic-lds-test.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-function-param.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-nested.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-lds-test.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 6016bd5187d887..15ff74f7c53af3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -263,6 +263,15 @@ struct AMDGPUAlwaysInlinePass : PassInfoMixin<AMDGPUAlwaysInlinePass> {
bool GlobalOpt;
};
+void initializeAMDGPUSwLowerLDSLegacyPass(PassRegistry &);
+extern char &AMDGPUSwLowerLDSLegacyPassID;
+ModulePass *createAMDGPUSwLowerLDSLegacyPass();
+
+struct AMDGPUSwLowerLDSPass : PassInfoMixin<AMDGPUSwLowerLDSPass> {
+  AMDGPUSwLowerLDSPass() = default;
+ PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+};
+
class AMDGPUCodeGenPreparePass
: public PassInfoMixin<AMDGPUCodeGenPreparePass> {
private:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
index c8bf9dd39e389c..2c7163a7753725 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -212,6 +212,7 @@
#define DEBUG_TYPE "amdgpu-lower-module-lds"
using namespace llvm;
+using namespace AMDGPU;
namespace {
@@ -234,17 +235,6 @@ cl::opt<LoweringKind> LoweringKindLoc(
clEnumValN(LoweringKind::hybrid, "hybrid",
"Lower via mixture of above strategies")));
-bool isKernelLDS(const Function *F) {
- // Some weirdness here. AMDGPU::isKernelCC does not call into
- // AMDGPU::isKernel with the calling conv, it instead calls into
- // isModuleEntryFunction which returns true for more calling conventions
- // than AMDGPU::isKernel does. There's a FIXME on AMDGPU::isKernel.
- // There's also a test that checks that the LDS lowering does not hit on
- // a graphics shader, denoted amdgpu_ps, so stay with the limited case.
- // Putting LDS in the name of the function to draw attention to this.
- return AMDGPU::isKernel(F->getCallingConv());
-}
-
template <typename T> std::vector<T> sortByName(std::vector<T> &&V) {
llvm::sort(V.begin(), V.end(), [](const auto *L, const auto *R) {
return L->getName() < R->getName();
@@ -305,183 +295,9 @@ class AMDGPULowerModuleLDS {
Decl, {}, {OperandBundleDefT<Value *>("ExplicitUse", UseInstance)});
}
- static bool eliminateConstantExprUsesOfLDSFromAllInstructions(Module &M) {
- // Constants are uniqued within LLVM. A ConstantExpr referring to a LDS
- // global may have uses from multiple different functions as a result.
- // This pass specialises LDS variables with respect to the kernel that
- // allocates them.
-
- // This is semantically equivalent to (the unimplemented as slow):
- // for (auto &F : M.functions())
- // for (auto &BB : F)
- // for (auto &I : BB)
- // for (Use &Op : I.operands())
- // if (constantExprUsesLDS(Op))
- // replaceConstantExprInFunction(I, Op);
-
- SmallVector<Constant *> LDSGlobals;
- for (auto &GV : M.globals())
- if (AMDGPU::isLDSVariableToLower(GV))
- LDSGlobals.push_back(&GV);
-
- return convertUsersOfConstantsToInstructions(LDSGlobals);
- }
-
public:
AMDGPULowerModuleLDS(const AMDGPUTargetMachine &TM_) : TM(TM_) {}
- using FunctionVariableMap = DenseMap<Function *, DenseSet<GlobalVariable *>>;
-
- using VariableFunctionMap = DenseMap<GlobalVariable *, DenseSet<Function *>>;
-
- static void getUsesOfLDSByFunction(CallGraph const &CG, Module &M,
- FunctionVariableMap &kernels,
- FunctionVariableMap &functions) {
-
- // Get uses from the current function, excluding uses by called functions
- // Two output variables to avoid walking the globals list twice
- for (auto &GV : M.globals()) {
- if (!AMDGPU::isLDSVariableToLower(GV)) {
- continue;
- }
-
- for (User *V : GV.users()) {
- if (auto *I = dyn_cast<Instruction>(V)) {
- Function *F = I->getFunction();
- if (isKernelLDS(F)) {
- kernels[F].insert(&GV);
- } else {
- functions[F].insert(&GV);
- }
- }
- }
- }
- }
-
- struct LDSUsesInfoTy {
- FunctionVariableMap direct_access;
- FunctionVariableMap indirect_access;
- };
-
- static LDSUsesInfoTy getTransitiveUsesOfLDS(CallGraph const &CG, Module &M) {
-
- FunctionVariableMap direct_map_kernel;
- FunctionVariableMap direct_map_function;
- getUsesOfLDSByFunction(CG, M, direct_map_kernel, direct_map_function);
-
- // Collect variables that are used by functions whose address has escaped
- DenseSet<GlobalVariable *> VariablesReachableThroughFunctionPointer;
- for (Function &F : M.functions()) {
- if (!isKernelLDS(&F))
- if (F.hasAddressTaken(nullptr,
- /* IgnoreCallbackUses */ false,
- /* IgnoreAssumeLikeCalls */ false,
- /* IgnoreLLVMUsed */ true,
- /* IgnoreArcAttachedCall */ false)) {
- set_union(VariablesReachableThroughFunctionPointer,
- direct_map_function[&F]);
- }
- }
-
- auto functionMakesUnknownCall = [&](const Function *F) -> bool {
- assert(!F->isDeclaration());
- for (const CallGraphNode::CallRecord &R : *CG[F]) {
- if (!R.second->getFunction()) {
- return true;
- }
- }
- return false;
- };
-
- // Work out which variables are reachable through function calls
- FunctionVariableMap transitive_map_function = direct_map_function;
-
- // If the function makes any unknown call, assume the worst case that it can
- // access all variables accessed by functions whose address escaped
- for (Function &F : M.functions()) {
- if (!F.isDeclaration() && functionMakesUnknownCall(&F)) {
- if (!isKernelLDS(&F)) {
- set_union(transitive_map_function[&F],
- VariablesReachableThroughFunctionPointer);
- }
- }
- }
-
- // Direct implementation of collecting all variables reachable from each
- // function
- for (Function &Func : M.functions()) {
- if (Func.isDeclaration() || isKernelLDS(&Func))
- continue;
-
- DenseSet<Function *> seen; // catches cycles
- SmallVector<Function *, 4> wip{&Func};
-
- while (!wip.empty()) {
- Function *F = wip.pop_back_val();
-
- // Can accelerate this by referring to transitive map for functions that
- // have already been computed, with more care than this
- set_union(transitive_map_function[&Func], direct_map_function[F]);
-
- for (const CallGraphNode::CallRecord &R : *CG[F]) {
- Function *ith = R.second->getFunction();
- if (ith) {
- if (!seen.contains(ith)) {
- seen.insert(ith);
- wip.push_back(ith);
- }
- }
- }
- }
- }
-
- // direct_map_kernel lists which variables are used by the kernel
- // find the variables which are used through a function call
- FunctionVariableMap indirect_map_kernel;
-
- for (Function &Func : M.functions()) {
- if (Func.isDeclaration() || !isKernelLDS(&Func))
- continue;
-
- for (const CallGraphNode::CallRecord &R : *CG[&Func]) {
- Function *ith = R.second->getFunction();
- if (ith) {
- set_union(indirect_map_kernel[&Func], transitive_map_function[ith]);
- } else {
- set_union(indirect_map_kernel[&Func],
- VariablesReachableThroughFunctionPointer);
- }
- }
- }
-
- // Verify that we fall into one of 2 cases:
- // - All variables are absolute: this is a re-run of the pass
- // so we don't have anything to do.
- // - No variables are absolute.
- std::optional<bool> HasAbsoluteGVs;
- for (auto &Map : {direct_map_kernel, indirect_map_kernel}) {
- for (auto &[Fn, GVs] : Map) {
- for (auto *GV : GVs) {
- bool IsAbsolute = GV->isAbsoluteSymbolRef();
- if (HasAbsoluteGVs.has_value()) {
- if (*HasAbsoluteGVs != IsAbsolute) {
- report_fatal_error(
- "Module cannot mix absolute and non-absolute LDS GVs");
- }
- } else
- HasAbsoluteGVs = IsAbsolute;
- }
- }
- }
-
- // If we only had absolute GVs, we have nothing to do, return an empty
- // result.
- if (HasAbsoluteGVs && *HasAbsoluteGVs)
- return {FunctionVariableMap(), FunctionVariableMap()};
-
- return {std::move(direct_map_kernel), std::move(indirect_map_kernel)};
- }
-
struct LDSVariableReplacement {
GlobalVariable *SGV = nullptr;
DenseMap<GlobalVariable *, Constant *> LDSVarsToConstantGEP;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index 90f36fadf35903..eda4949d0296d5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -22,6 +22,7 @@ MODULE_PASS("amdgpu-lower-buffer-fat-pointers",
AMDGPULowerBufferFatPointersPass(*this))
MODULE_PASS("amdgpu-lower-ctor-dtor", AMDGPUCtorDtorLoweringPass())
MODULE_PASS("amdgpu-lower-module-lds", AMDGPULowerModuleLDSPass(*this))
+MODULE_PASS("amdgpu-sw-lower-lds", AMDGPUSwLowerLDSPass())
MODULE_PASS("amdgpu-printf-runtime-binding", AMDGPUPrintfRuntimeBindingPass())
MODULE_PASS("amdgpu-unify-metadata", AMDGPUUnifyMetadataPass())
#undef MODULE_PASS
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp
new file mode 100644
index 00000000000000..ed3670fa1386d6
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp
@@ -0,0 +1,865 @@
+//===-- AMDGPUSwLowerLDS.cpp -----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass lowers local data store (LDS) uses in kernel and non-kernel
+// functions in the module to use dynamically allocated device global memory.
+//
+// Replacement of kernel LDS accesses:
+// For a kernel, LDS accesses can be static or dynamic, and either direct
+// (accessed within the kernel) or indirect (accessed through non-kernels).
+// Device global memory equal to the combined size of all these LDS globals
+// is allocated. In the kernel prologue, a single work-item of the
+// work-group does a "malloc" and stores the pointer of the allocation in a
+// new LDS global created for the kernel. This global is called the
+// "malloc LDS global" in this pass.
+// Each LDS access corresponds to an offset into the allocated memory.
+// All static LDS globals are allocated first, followed by the dynamic LDS
+// globals, in the device global memory.
+// To store the offsets corresponding to all LDS accesses, another global
+// variable is created, called the "metadata global" in this pass.
+// - Malloc LDS global:
+//   An LDS global of ptr type named
+//   "llvm.amdgcn.sw.lds.<kernel-name>".
+// - Metadata global:
+//   A global of struct type with n members, where n is the number of LDS
+//   globals accessed by the kernel (direct and indirect). Each member is
+//   itself a struct of type {i32, i32}: the first member is the offset and
+//   the second is the size of the LDS global being replaced. It is named
+//   "llvm.amdgcn.sw.lds.<kernel-name>.md".
+//   Its initializer holds the offsets and sizes of the static LDS globals.
+//   For dynamic LDS entries, the offset is initialized to the end offset
+//   of the static LDS allocation and the size is initially zero. These
+//   dynamic LDS offsets and sizes are updated within the kernel, since the
+//   kernel can read the dynamic LDS size allocated at runtime by querying
+//   the "hidden_dynamic_lds_size" hidden kernel argument.
+//
+// Each LDS access within the kernel is replaced by a "gep" ptr to the
+// corresponding offset into the device global memory allocated for the
+// kernel. In the kernel epilogue, the allocated memory is freed by the
+// same single work-item.
+//
+// Replacement of non-kernel LDS accesses:
+// Multiple kernels can access the same non-kernel function.
+// All the kernels accessing LDS through non-kernels are sorted and
+// assigned a kernel-id. All the LDS globals accessed by non-kernels
+// are sorted. This information is used to build two tables:
+// - Base table:
+//   The base table has a single row, with its elements indexed by
+//   kernel ID. Each element is the address of the "malloc LDS
+//   global" variable created for that kernel.
+// - Offset table:
+//   The offset table has multiple rows and columns. Rows are
+//   numbered 0 to (n-1), where n is the total number of kernels
+//   accessing LDS through non-kernels. Each row has m elements,
+//   where m is the total number of unique LDS globals accessed by
+//   all non-kernels. Each element in a row is the address of the
+//   replacement of the LDS global done by that particular kernel.
+// An LDS variable in a non-kernel is replaced using the information in the
+// base and offset tables. The kernel ID is queried, and the address of the
+// "malloc LDS global" for the corresponding kernel is read from the base
+// table. The offset into that "malloc LDS global" is read from the
+// corresponding element of the offset table. Together these give the
+// replacement value.
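+// For illustration only, a non-kernel use of an LDS global is conceptually
+// rewritten to (a sketch, using the table names created by this pass):
+//   %kid  = call i32 @llvm.amdgcn.lds.kernel.id()
+//   %base = load of @llvm.amdgcn.sw.lds.base.table[%kid]
+//   %off  = load of the offset reached via
+//           @llvm.amdgcn.sw.lds.offset.table[%kid][<lds-index>]
+//   %rep  = getelementptr into %base by %off bytes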
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "Utils/AMDGPUMemoryUtils.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/SetOperations.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/ReplaceConstant.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+
+#include <algorithm>
+
+#define DEBUG_TYPE "amdgpu-sw-lower-lds"
+
+using namespace llvm;
+using namespace AMDGPU;
+
+namespace {
+
+using DomTreeCallback = function_ref<DominatorTree *(Function &F)>;
+
+struct LDSAccessTypeInfo {
+ SetVector<GlobalVariable *> StaticLDSGlobals;
+ SetVector<GlobalVariable *> DynamicLDSGlobals;
+};
+
+// Struct to hold all the metadata required for a kernel
+// to replace its LDS global uses with the corresponding
+// offsets into device global memory.
+struct KernelLDSParameters {
+ GlobalVariable *MallocLDSGlobal{nullptr};
+ GlobalVariable *MallocMetadataGlobal{nullptr};
+ LDSAccessTypeInfo DirectAccess;
+ LDSAccessTypeInfo IndirectAccess;
+ DenseMap<GlobalVariable *, SmallVector<uint32_t, 3>>
+ LDSToReplacementIndicesMap;
+ int32_t KernelId{-1};
+ uint32_t MallocSize{0};
+};
+
+// Struct to store the info needed to create the base and offset
+// tables for all the non-kernel LDS accesses.
+struct NonKernelLDSParameters {
+ GlobalVariable *LDSBaseTable{nullptr};
+ GlobalVariable *LDSOffsetTable{nullptr};
+ SetVector<Function *> OrderedKernels;
+  SetVector<GlobalVariable *> OrderedLDSGlobals;
+};
+
+class AMDGPUSwLowerLDS {
+public:
+  AMDGPUSwLowerLDS(Module &Mod, DomTreeCallback Callback)
+      : M(Mod), IRB(M.getContext()), DTCallback(Callback) {}
+ bool Run();
+ void GetUsesOfLDSByNonKernels(CallGraph const &CG,
+ FunctionVariableMap &functions);
+ SetVector<Function *>
+ GetOrderedIndirectLDSAccessingKernels(SetVector<Function *> &&Kernels);
+ SetVector<GlobalVariable *>
+ GetOrderedNonKernelAllLDSGlobals(SetVector<GlobalVariable *> &&Variables);
+ void PopulateMallocLDSGlobal(Function *Func);
+ void PopulateMallocMetadataGlobal(Function *Func);
+ void PopulateLDSToReplacementIndicesMap(Function *Func);
+ void ReplaceKernelLDSAccesses(Function *Func);
+ void LowerKernelLDSAccesses(Function *Func, DomTreeUpdater &DTU);
+ void BuildNonKernelLDSOffsetTable(
+ std::shared_ptr<NonKernelLDSParameters> &NKLDSParams);
+ void BuildNonKernelLDSBaseTable(
+ std::shared_ptr<NonKernelLDSParameters> &NKLDSParams);
+ Constant *
+ GetAddressesOfVariablesInKernel(Function *Func,
+ SetVector<GlobalVariable *> &Variables);
+ void LowerNonKernelLDSAccesses(
+ Function *Func, SetVector<GlobalVariable *> &LDSGlobals,
+ std::shared_ptr<NonKernelLDSParameters> &NKLDSParams);
+
+private:
+ Module &M;
+ IRBuilder<> IRB;
+ DomTreeCallback DTCallback;
+ DenseMap<Function *, std::shared_ptr<KernelLDSParameters>>
+ KernelToLDSParametersMap;
+};
+
+// Sort a vector of globals or functions by name and return the result as
+// a SetVector.
+template <typename T> SetVector<T> SortByName(std::vector<T> &&V) {
+  llvm::sort(V.begin(), V.end(), [](const auto *L, const auto *R) {
+    return L->getName() < R->getName();
+  });
+  return SetVector<T>(V.begin(), V.end());
+}
+
+SetVector<GlobalVariable *> AMDGPUSwLowerLDS::GetOrderedNonKernelAllLDSGlobals(
+    SetVector<GlobalVariable *> &&Variables) {
+  // Sort all the non-kernel LDS accesses based on their name.
+  return SortByName(
+      std::vector<GlobalVariable *>(Variables.begin(), Variables.end()));
+}
+
+SetVector<Function *> AMDGPUSwLowerLDS::GetOrderedIndirectLDSAccessingKernels(
+ SetVector<Function *> &&Kernels) {
+  // Sort the kernels that access LDS indirectly based on their name.
+  // Also assign kernel ID metadata based on the sorted order.
+ LLVMContext &Ctx = M.getContext();
+ if (Kernels.size() > UINT32_MAX) {
+ // 32 bit keeps it in one SGPR. > 2**32 kernels won't fit on the GPU
+ report_fatal_error("Unimplemented SW LDS lowering for > 2**32 kernels");
+ }
+ SetVector<Function *> OrderedKernels =
+ SortByName(std::vector<Function *>(Kernels.begin(), Kernels.end()));
+ for (size_t i = 0; i < Kernels.size(); i++) {
+ Metadata *AttrMDArgs[1] = {
+ ConstantAsMetadata::get(IRB.getInt32(i)),
+ };
+ Function *Func = OrderedKernels[i];
+ Func->setMetadata("llvm.amdgcn.lds.kernel.id",
+ MDNode::get(Ctx, AttrMDArgs));
+ auto &LDSParams = KernelToLDSParametersMap[Func];
+ assert(LDSParams);
+ LDSParams->KernelId = i;
+ }
+  return OrderedKernels;
+}
+
+void AMDGPUSwLowerLDS::GetUsesOfLDSByNonKernels(
+ CallGraph const &CG, FunctionVariableMap &functions) {
+  // Collect the LDS globals used directly by non-kernel functions,
+  // excluding uses reached only through called functions.
+ for (auto &GV : M.globals()) {
+ if (!AMDGPU::isLDSVariableToLower(GV)) {
+ continue;
+ }
+
+ if (GV.isAbsoluteSymbolRef()) {
+ report_fatal_error(
+ "LDS variables with absolute addresses are unimplemented.");
+ }
+
+ for (User *V : GV.users()) {
+ User *FUU = V;
+      bool IsCast = isa<BitCastOperator, AddrSpaceCastOperator>(FUU);
+      if (IsCast && FUU->hasOneUse() && !FUU->user_begin()->user_empty())
+ FUU = *FUU->user_begin();
+ if (auto *I = dyn_cast<Instruction>(FUU)) {
+ Function *F = I->getFunction();
+ if (!isKernelLDS(F)) {
+ functions[F].insert(&GV);
+ }
+ }
+ }
+ }
+}
+
+void AMDGPUSwLowerLDS::PopulateMallocLDSGlobal(Function *Func) {
+ // Create new LDS global required for each kernel to store
+ // device global memory pointer.
+ auto &LDSParams = KernelToLDSParametersMap[Func];
+ assert(LDSParams);
+  // Create a new global pointer variable for the kernel.
+ LDSParams->MallocLDSGlobal = new GlobalVariable(
+ M, IRB.getPtrTy(), false, GlobalValue::InternalLinkage,
+ PoisonValue::get(IRB.getPtrTy()),
+ Twine("llvm.amdgcn.sw.lds." + Func->getName()), nullptr,
+ GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS, false);
+}
+
+void AMDGPUSwLowerLDS::PopulateMallocMetadataGlobal(Function *Func) {
+  // Create a new metadata global for every kernel and initialize the
+  // start offsets and sizes corresponding to each LDS access.
+ auto &LDSParams = KernelToLDSParametersMap[Func];
+ assert(LDSParams);
+ auto &Ctx = M.getContext();
+ auto &DL = M.getDataLayout();
+ std::vector<Type *> Items;
+ Type *Int32Ty = IRB.getInt32Ty();
+ std::vector<Constant *> Initializers;
+ Align MaxAlignment(1);
+ auto UpdateMaxAlignment = [&MaxAlignment, &DL](GlobalVariable *GV) {
+ uint32_t GVAlignValue = GV->getAlignment();
+ Align GVAlign =
+ GVAlignValue ? Align(GVAlignValue) : AMDGPU::getAlign(DL, GV);
+ MaxAlignment = std::max(MaxAlignment, GVAlign);
+ };
+
+ for (GlobalVariable *GV : LDSParams->DirectAccess.StaticLDSGlobals)
+ UpdateMaxAlignment(GV);
+
+ for (GlobalVariable *GV : LDSParams->DirectAccess.DynamicLDSGlobals)
+ UpdateMaxAlignment(GV);
+
+ for (GlobalVariable *GV : LDSParams->IndirectAccess.StaticLDSGlobals)
+ UpdateMaxAlignment(GV);
+
+ for (GlobalVariable *GV : LDSParams->IndirectAccess.DynamicLDSGlobals)
+ UpdateMaxAlignment(GV);
+
+ uint32_t MaxAlignValue = MaxAlignment.value();
+
+  // {StartOffset, SizeInBytes}
+ StructType *LDSItemTy = StructType::create(
+ Ctx, {Int32Ty, Int32Ty},
+ "llvm.amdgcn.sw.lds." + Func->getName().str() + ".md.item");
+
+  auto InitializerLambda = [&](SetVector<GlobalVariable *> &LDSGlobals) {
+ for (auto &GV : LDSGlobals) {
+ Type *Ty = GV->getValueType();
+ const uint64_t SizeInBytes = DL.getTypeAllocSize(Ty);
+ Items.push_back(LDSItemTy);
+ Constant *ItemStartOffset =
+ ConstantInt::get(Int32Ty, LDSParams->MallocSize);
+ Constant *SizeInBytesConst = ConstantInt::get(Int32Ty, SizeInBytes);
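+      // Round the size up to MaxAlignValue so the next LDS global starts
+      // at an aligned offset; e.g. SizeInBytes = 5 with MaxAlignValue = 8
+      // yields AlignedSize = 8.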
+ uint64_t AlignedSize =
+ ((SizeInBytes + MaxAlignValue - 1) / MaxAlignValue) * MaxAlignValue;
+ LDSParams->MallocSize += AlignedSize;
+ Constant *InitItem =
+ ConstantStruct::get(LDSItemTy, {ItemStartOffset, SizeInBytesConst});
+ Initializers.push_back(InitItem);
+ }
+ };
+
+  InitializerLambda(LDSParams->DirectAccess.StaticLDSGlobals);
+  InitializerLambda(LDSParams->IndirectAccess.StaticLDSGlobals);
+  InitializerLambda(LDSParams->DirectAccess.DynamicLDSGlobals);
+  InitializerLambda(LDSParams->IndirectAccess.DynamicLDSGlobals);
+
+ StructType *MetadataStructType = StructType::create(
+ Ctx, Items, ("llvm.amdgcn.sw.lds." + Func->getName().str() + ".md.type"));
+ LDSParams->MallocMetadataGlobal = new GlobalVariable(
+ M, MetadataStructType, false, GlobalValue::InternalLinkage,
+ PoisonValue::get(MetadataStructType),
+ ("llvm.amdgcn.sw.lds." + Func->getName().str() + ".md"), nullptr,
+ GlobalValue::NotThreadLocal, AMDGPUAS::GLOBAL_ADDRESS, false);
+  Constant *Data = ConstantStruct::get(MetadataStructType, Initializers);
+  LDSParams->MallocMetadataGlobal->setInitializer(Data);
+ LDSParams->MallocMetadataGlobal->setAlignment(MaxAlignment);
+ GlobalValue::SanitizerMetadata MD;
+ MD.NoAddress = true;
+ LDSParams->MallocMetadataGlobal->setSanitizerMetadata(MD);
+}
+
+void AMDGPUSwLowerLDS::PopulateLDSToReplacementIndicesMap(Function *Func) {
+ // Fill the corresponding LDS replacement indices for each LDS access
+ // related to this kernel.
+ auto &LDSParams = KernelToLDSParametersMap[Func];
+ assert(LDSParams);
+ auto PopulateIndices = [&](SetVector<GlobalVariable *> &LDSGlobals,
+ uint32_t &Idx) {
+ for (auto &GV : LDSGlobals) {
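+      // {0, Idx, 0} are GEP indices into the metadata global: 0 to step
+      // over the global pointer, Idx to select this global's
+      // {offset, size} entry, and 0 to select the offset member.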
+ LDSParams->LDSToReplacementIndicesMap[GV] = {0, Idx, 0};
+ ++Idx;
+ }
+ };
+ uint32_t Idx = 0;
+ PopulateIndices(LDSParams->DirectAccess.StaticLDSGlobals, Idx);
+ PopulateIndices(LDSParams->IndirectAccess.StaticLDSGlobals, Idx);
+ PopulateIndices(LDSParams->DirectAccess.DynamicLDSGlobals, Idx);
+ PopulateIndices(LDSParams->IndirectAccess.DynamicLDSGlobals, Idx);
+}
+
+static void ReplaceUsesOfGlobalInFunction(Function *Func, GlobalVariable *GV,
+                                          Value *Replacement) {
+  // Replace all uses of the LDS global in this function with Replacement.
+ auto ReplaceUsesLambda = [Func](const Use &U) -> bool {
+ auto *FUU = U.getUser();
+    bool IsCast = isa<BitCastOperator, AddrSpaceCastOperator>(FUU);
+    if (IsCast && FUU->hasOneUse() && !FUU->user_begin()->user_empty())
+      FUU = *FUU->user_begin();
+    if (auto *Inst = llvm::dyn_cast<Instruction>(FUU))
+      return Inst->getFunction() == Func;
+ return false;
+ };
+ GV->replaceUsesWithIf(Replacement, ReplaceUsesLambda);
+}
+
+void AMDGPUSwLowerLDS::ReplaceKernelLDSAccesses(Function *Func) {
+ auto &LDSParams = KernelToLDSParametersMap[Func];
+ assert(LDSParams);
+ GlobalVariable *MallocLDSGlobal = LDSParams->MallocLDSGlobal;
+ assert(MallocLDSGlobal);
+ GlobalVariable *MallocMetadataGlobal = LDSParams->MallocMetadataGlobal;
+ assert(MallocMetadataGlobal);
+ StructType *MallocMetadataStructType =
+ cast<StructType>(MallocMetadataGlobal->getValueType());
+ Type *Int32Ty = IRB.getInt32Ty();
+
+  // Build the replacement address for each LDS global and rewrite its uses.
+ auto ReplaceLDSGlobalUses = [&](SetVector<GlobalVariable *> &LDSGlobals) {
+ for (auto &GV : LDSGlobals) {
+      // Do not generate instructions if the LDS global is only accessed
+      // indirectly, i.e. from non-kernel functions.
+ if ((LDSParams->IndirectAccess.StaticLDSGlobals.contains(GV) ||
+ LDSParams->IndirectAccess.DynamicLDSGlobals.contains(GV)) &&
+ (!LDSParams->DirectAccess.StaticLDSGlobals.contains(GV) &&
+ !LDSParams->DirectAccess.DynamicLDSGlobals.contains(GV)))
+ continue;
+ auto &Indices = LDSParams->LDSToReplacementIndicesMap[GV];
+ assert(Indices.size() == 3);
+ uint32_t Idx0 = Indices[0];
+ uint32_t Idx1 = Indices[1];
+ uint32_t Idx2 = Indices[2];
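+      // These indices address this global's offset slot in the metadata
+      // global; the code emitted below loads that byte offset and GEPs
+      // into the malloc LDS global to form the replacement pointer.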
+ Constant *GEPIdx[] = {ConstantInt::get(Int32Ty, Idx0),
+ ConstantInt::get(Int32Ty, Idx1),
+ ConstantInt::get(Int32Ty, Idx2)};
+ Constant *GEP = ConstantExpr::getGetElementPtr(
+ MallocMetadataStructType, MallocMetadataGlobal, GEPIdx, true);
+ Value *Load = IRB.CreateLoad(Int32Ty, GEP);
+ Value *BasePlusOffset =
+ IRB.CreateInBoundsGEP(GV->getType(), MallocLDSGlobal, {Load});
+      ReplaceUsesOfGlobalInFunction(Func, GV, BasePlusOffset);
+ }
+ };
+ ReplaceLDSGlobalUses(LDSParams->DirectAccess.StaticLDSGlobals);
+ ReplaceLDSGlobalUses(LDSParams->IndirectAccess.StaticLDSGlobals);
+ ReplaceLDSGlobalUses(LDSParams->DirectAccess.DynamicLDSGlobals);
+ ReplaceLDSGlobalUses(LDSParams->IndirectAccess.DynamicLDSGlobals);
+}
+
+void AMDGPUSwLowerLDS::LowerKernelLDSAccesses(Function *Func,
+ DomTreeUpdater &DTU) {
+ auto &LDSParams = KernelToLDSParametersMap[Func];
+ assert(LDSParams);
+ auto &Ctx = M.getContext();
+ auto *PrevEntryBlock = &Func->getEntryBlock();
+
+ // Create malloc block.
+ auto *MallocBlock = BasicBlock::Create(Ctx, "Malloc", Func, PrevEntryBlock);
+
+  // Create the WId block, which holds the instructions that select the
+  // {0,0,0}-index work-item in the work-group.
+ auto *WIdBlock = BasicBlock::Create(Ctx, "WId", Func, MallocBlock);
+ IRB.SetInsertPoint(WIdBlock, WIdBlock->begin());
+ auto *const WIdx =
+ IRB.CreateIntrinsic(Intrinsic::amdgcn_workitem_id_x, {}, {});
+ auto *const WIdy =
+ IRB.CreateIntrinsic(Intrinsic::amdgcn_workitem_id_y, {}, {});
+ auto *const WIdz =
+ IRB.CreateIntrinsic(Intrinsic::amdgcn_workitem_id_z, {}, {});
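+  // x | y | z == 0 exactly when x == y == z == 0, so the compare below
+  // holds only for the {0,0,0}-index work-item.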
+ auto *const XYOr = IRB.CreateOr(WIdx, WIdy);
+ auto *const XYZOr = IRB.CreateOr(XYOr, WIdz);
+ auto *const WIdzCond = IRB.CreateICmpEQ(XYZOr, IRB.getInt32(0));
+
+ GlobalVariable *MallocMetadataGlobal = LDSParams->MallocMetadataGlobal;
+ assert(MallocMetadataGlobal);
+ StructType *MetadataStructType =
+ cast<StructType>(MallocMetadataGlobal->getValueType());
+
+  // All work-items branch to PrevEntryBlock, except the {0,0,0}-index
+  // work-item, which branches to the malloc block.
+ IRB.CreateCondBr(WIdzCond, MallocBlock, PrevEntryBlock);
+
+ // Malloc block
+ IRB.SetInsertPoint(MallocBlock, MallocBlock->begin());
+
+  // If dynamic LDS globals are accessed by the kernel, get the dynamic
+  // LDS size from the hidden dyn_lds_size kernel argument and update the
+  // corresponding metadata global entries for each dynamic LDS global.
+ uint32_t MallocSize = LDSParams->MallocSize;
+ Value *CurrMallocSize = IRB.getInt64(MallocSize);
+ if (!LDSParams->DirectAccess.DynamicLDSGlobals.empty() ||
+ !LDSParams->IndirectAccess.DynamicLDSGlobals.empty()) {
+ unsigned MaxAlignment = MallocMetadataGlobal->getAlignment();
+ Value *MaxAlignValue = IRB.getInt64(MaxAlignment);
+ Value *MaxAlignValueMinusOne = IRB.getInt64(MaxAlignment - 1);
+ auto MallocSizeCalcLambda =
+ [&](SetVector<GlobalVariable *> &DynamicLDSGlobals) {
+ for (GlobalVariable *DynGV : DynamicLDSGlobals) {
+ auto &Indices = LDSParams->LDSToReplacementIndicesMap[DynGV];
+            // Load the size from the kernel's hidden dyn_lds_size
+            // argument into CurrDynLDSSize.
+ Value *ImplicitArg =
+ IRB.CreateIntrinsic(Intrinsic::amdgcn_implicitarg_ptr, {}, {});
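+            // The GEP below indexes 15 pointer-sized (8-byte) slots into
+            // the implicit-arg area, the location where this pass expects
+            // the hidden dynamic LDS size to be stored.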
+ Value *HiddenDynLDSSize = IRB.CreateInBoundsGEP(
+ ImplicitArg->getType(), ImplicitArg, {IRB.getInt32(15)});
+ Value *CurrDynLDSSize =
+ IRB.CreateLoad(IRB.getInt64Ty(), HiddenDynLDSSize);
+ auto *GEPForOffset = IRB.CreateInBoundsGEP(
+ MetadataStructType, MallocMetadataGlobal,
+ {IRB.getInt32(0), IRB.getInt32(Indices[1]), IRB.getInt32(0)});
+ IRB.CreateStore(CurrMallocSize, GEPForOffset);
+
+ auto *GEPForSize = IRB.CreateInBoundsGEP(
+ MetadataStructType, MallocMetadataGlobal,
+ {IRB.getInt32(0), IRB.getInt32(Indices[1]), IRB.getInt32(1)});
+ IRB.CreateStore(CurrDynLDSSize, GEPForSize);
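+                // Accumulate the dynamic size and round up to the maximum
+                // alignment: CurrMallocSize =
+                //   alignTo(CurrMallocSize + CurrDynLDSSize, MaxAlignValue).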
+ CurrMallocSize = IRB.CreateAdd(CurrMallocSize, CurrDynLDSSize);
+ CurrMallocSize =
+ IRB.CreateAdd(CurrMallocSize, MaxAlignValueMinusOne);
+ CurrMallocSize = IRB.CreateUDiv(CurrMallocSize, MaxAlignValue);
+ CurrMallocSize = IRB.CreateMul(CurrMallocSize, MaxAlignValue);
+ }
+ };
+ MallocSizeCalcLambda(LDSParams->DirectAccess.DynamicLDSGlobals);
+ MallocSizeCalcLambda(LDSParams->IndirectAccess.DynamicLDSGlobals);
+ }
+
+  // Create a call to the malloc function, which allocates device global
+  // memory of a size equal to the combined size of all LDS globals
+  // accessed by this kernel.
+ const char MallocImplName[] = "malloc";
+ FunctionCallee AMDGPUMallocReturn = M.getOrInsertFunction(
+ MallocImplName,
+ FunctionType::get(IRB.getPtrTy(1), {IRB.getInt64Ty()}, false));
+ Value *MCI = IRB.CreateCall(AMDGPUMallocReturn, {CurrMallocSize});
+
+ GlobalVariable *MallocLDSGlobal = LDSParams->MallocLDSGlobal;
+ assert(MallocLDSGlobal);
+
+  // Store the malloc'ed pointer in the new malloc LDS global.
+ IRB.CreateStore(MCI, MallocLDSGlobal);
+
+ // Create branch to PrevEntryBlock
+ IRB.CreateBr(PrevEntryBlock);
+
+  // Create a work-group barrier at the start of the previous entry block.
+ Type *Int1Ty = IRB.getInt1Ty();
+ IRB.SetInsertPoint(PrevEntryBlock, PrevEntryBlock->begin());
+ auto *XYZCondPhi = IRB.CreatePHI(Int1Ty, 2, "xyzCond");
+ XYZCondPhi->addIncoming(IRB.getInt1(0), WIdBlock);
+ XYZCondPhi->addIncoming(IRB.getInt1(1), MallocBlock);
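+  // XYZCondPhi records whether this work-item took the malloc path; it is
+  // reused below to pick the single work-item that frees the allocation.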
+
+ IRB.CreateIntrinsic(Intrinsic::amdgcn_s_barrier, {}, {});
+
+ ReplaceKernelLDSAccesses(Func);
+
+ auto *CondFreeBlock = BasicBlock::Create(Ctx, "CondFree", Func);
+ auto *FreeBlock = BasicBlock::Create(Ctx, "Free", Func);
+ auto *EndBlock = BasicBlock::Create(Ctx, "End", Func);
+  // Redirect every return in the kernel to the CondFree block.
+  for (BasicBlock &BB : *Func) {
+    if (!BB.empty()) {
+      if (ReturnInst *RI = dyn_cast<ReturnInst>(&BB.back())) {
+        RI->eraseFromParent();
+        IRB.SetInsertPoint(&BB, BB.end());
+        IRB.CreateBr(CondFreeBlock);
+      }
+    }
+  }
+
+ // Cond Free Block
+ IRB.SetInsertPoint(CondFreeBlock, CondFreeBlock->begin());
+ IRB.CreateIntrinsic(Intrinsic::amdgcn_s_barrier, {}, {});
+ IRB.CreateCondBr(XYZCondPhi, FreeBlock, EndBlock);
+
+ // Free Block
+ IRB.SetInsertPoint(FreeBlock, FreeBlock->begin());
+
+  // Free the previously allocated device global memory.
+ const char FreeImplName[] = "free";
+ FunctionCallee AMDGPUFreeReturn = M.getOrInsertFunction(
+ FreeImplName,
+ FunctionType::get(IRB.getVoidTy(), {IRB.getPtrTy()}, false));
+
+ Value *MallocPtr = IRB.CreateLoad(IRB.getPtrTy(), MallocLDSGlobal);
+ IRB.CreateCall(AMDGPUFreeReturn, {MallocPtr});
+ IRB.CreateBr(EndBlock);
+
+ // End Block
+ IRB.SetInsertPoint(EndBlock, EndBlock->begin());
+ IRB.CreateRetVoid();
+  // Update the dominator tree with edges for the newly created blocks.
+ DTU.applyUpdates({{DominatorTree::Insert, WIdBlock, MallocBlock},
+ {DominatorTree::Insert, MallocBlock, PrevEntryBlock},
+ {DominatorTree::Insert, CondFreeBlock, FreeBlock},
+ {DominatorTree::Insert, FreeBlock, EndBlock}});
+}
+
+Constant *AMDGPUSwLowerLDS::GetAddressesOfVariablesInKernel(
+ Function *Func, SetVector<GlobalVariable *> &Variables) {
+ LLVMContext &Ctx = M.getContext();
+ Type *Int32Ty = Type::getInt32Ty(Ctx);
+ auto &LDSParams = KernelToLDSParametersMap[Func];
+ assert(LDSParams);
+
+ GlobalVariable *MallocMetadataGlobal = LDSParams->MallocMetadataGlobal;
+ assert(MallocMetadataGlobal);
+ StructType *MallocMetadataStructType =
+ cast<StructType>(MallocMetadataGlobal->getValueType());
+ ArrayType *KernelOffsetsType = ArrayType::get(Int32Ty, Variables.size());
+
+ SmallVector<Constant *> Elements;
+ for (size_t i = 0; i < Variables.size(); i++) {
+ GlobalVariable *GV = Variables[i];
+ assert(GV);
+ if (LDSParams->LDSToReplacementIndicesMap.contains(GV)) {
+ auto &Indices = LDSParams->LDSToReplacementIndicesMap[GV];
+ uint32_t Idx0 = Indices[0];
+ uint32_t Idx1 = Indices[1];
+ uint32_t Idx2 = Indices[2];
+ Constant *GEPIdx[] = {ConstantInt::get(Int32Ty, Idx0),
+ ConstantInt::get(Int32Ty, Idx1),
+ ConstantInt::get(Int32Ty, Idx2)};
+ Constant *GEP = ConstantExpr::getGetElementPtr(
+ MallocMetadataStructType, MallocMetadataGlobal, GEPIdx, true);
+      auto *Elt = ConstantExpr::getPtrToInt(GEP, Int32Ty);
+      Elements.push_back(Elt);
+ } else
+ Elements.push_back(PoisonValue::get(Int32Ty));
+ }
+ return ConstantArray::get(KernelOffsetsType, Elements);
+}
+
+void AMDGPUSwLowerLDS::BuildNonKernelLDSBaseTable(
+ std::shared_ptr<NonKernelLDSParameters> &NKLDSParams) {
+  // The base table has a single row, with its elements indexed by
+  // kernel ID. Each element is the address of the kernel's malloc
+  // LDS global variable.
+ auto &Kernels = NKLDSParams->OrderedKernels;
+ assert(!Kernels.empty());
+ LLVMContext &Ctx = M.getContext();
+ Type *Int32Ty = Type::getInt32Ty(Ctx);
+ const size_t NumberKernels = Kernels.size();
+ ArrayType *AllKernelsOffsetsType = ArrayType::get(Int32Ty, NumberKernels);
+  std::vector<Constant *> OverallConstantExprElts(NumberKernels);
+ for (size_t i = 0; i < NumberKernels; i++) {
+ Function *Func = Kernels[i];
+ auto &LDSParams = KernelToLDSParametersMap[Func];
+ assert(LDSParams);
+ GlobalVariable *MallocLDSGlobal = LDSParams->MallocLDSGlobal;
+ assert(MallocLDSGlobal);
+ Constant *GEPIdx[] = {ConstantInt::get(Int32Ty, 0)};
+ Constant *GEP = ConstantExpr::getGetElementPtr(
+ MallocLDSGlobal->getType(), MallocLDSGlobal, GEPIdx, true);
+    auto *Elt = ConstantExpr::getPtrToInt(GEP, Int32Ty);
+    OverallConstantExprElts[i] = Elt;
+ }
+  Constant *Init =
+      ConstantArray::get(AllKernelsOffsetsType, OverallConstantExprElts);
+ NKLDSParams->LDSBaseTable = new GlobalVariable(
+      M, AllKernelsOffsetsType, true, GlobalValue::InternalLinkage, Init,
+ "llvm.amdgcn.sw.lds.base.table", nullptr, GlobalValue::NotThreadLocal,
+ AMDGPUAS::CONSTANT_ADDRESS);
+}
+
+void AMDGPUSwLowerLDS::BuildNonKernelLDSOffsetTable(
+ std::shared_ptr<NonKernelLDSParameters> &NKLDSParams) {
+  // The offset table has multiple rows and columns. Rows are numbered
+  // 0 to (n-1), where n is the total number of kernels accessing LDS
+  // through non-kernels. Each row has m elements, where m is the total
+  // number of unique LDS globals accessed by non-kernels. Each element
+  // in a row is the address of the replacement of the LDS global done
+  // by that particular kernel.
+  auto &Variables = NKLDSParams->OrderedLDSGlobals;
+ auto &Kernels = NKLDSParams->OrderedKernels;
+ assert(!Variables.empty());
+ assert(!Kernels.empty());
+ LLVMContext &Ctx = M.getContext();
+ const size_t NumberVariables = Variables.size();
+ const size_t NumberKernels = Kernels.size();
+
+ ArrayType *KernelOffsetsType =
+ ArrayType::get(Type::getInt32Ty(Ctx), NumberVariables);
+
+ ArrayType *AllKernelsOffsetsType =
+ ArrayType::get(KernelOffsetsType, NumberKernels);
+  std::vector<Constant *> OverallConstantExprElts(NumberKernels);
+ for (size_t i = 0; i < NumberKernels; i++) {
+ Function *Func = Kernels[i];
+    OverallConstantExprElts[i] =
+ GetAddressesOfVariablesInKernel(Func, Variables);
+ }
+  Constant *Init =
+      ConstantArray::get(AllKernelsOffsetsType, OverallConstantExprElts);
+ NKLDSParams->LDSOffsetTable = new GlobalVariable(
+      M, AllKernelsOffsetsType, true, GlobalValue::InternalLinkage, Init,
+ "llvm.amdgcn.sw.lds.offset.table", nullptr, GlobalValue::NotThreadLocal,
+ AMDGPUAS::CONSTANT_ADDRESS);
+}
+
+void AMDGPUSwLowerLDS::LowerNonKernelLDSAccesses(
+ Function *Func, SetVector<GlobalVariable *> &LDSGlobals,
+ std::shared_ptr<NonKernelLDSParameters> &NKLDSParams) {
+  // Replace each LDS access in the non-kernel function with an address
+  // computed from the base table entry and the offset table entry.
+ auto *EntryBlock = &Func->getEntryBlock();
+ IRB.SetInsertPoint(EntryBlock, EntryBlock->begin());
+ Function *Decl =
+ Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_lds_kernel_id, {});
+ auto *KernelId = IRB.CreateCall(Decl, {});
+ GlobalVariable *LDSBaseTable = NKLDSParams->LDSBaseTable;
+ GlobalVariable *LDSOffsetTable = NKLDSParams->LDSOffsetTable;
+  auto &OrderedLDSGlobals = NKLDSParams->OrderedLDSGlobals;
+ assert(LDSBaseTable && LDSOffsetTable);
+ Value *BaseGEP = IRB.CreateInBoundsGEP(
+ LDSBaseTable->getValueType(), LDSBaseTable, {IRB.getInt32(0), KernelId});
+ Value *BaseLoad = IRB.CreateLoad(IRB.getInt32Ty(), BaseGEP);
+
+ for (GlobalVariable *GV : LDSGlobals) {
+ Value *BasePtr = IRB.CreateIntToPtr(BaseLoad, GV->getType());
+    auto GVIt =
+        std::find(OrderedLDSGlobals.begin(), OrderedLDSGlobals.end(), GV);
+    assert(GVIt != OrderedLDSGlobals.end());
+    uint32_t GVOffset = std::distance(OrderedLDSGlobals.begin(), GVIt);
+ Value *OffsetGEP = IRB.CreateInBoundsGEP(
+ LDSOffsetTable->getValueType(), LDSOffsetTable,
+ {IRB.getInt32(0), KernelId, IRB.getInt32(GVOffset)});
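+    // The offset table entry holds the address of this global's metadata
+    // slot; load that address, then load the actual byte offset from it.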
+ Value *OffsetLoad = IRB.CreateLoad(IRB.getInt32Ty(), OffsetGEP);
+ OffsetLoad = IRB.CreateIntToPtr(OffsetLoad, GV->getType());
+ OffsetLoad = IRB.CreateLoad(IRB.getInt32Ty(), OffsetLoad);
+ Value *BasePlusOffset =
+ IRB.CreateInBoundsGEP(GV->getType(), BasePtr, {OffsetLoad});
+    ReplaceUsesOfGlobalInFunction(Func, GV, BasePlusOffset);
+ }
+}
+
+static void ReorderStaticDynamicIndirectLDSSet(
+ std::shared_ptr<KernelLDSParameters> &LDSParams) {
+  // Sort the static and dynamic LDS globals, for both direct and
+  // indirect accesses, by name.
+ assert(LDSParams);
+ LDSParams->DirectAccess.StaticLDSGlobals =
+ SortByName(std::vector<GlobalVariable *>(
+ LDSParams->DirectAccess.StaticLDSGlobals.begin(),
+ LDSParams->DirectAccess.StaticLDSGlobals.end()));
+ LDSParams->DirectAccess.DynamicLDSGlobals =
+ SortByName(std::vector<GlobalVariable *>(
+ LDSParams->DirectAccess.DynamicLDSGlobals.begin(),
+ LDSParams->DirectAccess.DynamicLDSGlobals.end()));
+ LDSParams->IndirectAccess.StaticLDSGlobals =
+ SortByName(std::vector<GlobalVariable *>(
+ LDSParams->IndirectAccess.StaticLDSGlobals.begin(),
+ LDSParams->IndirectAccess.StaticLDSGlobals.end()));
+ LDSParams->IndirectAccess.DynamicLDSGlobals =
+ SortByName(std::vector<GlobalVariable *>(
+ LDSParams->IndirectAccess.DynamicLDSGlobals.begin(),
+ LDSParams->IndirectAccess.DynamicLDSGlobals.end()));
+}
+
+bool AMDGPUSwLowerLDS::Run() {
+ bool Changed = false;
+ CallGraph CG = CallGraph(M);
+ SetVector<Function *> KernelsWithIndirectLDSAccess;
+ FunctionVariableMap NonKernelToLDSAccessMap;
+ SetVector<GlobalVariable *> AllNonKernelLDSAccess;
+
+ Changed |= eliminateConstantExprUsesOfLDSFromAllInstructions(M);
+
+  // Get all the direct and indirect LDS accesses for all the kernels.
+ LDSUsesInfoTy LDSUsesInfo = getTransitiveUsesOfLDS(CG, M);
+
+ // Get the Uses of LDS from non-kernels.
+ GetUsesOfLDSByNonKernels(CG, NonKernelToLDSAccessMap);
+
+  // Utility to group LDS accesses into direct/indirect and static/dynamic.
+ auto PopulateKernelStaticDynamicLDS = [&](FunctionVariableMap &LDSAccesses,
+ bool DirectAccess) {
+ for (auto &K : LDSAccesses) {
+ Function *F = K.first;
+ assert(isKernelLDS(F));
+ assert(!K.second.empty());
+
+ if (!KernelToLDSParametersMap.contains(F))
+ KernelToLDSParametersMap[F] = std::make_shared<KernelLDSParameters>();
+
+ auto &LDSParams = KernelToLDSParametersMap[F];
+ assert(LDSParams);
+ if (!DirectAccess)
+ KernelsWithIndirectLDSAccess.insert(F);
+ for (GlobalVariable *GV : K.second) {
+ if (!DirectAccess) {
+ if (AMDGPU::isDynamicLDS(*GV))
+ LDSParams->IndirectAccess.DynamicLDSGlobals.insert(GV);
+ else
+ LDSParams->IndirectAccess.StaticLDSGlobals.insert(GV);
+ AllNonKernelLDSAccess.insert(GV);
+ } else {
+ if (AMDGPU::isDynamicLDS(*GV))
+ LDSParams->DirectAccess.DynamicLDSGlobals.insert(GV);
+ else
+ LDSParams->DirectAccess.StaticLDSGlobals.insert(GV);
+ }
+ }
+ }
+ };
+
+ PopulateKernelStaticDynamicLDS(LDSUsesInfo.direct_access, true);
+ PopulateKernelStaticDynamicLDS(LDSUsesInfo.indirect_access, false);
+
+ for (auto &K : KernelToLDSParametersMap) {
+ Function *Func = K.first;
+ auto &LDSParams = KernelToLDSParametersMap[Func];
+ assert(LDSParams);
+    // Skip kernels with no LDS uses; do not reset Changed here, as earlier
+    // kernels may already have been rewritten.
+    if (LDSParams->DirectAccess.StaticLDSGlobals.empty() &&
+        LDSParams->DirectAccess.DynamicLDSGlobals.empty() &&
+        LDSParams->IndirectAccess.StaticLDSGlobals.empty() &&
+        LDSParams->IndirectAccess.DynamicLDSGlobals.empty())
+      continue;
+    ReorderStaticDynamicIndirectLDSSet(LDSParams);
+    PopulateMallocLDSGlobal(Func);
+    PopulateMallocMetadataGlobal(Func);
+    PopulateLDSToReplacementIndicesMap(Func);
+    DomTreeUpdater DTU(DTCallback(*Func),
+                       DomTreeUpdater::UpdateStrategy::Lazy);
+    LowerKernelLDSAccesses(Func, DTU);
+    Changed = true;
+ }
+
+ std::shared_ptr<NonKernelLDSParameters> NKLDSParams =
+ std::make_shared<NonKernelLDSParameters>();
+ if (!NonKernelToLDSAccessMap.empty()) {
+ assert(NKLDSParams);
+ NKLDSParams->OrderedKernels = GetOrderedIndirectLDSAccessingKernels(
+ std::move(KernelsWithIndirectLDSAccess));
+    NKLDSParams->OrderedLDSGlobals =
+ GetOrderedNonKernelAllLDSGlobals(std::move(AllNonKernelLDSAccess));
+ assert(!NKLDSParams->OrderedKernels.empty());
+    assert(!NKLDSParams->OrderedLDSGlobals.empty());
+ BuildNonKernelLDSBaseTable(NKLDSParams);
+ BuildNonKernelLDSOffsetTable(NKLDSParams);
+ for (auto &K : NonKernelToLDSAccessMap) {
+ Function *Func = K.first;
+ DenseSet<GlobalVariable *> &LDSGlobals = K.second;
+ SetVector<GlobalVariable *> OrderedLDSGlobals = SortByName(
+ std::vector<GlobalVariable *>(LDSGlobals.begin(), LDSGlobals.end()));
+ LowerNonKernelLDSAccesses(Func, OrderedLDSGlobals, NKLDSParams);
+ }
+ }
+ return Changed;
+}
+
+class AMDGPUSwLowerLDSLegacy : public ModulePass {
+public:
+ static char ID;
+ AMDGPUSwLowerLDSLegacy() : ModulePass(ID) {}
+ bool runOnModule(Module &M) override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ }
+};
+} // namespace
+
+char AMDGPUSwLowerLDSLegacy::ID = 0;
+char &llvm::AMDGPUSwLowerLDSLegacyPassID = AMDGPUSwLowerLDSLegacy::ID;
+
+INITIALIZE_PASS(AMDGPUSwLowerLDSLegacy, "amdgpu-sw-lower-lds",
+ "AMDGPU Software lowering of LDS", false, false)
+
+bool AMDGPUSwLowerLDSLegacy::runOnModule(Module &M) {
+ DominatorTreeWrapperPass *const DTW =
+ getAnalysisIfAvailable<DominatorTreeWrapperPass>();
+ auto DTCallback = [&DTW](Function &F) -> DominatorTree * {
+ return DTW ? &DTW->getDomTree() : nullptr;
+ };
+ bool IsChanged = false;
+ AMDGPUSwLowerLDS SwLowerLDSImpl(M, DTCallback);
+ IsChanged |= SwLowerLDSImpl.Run();
+ return IsChanged;
+}
+
+ModulePass *llvm::createAMDGPUSwLowerLDSLegacyPass() {
+ return new AMDGPUSwLowerLDSLegacy();
+}
+
+PreservedAnalyses AMDGPUSwLowerLDSPass::run(Module &M,
+ ModuleAnalysisManager &AM) {
+ auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+ auto DTCallback = [&FAM](Function &F) -> DominatorTree * {
+ return &FAM.getResult<DominatorTreeAnalysis>(F);
+ };
+ bool IsChanged = false;
+ AMDGPUSwLowerLDS SwLowerLDSImpl(M, DTCallback);
+ IsChanged |= SwLowerLDSImpl.Run();
+ if (!IsChanged)
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserve<DominatorTreeAnalysis>();
+ return PA;
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 305a6c8c3b9262..ee7f4f8e3a6fcc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -402,6 +402,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeSILoadStoreOptimizerPass(*PR);
initializeAMDGPUCtorDtorLoweringLegacyPass(*PR);
initializeAMDGPUAlwaysInlinePass(*PR);
+ initializeAMDGPUSwLowerLDSLegacyPass(*PR);
initializeAMDGPUAttributorLegacyPass(*PR);
initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
initializeAMDGPUAnnotateUniformValuesPass(*PR);
@@ -676,6 +677,11 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(
if (EarlyInlineAll && !EnableFunctionCalls)
PM.addPass(AMDGPUAlwaysInlinePass());
+
+#if __has_feature(address_sanitizer)
+ EnableLowerModuleLDS = false;
+ PM.addPass(AMDGPUSwLowerLDSPass());
+#endif
});
PB.registerPeepholeEPCallback(
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 48325a0928f93d..139a416d50f291 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -73,6 +73,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPULowerKernelArguments.cpp
AMDGPULowerKernelAttributes.cpp
AMDGPULowerModuleLDSPass.cpp
+ AMDGPUSwLowerLDS.cpp
AMDGPUMachineCFGStructurizer.cpp
AMDGPUMachineFunction.cpp
AMDGPUMachineModuleInfo.cpp
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp
index 25e628e5cbc558..26b3819f7fd566 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp
@@ -9,13 +9,16 @@
#include "AMDGPUMemoryUtils.h"
#include "AMDGPU.h"
#include "AMDGPUBaseInfo.h"
+#include "llvm/ADT/SetOperations.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/IR/Operator.h"
#include "llvm/IR/ReplaceConstant.h"
#define DEBUG_TYPE "amdgpu-memory-utils"
@@ -65,6 +68,179 @@ bool isLDSVariableToLower(const GlobalVariable &GV) {
return true;
}
+bool eliminateConstantExprUsesOfLDSFromAllInstructions(Module &M) {
+ // Constants are uniqued within LLVM. A ConstantExpr referring to a LDS
+ // global may have uses from multiple different functions as a result.
+ // This pass specialises LDS variables with respect to the kernel that
+ // allocates them.
+
+ // This is semantically equivalent to (the unimplemented as slow):
+ // for (auto &F : M.functions())
+ // for (auto &BB : F)
+ // for (auto &I : BB)
+ // for (Use &Op : I.operands())
+ // if (constantExprUsesLDS(Op))
+ // replaceConstantExprInFunction(I, Op);
+
+ SmallVector<Constant *> LDSGlobals;
+ for (auto &GV : M.globals())
+ if (AMDGPU::isLDSVariableToLower(GV))
+ LDSGlobals.push_back(&GV);
+ return convertUsersOfConstantsToInstructions(LDSGlobals);
+}
+
+void getUsesOfLDSByFunction(CallGraph const &CG, Module &M,
+ FunctionVariableMap &kernels,
+ FunctionVariableMap &functions) {
+ // Get uses from the current function, excluding uses by called functions
+ // Two output variables to avoid walking the globals list twice
+ for (auto &GV : M.globals()) {
+ if (!AMDGPU::isLDSVariableToLower(GV)) {
+ continue;
+ }
+ for (User *V : GV.users()) {
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ Function *F = I->getFunction();
+ if (isKernelLDS(F)) {
+ kernels[F].insert(&GV);
+ } else {
+ functions[F].insert(&GV);
+ }
+ }
+ }
+ }
+}
+
+bool isKernelLDS(const Function *F) {
+ // Some weirdness here. AMDGPU::isKernelCC does not call into
+ // AMDGPU::isKernel with the calling conv, it instead calls into
+ // isModuleEntryFunction which returns true for more calling conventions
+ // than AMDGPU::isKernel does. There's a FIXME on AMDGPU::isKernel.
+ // There's also a test that checks that the LDS lowering does not hit on
+ // a graphics shader, denoted amdgpu_ps, so stay with the limited case.
+ // Putting LDS in the name of the function to draw attention to this.
+ return AMDGPU::isKernel(F->getCallingConv());
+}
+
+LDSUsesInfoTy getTransitiveUsesOfLDS(CallGraph const &CG, Module &M) {
+
+ FunctionVariableMap direct_map_kernel;
+ FunctionVariableMap direct_map_function;
+ getUsesOfLDSByFunction(CG, M, direct_map_kernel, direct_map_function);
+
+ // Collect variables that are used by functions whose address has escaped
+ DenseSet<GlobalVariable *> VariablesReachableThroughFunctionPointer;
+ for (Function &F : M.functions()) {
+ if (!isKernelLDS(&F))
+ if (F.hasAddressTaken(nullptr,
+ /* IgnoreCallbackUses */ false,
+ /* IgnoreAssumeLikeCalls */ false,
+ /* IgnoreLLVMUsed */ true,
+ /* IgnoreArcAttachedCall */ false)) {
+ set_union(VariablesReachableThroughFunctionPointer,
+ direct_map_function[&F]);
+ }
+ }
+
+ auto functionMakesUnknownCall = [&](const Function *F) -> bool {
+ assert(!F->isDeclaration());
+ for (const CallGraphNode::CallRecord &R : *CG[F]) {
+ if (!R.second->getFunction()) {
+ return true;
+ }
+ }
+ return false;
+ };
+
+ // Work out which variables are reachable through function calls
+ FunctionVariableMap transitive_map_function = direct_map_function;
+
+ // If the function makes any unknown call, assume the worst case that it can
+ // access all variables accessed by functions whose address escaped
+ for (Function &F : M.functions()) {
+ if (!F.isDeclaration() && functionMakesUnknownCall(&F)) {
+ if (!isKernelLDS(&F)) {
+ set_union(transitive_map_function[&F],
+ VariablesReachableThroughFunctionPointer);
+ }
+ }
+ }
+
+ // Direct implementation of collecting all variables reachable from each
+ // function
+ for (Function &Func : M.functions()) {
+ if (Func.isDeclaration() || isKernelLDS(&Func))
+ continue;
+
+ DenseSet<Function *> seen; // catches cycles
+ SmallVector<Function *, 4> wip{&Func};
+
+ while (!wip.empty()) {
+ Function *F = wip.pop_back_val();
+
+ // Can accelerate this by referring to transitive map for functions that
+ // have already been computed, with more care than this
+ set_union(transitive_map_function[&Func], direct_map_function[F]);
+
+ for (const CallGraphNode::CallRecord &R : *CG[F]) {
+ Function *ith = R.second->getFunction();
+ if (ith) {
+ if (!seen.contains(ith)) {
+ seen.insert(ith);
+ wip.push_back(ith);
+ }
+ }
+ }
+ }
+ }
+
+ // direct_map_kernel lists which variables are used by the kernel
+ // find the variables which are used through a function call
+ FunctionVariableMap indirect_map_kernel;
+
+ for (Function &Func : M.functions()) {
+ if (Func.isDeclaration() || !isKernelLDS(&Func))
+ continue;
+
+ for (const CallGraphNode::CallRecord &R : *CG[&Func]) {
+ Function *ith = R.second->getFunction();
+ if (ith) {
+ set_union(indirect_map_kernel[&Func], transitive_map_function[ith]);
+ } else {
+ set_union(indirect_map_kernel[&Func],
+ VariablesReachableThroughFunctionPointer);
+ }
+ }
+ }
+
+ // Verify that we fall into one of 2 cases:
+ // - All variables are absolute: this is a re-run of the pass
+ // so we don't have anything to do.
+ // - No variables are absolute.
+ std::optional<bool> HasAbsoluteGVs;
+ for (auto &Map : {direct_map_kernel, indirect_map_kernel}) {
+ for (auto &[Fn, GVs] : Map) {
+ for (auto *GV : GVs) {
+ bool IsAbsolute = GV->isAbsoluteSymbolRef();
+ if (HasAbsoluteGVs.has_value()) {
+ if (*HasAbsoluteGVs != IsAbsolute) {
+ report_fatal_error(
+ "Module cannot mix absolute and non-absolute LDS GVs");
+ }
+ } else
+ HasAbsoluteGVs = IsAbsolute;
+ }
+ }
+ }
+
+ // If we only had absolute GVs, we have nothing to do, return an empty
+ // result.
+ if (HasAbsoluteGVs && *HasAbsoluteGVs)
+ return {FunctionVariableMap(), FunctionVariableMap()};
+
+ return {std::move(direct_map_kernel), std::move(indirect_map_kernel)};
+}
+
bool isReallyAClobber(const Value *Ptr, MemoryDef *Def, AAResults *AA) {
Instruction *DefInst = Def->getMemoryInst();
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h
index e42b27f8e09e14..a199b927a28b6c 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h
@@ -9,6 +9,9 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUMEMORYUTILS_H
#define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUMEMORYUTILS_H
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+
namespace llvm {
struct Align;
@@ -19,14 +22,35 @@ class LoadInst;
class MemoryDef;
class MemorySSA;
class Value;
+class Function;
+class CallGraph;
+class Module;
namespace AMDGPU {
+using FunctionVariableMap = DenseMap<Function *, DenseSet<GlobalVariable *>>;
+using VariableFunctionMap = DenseMap<GlobalVariable *, DenseSet<Function *>>;
+
Align getAlign(DataLayout const &DL, const GlobalVariable *GV);
bool isDynamicLDS(const GlobalVariable &GV);
bool isLDSVariableToLower(const GlobalVariable &GV);
+struct LDSUsesInfoTy {
+ FunctionVariableMap direct_access;
+ FunctionVariableMap indirect_access;
+};
+
+bool eliminateConstantExprUsesOfLDSFromAllInstructions(Module &M);
+
+void getUsesOfLDSByFunction(CallGraph const &CG, Module &M,
+ FunctionVariableMap &kernels,
+ FunctionVariableMap &functions);
+
+bool isKernelLDS(const Function *F);
+
+LDSUsesInfoTy getTransitiveUsesOfLDS(CallGraph const &CG, Module &M);
+
/// Given a \p Def clobbering a load from \p Ptr according to the MSSA check
/// if this is actually a memory update or an artificial clobber to facilitate
/// ordering constraints.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-dynamic-indirect-access.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-dynamic-indirect-access.ll
new file mode 100644
index 00000000000000..3bc0ac00b5ccaf
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-dynamic-indirect-access.ll
@@ -0,0 +1,99 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt < %s -passes=amdgpu-sw-lower-lds -S -mtriple=amdgcn-- | FileCheck %s
+
+; Test that indirect dynamic LDS accesses through a non-kernel function called from a kernel are lowered correctly.
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
+target triple = "amdgcn-unknown-unknown"
+@lds_1 = internal unnamed_addr addrspace(3) global [1 x i8] poison, align 1
+@lds_2 = internal unnamed_addr addrspace(3) global [1 x i32] poison, align 2
+@lds_3 = external unnamed_addr addrspace(3) global [0 x i8], align 4
+@lds_4 = external unnamed_addr addrspace(3) global [0 x i8], align 8
+
+define void @use_variables() {
+; CHECK-LABEL: define void @use_variables() {
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id()
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x i32], ptr addrspace(4) @llvm.amdgcn.sw.lds.base.table, i32 0, i32 [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[TMP2]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i32 [[TMP3]] to ptr addrspace(3)
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1 x [2 x i32]], ptr addrspace(4) @llvm.amdgcn.sw.lds.offset.table, i32 0, i32 [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(4) [[TMP5]], align 4
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i32 [[TMP6]] to ptr addrspace(3)
+; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(3) [[TMP7]], align 4
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) [[TMP4]], i32 [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i32 [[TMP3]] to ptr addrspace(3)
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1 x [2 x i32]], ptr addrspace(4) @llvm.amdgcn.sw.lds.offset.table, i32 0, i32 [[TMP1]], i32 1
+; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr addrspace(4) [[TMP11]], align 4
+; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i32 [[TMP12]] to ptr addrspace(3)
+; CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr addrspace(3) [[TMP13]], align 4
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) [[TMP10]], i32 [[TMP14]]
+; CHECK-NEXT: store i8 3, ptr addrspace(3) [[TMP9]], align 4
+; CHECK-NEXT: store i8 3, ptr addrspace(3) [[TMP15]], align 8
+; CHECK-NEXT: ret void
+;
+ store i8 3, ptr addrspace(3) @lds_3, align 4
+ store i8 3, ptr addrspace(3) @lds_4, align 8
+ ret void
+}
+
+define amdgpu_kernel void @k0() {
+; CHECK-LABEL: define amdgpu_kernel void @k0(
+; CHECK-SAME: ) !llvm.amdgcn.lds.kernel.id [[META0:![0-9]+]] {
+; CHECK-NEXT: WId:
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z()
+; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
+; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP21:%.*]]
+; CHECK: Malloc:
+; CHECK-NEXT: [[TMP6:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds ptr addrspace(4), ptr addrspace(4) [[TMP6]], i32 15
+; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr addrspace(4) [[TMP7]], align 8
+; CHECK-NEXT: store i64 16, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 2, i32 0), align 8
+; CHECK-NEXT: store i64 [[TMP8]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 2, i32 1), align 8
+; CHECK-NEXT: [[TMP9:%.*]] = add i64 16, [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[TMP9]], 7
+; CHECK-NEXT: [[TMP11:%.*]] = udiv i64 [[TMP10]], 8
+; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 8
+; CHECK-NEXT: [[TMP13:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds ptr addrspace(4), ptr addrspace(4) [[TMP13]], i32 15
+; CHECK-NEXT: [[TMP15:%.*]] = load i64, ptr addrspace(4) [[TMP14]], align 8
+; CHECK-NEXT: store i64 [[TMP12]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 3, i32 0), align 8
+; CHECK-NEXT: store i64 [[TMP15]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 3, i32 1), align 8
+; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[TMP12]], [[TMP15]]
+; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[TMP16]], 7
+; CHECK-NEXT: [[TMP18:%.*]] = udiv i64 [[TMP17]], 8
+; CHECK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 8
+; CHECK-NEXT: [[TMP20:%.*]] = call ptr addrspace(1) @malloc(i64 [[TMP19]])
+; CHECK-NEXT: store ptr addrspace(1) [[TMP20]], ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, align 8
+; CHECK-NEXT: br label [[TMP21]]
+; CHECK: 21:
+; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ]
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, align 4
+; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, i32 [[TMP22]]
+; CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 1, i32 0), align 4
+; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, i32 [[TMP24]]
+; CHECK-NEXT: call void @use_variables()
+; CHECK-NEXT: store i8 7, ptr addrspace(3) [[TMP23]], align 1
+; CHECK-NEXT: store i32 8, ptr addrspace(3) [[TMP25]], align 2
+; CHECK-NEXT: br label [[CONDFREE:%.*]]
+; CHECK: CondFree:
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: br i1 [[XYZCOND]], label [[FREE:%.*]], label [[END:%.*]]
+; CHECK: Free:
+; CHECK-NEXT: [[TMP26:%.*]] = load ptr, ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, align 8
+; CHECK-NEXT: call void @free(ptr [[TMP26]])
+; CHECK-NEXT: br label [[END]]
+; CHECK: End:
+; CHECK-NEXT: ret void
+;
+ call void @use_variables()
+ store i8 7, ptr addrspace(3) @lds_1, align 1
+ store i32 8, ptr addrspace(3) @lds_2, align 2
+ ret void
+}
+;.
+; CHECK: [[META0]] = !{i32 0}
+;.
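[Reading the CHECK lines above: in a non-kernel, each LDS reference becomes
two table lookups keyed by the kernel id. A rough scalar model of the lowered
address computation follows; the table and function names are hypothetical,
standing in for the @llvm.amdgcn.sw.lds.base.table and
@llvm.amdgcn.sw.lds.offset.table globals the pass emits.]

  #include <cstdint>

  constexpr int NumKernels = 1, NumVars = 2; // hypothetical sizes

  // base_table[Kid] holds the address of the kernel's LDS slot that in
  // turn holds the malloc'd buffer pointer; offset_table[Kid][V] holds
  // the address of variable V's offset entry in the kernel's metadata.
  std::uint32_t base_table[NumKernels];
  std::uint32_t offset_table[NumKernels][NumVars];

  char *loweredAddress(std::uint32_t Kid, std::uint32_t V) {
    char *Base = reinterpret_cast<char *>(
        static_cast<std::uintptr_t>(base_table[Kid]));
    auto *OffSlot = reinterpret_cast<std::uint32_t *>(
        static_cast<std::uintptr_t>(offset_table[Kid][V]));
    return Base + *OffSlot; // the pointer the rewritten store uses
  }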
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-dynamic-lds-test.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-dynamic-lds-test.ll
new file mode 100644
index 00000000000000..28a8cd62053078
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-dynamic-lds-test.ll
@@ -0,0 +1,57 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 4
+; RUN: opt < %s -passes=amdgpu-sw-lower-lds -S -mtriple=amdgcn-- | FileCheck %s
+
+; Test to check that direct access of dynamic LDS in a kernel is lowered correctly.
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
+target triple = "amdgcn-unknown-unknown"
+@lds_1 = internal unnamed_addr addrspace(3) global [0 x i8] poison, align 4
+@lds_2 = internal unnamed_addr addrspace(3) global [0 x i32] poison, align 8
+
+;.
+; CHECK: @lds_1 = internal unnamed_addr addrspace(3) global [0 x i8] poison, align 4
+; CHECK: @lds_2 = internal unnamed_addr addrspace(3) global [0 x i32] poison, align 8
+; CHECK: @llvm.amdgcn.sw.lds.k0 = internal addrspace(3) global ptr poison
+; CHECK: @llvm.amdgcn.sw.lds.k0.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.k0.md.type zeroinitializer, no_sanitize_address, align 8
+;.
+define amdgpu_kernel void @k0() {
+; CHECK-LABEL: define amdgpu_kernel void @k0() {
+; CHECK-NEXT: WId:
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z()
+; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
+; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP7:%.*]]
+; CHECK: Malloc:
+; CHECK-NEXT: [[TMP6:%.*]] = call ptr addrspace(1) @malloc(i64 0)
+; CHECK-NEXT: store ptr addrspace(1) [[TMP6]], ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, align 8
+; CHECK-NEXT: br label [[TMP7]]
+; CHECK: 7:
+; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ]
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, align 4
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, i32 [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 1, i32 0), align 4
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, i32 [[TMP10]]
+; CHECK-NEXT: store i8 7, ptr addrspace(3) [[TMP9]], align 4
+; CHECK-NEXT: store i32 8, ptr addrspace(3) [[TMP11]], align 8
+; CHECK-NEXT: br label [[CONDFREE:%.*]]
+; CHECK: CondFree:
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: br i1 [[XYZCOND]], label [[FREE:%.*]], label [[END:%.*]]
+; CHECK: Free:
+; CHECK-NEXT: [[TMP12:%.*]] = load ptr, ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, align 8
+; CHECK-NEXT: call void @free(ptr [[TMP12]])
+; CHECK-NEXT: br label [[END]]
+; CHECK: End:
+; CHECK-NEXT: ret void
+;
+ store i8 7, ptr addrspace(3) @lds_1, align 4
+ store i32 8, ptr addrspace(3) @lds_2, align 8
+ ret void
+}
+;.
+; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+; CHECK: attributes #[[ATTR1:[0-9]+]] = { convergent nocallback nofree nounwind willreturn }
+;.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-multi-static-dynamic-indirect-access.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-multi-static-dynamic-indirect-access.ll
new file mode 100644
index 00000000000000..cb5132770d7c1b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-multi-static-dynamic-indirect-access.ll
@@ -0,0 +1,192 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt < %s -passes=amdgpu-sw-lower-lds -S -mtriple=amdgcn-- | FileCheck %s
+
+; Test to check that when multiple kernels access the same non-kernel, LDS accesses are lowered correctly.
+
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
+target triple = "amdgcn-unknown-unknown"
+@lds_1 = internal unnamed_addr addrspace(3) global [1 x i8] poison, align 1
+@lds_2 = internal unnamed_addr addrspace(3) global [1 x i32] poison, align 2
+@lds_3 = external unnamed_addr addrspace(3) global [0 x i8], align 4
+@lds_4 = external unnamed_addr addrspace(3) global [0 x i8], align 8
+
+define void @use_variables_1() {
+; CHECK-LABEL: define void @use_variables_1() {
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id()
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x i32], ptr addrspace(4) @llvm.amdgcn.sw.lds.base.table, i32 0, i32 [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[TMP2]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i32 [[TMP3]] to ptr addrspace(3)
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x [4 x i32]], ptr addrspace(4) @llvm.amdgcn.sw.lds.offset.table, i32 0, i32 [[TMP1]], i32 2
+; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(4) [[TMP5]], align 4
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i32 [[TMP6]] to ptr addrspace(3)
+; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(3) [[TMP7]], align 4
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) [[TMP4]], i32 [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i32 [[TMP3]] to ptr addrspace(3)
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x [4 x i32]], ptr addrspace(4) @llvm.amdgcn.sw.lds.offset.table, i32 0, i32 [[TMP1]], i32 3
+; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr addrspace(4) [[TMP11]], align 4
+; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i32 [[TMP12]] to ptr addrspace(3)
+; CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr addrspace(3) [[TMP13]], align 4
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) [[TMP10]], i32 [[TMP14]]
+; CHECK-NEXT: store i8 3, ptr addrspace(3) [[TMP9]], align 4
+; CHECK-NEXT: store i8 3, ptr addrspace(3) [[TMP15]], align 8
+; CHECK-NEXT: ret void
+;
+ store i8 3, ptr addrspace(3) @lds_3, align 4
+ store i8 3, ptr addrspace(3) @lds_4, align 8
+ ret void
+}
+
+define void @use_variables_2() {
+; CHECK-LABEL: define void @use_variables_2() {
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id()
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x i32], ptr addrspace(4) @llvm.amdgcn.sw.lds.base.table, i32 0, i32 [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[TMP2]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i32 [[TMP3]] to ptr addrspace(3)
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x [4 x i32]], ptr addrspace(4) @llvm.amdgcn.sw.lds.offset.table, i32 0, i32 [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(4) [[TMP5]], align 4
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i32 [[TMP6]] to ptr addrspace(3)
+; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(3) [[TMP7]], align 4
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) [[TMP4]], i32 [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i32 [[TMP3]] to ptr addrspace(3)
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x [4 x i32]], ptr addrspace(4) @llvm.amdgcn.sw.lds.offset.table, i32 0, i32 [[TMP1]], i32 1
+; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr addrspace(4) [[TMP11]], align 4
+; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i32 [[TMP12]] to ptr addrspace(3)
+; CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr addrspace(3) [[TMP13]], align 4
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) [[TMP10]], i32 [[TMP14]]
+; CHECK-NEXT: store i8 7, ptr addrspace(3) [[TMP9]], align 1
+; CHECK-NEXT: store i32 8, ptr addrspace(3) [[TMP15]], align 2
+; CHECK-NEXT: ret void
+;
+ store i8 7, ptr addrspace(3) @lds_1, align 1
+ store i32 8, ptr addrspace(3) @lds_2, align 2
+ ret void
+}
+
+define amdgpu_kernel void @k0() {
+; CHECK-LABEL: define amdgpu_kernel void @k0(
+; CHECK-SAME: ) !llvm.amdgcn.lds.kernel.id [[META0:![0-9]+]] {
+; CHECK-NEXT: WId:
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z()
+; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
+; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP21:%.*]]
+; CHECK: Malloc:
+; CHECK-NEXT: [[TMP6:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds ptr addrspace(4), ptr addrspace(4) [[TMP6]], i32 15
+; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr addrspace(4) [[TMP7]], align 8
+; CHECK-NEXT: store i64 8, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 1, i32 0), align 8
+; CHECK-NEXT: store i64 [[TMP8]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 1, i32 1), align 8
+; CHECK-NEXT: [[TMP9:%.*]] = add i64 8, [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[TMP9]], 7
+; CHECK-NEXT: [[TMP11:%.*]] = udiv i64 [[TMP10]], 8
+; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 8
+; CHECK-NEXT: [[TMP13:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds ptr addrspace(4), ptr addrspace(4) [[TMP13]], i32 15
+; CHECK-NEXT: [[TMP15:%.*]] = load i64, ptr addrspace(4) [[TMP14]], align 8
+; CHECK-NEXT: store i64 [[TMP12]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 2, i32 0), align 8
+; CHECK-NEXT: store i64 [[TMP15]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 2, i32 1), align 8
+; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[TMP12]], [[TMP15]]
+; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[TMP16]], 7
+; CHECK-NEXT: [[TMP18:%.*]] = udiv i64 [[TMP17]], 8
+; CHECK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 8
+; CHECK-NEXT: [[TMP20:%.*]] = call ptr addrspace(1) @malloc(i64 [[TMP19]])
+; CHECK-NEXT: store ptr addrspace(1) [[TMP20]], ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, align 8
+; CHECK-NEXT: br label [[TMP21]]
+; CHECK: 21:
+; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ]
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, align 4
+; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, i32 [[TMP22]]
+; CHECK-NEXT: call void @use_variables_1()
+; CHECK-NEXT: store i8 7, ptr addrspace(3) [[TMP23]], align 1
+; CHECK-NEXT: br label [[CONDFREE:%.*]]
+; CHECK: CondFree:
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: br i1 [[XYZCOND]], label [[FREE:%.*]], label [[END:%.*]]
+; CHECK: Free:
+; CHECK-NEXT: [[TMP24:%.*]] = load ptr, ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, align 8
+; CHECK-NEXT: call void @free(ptr [[TMP24]])
+; CHECK-NEXT: br label [[END]]
+; CHECK: End:
+; CHECK-NEXT: ret void
+;
+ call void @use_variables_1()
+ store i8 7, ptr addrspace(3) @lds_1, align 1
+ ret void
+}
+
+define amdgpu_kernel void @k1() {
+; CHECK-LABEL: define amdgpu_kernel void @k1(
+; CHECK-SAME: ) !llvm.amdgcn.lds.kernel.id [[META1:![0-9]+]] {
+; CHECK-NEXT: WId:
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z()
+; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
+; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP14:%.*]]
+; CHECK: Malloc:
+; CHECK-NEXT: [[TMP6:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds ptr addrspace(4), ptr addrspace(4) [[TMP6]], i32 15
+; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr addrspace(4) [[TMP7]], align 8
+; CHECK-NEXT: store i64 16, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K1_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k1.md, i32 0, i32 3, i32 0), align 8
+; CHECK-NEXT: store i64 [[TMP8]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K1_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k1.md, i32 0, i32 3, i32 1), align 8
+; CHECK-NEXT: [[TMP9:%.*]] = add i64 16, [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[TMP9]], 7
+; CHECK-NEXT: [[TMP11:%.*]] = udiv i64 [[TMP10]], 8
+; CHECK-NEXT: [[TMP26:%.*]] = mul i64 [[TMP11]], 8
+; CHECK-NEXT: [[TMP27:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds ptr addrspace(4), ptr addrspace(4) [[TMP27]], i32 15
+; CHECK-NEXT: [[TMP29:%.*]] = load i64, ptr addrspace(4) [[TMP28]], align 8
+; CHECK-NEXT: store i64 [[TMP26]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K1_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k1.md, i32 0, i32 3, i32 0), align 8
+; CHECK-NEXT: store i64 [[TMP29]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K1_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k1.md, i32 0, i32 3, i32 1), align 8
+; CHECK-NEXT: [[TMP30:%.*]] = add i64 [[TMP26]], [[TMP29]]
+; CHECK-NEXT: [[TMP33:%.*]] = add i64 [[TMP30]], 7
+; CHECK-NEXT: [[TMP18:%.*]] = udiv i64 [[TMP33]], 8
+; CHECK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 8
+; CHECK-NEXT: [[TMP20:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds ptr addrspace(4), ptr addrspace(4) [[TMP20]], i32 15
+; CHECK-NEXT: [[TMP22:%.*]] = load i64, ptr addrspace(4) [[TMP21]], align 8
+; CHECK-NEXT: store i64 [[TMP19]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K1_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k1.md, i32 0, i32 4, i32 0), align 8
+; CHECK-NEXT: store i64 [[TMP22]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K1_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k1.md, i32 0, i32 4, i32 1), align 8
+; CHECK-NEXT: [[TMP23:%.*]] = add i64 [[TMP19]], [[TMP22]]
+; CHECK-NEXT: [[TMP24:%.*]] = add i64 [[TMP23]], 7
+; CHECK-NEXT: [[TMP25:%.*]] = udiv i64 [[TMP24]], 8
+; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP25]], 8
+; CHECK-NEXT: [[TMP13:%.*]] = call ptr addrspace(1) @malloc(i64 [[TMP12]])
+; CHECK-NEXT: store ptr addrspace(1) [[TMP13]], ptr addrspace(3) @llvm.amdgcn.sw.lds.k1, align 8
+; CHECK-NEXT: br label [[TMP14]]
+; CHECK: 28:
+; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ]
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K1_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k1.md, i32 0, i32 3, i32 0), align 4
+; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) @llvm.amdgcn.sw.lds.k1, i32 [[TMP15]]
+; CHECK-NEXT: [[TMP31:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K1_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k1.md, i32 0, i32 3, i32 0), align 4
+; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) @llvm.amdgcn.sw.lds.k1, i32 [[TMP31]]
+; CHECK-NEXT: call void @use_variables_1()
+; CHECK-NEXT: call void @use_variables_2()
+; CHECK-NEXT: store i8 3, ptr addrspace(3) [[TMP16]], align 4
+; CHECK-NEXT: br label [[CONDFREE:%.*]]
+; CHECK: CondFree:
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: br i1 [[XYZCOND]], label [[FREE:%.*]], label [[END:%.*]]
+; CHECK: Free:
+; CHECK-NEXT: [[TMP17:%.*]] = load ptr, ptr addrspace(3) @llvm.amdgcn.sw.lds.k1, align 8
+; CHECK-NEXT: call void @free(ptr [[TMP17]])
+; CHECK-NEXT: br label [[END]]
+; CHECK: End:
+; CHECK-NEXT: ret void
+;
+ call void @use_variables_1()
+ call void @use_variables_2()
+ store i8 3, ptr addrspace(3) @lds_3, align 4
+ ret void
+}
+;.
+; CHECK: [[META0]] = !{i32 0}
+; CHECK: [[META1]] = !{i32 1}
+;.
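[The add/udiv/mul triples in the Malloc blocks above are an align-up: each
size is rounded to the next multiple of the alignment (8 in these tests)
before being folded into the total passed to @malloc. Equivalent host-side
arithmetic, as a sketch:]

  #include <cstdint>

  // Round Size up to a multiple of Align; mirrors the
  //   add Align-1 / udiv Align / mul Align
  // sequence the pass emits.
  std::uint64_t alignUp(std::uint64_t Size, std::uint64_t Align) {
    return (Size + Align - 1) / Align * Align;
  }
  // e.g. TotalMallocSize = alignUp(StaticOffset + DynamicSize, 8);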
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-multiple-blocks-return.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-multiple-blocks-return.ll
new file mode 100644
index 00000000000000..9b92999392c388
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-multiple-blocks-return.ll
@@ -0,0 +1,79 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt < %s -passes=amdgpu-sw-lower-lds -S -mtriple=amdgcn-- | FileCheck %s
+
+; Test to check that the malloc and free blocks are placed correctly, and LDS
+; accesses are lowered correctly, when the function contains multiple basic blocks and branching.
+
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
+target triple = "amdgcn-unknown-unknown"
+
+@lds_1 = internal unnamed_addr addrspace(3) global i32 poison
+@lds_2 = internal unnamed_addr addrspace(3) global i32 poison
+
+define amdgpu_kernel void @test_kernel() {
+; CHECK-LABEL: define amdgpu_kernel void @test_kernel() {
+; CHECK-NEXT: WId:
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z()
+; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
+; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP7:%.*]]
+; CHECK: Malloc:
+; CHECK-NEXT: [[TMP6:%.*]] = call ptr addrspace(1) @malloc(i64 8)
+; CHECK-NEXT: store ptr addrspace(1) [[TMP6]], ptr addrspace(3) @llvm.amdgcn.sw.lds.test_kernel, align 8
+; CHECK-NEXT: br label [[TMP7]]
+; CHECK: 7:
+; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ]
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(1) @llvm.amdgcn.sw.lds.test_kernel.md, align 4
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) @llvm.amdgcn.sw.lds.test_kernel, i32 [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_TEST_KERNEL_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.test_kernel.md, i32 0, i32 1, i32 0), align 4
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) @llvm.amdgcn.sw.lds.test_kernel, i32 [[TMP10]]
+; CHECK-NEXT: [[TMP12:%.*]] = addrspacecast ptr addrspace(3) [[TMP9]] to ptr addrspace(1)
+; CHECK-NEXT: [[VAL1:%.*]] = load i32, ptr addrspace(1) [[TMP12]], align 4
+; CHECK-NEXT: [[TMP13:%.*]] = addrspacecast ptr addrspace(3) [[TMP11]] to ptr addrspace(1)
+; CHECK-NEXT: [[VAL2:%.*]] = load i32, ptr addrspace(1) [[TMP13]], align 4
+; CHECK-NEXT: [[RESULT:%.*]] = add i32 [[VAL1]], [[VAL2]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[RESULT]], 0
+; CHECK-NEXT: br i1 [[CMP]], label [[POSITIVE:%.*]], label [[NEGATIVE:%.*]]
+; CHECK: positive:
+; CHECK-NEXT: br label [[CONDFREE:%.*]]
+; CHECK: negative:
+; CHECK-NEXT: [[CMP2:%.*]] = icmp sgt i32 [[VAL1]], 0
+; CHECK-NEXT: br i1 [[CMP2]], label [[VAL1_POSITIVE:%.*]], label [[VAL1_NEGATIVE:%.*]]
+; CHECK: val1_positive:
+; CHECK-NEXT: br label [[CONDFREE]]
+; CHECK: val1_negative:
+; CHECK-NEXT: br label [[CONDFREE]]
+; CHECK: CondFree:
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: br i1 [[XYZCOND]], label [[FREE:%.*]], label [[END:%.*]]
+; CHECK: Free:
+; CHECK-NEXT: [[TMP14:%.*]] = load ptr, ptr addrspace(3) @llvm.amdgcn.sw.lds.test_kernel, align 8
+; CHECK-NEXT: call void @free(ptr [[TMP14]])
+; CHECK-NEXT: br label [[END]]
+; CHECK: End:
+; CHECK-NEXT: ret void
+;
+ %val1 = load i32, ptr addrspace(1) addrspacecast (ptr addrspace(3) @lds_1 to ptr addrspace(1))
+ %val2 = load i32, ptr addrspace(1) addrspacecast (ptr addrspace(3) @lds_2 to ptr addrspace(1))
+
+ %result = add i32 %val1, %val2
+ %cmp = icmp sgt i32 %result, 0
+ br i1 %cmp, label %positive, label %negative
+
+positive:
+ ret void
+
+negative:
+ %cmp2 = icmp sgt i32 %val1, 0
+ br i1 %cmp2, label %val1_positive, label %val1_negative
+
+val1_positive:
+ ret void
+
+val1_negative:
+ ret void
+}
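[What this test pins down: every original `ret` is funneled into a shared
epilogue so the buffer is freed exactly once, on the same workitem that
allocated it. A rough scalar model of the emitted control-flow shape, not
the pass's actual output:]

  #include <cstdlib>

  // One workitem allocates; all workitems synchronize; the allocating
  // workitem frees on the way out, whichever return path the body took.
  void kernelShape(bool IsFirstWorkitem) {
    void *Buf = nullptr;
    if (IsFirstWorkitem) // "Malloc" block
      Buf = std::malloc(8);
    // barrier(); kernel body; every return branches to the epilogue.
    // "CondFree" block: barrier();
    if (IsFirstWorkitem) // "Free" block
      std::free(Buf);
    // "End" block: return.
  }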
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-dynamic-indirect-access.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-dynamic-indirect-access.ll
new file mode 100644
index 00000000000000..c7e548f031cbd7
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-dynamic-indirect-access.ll
@@ -0,0 +1,101 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt < %s -passes=amdgpu-sw-lower-lds -S -mtriple=amdgcn-- | FileCheck %s
+
+; Test to check that static and dynamic LDS accesses are lowered correctly when a non-kernel
+; is called from a kernel.
+
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
+target triple = "amdgcn-unknown-unknown"
+@lds_1 = internal unnamed_addr addrspace(3) global [1 x i8] poison, align 1
+@lds_2 = internal unnamed_addr addrspace(3) global [1 x i32] poison, align 2
+@lds_3 = external unnamed_addr addrspace(3) global [0 x i8], align 4
+@lds_4 = external unnamed_addr addrspace(3) global [0 x i8], align 8
+
+define void @use_variables() {
+; CHECK-LABEL: define void @use_variables() {
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id()
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x i32], ptr addrspace(4) @llvm.amdgcn.sw.lds.base.table, i32 0, i32 [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[TMP2]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i32 [[TMP3]] to ptr addrspace(3)
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1 x [2 x i32]], ptr addrspace(4) @llvm.amdgcn.sw.lds.offset.table, i32 0, i32 [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(4) [[TMP5]], align 4
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i32 [[TMP6]] to ptr addrspace(3)
+; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(3) [[TMP7]], align 4
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) [[TMP4]], i32 [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i32 [[TMP3]] to ptr addrspace(3)
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1 x [2 x i32]], ptr addrspace(4) @llvm.amdgcn.sw.lds.offset.table, i32 0, i32 [[TMP1]], i32 1
+; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr addrspace(4) [[TMP11]], align 4
+; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i32 [[TMP12]] to ptr addrspace(3)
+; CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr addrspace(3) [[TMP13]], align 4
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) [[TMP10]], i32 [[TMP14]]
+; CHECK-NEXT: store i8 3, ptr addrspace(3) [[TMP9]], align 4
+; CHECK-NEXT: store i8 3, ptr addrspace(3) [[TMP15]], align 8
+; CHECK-NEXT: ret void
+;
+ store i8 3, ptr addrspace(3) @lds_3, align 4
+ store i8 3, ptr addrspace(3) @lds_4, align 8
+ ret void
+}
+
+define amdgpu_kernel void @k0() {
+; CHECK-LABEL: define amdgpu_kernel void @k0(
+; CHECK-SAME: ) !llvm.amdgcn.lds.kernel.id [[META0:![0-9]+]] {
+; CHECK-NEXT: WId:
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z()
+; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
+; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP21:%.*]]
+; CHECK: Malloc:
+; CHECK-NEXT: [[TMP6:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds ptr addrspace(4), ptr addrspace(4) [[TMP6]], i32 15
+; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr addrspace(4) [[TMP7]], align 8
+; CHECK-NEXT: store i64 16, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 2, i32 0), align 8
+; CHECK-NEXT: store i64 [[TMP8]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 2, i32 1), align 8
+; CHECK-NEXT: [[TMP9:%.*]] = add i64 16, [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[TMP9]], 7
+; CHECK-NEXT: [[TMP11:%.*]] = udiv i64 [[TMP10]], 8
+; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 8
+; CHECK-NEXT: [[TMP13:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds ptr addrspace(4), ptr addrspace(4) [[TMP13]], i32 15
+; CHECK-NEXT: [[TMP15:%.*]] = load i64, ptr addrspace(4) [[TMP14]], align 8
+; CHECK-NEXT: store i64 [[TMP12]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 3, i32 0), align 8
+; CHECK-NEXT: store i64 [[TMP15]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 3, i32 1), align 8
+; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[TMP12]], [[TMP15]]
+; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[TMP16]], 7
+; CHECK-NEXT: [[TMP18:%.*]] = udiv i64 [[TMP17]], 8
+; CHECK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 8
+; CHECK-NEXT: [[TMP20:%.*]] = call ptr addrspace(1) @malloc(i64 [[TMP19]])
+; CHECK-NEXT: store ptr addrspace(1) [[TMP20]], ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, align 8
+; CHECK-NEXT: br label [[TMP21]]
+; CHECK: 21:
+; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ]
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, align 4
+; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, i32 [[TMP22]]
+; CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 1, i32 0), align 4
+; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, i32 [[TMP24]]
+; CHECK-NEXT: call void @use_variables()
+; CHECK-NEXT: store i8 7, ptr addrspace(3) [[TMP23]], align 1
+; CHECK-NEXT: store i32 8, ptr addrspace(3) [[TMP25]], align 2
+; CHECK-NEXT: br label [[CONDFREE:%.*]]
+; CHECK: CondFree:
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: br i1 [[XYZCOND]], label [[FREE:%.*]], label [[END:%.*]]
+; CHECK: Free:
+; CHECK-NEXT: [[TMP26:%.*]] = load ptr, ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, align 8
+; CHECK-NEXT: call void @free(ptr [[TMP26]])
+; CHECK-NEXT: br label [[END]]
+; CHECK: End:
+; CHECK-NEXT: ret void
+;
+ call void @use_variables()
+ store i8 7, ptr addrspace(3) @lds_1, align 1
+ store i32 8, ptr addrspace(3) @lds_2, align 2
+ ret void
+}
+;.
+; CHECK: [[META0]] = !{i32 0}
+;.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-dynamic-lds-test.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-dynamic-lds-test.ll
new file mode 100644
index 00000000000000..7b4c70a1fa1120
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-dynamic-lds-test.ll
@@ -0,0 +1,88 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 4
+; RUN: opt < %s -passes=amdgpu-sw-lower-lds -S -mtriple=amdgcn-- | FileCheck %s
+
+; Test to check that static and dynamic LDS accesses are lowered correctly in a kernel.
+
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
+target triple = "amdgcn-unknown-unknown"
+@lds_1 = internal unnamed_addr addrspace(3) global [1 x i8] poison, align 4
+@lds_2 = internal unnamed_addr addrspace(3) global [1 x i32] poison, align 8
+@lds_3 = external unnamed_addr addrspace(3) global [0 x i8], align 4
+@lds_4 = external unnamed_addr addrspace(3) global [0 x i8], align 8
+
+;.
+; CHECK: @lds_1 = internal unnamed_addr addrspace(3) global [1 x i8] poison, align 4
+; CHECK: @lds_2 = internal unnamed_addr addrspace(3) global [1 x i32] poison, align 8
+; CHECK: @lds_3 = external unnamed_addr addrspace(3) global [0 x i8], align 4
+; CHECK: @lds_4 = external unnamed_addr addrspace(3) global [0 x i8], align 8
+; CHECK: @llvm.amdgcn.sw.lds.k0 = internal addrspace(3) global ptr poison
+; CHECK: @llvm.amdgcn.sw.lds.k0.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.k0.md.type { %llvm.amdgcn.sw.lds.k0.md.item { i32 0, i32 1 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 8, i32 4 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 16, i32 0 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 16, i32 0 } }, no_sanitize_address, align 8
+;.
+define amdgpu_kernel void @k0() {
+; CHECK-LABEL: define amdgpu_kernel void @k0() {
+; CHECK-NEXT: WId:
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z()
+; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
+; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP21:%.*]]
+; CHECK: Malloc:
+; CHECK-NEXT: [[TMP6:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds ptr addrspace(4), ptr addrspace(4) [[TMP6]], i32 15
+; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr addrspace(4) [[TMP7]], align 8
+; CHECK-NEXT: store i64 16, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 2, i32 0), align 8
+; CHECK-NEXT: store i64 [[TMP8]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 2, i32 1), align 8
+; CHECK-NEXT: [[TMP9:%.*]] = add i64 16, [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[TMP9]], 7
+; CHECK-NEXT: [[TMP11:%.*]] = udiv i64 [[TMP10]], 8
+; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 8
+; CHECK-NEXT: [[TMP13:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds ptr addrspace(4), ptr addrspace(4) [[TMP13]], i32 15
+; CHECK-NEXT: [[TMP15:%.*]] = load i64, ptr addrspace(4) [[TMP14]], align 8
+; CHECK-NEXT: store i64 [[TMP12]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 3, i32 0), align 8
+; CHECK-NEXT: store i64 [[TMP15]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 3, i32 1), align 8
+; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[TMP12]], [[TMP15]]
+; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[TMP16]], 7
+; CHECK-NEXT: [[TMP18:%.*]] = udiv i64 [[TMP17]], 8
+; CHECK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 8
+; CHECK-NEXT: [[TMP20:%.*]] = call ptr addrspace(1) @malloc(i64 [[TMP19]])
+; CHECK-NEXT: store ptr addrspace(1) [[TMP20]], ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, align 8
+; CHECK-NEXT: br label [[TMP21]]
+; CHECK: 21:
+; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ]
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, align 4
+; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, i32 [[TMP22]]
+; CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 1, i32 0), align 4
+; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, i32 [[TMP24]]
+; CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 2, i32 0), align 4
+; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, i32 [[TMP26]]
+; CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 3, i32 0), align 4
+; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, i32 [[TMP28]]
+; CHECK-NEXT: store i8 7, ptr addrspace(3) [[TMP23]], align 4
+; CHECK-NEXT: store i32 8, ptr addrspace(3) [[TMP25]], align 8
+; CHECK-NEXT: store i8 7, ptr addrspace(3) [[TMP27]], align 4
+; CHECK-NEXT: store i8 8, ptr addrspace(3) [[TMP29]], align 8
+; CHECK-NEXT: br label [[CONDFREE:%.*]]
+; CHECK: CondFree:
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: br i1 [[XYZCOND]], label [[FREE:%.*]], label [[END:%.*]]
+; CHECK: Free:
+; CHECK-NEXT: [[TMP30:%.*]] = load ptr, ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, align 8
+; CHECK-NEXT: call void @free(ptr [[TMP30]])
+; CHECK-NEXT: br label [[END]]
+; CHECK: End:
+; CHECK-NEXT: ret void
+;
+ store i8 7, ptr addrspace(3) @lds_1, align 4
+ store i32 8, ptr addrspace(3) @lds_2, align 8
+ store i8 7, ptr addrspace(3) @lds_3, align 4
+ store i8 8, ptr addrspace(3) @lds_4, align 8
+ ret void
+}
+;.
+; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+; CHECK: attributes #[[ATTR1:[0-9]+]] = { convergent nocallback nofree nounwind willreturn }
+;.
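[The @llvm.amdgcn.sw.lds.k0.md initializer checked above spells out the
per-kernel metadata layout: one { offset, size } pair per LDS variable, in
increasing-offset order, with size 0 marking dynamic LDS. A hypothetical C
mirror, with field meaning inferred from the initializer
{ {0,1}, {8,4}, {16,0}, {16,0} }:]

  #include <cstdint>

  struct MDItem {
    std::uint32_t Offset; // byte offset into the malloc'd buffer
    std::uint32_t Size;   // static size; 0 for dynamic LDS entries
  };
  struct KernelMD {  // mirrors %llvm.amdgcn.sw.lds.k0.md.type
    MDItem Items[4]; // lds_1, lds_2, lds_3, lds_4
  };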
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-function-param.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-function-param.ll
new file mode 100644
index 00000000000000..c5a9ddde58504a
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-function-param.ll
@@ -0,0 +1,61 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt < %s -passes=amdgpu-sw-lower-lds -S -mtriple=amdgcn-- | FileCheck %s
+
+; Test to check that LDS accesses are lowered correctly when an LDS pointer is passed as a
+; function argument to a non-kernel.
+
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
+target triple = "amdgcn-unknown-unknown"
+
+@lds_var = internal addrspace(3) global [1024 x i32] poison, align 4
+
+define void @my_function(ptr addrspace(3) %lds_arg) {
+; CHECK-LABEL: define void @my_function(
+; CHECK-SAME: ptr addrspace(3) [[LDS_ARG:%.*]]) {
+; CHECK-NEXT: [[LDS_VAL:%.*]] = load i32, ptr addrspace(3) [[LDS_ARG]], align 4
+; CHECK-NEXT: [[NEW_LDS_VAL:%.*]] = add i32 [[LDS_VAL]], 1
+; CHECK-NEXT: store i32 [[NEW_LDS_VAL]], ptr addrspace(3) [[LDS_ARG]], align 4
+; CHECK-NEXT: ret void
+;
+ %lds_val = load i32, ptr addrspace(3) %lds_arg, align 4
+ %new_lds_val = add i32 %lds_val, 1
+ store i32 %new_lds_val, ptr addrspace(3) %lds_arg, align 4
+ ret void
+}
+
+define amdgpu_kernel void @my_kernel() {
+; CHECK-LABEL: define amdgpu_kernel void @my_kernel() {
+; CHECK-NEXT: WId:
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z()
+; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
+; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP7:%.*]]
+; CHECK: Malloc:
+; CHECK-NEXT: [[TMP6:%.*]] = call ptr addrspace(1) @malloc(i64 4096)
+; CHECK-NEXT: store ptr addrspace(1) [[TMP6]], ptr addrspace(3) @llvm.amdgcn.sw.lds.my_kernel, align 8
+; CHECK-NEXT: br label [[TMP7]]
+; CHECK: 7:
+; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ]
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(1) @llvm.amdgcn.sw.lds.my_kernel.md, align 4
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) @llvm.amdgcn.sw.lds.my_kernel, i32 [[TMP8]]
+; CHECK-NEXT: [[LDS_PTR:%.*]] = getelementptr [1024 x i32], ptr addrspace(3) [[TMP9]], i32 0, i32 0
+; CHECK-NEXT: call void @my_function(ptr addrspace(3) [[LDS_PTR]])
+; CHECK-NEXT: br label [[CONDFREE:%.*]]
+; CHECK: CondFree:
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: br i1 [[XYZCOND]], label [[FREE:%.*]], label [[END:%.*]]
+; CHECK: Free:
+; CHECK-NEXT: [[TMP10:%.*]] = load ptr, ptr addrspace(3) @llvm.amdgcn.sw.lds.my_kernel, align 8
+; CHECK-NEXT: call void @free(ptr [[TMP10]])
+; CHECK-NEXT: br label [[END]]
+; CHECK: End:
+; CHECK-NEXT: ret void
+;
+ %lds_ptr = getelementptr [1024 x i32], ptr addrspace(3) @lds_var, i32 0, i32 0
+ call void @my_function(ptr addrspace(3) %lds_ptr)
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-nested.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-nested.ll
new file mode 100644
index 00000000000000..0a34427dd08f8f
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-nested.ll
@@ -0,0 +1,212 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt < %s -passes=amdgpu-sw-lower-lds -S -mtriple=amdgcn-- | FileCheck %s
+
+; Test to check that LDS accesses are lowered correctly when a call is made to a nested non-kernel.
+
+@A = external addrspace(3) global [8 x ptr]
+@B = external addrspace(3) global [0 x i32]
+
+define amdgpu_kernel void @kernel_0() {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_0(
+; CHECK-SAME: ) !llvm.amdgcn.lds.kernel.id [[META0:![0-9]+]] {
+; CHECK-NEXT: WId:
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z()
+; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
+; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP7:%.*]]
+; CHECK: Malloc:
+; CHECK-NEXT: [[TMP6:%.*]] = call ptr addrspace(1) @malloc(i64 64)
+; CHECK-NEXT: store ptr addrspace(1) [[TMP6]], ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_0, align 8
+; CHECK-NEXT: br label [[TMP7]]
+; CHECK: 7:
+; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ]
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: call void @call_store_A()
+; CHECK-NEXT: br label [[CONDFREE:%.*]]
+; CHECK: CondFree:
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: br i1 [[XYZCOND]], label [[FREE:%.*]], label [[END:%.*]]
+; CHECK: Free:
+; CHECK-NEXT: [[TMP8:%.*]] = load ptr, ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_0, align 8
+; CHECK-NEXT: call void @free(ptr [[TMP8]])
+; CHECK-NEXT: br label [[END]]
+; CHECK: End:
+; CHECK-NEXT: ret void
+;
+ call void @call_store_A()
+ ret void
+}
+
+define amdgpu_kernel void @kernel_1() {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_1(
+; CHECK-SAME: ) !llvm.amdgcn.lds.kernel.id [[META1:![0-9]+]] {
+; CHECK-NEXT: WId:
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z()
+; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
+; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP14:%.*]]
+; CHECK: Malloc:
+; CHECK-NEXT: [[TMP6:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds ptr addrspace(4), ptr addrspace(4) [[TMP6]], i32 15
+; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr addrspace(4) [[TMP7]], align 8
+; CHECK-NEXT: store i64 0, ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_1.md, align 8
+; CHECK-NEXT: store i64 [[TMP8]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_1_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_1.md, i32 0, i32 0, i32 1), align 8
+; CHECK-NEXT: [[TMP9:%.*]] = add i64 0, [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[TMP9]], 3
+; CHECK-NEXT: [[TMP11:%.*]] = udiv i64 [[TMP10]], 4
+; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 4
+; CHECK-NEXT: [[TMP13:%.*]] = call ptr addrspace(1) @malloc(i64 [[TMP12]])
+; CHECK-NEXT: store ptr addrspace(1) [[TMP13]], ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_1, align 8
+; CHECK-NEXT: br label [[TMP14]]
+; CHECK: 14:
+; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ]
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: [[PTR:%.*]] = call ptr @get_B_ptr()
+; CHECK-NEXT: br label [[CONDFREE:%.*]]
+; CHECK: CondFree:
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: br i1 [[XYZCOND]], label [[FREE:%.*]], label [[END:%.*]]
+; CHECK: Free:
+; CHECK-NEXT: [[TMP15:%.*]] = load ptr, ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_1, align 8
+; CHECK-NEXT: call void @free(ptr [[TMP15]])
+; CHECK-NEXT: br label [[END]]
+; CHECK: End:
+; CHECK-NEXT: ret void
+;
+ %ptr = call ptr @get_B_ptr()
+ ret void
+}
+
+define amdgpu_kernel void @kernel_2() {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_2(
+; CHECK-SAME: ) !llvm.amdgcn.lds.kernel.id [[META2:![0-9]+]] {
+; CHECK-NEXT: WId:
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z()
+; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
+; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP7:%.*]]
+; CHECK: Malloc:
+; CHECK-NEXT: [[TMP6:%.*]] = call ptr addrspace(1) @malloc(i64 64)
+; CHECK-NEXT: store ptr addrspace(1) [[TMP6]], ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_2, align 8
+; CHECK-NEXT: br label [[TMP7]]
+; CHECK: 7:
+; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ]
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: call void @store_A()
+; CHECK-NEXT: br label [[CONDFREE:%.*]]
+; CHECK: CondFree:
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: br i1 [[XYZCOND]], label [[FREE:%.*]], label [[END:%.*]]
+; CHECK: Free:
+; CHECK-NEXT: [[TMP8:%.*]] = load ptr, ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_2, align 8
+; CHECK-NEXT: call void @free(ptr [[TMP8]])
+; CHECK-NEXT: br label [[END]]
+; CHECK: End:
+; CHECK-NEXT: ret void
+;
+ call void @store_A()
+ ret void
+}
+
+define amdgpu_kernel void @kernel_3() {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_3(
+; CHECK-SAME: ) !llvm.amdgcn.lds.kernel.id [[META3:![0-9]+]] {
+; CHECK-NEXT: WId:
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z()
+; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
+; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP14:%.*]]
+; CHECK: Malloc:
+; CHECK-NEXT: [[TMP6:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds ptr addrspace(4), ptr addrspace(4) [[TMP6]], i32 15
+; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr addrspace(4) [[TMP7]], align 8
+; CHECK-NEXT: store i64 0, ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_3.md, align 8
+; CHECK-NEXT: store i64 [[TMP8]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_3_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_3.md, i32 0, i32 0, i32 1), align 8
+; CHECK-NEXT: [[TMP9:%.*]] = add i64 0, [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[TMP9]], 3
+; CHECK-NEXT: [[TMP11:%.*]] = udiv i64 [[TMP10]], 4
+; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 4
+; CHECK-NEXT: [[TMP13:%.*]] = call ptr addrspace(1) @malloc(i64 [[TMP12]])
+; CHECK-NEXT: store ptr addrspace(1) [[TMP13]], ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_3, align 8
+; CHECK-NEXT: br label [[TMP14]]
+; CHECK: 14:
+; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ]
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: [[PTR:%.*]] = call ptr @get_B_ptr()
+; CHECK-NEXT: br label [[CONDFREE:%.*]]
+; CHECK: CondFree:
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: br i1 [[XYZCOND]], label [[FREE:%.*]], label [[END:%.*]]
+; CHECK: Free:
+; CHECK-NEXT: [[TMP15:%.*]] = load ptr, ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_3, align 8
+; CHECK-NEXT: call void @free(ptr [[TMP15]])
+; CHECK-NEXT: br label [[END]]
+; CHECK: End:
+; CHECK-NEXT: ret void
+;
+ %ptr = call ptr @get_B_ptr()
+ ret void
+}
+
+define private void @call_store_A() {
+; CHECK-LABEL: define private void @call_store_A() {
+; CHECK-NEXT: call void @store_A()
+; CHECK-NEXT: ret void
+;
+ call void @store_A()
+ ret void
+}
+
+define private void @store_A() {
+; CHECK-LABEL: define private void @store_A() {
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id()
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [4 x i32], ptr addrspace(4) @llvm.amdgcn.sw.lds.base.table, i32 0, i32 [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[TMP2]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i32 [[TMP3]] to ptr addrspace(3)
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [4 x [2 x i32]], ptr addrspace(4) @llvm.amdgcn.sw.lds.offset.table, i32 0, i32 [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(4) [[TMP5]], align 4
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i32 [[TMP6]] to ptr addrspace(3)
+; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(3) [[TMP7]], align 4
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) [[TMP4]], i32 [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = addrspacecast ptr addrspace(3) [[TMP9]] to ptr
+; CHECK-NEXT: store ptr [[TMP10]], ptr null, align 8
+; CHECK-NEXT: ret void
+;
+ store ptr addrspacecast (ptr addrspace(3) @A to ptr), ptr null
+ ret void
+}
+
+define private ptr @get_B_ptr() {
+; CHECK-LABEL: define private ptr @get_B_ptr() {
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id()
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [4 x i32], ptr addrspace(4) @llvm.amdgcn.sw.lds.base.table, i32 0, i32 [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[TMP2]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i32 [[TMP3]] to ptr addrspace(3)
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [4 x [2 x i32]], ptr addrspace(4) @llvm.amdgcn.sw.lds.offset.table, i32 0, i32 [[TMP1]], i32 1
+; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(4) [[TMP5]], align 4
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i32 [[TMP6]] to ptr addrspace(3)
+; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(3) [[TMP7]], align 4
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) [[TMP4]], i32 [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = addrspacecast ptr addrspace(3) [[TMP9]] to ptr
+; CHECK-NEXT: ret ptr [[TMP10]]
+;
+ ret ptr addrspacecast (ptr addrspace(3) @B to ptr)
+}
+;.
+; CHECK: [[META0]] = !{i32 0}
+; CHECK: [[META1]] = !{i32 1}
+; CHECK: [[META2]] = !{i32 2}
+; CHECK: [[META3]] = !{i32 3}
+;.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access.ll
new file mode 100644
index 00000000000000..535ee62bee00d6
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access.ll
@@ -0,0 +1,84 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt < %s -passes=amdgpu-sw-lower-lds -S -mtriple=amdgcn-- | FileCheck %s
+
+; Test to check that static LDS is lowered correctly when a non-kernel with LDS accesses is called from a kernel.
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
+target triple = "amdgcn-unknown-unknown"
+@lds_1 = internal unnamed_addr addrspace(3) global [1 x i8] poison, align 1
+@lds_2 = internal unnamed_addr addrspace(3) global [1 x i32] poison, align 2
+@lds_3 = external unnamed_addr addrspace(3) global [3 x i8], align 4
+@lds_4 = external unnamed_addr addrspace(3) global [4 x i8], align 8
+
+define void @use_variables() {
+; CHECK-LABEL: define void @use_variables() {
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id()
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x i32], ptr addrspace(4) @llvm.amdgcn.sw.lds.base.table, i32 0, i32 [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[TMP2]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i32 [[TMP3]] to ptr addrspace(3)
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1 x [2 x i32]], ptr addrspace(4) @llvm.amdgcn.sw.lds.offset.table, i32 0, i32 [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(4) [[TMP5]], align 4
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i32 [[TMP6]] to ptr addrspace(3)
+; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(3) [[TMP7]], align 4
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) [[TMP4]], i32 [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i32 [[TMP3]] to ptr addrspace(3)
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1 x [2 x i32]], ptr addrspace(4) @llvm.amdgcn.sw.lds.offset.table, i32 0, i32 [[TMP1]], i32 1
+; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr addrspace(4) [[TMP11]], align 4
+; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i32 [[TMP12]] to ptr addrspace(3)
+; CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr addrspace(3) [[TMP13]], align 4
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) [[TMP10]], i32 [[TMP14]]
+; CHECK-NEXT: [[X:%.*]] = addrspacecast ptr addrspace(3) [[TMP9]] to ptr
+; CHECK-NEXT: [[TMP16:%.*]] = addrspacecast ptr addrspace(3) [[TMP9]] to ptr
+; CHECK-NEXT: store i8 3, ptr [[TMP16]], align 4
+; CHECK-NEXT: store i8 3, ptr addrspace(3) [[TMP15]], align 8
+; CHECK-NEXT: ret void
+;
+ %X = addrspacecast ptr addrspace(3) @lds_3 to ptr
+ store i8 3, ptr addrspacecast( ptr addrspace(3) @lds_3 to ptr), align 4
+ store i8 3, ptr addrspace(3) @lds_4, align 8
+ ret void
+}
+
+define amdgpu_kernel void @k0() {
+; CHECK-LABEL: define amdgpu_kernel void @k0(
+; CHECK-SAME: ) !llvm.amdgcn.lds.kernel.id [[META0:![0-9]+]] {
+; CHECK-NEXT: WId:
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z()
+; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
+; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP7:%.*]]
+; CHECK: Malloc:
+; CHECK-NEXT: [[TMP6:%.*]] = call ptr addrspace(1) @malloc(i64 32)
+; CHECK-NEXT: store ptr addrspace(1) [[TMP6]], ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, align 8
+; CHECK-NEXT: br label [[TMP7]]
+; CHECK: 7:
+; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ]
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, align 4
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, i32 [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 1, i32 0), align 4
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, i32 [[TMP10]]
+; CHECK-NEXT: call void @use_variables()
+; CHECK-NEXT: store i8 7, ptr addrspace(3) [[TMP9]], align 1
+; CHECK-NEXT: store i32 8, ptr addrspace(3) [[TMP11]], align 2
+; CHECK-NEXT: br label [[CONDFREE:%.*]]
+; CHECK: CondFree:
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: br i1 [[XYZCOND]], label [[FREE:%.*]], label [[END:%.*]]
+; CHECK: Free:
+; CHECK-NEXT: [[TMP12:%.*]] = load ptr, ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, align 8
+; CHECK-NEXT: call void @free(ptr [[TMP12]])
+; CHECK-NEXT: br label [[END]]
+; CHECK: End:
+; CHECK-NEXT: ret void
+;
+ call void @use_variables()
+ store i8 7, ptr addrspace(3) @lds_1, align 1
+ store i32 8, ptr addrspace(3) @lds_2, align 2
+ ret void
+}
+;.
+; CHECK: [[META0]] = !{i32 0}
+;.
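The WId block checked above implements the allocation guard: the pass ORs the x, y and z workitem ids and compares the result with zero, so exactly one workitem per workgroup takes the Malloc path while the rest wait at the barrier. A minimal standalone model of that guard (illustrative only, not pass code):

  #include <cstdint>

  // (x | y | z) == 0 is equivalent to x == 0 && y == 0 && z == 0 for
  // unsigned ids, so only workitem (0,0,0) performs the malloc.
  bool isFirstWorkitem(uint32_t IdX, uint32_t IdY, uint32_t IdZ) {
    return (IdX | IdY | IdZ) == 0;
  }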
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-lds-test.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-lds-test.ll
new file mode 100644
index 00000000000000..5904ab8167dd12
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-lds-test.ll
@@ -0,0 +1,58 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 4
+; RUN: opt < %s -passes=amdgpu-sw-lower-lds -S -mtriple=amdgcn-- | FileCheck %s
+
+; Test to check if static LDS accesses in a kernel are lowered correctly.
+
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
+target triple = "amdgcn-unknown-unknown"
+@lds_1 = internal unnamed_addr addrspace(3) global [1 x i8] poison, align 4
+@lds_2 = internal unnamed_addr addrspace(3) global [1 x i32] poison, align 8
+
+;.
+; CHECK: @lds_1 = internal unnamed_addr addrspace(3) global [1 x i8] poison, align 4
+; CHECK: @lds_2 = internal unnamed_addr addrspace(3) global [1 x i32] poison, align 8
+; CHECK: @llvm.amdgcn.sw.lds.k0 = internal addrspace(3) global ptr poison
+; CHECK: @llvm.amdgcn.sw.lds.k0.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.k0.md.type { %llvm.amdgcn.sw.lds.k0.md.item { i32 0, i32 1 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 8, i32 4 } }, no_sanitize_address, align 8
+;.
+define amdgpu_kernel void @k0() {
+; CHECK-LABEL: define amdgpu_kernel void @k0() {
+; CHECK-NEXT: WId:
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z()
+; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
+; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP7:%.*]]
+; CHECK: Malloc:
+; CHECK-NEXT: [[TMP6:%.*]] = call ptr addrspace(1) @malloc(i64 16)
+; CHECK-NEXT: store ptr addrspace(1) [[TMP6]], ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, align 8
+; CHECK-NEXT: br label [[TMP7]]
+; CHECK: 7:
+; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ]
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, align 4
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, i32 [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 1, i32 0), align 4
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, i32 [[TMP10]]
+; CHECK-NEXT: store i8 7, ptr addrspace(3) [[TMP9]], align 4
+; CHECK-NEXT: store i32 8, ptr addrspace(3) [[TMP11]], align 2
+; CHECK-NEXT: br label [[CONDFREE:%.*]]
+; CHECK: CondFree:
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: br i1 [[XYZCOND]], label [[FREE:%.*]], label [[END:%.*]]
+; CHECK: Free:
+; CHECK-NEXT: [[TMP12:%.*]] = load ptr, ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, align 8
+; CHECK-NEXT: call void @free(ptr [[TMP12]])
+; CHECK-NEXT: br label [[END]]
+; CHECK: End:
+; CHECK-NEXT: ret void
+;
+ store i8 7, ptr addrspace(3) @lds_1, align 4
+ store i32 8, ptr addrspace(3) @lds_2, align 2
+ ret void
+}
+;.
+; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+; CHECK: attributes #[[ATTR1:[0-9]+]] = { convergent nocallback nofree nounwind willreturn }
+;.
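The metadata initializer checked above packs one {StartOffset, SizeInBytes} pair per LDS variable. A rough C++ model of this particular test's layout (struct and field names are illustrative, not emitted by the pass): lds_1 has size 1 and starts at offset 0; each entry is padded to the maximum alignment (8 here), so lds_2 starts at offset 8 and the kernel mallocs 16 bytes in total.

  #include <cstdint>

  // Mirrors { { i32 0, i32 1 }, { i32 8, i32 4 } } from the CHECK line.
  struct LDSItem { uint32_t StartOffset, SizeInBytes; };
  struct K0Metadata { LDSItem Lds1, Lds2; };
  static const K0Metadata K0MD = {{0, 1}, {8, 4}};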
>From 486e32fd50f832bab3389840928ea9a87527c1cb Mon Sep 17 00:00:00 2001
From: skc7 <Krishna.Sankisa at amd.com>
Date: Fri, 5 Apr 2024 17:22:25 +0530
Subject: [PATCH 02/11] [AMDGPU] Update patch based on review comments:1
---
llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp | 365 ++++++++----------
...pu-sw-lower-lds-dynamic-indirect-access.ll | 16 +-
.../amdgpu-sw-lower-lds-dynamic-lds-test.ll | 44 ++-
...ds-multi-static-dynamic-indirect-access.ll | 27 +-
...gpu-sw-lower-lds-multiple-blocks-return.ll | 7 +-
...ower-lds-static-dynamic-indirect-access.ll | 17 +-
...pu-sw-lower-lds-static-dynamic-lds-test.ll | 25 +-
...s-static-indirect-access-function-param.ll | 3 -
...gpu-sw-lower-lds-static-indirect-access.ll | 10 +-
.../amdgpu-sw-lower-lds-static-lds-test.ll | 11 +-
10 files changed, 236 insertions(+), 289 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp
index ed3670fa1386d6..6de02b178af9a7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp
@@ -19,7 +19,7 @@
// "malloc LDS global" in this pass.
// Each LDS access corresponds to an offset in the allocated memory.
// All static LDS accesses will be allocated first and then dynamic LDS
-// will occupy the device global memoery.
+// will occupy the device global memory.
// To store the offsets corresponding to all LDS accesses, another global
// variable is created which will be called "metadata global" in this pass.
// - Malloc LDS Global:
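A one-line model of the addressing scheme this comment block describes, under the simplifying assumption that the per-variable offset has already been loaded from the metadata global (names are illustrative):

  #include <cstdint>

  // Every former LDS access becomes: malloc'ed base + per-variable offset.
  uint8_t *loweredLDSAddress(uint8_t *MallocBase, uint32_t StartOffset) {
    return MallocBase + StartOffset;
  }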
@@ -76,6 +76,7 @@
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/SetOperations.h"
#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/DomTreeUpdater.h"
@@ -87,6 +88,7 @@
#include "llvm/IR/ReplaceConstant.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"
#include <algorithm>
@@ -132,38 +134,35 @@ class AMDGPUSwLowerLDS {
public:
AMDGPUSwLowerLDS(Module &mod, DomTreeCallback Callback)
: M(mod), IRB(M.getContext()), DTCallback(Callback) {}
- bool Run();
- void GetUsesOfLDSByNonKernels(CallGraph const &CG,
+ bool run();
+ void getUsesOfLDSByNonKernels(CallGraph const &CG,
FunctionVariableMap &functions);
SetVector<Function *>
- GetOrderedIndirectLDSAccessingKernels(SetVector<Function *> &&Kernels);
+ getOrderedIndirectLDSAccessingKernels(SetVector<Function *> &&Kernels);
SetVector<GlobalVariable *>
- GetOrderedNonKernelAllLDSGlobals(SetVector<GlobalVariable *> &&Variables);
- void PopulateMallocLDSGlobal(Function *Func);
- void PopulateMallocMetadataGlobal(Function *Func);
- void PopulateLDSToReplacementIndicesMap(Function *Func);
- void ReplaceKernelLDSAccesses(Function *Func);
- void LowerKernelLDSAccesses(Function *Func, DomTreeUpdater &DTU);
- void BuildNonKernelLDSOffsetTable(
- std::shared_ptr<NonKernelLDSParameters> &NKLDSParams);
- void BuildNonKernelLDSBaseTable(
- std::shared_ptr<NonKernelLDSParameters> &NKLDSParams);
+ getOrderedNonKernelAllLDSGlobals(SetVector<GlobalVariable *> &&Variables);
+ void populateMallocLDSGlobal(Function *Func);
+ void populateMallocMetadataGlobal(Function *Func);
+ void populateLDSToReplacementIndicesMap(Function *Func);
+ void replaceKernelLDSAccesses(Function *Func);
+ void lowerKernelLDSAccesses(Function *Func, DomTreeUpdater &DTU);
+ void buildNonKernelLDSOffsetTable(NonKernelLDSParameters &NKLDSParams);
+ void buildNonKernelLDSBaseTable(NonKernelLDSParameters &NKLDSParams);
Constant *
- GetAddressesOfVariablesInKernel(Function *Func,
+ getAddressesOfVariablesInKernel(Function *Func,
SetVector<GlobalVariable *> &Variables);
- void LowerNonKernelLDSAccesses(
- Function *Func, SetVector<GlobalVariable *> &LDSGlobals,
- std::shared_ptr<NonKernelLDSParameters> &NKLDSParams);
+ void lowerNonKernelLDSAccesses(Function *Func,
+ SetVector<GlobalVariable *> &LDSGlobals,
+ NonKernelLDSParameters &NKLDSParams);
private:
Module &M;
IRBuilder<> IRB;
DomTreeCallback DTCallback;
- DenseMap<Function *, std::shared_ptr<KernelLDSParameters>>
- KernelToLDSParametersMap;
+ DenseMap<Function *, KernelLDSParameters> KernelToLDSParametersMap;
};
-template <typename T> SetVector<T> SortByName(std::vector<T> &&V) {
+template <typename T> SetVector<T> sortByName(std::vector<T> &&V) {
// Sort the vector of globals or Functions based on their name.
// Returns a SetVector of globals/Functions.
llvm::sort(V.begin(), V.end(), [](const auto *L, const auto *R) {
@@ -172,15 +171,15 @@ template <typename T> SetVector<T> SortByName(std::vector<T> &&V) {
return {std::move(SetVector<T>(V.begin(), V.end()))};
}
-SetVector<GlobalVariable *> AMDGPUSwLowerLDS::GetOrderedNonKernelAllLDSGlobals(
+SetVector<GlobalVariable *> AMDGPUSwLowerLDS::getOrderedNonKernelAllLDSGlobals(
SetVector<GlobalVariable *> &&Variables) {
- // Sort all the non-kernel LDS accesses based on theor name.
- SetVector<GlobalVariable *> Ordered = SortByName(
+ // Sort all the non-kernel LDS accesses based on their name.
+ SetVector<GlobalVariable *> Ordered = sortByName(
std::vector<GlobalVariable *>(Variables.begin(), Variables.end()));
return std::move(Ordered);
}
-SetVector<Function *> AMDGPUSwLowerLDS::GetOrderedIndirectLDSAccessingKernels(
+SetVector<Function *> AMDGPUSwLowerLDS::getOrderedIndirectLDSAccessingKernels(
SetVector<Function *> &&Kernels) {
// Sort the kernels that access LDS through non-kernels by name.
// Also assign a kernel ID metadata based on the sorted order.
@@ -190,7 +189,7 @@ SetVector<Function *> AMDGPUSwLowerLDS::GetOrderedIndirectLDSAccessingKernels(
report_fatal_error("Unimplemented SW LDS lowering for > 2**32 kernels");
}
SetVector<Function *> OrderedKernels =
- SortByName(std::vector<Function *>(Kernels.begin(), Kernels.end()));
+ sortByName(std::vector<Function *>(Kernels.begin(), Kernels.end()));
for (size_t i = 0; i < Kernels.size(); i++) {
Metadata *AttrMDArgs[1] = {
ConstantAsMetadata::get(IRB.getInt32(i)),
@@ -199,13 +198,12 @@ SetVector<Function *> AMDGPUSwLowerLDS::GetOrderedIndirectLDSAccessingKernels(
Func->setMetadata("llvm.amdgcn.lds.kernel.id",
MDNode::get(Ctx, AttrMDArgs));
auto &LDSParams = KernelToLDSParametersMap[Func];
- assert(LDSParams);
- LDSParams->KernelId = i;
+ LDSParams.KernelId = i;
}
return std::move(OrderedKernels);
}
-void AMDGPUSwLowerLDS::GetUsesOfLDSByNonKernels(
+void AMDGPUSwLowerLDS::getUsesOfLDSByNonKernels(
CallGraph const &CG, FunctionVariableMap &functions) {
// Get uses from the current function, excluding uses by called functions
// Two output variables to avoid walking the globals list twice
@@ -220,11 +218,7 @@ void AMDGPUSwLowerLDS::GetUsesOfLDSByNonKernels(
}
for (User *V : GV.users()) {
- User *FUU = V;
- bool isCast = isa<BitCastOperator, AddrSpaceCastOperator>(FUU);
- if (isCast && FUU->hasOneUse() && !FUU->user_begin()->user_empty())
- FUU = *FUU->user_begin();
- if (auto *I = dyn_cast<Instruction>(FUU)) {
+ if (auto *I = dyn_cast<Instruction>(V)) {
Function *F = I->getFunction();
if (!isKernelLDS(F)) {
functions[F].insert(&GV);
@@ -234,25 +228,22 @@ void AMDGPUSwLowerLDS::GetUsesOfLDSByNonKernels(
}
}
-void AMDGPUSwLowerLDS::PopulateMallocLDSGlobal(Function *Func) {
+void AMDGPUSwLowerLDS::populateMallocLDSGlobal(Function *Func) {
// Create new LDS global required for each kernel to store
// device global memory pointer.
auto &LDSParams = KernelToLDSParametersMap[Func];
- assert(LDSParams);
// create new global pointer variable
- LDSParams->MallocLDSGlobal = new GlobalVariable(
+ LDSParams.MallocLDSGlobal = new GlobalVariable(
M, IRB.getPtrTy(), false, GlobalValue::InternalLinkage,
- PoisonValue::get(IRB.getPtrTy()),
- Twine("llvm.amdgcn.sw.lds." + Func->getName()), nullptr,
- GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS, false);
+ PoisonValue::get(IRB.getPtrTy()), "llvm.amdgcn.sw.lds." + Func->getName(),
+ nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS, false);
return;
}
-void AMDGPUSwLowerLDS::PopulateMallocMetadataGlobal(Function *Func) {
+void AMDGPUSwLowerLDS::populateMallocMetadataGlobal(Function *Func) {
// Create new metadata global for every kernel and initialize the
// start offsets and sizes corresponding to each LDS access.
auto &LDSParams = KernelToLDSParametersMap[Func];
- assert(LDSParams);
auto &Ctx = M.getContext();
auto &DL = M.getDataLayout();
std::vector<Type *> Items;
@@ -260,99 +251,98 @@ void AMDGPUSwLowerLDS::PopulateMallocMetadataGlobal(Function *Func) {
std::vector<Constant *> Initializers;
Align MaxAlignment(1);
auto UpdateMaxAlignment = [&MaxAlignment, &DL](GlobalVariable *GV) {
- uint32_t GVAlignValue = GV->getAlignment();
- Align GVAlign =
- GVAlignValue ? Align(GVAlignValue) : AMDGPU::getAlign(DL, GV);
+ Align GVAlign = AMDGPU::getAlign(DL, GV);
MaxAlignment = std::max(MaxAlignment, GVAlign);
};
- for (GlobalVariable *GV : LDSParams->DirectAccess.StaticLDSGlobals)
+ for (GlobalVariable *GV : LDSParams.DirectAccess.StaticLDSGlobals)
UpdateMaxAlignment(GV);
- for (GlobalVariable *GV : LDSParams->DirectAccess.DynamicLDSGlobals)
+ for (GlobalVariable *GV : LDSParams.DirectAccess.DynamicLDSGlobals)
UpdateMaxAlignment(GV);
- for (GlobalVariable *GV : LDSParams->IndirectAccess.StaticLDSGlobals)
+ for (GlobalVariable *GV : LDSParams.IndirectAccess.StaticLDSGlobals)
UpdateMaxAlignment(GV);
- for (GlobalVariable *GV : LDSParams->IndirectAccess.DynamicLDSGlobals)
+ for (GlobalVariable *GV : LDSParams.IndirectAccess.DynamicLDSGlobals)
UpdateMaxAlignment(GV);
- uint32_t MaxAlignValue = MaxAlignment.value();
-
//{StartOffset, SizeInBytes}
- StructType *LDSItemTy = StructType::create(
- Ctx, {Int32Ty, Int32Ty},
- "llvm.amdgcn.sw.lds." + Func->getName().str() + ".md.item");
+ SmallString<128> MDItemStr;
+ raw_svector_ostream MDItemOS(MDItemStr);
+ MDItemOS << "llvm.amdgcn.sw.lds." << Func->getName().str() << ".md.item";
+
+ StructType *LDSItemTy =
+ StructType::create(Ctx, {Int32Ty, Int32Ty}, MDItemOS.str());
- auto InitializerLamda = [&](SetVector<GlobalVariable *> &LDSGlobals) {
+ auto buildInitializerForMallocMDGlobal = [&](SetVector<GlobalVariable *>
+ &LDSGlobals) {
for (auto &GV : LDSGlobals) {
Type *Ty = GV->getValueType();
const uint64_t SizeInBytes = DL.getTypeAllocSize(Ty);
Items.push_back(LDSItemTy);
Constant *ItemStartOffset =
- ConstantInt::get(Int32Ty, LDSParams->MallocSize);
+ ConstantInt::get(Int32Ty, LDSParams.MallocSize);
Constant *SizeInBytesConst = ConstantInt::get(Int32Ty, SizeInBytes);
- uint64_t AlignedSize =
- ((SizeInBytes + MaxAlignValue - 1) / MaxAlignValue) * MaxAlignValue;
- LDSParams->MallocSize += AlignedSize;
+ uint64_t AlignedSize = alignTo(SizeInBytes, MaxAlignment);
+ LDSParams.MallocSize += AlignedSize;
Constant *InitItem =
ConstantStruct::get(LDSItemTy, {ItemStartOffset, SizeInBytesConst});
Initializers.push_back(InitItem);
}
};
- InitializerLamda(LDSParams->DirectAccess.StaticLDSGlobals);
- InitializerLamda(LDSParams->IndirectAccess.StaticLDSGlobals);
- InitializerLamda(LDSParams->DirectAccess.DynamicLDSGlobals);
- InitializerLamda(LDSParams->IndirectAccess.DynamicLDSGlobals);
+ buildInitializerForMallocMDGlobal(LDSParams.DirectAccess.StaticLDSGlobals);
+ buildInitializerForMallocMDGlobal(LDSParams.IndirectAccess.StaticLDSGlobals);
+ buildInitializerForMallocMDGlobal(LDSParams.DirectAccess.DynamicLDSGlobals);
+ buildInitializerForMallocMDGlobal(LDSParams.IndirectAccess.DynamicLDSGlobals);
+
+ SmallString<128> MDTypeStr;
+ raw_svector_ostream MDTypeOS(MDTypeStr);
+ MDTypeOS << "llvm.amdgcn.sw.lds." << Func->getName().str() << ".md.type";
- StructType *MetadataStructType = StructType::create(
- Ctx, Items, ("llvm.amdgcn.sw.lds." + Func->getName().str() + ".md.type"));
- LDSParams->MallocMetadataGlobal = new GlobalVariable(
+ StructType *MetadataStructType =
+ StructType::create(Ctx, Items, MDTypeOS.str());
+ LDSParams.MallocMetadataGlobal = new GlobalVariable(
M, MetadataStructType, false, GlobalValue::InternalLinkage,
PoisonValue::get(MetadataStructType),
("llvm.amdgcn.sw.lds." + Func->getName().str() + ".md"), nullptr,
GlobalValue::NotThreadLocal, AMDGPUAS::GLOBAL_ADDRESS, false);
Constant *data = ConstantStruct::get(MetadataStructType, Initializers);
- LDSParams->MallocMetadataGlobal->setInitializer(data);
- LDSParams->MallocMetadataGlobal->setAlignment(MaxAlignment);
+ LDSParams.MallocMetadataGlobal->setInitializer(data);
+ LDSParams.MallocMetadataGlobal->setAlignment(MaxAlignment);
GlobalValue::SanitizerMetadata MD;
MD.NoAddress = true;
- LDSParams->MallocMetadataGlobal->setSanitizerMetadata(MD);
+ LDSParams.MallocMetadataGlobal->setSanitizerMetadata(MD);
return;
}
-void AMDGPUSwLowerLDS::PopulateLDSToReplacementIndicesMap(Function *Func) {
+void AMDGPUSwLowerLDS::populateLDSToReplacementIndicesMap(Function *Func) {
// Fill the corresponding LDS replacement indices for each LDS access
// related to this kernel.
auto &LDSParams = KernelToLDSParametersMap[Func];
- assert(LDSParams);
auto PopulateIndices = [&](SetVector<GlobalVariable *> &LDSGlobals,
uint32_t &Idx) {
for (auto &GV : LDSGlobals) {
- LDSParams->LDSToReplacementIndicesMap[GV] = {0, Idx, 0};
+ LDSParams.LDSToReplacementIndicesMap[GV] = {0, Idx, 0};
++Idx;
}
};
uint32_t Idx = 0;
- PopulateIndices(LDSParams->DirectAccess.StaticLDSGlobals, Idx);
- PopulateIndices(LDSParams->IndirectAccess.StaticLDSGlobals, Idx);
- PopulateIndices(LDSParams->DirectAccess.DynamicLDSGlobals, Idx);
- PopulateIndices(LDSParams->IndirectAccess.DynamicLDSGlobals, Idx);
+ PopulateIndices(LDSParams.DirectAccess.StaticLDSGlobals, Idx);
+ PopulateIndices(LDSParams.IndirectAccess.StaticLDSGlobals, Idx);
+ PopulateIndices(LDSParams.DirectAccess.DynamicLDSGlobals, Idx);
+ PopulateIndices(LDSParams.IndirectAccess.DynamicLDSGlobals, Idx);
return;
}
-static void ReplacesUsesOfGlobalInFunction(Function *Func, GlobalVariable *GV,
+static void replacesUsesOfGlobalInFunction(Function *Func, GlobalVariable *GV,
Value *Replacement) {
// Replace all uses of LDS global in this Function with a Replacement.
auto ReplaceUsesLambda = [Func](const Use &U) -> bool {
- auto *FUU = U.getUser();
- bool isCast = isa<BitCastOperator, AddrSpaceCastOperator>(FUU);
- if (isCast && FUU->hasOneUse() && !FUU->user_begin()->user_empty())
- FUU = *FUU->user_begin();
- if (auto *inst = llvm::dyn_cast<Instruction>(FUU)) {
- auto *Func1 = inst->getParent()->getParent();
+ auto *V = U.getUser();
+ if (auto *Inst = dyn_cast<Instruction>(V)) {
+ auto *Func1 = Inst->getParent()->getParent();
if (Func == Func1)
return true;
}
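The alignTo change earlier in this hunk is purely cosmetic; it computes the same value as the manual rounding it replaces. A small equivalence check (standalone sketch, assuming an LLVM build for the header):

  #include "llvm/Support/Alignment.h"
  #include <cassert>
  #include <cstdint>

  // alignTo(13, Align(8)) == ((13 + 8 - 1) / 8) * 8 == 16.
  void checkAlignToMatchesManualRounding() {
    const uint64_t SizeInBytes = 13;
    const llvm::Align MaxAlignment(8);
    const uint64_t Manual = ((SizeInBytes + MaxAlignment.value() - 1) /
                             MaxAlignment.value()) * MaxAlignment.value();
    assert(llvm::alignTo(SizeInBytes, MaxAlignment) == Manual);
  }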
@@ -362,12 +352,11 @@ static void ReplacesUsesOfGlobalInFunction(Function *Func, GlobalVariable *GV,
return;
}
-void AMDGPUSwLowerLDS::ReplaceKernelLDSAccesses(Function *Func) {
+void AMDGPUSwLowerLDS::replaceKernelLDSAccesses(Function *Func) {
auto &LDSParams = KernelToLDSParametersMap[Func];
- assert(LDSParams);
- GlobalVariable *MallocLDSGlobal = LDSParams->MallocLDSGlobal;
+ GlobalVariable *MallocLDSGlobal = LDSParams.MallocLDSGlobal;
assert(MallocLDSGlobal);
- GlobalVariable *MallocMetadataGlobal = LDSParams->MallocMetadataGlobal;
+ GlobalVariable *MallocMetadataGlobal = LDSParams.MallocMetadataGlobal;
assert(MallocMetadataGlobal);
StructType *MallocMetadataStructType =
cast<StructType>(MallocMetadataGlobal->getValueType());
@@ -378,12 +367,12 @@ void AMDGPUSwLowerLDS::ReplaceKernelLDSAccesses(Function *Func) {
for (auto &GV : LDSGlobals) {
// Do not generate instructions if LDS access is in non-kernel
// i.e. indirect access.
- if ((LDSParams->IndirectAccess.StaticLDSGlobals.contains(GV) ||
- LDSParams->IndirectAccess.DynamicLDSGlobals.contains(GV)) &&
- (!LDSParams->DirectAccess.StaticLDSGlobals.contains(GV) &&
- !LDSParams->DirectAccess.DynamicLDSGlobals.contains(GV)))
+ if ((LDSParams.IndirectAccess.StaticLDSGlobals.contains(GV) ||
+ LDSParams.IndirectAccess.DynamicLDSGlobals.contains(GV)) &&
+ (!LDSParams.DirectAccess.StaticLDSGlobals.contains(GV) &&
+ !LDSParams.DirectAccess.DynamicLDSGlobals.contains(GV)))
continue;
- auto &Indices = LDSParams->LDSToReplacementIndicesMap[GV];
+ auto &Indices = LDSParams.LDSToReplacementIndicesMap[GV];
assert(Indices.size() == 3);
uint32_t Idx0 = Indices[0];
uint32_t Idx1 = Indices[1];
@@ -396,20 +385,19 @@ void AMDGPUSwLowerLDS::ReplaceKernelLDSAccesses(Function *Func) {
Value *Load = IRB.CreateLoad(Int32Ty, GEP);
Value *BasePlusOffset =
IRB.CreateInBoundsGEP(GV->getType(), MallocLDSGlobal, {Load});
- ReplacesUsesOfGlobalInFunction(Func, GV, BasePlusOffset);
+ replacesUsesOfGlobalInFunction(Func, GV, BasePlusOffset);
}
};
- ReplaceLDSGlobalUses(LDSParams->DirectAccess.StaticLDSGlobals);
- ReplaceLDSGlobalUses(LDSParams->IndirectAccess.StaticLDSGlobals);
- ReplaceLDSGlobalUses(LDSParams->DirectAccess.DynamicLDSGlobals);
- ReplaceLDSGlobalUses(LDSParams->IndirectAccess.DynamicLDSGlobals);
+ ReplaceLDSGlobalUses(LDSParams.DirectAccess.StaticLDSGlobals);
+ ReplaceLDSGlobalUses(LDSParams.IndirectAccess.StaticLDSGlobals);
+ ReplaceLDSGlobalUses(LDSParams.DirectAccess.DynamicLDSGlobals);
+ ReplaceLDSGlobalUses(LDSParams.IndirectAccess.DynamicLDSGlobals);
return;
}
-void AMDGPUSwLowerLDS::LowerKernelLDSAccesses(Function *Func,
+void AMDGPUSwLowerLDS::lowerKernelLDSAccesses(Function *Func,
DomTreeUpdater &DTU) {
auto &LDSParams = KernelToLDSParametersMap[Func];
- assert(LDSParams);
auto &Ctx = M.getContext();
auto *PrevEntryBlock = &Func->getEntryBlock();
@@ -430,7 +418,7 @@ void AMDGPUSwLowerLDS::LowerKernelLDSAccesses(Function *Func,
auto *const XYZOr = IRB.CreateOr(XYOr, WIdz);
auto *const WIdzCond = IRB.CreateICmpEQ(XYZOr, IRB.getInt32(0));
- GlobalVariable *MallocMetadataGlobal = LDSParams->MallocMetadataGlobal;
+ GlobalVariable *MallocMetadataGlobal = LDSParams.MallocMetadataGlobal;
assert(MallocMetadataGlobal);
StructType *MetadataStructType =
cast<StructType>(MallocMetadataGlobal->getValueType());
@@ -445,23 +433,23 @@ void AMDGPUSwLowerLDS::LowerKernelLDSAccesses(Function *Func,
// If Dynamic LDS globals are accessed by the kernel,
// Get the size of dyn lds from hidden dyn_lds_size kernel arg.
// Update the corresponding metadata global entries for this dyn lds global.
- uint32_t MallocSize = LDSParams->MallocSize;
+ uint32_t MallocSize = LDSParams.MallocSize;
Value *CurrMallocSize = IRB.getInt64(MallocSize);
- if (!LDSParams->DirectAccess.DynamicLDSGlobals.empty() ||
- !LDSParams->IndirectAccess.DynamicLDSGlobals.empty()) {
+ if (!LDSParams.DirectAccess.DynamicLDSGlobals.empty() ||
+ !LDSParams.IndirectAccess.DynamicLDSGlobals.empty()) {
unsigned MaxAlignment = MallocMetadataGlobal->getAlignment();
Value *MaxAlignValue = IRB.getInt64(MaxAlignment);
Value *MaxAlignValueMinusOne = IRB.getInt64(MaxAlignment - 1);
+ Value *ImplicitArg =
+ IRB.CreateIntrinsic(Intrinsic::amdgcn_implicitarg_ptr, {}, {});
+ Value *HiddenDynLDSSize = IRB.CreateInBoundsGEP(
+ ImplicitArg->getType(), ImplicitArg, {IRB.getInt32(15)});
auto MallocSizeCalcLambda =
[&](SetVector<GlobalVariable *> &DynamicLDSGlobals) {
for (GlobalVariable *DynGV : DynamicLDSGlobals) {
- auto &Indices = LDSParams->LDSToReplacementIndicesMap[DynGV];
+ auto &Indices = LDSParams.LDSToReplacementIndicesMap[DynGV];
// Get size from hidden dyn_lds_size argument of kernel into
// CurrDynLDSSize
- Value *ImplicitArg =
- IRB.CreateIntrinsic(Intrinsic::amdgcn_implicitarg_ptr, {}, {});
- Value *HiddenDynLDSSize = IRB.CreateInBoundsGEP(
- ImplicitArg->getType(), ImplicitArg, {IRB.getInt32(15)});
Value *CurrDynLDSSize =
IRB.CreateLoad(IRB.getInt64Ty(), HiddenDynLDSSize);
auto *GEPForOffset = IRB.CreateInBoundsGEP(
@@ -480,19 +468,18 @@ void AMDGPUSwLowerLDS::LowerKernelLDSAccesses(Function *Func,
CurrMallocSize = IRB.CreateMul(CurrMallocSize, MaxAlignValue);
}
};
- MallocSizeCalcLambda(LDSParams->DirectAccess.DynamicLDSGlobals);
- MallocSizeCalcLambda(LDSParams->IndirectAccess.DynamicLDSGlobals);
+ MallocSizeCalcLambda(LDSParams.DirectAccess.DynamicLDSGlobals);
+ MallocSizeCalcLambda(LDSParams.IndirectAccess.DynamicLDSGlobals);
}
// Create a call to malloc function which does device global memory allocation
// with size equal to the total size of all LDS globals accessed in this kernel.
- const char MallocImplName[] = "malloc";
FunctionCallee AMDGPUMallocReturn = M.getOrInsertFunction(
- MallocImplName,
+ StringRef("malloc"),
FunctionType::get(IRB.getPtrTy(1), {IRB.getInt64Ty()}, false));
Value *MCI = IRB.CreateCall(AMDGPUMallocReturn, {CurrMallocSize});
- GlobalVariable *MallocLDSGlobal = LDSParams->MallocLDSGlobal;
+ GlobalVariable *MallocLDSGlobal = LDSParams.MallocLDSGlobal;
assert(MallocLDSGlobal);
// create load of malloc to new global
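For the dynamic-LDS case, the hunk above folds the hidden dyn_lds_size kernel argument into CurrMallocSize and rounds the running total up to the metadata global's alignment. A scalar model of the emitted add/udiv/mul sequence (illustrative names, one dynamic global assumed):

  #include <cstdint>

  // Mirrors the IR: add the hidden dynamic-LDS size, then round the
  // running total up to MaxAlign (e.g. add 7, udiv 8, mul 8 for align 8).
  uint64_t accumulateDynLDSSize(uint64_t CurrMallocSize,
                                uint64_t HiddenDynLDSSize, uint64_t MaxAlign) {
    CurrMallocSize += HiddenDynLDSSize;
    return (CurrMallocSize + MaxAlign - 1) / MaxAlign * MaxAlign;
  }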
@@ -510,7 +497,7 @@ void AMDGPUSwLowerLDS::LowerKernelLDSAccesses(Function *Func,
IRB.CreateIntrinsic(Intrinsic::amdgcn_s_barrier, {}, {});
- ReplaceKernelLDSAccesses(Func);
+ replaceKernelLDSAccesses(Func);
auto *CondFreeBlock = BasicBlock::Create(Ctx, "CondFree", Func);
auto *FreeBlock = BasicBlock::Create(Ctx, "Free", Func);
@@ -518,7 +505,6 @@ void AMDGPUSwLowerLDS::LowerKernelLDSAccesses(Function *Func,
for (BasicBlock &BB : *Func) {
if (!BB.empty()) {
if (ReturnInst *RI = dyn_cast<ReturnInst>(&BB.back())) {
- BasicBlock *Block = &BB;
RI->eraseFromParent();
IRB.SetInsertPoint(&BB, BB.end());
IRB.CreateBr(CondFreeBlock);
@@ -535,9 +521,8 @@ void AMDGPUSwLowerLDS::LowerKernelLDSAccesses(Function *Func,
IRB.SetInsertPoint(FreeBlock, FreeBlock->begin());
// Free the previously allocated device global memory.
- const char FreeImplName[] = "free";
FunctionCallee AMDGPUFreeReturn = M.getOrInsertFunction(
- FreeImplName,
+ StringRef("free"),
FunctionType::get(IRB.getVoidTy(), {IRB.getPtrTy()}, false));
Value *MallocPtr = IRB.CreateLoad(IRB.getPtrTy(), MallocLDSGlobal);
@@ -555,14 +540,13 @@ void AMDGPUSwLowerLDS::LowerKernelLDSAccesses(Function *Func,
return;
}
-Constant *AMDGPUSwLowerLDS::GetAddressesOfVariablesInKernel(
+Constant *AMDGPUSwLowerLDS::getAddressesOfVariablesInKernel(
Function *Func, SetVector<GlobalVariable *> &Variables) {
LLVMContext &Ctx = M.getContext();
Type *Int32Ty = Type::getInt32Ty(Ctx);
auto &LDSParams = KernelToLDSParametersMap[Func];
- assert(LDSParams);
- GlobalVariable *MallocMetadataGlobal = LDSParams->MallocMetadataGlobal;
+ GlobalVariable *MallocMetadataGlobal = LDSParams.MallocMetadataGlobal;
assert(MallocMetadataGlobal);
StructType *MallocMetadataStructType =
cast<StructType>(MallocMetadataGlobal->getValueType());
@@ -571,9 +555,8 @@ Constant *AMDGPUSwLowerLDS::GetAddressesOfVariablesInKernel(
SmallVector<Constant *> Elements;
for (size_t i = 0; i < Variables.size(); i++) {
GlobalVariable *GV = Variables[i];
- assert(GV);
- if (LDSParams->LDSToReplacementIndicesMap.contains(GV)) {
- auto &Indices = LDSParams->LDSToReplacementIndicesMap[GV];
+ if (LDSParams.LDSToReplacementIndicesMap.contains(GV)) {
+ auto &Indices = LDSParams.LDSToReplacementIndicesMap[GV];
uint32_t Idx0 = Indices[0];
uint32_t Idx1 = Indices[1];
uint32_t Idx2 = Indices[2];
@@ -590,13 +573,12 @@ Constant *AMDGPUSwLowerLDS::GetAddressesOfVariablesInKernel(
return ConstantArray::get(KernelOffsetsType, Elements);
}
-void AMDGPUSwLowerLDS::BuildNonKernelLDSBaseTable(
- std::shared_ptr<NonKernelLDSParameters> &NKLDSParams) {
+void AMDGPUSwLowerLDS::buildNonKernelLDSBaseTable(
+ NonKernelLDSParameters &NKLDSParams) {
// Base table will have a single row, with elements of the row
// placed as per kernel ID. Each element in the row corresponds
// to the address of the malloc LDS global variable of the kernel.
- auto &Kernels = NKLDSParams->OrderedKernels;
- assert(!Kernels.empty());
+ auto &Kernels = NKLDSParams.OrderedKernels;
LLVMContext &Ctx = M.getContext();
Type *Int32Ty = Type::getInt32Ty(Ctx);
const size_t NumberKernels = Kernels.size();
@@ -605,8 +587,7 @@ void AMDGPUSwLowerLDS::BuildNonKernelLDSBaseTable(
for (size_t i = 0; i < NumberKernels; i++) {
Function *Func = Kernels[i];
auto &LDSParams = KernelToLDSParametersMap[Func];
- assert(LDSParams);
- GlobalVariable *MallocLDSGlobal = LDSParams->MallocLDSGlobal;
+ GlobalVariable *MallocLDSGlobal = LDSParams.MallocLDSGlobal;
assert(MallocLDSGlobal);
Constant *GEPIdx[] = {ConstantInt::get(Int32Ty, 0)};
Constant *GEP = ConstantExpr::getGetElementPtr(
@@ -616,14 +597,14 @@ void AMDGPUSwLowerLDS::BuildNonKernelLDSBaseTable(
}
Constant *init =
ConstantArray::get(AllKernelsOffsetsType, overallConstantExprElts);
- NKLDSParams->LDSBaseTable = new GlobalVariable(
+ NKLDSParams.LDSBaseTable = new GlobalVariable(
M, AllKernelsOffsetsType, true, GlobalValue::InternalLinkage, init,
"llvm.amdgcn.sw.lds.base.table", nullptr, GlobalValue::NotThreadLocal,
AMDGPUAS::CONSTANT_ADDRESS);
}
-void AMDGPUSwLowerLDS::BuildNonKernelLDSOffsetTable(
- std::shared_ptr<NonKernelLDSParameters> &NKLDSParams) {
+void AMDGPUSwLowerLDS::buildNonKernelLDSOffsetTable(
+ NonKernelLDSParameters &NKLDSParams) {
// Offset table will have multiple rows and columns.
// Rows are assumed to be from 0 to (n-1). n is total number
// of kernels accessing the LDS through non-kernels.
@@ -631,8 +612,8 @@ void AMDGPUSwLowerLDS::BuildNonKernelLDSOffsetTable(
// unique LDS globals accessed by non-kernels.
// Each element in the row corresponds to the address of
// the replacement of LDS global done by that particular kernel.
- auto &Variables = NKLDSParams->OrdereLDSGlobals;
- auto &Kernels = NKLDSParams->OrderedKernels;
+ auto &Variables = NKLDSParams.OrdereLDSGlobals;
+ auto &Kernels = NKLDSParams.OrderedKernels;
assert(!Variables.empty());
assert(!Kernels.empty());
LLVMContext &Ctx = M.getContext();
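Conceptually, with two kernels and two LDS globals the two tables look like the sketch below. Plain C arrays and the concrete values are for illustration only; the pass actually emits LLVM constant arrays in the constant address space, and each offset-table entry is a pointer into the owning kernel's metadata global rather than a raw offset.

  #include <cstdint>

  // Base table: one entry per kernel id, holding that kernel's
  // malloc-LDS pointer (modeled here as opaque addresses).
  static const uint64_t LDSBaseTable[2] = {0, 0};

  // Offset table: row = kernel id, column = LDS global index; an entry
  // locates that global's start offset within the kernel's allocation.
  static const uint32_t LDSOffsetTable[2][2] = {{0, 8}, {0, 8}};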
@@ -649,20 +630,20 @@ void AMDGPUSwLowerLDS::BuildNonKernelLDSOffsetTable(
for (size_t i = 0; i < NumberKernels; i++) {
Function *Func = Kernels[i];
overallConstantExprElts[i] =
- GetAddressesOfVariablesInKernel(Func, Variables);
+ getAddressesOfVariablesInKernel(Func, Variables);
}
Constant *init =
ConstantArray::get(AllKernelsOffsetsType, overallConstantExprElts);
- NKLDSParams->LDSOffsetTable = new GlobalVariable(
+ NKLDSParams.LDSOffsetTable = new GlobalVariable(
M, AllKernelsOffsetsType, true, GlobalValue::InternalLinkage, init,
"llvm.amdgcn.sw.lds.offset.table", nullptr, GlobalValue::NotThreadLocal,
AMDGPUAS::CONSTANT_ADDRESS);
return;
}
-void AMDGPUSwLowerLDS::LowerNonKernelLDSAccesses(
+void AMDGPUSwLowerLDS::lowerNonKernelLDSAccesses(
Function *Func, SetVector<GlobalVariable *> &LDSGlobals,
- std::shared_ptr<NonKernelLDSParameters> &NKLDSParams) {
+ NonKernelLDSParameters &NKLDSParams) {
// Replace LDS access in non-kernel with replacement queried from
// Base table and offset from offset table.
auto *EntryBlock = &Func->getEntryBlock();
@@ -670,9 +651,9 @@ void AMDGPUSwLowerLDS::LowerNonKernelLDSAccesses(
Function *Decl =
Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_lds_kernel_id, {});
auto *KernelId = IRB.CreateCall(Decl, {});
- GlobalVariable *LDSBaseTable = NKLDSParams->LDSBaseTable;
- GlobalVariable *LDSOffsetTable = NKLDSParams->LDSOffsetTable;
- auto &OrdereLDSGlobals = NKLDSParams->OrdereLDSGlobals;
+ GlobalVariable *LDSBaseTable = NKLDSParams.LDSBaseTable;
+ GlobalVariable *LDSOffsetTable = NKLDSParams.LDSOffsetTable;
+ auto &OrdereLDSGlobals = NKLDSParams.OrdereLDSGlobals;
assert(LDSBaseTable && LDSOffsetTable);
Value *BaseGEP = IRB.CreateInBoundsGEP(
LDSBaseTable->getValueType(), LDSBaseTable, {IRB.getInt32(0), KernelId});
@@ -691,36 +672,34 @@ void AMDGPUSwLowerLDS::LowerNonKernelLDSAccesses(
OffsetLoad = IRB.CreateLoad(IRB.getInt32Ty(), OffsetLoad);
Value *BasePlusOffset =
IRB.CreateInBoundsGEP(GV->getType(), BasePtr, {OffsetLoad});
- ReplacesUsesOfGlobalInFunction(Func, GV, BasePlusOffset);
+ replacesUsesOfGlobalInFunction(Func, GV, BasePlusOffset);
}
return;
}
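Putting the two tables together, a lowered non-kernel access behaves like the lookup below (a sketch that assumes the offset has already been loaded through the table's metadata pointer; names are illustrative):

  #include <cstdint>

  // llvm.amdgcn.lds.kernel.id selects the row; the variable's position
  // among the ordered LDS globals selects the column.
  uint8_t *nonKernelLDSAddress(uint8_t *const *BaseTable,
                               const uint32_t *const *OffsetTable,
                               uint32_t KernelId, uint32_t VarIdx) {
    return BaseTable[KernelId] + OffsetTable[KernelId][VarIdx];
  }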
-static void ReorderStaticDynamicIndirectLDSSet(
- std::shared_ptr<KernelLDSParameters> &LDSParams) {
+static void reorderStaticDynamicIndirectLDSSet(KernelLDSParameters &LDSParams) {
// Sort static and dynamic LDS globals, whether accessed directly
// or indirectly, by name.
- assert(LDSParams);
- LDSParams->DirectAccess.StaticLDSGlobals =
- SortByName(std::vector<GlobalVariable *>(
- LDSParams->DirectAccess.StaticLDSGlobals.begin(),
- LDSParams->DirectAccess.StaticLDSGlobals.end()));
- LDSParams->DirectAccess.DynamicLDSGlobals =
- SortByName(std::vector<GlobalVariable *>(
- LDSParams->DirectAccess.DynamicLDSGlobals.begin(),
- LDSParams->DirectAccess.DynamicLDSGlobals.end()));
- LDSParams->IndirectAccess.StaticLDSGlobals =
- SortByName(std::vector<GlobalVariable *>(
- LDSParams->IndirectAccess.StaticLDSGlobals.begin(),
- LDSParams->IndirectAccess.StaticLDSGlobals.end()));
- LDSParams->IndirectAccess.DynamicLDSGlobals =
- SortByName(std::vector<GlobalVariable *>(
- LDSParams->IndirectAccess.DynamicLDSGlobals.begin(),
- LDSParams->IndirectAccess.DynamicLDSGlobals.end()));
+ LDSParams.DirectAccess.StaticLDSGlobals =
+ sortByName(std::vector<GlobalVariable *>(
+ LDSParams.DirectAccess.StaticLDSGlobals.begin(),
+ LDSParams.DirectAccess.StaticLDSGlobals.end()));
+ LDSParams.DirectAccess.DynamicLDSGlobals =
+ sortByName(std::vector<GlobalVariable *>(
+ LDSParams.DirectAccess.DynamicLDSGlobals.begin(),
+ LDSParams.DirectAccess.DynamicLDSGlobals.end()));
+ LDSParams.IndirectAccess.StaticLDSGlobals =
+ sortByName(std::vector<GlobalVariable *>(
+ LDSParams.IndirectAccess.StaticLDSGlobals.begin(),
+ LDSParams.IndirectAccess.StaticLDSGlobals.end()));
+ LDSParams.IndirectAccess.DynamicLDSGlobals =
+ sortByName(std::vector<GlobalVariable *>(
+ LDSParams.IndirectAccess.DynamicLDSGlobals.begin(),
+ LDSParams.IndirectAccess.DynamicLDSGlobals.end()));
return;
}
-bool AMDGPUSwLowerLDS::Run() {
+bool AMDGPUSwLowerLDS::run() {
bool Changed = false;
CallGraph CG = CallGraph(M);
SetVector<Function *> KernelsWithIndirectLDSAccess;
@@ -733,7 +712,7 @@ bool AMDGPUSwLowerLDS::Run() {
LDSUsesInfoTy LDSUsesInfo = getTransitiveUsesOfLDS(CG, M);
// Get the Uses of LDS from non-kernels.
- GetUsesOfLDSByNonKernels(CG, NonKernelToLDSAccessMap);
+ getUsesOfLDSByNonKernels(CG, NonKernelToLDSAccessMap);
// Utility to group LDS access into direct, indirect, static and dynamic.
auto PopulateKernelStaticDynamicLDS = [&](FunctionVariableMap &LDSAccesses,
@@ -743,25 +722,26 @@ bool AMDGPUSwLowerLDS::Run() {
assert(isKernelLDS(F));
assert(!K.second.empty());
- if (!KernelToLDSParametersMap.contains(F))
- KernelToLDSParametersMap[F] = std::make_shared<KernelLDSParameters>();
+ if (!KernelToLDSParametersMap.contains(F)) {
+ KernelLDSParameters KernelLDSParams;
+ KernelToLDSParametersMap[F] = KernelLDSParams;
+ }
auto &LDSParams = KernelToLDSParametersMap[F];
- assert(LDSParams);
if (!DirectAccess)
KernelsWithIndirectLDSAccess.insert(F);
for (GlobalVariable *GV : K.second) {
if (!DirectAccess) {
if (AMDGPU::isDynamicLDS(*GV))
- LDSParams->IndirectAccess.DynamicLDSGlobals.insert(GV);
+ LDSParams.IndirectAccess.DynamicLDSGlobals.insert(GV);
else
- LDSParams->IndirectAccess.StaticLDSGlobals.insert(GV);
+ LDSParams.IndirectAccess.StaticLDSGlobals.insert(GV);
AllNonKernelLDSAccess.insert(GV);
} else {
if (AMDGPU::isDynamicLDS(*GV))
- LDSParams->DirectAccess.DynamicLDSGlobals.insert(GV);
+ LDSParams.DirectAccess.DynamicLDSGlobals.insert(GV);
else
- LDSParams->DirectAccess.StaticLDSGlobals.insert(GV);
+ LDSParams.DirectAccess.StaticLDSGlobals.insert(GV);
}
}
}
@@ -773,42 +753,39 @@ bool AMDGPUSwLowerLDS::Run() {
for (auto &K : KernelToLDSParametersMap) {
Function *Func = K.first;
auto &LDSParams = KernelToLDSParametersMap[Func];
- assert(LDSParams);
- if (LDSParams->DirectAccess.StaticLDSGlobals.empty() &&
- LDSParams->DirectAccess.DynamicLDSGlobals.empty() &&
- LDSParams->IndirectAccess.StaticLDSGlobals.empty() &&
- LDSParams->IndirectAccess.DynamicLDSGlobals.empty()) {
+ if (LDSParams.DirectAccess.StaticLDSGlobals.empty() &&
+ LDSParams.DirectAccess.DynamicLDSGlobals.empty() &&
+ LDSParams.IndirectAccess.StaticLDSGlobals.empty() &&
+ LDSParams.IndirectAccess.DynamicLDSGlobals.empty()) {
Changed = false;
} else {
- ReorderStaticDynamicIndirectLDSSet(LDSParams);
- PopulateMallocLDSGlobal(Func);
- PopulateMallocMetadataGlobal(Func);
- PopulateLDSToReplacementIndicesMap(Func);
+ reorderStaticDynamicIndirectLDSSet(LDSParams);
+ populateMallocLDSGlobal(Func);
+ populateMallocMetadataGlobal(Func);
+ populateLDSToReplacementIndicesMap(Func);
DomTreeUpdater DTU(DTCallback(*Func),
DomTreeUpdater::UpdateStrategy::Lazy);
- LowerKernelLDSAccesses(Func, DTU);
+ lowerKernelLDSAccesses(Func, DTU);
Changed = true;
}
}
- std::shared_ptr<NonKernelLDSParameters> NKLDSParams =
- std::make_shared<NonKernelLDSParameters>();
+ NonKernelLDSParameters NKLDSParams;
if (!NonKernelToLDSAccessMap.empty()) {
- assert(NKLDSParams);
- NKLDSParams->OrderedKernels = GetOrderedIndirectLDSAccessingKernels(
+ NKLDSParams.OrderedKernels = getOrderedIndirectLDSAccessingKernels(
std::move(KernelsWithIndirectLDSAccess));
- NKLDSParams->OrdereLDSGlobals =
- GetOrderedNonKernelAllLDSGlobals(std::move(AllNonKernelLDSAccess));
- assert(!NKLDSParams->OrderedKernels.empty());
- assert(!NKLDSParams->OrdereLDSGlobals.empty());
- BuildNonKernelLDSBaseTable(NKLDSParams);
- BuildNonKernelLDSOffsetTable(NKLDSParams);
+ NKLDSParams.OrdereLDSGlobals =
+ getOrderedNonKernelAllLDSGlobals(std::move(AllNonKernelLDSAccess));
+ assert(!NKLDSParams.OrderedKernels.empty());
+ assert(!NKLDSParams.OrdereLDSGlobals.empty());
+ buildNonKernelLDSBaseTable(NKLDSParams);
+ buildNonKernelLDSOffsetTable(NKLDSParams);
for (auto &K : NonKernelToLDSAccessMap) {
Function *Func = K.first;
DenseSet<GlobalVariable *> &LDSGlobals = K.second;
- SetVector<GlobalVariable *> OrderedLDSGlobals = SortByName(
+ SetVector<GlobalVariable *> OrderedLDSGlobals = sortByName(
std::vector<GlobalVariable *>(LDSGlobals.begin(), LDSGlobals.end()));
- LowerNonKernelLDSAccesses(Func, OrderedLDSGlobals, NKLDSParams);
+ lowerNonKernelLDSAccesses(Func, OrderedLDSGlobals, NKLDSParams);
}
}
return Changed;
@@ -839,7 +816,7 @@ bool AMDGPUSwLowerLDSLegacy::runOnModule(Module &M) {
};
bool IsChanged = false;
AMDGPUSwLowerLDS SwLowerLDSImpl(M, DTCallback);
- IsChanged |= SwLowerLDSImpl.Run();
+ IsChanged |= SwLowerLDSImpl.run();
return IsChanged;
}
@@ -855,7 +832,7 @@ PreservedAnalyses AMDGPUSwLowerLDSPass::run(Module &M,
};
bool IsChanged = false;
AMDGPUSwLowerLDS SwLowerLDSImpl(M, DTCallback);
- IsChanged |= SwLowerLDSImpl.Run();
+ IsChanged |= SwLowerLDSImpl.run();
if (!IsChanged)
return PreservedAnalyses::all();
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-dynamic-indirect-access.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-dynamic-indirect-access.ll
index 3bc0ac00b5ccaf..ca38b0d07801e0 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-dynamic-indirect-access.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-dynamic-indirect-access.ll
@@ -2,12 +2,10 @@
; RUN: opt < %s -passes=amdgpu-sw-lower-lds -S -mtriple=amdgcn-- | FileCheck %s
; Test to check that indirect dynamic LDS access through a non-kernel called from a kernel is lowered correctly.
-target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
-target triple = "amdgcn-unknown-unknown"
-@lds_1 = internal unnamed_addr addrspace(3) global [1 x i8] poison, align 1
-@lds_2 = internal unnamed_addr addrspace(3) global [1 x i32] poison, align 2
-@lds_3 = external unnamed_addr addrspace(3) global [0 x i8], align 4
-@lds_4 = external unnamed_addr addrspace(3) global [0 x i8], align 8
+@lds_1 = internal addrspace(3) global [1 x i8] poison, align 1
+@lds_2 = internal addrspace(3) global [1 x i32] poison, align 2
+@lds_3 = external addrspace(3) global [0 x i8], align 4
+@lds_4 = external addrspace(3) global [0 x i8], align 8
define void @use_variables() {
; CHECK-LABEL: define void @use_variables() {
@@ -56,9 +54,7 @@ define amdgpu_kernel void @k0() {
; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[TMP9]], 7
; CHECK-NEXT: [[TMP11:%.*]] = udiv i64 [[TMP10]], 8
; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 8
-; CHECK-NEXT: [[TMP13:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds ptr addrspace(4), ptr addrspace(4) [[TMP13]], i32 15
-; CHECK-NEXT: [[TMP15:%.*]] = load i64, ptr addrspace(4) [[TMP14]], align 8
+; CHECK-NEXT: [[TMP15:%.*]] = load i64, ptr addrspace(4) [[TMP7]], align 8
; CHECK-NEXT: store i64 [[TMP12]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 3, i32 0), align 8
; CHECK-NEXT: store i64 [[TMP15]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 3, i32 1), align 8
; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[TMP12]], [[TMP15]]
@@ -68,7 +64,7 @@ define amdgpu_kernel void @k0() {
; CHECK-NEXT: [[TMP20:%.*]] = call ptr addrspace(1) @malloc(i64 [[TMP19]])
; CHECK-NEXT: store ptr addrspace(1) [[TMP20]], ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, align 8
; CHECK-NEXT: br label [[TMP21]]
-; CHECK: 21:
+; CHECK: 19:
; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ]
; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
; CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-dynamic-lds-test.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-dynamic-lds-test.ll
index 28a8cd62053078..eb0f869c6a5d66 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-dynamic-lds-test.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-dynamic-lds-test.ll
@@ -1,18 +1,10 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 4
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
; RUN: opt < %s -passes=amdgpu-sw-lower-lds -S -mtriple=amdgcn-- | FileCheck %s
; Test to check if direct access of dynamic LDS in a kernel is lowered correctly.
-target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
-target triple = "amdgcn-unknown-unknown"
-@lds_1 = internal unnamed_addr addrspace(3) global [0 x i8] poison, align 4
-@lds_2 = internal unnamed_addr addrspace(3) global [0 x i32] poison, align 8
+@lds_1 = external addrspace(3) global [0 x i8]
+@lds_2 = external addrspace(3) global [0 x i8]
-;.
-; CHECK: @lds_1 = internal unnamed_addr addrspace(3) global [0 x i8] poison, align 4
-; CHECK: @lds_2 = internal unnamed_addr addrspace(3) global [0 x i32] poison, align 8
-; CHECK: @llvm.amdgcn.sw.lds.k0 = internal addrspace(3) global ptr poison
-; CHECK: @llvm.amdgcn.sw.lds.k0.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.k0.md.type zeroinitializer, no_sanitize_address, align 8
-;.
define amdgpu_kernel void @k0() {
; CHECK-LABEL: define amdgpu_kernel void @k0() {
; CHECK-NEXT: WId:
@@ -24,18 +16,34 @@ define amdgpu_kernel void @k0() {
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP7:%.*]]
; CHECK: Malloc:
-; CHECK-NEXT: [[TMP6:%.*]] = call ptr addrspace(1) @malloc(i64 0)
+; CHECK-NEXT: [[TMP20:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds ptr addrspace(4), ptr addrspace(4) [[TMP20]], i32 15
+; CHECK-NEXT: [[TMP22:%.*]] = load i64, ptr addrspace(4) [[TMP21]], align 8
+; CHECK-NEXT: store i64 0, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, align 8
+; CHECK-NEXT: store i64 [[TMP22]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 0, i32 1), align 8
+; CHECK-NEXT: [[TMP23:%.*]] = add i64 0, [[TMP22]]
+; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[TMP23]], 0
+; CHECK-NEXT: [[TMP14:%.*]] = udiv i64 [[TMP13]], 1
+; CHECK-NEXT: [[TMP26:%.*]] = mul i64 [[TMP14]], 1
+; CHECK-NEXT: [[TMP15:%.*]] = load i64, ptr addrspace(4) [[TMP21]], align 8
+; CHECK-NEXT: store i64 [[TMP26]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 1, i32 0), align 8
+; CHECK-NEXT: store i64 [[TMP15]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 1, i32 1), align 8
+; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[TMP26]], [[TMP15]]
+; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[TMP16]], 0
+; CHECK-NEXT: [[TMP18:%.*]] = udiv i64 [[TMP17]], 1
+; CHECK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 1
+; CHECK-NEXT: [[TMP6:%.*]] = call ptr addrspace(1) @malloc(i64 [[TMP19]])
; CHECK-NEXT: store ptr addrspace(1) [[TMP6]], ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, align 8
; CHECK-NEXT: br label [[TMP7]]
-; CHECK: 7:
+; CHECK: 19:
; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ]
; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, align 4
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, i32 [[TMP8]]
-; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 1, i32 0), align 4
+; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 1, i32 0), align 4
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, i32 [[TMP10]]
; CHECK-NEXT: store i8 7, ptr addrspace(3) [[TMP9]], align 4
-; CHECK-NEXT: store i32 8, ptr addrspace(3) [[TMP11]], align 8
+; CHECK-NEXT: store i8 8, ptr addrspace(3) [[TMP11]], align 8
; CHECK-NEXT: br label [[CONDFREE:%.*]]
; CHECK: CondFree:
; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
@@ -48,10 +56,6 @@ define amdgpu_kernel void @k0() {
; CHECK-NEXT: ret void
;
store i8 7, ptr addrspace(3) @lds_1, align 4
- store i32 8, ptr addrspace(3) @lds_2, align 8
+ store i8 8, ptr addrspace(3) @lds_2, align 8
ret void
}
-;.
-; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
-; CHECK: attributes #[[ATTR1:[0-9]+]] = { convergent nocallback nofree nounwind willreturn }
-;.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-multi-static-dynamic-indirect-access.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-multi-static-dynamic-indirect-access.ll
index cb5132770d7c1b..db557003dfe5e5 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-multi-static-dynamic-indirect-access.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-multi-static-dynamic-indirect-access.ll
@@ -2,13 +2,10 @@
; RUN: opt < %s -passes=amdgpu-sw-lower-lds -S -mtriple=amdgcn-- | FileCheck %s
; Test to check that when multiple kernels access the same non-kernel, LDS accesses are lowered correctly.
-
-target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
-target triple = "amdgcn-unknown-unknown"
-@lds_1 = internal unnamed_addr addrspace(3) global [1 x i8] poison, align 1
-@lds_2 = internal unnamed_addr addrspace(3) global [1 x i32] poison, align 2
-@lds_3 = external unnamed_addr addrspace(3) global [0 x i8], align 4
-@lds_4 = external unnamed_addr addrspace(3) global [0 x i8], align 8
+@lds_1 = internal addrspace(3) global [1 x i8] poison, align 1
+@lds_2 = internal addrspace(3) global [1 x i32] poison, align 2
+@lds_3 = external addrspace(3) global [0 x i8], align 4
+@lds_4 = external addrspace(3) global [0 x i8], align 8
define void @use_variables_1() {
; CHECK-LABEL: define void @use_variables_1() {
@@ -83,9 +80,7 @@ define amdgpu_kernel void @k0() {
; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[TMP9]], 7
; CHECK-NEXT: [[TMP11:%.*]] = udiv i64 [[TMP10]], 8
; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 8
-; CHECK-NEXT: [[TMP13:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds ptr addrspace(4), ptr addrspace(4) [[TMP13]], i32 15
-; CHECK-NEXT: [[TMP15:%.*]] = load i64, ptr addrspace(4) [[TMP14]], align 8
+; CHECK-NEXT: [[TMP15:%.*]] = load i64, ptr addrspace(4) [[TMP7]], align 8
; CHECK-NEXT: store i64 [[TMP12]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 2, i32 0), align 8
; CHECK-NEXT: store i64 [[TMP15]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 2, i32 1), align 8
; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[TMP12]], [[TMP15]]
@@ -95,7 +90,7 @@ define amdgpu_kernel void @k0() {
; CHECK-NEXT: [[TMP20:%.*]] = call ptr addrspace(1) @malloc(i64 [[TMP19]])
; CHECK-NEXT: store ptr addrspace(1) [[TMP20]], ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, align 8
; CHECK-NEXT: br label [[TMP21]]
-; CHECK: 21:
+; CHECK: 19:
; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ]
; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
; CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, align 4
@@ -139,18 +134,14 @@ define amdgpu_kernel void @k1() {
; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[TMP9]], 7
; CHECK-NEXT: [[TMP11:%.*]] = udiv i64 [[TMP10]], 8
; CHECK-NEXT: [[TMP26:%.*]] = mul i64 [[TMP11]], 8
-; CHECK-NEXT: [[TMP27:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
-; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds ptr addrspace(4), ptr addrspace(4) [[TMP27]], i32 15
-; CHECK-NEXT: [[TMP29:%.*]] = load i64, ptr addrspace(4) [[TMP28]], align 8
+; CHECK-NEXT: [[TMP29:%.*]] = load i64, ptr addrspace(4) [[TMP7]], align 8
; CHECK-NEXT: store i64 [[TMP26]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K1_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k1.md, i32 0, i32 3, i32 0), align 8
; CHECK-NEXT: store i64 [[TMP29]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K1_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k1.md, i32 0, i32 3, i32 1), align 8
; CHECK-NEXT: [[TMP30:%.*]] = add i64 [[TMP26]], [[TMP29]]
; CHECK-NEXT: [[TMP33:%.*]] = add i64 [[TMP30]], 7
; CHECK-NEXT: [[TMP18:%.*]] = udiv i64 [[TMP33]], 8
; CHECK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 8
-; CHECK-NEXT: [[TMP20:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
-; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds ptr addrspace(4), ptr addrspace(4) [[TMP20]], i32 15
-; CHECK-NEXT: [[TMP22:%.*]] = load i64, ptr addrspace(4) [[TMP21]], align 8
+; CHECK-NEXT: [[TMP22:%.*]] = load i64, ptr addrspace(4) [[TMP7]], align 8
; CHECK-NEXT: store i64 [[TMP19]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K1_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k1.md, i32 0, i32 4, i32 0), align 8
; CHECK-NEXT: store i64 [[TMP22]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K1_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k1.md, i32 0, i32 4, i32 1), align 8
; CHECK-NEXT: [[TMP23:%.*]] = add i64 [[TMP19]], [[TMP22]]
@@ -160,7 +151,7 @@ define amdgpu_kernel void @k1() {
; CHECK-NEXT: [[TMP13:%.*]] = call ptr addrspace(1) @malloc(i64 [[TMP12]])
; CHECK-NEXT: store ptr addrspace(1) [[TMP13]], ptr addrspace(3) @llvm.amdgcn.sw.lds.k1, align 8
; CHECK-NEXT: br label [[TMP14]]
-; CHECK: 28:
+; CHECK: 24:
; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ]
; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
; CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K1_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k1.md, i32 0, i32 3, i32 0), align 4
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-multiple-blocks-return.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-multiple-blocks-return.ll
index 9b92999392c388..43cc4f8b945ca8 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-multiple-blocks-return.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-multiple-blocks-return.ll
@@ -4,11 +4,8 @@
; Test to check that malloc and free blocks are placed correctly when multiple
; blocks and branching are present in the function, with LDS accesses lowered correctly.
-target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
-target triple = "amdgcn-unknown-unknown"
-
-@lds_1 = internal unnamed_addr addrspace(3) global i32 poison
-@lds_2 = internal unnamed_addr addrspace(3) global i32 poison
+@lds_1 = internal addrspace(3) global i32 poison
+@lds_2 = internal addrspace(3) global i32 poison
define amdgpu_kernel void @test_kernel() {
; CHECK-LABEL: define amdgpu_kernel void @test_kernel() {
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-dynamic-indirect-access.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-dynamic-indirect-access.ll
index c7e548f031cbd7..460106b08551bb 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-dynamic-indirect-access.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-dynamic-indirect-access.ll
@@ -3,13 +3,10 @@
; Test to check if static and dynamic LDS accesses are lowered correctly when a non-kernel
; is called from a kernel.
-
-target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
-target triple = "amdgcn-unknown-unknown"
-@lds_1 = internal unnamed_addr addrspace(3) global [1 x i8] poison, align 1
-@lds_2 = internal unnamed_addr addrspace(3) global [1 x i32] poison, align 2
-@lds_3 = external unnamed_addr addrspace(3) global [0 x i8], align 4
-@lds_4 = external unnamed_addr addrspace(3) global [0 x i8], align 8
+@lds_1 = internal addrspace(3) global [1 x i8] poison, align 1
+@lds_2 = internal addrspace(3) global [1 x i32] poison, align 2
+@lds_3 = external addrspace(3) global [0 x i8], align 4
+@lds_4 = external addrspace(3) global [0 x i8], align 8
define void @use_variables() {
; CHECK-LABEL: define void @use_variables() {
@@ -58,9 +55,7 @@ define amdgpu_kernel void @k0() {
; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[TMP9]], 7
; CHECK-NEXT: [[TMP11:%.*]] = udiv i64 [[TMP10]], 8
; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 8
-; CHECK-NEXT: [[TMP13:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds ptr addrspace(4), ptr addrspace(4) [[TMP13]], i32 15
-; CHECK-NEXT: [[TMP15:%.*]] = load i64, ptr addrspace(4) [[TMP14]], align 8
+; CHECK-NEXT: [[TMP15:%.*]] = load i64, ptr addrspace(4) [[TMP7]], align 8
; CHECK-NEXT: store i64 [[TMP12]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 3, i32 0), align 8
; CHECK-NEXT: store i64 [[TMP15]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 3, i32 1), align 8
; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[TMP12]], [[TMP15]]
@@ -70,7 +65,7 @@ define amdgpu_kernel void @k0() {
; CHECK-NEXT: [[TMP20:%.*]] = call ptr addrspace(1) @malloc(i64 [[TMP19]])
; CHECK-NEXT: store ptr addrspace(1) [[TMP20]], ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, align 8
; CHECK-NEXT: br label [[TMP21]]
-; CHECK: 21:
+; CHECK: 19:
; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ]
; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
; CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-dynamic-lds-test.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-dynamic-lds-test.ll
index 7b4c70a1fa1120..2fad006b0f22d8 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-dynamic-lds-test.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-dynamic-lds-test.ll
@@ -2,19 +2,16 @@
; RUN: opt < %s -passes=amdgpu-sw-lower-lds -S -mtriple=amdgcn-- | FileCheck %s
; Test to check if static and dynamic LDS accesses are lowered correctly in a kernel.
-
-target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
-target triple = "amdgcn-unknown-unknown"
-@lds_1 = internal unnamed_addr addrspace(3) global [1 x i8] poison, align 4
-@lds_2 = internal unnamed_addr addrspace(3) global [1 x i32] poison, align 8
-@lds_3 = external unnamed_addr addrspace(3) global [0 x i8], align 4
-@lds_4 = external unnamed_addr addrspace(3) global [0 x i8], align 8
+@lds_1 = internal addrspace(3) global [1 x i8] poison, align 4
+@lds_2 = internal addrspace(3) global [1 x i32] poison, align 8
+@lds_3 = external addrspace(3) global [0 x i8], align 4
+@lds_4 = external addrspace(3) global [0 x i8], align 8
;.
-; CHECK: @lds_1 = internal unnamed_addr addrspace(3) global [1 x i8] poison, align 4
-; CHECK: @lds_2 = internal unnamed_addr addrspace(3) global [1 x i32] poison, align 8
-; CHECK: @lds_3 = external unnamed_addr addrspace(3) global [0 x i8], align 4
-; CHECK: @lds_4 = external unnamed_addr addrspace(3) global [0 x i8], align 8
+; CHECK: @lds_1 = internal addrspace(3) global [1 x i8] poison, align 4
+; CHECK: @lds_2 = internal addrspace(3) global [1 x i32] poison, align 8
+; CHECK: @lds_3 = external addrspace(3) global [0 x i8], align 4
+; CHECK: @lds_4 = external addrspace(3) global [0 x i8], align 8
; CHECK: @llvm.amdgcn.sw.lds.k0 = internal addrspace(3) global ptr poison
; CHECK: @llvm.amdgcn.sw.lds.k0.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.k0.md.type { %llvm.amdgcn.sw.lds.k0.md.item { i32 0, i32 1 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 8, i32 4 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 16, i32 0 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 16, i32 0 } }, no_sanitize_address, align 8
;.
@@ -38,9 +35,7 @@ define amdgpu_kernel void @k0() {
; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[TMP9]], 7
; CHECK-NEXT: [[TMP11:%.*]] = udiv i64 [[TMP10]], 8
; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 8
-; CHECK-NEXT: [[TMP13:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds ptr addrspace(4), ptr addrspace(4) [[TMP13]], i32 15
-; CHECK-NEXT: [[TMP15:%.*]] = load i64, ptr addrspace(4) [[TMP14]], align 8
+; CHECK-NEXT: [[TMP15:%.*]] = load i64, ptr addrspace(4) [[TMP7]], align 8
; CHECK-NEXT: store i64 [[TMP12]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 3, i32 0), align 8
; CHECK-NEXT: store i64 [[TMP15]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 3, i32 1), align 8
; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[TMP12]], [[TMP15]]
@@ -50,7 +45,7 @@ define amdgpu_kernel void @k0() {
; CHECK-NEXT: [[TMP20:%.*]] = call ptr addrspace(1) @malloc(i64 [[TMP19]])
; CHECK-NEXT: store ptr addrspace(1) [[TMP20]], ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, align 8
; CHECK-NEXT: br label [[TMP21]]
-; CHECK: 21:
+; CHECK: 19:
; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ]
; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
; CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-function-param.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-function-param.ll
index c5a9ddde58504a..c35834b3d1a5d1 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-function-param.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-function-param.ll
@@ -4,9 +4,6 @@
; Test to check if LDS accesses are lowered correctly when LDS is passed as a function
; argument to a non-kernel function.
-target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
-target triple = "amdgcn-unknown-unknown"
-
@lds_var = internal addrspace(3) global [1024 x i32] poison, align 4
define void @my_function(ptr addrspace(3) %lds_arg) {
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access.ll
index 535ee62bee00d6..5743862aa51a03 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access.ll
@@ -2,12 +2,10 @@
; RUN: opt < %s -passes=amdgpu-sw-lower-lds -S -mtriple=amdgcn-- | FileCheck %s
; Test to check if static LDS is lowered correctly when a non-kernel function with LDS accesses is called from a kernel.
-target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
-target triple = "amdgcn-unknown-unknown"
-@lds_1 = internal unnamed_addr addrspace(3) global [1 x i8] poison, align 1
-@lds_2 = internal unnamed_addr addrspace(3) global [1 x i32] poison, align 2
-@lds_3 = external unnamed_addr addrspace(3) global [3 x i8], align 4
-@lds_4 = external unnamed_addr addrspace(3) global [4 x i8], align 8
+@lds_1 = internal addrspace(3) global [1 x i8] poison, align 1
+@lds_2 = internal addrspace(3) global [1 x i32] poison, align 2
+@lds_3 = external addrspace(3) global [3 x i8], align 4
+@lds_4 = external addrspace(3) global [4 x i8], align 8
define void @use_variables() {
; CHECK-LABEL: define void @use_variables() {
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-lds-test.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-lds-test.ll
index 5904ab8167dd12..6703a74cb99112 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-lds-test.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-lds-test.ll
@@ -2,15 +2,12 @@
; RUN: opt < %s -passes=amdgpu-sw-lower-lds -S -mtriple=amdgcn-- | FileCheck %s
; Test to check if static LDS accesses in a kernel are lowered correctly.
-
-target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
-target triple = "amdgcn-unknown-unknown"
-@lds_1 = internal unnamed_addr addrspace(3) global [1 x i8] poison, align 4
-@lds_2 = internal unnamed_addr addrspace(3) global [1 x i32] poison, align 8
+@lds_1 = internal addrspace(3) global [1 x i8] poison, align 4
+@lds_2 = internal addrspace(3) global [1 x i32] poison, align 8
;.
-; CHECK: @lds_1 = internal unnamed_addr addrspace(3) global [1 x i8] poison, align 4
-; CHECK: @lds_2 = internal unnamed_addr addrspace(3) global [1 x i32] poison, align 8
+; CHECK: @lds_1 = internal addrspace(3) global [1 x i8] poison, align 4
+; CHECK: @lds_2 = internal addrspace(3) global [1 x i32] poison, align 8
; CHECK: @llvm.amdgcn.sw.lds.k0 = internal addrspace(3) global ptr poison
; CHECK: @llvm.amdgcn.sw.lds.k0.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.k0.md.type { %llvm.amdgcn.sw.lds.k0.md.item { i32 0, i32 1 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 8, i32 4 } }, no_sanitize_address, align 8
;.
>From ce56b0ce70a711ce0ff2862401a46cb428bf6f2a Mon Sep 17 00:00:00 2001
From: skc7 <Krishna.Sankisa at amd.com>
Date: Tue, 9 Apr 2024 17:07:08 +0530
Subject: [PATCH 03/11] [AMDGPU] Update patch based on review comments:2
---
.../AMDGPU/AMDGPULowerModuleLDSPass.cpp | 44 +---------
llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp | 9 +-
.../Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp | 83 +++++++++++++++++++
.../Target/AMDGPU/Utils/AMDGPUMemoryUtils.h | 5 ++
4 files changed, 96 insertions(+), 45 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
index 2c7163a7753725..625ac0230f1606 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -862,48 +862,6 @@ class AMDGPULowerModuleLDS {
return N;
}
- /// Strip "amdgpu-no-lds-kernel-id" from any functions where we may have
- /// introduced its use. If AMDGPUAttributor ran prior to the pass, we inferred
- /// the lack of llvm.amdgcn.lds.kernel.id calls.
- void removeNoLdsKernelIdFromReachable(CallGraph &CG, Function *KernelRoot) {
- KernelRoot->removeFnAttr("amdgpu-no-lds-kernel-id");
-
- SmallVector<Function *> WorkList({CG[KernelRoot]->getFunction()});
- SmallPtrSet<Function *, 8> Visited;
- bool SeenUnknownCall = false;
-
- while (!WorkList.empty()) {
- Function *F = WorkList.pop_back_val();
-
- for (auto &CallRecord : *CG[F]) {
- if (!CallRecord.second)
- continue;
-
- Function *Callee = CallRecord.second->getFunction();
- if (!Callee) {
- if (!SeenUnknownCall) {
- SeenUnknownCall = true;
-
- // If we see any indirect calls, assume nothing about potential
- // targets.
- // TODO: This could be refined to possible LDS global users.
- for (auto &ExternalCallRecord : *CG.getExternalCallingNode()) {
- Function *PotentialCallee =
- ExternalCallRecord.second->getFunction();
- assert(PotentialCallee);
- if (!isKernelLDS(PotentialCallee))
- PotentialCallee->removeFnAttr("amdgpu-no-lds-kernel-id");
- }
- }
- } else {
- Callee->removeFnAttr("amdgpu-no-lds-kernel-id");
- if (Visited.insert(Callee).second)
- WorkList.push_back(Callee);
- }
- }
- }
- }
-
DenseMap<Function *, GlobalVariable *> lowerDynamicLDSVariables(
Module &M, LDSUsesInfoTy &LDSUsesInfo,
DenseSet<Function *> const &KernelsThatIndirectlyAllocateDynamicLDS,
@@ -1059,7 +1017,7 @@ class AMDGPULowerModuleLDS {
//
// TODO: We could filter out subgraphs that do not access LDS globals.
for (Function *F : KernelsThatAllocateTableLDS)
- removeNoLdsKernelIdFromReachable(CG, F);
+ removeFnAttrFromReachable(CG, F, "amdgpu-no-lds-kernel-id");
}
DenseMap<Function *, GlobalVariable *> KernelToCreatedDynamicLDS =
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp
index 6de02b178af9a7..55452c84b4afbd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp
@@ -303,10 +303,12 @@ void AMDGPUSwLowerLDS::populateMallocMetadataGlobal(Function *Func) {
StructType *MetadataStructType =
StructType::create(Ctx, Items, MDTypeOS.str());
+ SmallString<128> MDStr;
+ raw_svector_ostream MDOS(MDStr);
+ MDOS << "llvm.amdgcn.sw.lds." << Func->getName().str() << ".md";
LDSParams.MallocMetadataGlobal = new GlobalVariable(
M, MetadataStructType, false, GlobalValue::InternalLinkage,
- PoisonValue::get(MetadataStructType),
- ("llvm.amdgcn.sw.lds." + Func->getName().str() + ".md"), nullptr,
+ PoisonValue::get(MetadataStructType), MDOS.str(), nullptr,
GlobalValue::NotThreadLocal, AMDGPUAS::GLOBAL_ADDRESS, false);
Constant *data = ConstantStruct::get(MetadataStructType, Initializers);
LDSParams.MallocMetadataGlobal->setInitializer(data);
@@ -759,6 +761,9 @@ bool AMDGPUSwLowerLDS::run() {
LDSParams.IndirectAccess.DynamicLDSGlobals.empty()) {
Changed = false;
} else {
+ removeFnAttrFromReachable(CG, Func, "amdgpu-no-workitem-id-x");
+ removeFnAttrFromReachable(CG, Func, "amdgpu-no-workitem-id-y");
+ removeFnAttrFromReachable(CG, Func, "amdgpu-no-workitem-id-z");
reorderStaticDynamicIndirectLDSSet(LDSParams);
populateMallocLDSGlobal(Func);
populateMallocMetadataGlobal(Func);
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp
index 26b3819f7fd566..1fbf8e3fb7d061 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp
@@ -241,6 +241,89 @@ LDSUsesInfoTy getTransitiveUsesOfLDS(CallGraph const &CG, Module &M) {
return {std::move(direct_map_kernel), std::move(indirect_map_kernel)};
}
+
+void removeFnAttrFromReachable(CallGraph &CG, Function *KernelRoot,
+ StringRef FnAttr) {
+ KernelRoot->removeFnAttr(FnAttr);
+
+ SmallVector<Function *> WorkList({CG[KernelRoot]->getFunction()});
+ SmallPtrSet<Function *, 8> Visited;
+ bool SeenUnknownCall = false;
+
+ while (!WorkList.empty()) {
+ Function *F = WorkList.pop_back_val();
+
+ for (auto &CallRecord : *CG[F]) {
+ if (!CallRecord.second)
+ continue;
+
+ Function *Callee = CallRecord.second->getFunction();
+ if (!Callee) {
+ if (!SeenUnknownCall) {
+ SeenUnknownCall = true;
+
+ // If we see any indirect calls, assume nothing about potential
+ // targets.
+ // TODO: This could be refined to possible LDS global users.
+ for (auto &ExternalCallRecord : *CG.getExternalCallingNode()) {
+ Function *PotentialCallee =
+ ExternalCallRecord.second->getFunction();
+ assert(PotentialCallee);
+ if (!isKernelLDS(PotentialCallee))
+ PotentialCallee->removeFnAttr(FnAttr);
+ }
+ }
+ } else {
+ Callee->removeFnAttr(FnAttr);
+ if (Visited.insert(Callee).second)
+ WorkList.push_back(Callee);
+ }
+ }
+ }
+}
+
bool isReallyAClobber(const Value *Ptr, MemoryDef *Def, AAResults *AA) {
Instruction *DefInst = Def->getMemoryInst();
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h
index a199b927a28b6c..1df9c28e65888c 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h
@@ -51,6 +51,11 @@ bool isKernelLDS(const Function *F);
LDSUsesInfoTy getTransitiveUsesOfLDS(CallGraph const &CG, Module &M);
+/// Strip the attribute \p FnAttr from any functions reachable from
+/// \p KernelRoot where we may have introduced its use.
+void removeFnAttrFromReachable(CallGraph &CG, Function *KernelRoot,
+ StringRef FnAttr);
+
/// Given a \p Def clobbering a load from \p Ptr according to the MSSA check
/// if this is actually a memory update or an artificial clobber to facilitate
/// ordering constraints.
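For intuition, here is a standalone C++ sketch (not part of the patch) of the
worklist traversal removeFnAttrFromReachable performs, over a toy call graph;
the real code additionally strips every non-kernel target of the external
calling node once an indirect call is seen:

  #include <map>
  #include <set>
  #include <string>
  #include <vector>

  int main() {
    // Toy call graph: kernel -> A -> B; C is unreachable from the kernel.
    std::map<std::string, std::vector<std::string>> CallGraph = {
        {"kernel", {"A"}}, {"A", {"B"}}, {"B", {}}, {"C", {}}};
    std::set<std::string> HasAttr = {"kernel", "A", "B", "C"};

    HasAttr.erase("kernel"); // the kernel root is stripped first
    std::vector<std::string> WorkList = {"kernel"};
    std::set<std::string> Visited;
    while (!WorkList.empty()) {
      std::string F = WorkList.back();
      WorkList.pop_back();
      for (const std::string &Callee : CallGraph[F]) {
        HasAttr.erase(Callee); // strip the attribute from a reachable callee
        if (Visited.insert(Callee).second)
          WorkList.push_back(Callee);
      }
    }
    // HasAttr now holds only "C": functions unreachable from the kernel
    // keep the attribute.
    return 0;
  }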
>From 2776a7e2a9019af3614b07acf2cef9a59d3fbae1 Mon Sep 17 00:00:00 2001
From: skc7 <Krishna.Sankisa at amd.com>
Date: Sun, 14 Apr 2024 00:01:05 +0530
Subject: [PATCH 04/11] [AMDGPU] Calculate Malloc size from metadata global
when dynamic LDS accesses are present
---
llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp | 97 ++++++++++++-------
...pu-sw-lower-lds-dynamic-indirect-access.ll | 27 +++---
.../amdgpu-sw-lower-lds-dynamic-lds-test.ll | 20 ++--
...ds-multi-static-dynamic-indirect-access.ll | 64 ++++++------
...gpu-sw-lower-lds-multiple-blocks-return.ll | 9 +-
...ower-lds-static-dynamic-indirect-access.ll | 27 +++---
...pu-sw-lower-lds-static-dynamic-lds-test.ll | 31 +++---
...s-static-indirect-access-function-param.ll | 7 +-
...lower-lds-static-indirect-access-nested.ll | 34 ++++---
...gpu-sw-lower-lds-static-indirect-access.ll | 9 +-
.../amdgpu-sw-lower-lds-static-lds-test.ll | 13 ++-
11 files changed, 201 insertions(+), 137 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp
index 55452c84b4afbd..aa5bd27b9624dc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp
@@ -29,15 +29,16 @@
// It is of struct type, with n members. n equals the number of LDS
// globals accessed by the kernel(direct and indirect). Each member of
// struct is another struct of type {i32, i32}. First member corresponds
-// to offset, second member corresponds to size of LDS global being
-// replaced. It will have name "llvm.amdgcn.sw.lds.<kernel-name>.md".
-// This global will have an intializer with static LDS related offsets
-// and sizes initialized. But for dynamic LDS related entries, offsets
-// will be intialized to previous static LDS allocation end offset. Sizes
-// for them will be zero initially. These dynamic LDS offset and size
-// values will be updated with in the kernel, since kernel can read the
-// dynamic LDS size allocation done at runtime with query to
-// "hidden_dynamic_lds_size" hidden kernel argument.
+// to offset, the second member to the aligned size of the LDS global
+// being replaced. It will have the name
+// "llvm.amdgcn.sw.lds.<kernel-name>.md". This global will have an
+// initializer with static LDS related offsets and sizes initialized. But
+// for dynamic LDS related entries, offsets will be initialized to the
+// previous static LDS allocation's end offset. Sizes for them will be
+// zero initially. These dynamic LDS offset and size values will be
+// updated within the kernel, since the kernel can read the dynamic LDS
+// size allocated at runtime with a query to the
+// "hidden_dynamic_lds_size" hidden kernel argument.
//
// LDS accesses within the kernel will be replaced by "gep" ptr to
// corresponding offset into allocated device global memory for the kernel.
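For illustration, a minimal standalone C++ sketch (not part of the patch) of
the {offset, aligned-size} bookkeeping the initializer encodes, using
hypothetical static LDS sizes of 1 and 4 bytes with a maximum alignment of 8;
roundUp mirrors llvm::alignTo for power-of-two alignments:

  #include <cstdint>
  #include <cstdio>
  #include <vector>

  static uint64_t roundUp(uint64_t Size, uint64_t Align) {
    return (Size + Align - 1) / Align * Align;
  }

  int main() {
    struct MDItem { uint32_t Offset, AlignedSize; }; // one {i32, i32} entry
    std::vector<uint64_t> StaticSizes = {1, 4}; // e.g. [1 x i8], [1 x i32]
    uint64_t MaxAlign = 8, MallocSize = 0;
    std::vector<MDItem> Items;
    for (uint64_t Size : StaticSizes) {
      uint64_t Aligned = roundUp(Size, MaxAlign);
      Items.push_back({(uint32_t)MallocSize, (uint32_t)Aligned});
      MallocSize += Aligned; // next global starts after this aligned slot
    }
    // Prints "{ i32 0, i32 8 }" and "{ i32 8, i32 8 }", the initializer
    // shape seen in the static-LDS tests below.
    for (const MDItem &I : Items)
      std::printf("{ i32 %u, i32 %u }\n", I.Offset, I.AlignedSize);
    return 0;
  }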
@@ -267,7 +268,7 @@ void AMDGPUSwLowerLDS::populateMallocMetadataGlobal(Function *Func) {
for (GlobalVariable *GV : LDSParams.IndirectAccess.DynamicLDSGlobals)
UpdateMaxAlignment(GV);
- //{StartOffset, SizeInBytes}
+ //{StartOffset, AlignedSizeInBytes}
SmallString<128> MDItemStr;
raw_svector_ostream MDItemOS(MDItemStr);
MDItemOS << "llvm.amdgcn.sw.lds." << Func->getName().str() << ".md.item";
@@ -283,11 +284,12 @@ void AMDGPUSwLowerLDS::populateMallocMetadataGlobal(Function *Func) {
Items.push_back(LDSItemTy);
Constant *ItemStartOffset =
ConstantInt::get(Int32Ty, LDSParams.MallocSize);
- Constant *SizeInBytesConst = ConstantInt::get(Int32Ty, SizeInBytes);
uint64_t AlignedSize = alignTo(SizeInBytes, MaxAlignment);
+ Constant *AlignedSizeInBytesConst =
+ ConstantInt::get(Int32Ty, AlignedSize);
LDSParams.MallocSize += AlignedSize;
- Constant *InitItem =
- ConstantStruct::get(LDSItemTy, {ItemStartOffset, SizeInBytesConst});
+ Constant *InitItem = ConstantStruct::get(
+ LDSItemTy, {ItemStartOffset, AlignedSizeInBytesConst});
Initializers.push_back(InitItem);
}
};
@@ -312,7 +314,9 @@ void AMDGPUSwLowerLDS::populateMallocMetadataGlobal(Function *Func) {
GlobalValue::NotThreadLocal, AMDGPUAS::GLOBAL_ADDRESS, false);
Constant *data = ConstantStruct::get(MetadataStructType, Initializers);
LDSParams.MallocMetadataGlobal->setInitializer(data);
- LDSParams.MallocMetadataGlobal->setAlignment(MaxAlignment);
+ assert(LDSParams.MallocLDSGlobal);
+ // Set the alignment to MaxAlignment for MallocLDSGlobal.
+ LDSParams.MallocLDSGlobal->setAlignment(MaxAlignment);
GlobalValue::SanitizerMetadata MD;
MD.NoAddress = true;
LDSParams.MallocMetadataGlobal->setSanitizerMetadata(MD);
@@ -420,8 +424,9 @@ void AMDGPUSwLowerLDS::lowerKernelLDSAccesses(Function *Func,
auto *const XYZOr = IRB.CreateOr(XYOr, WIdz);
auto *const WIdzCond = IRB.CreateICmpEQ(XYZOr, IRB.getInt32(0));
+ GlobalVariable *MallocLDSGlobal = LDSParams.MallocLDSGlobal;
GlobalVariable *MallocMetadataGlobal = LDSParams.MallocMetadataGlobal;
- assert(MallocMetadataGlobal);
+ assert(MallocLDSGlobal && MallocMetadataGlobal);
StructType *MetadataStructType =
cast<StructType>(MallocMetadataGlobal->getValueType());
@@ -436,38 +441,67 @@ void AMDGPUSwLowerLDS::lowerKernelLDSAccesses(Function *Func,
// Get the size of dyn lds from hidden dyn_lds_size kernel arg.
// Update the corresponding metadata global entries for this dyn lds global.
uint32_t MallocSize = LDSParams.MallocSize;
- Value *CurrMallocSize = IRB.getInt64(MallocSize);
- if (!LDSParams.DirectAccess.DynamicLDSGlobals.empty() ||
- !LDSParams.IndirectAccess.DynamicLDSGlobals.empty()) {
- unsigned MaxAlignment = MallocMetadataGlobal->getAlignment();
+ Value *CurrMallocSize;
+
+ unsigned NumStaticLDS = LDSParams.DirectAccess.StaticLDSGlobals.size() +
+ LDSParams.IndirectAccess.StaticLDSGlobals.size();
+ unsigned NumDynLDS = LDSParams.DirectAccess.DynamicLDSGlobals.size() +
+ LDSParams.IndirectAccess.DynamicLDSGlobals.size();
+
+ if (NumStaticLDS) {
+ auto *GEPForEndStaticLDSOffset = IRB.CreateInBoundsGEP(
+ MetadataStructType, MallocMetadataGlobal,
+ {IRB.getInt32(0), IRB.getInt32(NumStaticLDS - 1), IRB.getInt32(0)});
+
+ auto *GEPForEndStaticLDSSize = IRB.CreateInBoundsGEP(
+ MetadataStructType, MallocMetadataGlobal,
+ {IRB.getInt32(0), IRB.getInt32(NumStaticLDS - 1), IRB.getInt32(1)});
+
+ Value *EndStaticLDSOffset =
+ IRB.CreateLoad(IRB.getInt64Ty(), GEPForEndStaticLDSOffset);
+ Value *EndStaticLDSSize =
+ IRB.CreateLoad(IRB.getInt64Ty(), GEPForEndStaticLDSSize);
+ CurrMallocSize = IRB.CreateAdd(EndStaticLDSOffset, EndStaticLDSSize);
+  } else {
+    CurrMallocSize = IRB.getInt64(MallocSize);
+  }
+
+ if (NumDynLDS) {
+ unsigned MaxAlignment = MallocLDSGlobal->getAlignment();
Value *MaxAlignValue = IRB.getInt64(MaxAlignment);
Value *MaxAlignValueMinusOne = IRB.getInt64(MaxAlignment - 1);
+
Value *ImplicitArg =
IRB.CreateIntrinsic(Intrinsic::amdgcn_implicitarg_ptr, {}, {});
Value *HiddenDynLDSSize = IRB.CreateInBoundsGEP(
ImplicitArg->getType(), ImplicitArg, {IRB.getInt32(15)});
+
auto MallocSizeCalcLambda =
[&](SetVector<GlobalVariable *> &DynamicLDSGlobals) {
for (GlobalVariable *DynGV : DynamicLDSGlobals) {
auto &Indices = LDSParams.LDSToReplacementIndicesMap[DynGV];
- // Get size from hidden dyn_lds_size argument of kernel int
- // CurrDynLDSSize
- Value *CurrDynLDSSize =
- IRB.CreateLoad(IRB.getInt64Ty(), HiddenDynLDSSize);
+
+ // Update the Offset metadata.
auto *GEPForOffset = IRB.CreateInBoundsGEP(
MetadataStructType, MallocMetadataGlobal,
{IRB.getInt32(0), IRB.getInt32(Indices[1]), IRB.getInt32(0)});
IRB.CreateStore(CurrMallocSize, GEPForOffset);
+          // Get the size from the hidden dyn_lds_size kernel argument.
+          // Update the Aligned Size metadata.
auto *GEPForSize = IRB.CreateInBoundsGEP(
MetadataStructType, MallocMetadataGlobal,
{IRB.getInt32(0), IRB.getInt32(Indices[1]), IRB.getInt32(1)});
- IRB.CreateStore(CurrDynLDSSize, GEPForSize);
- CurrMallocSize = IRB.CreateAdd(CurrMallocSize, CurrDynLDSSize);
- CurrMallocSize =
- IRB.CreateAdd(CurrMallocSize, MaxAlignValueMinusOne);
- CurrMallocSize = IRB.CreateUDiv(CurrMallocSize, MaxAlignValue);
- CurrMallocSize = IRB.CreateMul(CurrMallocSize, MaxAlignValue);
+ Value *CurrDynLDSSize =
+ IRB.CreateLoad(IRB.getInt64Ty(), HiddenDynLDSSize);
+ Value *AlignedDynLDSSize =
+ IRB.CreateAdd(CurrDynLDSSize, MaxAlignValueMinusOne);
+ AlignedDynLDSSize =
+ IRB.CreateUDiv(AlignedDynLDSSize, MaxAlignValue);
+ AlignedDynLDSSize = IRB.CreateMul(AlignedDynLDSSize, MaxAlignValue);
+ IRB.CreateStore(AlignedDynLDSSize, GEPForSize);
+
+ // Update the Current Malloc Size
+ CurrMallocSize = IRB.CreateAdd(CurrMallocSize, AlignedDynLDSSize);
}
};
MallocSizeCalcLambda(LDSParams.DirectAccess.DynamicLDSGlobals);
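A standalone C++ sketch (not part of the patch) of the running-size update the
emitted IR now performs per dynamic LDS global; the values are hypothetical,
and HiddenDynLDSSize stands in for the load through
llvm.amdgcn.implicitarg.ptr:

  #include <cstdint>
  #include <cstdio>

  int main() {
    // Running total starts at the end of the static allocation: the last
    // static entry's offset plus its aligned size, loaded from metadata.
    uint64_t CurrMallocSize = 8 + 8;  // hypothetical static layout
    uint64_t HiddenDynLDSSize = 13;   // hypothetical runtime dyn-LDS size
    uint64_t MaxAlign = 8;

    // Record the offset, round the dynamic size up to MaxAlign, store it
    // as the aligned size, then advance the running total.
    uint64_t Offset = CurrMallocSize;
    uint64_t AlignedDynSize =
        (HiddenDynLDSSize + MaxAlign - 1) / MaxAlign * MaxAlign; // 16
    CurrMallocSize += AlignedDynSize; // 32, the eventual malloc argument

    std::printf("offset=%llu aligned=%llu total=%llu\n",
                (unsigned long long)Offset,
                (unsigned long long)AlignedDynSize,
                (unsigned long long)CurrMallocSize);
    return 0;
  }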
@@ -481,10 +515,7 @@ void AMDGPUSwLowerLDS::lowerKernelLDSAccesses(Function *Func,
FunctionType::get(IRB.getPtrTy(1), {IRB.getInt64Ty()}, false));
Value *MCI = IRB.CreateCall(AMDGPUMallocReturn, {CurrMallocSize});
- GlobalVariable *MallocLDSGlobal = LDSParams.MallocLDSGlobal;
- assert(MallocLDSGlobal);
-
- // create load of malloc to new global
+  // Create store of the malloc result to the new global
IRB.CreateStore(MCI, MallocLDSGlobal);
// Create branch to PrevEntryBlock
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-dynamic-indirect-access.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-dynamic-indirect-access.ll
index ca38b0d07801e0..b154bfea786305 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-dynamic-indirect-access.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-dynamic-indirect-access.ll
@@ -45,26 +45,29 @@ define amdgpu_kernel void @k0() {
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP21:%.*]]
; CHECK: Malloc:
+; CHECK-NEXT: [[TMP13:%.*]] = load i64, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 1, i32 0), align 8
+; CHECK-NEXT: [[TMP14:%.*]] = load i64, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 1, i32 1), align 8
+; CHECK-NEXT: [[TMP27:%.*]] = add i64 [[TMP13]], [[TMP14]]
; CHECK-NEXT: [[TMP6:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds ptr addrspace(4), ptr addrspace(4) [[TMP6]], i32 15
-; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr addrspace(4) [[TMP7]], align 8
-; CHECK-NEXT: store i64 16, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 2, i32 0), align 8
+; CHECK-NEXT: store i64 [[TMP27]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 2, i32 0), align 8
+; CHECK-NEXT: [[TMP11:%.*]] = load i64, ptr addrspace(4) [[TMP7]], align 8
+; CHECK-NEXT: [[TMP19:%.*]] = add i64 [[TMP11]], 7
+; CHECK-NEXT: [[TMP28:%.*]] = udiv i64 [[TMP19]], 8
+; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP28]], 8
; CHECK-NEXT: store i64 [[TMP8]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 2, i32 1), align 8
-; CHECK-NEXT: [[TMP9:%.*]] = add i64 16, [[TMP8]]
-; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[TMP9]], 7
-; CHECK-NEXT: [[TMP11:%.*]] = udiv i64 [[TMP10]], 8
-; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 8
-; CHECK-NEXT: [[TMP15:%.*]] = load i64, ptr addrspace(4) [[TMP7]], align 8
+; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[TMP27]], [[TMP8]]
; CHECK-NEXT: store i64 [[TMP12]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 3, i32 0), align 8
+; CHECK-NEXT: [[TMP29:%.*]] = load i64, ptr addrspace(4) [[TMP7]], align 8
+; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[TMP29]], 7
+; CHECK-NEXT: [[TMP18:%.*]] = udiv i64 [[TMP17]], 8
+; CHECK-NEXT: [[TMP15:%.*]] = mul i64 [[TMP18]], 8
; CHECK-NEXT: store i64 [[TMP15]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 3, i32 1), align 8
; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[TMP12]], [[TMP15]]
-; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[TMP16]], 7
-; CHECK-NEXT: [[TMP18:%.*]] = udiv i64 [[TMP17]], 8
-; CHECK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 8
-; CHECK-NEXT: [[TMP20:%.*]] = call ptr addrspace(1) @malloc(i64 [[TMP19]])
+; CHECK-NEXT: [[TMP20:%.*]] = call ptr addrspace(1) @malloc(i64 [[TMP16]])
; CHECK-NEXT: store ptr addrspace(1) [[TMP20]], ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, align 8
; CHECK-NEXT: br label [[TMP21]]
-; CHECK: 19:
+; CHECK: 22:
; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ]
; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
; CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-dynamic-lds-test.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-dynamic-lds-test.ll
index eb0f869c6a5d66..f6f3fed2f6f5fe 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-dynamic-lds-test.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-dynamic-lds-test.ll
@@ -18,20 +18,20 @@ define amdgpu_kernel void @k0() {
; CHECK: Malloc:
; CHECK-NEXT: [[TMP20:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds ptr addrspace(4), ptr addrspace(4) [[TMP20]], i32 15
-; CHECK-NEXT: [[TMP22:%.*]] = load i64, ptr addrspace(4) [[TMP21]], align 8
; CHECK-NEXT: store i64 0, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, align 8
+; CHECK-NEXT: [[TMP16:%.*]] = load i64, ptr addrspace(4) [[TMP21]], align 8
+; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[TMP16]], 0
+; CHECK-NEXT: [[TMP18:%.*]] = udiv i64 [[TMP17]], 1
+; CHECK-NEXT: [[TMP22:%.*]] = mul i64 [[TMP18]], 1
; CHECK-NEXT: store i64 [[TMP22]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 0, i32 1), align 8
; CHECK-NEXT: [[TMP23:%.*]] = add i64 0, [[TMP22]]
-; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[TMP23]], 0
-; CHECK-NEXT: [[TMP14:%.*]] = udiv i64 [[TMP13]], 1
-; CHECK-NEXT: [[TMP26:%.*]] = mul i64 [[TMP14]], 1
-; CHECK-NEXT: [[TMP15:%.*]] = load i64, ptr addrspace(4) [[TMP21]], align 8
-; CHECK-NEXT: store i64 [[TMP26]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 1, i32 0), align 8
+; CHECK-NEXT: store i64 [[TMP23]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 1, i32 0), align 8
+; CHECK-NEXT: [[TMP13:%.*]] = load i64, ptr addrspace(4) [[TMP21]], align 8
+; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[TMP13]], 0
+; CHECK-NEXT: [[TMP24:%.*]] = udiv i64 [[TMP14]], 1
+; CHECK-NEXT: [[TMP15:%.*]] = mul i64 [[TMP24]], 1
; CHECK-NEXT: store i64 [[TMP15]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 1, i32 1), align 8
-; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[TMP26]], [[TMP15]]
-; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[TMP16]], 0
-; CHECK-NEXT: [[TMP18:%.*]] = udiv i64 [[TMP17]], 1
-; CHECK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 1
+; CHECK-NEXT: [[TMP19:%.*]] = add i64 [[TMP23]], [[TMP15]]
; CHECK-NEXT: [[TMP6:%.*]] = call ptr addrspace(1) @malloc(i64 [[TMP19]])
; CHECK-NEXT: store ptr addrspace(1) [[TMP6]], ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, align 8
; CHECK-NEXT: br label [[TMP7]]
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-multi-static-dynamic-indirect-access.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-multi-static-dynamic-indirect-access.ll
index db557003dfe5e5..fd45f3d8c346eb 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-multi-static-dynamic-indirect-access.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-multi-static-dynamic-indirect-access.ll
@@ -71,26 +71,29 @@ define amdgpu_kernel void @k0() {
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP21:%.*]]
; CHECK: Malloc:
+; CHECK-NEXT: [[TMP13:%.*]] = load i64, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, align 8
+; CHECK-NEXT: [[TMP14:%.*]] = load i64, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 0, i32 1), align 8
+; CHECK-NEXT: [[TMP25:%.*]] = add i64 [[TMP13]], [[TMP14]]
; CHECK-NEXT: [[TMP6:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds ptr addrspace(4), ptr addrspace(4) [[TMP6]], i32 15
-; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr addrspace(4) [[TMP7]], align 8
-; CHECK-NEXT: store i64 8, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 1, i32 0), align 8
+; CHECK-NEXT: store i64 [[TMP25]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 1, i32 0), align 8
+; CHECK-NEXT: [[TMP11:%.*]] = load i64, ptr addrspace(4) [[TMP7]], align 8
+; CHECK-NEXT: [[TMP19:%.*]] = add i64 [[TMP11]], 7
+; CHECK-NEXT: [[TMP26:%.*]] = udiv i64 [[TMP19]], 8
+; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP26]], 8
; CHECK-NEXT: store i64 [[TMP8]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 1, i32 1), align 8
-; CHECK-NEXT: [[TMP9:%.*]] = add i64 8, [[TMP8]]
-; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[TMP9]], 7
-; CHECK-NEXT: [[TMP11:%.*]] = udiv i64 [[TMP10]], 8
-; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 8
-; CHECK-NEXT: [[TMP15:%.*]] = load i64, ptr addrspace(4) [[TMP7]], align 8
+; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[TMP25]], [[TMP8]]
; CHECK-NEXT: store i64 [[TMP12]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 2, i32 0), align 8
+; CHECK-NEXT: [[TMP27:%.*]] = load i64, ptr addrspace(4) [[TMP7]], align 8
+; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[TMP27]], 7
+; CHECK-NEXT: [[TMP18:%.*]] = udiv i64 [[TMP17]], 8
+; CHECK-NEXT: [[TMP15:%.*]] = mul i64 [[TMP18]], 8
; CHECK-NEXT: store i64 [[TMP15]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 2, i32 1), align 8
; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[TMP12]], [[TMP15]]
-; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[TMP16]], 7
-; CHECK-NEXT: [[TMP18:%.*]] = udiv i64 [[TMP17]], 8
-; CHECK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 8
-; CHECK-NEXT: [[TMP20:%.*]] = call ptr addrspace(1) @malloc(i64 [[TMP19]])
+; CHECK-NEXT: [[TMP20:%.*]] = call ptr addrspace(1) @malloc(i64 [[TMP16]])
; CHECK-NEXT: store ptr addrspace(1) [[TMP20]], ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, align 8
; CHECK-NEXT: br label [[TMP21]]
-; CHECK: 19:
+; CHECK: 22:
; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ]
; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
; CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, align 4
@@ -125,33 +128,36 @@ define amdgpu_kernel void @k1() {
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP14:%.*]]
; CHECK: Malloc:
+; CHECK-NEXT: [[TMP20:%.*]] = load i64, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K1_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k1.md, i32 0, i32 1, i32 0), align 8
+; CHECK-NEXT: [[TMP21:%.*]] = load i64, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K1_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k1.md, i32 0, i32 1, i32 1), align 8
+; CHECK-NEXT: [[TMP27:%.*]] = add i64 [[TMP20]], [[TMP21]]
; CHECK-NEXT: [[TMP6:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds ptr addrspace(4), ptr addrspace(4) [[TMP6]], i32 15
-; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr addrspace(4) [[TMP7]], align 8
-; CHECK-NEXT: store i64 16, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K1_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k1.md, i32 0, i32 3, i32 0), align 8
+; CHECK-NEXT: store i64 [[TMP27]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K1_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k1.md, i32 0, i32 3, i32 0), align 8
+; CHECK-NEXT: [[TMP11:%.*]] = load i64, ptr addrspace(4) [[TMP7]], align 8
+; CHECK-NEXT: [[TMP19:%.*]] = add i64 [[TMP11]], 7
+; CHECK-NEXT: [[TMP24:%.*]] = udiv i64 [[TMP19]], 8
+; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP24]], 8
; CHECK-NEXT: store i64 [[TMP8]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K1_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k1.md, i32 0, i32 3, i32 1), align 8
-; CHECK-NEXT: [[TMP9:%.*]] = add i64 16, [[TMP8]]
-; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[TMP9]], 7
-; CHECK-NEXT: [[TMP11:%.*]] = udiv i64 [[TMP10]], 8
-; CHECK-NEXT: [[TMP26:%.*]] = mul i64 [[TMP11]], 8
-; CHECK-NEXT: [[TMP29:%.*]] = load i64, ptr addrspace(4) [[TMP7]], align 8
+; CHECK-NEXT: [[TMP26:%.*]] = add i64 [[TMP27]], [[TMP8]]
; CHECK-NEXT: store i64 [[TMP26]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K1_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k1.md, i32 0, i32 3, i32 0), align 8
+; CHECK-NEXT: [[TMP25:%.*]] = load i64, ptr addrspace(4) [[TMP7]], align 8
+; CHECK-NEXT: [[TMP28:%.*]] = add i64 [[TMP25]], 7
+; CHECK-NEXT: [[TMP18:%.*]] = udiv i64 [[TMP28]], 8
+; CHECK-NEXT: [[TMP29:%.*]] = mul i64 [[TMP18]], 8
; CHECK-NEXT: store i64 [[TMP29]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K1_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k1.md, i32 0, i32 3, i32 1), align 8
; CHECK-NEXT: [[TMP30:%.*]] = add i64 [[TMP26]], [[TMP29]]
-; CHECK-NEXT: [[TMP33:%.*]] = add i64 [[TMP30]], 7
-; CHECK-NEXT: [[TMP18:%.*]] = udiv i64 [[TMP33]], 8
-; CHECK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 8
-; CHECK-NEXT: [[TMP22:%.*]] = load i64, ptr addrspace(4) [[TMP7]], align 8
-; CHECK-NEXT: store i64 [[TMP19]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K1_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k1.md, i32 0, i32 4, i32 0), align 8
+; CHECK-NEXT: store i64 [[TMP30]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K1_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k1.md, i32 0, i32 4, i32 0), align 8
+; CHECK-NEXT: [[TMP33:%.*]] = load i64, ptr addrspace(4) [[TMP7]], align 8
+; CHECK-NEXT: [[TMP34:%.*]] = add i64 [[TMP33]], 7
+; CHECK-NEXT: [[TMP23:%.*]] = udiv i64 [[TMP34]], 8
+; CHECK-NEXT: [[TMP22:%.*]] = mul i64 [[TMP23]], 8
; CHECK-NEXT: store i64 [[TMP22]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K1_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k1.md, i32 0, i32 4, i32 1), align 8
-; CHECK-NEXT: [[TMP23:%.*]] = add i64 [[TMP19]], [[TMP22]]
-; CHECK-NEXT: [[TMP24:%.*]] = add i64 [[TMP23]], 7
-; CHECK-NEXT: [[TMP25:%.*]] = udiv i64 [[TMP24]], 8
-; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP25]], 8
+; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[TMP30]], [[TMP22]]
; CHECK-NEXT: [[TMP13:%.*]] = call ptr addrspace(1) @malloc(i64 [[TMP12]])
; CHECK-NEXT: store ptr addrspace(1) [[TMP13]], ptr addrspace(3) @llvm.amdgcn.sw.lds.k1, align 8
; CHECK-NEXT: br label [[TMP14]]
-; CHECK: 24:
+; CHECK: 27:
; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ]
; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
; CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K1_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k1.md, i32 0, i32 3, i32 0), align 4
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-multiple-blocks-return.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-multiple-blocks-return.ll
index 43cc4f8b945ca8..101c3af4ff9676 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-multiple-blocks-return.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-multiple-blocks-return.ll
@@ -18,15 +18,18 @@ define amdgpu_kernel void @test_kernel() {
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP7:%.*]]
; CHECK: Malloc:
-; CHECK-NEXT: [[TMP6:%.*]] = call ptr addrspace(1) @malloc(i64 8)
+; CHECK-NEXT: [[TMP15:%.*]] = load i64, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_TEST_KERNEL_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.test_kernel.md, i32 0, i32 1, i32 0), align 8
+; CHECK-NEXT: [[TMP16:%.*]] = load i64, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_TEST_KERNEL_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.test_kernel.md, i32 0, i32 1, i32 1), align 8
+; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[TMP15]], [[TMP16]]
+; CHECK-NEXT: [[TMP6:%.*]] = call ptr addrspace(1) @malloc(i64 [[TMP17]])
; CHECK-NEXT: store ptr addrspace(1) [[TMP6]], ptr addrspace(3) @llvm.amdgcn.sw.lds.test_kernel, align 8
; CHECK-NEXT: br label [[TMP7]]
-; CHECK: 7:
+; CHECK: 10:
; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ]
; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(1) @llvm.amdgcn.sw.lds.test_kernel.md, align 4
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) @llvm.amdgcn.sw.lds.test_kernel, i32 [[TMP8]]
-; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_TEST_KERNEL_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.test_kernel.md, i32 0, i32 1, i32 0), align 4
+; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_TEST_KERNEL_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.test_kernel.md, i32 0, i32 1, i32 0), align 4
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) @llvm.amdgcn.sw.lds.test_kernel, i32 [[TMP10]]
; CHECK-NEXT: [[TMP12:%.*]] = addrspacecast ptr addrspace(3) [[TMP9]] to ptr addrspace(1)
; CHECK-NEXT: [[VAL1:%.*]] = load i32, ptr addrspace(1) [[TMP12]], align 4
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-dynamic-indirect-access.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-dynamic-indirect-access.ll
index 460106b08551bb..a17fa9df89b323 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-dynamic-indirect-access.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-dynamic-indirect-access.ll
@@ -46,26 +46,29 @@ define amdgpu_kernel void @k0() {
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP21:%.*]]
; CHECK: Malloc:
+; CHECK-NEXT: [[TMP13:%.*]] = load i64, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 1, i32 0), align 8
+; CHECK-NEXT: [[TMP14:%.*]] = load i64, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 1, i32 1), align 8
+; CHECK-NEXT: [[TMP27:%.*]] = add i64 [[TMP13]], [[TMP14]]
; CHECK-NEXT: [[TMP6:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds ptr addrspace(4), ptr addrspace(4) [[TMP6]], i32 15
-; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr addrspace(4) [[TMP7]], align 8
-; CHECK-NEXT: store i64 16, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 2, i32 0), align 8
+; CHECK-NEXT: store i64 [[TMP27]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 2, i32 0), align 8
+; CHECK-NEXT: [[TMP11:%.*]] = load i64, ptr addrspace(4) [[TMP7]], align 8
+; CHECK-NEXT: [[TMP19:%.*]] = add i64 [[TMP11]], 7
+; CHECK-NEXT: [[TMP28:%.*]] = udiv i64 [[TMP19]], 8
+; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP28]], 8
; CHECK-NEXT: store i64 [[TMP8]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 2, i32 1), align 8
-; CHECK-NEXT: [[TMP9:%.*]] = add i64 16, [[TMP8]]
-; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[TMP9]], 7
-; CHECK-NEXT: [[TMP11:%.*]] = udiv i64 [[TMP10]], 8
-; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 8
-; CHECK-NEXT: [[TMP15:%.*]] = load i64, ptr addrspace(4) [[TMP7]], align 8
+; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[TMP27]], [[TMP8]]
; CHECK-NEXT: store i64 [[TMP12]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 3, i32 0), align 8
+; CHECK-NEXT: [[TMP29:%.*]] = load i64, ptr addrspace(4) [[TMP7]], align 8
+; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[TMP29]], 7
+; CHECK-NEXT: [[TMP18:%.*]] = udiv i64 [[TMP17]], 8
+; CHECK-NEXT: [[TMP15:%.*]] = mul i64 [[TMP18]], 8
; CHECK-NEXT: store i64 [[TMP15]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 3, i32 1), align 8
; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[TMP12]], [[TMP15]]
-; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[TMP16]], 7
-; CHECK-NEXT: [[TMP18:%.*]] = udiv i64 [[TMP17]], 8
-; CHECK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 8
-; CHECK-NEXT: [[TMP20:%.*]] = call ptr addrspace(1) @malloc(i64 [[TMP19]])
+; CHECK-NEXT: [[TMP20:%.*]] = call ptr addrspace(1) @malloc(i64 [[TMP16]])
; CHECK-NEXT: store ptr addrspace(1) [[TMP20]], ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, align 8
; CHECK-NEXT: br label [[TMP21]]
-; CHECK: 19:
+; CHECK: 22:
; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ]
; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
; CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-dynamic-lds-test.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-dynamic-lds-test.ll
index 2fad006b0f22d8..90100b57a0c291 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-dynamic-lds-test.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-dynamic-lds-test.ll
@@ -12,8 +12,8 @@
; CHECK: @lds_2 = internal addrspace(3) global [1 x i32] poison, align 8
; CHECK: @lds_3 = external addrspace(3) global [0 x i8], align 4
; CHECK: @lds_4 = external addrspace(3) global [0 x i8], align 8
-; CHECK: @llvm.amdgcn.sw.lds.k0 = internal addrspace(3) global ptr poison
-; CHECK: @llvm.amdgcn.sw.lds.k0.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.k0.md.type { %llvm.amdgcn.sw.lds.k0.md.item { i32 0, i32 1 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 8, i32 4 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 16, i32 0 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 16, i32 0 } }, no_sanitize_address, align 8
+; CHECK: @llvm.amdgcn.sw.lds.k0 = internal addrspace(3) global ptr poison, align 8
+; CHECK: @llvm.amdgcn.sw.lds.k0.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.k0.md.type { %llvm.amdgcn.sw.lds.k0.md.item { i32 0, i32 8 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 8, i32 8 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 16, i32 0 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 16, i32 0 } }, no_sanitize_address
;.
define amdgpu_kernel void @k0() {
; CHECK-LABEL: define amdgpu_kernel void @k0() {
@@ -26,26 +26,29 @@ define amdgpu_kernel void @k0() {
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP21:%.*]]
; CHECK: Malloc:
+; CHECK-NEXT: [[TMP13:%.*]] = load i64, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 1, i32 0), align 8
+; CHECK-NEXT: [[TMP14:%.*]] = load i64, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 1, i32 1), align 8
+; CHECK-NEXT: [[TMP31:%.*]] = add i64 [[TMP13]], [[TMP14]]
; CHECK-NEXT: [[TMP6:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds ptr addrspace(4), ptr addrspace(4) [[TMP6]], i32 15
-; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr addrspace(4) [[TMP7]], align 8
-; CHECK-NEXT: store i64 16, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 2, i32 0), align 8
+; CHECK-NEXT: store i64 [[TMP31]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 2, i32 0), align 8
+; CHECK-NEXT: [[TMP11:%.*]] = load i64, ptr addrspace(4) [[TMP7]], align 8
+; CHECK-NEXT: [[TMP19:%.*]] = add i64 [[TMP11]], 7
+; CHECK-NEXT: [[TMP32:%.*]] = udiv i64 [[TMP19]], 8
+; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP32]], 8
; CHECK-NEXT: store i64 [[TMP8]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 2, i32 1), align 8
-; CHECK-NEXT: [[TMP9:%.*]] = add i64 16, [[TMP8]]
-; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[TMP9]], 7
-; CHECK-NEXT: [[TMP11:%.*]] = udiv i64 [[TMP10]], 8
-; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 8
-; CHECK-NEXT: [[TMP15:%.*]] = load i64, ptr addrspace(4) [[TMP7]], align 8
+; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[TMP31]], [[TMP8]]
; CHECK-NEXT: store i64 [[TMP12]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 3, i32 0), align 8
+; CHECK-NEXT: [[TMP33:%.*]] = load i64, ptr addrspace(4) [[TMP7]], align 8
+; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[TMP33]], 7
+; CHECK-NEXT: [[TMP18:%.*]] = udiv i64 [[TMP17]], 8
+; CHECK-NEXT: [[TMP15:%.*]] = mul i64 [[TMP18]], 8
; CHECK-NEXT: store i64 [[TMP15]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 3, i32 1), align 8
; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[TMP12]], [[TMP15]]
-; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[TMP16]], 7
-; CHECK-NEXT: [[TMP18:%.*]] = udiv i64 [[TMP17]], 8
-; CHECK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 8
-; CHECK-NEXT: [[TMP20:%.*]] = call ptr addrspace(1) @malloc(i64 [[TMP19]])
+; CHECK-NEXT: [[TMP20:%.*]] = call ptr addrspace(1) @malloc(i64 [[TMP16]])
; CHECK-NEXT: store ptr addrspace(1) [[TMP20]], ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, align 8
; CHECK-NEXT: br label [[TMP21]]
-; CHECK: 19:
+; CHECK: 22:
; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ]
; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
; CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-function-param.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-function-param.ll
index c35834b3d1a5d1..214fa7f1a9cb00 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-function-param.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-function-param.ll
@@ -31,10 +31,13 @@ define amdgpu_kernel void @my_kernel() {
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP7:%.*]]
; CHECK: Malloc:
-; CHECK-NEXT: [[TMP6:%.*]] = call ptr addrspace(1) @malloc(i64 4096)
+; CHECK-NEXT: [[TMP11:%.*]] = load i64, ptr addrspace(1) @llvm.amdgcn.sw.lds.my_kernel.md, align 8
+; CHECK-NEXT: [[TMP12:%.*]] = load i64, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_MY_KERNEL_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.my_kernel.md, i32 0, i32 0, i32 1), align 8
+; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[TMP11]], [[TMP12]]
+; CHECK-NEXT: [[TMP6:%.*]] = call ptr addrspace(1) @malloc(i64 [[TMP13]])
; CHECK-NEXT: store ptr addrspace(1) [[TMP6]], ptr addrspace(3) @llvm.amdgcn.sw.lds.my_kernel, align 8
; CHECK-NEXT: br label [[TMP7]]
-; CHECK: 7:
+; CHECK: 10:
; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ]
; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(1) @llvm.amdgcn.sw.lds.my_kernel.md, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-nested.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-nested.ll
index 0a34427dd08f8f..47188f252eee11 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-nested.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-nested.ll
@@ -18,10 +18,13 @@ define amdgpu_kernel void @kernel_0() {
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP7:%.*]]
; CHECK: Malloc:
-; CHECK-NEXT: [[TMP6:%.*]] = call ptr addrspace(1) @malloc(i64 64)
+; CHECK-NEXT: [[TMP9:%.*]] = load i64, ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_0.md, align 8
+; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_0_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_0.md, i32 0, i32 0, i32 1), align 8
+; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[TMP9]], [[TMP10]]
+; CHECK-NEXT: [[TMP6:%.*]] = call ptr addrspace(1) @malloc(i64 [[TMP11]])
; CHECK-NEXT: store ptr addrspace(1) [[TMP6]], ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_0, align 8
; CHECK-NEXT: br label [[TMP7]]
-; CHECK: 7:
+; CHECK: 10:
; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ]
; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
; CHECK-NEXT: call void @call_store_A()
@@ -54,14 +57,14 @@ define amdgpu_kernel void @kernel_1() {
; CHECK: Malloc:
; CHECK-NEXT: [[TMP6:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds ptr addrspace(4), ptr addrspace(4) [[TMP6]], i32 15
-; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr addrspace(4) [[TMP7]], align 8
; CHECK-NEXT: store i64 0, ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_1.md, align 8
+; CHECK-NEXT: [[TMP11:%.*]] = load i64, ptr addrspace(4) [[TMP7]], align 8
+; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[TMP11]], 3
+; CHECK-NEXT: [[TMP10:%.*]] = udiv i64 [[TMP12]], 4
+; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP10]], 4
; CHECK-NEXT: store i64 [[TMP8]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_1_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_1.md, i32 0, i32 0, i32 1), align 8
; CHECK-NEXT: [[TMP9:%.*]] = add i64 0, [[TMP8]]
-; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[TMP9]], 3
-; CHECK-NEXT: [[TMP11:%.*]] = udiv i64 [[TMP10]], 4
-; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 4
-; CHECK-NEXT: [[TMP13:%.*]] = call ptr addrspace(1) @malloc(i64 [[TMP12]])
+; CHECK-NEXT: [[TMP13:%.*]] = call ptr addrspace(1) @malloc(i64 [[TMP9]])
; CHECK-NEXT: store ptr addrspace(1) [[TMP13]], ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_1, align 8
; CHECK-NEXT: br label [[TMP14]]
; CHECK: 14:
@@ -95,10 +98,13 @@ define amdgpu_kernel void @kernel_2() {
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP7:%.*]]
; CHECK: Malloc:
-; CHECK-NEXT: [[TMP6:%.*]] = call ptr addrspace(1) @malloc(i64 64)
+; CHECK-NEXT: [[TMP9:%.*]] = load i64, ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_2.md, align 8
+; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_2_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_2.md, i32 0, i32 0, i32 1), align 8
+; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[TMP9]], [[TMP10]]
+; CHECK-NEXT: [[TMP6:%.*]] = call ptr addrspace(1) @malloc(i64 [[TMP11]])
; CHECK-NEXT: store ptr addrspace(1) [[TMP6]], ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_2, align 8
; CHECK-NEXT: br label [[TMP7]]
-; CHECK: 7:
+; CHECK: 10:
; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ]
; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
; CHECK-NEXT: call void @store_A()
@@ -131,14 +137,14 @@ define amdgpu_kernel void @kernel_3() {
; CHECK: Malloc:
; CHECK-NEXT: [[TMP6:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds ptr addrspace(4), ptr addrspace(4) [[TMP6]], i32 15
-; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr addrspace(4) [[TMP7]], align 8
; CHECK-NEXT: store i64 0, ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_3.md, align 8
+; CHECK-NEXT: [[TMP11:%.*]] = load i64, ptr addrspace(4) [[TMP7]], align 8
+; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[TMP11]], 3
+; CHECK-NEXT: [[TMP10:%.*]] = udiv i64 [[TMP12]], 4
+; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP10]], 4
; CHECK-NEXT: store i64 [[TMP8]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_3_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_3.md, i32 0, i32 0, i32 1), align 8
; CHECK-NEXT: [[TMP9:%.*]] = add i64 0, [[TMP8]]
-; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[TMP9]], 3
-; CHECK-NEXT: [[TMP11:%.*]] = udiv i64 [[TMP10]], 4
-; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 4
-; CHECK-NEXT: [[TMP13:%.*]] = call ptr addrspace(1) @malloc(i64 [[TMP12]])
+; CHECK-NEXT: [[TMP13:%.*]] = call ptr addrspace(1) @malloc(i64 [[TMP9]])
; CHECK-NEXT: store ptr addrspace(1) [[TMP13]], ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_3, align 8
; CHECK-NEXT: br label [[TMP14]]
; CHECK: 14:
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access.ll
index 5743862aa51a03..aa93f76e4b5463 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access.ll
@@ -48,15 +48,18 @@ define amdgpu_kernel void @k0() {
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP7:%.*]]
; CHECK: Malloc:
-; CHECK-NEXT: [[TMP6:%.*]] = call ptr addrspace(1) @malloc(i64 32)
+; CHECK-NEXT: [[TMP13:%.*]] = load i64, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 3, i32 0), align 8
+; CHECK-NEXT: [[TMP14:%.*]] = load i64, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 3, i32 1), align 8
+; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[TMP13]], [[TMP14]]
+; CHECK-NEXT: [[TMP6:%.*]] = call ptr addrspace(1) @malloc(i64 [[TMP15]])
; CHECK-NEXT: store ptr addrspace(1) [[TMP6]], ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, align 8
; CHECK-NEXT: br label [[TMP7]]
-; CHECK: 7:
+; CHECK: 10:
; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ]
; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, align 4
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, i32 [[TMP8]]
-; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 1, i32 0), align 4
+; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 1, i32 0), align 4
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, i32 [[TMP10]]
; CHECK-NEXT: call void @use_variables()
; CHECK-NEXT: store i8 7, ptr addrspace(3) [[TMP9]], align 1
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-lds-test.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-lds-test.ll
index 6703a74cb99112..733f9cfab1e820 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-lds-test.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-lds-test.ll
@@ -8,8 +8,8 @@
;.
; CHECK: @lds_1 = internal addrspace(3) global [1 x i8] poison, align 4
; CHECK: @lds_2 = internal addrspace(3) global [1 x i32] poison, align 8
-; CHECK: @llvm.amdgcn.sw.lds.k0 = internal addrspace(3) global ptr poison
-; CHECK: @llvm.amdgcn.sw.lds.k0.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.k0.md.type { %llvm.amdgcn.sw.lds.k0.md.item { i32 0, i32 1 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 8, i32 4 } }, no_sanitize_address, align 8
+; CHECK: @llvm.amdgcn.sw.lds.k0 = internal addrspace(3) global ptr poison, align 8
+; CHECK: @llvm.amdgcn.sw.lds.k0.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.k0.md.type { %llvm.amdgcn.sw.lds.k0.md.item { i32 0, i32 8 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 8, i32 8 } }, no_sanitize_address
;.
define amdgpu_kernel void @k0() {
; CHECK-LABEL: define amdgpu_kernel void @k0() {
@@ -22,15 +22,18 @@ define amdgpu_kernel void @k0() {
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP7:%.*]]
; CHECK: Malloc:
-; CHECK-NEXT: [[TMP6:%.*]] = call ptr addrspace(1) @malloc(i64 16)
+; CHECK-NEXT: [[TMP13:%.*]] = load i64, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 1, i32 0), align 8
+; CHECK-NEXT: [[TMP14:%.*]] = load i64, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 1, i32 1), align 8
+; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[TMP13]], [[TMP14]]
+; CHECK-NEXT: [[TMP6:%.*]] = call ptr addrspace(1) @malloc(i64 [[TMP15]])
; CHECK-NEXT: store ptr addrspace(1) [[TMP6]], ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, align 8
; CHECK-NEXT: br label [[TMP7]]
-; CHECK: 7:
+; CHECK: 10:
; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ]
; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, align 4
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, i32 [[TMP8]]
-; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 1, i32 0), align 4
+; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 1, i32 0), align 4
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, i32 [[TMP10]]
; CHECK-NEXT: store i8 7, ptr addrspace(3) [[TMP9]], align 4
; CHECK-NEXT: store i32 8, ptr addrspace(3) [[TMP11]], align 2
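The updated CHECK lines above all follow one pattern: instead of a hard-coded malloc size, the prologue now loads the last metadata entry's offset and aligned size and adds them, and for dynamic LDS it rounds the hidden dyn_lds_size argument up to the kernel's maximum LDS alignment (the add 3 / udiv 4 / mul 4 sequence in kernel_1 and kernel_3). A minimal host-side sketch of that rounding, with a hypothetical helper name; the pass emits the same arithmetic as IRBuilder instructions:

#include <cstdint>

// Round Size up to the next multiple of Align. For Align == 4 this is
// exactly the (Size + 3) / 4 * 4 sequence visible in the CHECK lines.
static uint64_t alignUp(uint64_t Size, uint64_t Align) {
  return (Size + Align - 1) / Align * Align;
}
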
>From cd26f2819f480f4c1e2a0cbb02b78372c34acf53 Mon Sep 17 00:00:00 2001
From: skc7 <Krishna.Sankisa at amd.com>
Date: Mon, 15 Apr 2024 15:43:03 +0530
Subject: [PATCH 05/11] [AMDGPU] Rename malloc LDS globals to Sw LDS globals
---
llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp | 119 ++++++++++----------
1 file changed, 59 insertions(+), 60 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp
index aa5bd27b9624dc..75542100e2729d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp
@@ -16,13 +16,13 @@
// allocated. At the prologue of the kernel, a single work-item from the
// work-group does a "malloc" and stores the pointer of the allocation in
// a new LDS global that will be created for the kernel. This will be called
-// "malloc LDS global" in this pass.
+// "SW LDS" in this pass.
// Each LDS access corresponds to an offset in the allocated memory.
// All static LDS accesses will be allocated first and then dynamic LDS
// will occupy the device global memory.
// To store the offsets corresponding to all LDS accesses, another global
-// variable is created which will be called "metadata global" in this pass.
-// - Malloc LDS Global:
+// variable is created, which will be called "SW LDS metadata" in this pass.
+// - SW LDS Global:
// It is an LDS global of ptr type with the name
// "llvm.amdgcn.sw.lds.<kernel-name>".
// - Metadata Global:
@@ -53,7 +53,7 @@
// - Base table:
// Base table will have a single row, with elements of the row
// placed as per kernel ID. Each element in the row corresponds
-// to addresss of "malloc LDS global" variable created for
+// to addresss of "SW LDS" variable created for
// that kernel.
// - Offset table:
// Offset table will have multiple rows and columns.
@@ -64,9 +64,9 @@
// Each element in the row corresponds to the address of
// the replacement of the LDS global done by that particular kernel.
// An LDS variable in a non-kernel function is replaced based on the information
-// from base and offset tables. Based on kernel-id query, address of "malloc
-// LDS global" for that corresponding kernel is obtained from base table.
-// The Offset into the base "malloc LDS global" is obtained from
+// from base and offset tables. Based on the kernel-id query, the address
+// of the "SW LDS" for the corresponding kernel is obtained from the base
+// table. The offset into that "SW LDS" is obtained from the
// corresponding element in offset table. With this information, replacement
// value is obtained.
//===----------------------------------------------------------------------===//
@@ -112,8 +112,8 @@ struct LDSAccessTypeInfo {
// to replace LDS global uses with the corresponding offset
// into device global memory.
struct KernelLDSParameters {
- GlobalVariable *MallocLDSGlobal{nullptr};
- GlobalVariable *MallocMetadataGlobal{nullptr};
+ GlobalVariable *SwLDS{nullptr};
+ GlobalVariable *SwLDSMetadata{nullptr};
LDSAccessTypeInfo DirectAccess;
LDSAccessTypeInfo IndirectAccess;
DenseMap<GlobalVariable *, SmallVector<uint32_t, 3>>
@@ -142,8 +142,8 @@ class AMDGPUSwLowerLDS {
getOrderedIndirectLDSAccessingKernels(SetVector<Function *> &&Kernels);
SetVector<GlobalVariable *>
getOrderedNonKernelAllLDSGlobals(SetVector<GlobalVariable *> &&Variables);
- void populateMallocLDSGlobal(Function *Func);
- void populateMallocMetadataGlobal(Function *Func);
+ void populateSwLDSGlobal(Function *Func);
+ void populateSwMetadataGlobal(Function *Func);
void populateLDSToReplacementIndicesMap(Function *Func);
void replaceKernelLDSAccesses(Function *Func);
void lowerKernelLDSAccesses(Function *Func, DomTreeUpdater &DTU);
@@ -229,19 +229,19 @@ void AMDGPUSwLowerLDS::getUsesOfLDSByNonKernels(
}
}
-void AMDGPUSwLowerLDS::populateMallocLDSGlobal(Function *Func) {
+void AMDGPUSwLowerLDS::populateSwLDSGlobal(Function *Func) {
// Create the new LDS global required for each kernel to store
// the device global memory pointer.
auto &LDSParams = KernelToLDSParametersMap[Func];
// create new global pointer variable
- LDSParams.MallocLDSGlobal = new GlobalVariable(
+ LDSParams.SwLDS = new GlobalVariable(
M, IRB.getPtrTy(), false, GlobalValue::InternalLinkage,
PoisonValue::get(IRB.getPtrTy()), "llvm.amdgcn.sw.lds." + Func->getName(),
nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS, false);
return;
}
-void AMDGPUSwLowerLDS::populateMallocMetadataGlobal(Function *Func) {
+void AMDGPUSwLowerLDS::populateSwMetadataGlobal(Function *Func) {
// Create a new metadata global for every kernel and initialize the
// start offsets and sizes corresponding to each LDS access.
auto &LDSParams = KernelToLDSParametersMap[Func];
@@ -276,8 +276,8 @@ void AMDGPUSwLowerLDS::populateMallocMetadataGlobal(Function *Func) {
StructType *LDSItemTy =
StructType::create(Ctx, {Int32Ty, Int32Ty}, MDItemOS.str());
- auto buildInitializerForMallocMDGlobal = [&](SetVector<GlobalVariable *>
- &LDSGlobals) {
+ auto buildInitializerForSwLDSMD = [&](SetVector<GlobalVariable *>
+ &LDSGlobals) {
for (auto &GV : LDSGlobals) {
Type *Ty = GV->getValueType();
const uint64_t SizeInBytes = DL.getTypeAllocSize(Ty);
@@ -294,10 +294,10 @@ void AMDGPUSwLowerLDS::populateMallocMetadataGlobal(Function *Func) {
}
};
- buildInitializerForMallocMDGlobal(LDSParams.DirectAccess.StaticLDSGlobals);
- buildInitializerForMallocMDGlobal(LDSParams.IndirectAccess.StaticLDSGlobals);
- buildInitializerForMallocMDGlobal(LDSParams.DirectAccess.DynamicLDSGlobals);
- buildInitializerForMallocMDGlobal(LDSParams.IndirectAccess.DynamicLDSGlobals);
+ buildInitializerForSwLDSMD(LDSParams.DirectAccess.StaticLDSGlobals);
+ buildInitializerForSwLDSMD(LDSParams.IndirectAccess.StaticLDSGlobals);
+ buildInitializerForSwLDSMD(LDSParams.DirectAccess.DynamicLDSGlobals);
+ buildInitializerForSwLDSMD(LDSParams.IndirectAccess.DynamicLDSGlobals);
SmallString<128> MDTypeStr;
raw_svector_ostream MDTypeOS(MDTypeStr);
@@ -308,18 +308,18 @@ void AMDGPUSwLowerLDS::populateMallocMetadataGlobal(Function *Func) {
SmallString<128> MDStr;
raw_svector_ostream MDOS(MDStr);
MDOS << "llvm.amdgcn.sw.lds." << Func->getName().str() << ".md";
- LDSParams.MallocMetadataGlobal = new GlobalVariable(
+ LDSParams.SwLDSMetadata = new GlobalVariable(
M, MetadataStructType, false, GlobalValue::InternalLinkage,
PoisonValue::get(MetadataStructType), MDOS.str(), nullptr,
GlobalValue::NotThreadLocal, AMDGPUAS::GLOBAL_ADDRESS, false);
Constant *data = ConstantStruct::get(MetadataStructType, Initializers);
- LDSParams.MallocMetadataGlobal->setInitializer(data);
- assert(LDSParams.MallocLDSGlobal);
- // Set the alignment to MaxAlignment for MallocLDSGlobal.
- LDSParams.MallocLDSGlobal->setAlignment(MaxAlignment);
+ LDSParams.SwLDSMetadata->setInitializer(data);
+ assert(LDSParams.SwLDS);
+ // Set the alignment to MaxAlignment for SwLDS.
+ LDSParams.SwLDS->setAlignment(MaxAlignment);
GlobalValue::SanitizerMetadata MD;
MD.NoAddress = true;
- LDSParams.MallocMetadataGlobal->setSanitizerMetadata(MD);
+ LDSParams.SwLDSMetadata->setSanitizerMetadata(MD);
return;
}
@@ -360,12 +360,12 @@ static void replacesUsesOfGlobalInFunction(Function *Func, GlobalVariable *GV,
void AMDGPUSwLowerLDS::replaceKernelLDSAccesses(Function *Func) {
auto &LDSParams = KernelToLDSParametersMap[Func];
- GlobalVariable *MallocLDSGlobal = LDSParams.MallocLDSGlobal;
- assert(MallocLDSGlobal);
- GlobalVariable *MallocMetadataGlobal = LDSParams.MallocMetadataGlobal;
- assert(MallocMetadataGlobal);
- StructType *MallocMetadataStructType =
- cast<StructType>(MallocMetadataGlobal->getValueType());
+ GlobalVariable *SwLDS = LDSParams.SwLDS;
+ assert(SwLDS);
+ GlobalVariable *SwLDSMetadata = LDSParams.SwLDSMetadata;
+ assert(SwLDSMetadata);
+ StructType *SwLDSMetadataStructType =
+ cast<StructType>(SwLDSMetadata->getValueType());
Type *Int32Ty = IRB.getInt32Ty();
// Replace all uses of LDS global in this Function with a Replacement.
@@ -387,10 +387,10 @@ void AMDGPUSwLowerLDS::replaceKernelLDSAccesses(Function *Func) {
ConstantInt::get(Int32Ty, Idx1),
ConstantInt::get(Int32Ty, Idx2)};
Constant *GEP = ConstantExpr::getGetElementPtr(
- MallocMetadataStructType, MallocMetadataGlobal, GEPIdx, true);
+ SwLDSMetadataStructType, SwLDSMetadata, GEPIdx, true);
Value *Load = IRB.CreateLoad(Int32Ty, GEP);
Value *BasePlusOffset =
- IRB.CreateInBoundsGEP(GV->getType(), MallocLDSGlobal, {Load});
+ IRB.CreateInBoundsGEP(GV->getType(), SwLDS, {Load});
replacesUsesOfGlobalInFunction(Func, GV, BasePlusOffset);
}
};
@@ -424,11 +424,11 @@ void AMDGPUSwLowerLDS::lowerKernelLDSAccesses(Function *Func,
auto *const XYZOr = IRB.CreateOr(XYOr, WIdz);
auto *const WIdzCond = IRB.CreateICmpEQ(XYZOr, IRB.getInt32(0));
- GlobalVariable *MallocLDSGlobal = LDSParams.MallocLDSGlobal;
- GlobalVariable *MallocMetadataGlobal = LDSParams.MallocMetadataGlobal;
- assert(MallocLDSGlobal && MallocMetadataGlobal);
+ GlobalVariable *SwLDS = LDSParams.SwLDS;
+ GlobalVariable *SwLDSMetadata = LDSParams.SwLDSMetadata;
+ assert(SwLDS && SwLDSMetadata);
StructType *MetadataStructType =
- cast<StructType>(MallocMetadataGlobal->getValueType());
+ cast<StructType>(SwLDSMetadata->getValueType());
// All work items will branch to PrevEntryBlock except the {0,0,0}-index
// work item, which will branch to the malloc block.
@@ -450,11 +450,11 @@ void AMDGPUSwLowerLDS::lowerKernelLDSAccesses(Function *Func,
if (NumStaticLDS) {
auto *GEPForEndStaticLDSOffset = IRB.CreateInBoundsGEP(
- MetadataStructType, MallocMetadataGlobal,
+ MetadataStructType, SwLDSMetadata,
{IRB.getInt32(0), IRB.getInt32(NumStaticLDS - 1), IRB.getInt32(0)});
auto *GEPForEndStaticLDSSize = IRB.CreateInBoundsGEP(
- MetadataStructType, MallocMetadataGlobal,
+ MetadataStructType, SwLDSMetadata,
{IRB.getInt32(0), IRB.getInt32(NumStaticLDS - 1), IRB.getInt32(1)});
Value *EndStaticLDSOffset =
@@ -466,7 +466,7 @@ void AMDGPUSwLowerLDS::lowerKernelLDSAccesses(Function *Func,
CurrMallocSize = IRB.getInt64(MallocSize);
if (NumDynLDS) {
- unsigned MaxAlignment = MallocLDSGlobal->getAlignment();
+ unsigned MaxAlignment = SwLDS->getAlignment();
Value *MaxAlignValue = IRB.getInt64(MaxAlignment);
Value *MaxAlignValueMinusOne = IRB.getInt64(MaxAlignment - 1);
@@ -482,14 +482,14 @@ void AMDGPUSwLowerLDS::lowerKernelLDSAccesses(Function *Func,
// Update the Offset metadata.
auto *GEPForOffset = IRB.CreateInBoundsGEP(
- MetadataStructType, MallocMetadataGlobal,
+ MetadataStructType, SwLDSMetadata,
{IRB.getInt32(0), IRB.getInt32(Indices[1]), IRB.getInt32(0)});
IRB.CreateStore(CurrMallocSize, GEPForOffset);
// Get size from hidden dyn_lds_size argument of kernel
// Update the Aligned Size metadata.
auto *GEPForSize = IRB.CreateInBoundsGEP(
- MetadataStructType, MallocMetadataGlobal,
+ MetadataStructType, SwLDSMetadata,
{IRB.getInt32(0), IRB.getInt32(Indices[1]), IRB.getInt32(1)});
Value *CurrDynLDSSize =
IRB.CreateLoad(IRB.getInt64Ty(), HiddenDynLDSSize);
@@ -510,13 +510,13 @@ void AMDGPUSwLowerLDS::lowerKernelLDSAccesses(Function *Func,
// Create a call to the malloc function, which allocates device global
// memory of a size equal to the total size of all LDS accesses in this kernel.
- FunctionCallee AMDGPUMallocReturn = M.getOrInsertFunction(
+ FunctionCallee AMDGPUMallocFunc = M.getOrInsertFunction(
StringRef("malloc"),
FunctionType::get(IRB.getPtrTy(1), {IRB.getInt64Ty()}, false));
- Value *MCI = IRB.CreateCall(AMDGPUMallocReturn, {CurrMallocSize});
+ Value *MCI = IRB.CreateCall(AMDGPUMallocFunc, {CurrMallocSize});
// create store of malloc to new global
- IRB.CreateStore(MCI, MallocLDSGlobal);
+ IRB.CreateStore(MCI, SwLDS);
// Create branch to PrevEntryBlock
IRB.CreateBr(PrevEntryBlock);
@@ -558,7 +558,7 @@ void AMDGPUSwLowerLDS::lowerKernelLDSAccesses(Function *Func,
StringRef("free"),
FunctionType::get(IRB.getVoidTy(), {IRB.getPtrTy()}, false));
- Value *MallocPtr = IRB.CreateLoad(IRB.getPtrTy(), MallocLDSGlobal);
+ Value *MallocPtr = IRB.CreateLoad(IRB.getPtrTy(), SwLDS);
IRB.CreateCall(AMDGPUFreeReturn, {MallocPtr});
IRB.CreateBr(EndBlock);
@@ -579,10 +579,10 @@ Constant *AMDGPUSwLowerLDS::getAddressesOfVariablesInKernel(
Type *Int32Ty = Type::getInt32Ty(Ctx);
auto &LDSParams = KernelToLDSParametersMap[Func];
- GlobalVariable *MallocMetadataGlobal = LDSParams.MallocMetadataGlobal;
- assert(MallocMetadataGlobal);
- StructType *MallocMetadataStructType =
- cast<StructType>(MallocMetadataGlobal->getValueType());
+ GlobalVariable *SwLDSMetadata = LDSParams.SwLDSMetadata;
+ assert(SwLDSMetadata);
+ StructType *SwLDSMetadataStructType =
+ cast<StructType>(SwLDSMetadata->getValueType());
ArrayType *KernelOffsetsType = ArrayType::get(Int32Ty, Variables.size());
SmallVector<Constant *> Elements;
@@ -597,7 +597,7 @@ Constant *AMDGPUSwLowerLDS::getAddressesOfVariablesInKernel(
ConstantInt::get(Int32Ty, Idx1),
ConstantInt::get(Int32Ty, Idx2)};
Constant *GEP = ConstantExpr::getGetElementPtr(
- MallocMetadataStructType, MallocMetadataGlobal, GEPIdx, true);
+ SwLDSMetadataStructType, SwLDSMetadata, GEPIdx, true);
auto elt = ConstantExpr::getPtrToInt(GEP, Int32Ty);
Elements.push_back(elt);
} else
@@ -610,7 +610,7 @@ void AMDGPUSwLowerLDS::buildNonKernelLDSBaseTable(
NonKernelLDSParameters &NKLDSParams) {
// Base table will have a single row, with elements of the row
// placed as per kernel ID. Each element in the row corresponds
- // to addresss of malloc LDS global variable of the kernel.
+ // to addresss of "SW LDS" global of the kernel.
auto &Kernels = NKLDSParams.OrderedKernels;
LLVMContext &Ctx = M.getContext();
Type *Int32Ty = Type::getInt32Ty(Ctx);
@@ -620,11 +620,11 @@ void AMDGPUSwLowerLDS::buildNonKernelLDSBaseTable(
for (size_t i = 0; i < NumberKernels; i++) {
Function *Func = Kernels[i];
auto &LDSParams = KernelToLDSParametersMap[Func];
- GlobalVariable *MallocLDSGlobal = LDSParams.MallocLDSGlobal;
- assert(MallocLDSGlobal);
+ GlobalVariable *SwLDS = LDSParams.SwLDS;
+ assert(SwLDS);
Constant *GEPIdx[] = {ConstantInt::get(Int32Ty, 0)};
- Constant *GEP = ConstantExpr::getGetElementPtr(
- MallocLDSGlobal->getType(), MallocLDSGlobal, GEPIdx, true);
+ Constant *GEP =
+ ConstantExpr::getGetElementPtr(SwLDS->getType(), SwLDS, GEPIdx, true);
auto Elt = ConstantExpr::getPtrToInt(GEP, Int32Ty);
overallConstantExprElts[i] = Elt;
}
@@ -753,7 +753,6 @@ bool AMDGPUSwLowerLDS::run() {
for (auto &K : LDSAccesses) {
Function *F = K.first;
assert(isKernelLDS(F));
- assert(!K.second.empty());
if (!KernelToLDSParametersMap.contains(F)) {
KernelLDSParameters KernelLDSParams;
@@ -796,8 +795,8 @@ bool AMDGPUSwLowerLDS::run() {
removeFnAttrFromReachable(CG, Func, "amdgpu-no-workitem-id-y");
removeFnAttrFromReachable(CG, Func, "amdgpu-no-workitem-id-z");
reorderStaticDynamicIndirectLDSSet(LDSParams);
- populateMallocLDSGlobal(Func);
- populateMallocMetadataGlobal(Func);
+ populateSwLDSGlobal(Func);
+ populateSwMetadataGlobal(Func);
populateLDSToReplacementIndicesMap(Func);
DomTreeUpdater DTU(DTCallback(*Func),
DomTreeUpdater::UpdateStrategy::Lazy);
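To summarize the lookup scheme the renamed comments describe: a non-kernel function queries its kernel id, reads that kernel's "SW LDS" base pointer from the base table, loads the byte offset through the metadata-slot address stored in the offset table, and applies the offset to the base. A flattened host C++ sketch of the same three steps (names and flat-array shapes are illustrative only; the pass emits these as loads from two constant globals):

#include <cstdint>

// BaseTable[kernel-id]           -> that kernel's SW LDS base pointer.
// OffsetTable[kernel-id][global] -> address of the metadata slot that
//                                   holds the LDS global's byte offset.
static char *resolveLDSAddress(char **BaseTable, uint32_t ***OffsetTable,
                               uint32_t KernelId, uint32_t GlobalIdx) {
  char *SwLDSBase = BaseTable[KernelId];               // base table read
  uint32_t Offset = *OffsetTable[KernelId][GlobalIdx]; // load via slot address
  return SwLDSBase + Offset;                           // byte-offset gep
}
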
>From 4f0217679dc190fa4cfea3ff354b0b7e6588b574 Mon Sep 17 00:00:00 2001
From: skc7 <Krishna.Sankisa at amd.com>
Date: Wed, 17 Apr 2024 22:12:07 +0530
Subject: [PATCH 06/11] [AMDGPU] Use gep with i8 base type when replacing LDS
accesses
---
llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp | 4 ++--
.../amdgpu-sw-lower-lds-dynamic-indirect-access.ll | 8 ++++----
.../AMDGPU/amdgpu-sw-lower-lds-dynamic-lds-test.ll | 4 ++--
...wer-lds-multi-static-dynamic-indirect-access.ll | 14 +++++++-------
.../amdgpu-sw-lower-lds-multiple-blocks-return.ll | 4 ++--
...-sw-lower-lds-static-dynamic-indirect-access.ll | 8 ++++----
.../amdgpu-sw-lower-lds-static-dynamic-lds-test.ll | 8 ++++----
...er-lds-static-indirect-access-function-param.ll | 2 +-
...u-sw-lower-lds-static-indirect-access-nested.ll | 4 ++--
.../amdgpu-sw-lower-lds-static-indirect-access.ll | 8 ++++----
.../AMDGPU/amdgpu-sw-lower-lds-static-lds-test.ll | 4 ++--
11 files changed, 34 insertions(+), 34 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp
index 75542100e2729d..dd97fad206cd14 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp
@@ -390,7 +390,7 @@ void AMDGPUSwLowerLDS::replaceKernelLDSAccesses(Function *Func) {
SwLDSMetadataStructType, SwLDSMetadata, GEPIdx, true);
Value *Load = IRB.CreateLoad(Int32Ty, GEP);
Value *BasePlusOffset =
- IRB.CreateInBoundsGEP(GV->getType(), SwLDS, {Load});
+ IRB.CreateInBoundsGEP(IRB.getInt8Ty(), SwLDS, {Load});
replacesUsesOfGlobalInFunction(Func, GV, BasePlusOffset);
}
};
@@ -704,7 +704,7 @@ void AMDGPUSwLowerLDS::lowerNonKernelLDSAccesses(
OffsetLoad = IRB.CreateIntToPtr(OffsetLoad, GV->getType());
OffsetLoad = IRB.CreateLoad(IRB.getInt32Ty(), OffsetLoad);
Value *BasePlusOffset =
- IRB.CreateInBoundsGEP(GV->getType(), BasePtr, {OffsetLoad});
+ IRB.CreateInBoundsGEP(IRB.getInt8Ty(), BasePtr, {OffsetLoad});
replacesUsesOfGlobalInFunction(Func, GV, BasePlusOffset);
}
return;
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-dynamic-indirect-access.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-dynamic-indirect-access.ll
index b154bfea786305..f853debf392acc 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-dynamic-indirect-access.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-dynamic-indirect-access.ll
@@ -17,13 +17,13 @@ define void @use_variables() {
; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(4) [[TMP5]], align 4
; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i32 [[TMP6]] to ptr addrspace(3)
; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(3) [[TMP7]], align 4
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) [[TMP4]], i32 [[TMP8]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP4]], i32 [[TMP8]]
; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i32 [[TMP3]] to ptr addrspace(3)
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1 x [2 x i32]], ptr addrspace(4) @llvm.amdgcn.sw.lds.offset.table, i32 0, i32 [[TMP1]], i32 1
; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr addrspace(4) [[TMP11]], align 4
; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i32 [[TMP12]] to ptr addrspace(3)
; CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr addrspace(3) [[TMP13]], align 4
-; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) [[TMP10]], i32 [[TMP14]]
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP10]], i32 [[TMP14]]
; CHECK-NEXT: store i8 3, ptr addrspace(3) [[TMP9]], align 4
; CHECK-NEXT: store i8 3, ptr addrspace(3) [[TMP15]], align 8
; CHECK-NEXT: ret void
@@ -71,9 +71,9 @@ define amdgpu_kernel void @k0() {
; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ]
; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
; CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, align 4
-; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, i32 [[TMP22]]
+; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, i32 [[TMP22]]
; CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 1, i32 0), align 4
-; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, i32 [[TMP24]]
+; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i8, ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, i32 [[TMP24]]
; CHECK-NEXT: call void @use_variables()
; CHECK-NEXT: store i8 7, ptr addrspace(3) [[TMP23]], align 1
; CHECK-NEXT: store i32 8, ptr addrspace(3) [[TMP25]], align 2
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-dynamic-lds-test.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-dynamic-lds-test.ll
index f6f3fed2f6f5fe..f80a2dc835ad0b 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-dynamic-lds-test.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-dynamic-lds-test.ll
@@ -39,9 +39,9 @@ define amdgpu_kernel void @k0() {
; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ]
; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, align 4
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, i32 [[TMP8]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, i32 [[TMP8]]
; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 1, i32 0), align 4
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, i32 [[TMP10]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, i32 [[TMP10]]
; CHECK-NEXT: store i8 7, ptr addrspace(3) [[TMP9]], align 4
; CHECK-NEXT: store i8 8, ptr addrspace(3) [[TMP11]], align 8
; CHECK-NEXT: br label [[CONDFREE:%.*]]
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-multi-static-dynamic-indirect-access.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-multi-static-dynamic-indirect-access.ll
index fd45f3d8c346eb..eb03fb7005dc70 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-multi-static-dynamic-indirect-access.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-multi-static-dynamic-indirect-access.ll
@@ -17,13 +17,13 @@ define void @use_variables_1() {
; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(4) [[TMP5]], align 4
; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i32 [[TMP6]] to ptr addrspace(3)
; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(3) [[TMP7]], align 4
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) [[TMP4]], i32 [[TMP8]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP4]], i32 [[TMP8]]
; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i32 [[TMP3]] to ptr addrspace(3)
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x [4 x i32]], ptr addrspace(4) @llvm.amdgcn.sw.lds.offset.table, i32 0, i32 [[TMP1]], i32 3
; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr addrspace(4) [[TMP11]], align 4
; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i32 [[TMP12]] to ptr addrspace(3)
; CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr addrspace(3) [[TMP13]], align 4
-; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) [[TMP10]], i32 [[TMP14]]
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP10]], i32 [[TMP14]]
; CHECK-NEXT: store i8 3, ptr addrspace(3) [[TMP9]], align 4
; CHECK-NEXT: store i8 3, ptr addrspace(3) [[TMP15]], align 8
; CHECK-NEXT: ret void
@@ -43,13 +43,13 @@ define void @use_variables_2() {
; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(4) [[TMP5]], align 4
; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i32 [[TMP6]] to ptr addrspace(3)
; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(3) [[TMP7]], align 4
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) [[TMP4]], i32 [[TMP8]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP4]], i32 [[TMP8]]
; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i32 [[TMP3]] to ptr addrspace(3)
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x [4 x i32]], ptr addrspace(4) @llvm.amdgcn.sw.lds.offset.table, i32 0, i32 [[TMP1]], i32 1
; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr addrspace(4) [[TMP11]], align 4
; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i32 [[TMP12]] to ptr addrspace(3)
; CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr addrspace(3) [[TMP13]], align 4
-; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) [[TMP10]], i32 [[TMP14]]
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP10]], i32 [[TMP14]]
; CHECK-NEXT: store i8 7, ptr addrspace(3) [[TMP9]], align 1
; CHECK-NEXT: store i32 8, ptr addrspace(3) [[TMP15]], align 2
; CHECK-NEXT: ret void
@@ -97,7 +97,7 @@ define amdgpu_kernel void @k0() {
; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ]
; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
; CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, align 4
-; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, i32 [[TMP22]]
+; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, i32 [[TMP22]]
; CHECK-NEXT: call void @use_variables_1()
; CHECK-NEXT: store i8 7, ptr addrspace(3) [[TMP23]], align 1
; CHECK-NEXT: br label [[CONDFREE:%.*]]
@@ -161,9 +161,9 @@ define amdgpu_kernel void @k1() {
; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ]
; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
; CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K1_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k1.md, i32 0, i32 3, i32 0), align 4
-; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) @llvm.amdgcn.sw.lds.k1, i32 [[TMP15]]
+; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr addrspace(3) @llvm.amdgcn.sw.lds.k1, i32 [[TMP15]]
; CHECK-NEXT: [[TMP31:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K1_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k1.md, i32 0, i32 3, i32 0), align 4
-; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) @llvm.amdgcn.sw.lds.k1, i32 [[TMP31]]
+; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr addrspace(3) @llvm.amdgcn.sw.lds.k1, i32 [[TMP31]]
; CHECK-NEXT: call void @use_variables_1()
; CHECK-NEXT: call void @use_variables_2()
; CHECK-NEXT: store i8 3, ptr addrspace(3) [[TMP16]], align 4
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-multiple-blocks-return.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-multiple-blocks-return.ll
index 101c3af4ff9676..c1cee3f8d7ddf8 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-multiple-blocks-return.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-multiple-blocks-return.ll
@@ -28,9 +28,9 @@ define amdgpu_kernel void @test_kernel() {
; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ]
; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(1) @llvm.amdgcn.sw.lds.test_kernel.md, align 4
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) @llvm.amdgcn.sw.lds.test_kernel, i32 [[TMP8]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(3) @llvm.amdgcn.sw.lds.test_kernel, i32 [[TMP8]]
; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_TEST_KERNEL_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.test_kernel.md, i32 0, i32 1, i32 0), align 4
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) @llvm.amdgcn.sw.lds.test_kernel, i32 [[TMP10]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(3) @llvm.amdgcn.sw.lds.test_kernel, i32 [[TMP10]]
; CHECK-NEXT: [[TMP12:%.*]] = addrspacecast ptr addrspace(3) [[TMP9]] to ptr addrspace(1)
; CHECK-NEXT: [[VAL1:%.*]] = load i32, ptr addrspace(1) [[TMP12]], align 4
; CHECK-NEXT: [[TMP13:%.*]] = addrspacecast ptr addrspace(3) [[TMP11]] to ptr addrspace(1)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-dynamic-indirect-access.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-dynamic-indirect-access.ll
index a17fa9df89b323..4c699200415721 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-dynamic-indirect-access.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-dynamic-indirect-access.ll
@@ -18,13 +18,13 @@ define void @use_variables() {
; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(4) [[TMP5]], align 4
; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i32 [[TMP6]] to ptr addrspace(3)
; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(3) [[TMP7]], align 4
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) [[TMP4]], i32 [[TMP8]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP4]], i32 [[TMP8]]
; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i32 [[TMP3]] to ptr addrspace(3)
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1 x [2 x i32]], ptr addrspace(4) @llvm.amdgcn.sw.lds.offset.table, i32 0, i32 [[TMP1]], i32 1
; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr addrspace(4) [[TMP11]], align 4
; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i32 [[TMP12]] to ptr addrspace(3)
; CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr addrspace(3) [[TMP13]], align 4
-; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) [[TMP10]], i32 [[TMP14]]
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP10]], i32 [[TMP14]]
; CHECK-NEXT: store i8 3, ptr addrspace(3) [[TMP9]], align 4
; CHECK-NEXT: store i8 3, ptr addrspace(3) [[TMP15]], align 8
; CHECK-NEXT: ret void
@@ -72,9 +72,9 @@ define amdgpu_kernel void @k0() {
; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ]
; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
; CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, align 4
-; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, i32 [[TMP22]]
+; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, i32 [[TMP22]]
; CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 1, i32 0), align 4
-; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, i32 [[TMP24]]
+; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i8, ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, i32 [[TMP24]]
; CHECK-NEXT: call void @use_variables()
; CHECK-NEXT: store i8 7, ptr addrspace(3) [[TMP23]], align 1
; CHECK-NEXT: store i32 8, ptr addrspace(3) [[TMP25]], align 2
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-dynamic-lds-test.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-dynamic-lds-test.ll
index 90100b57a0c291..0f6ed65d53d9e6 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-dynamic-lds-test.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-dynamic-lds-test.ll
@@ -52,13 +52,13 @@ define amdgpu_kernel void @k0() {
; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ]
; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
; CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, align 4
-; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, i32 [[TMP22]]
+; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, i32 [[TMP22]]
; CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 1, i32 0), align 4
-; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, i32 [[TMP24]]
+; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i8, ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, i32 [[TMP24]]
; CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 2, i32 0), align 4
-; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, i32 [[TMP26]]
+; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, i32 [[TMP26]]
; CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 3, i32 0), align 4
-; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, i32 [[TMP28]]
+; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i8, ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, i32 [[TMP28]]
; CHECK-NEXT: store i8 7, ptr addrspace(3) [[TMP23]], align 4
; CHECK-NEXT: store i32 8, ptr addrspace(3) [[TMP25]], align 8
; CHECK-NEXT: store i8 7, ptr addrspace(3) [[TMP27]], align 4
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-function-param.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-function-param.ll
index 214fa7f1a9cb00..ee68cb6a5be2dd 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-function-param.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-function-param.ll
@@ -41,7 +41,7 @@ define amdgpu_kernel void @my_kernel() {
; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ]
; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(1) @llvm.amdgcn.sw.lds.my_kernel.md, align 4
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) @llvm.amdgcn.sw.lds.my_kernel, i32 [[TMP8]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(3) @llvm.amdgcn.sw.lds.my_kernel, i32 [[TMP8]]
; CHECK-NEXT: [[LDS_PTR:%.*]] = getelementptr [1024 x i32], ptr addrspace(3) [[TMP9]], i32 0, i32 0
; CHECK-NEXT: call void @my_function(ptr addrspace(3) [[LDS_PTR]])
; CHECK-NEXT: br label [[CONDFREE:%.*]]
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-nested.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-nested.ll
index 47188f252eee11..cb7d619299119d 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-nested.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-nested.ll
@@ -185,7 +185,7 @@ define private void @store_A() {
; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(4) [[TMP5]], align 4
; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i32 [[TMP6]] to ptr addrspace(3)
; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(3) [[TMP7]], align 4
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) [[TMP4]], i32 [[TMP8]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP4]], i32 [[TMP8]]
; CHECK-NEXT: [[TMP10:%.*]] = addrspacecast ptr addrspace(3) [[TMP9]] to ptr
; CHECK-NEXT: store ptr [[TMP10]], ptr null, align 8
; CHECK-NEXT: ret void
@@ -204,7 +204,7 @@ define private ptr @get_B_ptr() {
; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(4) [[TMP5]], align 4
; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i32 [[TMP6]] to ptr addrspace(3)
; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(3) [[TMP7]], align 4
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) [[TMP4]], i32 [[TMP8]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP4]], i32 [[TMP8]]
; CHECK-NEXT: [[TMP10:%.*]] = addrspacecast ptr addrspace(3) [[TMP9]] to ptr
; CHECK-NEXT: ret ptr [[TMP10]]
;
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access.ll
index aa93f76e4b5463..b08b17b90e1fc5 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access.ll
@@ -17,13 +17,13 @@ define void @use_variables() {
; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(4) [[TMP5]], align 4
; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i32 [[TMP6]] to ptr addrspace(3)
; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(3) [[TMP7]], align 4
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) [[TMP4]], i32 [[TMP8]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP4]], i32 [[TMP8]]
; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i32 [[TMP3]] to ptr addrspace(3)
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1 x [2 x i32]], ptr addrspace(4) @llvm.amdgcn.sw.lds.offset.table, i32 0, i32 [[TMP1]], i32 1
; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr addrspace(4) [[TMP11]], align 4
; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i32 [[TMP12]] to ptr addrspace(3)
; CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr addrspace(3) [[TMP13]], align 4
-; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) [[TMP10]], i32 [[TMP14]]
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP10]], i32 [[TMP14]]
; CHECK-NEXT: [[X:%.*]] = addrspacecast ptr addrspace(3) [[TMP9]] to ptr
; CHECK-NEXT: [[TMP16:%.*]] = addrspacecast ptr addrspace(3) [[TMP9]] to ptr
; CHECK-NEXT: store i8 3, ptr [[TMP16]], align 4
@@ -58,9 +58,9 @@ define amdgpu_kernel void @k0() {
; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ]
; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, align 4
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, i32 [[TMP8]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, i32 [[TMP8]]
; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 1, i32 0), align 4
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, i32 [[TMP10]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, i32 [[TMP10]]
; CHECK-NEXT: call void @use_variables()
; CHECK-NEXT: store i8 7, ptr addrspace(3) [[TMP9]], align 1
; CHECK-NEXT: store i32 8, ptr addrspace(3) [[TMP11]], align 2
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-lds-test.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-lds-test.ll
index 733f9cfab1e820..15f33d6c6005df 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-lds-test.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-lds-test.ll
@@ -32,9 +32,9 @@ define amdgpu_kernel void @k0() {
; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ]
; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, align 4
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, i32 [[TMP8]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, i32 [[TMP8]]
; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 1, i32 0), align 4
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, i32 [[TMP10]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, i32 [[TMP10]]
; CHECK-NEXT: store i8 7, ptr addrspace(3) [[TMP9]], align 4
; CHECK-NEXT: store i32 8, ptr addrspace(3) [[TMP11]], align 2
; CHECK-NEXT: br label [[CONDFREE:%.*]]
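This mechanical-looking change has one semantic point worth spelling out: with ptr addrspace(3) as the GEP source element type, the loaded i32 index was scaled by the allocation size of that pointer type, while the metadata stores plain byte offsets. Using i8 as the source element type makes the index an unscaled byte count. A small self-contained sketch of the builder call the pass switches to (the wrapper name is illustrative):

#include "llvm/IR/IRBuilder.h"
using namespace llvm;

// Emit "getelementptr inbounds i8, ptr addrspace(3) %Base, i32 %Offset".
// With i8 as the source element type the index is not scaled, so the
// i32 byte offset loaded from the metadata global is applied directly.
static Value *emitByteOffsetGEP(IRBuilder<> &IRB, Value *Base,
                                Value *ByteOffset) {
  return IRB.CreateInBoundsGEP(IRB.getInt8Ty(), Base, {ByteOffset});
}
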
>From 4534737192e970d258235cb37834b09c122622ad Mon Sep 17 00:00:00 2001
From: skc7 <Krishna.Sankisa at amd.com>
Date: Thu, 18 Apr 2024 11:14:40 +0530
Subject: [PATCH 07/11] [AMDGPU] Remove adding amdgpu-sw-lower-lds to pipeline.
---
llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 5 -----
1 file changed, 5 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index ee7f4f8e3a6fcc..6d75a634c82f0f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -677,11 +677,6 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(
if (EarlyInlineAll && !EnableFunctionCalls)
PM.addPass(AMDGPUAlwaysInlinePass());
-
-#if __has_feature(address_sanitizer)
- EnableLowerModuleLDS = false;
- PM.addPass(AMDGPUSwLowerLDSPass());
-#endif
});
PB.registerPeepholeEPCallback(
>From e60eb97856788ba52850d996ca4ddf30c2662bd7 Mon Sep 17 00:00:00 2001
From: skc7 <Krishna.Sankisa at amd.com>
Date: Thu, 18 Apr 2024 12:12:43 +0530
Subject: [PATCH 08/11] [AMDGPU] Update removeFnAttrFromReachable.
---
.../Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp | 43 -------------------
1 file changed, 43 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp
index 1fbf8e3fb7d061..dc68a49b99204d 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp
@@ -241,49 +241,6 @@ LDSUsesInfoTy getTransitiveUsesOfLDS(CallGraph const &CG, Module &M) {
return {std::move(direct_map_kernel), std::move(indirect_map_kernel)};
}
-void removeFnAttrFromReachable(CallGraph &CG, Function *KernelRoot,
- StringRef FnAttr) {
- KernelRoot->removeFnAttr(FnAttr);
-
- SmallVector<Function *> Tmp({CG[KernelRoot]->getFunction()});
- if (!Tmp.back())
- return;
-
- SmallPtrSet<Function *, 8> Visited;
- bool SeenUnknownCall = false;
-
- do {
- Function *F = Tmp.pop_back_val();
-
- for (auto &N : *CG[F]) {
- if (!N.second)
- continue;
-
- Function *Callee = N.second->getFunction();
- if (!Callee) {
- if (!SeenUnknownCall) {
- SeenUnknownCall = true;
-
- // If we see any indirect calls, assume nothing about potential
- // targets.
- // TODO: This could be refined to possible LDS global users.
- for (auto &N : *CG.getExternalCallingNode()) {
- Function *PotentialCallee = N.second->getFunction();
- if (!isKernelLDS(PotentialCallee))
- PotentialCallee->removeFnAttr(FnAttr);
- }
-
- continue;
- }
- }
-
- Callee->removeFnAttr(FnAttr);
- if (Visited.insert(Callee).second)
- Tmp.push_back(Callee);
- }
- } while (!Tmp.empty());
-}
-
void removeFnAttrFromReachable(CallGraph &CG, Function *KernelRoot,
StringRef FnAttr) {
KernelRoot->removeFnAttr(FnAttr);
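The surviving definition keeps the same contract as the duplicate removed above: strip FnAttr from the kernel root and from every function reachable from it through the call graph, conservatively falling back to all externally-callable non-kernels when an indirect call is seen. The pass uses it for the workitem-id attributes; a hedged usage sketch, with CG and Kernel assumed to be in scope:

// The lowered prologue reads all three workitem ids, so the
// "no workitem id" hints must be cleared along every call path.
removeFnAttrFromReachable(CG, Kernel, "amdgpu-no-workitem-id-x");
removeFnAttrFromReachable(CG, Kernel, "amdgpu-no-workitem-id-y");
removeFnAttrFromReachable(CG, Kernel, "amdgpu-no-workitem-id-z");
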
>From 3516f6970670d9907b96cabb94a35b54dbc4b057 Mon Sep 17 00:00:00 2001
From: skc7 <Krishna.Sankisa at amd.com>
Date: Thu, 18 Apr 2024 16:07:56 +0530
Subject: [PATCH 09/11] [AMDGPU] code-format AMDGPUSwLowerLDS.cpp
---
llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp | 34 ++++++++++-----------
1 file changed, 17 insertions(+), 17 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp
index dd97fad206cd14..3b770789e9d729 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp
@@ -276,23 +276,23 @@ void AMDGPUSwLowerLDS::populateSwMetadataGlobal(Function *Func) {
StructType *LDSItemTy =
StructType::create(Ctx, {Int32Ty, Int32Ty}, MDItemOS.str());
- auto buildInitializerForSwLDSMD = [&](SetVector<GlobalVariable *>
- &LDSGlobals) {
- for (auto &GV : LDSGlobals) {
- Type *Ty = GV->getValueType();
- const uint64_t SizeInBytes = DL.getTypeAllocSize(Ty);
- Items.push_back(LDSItemTy);
- Constant *ItemStartOffset =
- ConstantInt::get(Int32Ty, LDSParams.MallocSize);
- uint64_t AlignedSize = alignTo(SizeInBytes, MaxAlignment);
- Constant *AlignedSizeInBytesConst =
- ConstantInt::get(Int32Ty, AlignedSize);
- LDSParams.MallocSize += AlignedSize;
- Constant *InitItem = ConstantStruct::get(
- LDSItemTy, {ItemStartOffset, AlignedSizeInBytesConst});
- Initializers.push_back(InitItem);
- }
- };
+ auto buildInitializerForSwLDSMD =
+ [&](SetVector<GlobalVariable *> &LDSGlobals) {
+ for (auto &GV : LDSGlobals) {
+ Type *Ty = GV->getValueType();
+ const uint64_t SizeInBytes = DL.getTypeAllocSize(Ty);
+ Items.push_back(LDSItemTy);
+ Constant *ItemStartOffset =
+ ConstantInt::get(Int32Ty, LDSParams.MallocSize);
+ uint64_t AlignedSize = alignTo(SizeInBytes, MaxAlignment);
+ Constant *AlignedSizeInBytesConst =
+ ConstantInt::get(Int32Ty, AlignedSize);
+ LDSParams.MallocSize += AlignedSize;
+ Constant *InitItem = ConstantStruct::get(
+ LDSItemTy, {ItemStartOffset, AlignedSizeInBytesConst});
+ Initializers.push_back(InitItem);
+ }
+ };
buildInitializerForSwLDSMD(LDSParams.DirectAccess.StaticLDSGlobals);
buildInitializerForSwLDSMD(LDSParams.IndirectAccess.StaticLDSGlobals);
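Beyond the re-indentation, the lambda above is the piece that fixes each static LDS global's {offset, aligned size} pair at module time: every entry records the running malloc size as its offset, then bumps that running size by the type size rounded up to the kernel's maximum LDS alignment. The same bookkeeping over plain integers, as a sketch only; the pass stores these values as ConstantInt fields of the metadata struct initializer:

#include <cstdint>
#include <vector>

struct MDItem {
  uint32_t Offset;      // byte offset into the future malloc'ed block
  uint32_t AlignedSize; // alloc size rounded up to the max LDS alignment
};

// Mirrors buildInitializerForSwLDSMD: offsets are assigned in order and
// MallocSize accumulates the aligned sizes.
static std::vector<MDItem> buildItems(const std::vector<uint64_t> &Sizes,
                                      uint64_t MaxAlign,
                                      uint32_t &MallocSize) {
  std::vector<MDItem> Items;
  for (uint64_t S : Sizes) {
    uint64_t Aligned = (S + MaxAlign - 1) / MaxAlign * MaxAlign;
    Items.push_back({MallocSize, static_cast<uint32_t>(Aligned)});
    MallocSize += static_cast<uint32_t>(Aligned);
  }
  return Items;
}
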
>From f8622d26597421bd1ee47dd2af6da1f46f93271e Mon Sep 17 00:00:00 2001
From: skc7 <Krishna.Sankisa at amd.com>
Date: Fri, 19 Apr 2024 16:15:54 +0530
Subject: [PATCH 10/11] [AMDGPU] Update patch based on review comments:3
---
llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp | 134 ++++++++++----------
1 file changed, 67 insertions(+), 67 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp
index 3b770789e9d729..bf6288a91bd1c2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp
@@ -112,14 +112,14 @@ struct LDSAccessTypeInfo {
// to replace LDS global uses with the corresponding offset
// into device global memory.
struct KernelLDSParameters {
- GlobalVariable *SwLDS{nullptr};
- GlobalVariable *SwLDSMetadata{nullptr};
+ GlobalVariable *SwLDS = nullptr;
+ GlobalVariable *SwLDSMetadata = nullptr;
LDSAccessTypeInfo DirectAccess;
LDSAccessTypeInfo IndirectAccess;
DenseMap<GlobalVariable *, SmallVector<uint32_t, 3>>
LDSToReplacementIndicesMap;
- int32_t KernelId{-1};
- uint32_t MallocSize{0};
+ int32_t KernelId = -1;
+ uint32_t MallocSize = 0;
};
// Struct to store info for creation of the offset table
@@ -133,11 +133,11 @@ struct NonKernelLDSParameters {
class AMDGPUSwLowerLDS {
public:
- AMDGPUSwLowerLDS(Module &mod, DomTreeCallback Callback)
- : M(mod), IRB(M.getContext()), DTCallback(Callback) {}
+ AMDGPUSwLowerLDS(Module &Mod, DomTreeCallback Callback)
+ : M(Mod), IRB(M.getContext()), DTCallback(Callback) {}
bool run();
- void getUsesOfLDSByNonKernels(CallGraph const &CG,
- FunctionVariableMap &functions);
+ void getUsesOfLDSByNonKernels(const CallGraph &CG,
+ FunctionVariableMap &Functions);
SetVector<Function *>
getOrderedIndirectLDSAccessingKernels(SetVector<Function *> &&Kernels);
SetVector<GlobalVariable *>
@@ -166,23 +166,22 @@ class AMDGPUSwLowerLDS {
template <typename T> SetVector<T> sortByName(std::vector<T> &&V) {
// Sort the vector of globals or Functions based on their name.
// Returns a SetVector of globals/Functions.
- llvm::sort(V.begin(), V.end(), [](const auto *L, const auto *R) {
+ sort(V, [](const auto *L, const auto *R) {
return L->getName() < R->getName();
});
- return {std::move(SetVector<T>(V.begin(), V.end()))};
+ return {SetVector<T>(V.begin(), V.end())};
}
SetVector<GlobalVariable *> AMDGPUSwLowerLDS::getOrderedNonKernelAllLDSGlobals(
SetVector<GlobalVariable *> &&Variables) {
// Sort all the non-kernel LDS accesses based on their name.
- SetVector<GlobalVariable *> Ordered = sortByName(
+ return sortByName(
std::vector<GlobalVariable *>(Variables.begin(), Variables.end()));
- return std::move(Ordered);
}
SetVector<Function *> AMDGPUSwLowerLDS::getOrderedIndirectLDSAccessingKernels(
SetVector<Function *> &&Kernels) {
- // Sort the non-kernels accessing LDS based on theor name.
+ // Sort the kernels that indirectly access LDS based on their name.
// Also assign a kernel ID metadata based on the sorted order.
LLVMContext &Ctx = M.getContext();
if (Kernels.size() > UINT32_MAX) {
@@ -205,13 +204,12 @@ SetVector<Function *> AMDGPUSwLowerLDS::getOrderedIndirectLDSAccessingKernels(
}
void AMDGPUSwLowerLDS::getUsesOfLDSByNonKernels(
- CallGraph const &CG, FunctionVariableMap &functions) {
+ const CallGraph &CG, FunctionVariableMap &functions) {
// Get uses from the current function, excluding uses by called functions
// Two output variables to avoid walking the globals list twice
for (auto &GV : M.globals()) {
- if (!AMDGPU::isLDSVariableToLower(GV)) {
+ if (!AMDGPU::isLDSVariableToLower(GV))
continue;
- }
if (GV.isAbsoluteSymbolRef()) {
report_fatal_error(
@@ -221,9 +219,8 @@ void AMDGPUSwLowerLDS::getUsesOfLDSByNonKernels(
for (User *V : GV.users()) {
if (auto *I = dyn_cast<Instruction>(V)) {
Function *F = I->getFunction();
- if (!isKernelLDS(F)) {
+ if (!isKernelLDS(F))
functions[F].insert(&GV);
- }
}
}
}
@@ -355,7 +352,6 @@ static void replacesUsesOfGlobalInFunction(Function *Func, GlobalVariable *GV,
return false;
};
GV->replaceUsesWithIf(Replacement, ReplaceUsesLambda);
- return;
}
void AMDGPUSwLowerLDS::replaceKernelLDSAccesses(Function *Func) {
@@ -368,15 +364,17 @@ void AMDGPUSwLowerLDS::replaceKernelLDSAccesses(Function *Func) {
cast<StructType>(SwLDSMetadata->getValueType());
Type *Int32Ty = IRB.getInt32Ty();
+ auto &IndirectAccess = LDSParams.IndirectAccess;
+ auto &DirectAccess = LDSParams.DirectAccess;
// Replace all uses of LDS global in this Function with a Replacement.
auto ReplaceLDSGlobalUses = [&](SetVector<GlobalVariable *> &LDSGlobals) {
for (auto &GV : LDSGlobals) {
// Do not generate instructions if the LDS access is only in non-kernels,
// i.e. it is an indirect access.
- if ((LDSParams.IndirectAccess.StaticLDSGlobals.contains(GV) ||
- LDSParams.IndirectAccess.DynamicLDSGlobals.contains(GV)) &&
- (!LDSParams.DirectAccess.StaticLDSGlobals.contains(GV) &&
- !LDSParams.DirectAccess.DynamicLDSGlobals.contains(GV)))
+ if ((IndirectAccess.StaticLDSGlobals.contains(GV) ||
+ IndirectAccess.DynamicLDSGlobals.contains(GV)) &&
+ (!DirectAccess.StaticLDSGlobals.contains(GV) &&
+ !DirectAccess.DynamicLDSGlobals.contains(GV)))
continue;
auto &Indices = LDSParams.LDSToReplacementIndicesMap[GV];
assert(Indices.size() == 3);
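Per surviving LDS global, the hunk below builds roughly this replacement sequence, reusing the illustrative globals sketched earlier:
  %gep = getelementptr inbounds %md.ty, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 0, i32 0
  %off = load i32, ptr addrspace(1) %gep
  %rep = getelementptr inbounds i8, ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, i32 %off
  ; every use of the original LDS global in this kernel is rewritten to %rep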
@@ -391,18 +389,21 @@ void AMDGPUSwLowerLDS::replaceKernelLDSAccesses(Function *Func) {
Value *Load = IRB.CreateLoad(Int32Ty, GEP);
Value *BasePlusOffset =
IRB.CreateInBoundsGEP(IRB.getInt8Ty(), SwLDS, {Load});
+ LLVM_DEBUG(dbgs() << "Sw LDS Lowering, Replacing LDS "
+ << GV->getName() << "\n");
replacesUsesOfGlobalInFunction(Func, GV, BasePlusOffset);
}
};
- ReplaceLDSGlobalUses(LDSParams.DirectAccess.StaticLDSGlobals);
- ReplaceLDSGlobalUses(LDSParams.IndirectAccess.StaticLDSGlobals);
- ReplaceLDSGlobalUses(LDSParams.DirectAccess.DynamicLDSGlobals);
- ReplaceLDSGlobalUses(LDSParams.IndirectAccess.DynamicLDSGlobals);
- return;
+ ReplaceLDSGlobalUses(DirectAccess.StaticLDSGlobals);
+ ReplaceLDSGlobalUses(IndirectAccess.StaticLDSGlobals);
+ ReplaceLDSGlobalUses(DirectAccess.DynamicLDSGlobals);
+ ReplaceLDSGlobalUses(IndirectAccess.DynamicLDSGlobals);
}
void AMDGPUSwLowerLDS::lowerKernelLDSAccesses(Function *Func,
DomTreeUpdater &DTU) {
+ LLVM_DEBUG(dbgs() << "Sw Lowering Kernel LDS for: "
+ << Func->getName() << "\n");
auto &LDSParams = KernelToLDSParametersMap[Func];
auto &Ctx = M.getContext();
auto *PrevEntryBlock = &Func->getEntryBlock();
@@ -570,7 +571,6 @@ void AMDGPUSwLowerLDS::lowerKernelLDSAccesses(Function *Func,
{DominatorTree::Insert, MallocBlock, PrevEntryBlock},
{DominatorTree::Insert, CondFreeBlock, FreeBlock},
{DominatorTree::Insert, FreeBlock, EndBlock}});
- return;
}
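The dominator-tree updates above describe a control flow shaped roughly like this skeleton (a sketch; block names and intrinsic usage illustrate the structure, not the exact emitted code):
  define amdgpu_kernel void @k0() {
  entry:
    %tid = call i32 @llvm.amdgcn.workitem.id.x()
    %cond = icmp eq i32 %tid, 0
    br i1 %cond, label %malloc, label %body
  malloc:      ; workitem 0 allocates and publishes the base into SwLDS
    br label %body
  body:        ; original kernel entry block (PrevEntryBlock) runs here
    br label %cond.free
  cond.free:   ; workitem 0 frees after the kernel body
    br i1 %cond, label %free, label %end
  free:
    br label %end
  end:
    ret void
  }
  declare i32 @llvm.amdgcn.workitem.id.x()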
Constant *AMDGPUSwLowerLDS::getAddressesOfVariablesInKernel(
@@ -581,27 +581,28 @@ Constant *AMDGPUSwLowerLDS::getAddressesOfVariablesInKernel(
GlobalVariable *SwLDSMetadata = LDSParams.SwLDSMetadata;
assert(SwLDSMetadata);
- StructType *SwLDSMetadataStructType =
+ auto *SwLDSMetadataStructType =
cast<StructType>(SwLDSMetadata->getValueType());
ArrayType *KernelOffsetsType = ArrayType::get(Int32Ty, Variables.size());
SmallVector<Constant *> Elements;
for (size_t i = 0; i < Variables.size(); i++) {
GlobalVariable *GV = Variables[i];
- if (LDSParams.LDSToReplacementIndicesMap.contains(GV)) {
- auto &Indices = LDSParams.LDSToReplacementIndicesMap[GV];
- uint32_t Idx0 = Indices[0];
- uint32_t Idx1 = Indices[1];
- uint32_t Idx2 = Indices[2];
- Constant *GEPIdx[] = {ConstantInt::get(Int32Ty, Idx0),
- ConstantInt::get(Int32Ty, Idx1),
- ConstantInt::get(Int32Ty, Idx2)};
- Constant *GEP = ConstantExpr::getGetElementPtr(
- SwLDSMetadataStructType, SwLDSMetadata, GEPIdx, true);
- auto elt = ConstantExpr::getPtrToInt(GEP, Int32Ty);
- Elements.push_back(elt);
- } else
+ if (!LDSParams.LDSToReplacementIndicesMap.contains(GV)) {
Elements.push_back(PoisonValue::get(Int32Ty));
+ continue;
+ }
+ auto &Indices = LDSParams.LDSToReplacementIndicesMap[GV];
+ uint32_t Idx0 = Indices[0];
+ uint32_t Idx1 = Indices[1];
+ uint32_t Idx2 = Indices[2];
+ Constant *GEPIdx[] = {ConstantInt::get(Int32Ty, Idx0),
+ ConstantInt::get(Int32Ty, Idx1),
+ ConstantInt::get(Int32Ty, Idx2)};
+ Constant *GEP = ConstantExpr::getGetElementPtr(SwLDSMetadataStructType,
+ SwLDSMetadata, GEPIdx, true);
+ auto *Elt = ConstantExpr::getPtrToInt(GEP, Int32Ty);
+ Elements.push_back(Elt);
}
return ConstantArray::get(KernelOffsetsType, Elements);
}
@@ -616,7 +617,7 @@ void AMDGPUSwLowerLDS::buildNonKernelLDSBaseTable(
Type *Int32Ty = Type::getInt32Ty(Ctx);
const size_t NumberKernels = Kernels.size();
ArrayType *AllKernelsOffsetsType = ArrayType::get(Int32Ty, NumberKernels);
- std::vector<Constant *> overallConstantExprElts(NumberKernels);
+ std::vector<Constant *> OverallConstantExprElts(NumberKernels);
for (size_t i = 0; i < NumberKernels; i++) {
Function *Func = Kernels[i];
auto &LDSParams = KernelToLDSParametersMap[Func];
@@ -626,10 +627,10 @@ void AMDGPUSwLowerLDS::buildNonKernelLDSBaseTable(
Constant *GEP =
ConstantExpr::getGetElementPtr(SwLDS->getType(), SwLDS, GEPIdx, true);
auto Elt = ConstantExpr::getPtrToInt(GEP, Int32Ty);
- overallConstantExprElts[i] = Elt;
+ OverallConstantExprElts[i] = Elt;
}
Constant *init =
- ConstantArray::get(AllKernelsOffsetsType, overallConstantExprElts);
+ ConstantArray::get(AllKernelsOffsetsType, OverallConstantExprElts);
NKLDSParams.LDSBaseTable = new GlobalVariable(
M, AllKernelsOffsetsType, true, GlobalValue::InternalLinkage, init,
"llvm.amdgcn.sw.lds.base.table", nullptr, GlobalValue::NotThreadLocal,
@@ -665,13 +666,12 @@ void AMDGPUSwLowerLDS::buildNonKernelLDSOffsetTable(
overallConstantExprElts[i] =
getAddressesOfVariablesInKernel(Func, Variables);
}
- Constant *init =
+ Constant *Init =
ConstantArray::get(AllKernelsOffsetsType, overallConstantExprElts);
NKLDSParams.LDSOffsetTable = new GlobalVariable(
- M, AllKernelsOffsetsType, true, GlobalValue::InternalLinkage, init,
+ M, AllKernelsOffsetsType, true, GlobalValue::InternalLinkage, Init,
"llvm.amdgcn.sw.lds.offset.table", nullptr, GlobalValue::NotThreadLocal,
AMDGPUAS::CONSTANT_ADDRESS);
- return;
}
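Concretely, for two kernels k0/k1 each allocating one of two non-kernel-accessed variables, the two tables are shaped roughly like this (a sketch under the illustrative naming used above; AMDGPUAS::CONSTANT_ADDRESS is addrspace(4)):
  @llvm.amdgcn.sw.lds.base.table = internal addrspace(4) constant [2 x i32]
    [i32 ptrtoint (ptr addrspace(3) @llvm.amdgcn.sw.lds.k0 to i32),
     i32 ptrtoint (ptr addrspace(3) @llvm.amdgcn.sw.lds.k1 to i32)]
  ; rows are indexed by kernel ID, columns by variable; poison marks a
  ; variable the kernel never allocates
  @llvm.amdgcn.sw.lds.offset.table = internal addrspace(4) constant [2 x [2 x i32]]
    [[2 x i32] [i32 ptrtoint (ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md to i32), i32 poison],
     [2 x i32] [i32 poison, i32 ptrtoint (ptr addrspace(1) @llvm.amdgcn.sw.lds.k1.md to i32)]]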
void AMDGPUSwLowerLDS::lowerNonKernelLDSAccesses(
@@ -679,6 +679,8 @@ void AMDGPUSwLowerLDS::lowerNonKernelLDSAccesses(
NonKernelLDSParameters &NKLDSParams) {
// Replace LDS accesses in a non-kernel with a replacement built from the
// base queried from the base table and the offset from the offset table.
+ LLVM_DEBUG(dbgs() << "Sw LDS lowering, lower non-kernel access for: "
+ << Func->getName() << "\n");
auto *EntryBlock = &Func->getEntryBlock();
IRB.SetInsertPoint(EntryBlock, EntryBlock->begin());
Function *Decl =
@@ -705,31 +707,29 @@ void AMDGPUSwLowerLDS::lowerNonKernelLDSAccesses(
OffsetLoad = IRB.CreateLoad(IRB.getInt32Ty(), OffsetLoad);
Value *BasePlusOffset =
IRB.CreateInBoundsGEP(IRB.getInt8Ty(), BasePtr, {OffsetLoad});
+ LLVM_DEBUG(dbgs() << "Sw LDS Lowering, Replace non-kernel LDS for "
+ << GV->getName() << "\n");
replacesUsesOfGlobalInFunction(Func, GV, BasePlusOffset);
}
- return;
}
static void reorderStaticDynamicIndirectLDSSet(KernelLDSParameters &LDSParams) {
// Sort static and dynamic LDS globals, whether directly or
// indirectly accessed, on the basis of name.
- LDSParams.DirectAccess.StaticLDSGlobals =
- sortByName(std::vector<GlobalVariable *>(
- LDSParams.DirectAccess.StaticLDSGlobals.begin(),
- LDSParams.DirectAccess.StaticLDSGlobals.end()));
- LDSParams.DirectAccess.DynamicLDSGlobals =
- sortByName(std::vector<GlobalVariable *>(
- LDSParams.DirectAccess.DynamicLDSGlobals.begin(),
- LDSParams.DirectAccess.DynamicLDSGlobals.end()));
- LDSParams.IndirectAccess.StaticLDSGlobals =
- sortByName(std::vector<GlobalVariable *>(
- LDSParams.IndirectAccess.StaticLDSGlobals.begin(),
- LDSParams.IndirectAccess.StaticLDSGlobals.end()));
- LDSParams.IndirectAccess.DynamicLDSGlobals =
- sortByName(std::vector<GlobalVariable *>(
- LDSParams.IndirectAccess.DynamicLDSGlobals.begin(),
- LDSParams.IndirectAccess.DynamicLDSGlobals.end()));
- return;
+ auto &DirectAccess = LDSParams.DirectAccess;
+ auto &IndirectAccess = LDSParams.IndirectAccess;
+ LDSParams.DirectAccess.StaticLDSGlobals = sortByName(
+ std::vector<GlobalVariable *>(DirectAccess.StaticLDSGlobals.begin(),
+ DirectAccess.StaticLDSGlobals.end()));
+ LDSParams.DirectAccess.DynamicLDSGlobals = sortByName(
+ std::vector<GlobalVariable *>(DirectAccess.DynamicLDSGlobals.begin(),
+ DirectAccess.DynamicLDSGlobals.end()));
+ LDSParams.IndirectAccess.StaticLDSGlobals = sortByName(
+ std::vector<GlobalVariable *>(IndirectAccess.StaticLDSGlobals.begin(),
+ IndirectAccess.StaticLDSGlobals.end()));
+ LDSParams.IndirectAccess.DynamicLDSGlobals = sortByName(
+ std::vector<GlobalVariable *>(IndirectAccess.DynamicLDSGlobals.begin(),
+ IndirectAccess.DynamicLDSGlobals.end()));
}
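Putting the tables to use, the non-kernel rewrite sketched earlier resolves its replacement pointer at run time roughly as follows (illustrative; the kernel ID query relies on the llvm.amdgcn.lds.kernel.id intrinsic):
  %id = call i32 @llvm.amdgcn.lds.kernel.id()
  %base.slot = getelementptr inbounds [2 x i32], ptr addrspace(4) @llvm.amdgcn.sw.lds.base.table, i32 0, i32 %id
  %base.i32 = load i32, ptr addrspace(4) %base.slot
  %base = inttoptr i32 %base.i32 to ptr addrspace(3)
  %off.slot = getelementptr inbounds [2 x [2 x i32]], ptr addrspace(4) @llvm.amdgcn.sw.lds.offset.table, i32 0, i32 %id, i32 0
  %md.i32 = load i32, ptr addrspace(4) %off.slot
  %md.ptr = inttoptr i32 %md.i32 to ptr addrspace(1)
  %off = load i32, ptr addrspace(1) %md.ptr   ; the two loads mirror the code above
  %rep = getelementptr inbounds i8, ptr addrspace(3) %base, i32 %off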
bool AMDGPUSwLowerLDS::run() {
>From 34ed626fd0b95804c967f368e6018d869db78551 Mon Sep 17 00:00:00 2001
From: skc7 <Krishna.Sankisa at amd.com>
Date: Fri, 19 Apr 2024 17:26:39 +0530
Subject: [PATCH 11/11] [AMDGPU] Update patch based on review comments:4
---
llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp | 9 +++------
1 file changed, 3 insertions(+), 6 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp
index bf6288a91bd1c2..555e3a43e7fd70 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp
@@ -575,8 +575,7 @@ void AMDGPUSwLowerLDS::lowerKernelLDSAccesses(Function *Func,
Constant *AMDGPUSwLowerLDS::getAddressesOfVariablesInKernel(
Function *Func, SetVector<GlobalVariable *> &Variables) {
- LLVMContext &Ctx = M.getContext();
- Type *Int32Ty = Type::getInt32Ty(Ctx);
+ Type *Int32Ty = IRB.getInt32Ty();
auto &LDSParams = KernelToLDSParametersMap[Func];
GlobalVariable *SwLDSMetadata = LDSParams.SwLDSMetadata;
@@ -613,8 +612,7 @@ void AMDGPUSwLowerLDS::buildNonKernelLDSBaseTable(
// placed as per kernel ID. Each element in the row corresponds
// to the address of the "SW LDS" global of the kernel.
auto &Kernels = NKLDSParams.OrderedKernels;
- LLVMContext &Ctx = M.getContext();
- Type *Int32Ty = Type::getInt32Ty(Ctx);
+ Type *Int32Ty = IRB.getInt32Ty();
const size_t NumberKernels = Kernels.size();
ArrayType *AllKernelsOffsetsType = ArrayType::get(Int32Ty, NumberKernels);
std::vector<Constant *> OverallConstantExprElts(NumberKernels);
@@ -650,12 +648,11 @@ void AMDGPUSwLowerLDS::buildNonKernelLDSOffsetTable(
auto &Kernels = NKLDSParams.OrderedKernels;
assert(!Variables.empty());
assert(!Kernels.empty());
- LLVMContext &Ctx = M.getContext();
const size_t NumberVariables = Variables.size();
const size_t NumberKernels = Kernels.size();
ArrayType *KernelOffsetsType =
- ArrayType::get(Type::getInt32Ty(Ctx), NumberVariables);
+ ArrayType::get(IRB.getInt32Ty(), NumberVariables);
ArrayType *AllKernelsOffsetsType =
ArrayType::get(KernelOffsetsType, NumberKernels);