[llvm] [AMDGPU] Introduce "amdgpu-sw-lower-lds" pass to lower LDS accesses. (PR #87265)
via llvm-commits
llvm-commits at lists.llvm.org
Sat Aug 24 22:06:55 PDT 2024
https://github.com/skc7 updated https://github.com/llvm/llvm-project/pull/87265
>From abfcc87a4f47c09e4fd2fe7f68f1d5ce26dce678 Mon Sep 17 00:00:00 2001
From: skc7 <Krishna.Sankisa at amd.com>
Date: Thu, 7 Mar 2024 12:40:41 +0530
Subject: [PATCH] [AMDGPU] Introduce amdgpu-sw-lower-lds pass to lower LDS
accesses. (#87265)
---
llvm/lib/Target/AMDGPU/AMDGPU.h | 11 +
llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def | 1 +
llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp | 1335 +++++++++++++++++
.../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 1 +
llvm/lib/Target/AMDGPU/CMakeLists.txt | 1 +
...-lower-lds-dynamic-indirect-access-asan.ll | 260 ++++
...pu-sw-lower-lds-dynamic-indirect-access.ll | 142 ++
...dgpu-sw-lower-lds-dynamic-lds-test-asan.ll | 113 ++
.../amdgpu-sw-lower-lds-dynamic-lds-test.ll | 87 ++
...lti-static-dynamic-indirect-access-asan.ll | 385 +++++
...ds-multi-static-dynamic-indirect-access.ll | 227 +++
...w-lower-lds-multiple-blocks-return-asan.ll | 112 ++
...gpu-sw-lower-lds-multiple-blocks-return.ll | 112 ++
...lds-static-dynamic-indirect-access-asan.ll | 261 ++++
...ower-lds-static-dynamic-indirect-access.ll | 143 ++
...-lower-lds-static-dynamic-lds-test-asan.ll | 213 +++
...pu-sw-lower-lds-static-dynamic-lds-test.ll | 120 ++
...w-lower-lds-static-indirect-access-asan.ll | 224 +++
...tic-indirect-access-function-param-asan.ll | 152 ++
...s-static-indirect-access-function-param.ll | 102 ++
...-lds-static-indirect-access-nested-asan.ll | 279 ++++
...lower-lds-static-indirect-access-nested.ll | 279 ++++
...gpu-sw-lower-lds-static-indirect-access.ll | 128 ++
...mdgpu-sw-lower-lds-static-lds-test-asan.ll | 159 ++
...lds-static-lds-test-atomic-cmpxchg-asan.ll | 132 ++
...ower-lds-static-lds-test-atomicrmw-asan.ll | 214 +++
.../amdgpu-sw-lower-lds-static-lds-test.ll | 85 ++
27 files changed, 5278 insertions(+)
create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp
create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-dynamic-indirect-access-asan.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-dynamic-indirect-access.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-dynamic-lds-test-asan.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-dynamic-lds-test.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-multi-static-dynamic-indirect-access-asan.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-multi-static-dynamic-indirect-access.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-multiple-blocks-return-asan.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-multiple-blocks-return.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-dynamic-indirect-access-asan.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-dynamic-indirect-access.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-dynamic-lds-test-asan.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-dynamic-lds-test.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-asan.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-function-param-asan.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-function-param.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-nested-asan.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-nested.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-lds-test-asan.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-lds-test-atomic-cmpxchg-asan.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-lds-test-atomicrmw-asan.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-lds-test.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index afb8f2d93f0f15..c50474893eb7d5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -268,6 +268,17 @@ struct AMDGPUAlwaysInlinePass : PassInfoMixin<AMDGPUAlwaysInlinePass> {
bool GlobalOpt;
};
+void initializeAMDGPUSwLowerLDSLegacyPass(PassRegistry &);
+extern char &AMDGPUSwLowerLDSLegacyPassID;
+ModulePass *
+createAMDGPUSwLowerLDSLegacyPass(const AMDGPUTargetMachine *TM = nullptr);
+
+struct AMDGPUSwLowerLDSPass : PassInfoMixin<AMDGPUSwLowerLDSPass> {
+ const AMDGPUTargetMachine &TM;
+ AMDGPUSwLowerLDSPass(const AMDGPUTargetMachine &TM_) : TM(TM_) {}
+ PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+};
+
class AMDGPUCodeGenPreparePass
: public PassInfoMixin<AMDGPUCodeGenPreparePass> {
private:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index 5c068b5695c8d1..d8741b4b06a984 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -20,6 +20,7 @@ MODULE_PASS("amdgpu-always-inline", AMDGPUAlwaysInlinePass())
MODULE_PASS("amdgpu-lower-buffer-fat-pointers",
AMDGPULowerBufferFatPointersPass(*this))
MODULE_PASS("amdgpu-lower-ctor-dtor", AMDGPUCtorDtorLoweringPass())
+MODULE_PASS("amdgpu-sw-lower-lds", AMDGPUSwLowerLDSPass(*this))
MODULE_PASS("amdgpu-lower-module-lds", AMDGPULowerModuleLDSPass(*this))
MODULE_PASS("amdgpu-perf-hint",
AMDGPUPerfHintAnalysisPass(
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp
new file mode 100644
index 00000000000000..d2ff7a154d0df5
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp
@@ -0,0 +1,1335 @@
+//===-- AMDGPUSwLowerLDS.cpp -----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass lowers the local data store, LDS, uses in kernel and non-kernel
+// functions in module to use dynamically allocated global memory.
+// Packed LDS Layout is emulated in the global memory.
+// The lowered memory instructions from LDS to global memory are then
+// instrumented for address sanitizer, to catch addressing errors.
+// This pass only works when address sanitizer has been enabled and has
+// instrumented the IR. It identifies that the IR has been instrumented using
+// the "nosanitize_address" module flag.
+//
+// Replacement of Kernel LDS accesses:
+// For a kernel, LDS access can be static or dynamic which are direct
+// (accessed within kernel) and indirect (accessed through non-kernels).
+// All these LDS accesses corresponding to kernel will be packed together,
+// where all static LDS accesses will be allocated first and then dynamic
+// LDS follows. The total size with alignment is calculated. A new LDS global
+// will be created for the kernel called "SW LDS" and it will have the
+// attribute "amdgpu-lds-size" attached with value of the size calculated.
+// All the LDS accesses in the module will be replaced by GEP with offset
+// into the "SW LDS".
+// A new "llvm.amdgcn.<kernel>.dynlds" is created per kernel accessing
+// the dynamic LDS. This will be marked used by kernel and will have
+// MD_absolute_symbol metadata set to the total static LDS size, since dynamic
+// LDS allocation starts after all static LDS allocations.
+//
+// A device global memory equal to the total LDS size will be allocated.
+// At the prologue of the kernel, a single work-item from the
+// work-group, does a "malloc" and stores the pointer of the
+// allocation in "SW LDS".
+//
+// To store the offsets corresponding to all LDS accesses, another global
+// variable is created which will be called "SW LDS metadata" in this pass.
+// - SW LDS Global:
+// It is LDS global of ptr type with name
+// "llvm.amdgcn.sw.lds.<kernel-name>".
+// - Metadata Global:
+// It is of struct type, with n members. n equals the number of LDS
+// globals accessed by the kernel(direct and indirect). Each member of
+// struct is another struct of type {i32, i32, i32}. First member
+// corresponds to offset, second member corresponds to size of LDS global
+// being replaced and third represents the total aligned size. It will
+// have name "llvm.amdgcn.sw.lds.<kernel-name>.md". This global will have
+// an initializer with static LDS related offsets and sizes initialized.
+// But for dynamic LDS related entries, offsets will be initialized to the
+// previous static LDS allocation end offset. Sizes for them will be zero
+// initially. These dynamic LDS offset and size values will be updated
+// within the kernel, since kernel can read the dynamic LDS size
+// allocation done at runtime with query to "hidden_dynamic_lds_size"
+// hidden kernel argument.
+//
+// At the epilogue of kernel, allocated memory would be made free by the same
+// single work-item.
+//
+// Replacement of non-kernel LDS accesses:
+// Multiple kernels can access the same non-kernel function.
+// All the kernels accessing LDS through non-kernels are sorted and
+// assigned a kernel-id. All the LDS globals accessed by non-kernels
+// are sorted. This information is used to build two tables:
+// - Base table:
+// Base table will have single row, with elements of the row
+// placed as per kernel ID. Each element in the row corresponds
+// to ptr of "SW LDS" variable created for that kernel.
+// - Offset table:
+// Offset table will have multiple rows and columns.
+// Rows are assumed to be from 0 to (n-1). n is total number
+// of kernels accessing the LDS through non-kernels.
+// Each row will have m elements. m is the total number of
+// unique LDS globals accessed by all non-kernels.
+// Each element in the row corresponds to the ptr of
+// the replacement of LDS global done by that particular kernel.
+// A LDS variable in non-kernel will be replaced based on the information
+// from base and offset tables. Based on kernel-id query, ptr of "SW
+// LDS" for that corresponding kernel is obtained from base table.
+// The Offset into the base "SW LDS" is obtained from
+// corresponding element in offset table. With this information, replacement
+// value is obtained.
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUAsanInstrumentation.h"
+#include "AMDGPUMemoryUtils.h"
+#include "AMDGPUTargetMachine.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/SetOperations.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DIBuilder.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/ReplaceConstant.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Instrumentation/AddressSanitizerCommon.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+
+#include <algorithm>
+
+#define DEBUG_TYPE "amdgpu-sw-lower-lds"
+#define COV5_HIDDEN_DYN_LDS_SIZE_ARG 15
+
+using namespace llvm;
+using namespace AMDGPU;
+
+namespace {
+
+// Hidden command-line toggle (default on): when set, the global-memory
+// loads/stores this pass produces in place of LDS accesses are also queued
+// for ASan instrumentation.
+cl::opt<bool>
+ AsanInstrumentLDS("amdgpu-asan-instrument-lds",
+ cl::desc("Run asan instrumentation on LDS instructions "
+ "lowered to global memory"),
+ cl::init(true), cl::Hidden);
+
+// Callback used to fetch a function's DominatorTree on demand.
+using DomTreeCallback = function_ref<DominatorTree *(Function &F)>;
+
+// Partition of LDS globals reached by a function into statically sized
+// globals and dynamically sized (dynamic LDS) globals.
+struct LDSAccessTypeInfo {
+ SetVector<GlobalVariable *> StaticLDSGlobals;
+ SetVector<GlobalVariable *> DynamicLDSGlobals;
+};
+
+// Struct to hold all the Metadata required for a kernel
+// to replace a LDS global uses with corresponding offset
+// in to device global memory.
+struct KernelLDSParameters {
+ // "SW LDS": per-kernel LDS pointer global holding the malloc'ed base.
+ GlobalVariable *SwLDS = nullptr;
+ // Per-kernel dynamic-LDS marker global (null if kernel uses no dyn LDS).
+ GlobalVariable *SwDynLDS = nullptr;
+ // "SW LDS metadata": per-kernel {offset, size, aligned-size} table.
+ GlobalVariable *SwLDSMetadata = nullptr;
+ LDSAccessTypeInfo DirectAccess;
+ LDSAccessTypeInfo IndirectAccess;
+ // Maps each LDS global to its {0, row, 0} GEP indices into the metadata.
+ DenseMap<GlobalVariable *, SmallVector<uint32_t, 3>>
+ LDSToReplacementIndicesMap;
+ // Total bytes to malloc for the static portion (incl. redzones, aligned).
+ uint32_t MallocSize = 0;
+ // Aligned size of the SW LDS global; used for "amdgpu-lds-size".
+ uint32_t LDSSize = 0;
+ // {redzone start offset, redzone size} pairs for later poisoning.
+ SmallVector<std::pair<uint32_t, uint32_t>, 64> RedzoneOffsetAndSizeVector;
+};
+
+// Struct to store information for creation of offset table
+// for all the non-kernel LDS accesses.
+struct NonKernelLDSParameters {
+ // One row: ptr to each kernel's SW LDS, indexed by kernel-id.
+ GlobalVariable *LDSBaseTable = nullptr;
+ // Row per kernel, column per LDS global: replacement offsets.
+ GlobalVariable *LDSOffsetTable = nullptr;
+ SetVector<Function *> OrderedKernels;
+ // NOTE(review): name has a typo ("Ordere"); kept as-is — renaming would
+ // need the rest of the file.
+ SetVector<GlobalVariable *> OrdereLDSGlobals;
+};
+
+// ASan shadow-mapping parameters plus the set of lowered global-memory
+// instructions that still need instrumentation.
+struct AsanInstrumentInfo {
+ int Scale = 0;
+ uint32_t Offset = 0;
+ SetVector<Instruction *> Instructions;
+};
+
+// Module-wide view of which functions touch LDS and how.
+struct FunctionsAndLDSAccess {
+ DenseMap<Function *, KernelLDSParameters> KernelToLDSParametersMap;
+ SetVector<Function *> KernelsWithIndirectLDSAccess;
+ // Non-kernels that receive an addrspace(3) pointer as an argument.
+ SetVector<Function *> NonKernelsWithLDSArgument;
+ SetVector<GlobalVariable *> AllNonKernelLDSAccess;
+ FunctionVariableMap NonKernelToLDSAccessMap;
+};
+
+// Driver class for the software LDS lowering: rewrites LDS accesses in
+// kernels and non-kernels to dynamically allocated global memory (see the
+// file header for the full scheme).
+class AMDGPUSwLowerLDS {
+public:
+ AMDGPUSwLowerLDS(Module &Mod, const AMDGPUTargetMachine &TM,
+ DomTreeCallback Callback)
+ : M(Mod), AMDGPUTM(TM), IRB(M.getContext()), DTCallback(Callback) {}
+ // Entry point; returns true if the module was changed.
+ bool run();
+ // Analysis helpers: collect and order LDS uses.
+ void getUsesOfLDSByNonKernels();
+ void getNonKernelsWithLDSArguments(const CallGraph &CG);
+ SetVector<Function *>
+ getOrderedIndirectLDSAccessingKernels(SetVector<Function *> &Kernels);
+ SetVector<GlobalVariable *>
+ getOrderedNonKernelAllLDSGlobals(SetVector<GlobalVariable *> &Variables);
+ // Per-kernel global construction: SW LDS, dyn-LDS marker, metadata table.
+ void buildSwLDSGlobal(Function *Func);
+ void buildSwDynLDSGlobal(Function *Func);
+ void populateSwMetadataGlobal(Function *Func);
+ void populateSwLDSAttributeAndMetadata(Function *Func);
+ void populateLDSToReplacementIndicesMap(Function *Func);
+ // Rewriting: collect LDS memory ops and redirect them to global memory.
+ void getLDSMemoryInstructions(Function *Func,
+ SetVector<Instruction *> &LDSInstructions);
+ void replaceKernelLDSAccesses(Function *Func);
+ Value *getTranslatedGlobalMemoryGEPOfLDSPointer(Value *LoadMallocPtr,
+ Value *LDSPtr);
+ void translateLDSMemoryOperationsToGlobalMemory(
+ Function *Func, Value *LoadMallocPtr,
+ SetVector<Instruction *> &LDSInstructions);
+ void poisonRedzones(Function *Func, Value *MallocPtr);
+ void lowerKernelLDSAccesses(Function *Func, DomTreeUpdater &DTU);
+ // Non-kernel lowering via base/offset tables keyed by kernel-id.
+ void buildNonKernelLDSOffsetTable(NonKernelLDSParameters &NKLDSParams);
+ void buildNonKernelLDSBaseTable(NonKernelLDSParameters &NKLDSParams);
+ Constant *
+ getAddressesOfVariablesInKernel(Function *Func,
+ SetVector<GlobalVariable *> &Variables);
+ void lowerNonKernelLDSAccesses(Function *Func,
+ SetVector<GlobalVariable *> &LDSGlobals,
+ NonKernelLDSParameters &NKLDSParams);
+ void
+ updateMallocSizeForDynamicLDS(Function *Func, Value **CurrMallocSize,
+ Value *HiddenDynLDSSize,
+ SetVector<GlobalVariable *> &DynamicLDSGlobals);
+ void initAsanInfo();
+
+private:
+ Module &M;
+ const AMDGPUTargetMachine &AMDGPUTM;
+ IRBuilder<> IRB;
+ DomTreeCallback DTCallback;
+ FunctionsAndLDSAccess FuncLDSAccessInfo;
+ AsanInstrumentInfo AsanInfo;
+};
+
+// Sort a vector of GlobalVariable*/Function* by name and return the sorted
+// order as a SetVector (deterministic iteration, duplicates dropped).
+template <typename T> SetVector<T> sortByName(std::vector<T> &&V) {
+ // Sort the vector of globals or Functions based on their name.
+ // Returns a SetVector of globals/Functions.
+ sort(V, [](const auto *L, const auto *R) {
+ return L->getName() < R->getName();
+ });
+ return {SetVector<T>(V.begin(), V.end())};
+}
+
+SetVector<GlobalVariable *> AMDGPUSwLowerLDS::getOrderedNonKernelAllLDSGlobals(
+ SetVector<GlobalVariable *> &Variables) {
+ // Sort all the non-kernel LDS accesses based on their name, so the offset
+ // table gets a deterministic column order.
+ return sortByName(
+ std::vector<GlobalVariable *>(Variables.begin(), Variables.end()));
+}
+
+SetVector<Function *> AMDGPUSwLowerLDS::getOrderedIndirectLDSAccessingKernels(
+ SetVector<Function *> &Kernels) {
+ // Sort the kernels that (indirectly) access LDS based on their name.
+ // Also assign a kernel ID metadata based on the sorted order; non-kernels
+ // later use this ID to index the base/offset tables.
+ LLVMContext &Ctx = M.getContext();
+ if (Kernels.size() > UINT32_MAX) {
+ report_fatal_error("Unimplemented SW LDS lowering for > 2**32 kernels");
+ }
+ SetVector<Function *> OrderedKernels =
+ sortByName(std::vector<Function *>(Kernels.begin(), Kernels.end()));
+ for (size_t i = 0; i < Kernels.size(); i++) {
+ Metadata *AttrMDArgs[1] = {
+ ConstantAsMetadata::get(IRB.getInt32(i)),
+ };
+ Function *Func = OrderedKernels[i];
+ Func->setMetadata("llvm.amdgcn.lds.kernel.id",
+ MDNode::get(Ctx, AttrMDArgs));
+ }
+ return std::move(OrderedKernels);
+}
+
+void AMDGPUSwLowerLDS::getNonKernelsWithLDSArguments(const CallGraph &CG) {
+ // Among the kernels accessing LDS, get list of
+ // Non-kernels to which a call is made and a ptr
+ // to addrspace(3) is passed as argument.
+ for (auto &K : FuncLDSAccessInfo.KernelToLDSParametersMap) {
+ Function *Func = K.first;
+ const CallGraphNode *CGN = CG[Func];
+ if (!CGN)
+ continue;
+ // Walk the kernel's callees.
+ for (auto &I : *CGN) {
+ CallGraphNode *CallerCGN = I.second;
+ Function *CalledFunc = CallerCGN->getFunction();
+ if (!CalledFunc)
+ continue;
+ if (AMDGPU::isKernelLDS(CalledFunc))
+ continue;
+ // Record the callee if any of its parameters is an LDS pointer.
+ for (auto AI = CalledFunc->arg_begin(), E = CalledFunc->arg_end();
+ AI != E; ++AI) {
+ Type *ArgTy = (*AI).getType();
+ if (!ArgTy->isPointerTy())
+ continue;
+ if (ArgTy->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
+ continue;
+ FuncLDSAccessInfo.NonKernelsWithLDSArgument.insert(CalledFunc);
+ // Also add the Calling function to KernelsWithIndirectLDSAccess list
+ // so that base table of LDS is generated.
+ FuncLDSAccessInfo.KernelsWithIndirectLDSAccess.insert(Func);
+ }
+ }
+ }
+}
+
+// Record, for every non-kernel function built with SanitizeAddress, which
+// LDS globals it uses directly via instructions.
+void AMDGPUSwLowerLDS::getUsesOfLDSByNonKernels() {
+ for (GlobalVariable *GV : FuncLDSAccessInfo.AllNonKernelLDSAccess) {
+ if (!AMDGPU::isLDSVariableToLower(*GV))
+ continue;
+
+ for (User *V : GV->users()) {
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ Function *F = I->getFunction();
+ // Only non-kernels compiled with ASan participate in this lowering.
+ if (!isKernelLDS(F) && F->hasFnAttribute(Attribute::SanitizeAddress))
+ FuncLDSAccessInfo.NonKernelToLDSAccessMap[F].insert(GV);
+ }
+ }
+ }
+}
+
+static void recordLDSAbsoluteAddress(Module &M, GlobalVariable *GV,
+ uint32_t Address) {
+ // Write the specified address into metadata where it can be retrieved by
+ // the assembler. Format is a half open range, [Address Address+1)
+ LLVMContext &Ctx = M.getContext();
+ auto *IntTy = M.getDataLayout().getIntPtrType(Ctx, AMDGPUAS::LOCAL_ADDRESS);
+ MDBuilder MDB(Ctx);
+ MDNode *MetadataNode = MDB.createRange(ConstantInt::get(IntTy, Address),
+ ConstantInt::get(IntTy, Address + 1));
+ GV->setMetadata(LLVMContext::MD_absolute_symbol, MetadataNode);
+}
+
+// Attach the "amdgpu-lds-size" attribute. For dynamic LDS the value is
+// emitted as "size,size" (static size doubles as the dynamic start offset).
+// A zero offset means no LDS is used; no attribute is added then.
+static void addLDSSizeAttribute(Function *Func, uint32_t Offset,
+ bool IsDynLDS) {
+ if (Offset != 0) {
+ std::string Buffer;
+ raw_string_ostream SS{Buffer};
+ SS << Offset;
+ if (IsDynLDS)
+ SS << "," << Offset;
+ Func->addFnAttr("amdgpu-lds-size", Buffer);
+ }
+}
+
+// Keep SGV alive from Func's entry block: emit a llvm.donothing call that
+// carries a GEP to SGV in an "ExplicitUse" operand bundle, so the global is
+// considered used by this kernel without generating real code.
+static void markUsedByKernel(Function *Func, GlobalVariable *SGV) {
+ BasicBlock *Entry = &Func->getEntryBlock();
+ IRBuilder<> Builder(Entry, Entry->getFirstNonPHIIt());
+
+ Function *Decl =
+ Intrinsic::getDeclaration(Func->getParent(), Intrinsic::donothing, {});
+
+ Value *UseInstance[1] = {
+ Builder.CreateConstInBoundsGEP1_32(SGV->getValueType(), SGV, 0)};
+
+ Builder.CreateCall(Decl, {},
+ {OperandBundleDefT<Value *>("ExplicitUse", UseInstance)});
+}
+
+void AMDGPUSwLowerLDS::buildSwLDSGlobal(Function *Func) {
+ // Create new LDS global required for each kernel to store
+ // device global memory pointer.
+ auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
+ // Create new global pointer variable in addrspace(3); initialized to
+ // poison, the malloc'ed base pointer is stored here at kernel prologue.
+ LDSParams.SwLDS = new GlobalVariable(
+ M, IRB.getPtrTy(), false, GlobalValue::InternalLinkage,
+ PoisonValue::get(IRB.getPtrTy()), "llvm.amdgcn.sw.lds." + Func->getName(),
+ nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS, false);
+ // Exclude this pass-created global from ASan's own instrumentation.
+ GlobalValue::SanitizerMetadata MD;
+ MD.NoAddress = true;
+ LDSParams.SwLDS->setSanitizerMetadata(MD);
+ return;
+}
+
+void AMDGPUSwLowerLDS::buildSwDynLDSGlobal(Function *Func) {
+ // Create new Dyn LDS global if kernel accesses dyn LDS.
+ auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
+ if (LDSParams.DirectAccess.DynamicLDSGlobals.empty() &&
+ LDSParams.IndirectAccess.DynamicLDSGlobals.empty())
+ return;
+ // Create new global pointer variable: a zero-length i8 array is the
+ // canonical shape of an externally sized (dynamic) LDS global.
+ auto emptyCharArray = ArrayType::get(IRB.getInt8Ty(), 0);
+ LDSParams.SwDynLDS = new GlobalVariable(
+ M, emptyCharArray, false, GlobalValue::ExternalLinkage, nullptr,
+ "llvm.amdgcn." + Func->getName() + ".dynlds", nullptr,
+ GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS, false);
+ // Anchor the global to this kernel so it is not dropped as unused.
+ markUsedByKernel(Func, LDSParams.SwDynLDS);
+ // Exclude this pass-created global from ASan's own instrumentation.
+ GlobalValue::SanitizerMetadata MD;
+ MD.NoAddress = true;
+ LDSParams.SwDynLDS->setSanitizerMetadata(MD);
+ return;
+}
+
+// Pin the SW LDS global at LDS address 0, record the kernel's LDS footprint
+// via "amdgpu-lds-size", and place the dynamic-LDS marker right after the
+// static allocation (at offset LDSSize).
+void AMDGPUSwLowerLDS::populateSwLDSAttributeAndMetadata(Function *Func) {
+ auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
+ bool IsDynLDSUsed = LDSParams.SwDynLDS ? true : false;
+ uint32_t Offset = LDSParams.LDSSize;
+ recordLDSAbsoluteAddress(M, LDSParams.SwLDS, 0);
+ addLDSSizeAttribute(Func, Offset, IsDynLDSUsed);
+ if (LDSParams.SwDynLDS)
+ recordLDSAbsoluteAddress(M, LDSParams.SwDynLDS, Offset);
+}
+
+void AMDGPUSwLowerLDS::populateSwMetadataGlobal(Function *Func) {
+ // Create new metadata global for every kernel and initialize the
+ // start offsets and sizes corresponding to each LDS accesses.
+ auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
+ auto &Ctx = M.getContext();
+ auto &DL = M.getDataLayout();
+ std::vector<Type *> Items;
+ Type *Int32Ty = IRB.getInt32Ty();
+ std::vector<Constant *> Initializers;
+ // MaxAlignment is the max over every LDS global this kernel reaches;
+ // every packed entry is padded up to it.
+ Align MaxAlignment(1);
+ auto UpdateMaxAlignment = [&MaxAlignment, &DL](GlobalVariable *GV) {
+ Align GVAlign = AMDGPU::getAlign(DL, GV);
+ MaxAlignment = std::max(MaxAlignment, GVAlign);
+ };
+
+ for (GlobalVariable *GV : LDSParams.DirectAccess.StaticLDSGlobals)
+ UpdateMaxAlignment(GV);
+
+ for (GlobalVariable *GV : LDSParams.DirectAccess.DynamicLDSGlobals)
+ UpdateMaxAlignment(GV);
+
+ for (GlobalVariable *GV : LDSParams.IndirectAccess.StaticLDSGlobals)
+ UpdateMaxAlignment(GV);
+
+ for (GlobalVariable *GV : LDSParams.IndirectAccess.DynamicLDSGlobals)
+ UpdateMaxAlignment(GV);
+
+ // One metadata row per LDS global: {StartOffset, SizeInBytes,
+ // AlignedSizeInBytes}.
+ SmallString<128> MDItemStr;
+ raw_svector_ostream MDItemOS(MDItemStr);
+ MDItemOS << "llvm.amdgcn.sw.lds." << Func->getName() << ".md.item";
+
+ StructType *LDSItemTy =
+ StructType::create(Ctx, {Int32Ty, Int32Ty, Int32Ty}, MDItemOS.str());
+ uint32_t &MallocSize = LDSParams.MallocSize;
+ SetVector<GlobalVariable *> UniqueLDSGlobals;
+ int AsanScale = AsanInfo.Scale;
+ // Appends one metadata row per (unique) LDS global and advances
+ // MallocSize past the global plus its ASan redzone, aligned.
+ auto buildInitializerForSwLDSMD =
+ [&](SetVector<GlobalVariable *> &LDSGlobals) {
+ for (auto &GV : LDSGlobals) {
+ if (is_contained(UniqueLDSGlobals, GV))
+ continue;
+ UniqueLDSGlobals.insert(GV);
+
+ Type *Ty = GV->getValueType();
+ const uint64_t SizeInBytes = DL.getTypeAllocSize(Ty);
+ Items.push_back(LDSItemTy);
+ Constant *ItemStartOffset = ConstantInt::get(Int32Ty, MallocSize);
+ Constant *SizeInBytesConst = ConstantInt::get(Int32Ty, SizeInBytes);
+ // Get redzone size corresponding to the allocation size.
+ const uint64_t RightRedzoneSize =
+ AMDGPU::getRedzoneSizeForGlobal(AsanScale, SizeInBytes);
+ // Update MallocSize with current size and redzone size.
+ MallocSize += SizeInBytes;
+ // Dynamic LDS redzones are not recorded here; their size is only
+ // known at runtime.
+ if (!AMDGPU::isDynamicLDS(*GV))
+ LDSParams.RedzoneOffsetAndSizeVector.emplace_back(MallocSize,
+ RightRedzoneSize);
+ MallocSize += RightRedzoneSize;
+ // Align current size plus redzone.
+ uint64_t AlignedSize =
+ alignTo(SizeInBytes + RightRedzoneSize, MaxAlignment);
+ Constant *AlignedSizeInBytesConst =
+ ConstantInt::get(Int32Ty, AlignedSize);
+ // Align MallocSize
+ MallocSize = alignTo(MallocSize, MaxAlignment);
+ Constant *InitItem =
+ ConstantStruct::get(LDSItemTy, {ItemStartOffset, SizeInBytesConst,
+ AlignedSizeInBytesConst});
+ Initializers.push_back(InitItem);
+ }
+ };
+ // Row 0 is the SW LDS pointer itself; then statics before dynamics, as
+ // the layout described in the file header requires.
+ SetVector<GlobalVariable *> SwLDSVector;
+ SwLDSVector.insert(LDSParams.SwLDS);
+ buildInitializerForSwLDSMD(SwLDSVector);
+ buildInitializerForSwLDSMD(LDSParams.DirectAccess.StaticLDSGlobals);
+ buildInitializerForSwLDSMD(LDSParams.IndirectAccess.StaticLDSGlobals);
+ buildInitializerForSwLDSMD(LDSParams.DirectAccess.DynamicLDSGlobals);
+ buildInitializerForSwLDSMD(LDSParams.IndirectAccess.DynamicLDSGlobals);
+
+ // Update the LDS size used by the kernel.
+ Type *Ty = LDSParams.SwLDS->getValueType();
+ const uint64_t SizeInBytes = DL.getTypeAllocSize(Ty);
+ uint64_t AlignedSize = alignTo(SizeInBytes, MaxAlignment);
+ LDSParams.LDSSize = AlignedSize;
+ SmallString<128> MDTypeStr;
+ raw_svector_ostream MDTypeOS(MDTypeStr);
+ MDTypeOS << "llvm.amdgcn.sw.lds." << Func->getName() << ".md.type";
+ StructType *MetadataStructType =
+ StructType::create(Ctx, Items, MDTypeOS.str());
+ SmallString<128> MDStr;
+ raw_svector_ostream MDOS(MDStr);
+ MDOS << "llvm.amdgcn.sw.lds." << Func->getName() << ".md";
+ // The metadata table lives in global memory (it stores runtime offsets).
+ LDSParams.SwLDSMetadata = new GlobalVariable(
+ M, MetadataStructType, false, GlobalValue::InternalLinkage,
+ PoisonValue::get(MetadataStructType), MDOS.str(), nullptr,
+ GlobalValue::NotThreadLocal, AMDGPUAS::GLOBAL_ADDRESS, false);
+ Constant *data = ConstantStruct::get(MetadataStructType, Initializers);
+ LDSParams.SwLDSMetadata->setInitializer(data);
+ assert(LDSParams.SwLDS);
+ // Set the alignment to MaxAlignment for SwLDS.
+ LDSParams.SwLDS->setAlignment(MaxAlignment);
+ if (LDSParams.SwDynLDS)
+ LDSParams.SwDynLDS->setAlignment(MaxAlignment);
+ // Exclude the metadata table from ASan's own instrumentation.
+ GlobalValue::SanitizerMetadata MD;
+ MD.NoAddress = true;
+ LDSParams.SwLDSMetadata->setSanitizerMetadata(MD);
+ return;
+}
+
+void AMDGPUSwLowerLDS::populateLDSToReplacementIndicesMap(Function *Func) {
+ // Fill the corresponding LDS replacement indices for each LDS access
+ // related to this kernel. The {0, Idx, 0} triple addresses row Idx,
+ // field 0 (the offset) of the metadata struct; the enumeration order
+ // must mirror the one used in populateSwMetadataGlobal.
+ auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
+ SetVector<GlobalVariable *> UniqueLDSGlobals;
+ auto PopulateIndices = [&](SetVector<GlobalVariable *> &LDSGlobals,
+ uint32_t &Idx) {
+ for (auto &GV : LDSGlobals) {
+ if (is_contained(UniqueLDSGlobals, GV))
+ continue;
+ UniqueLDSGlobals.insert(GV);
+ LDSParams.LDSToReplacementIndicesMap[GV] = {0, Idx, 0};
+ ++Idx;
+ }
+ };
+ uint32_t Idx = 0;
+ SetVector<GlobalVariable *> SwLDSVector;
+ SwLDSVector.insert(LDSParams.SwLDS);
+ PopulateIndices(SwLDSVector, Idx);
+ PopulateIndices(LDSParams.DirectAccess.StaticLDSGlobals, Idx);
+ PopulateIndices(LDSParams.IndirectAccess.StaticLDSGlobals, Idx);
+ PopulateIndices(LDSParams.DirectAccess.DynamicLDSGlobals, Idx);
+ PopulateIndices(LDSParams.IndirectAccess.DynamicLDSGlobals, Idx);
+ return;
+}
+
+static void replacesUsesOfGlobalInFunction(Function *Func, GlobalVariable *GV,
+ Value *Replacement) {
+ // Replace all uses of LDS global in this Function with a Replacement.
+ // Only instruction uses inside Func are rewritten; uses in other
+ // functions (or in constants) are left alone.
+ auto ReplaceUsesLambda = [Func](const Use &U) -> bool {
+ auto *V = U.getUser();
+ if (auto *Inst = dyn_cast<Instruction>(V)) {
+ auto *Func1 = Inst->getParent()->getParent();
+ if (Func == Func1)
+ return true;
+ }
+ return false;
+ };
+ GV->replaceUsesWithIf(Replacement, ReplaceUsesLambda);
+}
+
+// For every LDS global directly used by this kernel, load its offset from
+// the metadata table and replace the global with SwLDS + offset (still an
+// addrspace(3) GEP; the global-memory translation happens later).
+void AMDGPUSwLowerLDS::replaceKernelLDSAccesses(Function *Func) {
+ auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
+ GlobalVariable *SwLDS = LDSParams.SwLDS;
+ assert(SwLDS);
+ GlobalVariable *SwLDSMetadata = LDSParams.SwLDSMetadata;
+ assert(SwLDSMetadata);
+ StructType *SwLDSMetadataStructType =
+ cast<StructType>(SwLDSMetadata->getValueType());
+ Type *Int32Ty = IRB.getInt32Ty();
+ auto &IndirectAccess = LDSParams.IndirectAccess;
+ auto &DirectAccess = LDSParams.DirectAccess;
+ // Replace all uses of LDS global in this Function with a Replacement.
+ SetVector<GlobalVariable *> UniqueLDSGlobals;
+ auto ReplaceLDSGlobalUses = [&](SetVector<GlobalVariable *> &LDSGlobals) {
+ for (auto &GV : LDSGlobals) {
+ // Do not generate instructions if LDS access is in non-kernel
+ // i.e indirect-access.
+ if ((IndirectAccess.StaticLDSGlobals.contains(GV) ||
+ IndirectAccess.DynamicLDSGlobals.contains(GV)) &&
+ (!DirectAccess.StaticLDSGlobals.contains(GV) &&
+ !DirectAccess.DynamicLDSGlobals.contains(GV)))
+ continue;
+ if (is_contained(UniqueLDSGlobals, GV))
+ continue;
+ UniqueLDSGlobals.insert(GV);
+ auto &Indices = LDSParams.LDSToReplacementIndicesMap[GV];
+ assert(Indices.size() == 3);
+ Constant *GEPIdx[] = {ConstantInt::get(Int32Ty, Indices[0]),
+ ConstantInt::get(Int32Ty, Indices[1]),
+ ConstantInt::get(Int32Ty, Indices[2])};
+ // GEP into the metadata row's offset field, then load the offset.
+ Constant *GEP = ConstantExpr::getGetElementPtr(
+ SwLDSMetadataStructType, SwLDSMetadata, GEPIdx, true);
+ Value *Offset = IRB.CreateLoad(Int32Ty, GEP);
+ Value *BasePlusOffset =
+ IRB.CreateInBoundsGEP(IRB.getInt8Ty(), SwLDS, {Offset});
+ LLVM_DEBUG(GV->printAsOperand(dbgs() << "Sw LDS Lowering, Replacing LDS ",
+ false));
+ replacesUsesOfGlobalInFunction(Func, GV, BasePlusOffset);
+ }
+ };
+ ReplaceLDSGlobalUses(DirectAccess.StaticLDSGlobals);
+ ReplaceLDSGlobalUses(IndirectAccess.StaticLDSGlobals);
+ ReplaceLDSGlobalUses(DirectAccess.DynamicLDSGlobals);
+ ReplaceLDSGlobalUses(IndirectAccess.DynamicLDSGlobals);
+}
+
+// Emit IR that, at kernel runtime, patches each dynamic-LDS metadata row
+// with its actual offset/size (read from the hidden dynamic-LDS-size kernel
+// argument) and grows *CurrMallocSize accordingly. The alignment round-up is
+// done with add/udiv/mul since the size is only known at runtime.
+void AMDGPUSwLowerLDS::updateMallocSizeForDynamicLDS(
+ Function *Func, Value **CurrMallocSize, Value *HiddenDynLDSSize,
+ SetVector<GlobalVariable *> &DynamicLDSGlobals) {
+ auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
+ Type *Int32Ty = IRB.getInt32Ty();
+
+ GlobalVariable *SwLDS = LDSParams.SwLDS;
+ GlobalVariable *SwLDSMetadata = LDSParams.SwLDSMetadata;
+ assert(SwLDS && SwLDSMetadata);
+ StructType *MetadataStructType =
+ cast<StructType>(SwLDSMetadata->getValueType());
+ unsigned MaxAlignment = SwLDS->getAlignment();
+ Value *MaxAlignValue = IRB.getInt32(MaxAlignment);
+ Value *MaxAlignValueMinusOne = IRB.getInt32(MaxAlignment - 1);
+
+ for (GlobalVariable *DynGV : DynamicLDSGlobals) {
+ auto &Indices = LDSParams.LDSToReplacementIndicesMap[DynGV];
+ // Update the Offset metadata (field 0 of the row).
+ Constant *Index0 = ConstantInt::get(Int32Ty, 0);
+ Constant *Index1 = ConstantInt::get(Int32Ty, Indices[1]);
+
+ Constant *Index2Offset = ConstantInt::get(Int32Ty, 0);
+ auto *GEPForOffset = IRB.CreateInBoundsGEP(
+ MetadataStructType, SwLDSMetadata, {Index0, Index1, Index2Offset});
+
+ IRB.CreateStore(*CurrMallocSize, GEPForOffset);
+ // Update the size and Aligned Size metadata (fields 1 and 2).
+ Constant *Index2Size = ConstantInt::get(Int32Ty, 1);
+ auto *GEPForSize = IRB.CreateInBoundsGEP(MetadataStructType, SwLDSMetadata,
+ {Index0, Index1, Index2Size});
+
+ Value *CurrDynLDSSize = IRB.CreateLoad(Int32Ty, HiddenDynLDSSize);
+ IRB.CreateStore(CurrDynLDSSize, GEPForSize);
+ Constant *Index2AlignedSize = ConstantInt::get(Int32Ty, 2);
+ auto *GEPForAlignedSize = IRB.CreateInBoundsGEP(
+ MetadataStructType, SwLDSMetadata, {Index0, Index1, Index2AlignedSize});
+
+ // AlignedSize = roundup(size, MaxAlignment), computed at runtime.
+ Value *AlignedDynLDSSize =
+ IRB.CreateAdd(CurrDynLDSSize, MaxAlignValueMinusOne);
+ AlignedDynLDSSize = IRB.CreateUDiv(AlignedDynLDSSize, MaxAlignValue);
+ AlignedDynLDSSize = IRB.CreateMul(AlignedDynLDSSize, MaxAlignValue);
+ IRB.CreateStore(AlignedDynLDSSize, GEPForAlignedSize);
+
+ // Update the Current Malloc Size
+ *CurrMallocSize = IRB.CreateAdd(*CurrMallocSize, AlignedDynLDSSize);
+ }
+}
+
+// Return InsertBefore's debug location if it has one; otherwise synthesize
+// a location at the subprogram's first line (column 1), or an empty DebugLoc
+// when no subprogram is available.
+static DebugLoc getOrCreateDebugLoc(const Instruction *InsertBefore,
+ DISubprogram *SP) {
+ assert(InsertBefore);
+ if (InsertBefore->getDebugLoc())
+ return InsertBefore->getDebugLoc();
+ if (SP)
+ return DILocation::get(SP->getContext(), SP->getLine(), 1, SP);
+ return DebugLoc();
+}
+
+// Collect every memory instruction in Func whose pointer operand is in the
+// LDS address space: loads, stores, atomicrmw and cmpxchg. (Other LDS users,
+// e.g. intrinsics or memcpy, are not gathered here.)
+void AMDGPUSwLowerLDS::getLDSMemoryInstructions(
+ Function *Func, SetVector<Instruction *> &LDSInstructions) {
+ for (BasicBlock &BB : *Func) {
+ for (Instruction &Inst : BB) {
+ if (LoadInst *LI = dyn_cast<LoadInst>(&Inst)) {
+ if (LI->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS)
+ LDSInstructions.insert(&Inst);
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(&Inst)) {
+ if (SI->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS)
+ LDSInstructions.insert(&Inst);
+ } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(&Inst)) {
+ if (RMW->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS)
+ LDSInstructions.insert(&Inst);
+ } else if (AtomicCmpXchgInst *XCHG = dyn_cast<AtomicCmpXchgInst>(&Inst)) {
+ if (XCHG->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS)
+ LDSInstructions.insert(&Inst);
+ } else
+ continue;
+ }
+ }
+}
+
+// Re-base an LDS pointer onto the kernel's malloc'ed global-memory buffer.
+// The LDS pointer's integer value is treated as a byte offset from
+// \p LoadMallocPtr, the base of the buffer.
+Value *
+AMDGPUSwLowerLDS::getTranslatedGlobalMemoryGEPOfLDSPointer(Value *LoadMallocPtr,
+                                                           Value *LDSPtr) {
+  assert(LDSPtr && "Invalid LDS pointer operand");
+  Value *ByteOffset = IRB.CreatePtrToInt(LDSPtr, IRB.getInt32Ty());
+  return IRB.CreateInBoundsGEP(IRB.getInt8Ty(), LoadMallocPtr, {ByteOffset});
+}
+
+// Rewrite every collected LDS access in \p Func to operate on the malloc'ed
+// global-memory buffer instead: each instruction's LDS pointer is re-based
+// onto \p LoadMallocPtr, a fresh instruction preserving alignment, ordering,
+// sync scope and volatility replaces the original, and the new instruction
+// is recorded in AsanInfo.Instructions for later ASan instrumentation.
+void AMDGPUSwLowerLDS::translateLDSMemoryOperationsToGlobalMemory(
+    Function *Func, Value *LoadMallocPtr,
+    SetVector<Instruction *> &LDSInstructions) {
+  LLVM_DEBUG(dbgs() << "Translating LDS memory operations to global memory : "
+                    << Func->getName());
+  for (Instruction *Inst : LDSInstructions) {
+    // New instructions are emitted immediately before the one they replace.
+    IRB.SetInsertPoint(Inst);
+    if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
+      Value *LIOperand = LI->getPointerOperand();
+      Value *Replacement =
+          getTranslatedGlobalMemoryGEPOfLDSPointer(LoadMallocPtr, LIOperand);
+      LoadInst *NewLI = IRB.CreateAlignedLoad(LI->getType(), Replacement,
+                                              LI->getAlign(), LI->isVolatile());
+      NewLI->setAtomic(LI->getOrdering(), LI->getSyncScopeID());
+      AsanInfo.Instructions.insert(NewLI);
+      LI->replaceAllUsesWith(NewLI);
+      LI->eraseFromParent();
+    } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+      Value *SIOperand = SI->getPointerOperand();
+      Value *Replacement =
+          getTranslatedGlobalMemoryGEPOfLDSPointer(LoadMallocPtr, SIOperand);
+      StoreInst *NewSI = IRB.CreateAlignedStore(
+          SI->getValueOperand(), Replacement, SI->getAlign(), SI->isVolatile());
+      NewSI->setAtomic(SI->getOrdering(), SI->getSyncScopeID());
+      AsanInfo.Instructions.insert(NewSI);
+      SI->replaceAllUsesWith(NewSI);
+      SI->eraseFromParent();
+    } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Inst)) {
+      Value *RMWPtrOperand = RMW->getPointerOperand();
+      Value *RMWValOperand = RMW->getValOperand();
+      Value *Replacement = getTranslatedGlobalMemoryGEPOfLDSPointer(
+          LoadMallocPtr, RMWPtrOperand);
+      AtomicRMWInst *NewRMW = IRB.CreateAtomicRMW(
+          RMW->getOperation(), Replacement, RMWValOperand, RMW->getAlign(),
+          RMW->getOrdering(), RMW->getSyncScopeID());
+      NewRMW->setVolatile(RMW->isVolatile());
+      AsanInfo.Instructions.insert(NewRMW);
+      RMW->replaceAllUsesWith(NewRMW);
+      RMW->eraseFromParent();
+    } else if (AtomicCmpXchgInst *XCHG = dyn_cast<AtomicCmpXchgInst>(Inst)) {
+      Value *XCHGPtrOperand = XCHG->getPointerOperand();
+      Value *Replacement = getTranslatedGlobalMemoryGEPOfLDSPointer(
+          LoadMallocPtr, XCHGPtrOperand);
+      AtomicCmpXchgInst *NewXCHG = IRB.CreateAtomicCmpXchg(
+          Replacement, XCHG->getCompareOperand(), XCHG->getNewValOperand(),
+          XCHG->getAlign(), XCHG->getSuccessOrdering(),
+          XCHG->getFailureOrdering(), XCHG->getSyncScopeID());
+      NewXCHG->setVolatile(XCHG->isVolatile());
+      AsanInfo.Instructions.insert(NewXCHG);
+      XCHG->replaceAllUsesWith(NewXCHG);
+      XCHG->eraseFromParent();
+    } else
+      // getLDSMemoryInstructions only collects the four kinds handled above,
+      // so reaching here indicates the two functions fell out of sync.
+      report_fatal_error("Unimplemented LDS lowering instruction");
+  }
+}
+
+// Emit a call to __asan_poison_region for every redzone recorded for kernel
+// \p Func, poisoning the corresponding byte range of the device-global
+// buffer rooted at \p MallocPtr.
+void AMDGPUSwLowerLDS::poisonRedzones(Function *Func, Value *MallocPtr) {
+  auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
+  Type *Int64Ty = IRB.getInt64Ty();
+  Type *VoidTy = IRB.getVoidTy();
+  FunctionCallee AsanPoisonRegion = M.getOrInsertFunction(
+      "__asan_poison_region",
+      FunctionType::get(VoidTy, {Int64Ty, Int64Ty}, false));
+
+  // Iterate by const reference; the previous code copied the whole vector
+  // and indexed it with an `unsigned` counter against a size_t bound.
+  for (const auto &RedzonePair : LDSParams.RedzoneOffsetAndSizeVector) {
+    uint64_t RedzoneOffset = RedzonePair.first;
+    uint64_t RedzoneSize = RedzonePair.second;
+    Value *RedzoneAddrOffset = IRB.CreateInBoundsGEP(
+        IRB.getInt8Ty(), MallocPtr, {IRB.getInt64(RedzoneOffset)});
+    Value *RedzoneAddress = IRB.CreatePtrToInt(RedzoneAddrOffset, Int64Ty);
+    IRB.CreateCall(AsanPoisonRegion,
+                   {RedzoneAddress, IRB.getInt64(RedzoneSize)});
+  }
+}
+
+// Lower all LDS accesses in kernel \p Func:
+//  * workitem {0,0,0} allocates a device-global buffer sized to cover all
+//    static and dynamic LDS used by the kernel (plus ASan redzones) and
+//    stores its address in the kernel's SW LDS global,
+//  * every workitem barriers, then LDS uses are rewritten to offsets into
+//    that buffer,
+//  * on every function exit the workitems barrier again and the allocating
+//    workitem frees the buffer.
+void AMDGPUSwLowerLDS::lowerKernelLDSAccesses(Function *Func,
+                                              DomTreeUpdater &DTU) {
+  LLVM_DEBUG(dbgs() << "Sw Lowering Kernel LDS for : " << Func->getName());
+  auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
+  auto &Ctx = M.getContext();
+  auto *PrevEntryBlock = &Func->getEntryBlock();
+  // Capture the LDS accesses before any rewriting takes place.
+  SetVector<Instruction *> LDSInstructions;
+  getLDSMemoryInstructions(Func, LDSInstructions);
+
+  // Create malloc block.
+  auto *MallocBlock = BasicBlock::Create(Ctx, "Malloc", Func, PrevEntryBlock);
+
+  // Create WIdBlock block which has instructions related to selection of
+  // {0,0,0} index work item in the work group.
+  auto *WIdBlock = BasicBlock::Create(Ctx, "WId", Func, MallocBlock);
+  IRB.SetInsertPoint(WIdBlock, WIdBlock->begin());
+  DebugLoc FirstDL =
+      getOrCreateDebugLoc(&*PrevEntryBlock->begin(), Func->getSubprogram());
+  IRB.SetCurrentDebugLocation(FirstDL);
+  // (x | y | z) == 0 identifies the single {0,0,0} workitem.
+  Value *WIdx = IRB.CreateIntrinsic(Intrinsic::amdgcn_workitem_id_x, {}, {});
+  Value *WIdy = IRB.CreateIntrinsic(Intrinsic::amdgcn_workitem_id_y, {}, {});
+  Value *WIdz = IRB.CreateIntrinsic(Intrinsic::amdgcn_workitem_id_z, {}, {});
+  Value *XYOr = IRB.CreateOr(WIdx, WIdy);
+  Value *XYZOr = IRB.CreateOr(XYOr, WIdz);
+  Value *WIdzCond = IRB.CreateICmpEQ(XYZOr, IRB.getInt32(0));
+
+  // All work items will branch to PrevEntryBlock except {0,0,0} index
+  // work item which will branch to malloc block.
+  IRB.CreateCondBr(WIdzCond, MallocBlock, PrevEntryBlock);
+
+  // Malloc block
+  IRB.SetInsertPoint(MallocBlock, MallocBlock->begin());
+
+  // If Dynamic LDS globals are accessed by the kernel,
+  // Get the size of dyn lds from hidden dyn_lds_size kernel arg.
+  // Update the corresponding metadata global entries for this dyn lds global.
+  GlobalVariable *SwLDS = LDSParams.SwLDS;
+  GlobalVariable *SwLDSMetadata = LDSParams.SwLDSMetadata;
+  assert(SwLDS && SwLDSMetadata);
+  StructType *MetadataStructType =
+      cast<StructType>(SwLDSMetadata->getValueType());
+  uint32_t MallocSize = 0;
+  Value *CurrMallocSize;
+  Type *Int32Ty = IRB.getInt32Ty();
+  Type *Int64Ty = IRB.getInt64Ty();
+
+  // De-duplicate globals that appear in both the direct and indirect sets.
+  SetVector<GlobalVariable *> UniqueLDSGlobals;
+  auto GetUniqueLDSGlobals = [&](SetVector<GlobalVariable *> &LDSGlobals) {
+    for (auto &GV : LDSGlobals) {
+      if (is_contained(UniqueLDSGlobals, GV))
+        continue;
+      UniqueLDSGlobals.insert(GV);
+    }
+  };
+
+  GetUniqueLDSGlobals(LDSParams.DirectAccess.StaticLDSGlobals);
+  GetUniqueLDSGlobals(LDSParams.IndirectAccess.StaticLDSGlobals);
+  // The +1 accounts for the metadata entry of the SW LDS pointer itself.
+  unsigned NumStaticLDS = 1 + UniqueLDSGlobals.size();
+  UniqueLDSGlobals.clear();
+
+  // NOTE(review): NumStaticLDS is 1 + size(), so this condition is always
+  // true and the else branch below is dead — confirm whether the intent was
+  // to test UniqueLDSGlobals.size() != 0.
+  if (NumStaticLDS) {
+    // Total static footprint = offset of the last static entry plus its
+    // aligned size, both read from the metadata global.
+    auto *GEPForEndStaticLDSOffset =
+        IRB.CreateInBoundsGEP(MetadataStructType, SwLDSMetadata,
+                              {ConstantInt::get(Int32Ty, 0),
+                               ConstantInt::get(Int32Ty, NumStaticLDS - 1),
+                               ConstantInt::get(Int32Ty, 0)});
+
+    auto *GEPForEndStaticLDSSize =
+        IRB.CreateInBoundsGEP(MetadataStructType, SwLDSMetadata,
+                              {ConstantInt::get(Int32Ty, 0),
+                               ConstantInt::get(Int32Ty, NumStaticLDS - 1),
+                               ConstantInt::get(Int32Ty, 2)});
+
+    Value *EndStaticLDSOffset =
+        IRB.CreateLoad(Int32Ty, GEPForEndStaticLDSOffset);
+    Value *EndStaticLDSSize = IRB.CreateLoad(Int32Ty, GEPForEndStaticLDSSize);
+    CurrMallocSize = IRB.CreateAdd(EndStaticLDSOffset, EndStaticLDSSize);
+  } else
+    CurrMallocSize = IRB.getInt32(MallocSize);
+
+  if (LDSParams.SwDynLDS) {
+    // The hidden dyn_lds_size kernel argument only exists from COv5 on.
+    if (!(AMDGPU::getAMDHSACodeObjectVersion(M) >= AMDGPU::AMDHSA_COV5))
+      report_fatal_error(
+          "Dynamic LDS size query is only supported for CO V5 and later.");
+    // Get size from hidden dyn_lds_size argument of kernel
+    Value *ImplicitArg =
+        IRB.CreateIntrinsic(Intrinsic::amdgcn_implicitarg_ptr, {}, {});
+    Value *HiddenDynLDSSize = IRB.CreateInBoundsGEP(
+        ImplicitArg->getType(), ImplicitArg,
+        {ConstantInt::get(Int64Ty, COV5_HIDDEN_DYN_LDS_SIZE_ARG)});
+    UniqueLDSGlobals.clear();
+    GetUniqueLDSGlobals(LDSParams.DirectAccess.DynamicLDSGlobals);
+    GetUniqueLDSGlobals(LDSParams.IndirectAccess.DynamicLDSGlobals);
+    updateMallocSizeForDynamicLDS(Func, &CurrMallocSize, HiddenDynLDSSize,
+                                  UniqueLDSGlobals);
+  }
+
+  // The allocator takes a 64-bit size.
+  CurrMallocSize = IRB.CreateZExt(CurrMallocSize, Int64Ty);
+
+  // Create a call to malloc function which does device global memory allocation
+  // with size equals to all LDS global accesses size in this kernel.
+  Value *ReturnAddress =
+      IRB.CreateIntrinsic(Intrinsic::returnaddress, {}, {IRB.getInt32(0)});
+  FunctionCallee MallocFunc = M.getOrInsertFunction(
+      StringRef("__asan_malloc_impl"),
+      FunctionType::get(Int64Ty, {Int64Ty, Int64Ty}, false));
+  Value *RAPtrToInt = IRB.CreatePtrToInt(ReturnAddress, Int64Ty);
+  Value *MallocCall = IRB.CreateCall(MallocFunc, {CurrMallocSize, RAPtrToInt});
+
+  Value *MallocPtr =
+      IRB.CreateIntToPtr(MallocCall, IRB.getPtrTy(AMDGPUAS::GLOBAL_ADDRESS));
+
+  // Create store of malloc to new global
+  IRB.CreateStore(MallocPtr, SwLDS);
+
+  // Create calls to __asan_poison_region to poison redzones.
+  poisonRedzones(Func, MallocPtr);
+
+  // Create branch to PrevEntryBlock
+  IRB.CreateBr(PrevEntryBlock);
+
+  // Create wave-group barrier at the start of the previous entry block
+  Type *Int1Ty = IRB.getInt1Ty();
+  IRB.SetInsertPoint(PrevEntryBlock, PrevEntryBlock->begin());
+  // Remember which workitem did the allocation; it will also do the free.
+  auto *XYZCondPhi = IRB.CreatePHI(Int1Ty, 2, "xyzCond");
+  XYZCondPhi->addIncoming(IRB.getInt1(0), WIdBlock);
+  XYZCondPhi->addIncoming(IRB.getInt1(1), MallocBlock);
+
+  IRB.CreateIntrinsic(Intrinsic::amdgcn_s_barrier, {}, {});
+
+  // Load malloc pointer from Sw LDS.
+  Value *LoadMallocPtr =
+      IRB.CreateLoad(IRB.getPtrTy(AMDGPUAS::GLOBAL_ADDRESS), SwLDS);
+
+  // Replace All uses of LDS globals with new LDS pointers.
+  replaceKernelLDSAccesses(Func);
+
+  // Replace Memory Operations on LDS with corresponding
+  // global memory pointers.
+  translateLDSMemoryOperationsToGlobalMemory(Func, LoadMallocPtr,
+                                             LDSInstructions);
+
+  // Redirect every return through CondFree -> Free/End so the buffer is
+  // released exactly once per work group.
+  auto *CondFreeBlock = BasicBlock::Create(Ctx, "CondFree", Func);
+  auto *FreeBlock = BasicBlock::Create(Ctx, "Free", Func);
+  auto *EndBlock = BasicBlock::Create(Ctx, "End", Func);
+  for (BasicBlock &BB : *Func) {
+    if (!BB.empty()) {
+      if (ReturnInst *RI = dyn_cast<ReturnInst>(&BB.back())) {
+        RI->eraseFromParent();
+        IRB.SetInsertPoint(&BB, BB.end());
+        IRB.CreateBr(CondFreeBlock);
+      }
+    }
+  }
+
+  // Cond Free Block
+  IRB.SetInsertPoint(CondFreeBlock, CondFreeBlock->begin());
+  IRB.CreateIntrinsic(Intrinsic::amdgcn_s_barrier, {}, {});
+  IRB.CreateCondBr(XYZCondPhi, FreeBlock, EndBlock);
+
+  // Free Block
+  IRB.SetInsertPoint(FreeBlock, FreeBlock->begin());
+
+  // Free the previously allocated device global memory.
+  FunctionCallee AsanFreeFunc = M.getOrInsertFunction(
+      StringRef("__asan_free_impl"),
+      FunctionType::get(IRB.getVoidTy(), {Int64Ty, Int64Ty}, false));
+  Value *ReturnAddr = IRB.CreateCall(
+      Intrinsic::getDeclaration(&M, Intrinsic::returnaddress), IRB.getInt32(0));
+  Value *RAPToInt = IRB.CreatePtrToInt(ReturnAddr, Int64Ty);
+  Value *MallocPtrToInt = IRB.CreatePtrToInt(LoadMallocPtr, Int64Ty);
+  IRB.CreateCall(AsanFreeFunc, {MallocPtrToInt, RAPToInt});
+
+  IRB.CreateBr(EndBlock);
+
+  // End Block
+  IRB.SetInsertPoint(EndBlock, EndBlock->begin());
+  IRB.CreateRetVoid();
+  // Update the DomTree with corresponding links to basic blocks.
+  // NOTE(review): edges from the rewritten return blocks to CondFreeBlock
+  // are not reported here — presumably the lazy DTU strategy tolerates
+  // this; confirm the dominator tree stays valid for later passes.
+  DTU.applyUpdates({{DominatorTree::Insert, WIdBlock, MallocBlock},
+                    {DominatorTree::Insert, MallocBlock, PrevEntryBlock},
+                    {DominatorTree::Insert, CondFreeBlock, FreeBlock},
+                    {DominatorTree::Insert, FreeBlock, EndBlock}});
+}
+
+// Build one row of the non-kernel offset table for kernel \p Func: for each
+// LDS global in \p Variables, the address of its metadata slot inside the
+// kernel's SW LDS metadata struct, or poison when the kernel never touches
+// that global.
+Constant *AMDGPUSwLowerLDS::getAddressesOfVariablesInKernel(
+    Function *Func, SetVector<GlobalVariable *> &Variables) {
+  Type *Int32Ty = IRB.getInt32Ty();
+  auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
+
+  GlobalVariable *SwLDSMetadata = LDSParams.SwLDSMetadata;
+  assert(SwLDSMetadata);
+  auto *SwLDSMetadataStructType =
+      cast<StructType>(SwLDSMetadata->getValueType());
+  ArrayType *KernelOffsetsType =
+      ArrayType::get(IRB.getPtrTy(AMDGPUAS::GLOBAL_ADDRESS), Variables.size());
+
+  SmallVector<Constant *> Elements;
+  for (GlobalVariable *GV : Variables) {
+    auto It = LDSParams.LDSToReplacementIndicesMap.find(GV);
+    if (It == LDSParams.LDSToReplacementIndicesMap.end()) {
+      // This kernel does not access GV; fill the slot with poison.
+      Elements.push_back(
+          PoisonValue::get(IRB.getPtrTy(AMDGPUAS::GLOBAL_ADDRESS)));
+      continue;
+    }
+    auto &Indices = It->second;
+    Constant *GEPIdx[] = {ConstantInt::get(Int32Ty, Indices[0]),
+                          ConstantInt::get(Int32Ty, Indices[1]),
+                          ConstantInt::get(Int32Ty, Indices[2])};
+    Elements.push_back(ConstantExpr::getGetElementPtr(
+        SwLDSMetadataStructType, SwLDSMetadata, GEPIdx, true));
+  }
+  return ConstantArray::get(KernelOffsetsType, Elements);
+}
+
+// Build the single-row base table indexed by kernel ID. Entry i holds the
+// address of the "SW LDS" global of kernel i, through which non-kernel
+// functions reach that kernel's malloc'ed buffer.
+void AMDGPUSwLowerLDS::buildNonKernelLDSBaseTable(
+    NonKernelLDSParameters &NKLDSParams) {
+  auto &Kernels = NKLDSParams.OrderedKernels;
+  if (Kernels.empty())
+    return;
+  Type *Int32Ty = IRB.getInt32Ty();
+  const size_t NumberKernels = Kernels.size();
+  ArrayType *AllKernelsOffsetsType =
+      ArrayType::get(IRB.getPtrTy(AMDGPUAS::LOCAL_ADDRESS), NumberKernels);
+
+  std::vector<Constant *> Elts;
+  Elts.reserve(NumberKernels);
+  for (Function *Func : Kernels) {
+    auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
+    GlobalVariable *SwLDS = LDSParams.SwLDS;
+    assert(SwLDS);
+    Constant *GEPIdx[] = {ConstantInt::get(Int32Ty, 0)};
+    Elts.push_back(
+        ConstantExpr::getGetElementPtr(SwLDS->getType(), SwLDS, GEPIdx, true));
+  }
+
+  Constant *Init = ConstantArray::get(AllKernelsOffsetsType, Elts);
+  NKLDSParams.LDSBaseTable = new GlobalVariable(
+      M, AllKernelsOffsetsType, true, GlobalValue::InternalLinkage, Init,
+      "llvm.amdgcn.sw.lds.base.table", nullptr, GlobalValue::NotThreadLocal,
+      AMDGPUAS::GLOBAL_ADDRESS);
+  // The table itself must not be ASan-instrumented.
+  GlobalValue::SanitizerMetadata MD;
+  MD.NoAddress = true;
+  NKLDSParams.LDSBaseTable->setSanitizerMetadata(MD);
+}
+
+// Build the offset table: one row per kernel that can reach a non-kernel LDS
+// user, one column per unique LDS global accessed from non-kernels. Entry
+// (k, v) is the address of the metadata slot holding the offset at which
+// variable v lives inside kernel k's SW LDS buffer.
+void AMDGPUSwLowerLDS::buildNonKernelLDSOffsetTable(
+    NonKernelLDSParameters &NKLDSParams) {
+  auto &Variables = NKLDSParams.OrdereLDSGlobals;
+  auto &Kernels = NKLDSParams.OrderedKernels;
+  if (Variables.empty() || Kernels.empty())
+    return;
+  const size_t NumberVariables = Variables.size();
+  const size_t NumberKernels = Kernels.size();
+
+  ArrayType *KernelOffsetsType =
+      ArrayType::get(IRB.getPtrTy(AMDGPUAS::GLOBAL_ADDRESS), NumberVariables);
+  ArrayType *AllKernelsOffsetsType =
+      ArrayType::get(KernelOffsetsType, NumberKernels);
+
+  std::vector<Constant *> Rows;
+  Rows.reserve(NumberKernels);
+  for (Function *Func : Kernels)
+    Rows.push_back(getAddressesOfVariablesInKernel(Func, Variables));
+
+  Constant *Init = ConstantArray::get(AllKernelsOffsetsType, Rows);
+  NKLDSParams.LDSOffsetTable = new GlobalVariable(
+      M, AllKernelsOffsetsType, true, GlobalValue::InternalLinkage, Init,
+      "llvm.amdgcn.sw.lds.offset.table", nullptr, GlobalValue::NotThreadLocal,
+      AMDGPUAS::GLOBAL_ADDRESS);
+  // The table itself must not be ASan-instrumented.
+  GlobalValue::SanitizerMetadata MD;
+  MD.NoAddress = true;
+  NKLDSParams.LDSOffsetTable->setSanitizerMetadata(MD);
+}
+
+// Lower LDS accesses in non-kernel function \p Func. The calling kernel is
+// identified at run time via llvm.amdgcn.lds.kernel.id; its SW LDS base
+// pointer and the per-variable offsets are fetched from the base/offset
+// tables built earlier, and each LDS global use is replaced by
+// base + offset. Finally the memory operations themselves are translated
+// to the kernel's global-memory buffer.
+void AMDGPUSwLowerLDS::lowerNonKernelLDSAccesses(
+    Function *Func, SetVector<GlobalVariable *> &LDSGlobals,
+    NonKernelLDSParameters &NKLDSParams) {
+  LLVM_DEBUG(dbgs() << "Sw LDS lowering, lower non-kernel access for : "
+                    << Func->getName());
+  auto InsertAt = Func->getEntryBlock().getFirstNonPHIOrDbgOrAlloca();
+  IRB.SetInsertPoint(InsertAt);
+
+  // Get LDS memory instructions before any rewriting.
+  SetVector<Instruction *> LDSInstructions;
+  getLDSMemoryInstructions(Func, LDSInstructions);
+
+  Function *Decl =
+      Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_lds_kernel_id, {});
+  auto *KernelId = IRB.CreateCall(Decl, {});
+  GlobalVariable *LDSBaseTable = NKLDSParams.LDSBaseTable;
+  GlobalVariable *LDSOffsetTable = NKLDSParams.LDSOffsetTable;
+  auto &OrdereLDSGlobals = NKLDSParams.OrdereLDSGlobals;
+  // Base table row yields the kernel's SW LDS global, which in turn holds
+  // the malloc'ed global-memory pointer.
+  Value *BaseGEP = IRB.CreateInBoundsGEP(
+      LDSBaseTable->getValueType(), LDSBaseTable, {IRB.getInt32(0), KernelId});
+  Value *BaseLoad =
+      IRB.CreateLoad(IRB.getPtrTy(AMDGPUAS::LOCAL_ADDRESS), BaseGEP);
+  Value *LoadMallocPtr =
+      IRB.CreateLoad(IRB.getPtrTy(AMDGPUAS::GLOBAL_ADDRESS), BaseLoad);
+
+  for (GlobalVariable *GV : LDSGlobals) {
+    // Column index of GV in the offset table (llvm::find is the idiomatic
+    // range-based form of std::find over LLVM containers).
+    auto GVIt = llvm::find(OrdereLDSGlobals, GV);
+    assert(GVIt != OrdereLDSGlobals.end());
+    uint32_t GVOffset = std::distance(OrdereLDSGlobals.begin(), GVIt);
+
+    Value *OffsetGEP = IRB.CreateInBoundsGEP(
+        LDSOffsetTable->getValueType(), LDSOffsetTable,
+        {IRB.getInt32(0), KernelId, IRB.getInt32(GVOffset)});
+    Value *OffsetLoad =
+        IRB.CreateLoad(IRB.getPtrTy(AMDGPUAS::GLOBAL_ADDRESS), OffsetGEP);
+    Value *Offset = IRB.CreateLoad(IRB.getInt32Ty(), OffsetLoad);
+    Value *BasePlusOffset =
+        IRB.CreateInBoundsGEP(IRB.getInt8Ty(), BaseLoad, {Offset});
+    LLVM_DEBUG(dbgs() << "Sw LDS Lowering, Replace non-kernel LDS for "
+                      << GV->getName());
+    replacesUsesOfGlobalInFunction(Func, GV, BasePlusOffset);
+  }
+  translateLDSMemoryOperationsToGlobalMemory(Func, LoadMallocPtr,
+                                             LDSInstructions);
+}
+
+// Impose a deterministic (name-sorted) order on each of the four LDS global
+// sets: {direct, indirect} x {static, dynamic}.
+static void reorderStaticDynamicIndirectLDSSet(KernelLDSParameters &LDSParams) {
+  auto SortSet = [](auto &Set) {
+    Set = sortByName(
+        std::vector<GlobalVariable *>(Set.begin(), Set.end()));
+  };
+  SortSet(LDSParams.DirectAccess.StaticLDSGlobals);
+  SortSet(LDSParams.DirectAccess.DynamicLDSGlobals);
+  SortSet(LDSParams.IndirectAccess.StaticLDSGlobals);
+  SortSet(LDSParams.IndirectAccess.DynamicLDSGlobals);
+}
+
+// Cache the ASan shadow-mapping parameters (scale and offset) for the
+// target triple; used later when instrumenting the translated accesses.
+void AMDGPUSwLowerLDS::initAsanInfo() {
+  unsigned LongSize =
+      M.getDataLayout().getPointerSizeInBits(AMDGPUAS::GLOBAL_ADDRESS);
+  uint64_t Offset;
+  int Scale;
+  bool OrShadowOffset; // Required out-param of the query; not consumed here.
+  llvm::getAddressSanitizerParams(Triple(AMDGPUTM.getTargetTriple()), LongSize,
+                                  false, &Offset, &Scale, &OrShadowOffset);
+  AsanInfo.Scale = Scale;
+  AsanInfo.Offset = Offset;
+}
+
+// Pass driver: collect LDS usage per kernel, lower kernel accesses, then
+// lower non-kernel accesses via the base/offset tables, drop the original
+// LDS globals, and finally ASan-instrument the translated accesses.
+bool AMDGPUSwLowerLDS::run() {
+  bool Changed = false;
+
+  CallGraph CG = CallGraph(M);
+
+  // LDS uses hidden inside constant expressions must become instructions
+  // before they can be rewritten.
+  Changed |= eliminateConstantExprUsesOfLDSFromAllInstructions(M);
+
+  // Get all the direct and indirect access of LDS for all the kernels.
+  LDSUsesInfoTy LDSUsesInfo = getTransitiveUsesOfLDS(CG, M);
+
+  // Utility to group LDS access into direct, indirect, static and dynamic.
+  auto PopulateKernelStaticDynamicLDS = [&](FunctionVariableMap &LDSAccesses,
+                                            bool DirectAccess) {
+    for (auto &K : LDSAccesses) {
+      Function *F = K.first;
+      if (!F || K.second.empty())
+        continue;
+
+      assert(isKernelLDS(F));
+      // Only kernels compiled with address sanitizing are lowered.
+      if (!F->hasFnAttribute(Attribute::SanitizeAddress))
+        continue;
+
+      // Only inserts if key isn't already in the map.
+      FuncLDSAccessInfo.KernelToLDSParametersMap.insert(
+          {F, KernelLDSParameters()});
+
+      auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[F];
+      if (!DirectAccess)
+        FuncLDSAccessInfo.KernelsWithIndirectLDSAccess.insert(F);
+      for (GlobalVariable *GV : K.second) {
+        if (!DirectAccess) {
+          if (AMDGPU::isDynamicLDS(*GV))
+            LDSParams.IndirectAccess.DynamicLDSGlobals.insert(GV);
+          else
+            LDSParams.IndirectAccess.StaticLDSGlobals.insert(GV);
+          FuncLDSAccessInfo.AllNonKernelLDSAccess.insert(GV);
+        } else {
+          if (AMDGPU::isDynamicLDS(*GV))
+            LDSParams.DirectAccess.DynamicLDSGlobals.insert(GV);
+          else
+            LDSParams.DirectAccess.StaticLDSGlobals.insert(GV);
+        }
+      }
+    }
+  };
+
+  PopulateKernelStaticDynamicLDS(LDSUsesInfo.direct_access, true);
+  PopulateKernelStaticDynamicLDS(LDSUsesInfo.indirect_access, false);
+
+  // Get address sanitizer scale.
+  initAsanInfo();
+
+  for (auto &K : FuncLDSAccessInfo.KernelToLDSParametersMap) {
+    Function *Func = K.first;
+    auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
+    if (LDSParams.DirectAccess.StaticLDSGlobals.empty() &&
+        LDSParams.DirectAccess.DynamicLDSGlobals.empty() &&
+        LDSParams.IndirectAccess.StaticLDSGlobals.empty() &&
+        LDSParams.IndirectAccess.DynamicLDSGlobals.empty()) {
+      // NOTE(review): this overwrites Changed accumulated from earlier
+      // iterations (and from the constant-expr elimination above); a plain
+      // `continue` looks intended. Likely unreachable since entries are
+      // only inserted for non-empty access sets — confirm.
+      Changed = false;
+    } else {
+      // The lowering introduces workitem-id intrinsics, so these
+      // "no-workitem-id" attributes are no longer valid anywhere Func
+      // can reach.
+      removeFnAttrFromReachable(CG, Func,
+                                {"amdgpu-no-workitem-id-x",
+                                 "amdgpu-no-workitem-id-y",
+                                 "amdgpu-no-workitem-id-z"});
+      reorderStaticDynamicIndirectLDSSet(LDSParams);
+      buildSwLDSGlobal(Func);
+      buildSwDynLDSGlobal(Func);
+      populateSwMetadataGlobal(Func);
+      populateSwLDSAttributeAndMetadata(Func);
+      populateLDSToReplacementIndicesMap(Func);
+      DomTreeUpdater DTU(DTCallback(*Func),
+                         DomTreeUpdater::UpdateStrategy::Lazy);
+      lowerKernelLDSAccesses(Func, DTU);
+      Changed = true;
+    }
+  }
+
+  // Get the Uses of LDS from non-kernels.
+  getUsesOfLDSByNonKernels();
+
+  // Get non-kernels with LDS ptr as argument and called by kernels.
+  getNonKernelsWithLDSArguments(CG);
+
+  if (!FuncLDSAccessInfo.NonKernelToLDSAccessMap.empty() ||
+      !FuncLDSAccessInfo.NonKernelsWithLDSArgument.empty()) {
+    NonKernelLDSParameters NKLDSParams;
+    NKLDSParams.OrderedKernels = getOrderedIndirectLDSAccessingKernels(
+        FuncLDSAccessInfo.KernelsWithIndirectLDSAccess);
+    NKLDSParams.OrdereLDSGlobals = getOrderedNonKernelAllLDSGlobals(
+        FuncLDSAccessInfo.AllNonKernelLDSAccess);
+    buildNonKernelLDSBaseTable(NKLDSParams);
+    buildNonKernelLDSOffsetTable(NKLDSParams);
+    for (auto &K : FuncLDSAccessInfo.NonKernelToLDSAccessMap) {
+      Function *Func = K.first;
+      DenseSet<GlobalVariable *> &LDSGlobals = K.second;
+      // Sort for deterministic output.
+      SetVector<GlobalVariable *> OrderedLDSGlobals = sortByName(
+          std::vector<GlobalVariable *>(LDSGlobals.begin(), LDSGlobals.end()));
+      lowerNonKernelLDSAccesses(Func, OrderedLDSGlobals, NKLDSParams);
+    }
+    for (Function *Func : FuncLDSAccessInfo.NonKernelsWithLDSArgument) {
+      auto &K = FuncLDSAccessInfo.NonKernelToLDSAccessMap;
+      // Skip functions already handled in the loop above.
+      if (K.find(Func) != K.end())
+        continue;
+      SetVector<llvm::GlobalVariable *> Vec;
+      lowerNonKernelLDSAccesses(Func, Vec, NKLDSParams);
+    }
+    Changed = true;
+  }
+
+  if (!Changed)
+    return Changed;
+
+  // Erase the now-unused original LDS globals.
+  for (auto &GV : make_early_inc_range(M.globals())) {
+    if (AMDGPU::isLDSVariableToLower(GV)) {
+      // probably want to remove from used lists
+      GV.removeDeadConstantUsers();
+      if (GV.use_empty())
+        GV.eraseFromParent();
+    }
+  }
+
+  // Instrument the global-memory accesses that replaced LDS operations.
+  if (AsanInstrumentLDS) {
+    SmallVector<InterestingMemoryOperand, 16> OperandsToInstrument;
+    for (Instruction *Inst : AsanInfo.Instructions) {
+      SmallVector<InterestingMemoryOperand, 1> InterestingOperands;
+      getInterestingMemoryOperands(M, Inst, InterestingOperands);
+      for (auto &Operand : InterestingOperands) {
+        OperandsToInstrument.push_back(Operand);
+      }
+    }
+    for (auto &Operand : OperandsToInstrument) {
+      Value *Addr = Operand.getPtr();
+      instrumentAddress(M, IRB, Operand.getInsn(), Operand.getInsn(), Addr,
+                        Operand.Alignment.valueOrOne(), Operand.TypeStoreSize,
+                        Operand.IsWrite, nullptr, false, false, AsanInfo.Scale,
+                        AsanInfo.Offset);
+      Changed = true;
+    }
+  }
+
+  return Changed;
+}
+
+// Legacy pass-manager wrapper around the AMDGPUSwLowerLDS implementation.
+class AMDGPUSwLowerLDSLegacy : public ModulePass {
+public:
+  // May be supplied at construction; otherwise looked up from
+  // TargetPassConfig inside runOnModule.
+  const AMDGPUTargetMachine *AMDGPUTM;
+  static char ID;
+  AMDGPUSwLowerLDSLegacy(const AMDGPUTargetMachine *TM)
+      : ModulePass(ID), AMDGPUTM(TM) {
+    initializeAMDGPUSwLowerLDSLegacyPass(*PassRegistry::getPassRegistry());
+  }
+  bool runOnModule(Module &M) override;
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addPreserved<DominatorTreeWrapperPass>();
+  }
+};
+} // namespace
+
+char AMDGPUSwLowerLDSLegacy::ID = 0;
+// Exported pass ID so other components can reference this pass.
+char &llvm::AMDGPUSwLowerLDSLegacyPassID = AMDGPUSwLowerLDSLegacy::ID;
+
+// Register the legacy pass; it depends on TargetPassConfig to recover the
+// target machine when one is not passed to the constructor.
+INITIALIZE_PASS_BEGIN(AMDGPUSwLowerLDSLegacy, "amdgpu-sw-lower-lds",
+                      "AMDGPU Software lowering of LDS", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
+INITIALIZE_PASS_END(AMDGPUSwLowerLDSLegacy, "amdgpu-sw-lower-lds",
+                    "AMDGPU Software lowering of LDS", false, false)
+
+// Legacy pass-manager entry point.
+bool AMDGPUSwLowerLDSLegacy::runOnModule(Module &M) {
+  // The AddressSanitizer pass adds the "nosanitize_address" module flag when
+  // it has instrumented the IR; bail out when the flag is absent.
+  if (!M.getModuleFlag("nosanitize_address"))
+    return false;
+  DominatorTreeWrapperPass *const DTW =
+      getAnalysisIfAvailable<DominatorTreeWrapperPass>();
+  // Hand out cached dominator trees when the analysis is available.
+  auto DTCallback = [&DTW](Function &F) -> DominatorTree * {
+    return DTW ? &DTW->getDomTree() : nullptr;
+  };
+  // Fall back to TargetPassConfig when no TM was given at construction.
+  if (!AMDGPUTM) {
+    auto &TPC = getAnalysis<TargetPassConfig>();
+    AMDGPUTM = &TPC.getTM<AMDGPUTargetMachine>();
+  }
+  AMDGPUSwLowerLDS SwLowerLDSImpl(M, *AMDGPUTM, DTCallback);
+  return SwLowerLDSImpl.run();
+}
+
+// Factory for the legacy pass. \p TM may be null, in which case the pass
+// obtains the target machine from TargetPassConfig at run time.
+ModulePass *
+llvm::createAMDGPUSwLowerLDSLegacyPass(const AMDGPUTargetMachine *TM) {
+  return new AMDGPUSwLowerLDSLegacy(TM);
+}
+
+// New pass-manager entry point.
+PreservedAnalyses AMDGPUSwLowerLDSPass::run(Module &M,
+                                            ModuleAnalysisManager &AM) {
+  // The AddressSanitizer pass adds the "nosanitize_address" module flag when
+  // it has instrumented the IR; nothing to do when the flag is absent.
+  if (!M.getModuleFlag("nosanitize_address"))
+    return PreservedAnalyses::all();
+  auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+  // Serve per-function dominator trees from the function analysis manager.
+  auto DTCallback = [&FAM](Function &F) -> DominatorTree * {
+    return &FAM.getResult<DominatorTreeAnalysis>(F);
+  };
+  AMDGPUSwLowerLDS SwLowerLDSImpl(M, TM, DTCallback);
+  if (!SwLowerLDSImpl.run())
+    return PreservedAnalyses::all();
+
+  // The lowering keeps dominator trees up to date via DomTreeUpdater.
+  PreservedAnalyses PA;
+  PA.preserve<DominatorTreeAnalysis>();
+  return PA;
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 7ac7b3315bb972..49beb03488e7c8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -412,6 +412,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeSILoadStoreOptimizerPass(*PR);
initializeAMDGPUCtorDtorLoweringLegacyPass(*PR);
initializeAMDGPUAlwaysInlinePass(*PR);
+ initializeAMDGPUSwLowerLDSLegacyPass(*PR);
initializeAMDGPUAttributorLegacyPass(*PR);
initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
initializeAMDGPUAnnotateUniformValuesLegacyPass(*PR);
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 9ed7981b3da5ae..e7aa97bb1e5282 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -74,6 +74,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPULowerKernelArguments.cpp
AMDGPULowerKernelAttributes.cpp
AMDGPULowerModuleLDSPass.cpp
+ AMDGPUSwLowerLDS.cpp
AMDGPUMachineFunction.cpp
AMDGPUMachineModuleInfo.cpp
AMDGPUMacroFusion.cpp
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-dynamic-indirect-access-asan.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-dynamic-indirect-access-asan.ll
new file mode 100644
index 00000000000000..2776b9187724c5
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-dynamic-indirect-access-asan.ll
@@ -0,0 +1,260 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 4
+; RUN: opt < %s -passes=amdgpu-sw-lower-lds -S -mtriple=amdgcn-amd-amdhsa | FileCheck %s
+
+; Test to check indirect dynamic LDS access through a non-kernel from kernel is lowered correctly.
+@lds_1 = internal addrspace(3) global [1 x i8] poison, align 1
+@lds_2 = internal addrspace(3) global [1 x i32] poison, align 2
+@lds_3 = external addrspace(3) global [0 x i8], align 4
+@lds_4 = external addrspace(3) global [0 x i8], align 8
+
+;.
+; CHECK: @llvm.amdgcn.sw.lds.k0 = internal addrspace(3) global ptr poison, no_sanitize_address, align 8, !absolute_symbol [[META0:![0-9]+]]
+; CHECK: @llvm.amdgcn.k0.dynlds = external addrspace(3) global [0 x i8], no_sanitize_address, align 8, !absolute_symbol [[META1:![0-9]+]]
+; CHECK: @llvm.amdgcn.sw.lds.k0.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.k0.md.type { %llvm.amdgcn.sw.lds.k0.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 32, i32 1, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 64, i32 4, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 96, i32 0, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 128, i32 0, i32 32 } }, no_sanitize_address
+; @llvm.amdgcn.sw.lds.base.table = internal addrspace(1) constant [1 x ptr addrspace(3)] [ptr addrspace(3) @llvm.amdgcn.sw.lds.k0], no_sanitize_address
+; @llvm.amdgcn.sw.lds.offset.table = internal addrspace(1) constant [1 x [2 x ptr addrspace(1)]] [[2 x ptr addrspace(1)] [ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.k0.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 3, i32 0), ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.k0.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 4, i32 0)]], no_sanitize_address
+;.
+define void @use_variables() sanitize_address {
+; CHECK-LABEL: define void @use_variables(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id()
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x ptr addrspace(3)], ptr addrspace(1) @llvm.amdgcn.sw.lds.base.table, i32 0, i32 [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(3), ptr addrspace(1) [[TMP2]], align 4
+; CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(3) [[TMP4]], align 8
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x [2 x ptr addrspace(1)]], ptr addrspace(1) @llvm.amdgcn.sw.lds.offset.table, i32 0, i32 [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(1) [[TMP6]], align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(1) [[TMP5]], align 4
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP4]], i32 [[TMP8]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1 x [2 x ptr addrspace(1)]], ptr addrspace(1) @llvm.amdgcn.sw.lds.offset.table, i32 0, i32 [[TMP1]], i32 1
+; CHECK-NEXT: [[TMP12:%.*]] = load ptr addrspace(1), ptr addrspace(1) [[TMP11]], align 8
+; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(1) [[TMP12]], align 4
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP4]], i32 [[TMP10]]
+; CHECK-NEXT: [[TMP13:%.*]] = ptrtoint ptr addrspace(3) [[TMP9]] to i32
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP7]], i32 [[TMP13]]
+; CHECK-NEXT: [[TMP47:%.*]] = ptrtoint ptr addrspace(1) [[TMP14]] to i64
+; CHECK-NEXT: [[TMP16:%.*]] = lshr i64 [[TMP47]], 3
+; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[TMP16]], 2147450880
+; CHECK-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
+; CHECK-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1
+; CHECK-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP19]], 0
+; CHECK-NEXT: [[TMP21:%.*]] = and i64 [[TMP47]], 7
+; CHECK-NEXT: [[TMP22:%.*]] = trunc i64 [[TMP21]] to i8
+; CHECK-NEXT: [[TMP23:%.*]] = icmp sge i8 [[TMP22]], [[TMP19]]
+; CHECK-NEXT: [[TMP24:%.*]] = and i1 [[TMP20]], [[TMP23]]
+; CHECK-NEXT: [[TMP25:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[TMP24]])
+; CHECK-NEXT: [[TMP26:%.*]] = icmp ne i64 [[TMP25]], 0
+; CHECK-NEXT: br i1 [[TMP26]], label [[ASAN_REPORT:%.*]], label [[TMP29:%.*]], !prof [[PROF3:![0-9]+]]
+; CHECK: asan.report:
+; CHECK-NEXT: br i1 [[TMP24]], label [[TMP27:%.*]], label [[TMP28:%.*]]
+; CHECK: 27:
+; CHECK-NEXT: call void @__asan_report_store1(i64 [[TMP47]]) #[[ATTR7:[0-9]+]]
+; CHECK-NEXT: call void @llvm.amdgcn.unreachable()
+; CHECK-NEXT: br label [[TMP28]]
+; CHECK: 28:
+; CHECK-NEXT: br label [[TMP29]]
+; CHECK: 29:
+; CHECK-NEXT: store i8 3, ptr addrspace(1) [[TMP14]], align 4
+; CHECK-NEXT: [[TMP30:%.*]] = ptrtoint ptr addrspace(3) [[TMP15]] to i32
+; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP7]], i32 [[TMP30]]
+; CHECK-NEXT: [[TMP32:%.*]] = ptrtoint ptr addrspace(1) [[TMP31]] to i64
+; CHECK-NEXT: [[TMP33:%.*]] = lshr i64 [[TMP32]], 3
+; CHECK-NEXT: [[TMP34:%.*]] = add i64 [[TMP33]], 2147450880
+; CHECK-NEXT: [[TMP35:%.*]] = inttoptr i64 [[TMP34]] to ptr
+; CHECK-NEXT: [[TMP36:%.*]] = load i8, ptr [[TMP35]], align 1
+; CHECK-NEXT: [[TMP37:%.*]] = icmp ne i8 [[TMP36]], 0
+; CHECK-NEXT: [[TMP38:%.*]] = and i64 [[TMP32]], 7
+; CHECK-NEXT: [[TMP39:%.*]] = trunc i64 [[TMP38]] to i8
+; CHECK-NEXT: [[TMP40:%.*]] = icmp sge i8 [[TMP39]], [[TMP36]]
+; CHECK-NEXT: [[TMP41:%.*]] = and i1 [[TMP37]], [[TMP40]]
+; CHECK-NEXT: [[TMP42:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[TMP41]])
+; CHECK-NEXT: [[TMP43:%.*]] = icmp ne i64 [[TMP42]], 0
+; CHECK-NEXT: br i1 [[TMP43]], label [[ASAN_REPORT1:%.*]], label [[TMP46:%.*]], !prof [[PROF3]]
+; CHECK: asan.report1:
+; CHECK-NEXT: br i1 [[TMP41]], label [[TMP44:%.*]], label [[TMP45:%.*]]
+; CHECK: 44:
+; CHECK-NEXT: call void @__asan_report_store1(i64 [[TMP32]]) #[[ATTR7]]
+; CHECK-NEXT: call void @llvm.amdgcn.unreachable()
+; CHECK-NEXT: br label [[TMP45]]
+; CHECK: 45:
+; CHECK-NEXT: br label [[TMP46]]
+; CHECK: 46:
+; CHECK-NEXT: store i8 3, ptr addrspace(1) [[TMP31]], align 8
+; CHECK-NEXT: ret void
+;
+ store i8 3, ptr addrspace(3) @lds_3, align 4
+ store i8 3, ptr addrspace(3) @lds_4, align 8
+ ret void
+}
+
+define amdgpu_kernel void @k0() sanitize_address {
+; CHECK-LABEL: define amdgpu_kernel void @k0(
+; CHECK-SAME: ) #[[ATTR1:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META4:![0-9]+]] {
+; CHECK-NEXT: WId:
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z()
+; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
+; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP21:%.*]]
+; CHECK: Malloc:
+; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 2, i32 0), align 4
+; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 2, i32 2), align 4
+; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP9]], [[TMP7]]
+; CHECK-NEXT: [[TMP6:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds ptr addrspace(4), ptr addrspace(4) [[TMP6]], i64 15
+; CHECK-NEXT: store i32 [[TMP8]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 3, i32 0), align 4
+; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr addrspace(4) [[TMP10]], align 4
+; CHECK-NEXT: store i32 [[TMP11]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 3, i32 1), align 4
+; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP11]], 7
+; CHECK-NEXT: [[TMP13:%.*]] = udiv i32 [[TMP12]], 8
+; CHECK-NEXT: [[TMP14:%.*]] = mul i32 [[TMP13]], 8
+; CHECK-NEXT: store i32 [[TMP14]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 3, i32 2), align 4
+; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP8]], [[TMP14]]
+; CHECK-NEXT: store i32 [[TMP15]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 4, i32 0), align 4
+; CHECK-NEXT: [[TMP27:%.*]] = load i32, ptr addrspace(4) [[TMP10]], align 4
+; CHECK-NEXT: store i32 [[TMP27]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 4, i32 1), align 4
+; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP27]], 7
+; CHECK-NEXT: [[TMP18:%.*]] = udiv i32 [[TMP17]], 8
+; CHECK-NEXT: [[TMP19:%.*]] = mul i32 [[TMP18]], 8
+; CHECK-NEXT: store i32 [[TMP19]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 4, i32 2), align 4
+; CHECK-NEXT: [[TMP28:%.*]] = add i32 [[TMP15]], [[TMP19]]
+; CHECK-NEXT: [[TMP26:%.*]] = zext i32 [[TMP28]] to i64
+; CHECK-NEXT: [[TMP22:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT: [[TMP23:%.*]] = ptrtoint ptr [[TMP22]] to i64
+; CHECK-NEXT: [[TMP35:%.*]] = call i64 @__asan_malloc_impl(i64 [[TMP26]], i64 [[TMP23]])
+; CHECK-NEXT: [[TMP20:%.*]] = inttoptr i64 [[TMP35]] to ptr addrspace(1)
+; CHECK-NEXT: store ptr addrspace(1) [[TMP20]], ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, align 8
+; CHECK-NEXT: [[TMP36:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP20]], i64 8
+; CHECK-NEXT: [[TMP37:%.*]] = ptrtoint ptr addrspace(1) [[TMP36]] to i64
+; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP37]], i64 24)
+; CHECK-NEXT: [[TMP53:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP20]], i64 33
+; CHECK-NEXT: [[TMP73:%.*]] = ptrtoint ptr addrspace(1) [[TMP53]] to i64
+; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP73]], i64 31)
+; CHECK-NEXT: [[TMP74:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP20]], i64 68
+; CHECK-NEXT: [[TMP75:%.*]] = ptrtoint ptr addrspace(1) [[TMP74]] to i64
+; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP75]], i64 28)
+; CHECK-NEXT: br label [[TMP21]]
+; CHECK: 32:
+; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ]
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: [[TMP31:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, align 8
+; CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 1, i32 0), align 4
+; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i8, ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, i32 [[TMP24]]
+; CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 2, i32 0), align 4
+; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, i32 [[TMP29]]
+; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.k0.dynlds) ]
+; CHECK-NEXT: call void @use_variables()
+; CHECK-NEXT: [[TMP38:%.*]] = ptrtoint ptr addrspace(3) [[TMP25]] to i32
+; CHECK-NEXT: [[TMP39:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP31]], i32 [[TMP38]]
+; CHECK-NEXT: [[TMP40:%.*]] = ptrtoint ptr addrspace(1) [[TMP39]] to i64
+; CHECK-NEXT: [[TMP41:%.*]] = lshr i64 [[TMP40]], 3
+; CHECK-NEXT: [[TMP42:%.*]] = add i64 [[TMP41]], 2147450880
+; CHECK-NEXT: [[TMP43:%.*]] = inttoptr i64 [[TMP42]] to ptr
+; CHECK-NEXT: [[TMP44:%.*]] = load i8, ptr [[TMP43]], align 1
+; CHECK-NEXT: [[TMP45:%.*]] = icmp ne i8 [[TMP44]], 0
+; CHECK-NEXT: [[TMP46:%.*]] = and i64 [[TMP40]], 7
+; CHECK-NEXT: [[TMP47:%.*]] = trunc i64 [[TMP46]] to i8
+; CHECK-NEXT: [[TMP48:%.*]] = icmp sge i8 [[TMP47]], [[TMP44]]
+; CHECK-NEXT: [[TMP49:%.*]] = and i1 [[TMP45]], [[TMP48]]
+; CHECK-NEXT: [[TMP50:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[TMP49]])
+; CHECK-NEXT: [[TMP51:%.*]] = icmp ne i64 [[TMP50]], 0
+; CHECK-NEXT: br i1 [[TMP51]], label [[ASAN_REPORT:%.*]], label [[TMP54:%.*]], !prof [[PROF3]]
+; CHECK: asan.report:
+; CHECK-NEXT: br i1 [[TMP49]], label [[TMP52:%.*]], label [[CONDFREE:%.*]]
+; CHECK: 52:
+; CHECK-NEXT: call void @__asan_report_store1(i64 [[TMP40]]) #[[ATTR7]]
+; CHECK-NEXT: call void @llvm.amdgcn.unreachable()
+; CHECK-NEXT: br label [[CONDFREE]]
+; CHECK: 53:
+; CHECK-NEXT: br label [[TMP54]]
+; CHECK: 54:
+; CHECK-NEXT: store i8 7, ptr addrspace(1) [[TMP39]], align 1
+; CHECK-NEXT: [[TMP55:%.*]] = ptrtoint ptr addrspace(3) [[TMP30]] to i32
+; CHECK-NEXT: [[TMP56:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP31]], i32 [[TMP55]]
+; CHECK-NEXT: [[TMP57:%.*]] = ptrtoint ptr addrspace(1) [[TMP56]] to i64
+; CHECK-NEXT: [[TMP63:%.*]] = add i64 [[TMP57]], 3
+; CHECK-NEXT: [[TMP90:%.*]] = inttoptr i64 [[TMP63]] to ptr addrspace(1)
+; CHECK-NEXT: [[TMP91:%.*]] = ptrtoint ptr addrspace(1) [[TMP56]] to i64
+; CHECK-NEXT: [[TMP58:%.*]] = lshr i64 [[TMP91]], 3
+; CHECK-NEXT: [[TMP59:%.*]] = add i64 [[TMP58]], 2147450880
+; CHECK-NEXT: [[TMP60:%.*]] = inttoptr i64 [[TMP59]] to ptr
+; CHECK-NEXT: [[TMP61:%.*]] = load i8, ptr [[TMP60]], align 1
+; CHECK-NEXT: [[TMP62:%.*]] = icmp ne i8 [[TMP61]], 0
+; CHECK-NEXT: [[TMP64:%.*]] = and i64 [[TMP91]], 7
+; CHECK-NEXT: [[TMP65:%.*]] = trunc i64 [[TMP64]] to i8
+; CHECK-NEXT: [[TMP66:%.*]] = icmp sge i8 [[TMP65]], [[TMP61]]
+; CHECK-NEXT: [[TMP67:%.*]] = and i1 [[TMP62]], [[TMP66]]
+; CHECK-NEXT: [[TMP68:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[TMP67]])
+; CHECK-NEXT: [[TMP69:%.*]] = icmp ne i64 [[TMP68]], 0
+; CHECK-NEXT: br i1 [[TMP69]], label [[ASAN_REPORT1:%.*]], label [[TMP72:%.*]], !prof [[PROF3]]
+; CHECK: asan.report1:
+; CHECK-NEXT: br i1 [[TMP67]], label [[TMP70:%.*]], label [[TMP71:%.*]]
+; CHECK: 72:
+; CHECK-NEXT: call void @__asan_report_store1(i64 [[TMP91]]) #[[ATTR7]]
+; CHECK-NEXT: call void @llvm.amdgcn.unreachable()
+; CHECK-NEXT: br label [[TMP71]]
+; CHECK: 73:
+; CHECK-NEXT: br label [[TMP72]]
+; CHECK: 74:
+; CHECK-NEXT: [[TMP92:%.*]] = ptrtoint ptr addrspace(1) [[TMP90]] to i64
+; CHECK-NEXT: [[TMP76:%.*]] = lshr i64 [[TMP92]], 3
+; CHECK-NEXT: [[TMP77:%.*]] = add i64 [[TMP76]], 2147450880
+; CHECK-NEXT: [[TMP78:%.*]] = inttoptr i64 [[TMP77]] to ptr
+; CHECK-NEXT: [[TMP79:%.*]] = load i8, ptr [[TMP78]], align 1
+; CHECK-NEXT: [[TMP80:%.*]] = icmp ne i8 [[TMP79]], 0
+; CHECK-NEXT: [[TMP81:%.*]] = and i64 [[TMP92]], 7
+; CHECK-NEXT: [[TMP82:%.*]] = trunc i64 [[TMP81]] to i8
+; CHECK-NEXT: [[TMP83:%.*]] = icmp sge i8 [[TMP82]], [[TMP79]]
+; CHECK-NEXT: [[TMP84:%.*]] = and i1 [[TMP80]], [[TMP83]]
+; CHECK-NEXT: [[TMP85:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[TMP84]])
+; CHECK-NEXT: [[TMP86:%.*]] = icmp ne i64 [[TMP85]], 0
+; CHECK-NEXT: br i1 [[TMP86]], label [[ASAN_REPORT2:%.*]], label [[TMP89:%.*]], !prof [[PROF3]]
+; CHECK: asan.report2:
+; CHECK-NEXT: br i1 [[TMP84]], label [[TMP87:%.*]], label [[TMP88:%.*]]
+; CHECK: 87:
+; CHECK-NEXT: call void @__asan_report_store1(i64 [[TMP92]]) #[[ATTR7]]
+; CHECK-NEXT: call void @llvm.amdgcn.unreachable()
+; CHECK-NEXT: br label [[TMP88]]
+; CHECK: 88:
+; CHECK-NEXT: br label [[TMP89]]
+; CHECK: 89:
+; CHECK-NEXT: store i32 8, ptr addrspace(1) [[TMP56]], align 2
+; CHECK-NEXT: br label [[CONDFREE1:%.*]]
+; CHECK: CondFree:
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: br i1 [[XYZCOND]], label [[FREE:%.*]], label [[END:%.*]]
+; CHECK: Free:
+; CHECK-NEXT: [[TMP32:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT: [[TMP33:%.*]] = ptrtoint ptr [[TMP32]] to i64
+; CHECK-NEXT: [[TMP34:%.*]] = ptrtoint ptr addrspace(1) [[TMP31]] to i64
+; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP34]], i64 [[TMP33]])
+; CHECK-NEXT: br label [[END]]
+; CHECK: End:
+; CHECK-NEXT: ret void
+;
+ call void @use_variables()
+ store i8 7, ptr addrspace(3) @lds_1, align 1
+ store i32 8, ptr addrspace(3) @lds_2, align 2
+ ret void
+}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 4, !"nosanitize_address", i32 1}
+
+;.
+; CHECK: attributes #[[ATTR0]] = { sanitize_address }
+; CHECK: attributes #[[ATTR1]] = { sanitize_address "amdgpu-lds-size"="8,8" }
+; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) }
+; CHECK: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+; CHECK: attributes #[[ATTR4:[0-9]+]] = { convergent nocallback nofree nounwind willreturn }
+; CHECK: attributes #[[ATTR5:[0-9]+]] = { convergent nocallback nofree nounwind willreturn memory(none) }
+; CHECK: attributes #[[ATTR6:[0-9]+]] = { convergent nocallback nofree nounwind }
+; CHECK: attributes #[[ATTR7]] = { nomerge }
+;.
+; CHECK: [[META0]] = !{i32 0, i32 1}
+; CHECK: [[META1]] = !{i32 8, i32 9}
+; CHECK: [[META2:![0-9]+]] = !{i32 4, !"nosanitize_address", i32 1}
+; CHECK: [[PROF3]] = !{!"branch_weights", i32 1, i32 1048575}
+; CHECK: [[META4]] = !{i32 0}
+;.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-dynamic-indirect-access.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-dynamic-indirect-access.ll
new file mode 100644
index 00000000000000..8cbeb80d623351
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-dynamic-indirect-access.ll
@@ -0,0 +1,142 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 4
+; RUN: opt < %s -passes=amdgpu-sw-lower-lds -amdgpu-asan-instrument-lds=false -S -mtriple=amdgcn-amd-amdhsa | FileCheck %s
+
+; Test to check indirect dynamic LDS access through a non-kernel from kernel is lowered correctly.
+@lds_1 = internal addrspace(3) global [1 x i8] poison, align 1
+@lds_2 = internal addrspace(3) global [1 x i32] poison, align 2
+@lds_3 = external addrspace(3) global [0 x i8], align 4
+@lds_4 = external addrspace(3) global [0 x i8], align 8
+
+;.
+; CHECK: @llvm.amdgcn.sw.lds.k0 = internal addrspace(3) global ptr poison, no_sanitize_address, align 8, !absolute_symbol [[META0:![0-9]+]]
+; CHECK: @llvm.amdgcn.k0.dynlds = external addrspace(3) global [0 x i8], no_sanitize_address, align 8, !absolute_symbol [[META1:![0-9]+]]
+; CHECK: @llvm.amdgcn.sw.lds.k0.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.k0.md.type { %llvm.amdgcn.sw.lds.k0.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 32, i32 1, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 64, i32 4, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 96, i32 0, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 128, i32 0, i32 32 } }, no_sanitize_address
+; @llvm.amdgcn.sw.lds.base.table = internal addrspace(1) constant [1 x ptr addrspace(3)] [ptr addrspace(3) @llvm.amdgcn.sw.lds.k0], no_sanitize_address
+; @llvm.amdgcn.sw.lds.offset.table = internal addrspace(1) constant [1 x [2 x ptr addrspace(1)]] [[2 x ptr addrspace(1)] [ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.k0.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 3, i32 0), ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.k0.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 4, i32 0)]], no_sanitize_address
+;.
+define void @use_variables() sanitize_address {
+; CHECK-LABEL: define void @use_variables(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id()
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x ptr addrspace(3)], ptr addrspace(1) @llvm.amdgcn.sw.lds.base.table, i32 0, i32 [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(3), ptr addrspace(1) [[TMP2]], align 4
+; CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(3) [[TMP4]], align 8
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x [2 x ptr addrspace(1)]], ptr addrspace(1) @llvm.amdgcn.sw.lds.offset.table, i32 0, i32 [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(1) [[TMP6]], align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(1) [[TMP5]], align 4
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP4]], i32 [[TMP8]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1 x [2 x ptr addrspace(1)]], ptr addrspace(1) @llvm.amdgcn.sw.lds.offset.table, i32 0, i32 [[TMP1]], i32 1
+; CHECK-NEXT: [[TMP12:%.*]] = load ptr addrspace(1), ptr addrspace(1) [[TMP11]], align 8
+; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(1) [[TMP12]], align 4
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP4]], i32 [[TMP10]]
+; CHECK-NEXT: [[TMP13:%.*]] = ptrtoint ptr addrspace(3) [[TMP9]] to i32
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP7]], i32 [[TMP13]]
+; CHECK-NEXT: store i8 3, ptr addrspace(1) [[TMP14]], align 4
+; CHECK-NEXT: [[TMP30:%.*]] = ptrtoint ptr addrspace(3) [[TMP15]] to i32
+; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP7]], i32 [[TMP30]]
+; CHECK-NEXT: store i8 3, ptr addrspace(1) [[TMP31]], align 8
+; CHECK-NEXT: ret void
+;
+ store i8 3, ptr addrspace(3) @lds_3, align 4
+ store i8 3, ptr addrspace(3) @lds_4, align 8
+ ret void
+}
+
+define amdgpu_kernel void @k0() sanitize_address {
+; CHECK-LABEL: define amdgpu_kernel void @k0(
+; CHECK-SAME: ) #[[ATTR1:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META2:![0-9]+]] {
+; CHECK-NEXT: WId:
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z()
+; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
+; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP21:%.*]]
+; CHECK: Malloc:
+; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 2, i32 0), align 4
+; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 2, i32 2), align 4
+; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP9]], [[TMP7]]
+; CHECK-NEXT: [[TMP6:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds ptr addrspace(4), ptr addrspace(4) [[TMP6]], i64 15
+; CHECK-NEXT: store i32 [[TMP8]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 3, i32 0), align 4
+; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr addrspace(4) [[TMP10]], align 4
+; CHECK-NEXT: store i32 [[TMP11]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 3, i32 1), align 4
+; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP11]], 7
+; CHECK-NEXT: [[TMP13:%.*]] = udiv i32 [[TMP12]], 8
+; CHECK-NEXT: [[TMP14:%.*]] = mul i32 [[TMP13]], 8
+; CHECK-NEXT: store i32 [[TMP14]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 3, i32 2), align 4
+; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP8]], [[TMP14]]
+; CHECK-NEXT: store i32 [[TMP15]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 4, i32 0), align 4
+; CHECK-NEXT: [[TMP27:%.*]] = load i32, ptr addrspace(4) [[TMP10]], align 4
+; CHECK-NEXT: store i32 [[TMP27]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 4, i32 1), align 4
+; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP27]], 7
+; CHECK-NEXT: [[TMP18:%.*]] = udiv i32 [[TMP17]], 8
+; CHECK-NEXT: [[TMP19:%.*]] = mul i32 [[TMP18]], 8
+; CHECK-NEXT: store i32 [[TMP19]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 4, i32 2), align 4
+; CHECK-NEXT: [[TMP28:%.*]] = add i32 [[TMP15]], [[TMP19]]
+; CHECK-NEXT: [[TMP26:%.*]] = zext i32 [[TMP28]] to i64
+; CHECK-NEXT: [[TMP22:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT: [[TMP23:%.*]] = ptrtoint ptr [[TMP22]] to i64
+; CHECK-NEXT: [[TMP35:%.*]] = call i64 @__asan_malloc_impl(i64 [[TMP26]], i64 [[TMP23]])
+; CHECK-NEXT: [[TMP20:%.*]] = inttoptr i64 [[TMP35]] to ptr addrspace(1)
+; CHECK-NEXT: store ptr addrspace(1) [[TMP20]], ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, align 8
+; CHECK-NEXT: [[TMP36:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP20]], i64 8
+; CHECK-NEXT: [[TMP37:%.*]] = ptrtoint ptr addrspace(1) [[TMP36]] to i64
+; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP37]], i64 24)
+; CHECK-NEXT: [[TMP53:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP20]], i64 33
+; CHECK-NEXT: [[TMP73:%.*]] = ptrtoint ptr addrspace(1) [[TMP53]] to i64
+; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP73]], i64 31)
+; CHECK-NEXT: [[TMP74:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP20]], i64 68
+; CHECK-NEXT: [[TMP75:%.*]] = ptrtoint ptr addrspace(1) [[TMP74]] to i64
+; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP75]], i64 28)
+; CHECK-NEXT: br label [[TMP21]]
+; CHECK: 32:
+; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ]
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: [[TMP31:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, align 8
+; CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 1, i32 0), align 4
+; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i8, ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, i32 [[TMP24]]
+; CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 2, i32 0), align 4
+; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, i32 [[TMP29]]
+; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.k0.dynlds) ]
+; CHECK-NEXT: call void @use_variables()
+; CHECK-NEXT: [[TMP38:%.*]] = ptrtoint ptr addrspace(3) [[TMP25]] to i32
+; CHECK-NEXT: [[TMP39:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP31]], i32 [[TMP38]]
+; CHECK-NEXT: store i8 7, ptr addrspace(1) [[TMP39]], align 1
+; CHECK-NEXT: [[TMP55:%.*]] = ptrtoint ptr addrspace(3) [[TMP30]] to i32
+; CHECK-NEXT: [[TMP56:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP31]], i32 [[TMP55]]
+; CHECK-NEXT: store i32 8, ptr addrspace(1) [[TMP56]], align 2
+; CHECK-NEXT: br label [[CONDFREE1:%.*]]
+; CHECK: CondFree:
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: br i1 [[XYZCOND]], label [[FREE:%.*]], label [[END:%.*]]
+; CHECK: Free:
+; CHECK-NEXT: [[TMP32:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT: [[TMP33:%.*]] = ptrtoint ptr [[TMP32]] to i64
+; CHECK-NEXT: [[TMP34:%.*]] = ptrtoint ptr addrspace(1) [[TMP31]] to i64
+; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP34]], i64 [[TMP33]])
+; CHECK-NEXT: br label [[END]]
+; CHECK: End:
+; CHECK-NEXT: ret void
+;
+ call void @use_variables()
+ store i8 7, ptr addrspace(3) @lds_1, align 1
+ store i32 8, ptr addrspace(3) @lds_2, align 2
+ ret void
+}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 4, !"nosanitize_address", i32 1}
+
+;.
+; CHECK: attributes #[[ATTR0]] = { sanitize_address }
+; CHECK: attributes #[[ATTR1]] = { sanitize_address "amdgpu-lds-size"="8,8" }
+; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) }
+; CHECK: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+; CHECK: attributes #[[ATTR4:[0-9]+]] = { convergent nocallback nofree nounwind willreturn }
+;.
+; CHECK: [[META0]] = !{i32 0, i32 1}
+; CHECK: [[META1]] = !{i32 8, i32 9}
+; CHECK: [[META2]] = !{i32 0}
+;.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-dynamic-lds-test-asan.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-dynamic-lds-test-asan.ll
new file mode 100644
index 00000000000000..f33b30119754f2
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-dynamic-lds-test-asan.ll
@@ -0,0 +1,113 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 4
+; RUN: opt < %s -passes=amdgpu-sw-lower-lds -S -mtriple=amdgcn-amd-amdhsa | FileCheck %s
+
+; Test to check if direct access of dynamic LDS in kernel is lowered correctly.
+ at lds_1 = external addrspace(3) global [0 x i8]
+ at lds_2 = external addrspace(3) global [0 x i8]
+
+;.
+; CHECK: @llvm.amdgcn.sw.lds.k0 = internal addrspace(3) global ptr poison, no_sanitize_address, align 1, !absolute_symbol [[META0:![0-9]+]]
+; CHECK: @llvm.amdgcn.k0.dynlds = external addrspace(3) global [0 x i8], no_sanitize_address, align 1, !absolute_symbol [[META1:![0-9]+]]
+; CHECK: @llvm.amdgcn.sw.lds.k0.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.k0.md.type { %llvm.amdgcn.sw.lds.k0.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 32, i32 0, i32 32 } }, no_sanitize_address
+;.
+define amdgpu_kernel void @k0() sanitize_address {
+; CHECK-LABEL: define amdgpu_kernel void @k0(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: WId:
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z()
+; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
+; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP7:%.*]]
+; CHECK: Malloc:
+; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, align 4
+; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 0, i32 2), align 4
+; CHECK-NEXT: [[TMP24:%.*]] = add i32 [[TMP8]], [[TMP9]]
+; CHECK-NEXT: [[TMP20:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds ptr addrspace(4), ptr addrspace(4) [[TMP20]], i64 15
+; CHECK-NEXT: store i32 [[TMP24]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 1, i32 0), align 4
+; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr addrspace(4) [[TMP18]], align 4
+; CHECK-NEXT: store i32 [[TMP13]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 1, i32 1), align 4
+; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[TMP13]], 0
+; CHECK-NEXT: [[TMP15:%.*]] = udiv i32 [[TMP14]], 1
+; CHECK-NEXT: [[TMP16:%.*]] = mul i32 [[TMP15]], 1
+; CHECK-NEXT: store i32 [[TMP16]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 1, i32 2), align 4
+; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP24]], [[TMP16]]
+; CHECK-NEXT: [[TMP21:%.*]] = zext i32 [[TMP17]] to i64
+; CHECK-NEXT: [[TMP22:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT: [[TMP23:%.*]] = ptrtoint ptr [[TMP22]] to i64
+; CHECK-NEXT: [[TMP19:%.*]] = call i64 @__asan_malloc_impl(i64 [[TMP21]], i64 [[TMP23]])
+; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP19]] to ptr addrspace(1)
+; CHECK-NEXT: store ptr addrspace(1) [[TMP6]], ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, align 8
+; CHECK-NEXT: [[TMP42:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP6]], i64 8
+; CHECK-NEXT: [[TMP44:%.*]] = ptrtoint ptr addrspace(1) [[TMP42]] to i64
+; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP44]], i64 24)
+; CHECK-NEXT: br label [[TMP7]]
+; CHECK: 23:
+; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ]
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: [[TMP28:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, align 8
+; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 1, i32 0), align 4
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, i32 [[TMP10]]
+; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.k0.dynlds) ]
+; CHECK-NEXT: [[TMP45:%.*]] = ptrtoint ptr addrspace(3) [[TMP11]] to i32
+; CHECK-NEXT: [[TMP46:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP28]], i32 [[TMP45]]
+; CHECK-NEXT: [[TMP29:%.*]] = ptrtoint ptr addrspace(1) [[TMP46]] to i64
+; CHECK-NEXT: [[TMP30:%.*]] = lshr i64 [[TMP29]], 3
+; CHECK-NEXT: [[TMP31:%.*]] = add i64 [[TMP30]], 2147450880
+; CHECK-NEXT: [[TMP32:%.*]] = inttoptr i64 [[TMP31]] to ptr
+; CHECK-NEXT: [[TMP33:%.*]] = load i8, ptr [[TMP32]], align 1
+; CHECK-NEXT: [[TMP34:%.*]] = icmp ne i8 [[TMP33]], 0
+; CHECK-NEXT: [[TMP35:%.*]] = and i64 [[TMP29]], 7
+; CHECK-NEXT: [[TMP36:%.*]] = trunc i64 [[TMP35]] to i8
+; CHECK-NEXT: [[TMP37:%.*]] = icmp sge i8 [[TMP36]], [[TMP33]]
+; CHECK-NEXT: [[TMP38:%.*]] = and i1 [[TMP34]], [[TMP37]]
+; CHECK-NEXT: [[TMP39:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[TMP38]])
+; CHECK-NEXT: [[TMP40:%.*]] = icmp ne i64 [[TMP39]], 0
+; CHECK-NEXT: br i1 [[TMP40]], label [[ASAN_REPORT:%.*]], label [[TMP43:%.*]], !prof [[PROF2:![0-9]+]]
+; CHECK: asan.report:
+; CHECK-NEXT: br i1 [[TMP38]], label [[TMP41:%.*]], label [[CONDFREE:%.*]]
+; CHECK: 41:
+; CHECK-NEXT: call void @__asan_report_store1(i64 [[TMP29]]) #[[ATTR6:[0-9]+]]
+; CHECK-NEXT: call void @llvm.amdgcn.unreachable()
+; CHECK-NEXT: br label [[CONDFREE]]
+; CHECK: 42:
+; CHECK-NEXT: br label [[TMP43]]
+; CHECK: 43:
+; CHECK-NEXT: store i8 7, ptr addrspace(1) [[TMP46]], align 4
+; CHECK-NEXT: br label [[CONDFREE1:%.*]]
+; CHECK: CondFree:
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: br i1 [[XYZCOND]], label [[FREE:%.*]], label [[END:%.*]]
+; CHECK: Free:
+; CHECK-NEXT: [[TMP25:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT: [[TMP26:%.*]] = ptrtoint ptr [[TMP25]] to i64
+; CHECK-NEXT: [[TMP27:%.*]] = ptrtoint ptr addrspace(1) [[TMP28]] to i64
+; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP27]], i64 [[TMP26]])
+; CHECK-NEXT: br label [[END]]
+; CHECK: End:
+; CHECK-NEXT: ret void
+;
+ store i8 7, ptr addrspace(3) @lds_1, align 4
+ ;store i8 8, ptr addrspace(3) @lds_2, align 8
+ ret void
+}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 4, !"nosanitize_address", i32 1}
+
+;.
+; CHECK: attributes #[[ATTR0]] = { sanitize_address "amdgpu-lds-size"="8,8" }
+; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) }
+; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+; CHECK: attributes #[[ATTR3:[0-9]+]] = { convergent nocallback nofree nounwind willreturn }
+; CHECK: attributes #[[ATTR4:[0-9]+]] = { convergent nocallback nofree nounwind willreturn memory(none) }
+; CHECK: attributes #[[ATTR5:[0-9]+]] = { convergent nocallback nofree nounwind }
+; CHECK: attributes #[[ATTR6]] = { nomerge }
+;.
+; CHECK: [[META0]] = !{i32 0, i32 1}
+; CHECK: [[META1]] = !{i32 8, i32 9}
+; CHECK: [[PROF2]] = !{!"branch_weights", i32 1, i32 1048575}
+;.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-dynamic-lds-test.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-dynamic-lds-test.ll
new file mode 100644
index 00000000000000..5e90eb0b95219e
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-dynamic-lds-test.ll
@@ -0,0 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 4
+; RUN: opt < %s -passes=amdgpu-sw-lower-lds -amdgpu-asan-instrument-lds=false -S -mtriple=amdgcn-amd-amdhsa | FileCheck %s
+
+; Test to check if direct access of dynamic LDS in kernel is lowered correctly.
+ at lds_1 = external addrspace(3) global [0 x i8]
+ at lds_2 = external addrspace(3) global [0 x i8]
+
+;.
+; CHECK: @llvm.amdgcn.sw.lds.k0 = internal addrspace(3) global ptr poison, no_sanitize_address, align 1, !absolute_symbol [[META0:![0-9]+]]
+; CHECK: @llvm.amdgcn.k0.dynlds = external addrspace(3) global [0 x i8], no_sanitize_address, align 1, !absolute_symbol [[META1:![0-9]+]]
+; CHECK: @llvm.amdgcn.sw.lds.k0.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.k0.md.type { %llvm.amdgcn.sw.lds.k0.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 32, i32 0, i32 32 } }, no_sanitize_address
+;.
+define amdgpu_kernel void @k0() sanitize_address {
+; CHECK-LABEL: define amdgpu_kernel void @k0(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: WId:
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z()
+; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
+; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP7:%.*]]
+; CHECK: Malloc:
+; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, align 4
+; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 0, i32 2), align 4
+; CHECK-NEXT: [[TMP24:%.*]] = add i32 [[TMP8]], [[TMP9]]
+; CHECK-NEXT: [[TMP20:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds ptr addrspace(4), ptr addrspace(4) [[TMP20]], i64 15
+; CHECK-NEXT: store i32 [[TMP24]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 1, i32 0), align 4
+; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr addrspace(4) [[TMP18]], align 4
+; CHECK-NEXT: store i32 [[TMP13]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 1, i32 1), align 4
+; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[TMP13]], 0
+; CHECK-NEXT: [[TMP15:%.*]] = udiv i32 [[TMP14]], 1
+; CHECK-NEXT: [[TMP16:%.*]] = mul i32 [[TMP15]], 1
+; CHECK-NEXT: store i32 [[TMP16]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 1, i32 2), align 4
+; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP24]], [[TMP16]]
+; CHECK-NEXT: [[TMP21:%.*]] = zext i32 [[TMP17]] to i64
+; CHECK-NEXT: [[TMP22:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT: [[TMP23:%.*]] = ptrtoint ptr [[TMP22]] to i64
+; CHECK-NEXT: [[TMP19:%.*]] = call i64 @__asan_malloc_impl(i64 [[TMP21]], i64 [[TMP23]])
+; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP19]] to ptr addrspace(1)
+; CHECK-NEXT: store ptr addrspace(1) [[TMP6]], ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, align 8
+; CHECK-NEXT: [[TMP42:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP6]], i64 8
+; CHECK-NEXT: [[TMP44:%.*]] = ptrtoint ptr addrspace(1) [[TMP42]] to i64
+; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP44]], i64 24)
+; CHECK-NEXT: br label [[TMP7]]
+; CHECK: 23:
+; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ]
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: [[TMP28:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, align 8
+; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 1, i32 0), align 4
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, i32 [[TMP10]]
+; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.k0.dynlds) ]
+; CHECK-NEXT: [[TMP45:%.*]] = ptrtoint ptr addrspace(3) [[TMP11]] to i32
+; CHECK-NEXT: [[TMP46:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP28]], i32 [[TMP45]]
+; CHECK-NEXT: store i8 7, ptr addrspace(1) [[TMP46]], align 4
+; CHECK-NEXT: br label [[CONDFREE1:%.*]]
+; CHECK: CondFree:
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: br i1 [[XYZCOND]], label [[FREE:%.*]], label [[END:%.*]]
+; CHECK: Free:
+; CHECK-NEXT: [[TMP25:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT: [[TMP26:%.*]] = ptrtoint ptr [[TMP25]] to i64
+; CHECK-NEXT: [[TMP27:%.*]] = ptrtoint ptr addrspace(1) [[TMP28]] to i64
+; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP27]], i64 [[TMP26]])
+; CHECK-NEXT: br label [[END]]
+; CHECK: End:
+; CHECK-NEXT: ret void
+;
+ store i8 7, ptr addrspace(3) @lds_1, align 4
+ ;store i8 8, ptr addrspace(3) @lds_2, align 8
+ ret void
+}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 4, !"nosanitize_address", i32 1}
+
+;.
+; CHECK: attributes #[[ATTR0]] = { sanitize_address "amdgpu-lds-size"="8,8" }
+; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) }
+; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+; CHECK: attributes #[[ATTR3:[0-9]+]] = { convergent nocallback nofree nounwind willreturn }
+;.
+; CHECK: [[META0]] = !{i32 0, i32 1}
+; CHECK: [[META1]] = !{i32 8, i32 9}
+;.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-multi-static-dynamic-indirect-access-asan.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-multi-static-dynamic-indirect-access-asan.ll
new file mode 100644
index 00000000000000..91e0a9fc5018be
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-multi-static-dynamic-indirect-access-asan.ll
@@ -0,0 +1,385 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt < %s -passes=amdgpu-sw-lower-lds -S -mtriple=amdgcn-amd-amdhsa | FileCheck %s
+
+; Test to check that when multiple kernels access the same non-kernel function, LDS accesses are lowered correctly.
+ at lds_1 = internal addrspace(3) global [1 x i8] poison, align 1
+ at lds_2 = internal addrspace(3) global [1 x i32] poison, align 2
+ at lds_3 = external addrspace(3) global [0 x i8], align 4
+ at lds_4 = external addrspace(3) global [0 x i8], align 8
+
+define void @use_variables_1() sanitize_address {
+; CHECK-LABEL: define void @use_variables_1(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id()
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x ptr addrspace(3)], ptr addrspace(1) @llvm.amdgcn.sw.lds.base.table, i32 0, i32 [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(3), ptr addrspace(1) [[TMP2]], align 4
+; CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(3) [[TMP4]], align 8
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x [4 x ptr addrspace(1)]], ptr addrspace(1) @llvm.amdgcn.sw.lds.offset.table, i32 0, i32 [[TMP1]], i32 2
+; CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(1) [[TMP6]], align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(1) [[TMP5]], align 4
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP4]], i32 [[TMP8]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x [4 x ptr addrspace(1)]], ptr addrspace(1) @llvm.amdgcn.sw.lds.offset.table, i32 0, i32 [[TMP1]], i32 3
+; CHECK-NEXT: [[TMP12:%.*]] = load ptr addrspace(1), ptr addrspace(1) [[TMP11]], align 8
+; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(1) [[TMP12]], align 4
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP4]], i32 [[TMP10]]
+; CHECK-NEXT: [[TMP13:%.*]] = ptrtoint ptr addrspace(3) [[TMP9]] to i32
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP7]], i32 [[TMP13]]
+; CHECK-NEXT: [[TMP47:%.*]] = ptrtoint ptr addrspace(1) [[TMP14]] to i64
+; CHECK-NEXT: [[TMP16:%.*]] = lshr i64 [[TMP47]], 3
+; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[TMP16]], 2147450880
+; CHECK-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
+; CHECK-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1
+; CHECK-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP19]], 0
+; CHECK-NEXT: [[TMP21:%.*]] = and i64 [[TMP47]], 7
+; CHECK-NEXT: [[TMP22:%.*]] = trunc i64 [[TMP21]] to i8
+; CHECK-NEXT: [[TMP23:%.*]] = icmp sge i8 [[TMP22]], [[TMP19]]
+; CHECK-NEXT: [[TMP24:%.*]] = and i1 [[TMP20]], [[TMP23]]
+; CHECK-NEXT: [[TMP25:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[TMP24]])
+; CHECK-NEXT: [[TMP26:%.*]] = icmp ne i64 [[TMP25]], 0
+; CHECK-NEXT: br i1 [[TMP26]], label [[ASAN_REPORT:%.*]], label [[TMP29:%.*]], !prof [[PROF3:![0-9]+]]
+; CHECK: asan.report:
+; CHECK-NEXT: br i1 [[TMP24]], label [[TMP27:%.*]], label [[TMP28:%.*]]
+; CHECK: 27:
+; CHECK-NEXT: call void @__asan_report_store1(i64 [[TMP47]]) #[[ATTR7:[0-9]+]]
+; CHECK-NEXT: call void @llvm.amdgcn.unreachable()
+; CHECK-NEXT: br label [[TMP28]]
+; CHECK: 28:
+; CHECK-NEXT: br label [[TMP29]]
+; CHECK: 29:
+; CHECK-NEXT: store i8 3, ptr addrspace(1) [[TMP14]], align 4
+; CHECK-NEXT: [[TMP30:%.*]] = ptrtoint ptr addrspace(3) [[TMP15]] to i32
+; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP7]], i32 [[TMP30]]
+; CHECK-NEXT: [[TMP32:%.*]] = ptrtoint ptr addrspace(1) [[TMP31]] to i64
+; CHECK-NEXT: [[TMP33:%.*]] = lshr i64 [[TMP32]], 3
+; CHECK-NEXT: [[TMP34:%.*]] = add i64 [[TMP33]], 2147450880
+; CHECK-NEXT: [[TMP35:%.*]] = inttoptr i64 [[TMP34]] to ptr
+; CHECK-NEXT: [[TMP36:%.*]] = load i8, ptr [[TMP35]], align 1
+; CHECK-NEXT: [[TMP37:%.*]] = icmp ne i8 [[TMP36]], 0
+; CHECK-NEXT: [[TMP38:%.*]] = and i64 [[TMP32]], 7
+; CHECK-NEXT: [[TMP39:%.*]] = trunc i64 [[TMP38]] to i8
+; CHECK-NEXT: [[TMP40:%.*]] = icmp sge i8 [[TMP39]], [[TMP36]]
+; CHECK-NEXT: [[TMP41:%.*]] = and i1 [[TMP37]], [[TMP40]]
+; CHECK-NEXT: [[TMP42:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[TMP41]])
+; CHECK-NEXT: [[TMP43:%.*]] = icmp ne i64 [[TMP42]], 0
+; CHECK-NEXT: br i1 [[TMP43]], label [[ASAN_REPORT1:%.*]], label [[TMP46:%.*]], !prof [[PROF3]]
+; CHECK: asan.report1:
+; CHECK-NEXT: br i1 [[TMP41]], label [[TMP44:%.*]], label [[TMP45:%.*]]
+; CHECK: 44:
+; CHECK-NEXT: call void @__asan_report_store1(i64 [[TMP32]]) #[[ATTR7]]
+; CHECK-NEXT: call void @llvm.amdgcn.unreachable()
+; CHECK-NEXT: br label [[TMP45]]
+; CHECK: 45:
+; CHECK-NEXT: br label [[TMP46]]
+; CHECK: 46:
+; CHECK-NEXT: store i8 3, ptr addrspace(1) [[TMP31]], align 8
+; CHECK-NEXT: ret void
+;
+ store i8 3, ptr addrspace(3) @lds_3, align 4
+ store i8 3, ptr addrspace(3) @lds_4, align 8
+ ret void
+}
+
+define void @use_variables_2() sanitize_address {
+; CHECK-LABEL: define void @use_variables_2(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id()
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x ptr addrspace(3)], ptr addrspace(1) @llvm.amdgcn.sw.lds.base.table, i32 0, i32 [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(3), ptr addrspace(1) [[TMP2]], align 4
+; CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(3) [[TMP4]], align 8
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x [4 x ptr addrspace(1)]], ptr addrspace(1) @llvm.amdgcn.sw.lds.offset.table, i32 0, i32 [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(1) [[TMP6]], align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(1) [[TMP5]], align 4
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP4]], i32 [[TMP8]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x [4 x ptr addrspace(1)]], ptr addrspace(1) @llvm.amdgcn.sw.lds.offset.table, i32 0, i32 [[TMP1]], i32 1
+; CHECK-NEXT: [[TMP12:%.*]] = load ptr addrspace(1), ptr addrspace(1) [[TMP11]], align 8
+; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(1) [[TMP12]], align 4
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP4]], i32 [[TMP10]]
+; CHECK-NEXT: [[TMP13:%.*]] = ptrtoint ptr addrspace(3) [[TMP9]] to i32
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP7]], i32 [[TMP13]]
+; CHECK-NEXT: [[TMP48:%.*]] = ptrtoint ptr addrspace(1) [[TMP14]] to i64
+; CHECK-NEXT: [[TMP16:%.*]] = lshr i64 [[TMP48]], 3
+; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[TMP16]], 2147450880
+; CHECK-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
+; CHECK-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1
+; CHECK-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP19]], 0
+; CHECK-NEXT: [[TMP21:%.*]] = and i64 [[TMP48]], 7
+; CHECK-NEXT: [[TMP22:%.*]] = trunc i64 [[TMP21]] to i8
+; CHECK-NEXT: [[TMP23:%.*]] = icmp sge i8 [[TMP22]], [[TMP19]]
+; CHECK-NEXT: [[TMP24:%.*]] = and i1 [[TMP20]], [[TMP23]]
+; CHECK-NEXT: [[TMP25:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[TMP24]])
+; CHECK-NEXT: [[TMP26:%.*]] = icmp ne i64 [[TMP25]], 0
+; CHECK-NEXT: br i1 [[TMP26]], label [[ASAN_REPORT:%.*]], label [[TMP29:%.*]], !prof [[PROF3]]
+; CHECK: asan.report:
+; CHECK-NEXT: br i1 [[TMP24]], label [[TMP27:%.*]], label [[TMP28:%.*]]
+; CHECK: 27:
+; CHECK-NEXT: call void @__asan_report_store1(i64 [[TMP48]]) #[[ATTR7]]
+; CHECK-NEXT: call void @llvm.amdgcn.unreachable()
+; CHECK-NEXT: br label [[TMP28]]
+; CHECK: 28:
+; CHECK-NEXT: br label [[TMP29]]
+; CHECK: 29:
+; CHECK-NEXT: store i8 7, ptr addrspace(1) [[TMP14]], align 1
+; CHECK-NEXT: [[TMP30:%.*]] = ptrtoint ptr addrspace(3) [[TMP15]] to i32
+; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP7]], i32 [[TMP30]]
+; CHECK-NEXT: [[TMP32:%.*]] = ptrtoint ptr addrspace(1) [[TMP31]] to i64
+; CHECK-NEXT: [[TMP38:%.*]] = add i64 [[TMP32]], 3
+; CHECK-NEXT: [[TMP49:%.*]] = inttoptr i64 [[TMP38]] to ptr addrspace(1)
+; CHECK-NEXT: [[TMP65:%.*]] = ptrtoint ptr addrspace(1) [[TMP31]] to i64
+; CHECK-NEXT: [[TMP33:%.*]] = lshr i64 [[TMP65]], 3
+; CHECK-NEXT: [[TMP34:%.*]] = add i64 [[TMP33]], 2147450880
+; CHECK-NEXT: [[TMP35:%.*]] = inttoptr i64 [[TMP34]] to ptr
+; CHECK-NEXT: [[TMP36:%.*]] = load i8, ptr [[TMP35]], align 1
+; CHECK-NEXT: [[TMP37:%.*]] = icmp ne i8 [[TMP36]], 0
+; CHECK-NEXT: [[TMP39:%.*]] = and i64 [[TMP65]], 7
+; CHECK-NEXT: [[TMP40:%.*]] = trunc i64 [[TMP39]] to i8
+; CHECK-NEXT: [[TMP41:%.*]] = icmp sge i8 [[TMP40]], [[TMP36]]
+; CHECK-NEXT: [[TMP42:%.*]] = and i1 [[TMP37]], [[TMP41]]
+; CHECK-NEXT: [[TMP43:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[TMP42]])
+; CHECK-NEXT: [[TMP44:%.*]] = icmp ne i64 [[TMP43]], 0
+; CHECK-NEXT: br i1 [[TMP44]], label [[ASAN_REPORT1:%.*]], label [[TMP47:%.*]], !prof [[PROF3]]
+; CHECK: asan.report1:
+; CHECK-NEXT: br i1 [[TMP42]], label [[TMP45:%.*]], label [[TMP46:%.*]]
+; CHECK: 47:
+; CHECK-NEXT: call void @__asan_report_store1(i64 [[TMP65]]) #[[ATTR7]]
+; CHECK-NEXT: call void @llvm.amdgcn.unreachable()
+; CHECK-NEXT: br label [[TMP46]]
+; CHECK: 48:
+; CHECK-NEXT: br label [[TMP47]]
+; CHECK: 49:
+; CHECK-NEXT: [[TMP50:%.*]] = ptrtoint ptr addrspace(1) [[TMP49]] to i64
+; CHECK-NEXT: [[TMP51:%.*]] = lshr i64 [[TMP50]], 3
+; CHECK-NEXT: [[TMP52:%.*]] = add i64 [[TMP51]], 2147450880
+; CHECK-NEXT: [[TMP53:%.*]] = inttoptr i64 [[TMP52]] to ptr
+; CHECK-NEXT: [[TMP54:%.*]] = load i8, ptr [[TMP53]], align 1
+; CHECK-NEXT: [[TMP55:%.*]] = icmp ne i8 [[TMP54]], 0
+; CHECK-NEXT: [[TMP56:%.*]] = and i64 [[TMP50]], 7
+; CHECK-NEXT: [[TMP57:%.*]] = trunc i64 [[TMP56]] to i8
+; CHECK-NEXT: [[TMP58:%.*]] = icmp sge i8 [[TMP57]], [[TMP54]]
+; CHECK-NEXT: [[TMP59:%.*]] = and i1 [[TMP55]], [[TMP58]]
+; CHECK-NEXT: [[TMP60:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[TMP59]])
+; CHECK-NEXT: [[TMP61:%.*]] = icmp ne i64 [[TMP60]], 0
+; CHECK-NEXT: br i1 [[TMP61]], label [[ASAN_REPORT2:%.*]], label [[TMP64:%.*]], !prof [[PROF3]]
+; CHECK: asan.report2:
+; CHECK-NEXT: br i1 [[TMP59]], label [[TMP62:%.*]], label [[TMP63:%.*]]
+; CHECK: 62:
+; CHECK-NEXT: call void @__asan_report_store1(i64 [[TMP50]]) #[[ATTR7]]
+; CHECK-NEXT: call void @llvm.amdgcn.unreachable()
+; CHECK-NEXT: br label [[TMP63]]
+; CHECK: 63:
+; CHECK-NEXT: br label [[TMP64]]
+; CHECK: 64:
+; CHECK-NEXT: store i32 8, ptr addrspace(1) [[TMP31]], align 2
+; CHECK-NEXT: ret void
+;
+ store i8 7, ptr addrspace(3) @lds_1, align 1
+ store i32 8, ptr addrspace(3) @lds_2, align 2
+ ret void
+}
+
+define amdgpu_kernel void @k0() sanitize_address {
+; CHECK-LABEL: define amdgpu_kernel void @k0(
+; CHECK-SAME: ) #[[ATTR1:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META4:![0-9]+]] {
+; CHECK-NEXT: WId:
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z()
+; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
+; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP21:%.*]]
+; CHECK: Malloc:
+; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 1, i32 0), align 4
+; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 1, i32 2), align 4
+; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP9]], [[TMP7]]
+; CHECK-NEXT: [[TMP6:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds ptr addrspace(4), ptr addrspace(4) [[TMP6]], i64 15
+; CHECK-NEXT: store i32 [[TMP8]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 2, i32 0), align 4
+; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr addrspace(4) [[TMP10]], align 4
+; CHECK-NEXT: store i32 [[TMP11]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 2, i32 1), align 4
+; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP11]], 7
+; CHECK-NEXT: [[TMP13:%.*]] = udiv i32 [[TMP12]], 8
+; CHECK-NEXT: [[TMP14:%.*]] = mul i32 [[TMP13]], 8
+; CHECK-NEXT: store i32 [[TMP14]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 2, i32 2), align 4
+; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP8]], [[TMP14]]
+; CHECK-NEXT: store i32 [[TMP15]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 3, i32 0), align 4
+; CHECK-NEXT: [[TMP25:%.*]] = load i32, ptr addrspace(4) [[TMP10]], align 4
+; CHECK-NEXT: store i32 [[TMP25]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 3, i32 1), align 4
+; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP25]], 7
+; CHECK-NEXT: [[TMP18:%.*]] = udiv i32 [[TMP17]], 8
+; CHECK-NEXT: [[TMP19:%.*]] = mul i32 [[TMP18]], 8
+; CHECK-NEXT: store i32 [[TMP19]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 3, i32 2), align 4
+; CHECK-NEXT: [[TMP26:%.*]] = add i32 [[TMP15]], [[TMP19]]
+; CHECK-NEXT: [[TMP27:%.*]] = zext i32 [[TMP26]] to i64
+; CHECK-NEXT: [[TMP28:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT: [[TMP33:%.*]] = ptrtoint ptr [[TMP28]] to i64
+; CHECK-NEXT: [[TMP24:%.*]] = call i64 @__asan_malloc_impl(i64 [[TMP27]], i64 [[TMP33]])
+; CHECK-NEXT: [[TMP20:%.*]] = inttoptr i64 [[TMP24]] to ptr addrspace(1)
+; CHECK-NEXT: store ptr addrspace(1) [[TMP20]], ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, align 8
+; CHECK-NEXT: [[TMP49:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP20]], i64 8
+; CHECK-NEXT: [[TMP51:%.*]] = ptrtoint ptr addrspace(1) [[TMP49]] to i64
+; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP51]], i64 24)
+; CHECK-NEXT: [[TMP52:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP20]], i64 33
+; CHECK-NEXT: [[TMP53:%.*]] = ptrtoint ptr addrspace(1) [[TMP52]] to i64
+; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP53]], i64 31)
+; CHECK-NEXT: br label [[TMP21]]
+; CHECK: 30:
+; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ]
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: [[TMP29:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, align 8
+; CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 1, i32 0), align 4
+; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, i32 [[TMP22]]
+; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.k0.dynlds) ]
+; CHECK-NEXT: call void @use_variables_1()
+; CHECK-NEXT: [[TMP34:%.*]] = ptrtoint ptr addrspace(3) [[TMP23]] to i32
+; CHECK-NEXT: [[TMP35:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP29]], i32 [[TMP34]]
+; CHECK-NEXT: [[TMP36:%.*]] = ptrtoint ptr addrspace(1) [[TMP35]] to i64
+; CHECK-NEXT: [[TMP37:%.*]] = lshr i64 [[TMP36]], 3
+; CHECK-NEXT: [[TMP38:%.*]] = add i64 [[TMP37]], 2147450880
+; CHECK-NEXT: [[TMP39:%.*]] = inttoptr i64 [[TMP38]] to ptr
+; CHECK-NEXT: [[TMP40:%.*]] = load i8, ptr [[TMP39]], align 1
+; CHECK-NEXT: [[TMP41:%.*]] = icmp ne i8 [[TMP40]], 0
+; CHECK-NEXT: [[TMP42:%.*]] = and i64 [[TMP36]], 7
+; CHECK-NEXT: [[TMP43:%.*]] = trunc i64 [[TMP42]] to i8
+; CHECK-NEXT: [[TMP44:%.*]] = icmp sge i8 [[TMP43]], [[TMP40]]
+; CHECK-NEXT: [[TMP45:%.*]] = and i1 [[TMP41]], [[TMP44]]
+; CHECK-NEXT: [[TMP46:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[TMP45]])
+; CHECK-NEXT: [[TMP47:%.*]] = icmp ne i64 [[TMP46]], 0
+; CHECK-NEXT: br i1 [[TMP47]], label [[ASAN_REPORT:%.*]], label [[TMP50:%.*]], !prof [[PROF3]]
+; CHECK: asan.report:
+; CHECK-NEXT: br i1 [[TMP45]], label [[TMP48:%.*]], label [[CONDFREE:%.*]]
+; CHECK: 48:
+; CHECK-NEXT: call void @__asan_report_store1(i64 [[TMP36]]) #[[ATTR7]]
+; CHECK-NEXT: call void @llvm.amdgcn.unreachable()
+; CHECK-NEXT: br label [[CONDFREE]]
+; CHECK: 49:
+; CHECK-NEXT: br label [[TMP50]]
+; CHECK: 50:
+; CHECK-NEXT: store i8 7, ptr addrspace(1) [[TMP35]], align 1
+; CHECK-NEXT: br label [[CONDFREE1:%.*]]
+; CHECK: CondFree:
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: br i1 [[XYZCOND]], label [[FREE:%.*]], label [[END:%.*]]
+; CHECK: Free:
+; CHECK-NEXT: [[TMP30:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT: [[TMP31:%.*]] = ptrtoint ptr [[TMP30]] to i64
+; CHECK-NEXT: [[TMP32:%.*]] = ptrtoint ptr addrspace(1) [[TMP29]] to i64
+; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP32]], i64 [[TMP31]])
+; CHECK-NEXT: br label [[END]]
+; CHECK: End:
+; CHECK-NEXT: ret void
+;
+ call void @use_variables_1()
+ store i8 7, ptr addrspace(3) @lds_1, align 1
+ ret void
+}
+
+define amdgpu_kernel void @k1() sanitize_address {
+; CHECK-LABEL: define amdgpu_kernel void @k1(
+; CHECK-SAME: ) #[[ATTR1]] !llvm.amdgcn.lds.kernel.id [[META5:![0-9]+]] {
+; CHECK-NEXT: WId:
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z()
+; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
+; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP14:%.*]]
+; CHECK: Malloc:
+; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K1_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k1.md, i32 0, i32 2, i32 0), align 4
+; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K1_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k1.md, i32 0, i32 2, i32 2), align 4
+; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP9]], [[TMP7]]
+; CHECK-NEXT: [[TMP6:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds ptr addrspace(4), ptr addrspace(4) [[TMP6]], i64 15
+; CHECK-NEXT: store i32 [[TMP8]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K1_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k1.md, i32 0, i32 3, i32 0), align 4
+; CHECK-NEXT: [[TMP30:%.*]] = load i32, ptr addrspace(4) [[TMP10]], align 4
+; CHECK-NEXT: store i32 [[TMP30]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K1_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k1.md, i32 0, i32 3, i32 1), align 4
+; CHECK-NEXT: [[TMP33:%.*]] = add i32 [[TMP30]], 7
+; CHECK-NEXT: [[TMP18:%.*]] = udiv i32 [[TMP33]], 8
+; CHECK-NEXT: [[TMP19:%.*]] = mul i32 [[TMP18]], 8
+; CHECK-NEXT: store i32 [[TMP19]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K1_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k1.md, i32 0, i32 3, i32 2), align 4
+; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP8]], [[TMP19]]
+; CHECK-NEXT: store i32 [[TMP20]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K1_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k1.md, i32 0, i32 4, i32 0), align 4
+; CHECK-NEXT: [[TMP21:%.*]] = load i32, ptr addrspace(4) [[TMP10]], align 4
+; CHECK-NEXT: store i32 [[TMP21]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K1_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k1.md, i32 0, i32 4, i32 1), align 4
+; CHECK-NEXT: [[TMP22:%.*]] = add i32 [[TMP21]], 7
+; CHECK-NEXT: [[TMP23:%.*]] = udiv i32 [[TMP22]], 8
+; CHECK-NEXT: [[TMP24:%.*]] = mul i32 [[TMP23]], 8
+; CHECK-NEXT: store i32 [[TMP24]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K1_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k1.md, i32 0, i32 4, i32 2), align 4
+; CHECK-NEXT: [[TMP25:%.*]] = add i32 [[TMP20]], [[TMP24]]
+; CHECK-NEXT: [[TMP26:%.*]] = zext i32 [[TMP25]] to i64
+; CHECK-NEXT: [[TMP27:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT: [[TMP28:%.*]] = ptrtoint ptr [[TMP27]] to i64
+; CHECK-NEXT: [[TMP34:%.*]] = call i64 @__asan_malloc_impl(i64 [[TMP26]], i64 [[TMP28]])
+; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP34]] to ptr addrspace(1)
+; CHECK-NEXT: store ptr addrspace(1) [[TMP13]], ptr addrspace(3) @llvm.amdgcn.sw.lds.k1, align 8
+; CHECK-NEXT: [[TMP51:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP13]], i64 8
+; CHECK-NEXT: [[TMP53:%.*]] = ptrtoint ptr addrspace(1) [[TMP51]] to i64
+; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP53]], i64 24)
+; CHECK-NEXT: [[TMP54:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP13]], i64 33
+; CHECK-NEXT: [[TMP55:%.*]] = ptrtoint ptr addrspace(1) [[TMP54]] to i64
+; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP55]], i64 31)
+; CHECK-NEXT: [[TMP56:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP13]], i64 68
+; CHECK-NEXT: [[TMP57:%.*]] = ptrtoint ptr addrspace(1) [[TMP56]] to i64
+; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP57]], i64 28)
+; CHECK-NEXT: br label [[TMP14]]
+; CHECK: 32:
+; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ]
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: [[TMP29:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.k1, align 8
+; CHECK-NEXT: [[TMP31:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K1_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k1.md, i32 0, i32 3, i32 0), align 4
+; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr addrspace(3) @llvm.amdgcn.sw.lds.k1, i32 [[TMP31]]
+; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.k1.dynlds) ]
+; CHECK-NEXT: call void @use_variables_1()
+; CHECK-NEXT: call void @use_variables_2()
+; CHECK-NEXT: [[TMP58:%.*]] = ptrtoint ptr addrspace(3) [[TMP32]] to i32
+; CHECK-NEXT: [[TMP59:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP29]], i32 [[TMP58]]
+; CHECK-NEXT: [[TMP38:%.*]] = ptrtoint ptr addrspace(1) [[TMP59]] to i64
+; CHECK-NEXT: [[TMP39:%.*]] = lshr i64 [[TMP38]], 3
+; CHECK-NEXT: [[TMP40:%.*]] = add i64 [[TMP39]], 2147450880
+; CHECK-NEXT: [[TMP41:%.*]] = inttoptr i64 [[TMP40]] to ptr
+; CHECK-NEXT: [[TMP42:%.*]] = load i8, ptr [[TMP41]], align 1
+; CHECK-NEXT: [[TMP43:%.*]] = icmp ne i8 [[TMP42]], 0
+; CHECK-NEXT: [[TMP44:%.*]] = and i64 [[TMP38]], 7
+; CHECK-NEXT: [[TMP45:%.*]] = trunc i64 [[TMP44]] to i8
+; CHECK-NEXT: [[TMP46:%.*]] = icmp sge i8 [[TMP45]], [[TMP42]]
+; CHECK-NEXT: [[TMP47:%.*]] = and i1 [[TMP43]], [[TMP46]]
+; CHECK-NEXT: [[TMP48:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[TMP47]])
+; CHECK-NEXT: [[TMP49:%.*]] = icmp ne i64 [[TMP48]], 0
+; CHECK-NEXT: br i1 [[TMP49]], label [[ASAN_REPORT:%.*]], label [[TMP52:%.*]], !prof [[PROF3]]
+; CHECK: asan.report:
+; CHECK-NEXT: br i1 [[TMP47]], label [[TMP50:%.*]], label [[CONDFREE:%.*]]
+; CHECK: 50:
+; CHECK-NEXT: call void @__asan_report_store1(i64 [[TMP38]]) #[[ATTR7]]
+; CHECK-NEXT: call void @llvm.amdgcn.unreachable()
+; CHECK-NEXT: br label [[CONDFREE]]
+; CHECK: 51:
+; CHECK-NEXT: br label [[TMP52]]
+; CHECK: 52:
+; CHECK-NEXT: store i8 3, ptr addrspace(1) [[TMP59]], align 4
+; CHECK-NEXT: br label [[CONDFREE1:%.*]]
+; CHECK: CondFree:
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: br i1 [[XYZCOND]], label [[FREE:%.*]], label [[END:%.*]]
+; CHECK: Free:
+; CHECK-NEXT: [[TMP35:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT: [[TMP36:%.*]] = ptrtoint ptr [[TMP35]] to i64
+; CHECK-NEXT: [[TMP37:%.*]] = ptrtoint ptr addrspace(1) [[TMP29]] to i64
+; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP37]], i64 [[TMP36]])
+; CHECK-NEXT: br label [[END]]
+; CHECK: End:
+; CHECK-NEXT: ret void
+;
+ call void @use_variables_1()
+ call void @use_variables_2()
+ store i8 3, ptr addrspace(3) @lds_3, align 4
+ ret void
+}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 4, !"nosanitize_address", i32 1}
+
+;.
+; CHECK: [[PROF3]] = !{!"branch_weights", i32 1, i32 1048575}
+; CHECK: [[META4]] = !{i32 0}
+; CHECK: [[META5]] = !{i32 1}
+;.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-multi-static-dynamic-indirect-access.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-multi-static-dynamic-indirect-access.ll
new file mode 100644
index 00000000000000..d0caddb7934a72
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-multi-static-dynamic-indirect-access.ll
@@ -0,0 +1,227 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt < %s -passes=amdgpu-sw-lower-lds -amdgpu-asan-instrument-lds=false -S -mtriple=amdgcn-amd-amdhsa | FileCheck %s
+
+; Test to check that when multiple kernels access the same non-kernel function, LDS accesses are lowered correctly.
+ at lds_1 = internal addrspace(3) global [1 x i8] poison, align 1
+ at lds_2 = internal addrspace(3) global [1 x i32] poison, align 2
+ at lds_3 = external addrspace(3) global [0 x i8], align 4
+ at lds_4 = external addrspace(3) global [0 x i8], align 8
+
+define void @use_variables_1() sanitize_address {
+; CHECK-LABEL: define void @use_variables_1(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id()
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x ptr addrspace(3)], ptr addrspace(1) @llvm.amdgcn.sw.lds.base.table, i32 0, i32 [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(3), ptr addrspace(1) [[TMP2]], align 4
+; CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(3) [[TMP4]], align 8
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x [4 x ptr addrspace(1)]], ptr addrspace(1) @llvm.amdgcn.sw.lds.offset.table, i32 0, i32 [[TMP1]], i32 2
+; CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(1) [[TMP6]], align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(1) [[TMP5]], align 4
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP4]], i32 [[TMP8]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x [4 x ptr addrspace(1)]], ptr addrspace(1) @llvm.amdgcn.sw.lds.offset.table, i32 0, i32 [[TMP1]], i32 3
+; CHECK-NEXT: [[TMP12:%.*]] = load ptr addrspace(1), ptr addrspace(1) [[TMP11]], align 8
+; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(1) [[TMP12]], align 4
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP4]], i32 [[TMP10]]
+; CHECK-NEXT: [[TMP13:%.*]] = ptrtoint ptr addrspace(3) [[TMP9]] to i32
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP7]], i32 [[TMP13]]
+; CHECK-NEXT: store i8 3, ptr addrspace(1) [[TMP14]], align 4
+; CHECK-NEXT: [[TMP30:%.*]] = ptrtoint ptr addrspace(3) [[TMP15]] to i32
+; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP7]], i32 [[TMP30]]
+; CHECK-NEXT: store i8 3, ptr addrspace(1) [[TMP31]], align 8
+; CHECK-NEXT: ret void
+;
+ store i8 3, ptr addrspace(3) @lds_3, align 4
+ store i8 3, ptr addrspace(3) @lds_4, align 8
+ ret void
+}
+
+define void @use_variables_2() sanitize_address {
+; CHECK-LABEL: define void @use_variables_2(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id()
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x ptr addrspace(3)], ptr addrspace(1) @llvm.amdgcn.sw.lds.base.table, i32 0, i32 [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(3), ptr addrspace(1) [[TMP2]], align 4
+; CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(3) [[TMP4]], align 8
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x [4 x ptr addrspace(1)]], ptr addrspace(1) @llvm.amdgcn.sw.lds.offset.table, i32 0, i32 [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(1) [[TMP6]], align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(1) [[TMP5]], align 4
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP4]], i32 [[TMP8]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x [4 x ptr addrspace(1)]], ptr addrspace(1) @llvm.amdgcn.sw.lds.offset.table, i32 0, i32 [[TMP1]], i32 1
+; CHECK-NEXT: [[TMP12:%.*]] = load ptr addrspace(1), ptr addrspace(1) [[TMP11]], align 8
+; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(1) [[TMP12]], align 4
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP4]], i32 [[TMP10]]
+; CHECK-NEXT: [[TMP13:%.*]] = ptrtoint ptr addrspace(3) [[TMP9]] to i32
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP7]], i32 [[TMP13]]
+; CHECK-NEXT: store i8 7, ptr addrspace(1) [[TMP14]], align 1
+; CHECK-NEXT: [[TMP30:%.*]] = ptrtoint ptr addrspace(3) [[TMP15]] to i32
+; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP7]], i32 [[TMP30]]
+; CHECK-NEXT: store i32 8, ptr addrspace(1) [[TMP31]], align 2
+; CHECK-NEXT: ret void
+;
+ store i8 7, ptr addrspace(3) @lds_1, align 1
+ store i32 8, ptr addrspace(3) @lds_2, align 2
+ ret void
+}
+
+define amdgpu_kernel void @k0() sanitize_address {
+; CHECK-LABEL: define amdgpu_kernel void @k0(
+; CHECK-SAME: ) #[[ATTR1:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META2:![0-9]+]] {
+; CHECK-NEXT: WId:
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z()
+; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
+; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP21:%.*]]
+; CHECK: Malloc:
+; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 1, i32 0), align 4
+; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 1, i32 2), align 4
+; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP9]], [[TMP7]]
+; CHECK-NEXT: [[TMP6:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds ptr addrspace(4), ptr addrspace(4) [[TMP6]], i64 15
+; CHECK-NEXT: store i32 [[TMP8]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 2, i32 0), align 4
+; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr addrspace(4) [[TMP10]], align 4
+; CHECK-NEXT: store i32 [[TMP11]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 2, i32 1), align 4
+; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP11]], 7
+; CHECK-NEXT: [[TMP13:%.*]] = udiv i32 [[TMP12]], 8
+; CHECK-NEXT: [[TMP14:%.*]] = mul i32 [[TMP13]], 8
+; CHECK-NEXT: store i32 [[TMP14]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 2, i32 2), align 4
+; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP8]], [[TMP14]]
+; CHECK-NEXT: store i32 [[TMP15]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 3, i32 0), align 4
+; CHECK-NEXT: [[TMP25:%.*]] = load i32, ptr addrspace(4) [[TMP10]], align 4
+; CHECK-NEXT: store i32 [[TMP25]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 3, i32 1), align 4
+; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP25]], 7
+; CHECK-NEXT: [[TMP18:%.*]] = udiv i32 [[TMP17]], 8
+; CHECK-NEXT: [[TMP19:%.*]] = mul i32 [[TMP18]], 8
+; CHECK-NEXT: store i32 [[TMP19]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 3, i32 2), align 4
+; CHECK-NEXT: [[TMP26:%.*]] = add i32 [[TMP15]], [[TMP19]]
+; CHECK-NEXT: [[TMP27:%.*]] = zext i32 [[TMP26]] to i64
+; CHECK-NEXT: [[TMP28:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT: [[TMP33:%.*]] = ptrtoint ptr [[TMP28]] to i64
+; CHECK-NEXT: [[TMP24:%.*]] = call i64 @__asan_malloc_impl(i64 [[TMP27]], i64 [[TMP33]])
+; CHECK-NEXT: [[TMP20:%.*]] = inttoptr i64 [[TMP24]] to ptr addrspace(1)
+; CHECK-NEXT: store ptr addrspace(1) [[TMP20]], ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, align 8
+; CHECK-NEXT: [[TMP49:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP20]], i64 8
+; CHECK-NEXT: [[TMP51:%.*]] = ptrtoint ptr addrspace(1) [[TMP49]] to i64
+; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP51]], i64 24)
+; CHECK-NEXT: [[TMP52:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP20]], i64 33
+; CHECK-NEXT: [[TMP53:%.*]] = ptrtoint ptr addrspace(1) [[TMP52]] to i64
+; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP53]], i64 31)
+; CHECK-NEXT: br label [[TMP21]]
+; CHECK: 30:
+; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ]
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: [[TMP29:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, align 8
+; CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 1, i32 0), align 4
+; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, i32 [[TMP22]]
+; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.k0.dynlds) ]
+; CHECK-NEXT: call void @use_variables_1()
+; CHECK-NEXT: [[TMP34:%.*]] = ptrtoint ptr addrspace(3) [[TMP23]] to i32
+; CHECK-NEXT: [[TMP35:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP29]], i32 [[TMP34]]
+; CHECK-NEXT: store i8 7, ptr addrspace(1) [[TMP35]], align 1
+; CHECK-NEXT: br label [[CONDFREE1:%.*]]
+; CHECK: CondFree:
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: br i1 [[XYZCOND]], label [[FREE:%.*]], label [[END:%.*]]
+; CHECK: Free:
+; CHECK-NEXT: [[TMP30:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT: [[TMP31:%.*]] = ptrtoint ptr [[TMP30]] to i64
+; CHECK-NEXT: [[TMP32:%.*]] = ptrtoint ptr addrspace(1) [[TMP29]] to i64
+; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP32]], i64 [[TMP31]])
+; CHECK-NEXT: br label [[END]]
+; CHECK: End:
+; CHECK-NEXT: ret void
+;
+ call void @use_variables_1()
+ store i8 7, ptr addrspace(3) @lds_1, align 1
+ ret void
+}
+
+define amdgpu_kernel void @k1() sanitize_address {
+; CHECK-LABEL: define amdgpu_kernel void @k1(
+; CHECK-SAME: ) #[[ATTR1]] !llvm.amdgcn.lds.kernel.id [[META3:![0-9]+]] {
+; CHECK-NEXT: WId:
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z()
+; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
+; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP14:%.*]]
+; CHECK: Malloc:
+; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K1_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k1.md, i32 0, i32 2, i32 0), align 4
+; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K1_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k1.md, i32 0, i32 2, i32 2), align 4
+; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP9]], [[TMP7]]
+; CHECK-NEXT: [[TMP6:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds ptr addrspace(4), ptr addrspace(4) [[TMP6]], i64 15
+; CHECK-NEXT: store i32 [[TMP8]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K1_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k1.md, i32 0, i32 3, i32 0), align 4
+; CHECK-NEXT: [[TMP30:%.*]] = load i32, ptr addrspace(4) [[TMP10]], align 4
+; CHECK-NEXT: store i32 [[TMP30]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K1_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k1.md, i32 0, i32 3, i32 1), align 4
+; CHECK-NEXT: [[TMP33:%.*]] = add i32 [[TMP30]], 7
+; CHECK-NEXT: [[TMP18:%.*]] = udiv i32 [[TMP33]], 8
+; CHECK-NEXT: [[TMP19:%.*]] = mul i32 [[TMP18]], 8
+; CHECK-NEXT: store i32 [[TMP19]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K1_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k1.md, i32 0, i32 3, i32 2), align 4
+; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP8]], [[TMP19]]
+; CHECK-NEXT: store i32 [[TMP20]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K1_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k1.md, i32 0, i32 4, i32 0), align 4
+; CHECK-NEXT: [[TMP21:%.*]] = load i32, ptr addrspace(4) [[TMP10]], align 4
+; CHECK-NEXT: store i32 [[TMP21]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K1_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k1.md, i32 0, i32 4, i32 1), align 4
+; CHECK-NEXT: [[TMP22:%.*]] = add i32 [[TMP21]], 7
+; CHECK-NEXT: [[TMP23:%.*]] = udiv i32 [[TMP22]], 8
+; CHECK-NEXT: [[TMP24:%.*]] = mul i32 [[TMP23]], 8
+; CHECK-NEXT: store i32 [[TMP24]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K1_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k1.md, i32 0, i32 4, i32 2), align 4
+; CHECK-NEXT: [[TMP25:%.*]] = add i32 [[TMP20]], [[TMP24]]
+; CHECK-NEXT: [[TMP26:%.*]] = zext i32 [[TMP25]] to i64
+; CHECK-NEXT: [[TMP27:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT: [[TMP28:%.*]] = ptrtoint ptr [[TMP27]] to i64
+; CHECK-NEXT: [[TMP34:%.*]] = call i64 @__asan_malloc_impl(i64 [[TMP26]], i64 [[TMP28]])
+; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP34]] to ptr addrspace(1)
+; CHECK-NEXT: store ptr addrspace(1) [[TMP13]], ptr addrspace(3) @llvm.amdgcn.sw.lds.k1, align 8
+; CHECK-NEXT: [[TMP51:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP13]], i64 8
+; CHECK-NEXT: [[TMP53:%.*]] = ptrtoint ptr addrspace(1) [[TMP51]] to i64
+; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP53]], i64 24)
+; CHECK-NEXT: [[TMP54:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP13]], i64 33
+; CHECK-NEXT: [[TMP55:%.*]] = ptrtoint ptr addrspace(1) [[TMP54]] to i64
+; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP55]], i64 31)
+; CHECK-NEXT: [[TMP56:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP13]], i64 68
+; CHECK-NEXT: [[TMP57:%.*]] = ptrtoint ptr addrspace(1) [[TMP56]] to i64
+; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP57]], i64 28)
+; CHECK-NEXT: br label [[TMP14]]
+; CHECK: 32:
+; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ]
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: [[TMP29:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.k1, align 8
+; CHECK-NEXT: [[TMP31:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K1_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k1.md, i32 0, i32 3, i32 0), align 4
+; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr addrspace(3) @llvm.amdgcn.sw.lds.k1, i32 [[TMP31]]
+; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.k1.dynlds) ]
+; CHECK-NEXT: call void @use_variables_1()
+; CHECK-NEXT: call void @use_variables_2()
+; CHECK-NEXT: [[TMP58:%.*]] = ptrtoint ptr addrspace(3) [[TMP32]] to i32
+; CHECK-NEXT: [[TMP59:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP29]], i32 [[TMP58]]
+; CHECK-NEXT: store i8 3, ptr addrspace(1) [[TMP59]], align 4
+; CHECK-NEXT: br label [[CONDFREE1:%.*]]
+; CHECK: CondFree:
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: br i1 [[XYZCOND]], label [[FREE:%.*]], label [[END:%.*]]
+; CHECK: Free:
+; CHECK-NEXT: [[TMP35:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT: [[TMP36:%.*]] = ptrtoint ptr [[TMP35]] to i64
+; CHECK-NEXT: [[TMP37:%.*]] = ptrtoint ptr addrspace(1) [[TMP29]] to i64
+; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP37]], i64 [[TMP36]])
+; CHECK-NEXT: br label [[END]]
+; CHECK: End:
+; CHECK-NEXT: ret void
+;
+ call void @use_variables_1()
+ call void @use_variables_2()
+ store i8 3, ptr addrspace(3) @lds_3, align 4
+ ret void
+}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 4, !"nosanitize_address", i32 1}
+
+;.
+; CHECK: [[META2]] = !{i32 0}
+; CHECK: [[META3]] = !{i32 1}
+;.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-multiple-blocks-return-asan.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-multiple-blocks-return-asan.ll
new file mode 100644
index 00000000000000..07baf90e370d19
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-multiple-blocks-return-asan.ll
@@ -0,0 +1,112 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 4
+; RUN: opt < %s -passes=amdgpu-sw-lower-lds -S -mtriple=amdgcn-amd-amdhsa | FileCheck %s
+
+; Test to check that malloc and free blocks are placed correctly when multiple
+; blocks and branching are present in the function, and that LDS accesses are lowered correctly.
+
+ at lds_1 = internal addrspace(3) global i32 poison
+ at lds_2 = internal addrspace(3) global i32 poison
+
+;.
+; CHECK: @llvm.amdgcn.sw.lds.test_kernel = internal addrspace(3) global ptr poison, no_sanitize_address, align 4, !absolute_symbol [[META0:![0-9]+]]
+; CHECK: @llvm.amdgcn.sw.lds.test_kernel.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.test_kernel.md.type { %llvm.amdgcn.sw.lds.test_kernel.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.test_kernel.md.item { i32 32, i32 4, i32 32 }, %llvm.amdgcn.sw.lds.test_kernel.md.item { i32 64, i32 4, i32 32 } }, no_sanitize_address
+;.
+define amdgpu_kernel void @test_kernel() sanitize_address {
+; CHECK-LABEL: define amdgpu_kernel void @test_kernel(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: WId:
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z()
+; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
+; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP7:%.*]]
+; CHECK: Malloc:
+; CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_TEST_KERNEL_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.test_kernel.md, i32 0, i32 2, i32 0), align 4
+; CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_TEST_KERNEL_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.test_kernel.md, i32 0, i32 2, i32 2), align 4
+; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP15]], [[TMP16]]
+; CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[TMP18]] to i64
+; CHECK-NEXT: [[TMP14:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT: [[TMP19:%.*]] = ptrtoint ptr [[TMP14]] to i64
+; CHECK-NEXT: [[TMP20:%.*]] = call i64 @__asan_malloc_impl(i64 [[TMP17]], i64 [[TMP19]])
+; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP20]] to ptr addrspace(1)
+; CHECK-NEXT: store ptr addrspace(1) [[TMP6]], ptr addrspace(3) @llvm.amdgcn.sw.lds.test_kernel, align 8
+; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP6]], i64 8
+; CHECK-NEXT: [[TMP28:%.*]] = ptrtoint ptr addrspace(1) [[TMP27]] to i64
+; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP28]], i64 24)
+; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP6]], i64 36
+; CHECK-NEXT: [[TMP30:%.*]] = ptrtoint ptr addrspace(1) [[TMP29]] to i64
+; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP30]], i64 28)
+; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP6]], i64 68
+; CHECK-NEXT: [[TMP32:%.*]] = ptrtoint ptr addrspace(1) [[TMP31]] to i64
+; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP32]], i64 28)
+; CHECK-NEXT: br label [[TMP7]]
+; CHECK: 20:
+; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ]
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: [[TMP21:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.test_kernel, align 8
+; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_TEST_KERNEL_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.test_kernel.md, i32 0, i32 1, i32 0), align 4
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(3) @llvm.amdgcn.sw.lds.test_kernel, i32 [[TMP10]]
+; CHECK-NEXT: [[TMP25:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_TEST_KERNEL_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.test_kernel.md, i32 0, i32 2, i32 0), align 4
+; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr addrspace(3) @llvm.amdgcn.sw.lds.test_kernel, i32 [[TMP25]]
+; CHECK-NEXT: [[TMP12:%.*]] = addrspacecast ptr addrspace(3) [[TMP11]] to ptr addrspace(1)
+; CHECK-NEXT: [[VAL1:%.*]] = load i32, ptr addrspace(1) [[TMP12]], align 4
+; CHECK-NEXT: [[TMP13:%.*]] = addrspacecast ptr addrspace(3) [[TMP26]] to ptr addrspace(1)
+; CHECK-NEXT: [[VAL2:%.*]] = load i32, ptr addrspace(1) [[TMP13]], align 4
+; CHECK-NEXT: [[RESULT:%.*]] = add i32 [[VAL1]], [[VAL2]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[RESULT]], 0
+; CHECK-NEXT: br i1 [[CMP]], label [[POSITIVE:%.*]], label [[NEGATIVE:%.*]]
+; CHECK: positive:
+; CHECK-NEXT: br label [[CONDFREE:%.*]]
+; CHECK: negative:
+; CHECK-NEXT: [[CMP2:%.*]] = icmp sgt i32 [[VAL1]], 0
+; CHECK-NEXT: br i1 [[CMP2]], label [[VAL1_POSITIVE:%.*]], label [[VAL1_NEGATIVE:%.*]]
+; CHECK: val1_positive:
+; CHECK-NEXT: br label [[CONDFREE]]
+; CHECK: val1_negative:
+; CHECK-NEXT: br label [[CONDFREE]]
+; CHECK: CondFree:
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: br i1 [[XYZCOND]], label [[FREE:%.*]], label [[END:%.*]]
+; CHECK: Free:
+; CHECK-NEXT: [[TMP22:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT: [[TMP23:%.*]] = ptrtoint ptr [[TMP22]] to i64
+; CHECK-NEXT: [[TMP24:%.*]] = ptrtoint ptr addrspace(1) [[TMP21]] to i64
+; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP24]], i64 [[TMP23]])
+; CHECK-NEXT: br label [[END]]
+; CHECK: End:
+; CHECK-NEXT: ret void
+;
+%val1 = load i32, ptr addrspace(1) addrspacecast (ptr addrspace(3) @lds_1 to ptr addrspace(1))
+%val2 = load i32, ptr addrspace(1) addrspacecast (ptr addrspace(3) @lds_2 to ptr addrspace(1))
+
+%result = add i32 %val1, %val2
+%cmp = icmp sgt i32 %result, 0
+br i1 %cmp, label %positive, label %negative
+
+positive:
+ret void
+
+negative:
+%cmp2 = icmp sgt i32 %val1, 0
+br i1 %cmp2, label %val1_positive, label %val1_negative
+
+val1_positive:
+ret void
+
+val1_negative:
+ret void
+}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 4, !"nosanitize_address", i32 1}
+
+;.
+; CHECK: attributes #[[ATTR0]] = { sanitize_address "amdgpu-lds-size"="8" }
+; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) }
+; CHECK: attributes #[[ATTR3:[0-9]+]] = { convergent nocallback nofree nounwind willreturn }
+;.
+; CHECK: [[META0]] = !{i32 0, i32 1}
+;.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-multiple-blocks-return.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-multiple-blocks-return.ll
new file mode 100644
index 00000000000000..6848e2c06c1e1e
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-multiple-blocks-return.ll
@@ -0,0 +1,112 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 4
+; RUN: opt < %s -passes=amdgpu-sw-lower-lds -amdgpu-asan-instrument-lds=false -S -mtriple=amdgcn-amd-amdhsa | FileCheck %s
+
+; Test to check that malloc and free blocks are placed correctly when multiple
+; basic blocks and branching are present in the function, and that LDS accesses are lowered correctly.
+
+ at lds_1 = internal addrspace(3) global i32 poison
+ at lds_2 = internal addrspace(3) global i32 poison
+
+;.
+; CHECK: @llvm.amdgcn.sw.lds.test_kernel = internal addrspace(3) global ptr poison, no_sanitize_address, align 4, !absolute_symbol [[META0:![0-9]+]]
+; CHECK: @llvm.amdgcn.sw.lds.test_kernel.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.test_kernel.md.type { %llvm.amdgcn.sw.lds.test_kernel.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.test_kernel.md.item { i32 32, i32 4, i32 32 }, %llvm.amdgcn.sw.lds.test_kernel.md.item { i32 64, i32 4, i32 32 } }, no_sanitize_address
+;.
+define amdgpu_kernel void @test_kernel() sanitize_address {
+; CHECK-LABEL: define amdgpu_kernel void @test_kernel(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: WId:
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z()
+; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
+; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP7:%.*]]
+; CHECK: Malloc:
+; CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_TEST_KERNEL_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.test_kernel.md, i32 0, i32 2, i32 0), align 4
+; CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_TEST_KERNEL_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.test_kernel.md, i32 0, i32 2, i32 2), align 4
+; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP15]], [[TMP16]]
+; CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[TMP18]] to i64
+; CHECK-NEXT: [[TMP14:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT: [[TMP19:%.*]] = ptrtoint ptr [[TMP14]] to i64
+; CHECK-NEXT: [[TMP20:%.*]] = call i64 @__asan_malloc_impl(i64 [[TMP17]], i64 [[TMP19]])
+; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP20]] to ptr addrspace(1)
+; CHECK-NEXT: store ptr addrspace(1) [[TMP6]], ptr addrspace(3) @llvm.amdgcn.sw.lds.test_kernel, align 8
+; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP6]], i64 8
+; CHECK-NEXT: [[TMP28:%.*]] = ptrtoint ptr addrspace(1) [[TMP27]] to i64
+; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP28]], i64 24)
+; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP6]], i64 36
+; CHECK-NEXT: [[TMP30:%.*]] = ptrtoint ptr addrspace(1) [[TMP29]] to i64
+; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP30]], i64 28)
+; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP6]], i64 68
+; CHECK-NEXT: [[TMP32:%.*]] = ptrtoint ptr addrspace(1) [[TMP31]] to i64
+; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP32]], i64 28)
+; CHECK-NEXT: br label [[TMP7]]
+; CHECK: 20:
+; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ]
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: [[TMP21:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.test_kernel, align 8
+; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_TEST_KERNEL_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.test_kernel.md, i32 0, i32 1, i32 0), align 4
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(3) @llvm.amdgcn.sw.lds.test_kernel, i32 [[TMP10]]
+; CHECK-NEXT: [[TMP25:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_TEST_KERNEL_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.test_kernel.md, i32 0, i32 2, i32 0), align 4
+; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr addrspace(3) @llvm.amdgcn.sw.lds.test_kernel, i32 [[TMP25]]
+; CHECK-NEXT: [[TMP12:%.*]] = addrspacecast ptr addrspace(3) [[TMP11]] to ptr addrspace(1)
+; CHECK-NEXT: [[VAL1:%.*]] = load i32, ptr addrspace(1) [[TMP12]], align 4
+; CHECK-NEXT: [[TMP13:%.*]] = addrspacecast ptr addrspace(3) [[TMP26]] to ptr addrspace(1)
+; CHECK-NEXT: [[VAL2:%.*]] = load i32, ptr addrspace(1) [[TMP13]], align 4
+; CHECK-NEXT: [[RESULT:%.*]] = add i32 [[VAL1]], [[VAL2]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[RESULT]], 0
+; CHECK-NEXT: br i1 [[CMP]], label [[POSITIVE:%.*]], label [[NEGATIVE:%.*]]
+; CHECK: positive:
+; CHECK-NEXT: br label [[CONDFREE:%.*]]
+; CHECK: negative:
+; CHECK-NEXT: [[CMP2:%.*]] = icmp sgt i32 [[VAL1]], 0
+; CHECK-NEXT: br i1 [[CMP2]], label [[VAL1_POSITIVE:%.*]], label [[VAL1_NEGATIVE:%.*]]
+; CHECK: val1_positive:
+; CHECK-NEXT: br label [[CONDFREE]]
+; CHECK: val1_negative:
+; CHECK-NEXT: br label [[CONDFREE]]
+; CHECK: CondFree:
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: br i1 [[XYZCOND]], label [[FREE:%.*]], label [[END:%.*]]
+; CHECK: Free:
+; CHECK-NEXT: [[TMP22:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT: [[TMP23:%.*]] = ptrtoint ptr [[TMP22]] to i64
+; CHECK-NEXT: [[TMP24:%.*]] = ptrtoint ptr addrspace(1) [[TMP21]] to i64
+; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP24]], i64 [[TMP23]])
+; CHECK-NEXT: br label [[END]]
+; CHECK: End:
+; CHECK-NEXT: ret void
+;
+%val1 = load i32, ptr addrspace(1) addrspacecast (ptr addrspace(3) @lds_1 to ptr addrspace(1))
+%val2 = load i32, ptr addrspace(1) addrspacecast (ptr addrspace(3) @lds_2 to ptr addrspace(1))
+
+%result = add i32 %val1, %val2
+%cmp = icmp sgt i32 %result, 0
+br i1 %cmp, label %positive, label %negative
+
+positive:
+ret void
+
+negative:
+%cmp2 = icmp sgt i32 %val1, 0
+br i1 %cmp2, label %val1_positive, label %val1_negative
+
+val1_positive:
+ret void
+
+val1_negative:
+ret void
+}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 4, !"nosanitize_address", i32 1}
+
+;.
+; CHECK: attributes #[[ATTR0]] = { sanitize_address "amdgpu-lds-size"="8" }
+; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) }
+; CHECK: attributes #[[ATTR3:[0-9]+]] = { convergent nocallback nofree nounwind willreturn }
+;.
+; CHECK: [[META0]] = !{i32 0, i32 1}
+;.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-dynamic-indirect-access-asan.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-dynamic-indirect-access-asan.ll
new file mode 100644
index 00000000000000..40b1305a3b12c8
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-dynamic-indirect-access-asan.ll
@@ -0,0 +1,261 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 4
+; RUN: opt < %s -passes=amdgpu-sw-lower-lds -S -mtriple=amdgcn-amd-amdhsa | FileCheck %s
+
+; Test to check if static and dynamic LDS accesses are lowered correctly when a non-kernel
+; function is called from a kernel.
+ at lds_1 = internal addrspace(3) global [1 x i8] poison, align 1
+ at lds_2 = internal addrspace(3) global [1 x i32] poison, align 2
+ at lds_3 = external addrspace(3) global [0 x i8], align 4
+ at lds_4 = external addrspace(3) global [0 x i8], align 8
+
+;.
+; CHECK: @llvm.amdgcn.sw.lds.k0 = internal addrspace(3) global ptr poison, no_sanitize_address, align 8, !absolute_symbol [[META0:![0-9]+]]
+; CHECK: @llvm.amdgcn.k0.dynlds = external addrspace(3) global [0 x i8], no_sanitize_address, align 8, !absolute_symbol [[META1:![0-9]+]]
+; CHECK: @llvm.amdgcn.sw.lds.k0.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.k0.md.type { %llvm.amdgcn.sw.lds.k0.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 32, i32 1, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 64, i32 4, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 96, i32 0, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 128, i32 0, i32 32 } }, no_sanitize_address
+; @llvm.amdgcn.sw.lds.base.table = internal addrspace(1) constant [1 x ptr addrspace(3)] [ptr addrspace(3) @llvm.amdgcn.sw.lds.k0], no_sanitize_address
+; @llvm.amdgcn.sw.lds.offset.table = internal addrspace(1) constant [1 x [2 x ptr addrspace(1)]] [[2 x ptr addrspace(1)] [ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.k0.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 3, i32 0), ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.k0.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 4, i32 0)]], no_sanitize_address
+;.
+define void @use_variables() sanitize_address {
+; CHECK-LABEL: define void @use_variables(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id()
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x ptr addrspace(3)], ptr addrspace(1) @llvm.amdgcn.sw.lds.base.table, i32 0, i32 [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(3), ptr addrspace(1) [[TMP2]], align 4
+; CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(3) [[TMP4]], align 8
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x [2 x ptr addrspace(1)]], ptr addrspace(1) @llvm.amdgcn.sw.lds.offset.table, i32 0, i32 [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(1) [[TMP6]], align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(1) [[TMP5]], align 4
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP4]], i32 [[TMP8]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1 x [2 x ptr addrspace(1)]], ptr addrspace(1) @llvm.amdgcn.sw.lds.offset.table, i32 0, i32 [[TMP1]], i32 1
+; CHECK-NEXT: [[TMP12:%.*]] = load ptr addrspace(1), ptr addrspace(1) [[TMP11]], align 8
+; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(1) [[TMP12]], align 4
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP4]], i32 [[TMP10]]
+; CHECK-NEXT: [[TMP13:%.*]] = ptrtoint ptr addrspace(3) [[TMP9]] to i32
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP7]], i32 [[TMP13]]
+; CHECK-NEXT: [[TMP47:%.*]] = ptrtoint ptr addrspace(1) [[TMP14]] to i64
+; CHECK-NEXT: [[TMP16:%.*]] = lshr i64 [[TMP47]], 3
+; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[TMP16]], 2147450880
+; CHECK-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
+; CHECK-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1
+; CHECK-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP19]], 0
+; CHECK-NEXT: [[TMP21:%.*]] = and i64 [[TMP47]], 7
+; CHECK-NEXT: [[TMP22:%.*]] = trunc i64 [[TMP21]] to i8
+; CHECK-NEXT: [[TMP23:%.*]] = icmp sge i8 [[TMP22]], [[TMP19]]
+; CHECK-NEXT: [[TMP24:%.*]] = and i1 [[TMP20]], [[TMP23]]
+; CHECK-NEXT: [[TMP25:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[TMP24]])
+; CHECK-NEXT: [[TMP26:%.*]] = icmp ne i64 [[TMP25]], 0
+; CHECK-NEXT: br i1 [[TMP26]], label [[ASAN_REPORT:%.*]], label [[TMP29:%.*]], !prof [[PROF3:![0-9]+]]
+; CHECK: asan.report:
+; CHECK-NEXT: br i1 [[TMP24]], label [[TMP27:%.*]], label [[TMP28:%.*]]
+; CHECK: 27:
+; CHECK-NEXT: call void @__asan_report_store1(i64 [[TMP47]]) #[[ATTR7:[0-9]+]]
+; CHECK-NEXT: call void @llvm.amdgcn.unreachable()
+; CHECK-NEXT: br label [[TMP28]]
+; CHECK: 28:
+; CHECK-NEXT: br label [[TMP29]]
+; CHECK: 29:
+; CHECK-NEXT: store i8 3, ptr addrspace(1) [[TMP14]], align 4
+; CHECK-NEXT: [[TMP30:%.*]] = ptrtoint ptr addrspace(3) [[TMP15]] to i32
+; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP7]], i32 [[TMP30]]
+; CHECK-NEXT: [[TMP32:%.*]] = ptrtoint ptr addrspace(1) [[TMP31]] to i64
+; CHECK-NEXT: [[TMP33:%.*]] = lshr i64 [[TMP32]], 3
+; CHECK-NEXT: [[TMP34:%.*]] = add i64 [[TMP33]], 2147450880
+; CHECK-NEXT: [[TMP35:%.*]] = inttoptr i64 [[TMP34]] to ptr
+; CHECK-NEXT: [[TMP36:%.*]] = load i8, ptr [[TMP35]], align 1
+; CHECK-NEXT: [[TMP37:%.*]] = icmp ne i8 [[TMP36]], 0
+; CHECK-NEXT: [[TMP38:%.*]] = and i64 [[TMP32]], 7
+; CHECK-NEXT: [[TMP39:%.*]] = trunc i64 [[TMP38]] to i8
+; CHECK-NEXT: [[TMP40:%.*]] = icmp sge i8 [[TMP39]], [[TMP36]]
+; CHECK-NEXT: [[TMP41:%.*]] = and i1 [[TMP37]], [[TMP40]]
+; CHECK-NEXT: [[TMP42:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[TMP41]])
+; CHECK-NEXT: [[TMP43:%.*]] = icmp ne i64 [[TMP42]], 0
+; CHECK-NEXT: br i1 [[TMP43]], label [[ASAN_REPORT1:%.*]], label [[TMP46:%.*]], !prof [[PROF3]]
+; CHECK: asan.report1:
+; CHECK-NEXT: br i1 [[TMP41]], label [[TMP44:%.*]], label [[TMP45:%.*]]
+; CHECK: 44:
+; CHECK-NEXT: call void @__asan_report_store1(i64 [[TMP32]]) #[[ATTR7]]
+; CHECK-NEXT: call void @llvm.amdgcn.unreachable()
+; CHECK-NEXT: br label [[TMP45]]
+; CHECK: 45:
+; CHECK-NEXT: br label [[TMP46]]
+; CHECK: 46:
+; CHECK-NEXT: store i8 3, ptr addrspace(1) [[TMP31]], align 8
+; CHECK-NEXT: ret void
+;
+ store i8 3, ptr addrspace(3) @lds_3, align 4
+ store i8 3, ptr addrspace(3) @lds_4, align 8
+ ret void
+}
+
+define amdgpu_kernel void @k0() sanitize_address {
+; CHECK-LABEL: define amdgpu_kernel void @k0(
+; CHECK-SAME: ) #[[ATTR1:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META4:![0-9]+]] {
+; CHECK-NEXT: WId:
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z()
+; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
+; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP21:%.*]]
+; CHECK: Malloc:
+; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 2, i32 0), align 4
+; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 2, i32 2), align 4
+; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP9]], [[TMP7]]
+; CHECK-NEXT: [[TMP6:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds ptr addrspace(4), ptr addrspace(4) [[TMP6]], i64 15
+; CHECK-NEXT: store i32 [[TMP8]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 3, i32 0), align 4
+; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr addrspace(4) [[TMP10]], align 4
+; CHECK-NEXT: store i32 [[TMP11]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 3, i32 1), align 4
+; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP11]], 7
+; CHECK-NEXT: [[TMP13:%.*]] = udiv i32 [[TMP12]], 8
+; CHECK-NEXT: [[TMP14:%.*]] = mul i32 [[TMP13]], 8
+; CHECK-NEXT: store i32 [[TMP14]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 3, i32 2), align 4
+; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP8]], [[TMP14]]
+; CHECK-NEXT: store i32 [[TMP15]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 4, i32 0), align 4
+; CHECK-NEXT: [[TMP27:%.*]] = load i32, ptr addrspace(4) [[TMP10]], align 4
+; CHECK-NEXT: store i32 [[TMP27]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 4, i32 1), align 4
+; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP27]], 7
+; CHECK-NEXT: [[TMP18:%.*]] = udiv i32 [[TMP17]], 8
+; CHECK-NEXT: [[TMP19:%.*]] = mul i32 [[TMP18]], 8
+; CHECK-NEXT: store i32 [[TMP19]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 4, i32 2), align 4
+; CHECK-NEXT: [[TMP28:%.*]] = add i32 [[TMP15]], [[TMP19]]
+; CHECK-NEXT: [[TMP26:%.*]] = zext i32 [[TMP28]] to i64
+; CHECK-NEXT: [[TMP22:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT: [[TMP23:%.*]] = ptrtoint ptr [[TMP22]] to i64
+; CHECK-NEXT: [[TMP35:%.*]] = call i64 @__asan_malloc_impl(i64 [[TMP26]], i64 [[TMP23]])
+; CHECK-NEXT: [[TMP20:%.*]] = inttoptr i64 [[TMP35]] to ptr addrspace(1)
+; CHECK-NEXT: store ptr addrspace(1) [[TMP20]], ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, align 8
+; CHECK-NEXT: [[TMP36:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP20]], i64 8
+; CHECK-NEXT: [[TMP37:%.*]] = ptrtoint ptr addrspace(1) [[TMP36]] to i64
+; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP37]], i64 24)
+; CHECK-NEXT: [[TMP53:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP20]], i64 33
+; CHECK-NEXT: [[TMP73:%.*]] = ptrtoint ptr addrspace(1) [[TMP53]] to i64
+; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP73]], i64 31)
+; CHECK-NEXT: [[TMP74:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP20]], i64 68
+; CHECK-NEXT: [[TMP75:%.*]] = ptrtoint ptr addrspace(1) [[TMP74]] to i64
+; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP75]], i64 28)
+; CHECK-NEXT: br label [[TMP21]]
+; CHECK: 32:
+; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ]
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: [[TMP31:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, align 8
+; CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 1, i32 0), align 4
+; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i8, ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, i32 [[TMP24]]
+; CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 2, i32 0), align 4
+; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, i32 [[TMP29]]
+; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.k0.dynlds) ]
+; CHECK-NEXT: call void @use_variables()
+; CHECK-NEXT: [[TMP38:%.*]] = ptrtoint ptr addrspace(3) [[TMP25]] to i32
+; CHECK-NEXT: [[TMP39:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP31]], i32 [[TMP38]]
+; CHECK-NEXT: [[TMP40:%.*]] = ptrtoint ptr addrspace(1) [[TMP39]] to i64
+; CHECK-NEXT: [[TMP41:%.*]] = lshr i64 [[TMP40]], 3
+; CHECK-NEXT: [[TMP42:%.*]] = add i64 [[TMP41]], 2147450880
+; CHECK-NEXT: [[TMP43:%.*]] = inttoptr i64 [[TMP42]] to ptr
+; CHECK-NEXT: [[TMP44:%.*]] = load i8, ptr [[TMP43]], align 1
+; CHECK-NEXT: [[TMP45:%.*]] = icmp ne i8 [[TMP44]], 0
+; CHECK-NEXT: [[TMP46:%.*]] = and i64 [[TMP40]], 7
+; CHECK-NEXT: [[TMP47:%.*]] = trunc i64 [[TMP46]] to i8
+; CHECK-NEXT: [[TMP48:%.*]] = icmp sge i8 [[TMP47]], [[TMP44]]
+; CHECK-NEXT: [[TMP49:%.*]] = and i1 [[TMP45]], [[TMP48]]
+; CHECK-NEXT: [[TMP50:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[TMP49]])
+; CHECK-NEXT: [[TMP51:%.*]] = icmp ne i64 [[TMP50]], 0
+; CHECK-NEXT: br i1 [[TMP51]], label [[ASAN_REPORT:%.*]], label [[TMP54:%.*]], !prof [[PROF3]]
+; CHECK: asan.report:
+; CHECK-NEXT: br i1 [[TMP49]], label [[TMP52:%.*]], label [[CONDFREE:%.*]]
+; CHECK: 52:
+; CHECK-NEXT: call void @__asan_report_store1(i64 [[TMP40]]) #[[ATTR7]]
+; CHECK-NEXT: call void @llvm.amdgcn.unreachable()
+; CHECK-NEXT: br label [[CONDFREE]]
+; CHECK: 53:
+; CHECK-NEXT: br label [[TMP54]]
+; CHECK: 54:
+; CHECK-NEXT: store i8 7, ptr addrspace(1) [[TMP39]], align 1
+; CHECK-NEXT: [[TMP55:%.*]] = ptrtoint ptr addrspace(3) [[TMP30]] to i32
+; CHECK-NEXT: [[TMP56:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP31]], i32 [[TMP55]]
+; CHECK-NEXT: [[TMP57:%.*]] = ptrtoint ptr addrspace(1) [[TMP56]] to i64
+; CHECK-NEXT: [[TMP63:%.*]] = add i64 [[TMP57]], 3
+; CHECK-NEXT: [[TMP90:%.*]] = inttoptr i64 [[TMP63]] to ptr addrspace(1)
+; CHECK-NEXT: [[TMP91:%.*]] = ptrtoint ptr addrspace(1) [[TMP56]] to i64
+; CHECK-NEXT: [[TMP58:%.*]] = lshr i64 [[TMP91]], 3
+; CHECK-NEXT: [[TMP59:%.*]] = add i64 [[TMP58]], 2147450880
+; CHECK-NEXT: [[TMP60:%.*]] = inttoptr i64 [[TMP59]] to ptr
+; CHECK-NEXT: [[TMP61:%.*]] = load i8, ptr [[TMP60]], align 1
+; CHECK-NEXT: [[TMP62:%.*]] = icmp ne i8 [[TMP61]], 0
+; CHECK-NEXT: [[TMP64:%.*]] = and i64 [[TMP91]], 7
+; CHECK-NEXT: [[TMP65:%.*]] = trunc i64 [[TMP64]] to i8
+; CHECK-NEXT: [[TMP66:%.*]] = icmp sge i8 [[TMP65]], [[TMP61]]
+; CHECK-NEXT: [[TMP67:%.*]] = and i1 [[TMP62]], [[TMP66]]
+; CHECK-NEXT: [[TMP68:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[TMP67]])
+; CHECK-NEXT: [[TMP69:%.*]] = icmp ne i64 [[TMP68]], 0
+; CHECK-NEXT: br i1 [[TMP69]], label [[ASAN_REPORT1:%.*]], label [[TMP72:%.*]], !prof [[PROF3]]
+; CHECK: asan.report1:
+; CHECK-NEXT: br i1 [[TMP67]], label [[TMP70:%.*]], label [[TMP71:%.*]]
+; CHECK: 72:
+; CHECK-NEXT: call void @__asan_report_store1(i64 [[TMP91]]) #[[ATTR7]]
+; CHECK-NEXT: call void @llvm.amdgcn.unreachable()
+; CHECK-NEXT: br label [[TMP71]]
+; CHECK: 73:
+; CHECK-NEXT: br label [[TMP72]]
+; CHECK: 74:
+; CHECK-NEXT: [[TMP92:%.*]] = ptrtoint ptr addrspace(1) [[TMP90]] to i64
+; CHECK-NEXT: [[TMP76:%.*]] = lshr i64 [[TMP92]], 3
+; CHECK-NEXT: [[TMP77:%.*]] = add i64 [[TMP76]], 2147450880
+; CHECK-NEXT: [[TMP78:%.*]] = inttoptr i64 [[TMP77]] to ptr
+; CHECK-NEXT: [[TMP79:%.*]] = load i8, ptr [[TMP78]], align 1
+; CHECK-NEXT: [[TMP80:%.*]] = icmp ne i8 [[TMP79]], 0
+; CHECK-NEXT: [[TMP81:%.*]] = and i64 [[TMP92]], 7
+; CHECK-NEXT: [[TMP82:%.*]] = trunc i64 [[TMP81]] to i8
+; CHECK-NEXT: [[TMP83:%.*]] = icmp sge i8 [[TMP82]], [[TMP79]]
+; CHECK-NEXT: [[TMP84:%.*]] = and i1 [[TMP80]], [[TMP83]]
+; CHECK-NEXT: [[TMP85:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[TMP84]])
+; CHECK-NEXT: [[TMP86:%.*]] = icmp ne i64 [[TMP85]], 0
+; CHECK-NEXT: br i1 [[TMP86]], label [[ASAN_REPORT2:%.*]], label [[TMP89:%.*]], !prof [[PROF3]]
+; CHECK: asan.report2:
+; CHECK-NEXT: br i1 [[TMP84]], label [[TMP87:%.*]], label [[TMP88:%.*]]
+; CHECK: 87:
+; CHECK-NEXT: call void @__asan_report_store1(i64 [[TMP92]]) #[[ATTR7]]
+; CHECK-NEXT: call void @llvm.amdgcn.unreachable()
+; CHECK-NEXT: br label [[TMP88]]
+; CHECK: 88:
+; CHECK-NEXT: br label [[TMP89]]
+; CHECK: 89:
+; CHECK-NEXT: store i32 8, ptr addrspace(1) [[TMP56]], align 2
+; CHECK-NEXT: br label [[CONDFREE1:%.*]]
+; CHECK: CondFree:
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: br i1 [[XYZCOND]], label [[FREE:%.*]], label [[END:%.*]]
+; CHECK: Free:
+; CHECK-NEXT: [[TMP32:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT: [[TMP33:%.*]] = ptrtoint ptr [[TMP32]] to i64
+; CHECK-NEXT: [[TMP34:%.*]] = ptrtoint ptr addrspace(1) [[TMP31]] to i64
+; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP34]], i64 [[TMP33]])
+; CHECK-NEXT: br label [[END]]
+; CHECK: End:
+; CHECK-NEXT: ret void
+;
+ call void @use_variables()
+ store i8 7, ptr addrspace(3) @lds_1, align 1
+ store i32 8, ptr addrspace(3) @lds_2, align 2
+ ret void
+}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 4, !"nosanitize_address", i32 1}
+
+;.
+; CHECK: attributes #[[ATTR0]] = { sanitize_address }
+; CHECK: attributes #[[ATTR1]] = { sanitize_address "amdgpu-lds-size"="8,8" }
+; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) }
+; CHECK: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+; CHECK: attributes #[[ATTR4:[0-9]+]] = { convergent nocallback nofree nounwind willreturn }
+; CHECK: attributes #[[ATTR5:[0-9]+]] = { convergent nocallback nofree nounwind willreturn memory(none) }
+; CHECK: attributes #[[ATTR6:[0-9]+]] = { convergent nocallback nofree nounwind }
+; CHECK: attributes #[[ATTR7]] = { nomerge }
+;.
+; CHECK: [[META0]] = !{i32 0, i32 1}
+; CHECK: [[META1]] = !{i32 8, i32 9}
+; CHECK: [[META2:![0-9]+]] = !{i32 4, !"nosanitize_address", i32 1}
+; CHECK: [[PROF3]] = !{!"branch_weights", i32 1, i32 1048575}
+; CHECK: [[META4]] = !{i32 0}
+;.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-dynamic-indirect-access.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-dynamic-indirect-access.ll
new file mode 100644
index 00000000000000..0cc49c94e22793
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-dynamic-indirect-access.ll
@@ -0,0 +1,143 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 4
+; RUN: opt < %s -passes=amdgpu-sw-lower-lds -amdgpu-asan-instrument-lds=false -S -mtriple=amdgcn-amd-amdhsa | FileCheck %s
+
+; Test to check if static and dynamic LDS accesses are lowered correctly when a non-kernel
+; function is called from a kernel.
+ at lds_1 = internal addrspace(3) global [1 x i8] poison, align 1
+ at lds_2 = internal addrspace(3) global [1 x i32] poison, align 2
+ at lds_3 = external addrspace(3) global [0 x i8], align 4
+ at lds_4 = external addrspace(3) global [0 x i8], align 8
+
+;.
+; CHECK: @llvm.amdgcn.sw.lds.k0 = internal addrspace(3) global ptr poison, no_sanitize_address, align 8, !absolute_symbol [[META0:![0-9]+]]
+; CHECK: @llvm.amdgcn.k0.dynlds = external addrspace(3) global [0 x i8], no_sanitize_address, align 8, !absolute_symbol [[META1:![0-9]+]]
+; CHECK: @llvm.amdgcn.sw.lds.k0.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.k0.md.type { %llvm.amdgcn.sw.lds.k0.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 32, i32 1, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 64, i32 4, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 96, i32 0, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 128, i32 0, i32 32 } }, no_sanitize_address
+; @llvm.amdgcn.sw.lds.base.table = internal addrspace(1) constant [1 x ptr addrspace(3)] [ptr addrspace(3) @llvm.amdgcn.sw.lds.k0], no_sanitize_address
+; @llvm.amdgcn.sw.lds.offset.table = internal addrspace(1) constant [1 x [2 x ptr addrspace(1)]] [[2 x ptr addrspace(1)] [ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.k0.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 3, i32 0), ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.k0.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 4, i32 0)]], no_sanitize_address
+;.
+define void @use_variables() sanitize_address {
+; CHECK-LABEL: define void @use_variables(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id()
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x ptr addrspace(3)], ptr addrspace(1) @llvm.amdgcn.sw.lds.base.table, i32 0, i32 [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(3), ptr addrspace(1) [[TMP2]], align 4
+; CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(3) [[TMP4]], align 8
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x [2 x ptr addrspace(1)]], ptr addrspace(1) @llvm.amdgcn.sw.lds.offset.table, i32 0, i32 [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(1) [[TMP6]], align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(1) [[TMP5]], align 4
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP4]], i32 [[TMP8]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1 x [2 x ptr addrspace(1)]], ptr addrspace(1) @llvm.amdgcn.sw.lds.offset.table, i32 0, i32 [[TMP1]], i32 1
+; CHECK-NEXT: [[TMP12:%.*]] = load ptr addrspace(1), ptr addrspace(1) [[TMP11]], align 8
+; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(1) [[TMP12]], align 4
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP4]], i32 [[TMP10]]
+; CHECK-NEXT: [[TMP13:%.*]] = ptrtoint ptr addrspace(3) [[TMP9]] to i32
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP7]], i32 [[TMP13]]
+; CHECK-NEXT: store i8 3, ptr addrspace(1) [[TMP14]], align 4
+; CHECK-NEXT: [[TMP30:%.*]] = ptrtoint ptr addrspace(3) [[TMP15]] to i32
+; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP7]], i32 [[TMP30]]
+; CHECK-NEXT: store i8 3, ptr addrspace(1) [[TMP31]], align 8
+; CHECK-NEXT: ret void
+;
+ store i8 3, ptr addrspace(3) @lds_3, align 4
+ store i8 3, ptr addrspace(3) @lds_4, align 8
+ ret void
+}
+
+define amdgpu_kernel void @k0() sanitize_address {
+; CHECK-LABEL: define amdgpu_kernel void @k0(
+; CHECK-SAME: ) #[[ATTR1:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META2:![0-9]+]] {
+; CHECK-NEXT: WId:
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z()
+; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
+; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP21:%.*]]
+; CHECK: Malloc:
+; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 2, i32 0), align 4
+; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 2, i32 2), align 4
+; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP9]], [[TMP7]]
+; CHECK-NEXT: [[TMP6:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds ptr addrspace(4), ptr addrspace(4) [[TMP6]], i64 15
+; CHECK-NEXT: store i32 [[TMP8]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 3, i32 0), align 4
+; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr addrspace(4) [[TMP10]], align 4
+; CHECK-NEXT: store i32 [[TMP11]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 3, i32 1), align 4
+; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP11]], 7
+; CHECK-NEXT: [[TMP13:%.*]] = udiv i32 [[TMP12]], 8
+; CHECK-NEXT: [[TMP14:%.*]] = mul i32 [[TMP13]], 8
+; CHECK-NEXT: store i32 [[TMP14]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 3, i32 2), align 4
+; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP8]], [[TMP14]]
+; CHECK-NEXT: store i32 [[TMP15]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 4, i32 0), align 4
+; CHECK-NEXT: [[TMP27:%.*]] = load i32, ptr addrspace(4) [[TMP10]], align 4
+; CHECK-NEXT: store i32 [[TMP27]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 4, i32 1), align 4
+; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP27]], 7
+; CHECK-NEXT: [[TMP18:%.*]] = udiv i32 [[TMP17]], 8
+; CHECK-NEXT: [[TMP19:%.*]] = mul i32 [[TMP18]], 8
+; CHECK-NEXT: store i32 [[TMP19]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 4, i32 2), align 4
+; CHECK-NEXT: [[TMP28:%.*]] = add i32 [[TMP15]], [[TMP19]]
+; CHECK-NEXT: [[TMP26:%.*]] = zext i32 [[TMP28]] to i64
+; CHECK-NEXT: [[TMP22:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT: [[TMP23:%.*]] = ptrtoint ptr [[TMP22]] to i64
+; CHECK-NEXT: [[TMP35:%.*]] = call i64 @__asan_malloc_impl(i64 [[TMP26]], i64 [[TMP23]])
+; CHECK-NEXT: [[TMP20:%.*]] = inttoptr i64 [[TMP35]] to ptr addrspace(1)
+; CHECK-NEXT: store ptr addrspace(1) [[TMP20]], ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, align 8
+; CHECK-NEXT: [[TMP36:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP20]], i64 8
+; CHECK-NEXT: [[TMP37:%.*]] = ptrtoint ptr addrspace(1) [[TMP36]] to i64
+; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP37]], i64 24)
+; CHECK-NEXT: [[TMP53:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP20]], i64 33
+; CHECK-NEXT: [[TMP73:%.*]] = ptrtoint ptr addrspace(1) [[TMP53]] to i64
+; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP73]], i64 31)
+; CHECK-NEXT: [[TMP74:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP20]], i64 68
+; CHECK-NEXT: [[TMP75:%.*]] = ptrtoint ptr addrspace(1) [[TMP74]] to i64
+; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP75]], i64 28)
+; CHECK-NEXT: br label [[TMP21]]
+; CHECK: 32:
+; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ]
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: [[TMP31:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, align 8
+; CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 1, i32 0), align 4
+; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i8, ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, i32 [[TMP24]]
+; CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 2, i32 0), align 4
+; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, i32 [[TMP29]]
+; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.k0.dynlds) ]
+; CHECK-NEXT: call void @use_variables()
+; CHECK-NEXT: [[TMP38:%.*]] = ptrtoint ptr addrspace(3) [[TMP25]] to i32
+; CHECK-NEXT: [[TMP39:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP31]], i32 [[TMP38]]
+; CHECK-NEXT: store i8 7, ptr addrspace(1) [[TMP39]], align 1
+; CHECK-NEXT: [[TMP55:%.*]] = ptrtoint ptr addrspace(3) [[TMP30]] to i32
+; CHECK-NEXT: [[TMP56:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP31]], i32 [[TMP55]]
+; CHECK-NEXT: store i32 8, ptr addrspace(1) [[TMP56]], align 2
+; CHECK-NEXT: br label [[CONDFREE1:%.*]]
+; CHECK: CondFree:
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: br i1 [[XYZCOND]], label [[FREE:%.*]], label [[END:%.*]]
+; CHECK: Free:
+; CHECK-NEXT: [[TMP32:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT: [[TMP33:%.*]] = ptrtoint ptr [[TMP32]] to i64
+; CHECK-NEXT: [[TMP34:%.*]] = ptrtoint ptr addrspace(1) [[TMP31]] to i64
+; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP34]], i64 [[TMP33]])
+; CHECK-NEXT: br label [[END]]
+; CHECK: End:
+; CHECK-NEXT: ret void
+;
+ call void @use_variables()
+ store i8 7, ptr addrspace(3) @lds_1, align 1
+ store i32 8, ptr addrspace(3) @lds_2, align 2
+ ret void
+}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 4, !"nosanitize_address", i32 1}
+
+;.
+; CHECK: attributes #[[ATTR0]] = { sanitize_address }
+; CHECK: attributes #[[ATTR1]] = { sanitize_address "amdgpu-lds-size"="8,8" }
+; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) }
+; CHECK: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+; CHECK: attributes #[[ATTR4:[0-9]+]] = { convergent nocallback nofree nounwind willreturn }
+;.
+; CHECK: [[META0]] = !{i32 0, i32 1}
+; CHECK: [[META1]] = !{i32 8, i32 9}
+; CHECK: [[META2]] = !{i32 0}
+;.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-dynamic-lds-test-asan.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-dynamic-lds-test-asan.ll
new file mode 100644
index 00000000000000..f2cdc4c812db18
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-dynamic-lds-test-asan.ll
@@ -0,0 +1,213 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 4
+; RUN: opt < %s -passes=amdgpu-sw-lower-lds -S -mtriple=amdgcn-amd-amdhsa | FileCheck %s
+
+; Test to check if static and dynamic LDS accesses are lowered correctly in the kernel.
+ at lds_1 = internal addrspace(3) global [1 x i8] poison, align 4
+ at lds_2 = internal addrspace(3) global [1 x i32] poison, align 8
+ at lds_3 = external addrspace(3) global [0 x i8], align 4
+ at lds_4 = external addrspace(3) global [0 x i8], align 8
+
+;.
+; CHECK: @llvm.amdgcn.sw.lds.k0 = internal addrspace(3) global ptr poison, no_sanitize_address, align 8, !absolute_symbol [[META0:![0-9]+]]
+; CHECK: @llvm.amdgcn.k0.dynlds = external addrspace(3) global [0 x i8], no_sanitize_address, align 8, !absolute_symbol [[META1:![0-9]+]]
+; CHECK: @llvm.amdgcn.sw.lds.k0.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.k0.md.type { %llvm.amdgcn.sw.lds.k0.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 32, i32 1, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 64, i32 4, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 96, i32 0, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 128, i32 0, i32 32 } }, no_sanitize_address
+;.
+define amdgpu_kernel void @k0() sanitize_address {
+; CHECK-LABEL: define amdgpu_kernel void @k0(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: WId:
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z()
+; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
+; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP21:%.*]]
+; CHECK: Malloc:
+; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 2, i32 0), align 4
+; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 2, i32 2), align 4
+; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP9]], [[TMP7]]
+; CHECK-NEXT: [[TMP6:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds ptr addrspace(4), ptr addrspace(4) [[TMP6]], i64 15
+; CHECK-NEXT: store i32 [[TMP8]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 3, i32 0), align 4
+; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr addrspace(4) [[TMP10]], align 4
+; CHECK-NEXT: store i32 [[TMP11]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 3, i32 1), align 4
+; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP11]], 7
+; CHECK-NEXT: [[TMP13:%.*]] = udiv i32 [[TMP12]], 8
+; CHECK-NEXT: [[TMP14:%.*]] = mul i32 [[TMP13]], 8
+; CHECK-NEXT: store i32 [[TMP14]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 3, i32 2), align 4
+; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP8]], [[TMP14]]
+; CHECK-NEXT: store i32 [[TMP15]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 4, i32 0), align 4
+; CHECK-NEXT: [[TMP31:%.*]] = load i32, ptr addrspace(4) [[TMP10]], align 4
+; CHECK-NEXT: store i32 [[TMP31]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 4, i32 1), align 4
+; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP31]], 7
+; CHECK-NEXT: [[TMP18:%.*]] = udiv i32 [[TMP17]], 8
+; CHECK-NEXT: [[TMP19:%.*]] = mul i32 [[TMP18]], 8
+; CHECK-NEXT: store i32 [[TMP19]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 4, i32 2), align 4
+; CHECK-NEXT: [[TMP32:%.*]] = add i32 [[TMP15]], [[TMP19]]
+; CHECK-NEXT: [[TMP30:%.*]] = zext i32 [[TMP32]] to i64
+; CHECK-NEXT: [[TMP22:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT: [[TMP23:%.*]] = ptrtoint ptr [[TMP22]] to i64
+; CHECK-NEXT: [[TMP39:%.*]] = call i64 @__asan_malloc_impl(i64 [[TMP30]], i64 [[TMP23]])
+; CHECK-NEXT: [[TMP20:%.*]] = inttoptr i64 [[TMP39]] to ptr addrspace(1)
+; CHECK-NEXT: store ptr addrspace(1) [[TMP20]], ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, align 8
+; CHECK-NEXT: [[TMP40:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP20]], i64 8
+; CHECK-NEXT: [[TMP41:%.*]] = ptrtoint ptr addrspace(1) [[TMP40]] to i64
+; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP41]], i64 24)
+; CHECK-NEXT: [[TMP57:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP20]], i64 33
+; CHECK-NEXT: [[TMP111:%.*]] = ptrtoint ptr addrspace(1) [[TMP57]] to i64
+; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP111]], i64 31)
+; CHECK-NEXT: [[TMP112:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP20]], i64 68
+; CHECK-NEXT: [[TMP113:%.*]] = ptrtoint ptr addrspace(1) [[TMP112]] to i64
+; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP113]], i64 28)
+; CHECK-NEXT: br label [[TMP21]]
+; CHECK: 32:
+; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ]
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: [[TMP35:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, align 8
+; CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 1, i32 0), align 4
+; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i8, ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, i32 [[TMP24]]
+; CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 2, i32 0), align 4
+; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, i32 [[TMP26]]
+; CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 3, i32 0), align 4
+; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i8, ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, i32 [[TMP28]]
+; CHECK-NEXT: [[TMP33:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 4, i32 0), align 4
+; CHECK-NEXT: [[TMP34:%.*]] = getelementptr inbounds i8, ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, i32 [[TMP33]]
+; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.k0.dynlds) ]
+; CHECK-NEXT: [[TMP42:%.*]] = ptrtoint ptr addrspace(3) [[TMP25]] to i32
+; CHECK-NEXT: [[TMP43:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP35]], i32 [[TMP42]]
+; CHECK-NEXT: [[TMP44:%.*]] = ptrtoint ptr addrspace(1) [[TMP43]] to i64
+; CHECK-NEXT: [[TMP45:%.*]] = lshr i64 [[TMP44]], 3
+; CHECK-NEXT: [[TMP46:%.*]] = add i64 [[TMP45]], 2147450880
+; CHECK-NEXT: [[TMP47:%.*]] = inttoptr i64 [[TMP46]] to ptr
+; CHECK-NEXT: [[TMP48:%.*]] = load i8, ptr [[TMP47]], align 1
+; CHECK-NEXT: [[TMP49:%.*]] = icmp ne i8 [[TMP48]], 0
+; CHECK-NEXT: [[TMP50:%.*]] = and i64 [[TMP44]], 7
+; CHECK-NEXT: [[TMP51:%.*]] = trunc i64 [[TMP50]] to i8
+; CHECK-NEXT: [[TMP52:%.*]] = icmp sge i8 [[TMP51]], [[TMP48]]
+; CHECK-NEXT: [[TMP53:%.*]] = and i1 [[TMP49]], [[TMP52]]
+; CHECK-NEXT: [[TMP54:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[TMP53]])
+; CHECK-NEXT: [[TMP55:%.*]] = icmp ne i64 [[TMP54]], 0
+; CHECK-NEXT: br i1 [[TMP55]], label [[ASAN_REPORT:%.*]], label [[TMP58:%.*]], !prof [[PROF2:![0-9]+]]
+; CHECK: asan.report:
+; CHECK-NEXT: br i1 [[TMP53]], label [[TMP56:%.*]], label [[CONDFREE:%.*]]
+; CHECK: 56:
+; CHECK-NEXT: call void @__asan_report_store1(i64 [[TMP44]]) #[[ATTR6:[0-9]+]]
+; CHECK-NEXT: call void @llvm.amdgcn.unreachable()
+; CHECK-NEXT: br label [[CONDFREE]]
+; CHECK: 57:
+; CHECK-NEXT: br label [[TMP58]]
+; CHECK: 58:
+; CHECK-NEXT: store i8 7, ptr addrspace(1) [[TMP43]], align 4
+; CHECK-NEXT: [[TMP59:%.*]] = ptrtoint ptr addrspace(3) [[TMP27]] to i32
+; CHECK-NEXT: [[TMP60:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP35]], i32 [[TMP59]]
+; CHECK-NEXT: [[TMP61:%.*]] = ptrtoint ptr addrspace(1) [[TMP60]] to i64
+; CHECK-NEXT: [[TMP62:%.*]] = lshr i64 [[TMP61]], 3
+; CHECK-NEXT: [[TMP63:%.*]] = add i64 [[TMP62]], 2147450880
+; CHECK-NEXT: [[TMP64:%.*]] = inttoptr i64 [[TMP63]] to ptr
+; CHECK-NEXT: [[TMP65:%.*]] = load i8, ptr [[TMP64]], align 1
+; CHECK-NEXT: [[TMP66:%.*]] = icmp ne i8 [[TMP65]], 0
+; CHECK-NEXT: [[TMP67:%.*]] = and i64 [[TMP61]], 7
+; CHECK-NEXT: [[TMP68:%.*]] = add i64 [[TMP67]], 3
+; CHECK-NEXT: [[TMP69:%.*]] = trunc i64 [[TMP68]] to i8
+; CHECK-NEXT: [[TMP70:%.*]] = icmp sge i8 [[TMP69]], [[TMP65]]
+; CHECK-NEXT: [[TMP71:%.*]] = and i1 [[TMP66]], [[TMP70]]
+; CHECK-NEXT: [[TMP72:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[TMP71]])
+; CHECK-NEXT: [[TMP73:%.*]] = icmp ne i64 [[TMP72]], 0
+; CHECK-NEXT: br i1 [[TMP73]], label [[ASAN_REPORT1:%.*]], label [[TMP76:%.*]], !prof [[PROF2]]
+; CHECK: asan.report1:
+; CHECK-NEXT: br i1 [[TMP71]], label [[TMP74:%.*]], label [[TMP75:%.*]]
+; CHECK: 74:
+; CHECK-NEXT: call void @__asan_report_store4(i64 [[TMP61]]) #[[ATTR6]]
+; CHECK-NEXT: call void @llvm.amdgcn.unreachable()
+; CHECK-NEXT: br label [[TMP75]]
+; CHECK: 75:
+; CHECK-NEXT: br label [[TMP76]]
+; CHECK: 76:
+; CHECK-NEXT: store i32 8, ptr addrspace(1) [[TMP60]], align 8
+; CHECK-NEXT: [[TMP77:%.*]] = ptrtoint ptr addrspace(3) [[TMP29]] to i32
+; CHECK-NEXT: [[TMP78:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP35]], i32 [[TMP77]]
+; CHECK-NEXT: [[TMP79:%.*]] = ptrtoint ptr addrspace(1) [[TMP78]] to i64
+; CHECK-NEXT: [[TMP80:%.*]] = lshr i64 [[TMP79]], 3
+; CHECK-NEXT: [[TMP81:%.*]] = add i64 [[TMP80]], 2147450880
+; CHECK-NEXT: [[TMP82:%.*]] = inttoptr i64 [[TMP81]] to ptr
+; CHECK-NEXT: [[TMP83:%.*]] = load i8, ptr [[TMP82]], align 1
+; CHECK-NEXT: [[TMP84:%.*]] = icmp ne i8 [[TMP83]], 0
+; CHECK-NEXT: [[TMP85:%.*]] = and i64 [[TMP79]], 7
+; CHECK-NEXT: [[TMP86:%.*]] = trunc i64 [[TMP85]] to i8
+; CHECK-NEXT: [[TMP87:%.*]] = icmp sge i8 [[TMP86]], [[TMP83]]
+; CHECK-NEXT: [[TMP88:%.*]] = and i1 [[TMP84]], [[TMP87]]
+; CHECK-NEXT: [[TMP89:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[TMP88]])
+; CHECK-NEXT: [[TMP90:%.*]] = icmp ne i64 [[TMP89]], 0
+; CHECK-NEXT: br i1 [[TMP90]], label [[ASAN_REPORT2:%.*]], label [[TMP93:%.*]], !prof [[PROF2]]
+; CHECK: asan.report2:
+; CHECK-NEXT: br i1 [[TMP88]], label [[TMP91:%.*]], label [[TMP92:%.*]]
+; CHECK: 91:
+; CHECK-NEXT: call void @__asan_report_store1(i64 [[TMP79]]) #[[ATTR6]]
+; CHECK-NEXT: call void @llvm.amdgcn.unreachable()
+; CHECK-NEXT: br label [[TMP92]]
+; CHECK: 92:
+; CHECK-NEXT: br label [[TMP93]]
+; CHECK: 93:
+; CHECK-NEXT: store i8 7, ptr addrspace(1) [[TMP78]], align 4
+; CHECK-NEXT: [[TMP94:%.*]] = ptrtoint ptr addrspace(3) [[TMP34]] to i32
+; CHECK-NEXT: [[TMP95:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP35]], i32 [[TMP94]]
+; CHECK-NEXT: [[TMP96:%.*]] = ptrtoint ptr addrspace(1) [[TMP95]] to i64
+; CHECK-NEXT: [[TMP97:%.*]] = lshr i64 [[TMP96]], 3
+; CHECK-NEXT: [[TMP98:%.*]] = add i64 [[TMP97]], 2147450880
+; CHECK-NEXT: [[TMP99:%.*]] = inttoptr i64 [[TMP98]] to ptr
+; CHECK-NEXT: [[TMP100:%.*]] = load i8, ptr [[TMP99]], align 1
+; CHECK-NEXT: [[TMP101:%.*]] = icmp ne i8 [[TMP100]], 0
+; CHECK-NEXT: [[TMP102:%.*]] = and i64 [[TMP96]], 7
+; CHECK-NEXT: [[TMP103:%.*]] = trunc i64 [[TMP102]] to i8
+; CHECK-NEXT: [[TMP104:%.*]] = icmp sge i8 [[TMP103]], [[TMP100]]
+; CHECK-NEXT: [[TMP105:%.*]] = and i1 [[TMP101]], [[TMP104]]
+; CHECK-NEXT: [[TMP106:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[TMP105]])
+; CHECK-NEXT: [[TMP107:%.*]] = icmp ne i64 [[TMP106]], 0
+; CHECK-NEXT: br i1 [[TMP107]], label [[ASAN_REPORT3:%.*]], label [[TMP110:%.*]], !prof [[PROF2]]
+; CHECK: asan.report3:
+; CHECK-NEXT: br i1 [[TMP105]], label [[TMP108:%.*]], label [[TMP109:%.*]]
+; CHECK: 108:
+; CHECK-NEXT: call void @__asan_report_store1(i64 [[TMP96]]) #[[ATTR6]]
+; CHECK-NEXT: call void @llvm.amdgcn.unreachable()
+; CHECK-NEXT: br label [[TMP109]]
+; CHECK: 109:
+; CHECK-NEXT: br label [[TMP110]]
+; CHECK: 110:
+; CHECK-NEXT: store i8 8, ptr addrspace(1) [[TMP95]], align 8
+; CHECK-NEXT: br label [[CONDFREE1:%.*]]
+; CHECK: CondFree:
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: br i1 [[XYZCOND]], label [[FREE:%.*]], label [[END:%.*]]
+; CHECK: Free:
+; CHECK-NEXT: [[TMP36:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT: [[TMP37:%.*]] = ptrtoint ptr [[TMP36]] to i64
+; CHECK-NEXT: [[TMP38:%.*]] = ptrtoint ptr addrspace(1) [[TMP35]] to i64
+; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP38]], i64 [[TMP37]])
+; CHECK-NEXT: br label [[END]]
+; CHECK: End:
+; CHECK-NEXT: ret void
+;
+ store i8 7, ptr addrspace(3) @lds_1, align 4
+ store i32 8, ptr addrspace(3) @lds_2, align 8
+ store i8 7, ptr addrspace(3) @lds_3, align 4
+ store i8 8, ptr addrspace(3) @lds_4, align 8
+ ret void
+}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 4, !"nosanitize_address", i32 1}
+
+;.
+; CHECK: attributes #[[ATTR0]] = { sanitize_address "amdgpu-lds-size"="8,8" }
+; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) }
+; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+; CHECK: attributes #[[ATTR3:[0-9]+]] = { convergent nocallback nofree nounwind willreturn }
+; CHECK: attributes #[[ATTR4:[0-9]+]] = { convergent nocallback nofree nounwind willreturn memory(none) }
+; CHECK: attributes #[[ATTR5:[0-9]+]] = { convergent nocallback nofree nounwind }
+; CHECK: attributes #[[ATTR6]] = { nomerge }
+;.
+; CHECK: [[META0]] = !{i32 0, i32 1}
+; CHECK: [[META1]] = !{i32 8, i32 9}
+; CHECK: [[PROF2]] = !{!"branch_weights", i32 1, i32 1048575}
+;.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-dynamic-lds-test.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-dynamic-lds-test.ll
new file mode 100644
index 00000000000000..e0bfca0f63ca7e
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-dynamic-lds-test.ll
@@ -0,0 +1,120 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 4
+; RUN: opt < %s -passes=amdgpu-sw-lower-lds -amdgpu-asan-instrument-lds=false -S -mtriple=amdgcn-amd-amdhsa | FileCheck %s
+
+; Test to check if static and dynamic LDS accesses are lowered correctly in the kernel.
+ at lds_1 = internal addrspace(3) global [1 x i8] poison, align 4
+ at lds_2 = internal addrspace(3) global [1 x i32] poison, align 8
+ at lds_3 = external addrspace(3) global [0 x i8], align 4
+ at lds_4 = external addrspace(3) global [0 x i8], align 8
+
+;.
+; CHECK: @llvm.amdgcn.sw.lds.k0 = internal addrspace(3) global ptr poison, no_sanitize_address, align 8, !absolute_symbol [[META0:![0-9]+]]
+; CHECK: @llvm.amdgcn.k0.dynlds = external addrspace(3) global [0 x i8], no_sanitize_address, align 8, !absolute_symbol [[META1:![0-9]+]]
+; CHECK: @llvm.amdgcn.sw.lds.k0.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.k0.md.type { %llvm.amdgcn.sw.lds.k0.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 32, i32 1, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 64, i32 4, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 96, i32 0, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 128, i32 0, i32 32 } }, no_sanitize_address
+;.
+define amdgpu_kernel void @k0() sanitize_address {
+; CHECK-LABEL: define amdgpu_kernel void @k0(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: WId:
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z()
+; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
+; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP21:%.*]]
+; CHECK: Malloc:
+; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 2, i32 0), align 4
+; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 2, i32 2), align 4
+; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP9]], [[TMP7]]
+; CHECK-NEXT: [[TMP6:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds ptr addrspace(4), ptr addrspace(4) [[TMP6]], i64 15
+; CHECK-NEXT: store i32 [[TMP8]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 3, i32 0), align 4
+; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr addrspace(4) [[TMP10]], align 4
+; CHECK-NEXT: store i32 [[TMP11]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 3, i32 1), align 4
+; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP11]], 7
+; CHECK-NEXT: [[TMP13:%.*]] = udiv i32 [[TMP12]], 8
+; CHECK-NEXT: [[TMP14:%.*]] = mul i32 [[TMP13]], 8
+; CHECK-NEXT: store i32 [[TMP14]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 3, i32 2), align 4
+; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP8]], [[TMP14]]
+; CHECK-NEXT: store i32 [[TMP15]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 4, i32 0), align 4
+; CHECK-NEXT: [[TMP31:%.*]] = load i32, ptr addrspace(4) [[TMP10]], align 4
+; CHECK-NEXT: store i32 [[TMP31]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 4, i32 1), align 4
+; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP31]], 7
+; CHECK-NEXT: [[TMP18:%.*]] = udiv i32 [[TMP17]], 8
+; CHECK-NEXT: [[TMP19:%.*]] = mul i32 [[TMP18]], 8
+; CHECK-NEXT: store i32 [[TMP19]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 4, i32 2), align 4
+; CHECK-NEXT: [[TMP32:%.*]] = add i32 [[TMP15]], [[TMP19]]
+; CHECK-NEXT: [[TMP30:%.*]] = zext i32 [[TMP32]] to i64
+; CHECK-NEXT: [[TMP22:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT: [[TMP23:%.*]] = ptrtoint ptr [[TMP22]] to i64
+; CHECK-NEXT: [[TMP39:%.*]] = call i64 @__asan_malloc_impl(i64 [[TMP30]], i64 [[TMP23]])
+; CHECK-NEXT: [[TMP20:%.*]] = inttoptr i64 [[TMP39]] to ptr addrspace(1)
+; CHECK-NEXT: store ptr addrspace(1) [[TMP20]], ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, align 8
+; CHECK-NEXT: [[TMP40:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP20]], i64 8
+; CHECK-NEXT: [[TMP41:%.*]] = ptrtoint ptr addrspace(1) [[TMP40]] to i64
+; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP41]], i64 24)
+; CHECK-NEXT: [[TMP57:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP20]], i64 33
+; CHECK-NEXT: [[TMP111:%.*]] = ptrtoint ptr addrspace(1) [[TMP57]] to i64
+; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP111]], i64 31)
+; CHECK-NEXT: [[TMP112:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP20]], i64 68
+; CHECK-NEXT: [[TMP113:%.*]] = ptrtoint ptr addrspace(1) [[TMP112]] to i64
+; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP113]], i64 28)
+; CHECK-NEXT: br label [[TMP21]]
+; CHECK: 32:
+; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ]
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: [[TMP35:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, align 8
+; CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 1, i32 0), align 4
+; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i8, ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, i32 [[TMP24]]
+; CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 2, i32 0), align 4
+; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, i32 [[TMP26]]
+; CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 3, i32 0), align 4
+; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i8, ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, i32 [[TMP28]]
+; CHECK-NEXT: [[TMP33:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 4, i32 0), align 4
+; CHECK-NEXT: [[TMP34:%.*]] = getelementptr inbounds i8, ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, i32 [[TMP33]]
+; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.k0.dynlds) ]
+; CHECK-NEXT: [[TMP42:%.*]] = ptrtoint ptr addrspace(3) [[TMP25]] to i32
+; CHECK-NEXT: [[TMP43:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP35]], i32 [[TMP42]]
+; CHECK-NEXT: store i8 7, ptr addrspace(1) [[TMP43]], align 4
+; CHECK-NEXT: [[TMP59:%.*]] = ptrtoint ptr addrspace(3) [[TMP27]] to i32
+; CHECK-NEXT: [[TMP60:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP35]], i32 [[TMP59]]
+; CHECK-NEXT: store i32 8, ptr addrspace(1) [[TMP60]], align 8
+; CHECK-NEXT: [[TMP77:%.*]] = ptrtoint ptr addrspace(3) [[TMP29]] to i32
+; CHECK-NEXT: [[TMP78:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP35]], i32 [[TMP77]]
+; CHECK-NEXT: store i8 7, ptr addrspace(1) [[TMP78]], align 4
+; CHECK-NEXT: [[TMP94:%.*]] = ptrtoint ptr addrspace(3) [[TMP34]] to i32
+; CHECK-NEXT: [[TMP95:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP35]], i32 [[TMP94]]
+; CHECK-NEXT: store i8 8, ptr addrspace(1) [[TMP95]], align 8
+; CHECK-NEXT: br label [[CONDFREE1:%.*]]
+; CHECK: CondFree:
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: br i1 [[XYZCOND]], label [[FREE:%.*]], label [[END:%.*]]
+; CHECK: Free:
+; CHECK-NEXT: [[TMP36:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT: [[TMP37:%.*]] = ptrtoint ptr [[TMP36]] to i64
+; CHECK-NEXT: [[TMP38:%.*]] = ptrtoint ptr addrspace(1) [[TMP35]] to i64
+; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP38]], i64 [[TMP37]])
+; CHECK-NEXT: br label [[END]]
+; CHECK: End:
+; CHECK-NEXT: ret void
+;
+ store i8 7, ptr addrspace(3) @lds_1, align 4
+ store i32 8, ptr addrspace(3) @lds_2, align 8
+ store i8 7, ptr addrspace(3) @lds_3, align 4
+ store i8 8, ptr addrspace(3) @lds_4, align 8
+ ret void
+}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 4, !"nosanitize_address", i32 1}
+
+;.
+; CHECK: attributes #[[ATTR0]] = { sanitize_address "amdgpu-lds-size"="8,8" }
+; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) }
+; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+; CHECK: attributes #[[ATTR3:[0-9]+]] = { convergent nocallback nofree nounwind willreturn }
+;.
+; CHECK: [[META0]] = !{i32 0, i32 1}
+; CHECK: [[META1]] = !{i32 8, i32 9}
+;.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-asan.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-asan.ll
new file mode 100644
index 00000000000000..3a05f93df35a30
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-asan.ll
@@ -0,0 +1,224 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 4
+; RUN: opt < %s -passes=amdgpu-sw-lower-lds -S -mtriple=amdgcn-amd-amdhsa | FileCheck %s
+
+; Test to check if static LDS is lowered correctly when a non-kernel with LDS accesses is called from a kernel.
+ at lds_1 = internal addrspace(3) global [1 x i8] poison, align 1
+ at lds_2 = internal addrspace(3) global [1 x i32] poison, align 2
+ at lds_3 = external addrspace(3) global [3 x i8], align 4
+ at lds_4 = external addrspace(3) global [4 x i8], align 8
+
+;.
+; CHECK: @llvm.amdgcn.sw.lds.k0 = internal addrspace(3) global ptr poison, no_sanitize_address, align 8, !absolute_symbol [[META0:![0-9]+]]
+; CHECK: @llvm.amdgcn.sw.lds.k0.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.k0.md.type { %llvm.amdgcn.sw.lds.k0.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 32, i32 1, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 64, i32 4, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 96, i32 3, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 128, i32 4, i32 32 } }, no_sanitize_address
+; @llvm.amdgcn.sw.lds.base.table = internal addrspace(1) constant [1 x ptr addrspace(3)] [ptr addrspace(3) @llvm.amdgcn.sw.lds.k0], no_sanitize_address
+; @llvm.amdgcn.sw.lds.offset.table = internal addrspace(1) constant [1 x [2 x ptr addrspace(1)]] [[2 x ptr addrspace(1)] [ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.k0.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 3, i32 0), ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.k0.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 4, i32 0)]], no_sanitize_address
+;.
+define void @use_variables() sanitize_address {
+; CHECK-LABEL: define void @use_variables(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id()
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x ptr addrspace(3)], ptr addrspace(1) @llvm.amdgcn.sw.lds.base.table, i32 0, i32 [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(3), ptr addrspace(1) [[TMP2]], align 4
+; CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(3) [[TMP4]], align 8
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x [2 x ptr addrspace(1)]], ptr addrspace(1) @llvm.amdgcn.sw.lds.offset.table, i32 0, i32 [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(1) [[TMP6]], align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(1) [[TMP5]], align 4
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP4]], i32 [[TMP8]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1 x [2 x ptr addrspace(1)]], ptr addrspace(1) @llvm.amdgcn.sw.lds.offset.table, i32 0, i32 [[TMP1]], i32 1
+; CHECK-NEXT: [[TMP12:%.*]] = load ptr addrspace(1), ptr addrspace(1) [[TMP11]], align 8
+; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(1) [[TMP12]], align 4
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP4]], i32 [[TMP10]]
+; CHECK-NEXT: [[X:%.*]] = addrspacecast ptr addrspace(3) [[TMP9]] to ptr
+; CHECK-NEXT: [[TMP16:%.*]] = addrspacecast ptr addrspace(3) [[TMP9]] to ptr
+; CHECK-NEXT: store i8 3, ptr [[TMP16]], align 4
+; CHECK-NEXT: [[TMP14:%.*]] = ptrtoint ptr addrspace(3) [[TMP15]] to i32
+; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP7]], i32 [[TMP14]]
+; CHECK-NEXT: [[TMP32:%.*]] = ptrtoint ptr addrspace(1) [[TMP31]] to i64
+; CHECK-NEXT: [[TMP17:%.*]] = lshr i64 [[TMP32]], 3
+; CHECK-NEXT: [[TMP18:%.*]] = add i64 [[TMP17]], 2147450880
+; CHECK-NEXT: [[TMP19:%.*]] = inttoptr i64 [[TMP18]] to ptr
+; CHECK-NEXT: [[TMP20:%.*]] = load i8, ptr [[TMP19]], align 1
+; CHECK-NEXT: [[TMP21:%.*]] = icmp ne i8 [[TMP20]], 0
+; CHECK-NEXT: [[TMP22:%.*]] = and i64 [[TMP32]], 7
+; CHECK-NEXT: [[TMP23:%.*]] = trunc i64 [[TMP22]] to i8
+; CHECK-NEXT: [[TMP24:%.*]] = icmp sge i8 [[TMP23]], [[TMP20]]
+; CHECK-NEXT: [[TMP25:%.*]] = and i1 [[TMP21]], [[TMP24]]
+; CHECK-NEXT: [[TMP26:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[TMP25]])
+; CHECK-NEXT: [[TMP27:%.*]] = icmp ne i64 [[TMP26]], 0
+; CHECK-NEXT: br i1 [[TMP27]], label [[ASAN_REPORT:%.*]], label [[TMP30:%.*]], !prof [[PROF2:![0-9]+]]
+; CHECK: asan.report:
+; CHECK-NEXT: br i1 [[TMP25]], label [[TMP28:%.*]], label [[TMP29:%.*]]
+; CHECK: 28:
+; CHECK-NEXT: call void @__asan_report_store1(i64 [[TMP32]]) #[[ATTR7:[0-9]+]]
+; CHECK-NEXT: call void @llvm.amdgcn.unreachable()
+; CHECK-NEXT: br label [[TMP29]]
+; CHECK: 29:
+; CHECK-NEXT: br label [[TMP30]]
+; CHECK: 30:
+; CHECK-NEXT: store i8 3, ptr addrspace(1) [[TMP31]], align 8
+; CHECK-NEXT: ret void
+;
+ %X = addrspacecast ptr addrspace(3) @lds_3 to ptr
+ store i8 3, ptr addrspacecast( ptr addrspace(3) @lds_3 to ptr), align 4
+ store i8 3, ptr addrspace(3) @lds_4, align 8
+ ret void
+}
+
+define amdgpu_kernel void @k0() sanitize_address {
+; CHECK-LABEL: define amdgpu_kernel void @k0(
+; CHECK-SAME: ) #[[ATTR1:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META3:![0-9]+]] {
+; CHECK-NEXT: WId:
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z()
+; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
+; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP7:%.*]]
+; CHECK: Malloc:
+; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 4, i32 0), align 4
+; CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 4, i32 2), align 4
+; CHECK-NEXT: [[TMP16:%.*]] = add i32 [[TMP13]], [[TMP14]]
+; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP16]] to i64
+; CHECK-NEXT: [[TMP23:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT: [[TMP24:%.*]] = ptrtoint ptr [[TMP23]] to i64
+; CHECK-NEXT: [[TMP12:%.*]] = call i64 @__asan_malloc_impl(i64 [[TMP15]], i64 [[TMP24]])
+; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP12]] to ptr addrspace(1)
+; CHECK-NEXT: store ptr addrspace(1) [[TMP6]], ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, align 8
+; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP6]], i64 8
+; CHECK-NEXT: [[TMP26:%.*]] = ptrtoint ptr addrspace(1) [[TMP25]] to i64
+; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP26]], i64 24)
+; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP6]], i64 33
+; CHECK-NEXT: [[TMP28:%.*]] = ptrtoint ptr addrspace(1) [[TMP27]] to i64
+; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP28]], i64 31)
+; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP6]], i64 68
+; CHECK-NEXT: [[TMP45:%.*]] = ptrtoint ptr addrspace(1) [[TMP29]] to i64
+; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP45]], i64 28)
+; CHECK-NEXT: [[TMP65:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP6]], i64 99
+; CHECK-NEXT: [[TMP66:%.*]] = ptrtoint ptr addrspace(1) [[TMP65]] to i64
+; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP66]], i64 29)
+; CHECK-NEXT: [[TMP67:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP6]], i64 132
+; CHECK-NEXT: [[TMP68:%.*]] = ptrtoint ptr addrspace(1) [[TMP67]] to i64
+; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP68]], i64 28)
+; CHECK-NEXT: br label [[TMP7]]
+; CHECK: 24:
+; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ]
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: [[TMP19:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, align 8
+; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 1, i32 0), align 4
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, i32 [[TMP10]]
+; CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 2, i32 0), align 4
+; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, i32 [[TMP17]]
+; CHECK-NEXT: call void @use_variables()
+; CHECK-NEXT: [[TMP30:%.*]] = ptrtoint ptr addrspace(3) [[TMP11]] to i32
+; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP19]], i32 [[TMP30]]
+; CHECK-NEXT: [[TMP32:%.*]] = ptrtoint ptr addrspace(1) [[TMP31]] to i64
+; CHECK-NEXT: [[TMP33:%.*]] = lshr i64 [[TMP32]], 3
+; CHECK-NEXT: [[TMP34:%.*]] = add i64 [[TMP33]], 2147450880
+; CHECK-NEXT: [[TMP35:%.*]] = inttoptr i64 [[TMP34]] to ptr
+; CHECK-NEXT: [[TMP36:%.*]] = load i8, ptr [[TMP35]], align 1
+; CHECK-NEXT: [[TMP37:%.*]] = icmp ne i8 [[TMP36]], 0
+; CHECK-NEXT: [[TMP38:%.*]] = and i64 [[TMP32]], 7
+; CHECK-NEXT: [[TMP39:%.*]] = trunc i64 [[TMP38]] to i8
+; CHECK-NEXT: [[TMP40:%.*]] = icmp sge i8 [[TMP39]], [[TMP36]]
+; CHECK-NEXT: [[TMP41:%.*]] = and i1 [[TMP37]], [[TMP40]]
+; CHECK-NEXT: [[TMP42:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[TMP41]])
+; CHECK-NEXT: [[TMP43:%.*]] = icmp ne i64 [[TMP42]], 0
+; CHECK-NEXT: br i1 [[TMP43]], label [[ASAN_REPORT:%.*]], label [[TMP46:%.*]], !prof [[PROF2]]
+; CHECK: asan.report:
+; CHECK-NEXT: br i1 [[TMP41]], label [[TMP44:%.*]], label [[CONDFREE:%.*]]
+; CHECK: 44:
+; CHECK-NEXT: call void @__asan_report_store1(i64 [[TMP32]]) #[[ATTR7]]
+; CHECK-NEXT: call void @llvm.amdgcn.unreachable()
+; CHECK-NEXT: br label [[CONDFREE]]
+; CHECK: 45:
+; CHECK-NEXT: br label [[TMP46]]
+; CHECK: 46:
+; CHECK-NEXT: store i8 7, ptr addrspace(1) [[TMP31]], align 1
+; CHECK-NEXT: [[TMP47:%.*]] = ptrtoint ptr addrspace(3) [[TMP18]] to i32
+; CHECK-NEXT: [[TMP48:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP19]], i32 [[TMP47]]
+; CHECK-NEXT: [[TMP49:%.*]] = ptrtoint ptr addrspace(1) [[TMP48]] to i64
+; CHECK-NEXT: [[TMP55:%.*]] = add i64 [[TMP49]], 3
+; CHECK-NEXT: [[TMP82:%.*]] = inttoptr i64 [[TMP55]] to ptr addrspace(1)
+; CHECK-NEXT: [[TMP83:%.*]] = ptrtoint ptr addrspace(1) [[TMP48]] to i64
+; CHECK-NEXT: [[TMP50:%.*]] = lshr i64 [[TMP83]], 3
+; CHECK-NEXT: [[TMP51:%.*]] = add i64 [[TMP50]], 2147450880
+; CHECK-NEXT: [[TMP52:%.*]] = inttoptr i64 [[TMP51]] to ptr
+; CHECK-NEXT: [[TMP53:%.*]] = load i8, ptr [[TMP52]], align 1
+; CHECK-NEXT: [[TMP54:%.*]] = icmp ne i8 [[TMP53]], 0
+; CHECK-NEXT: [[TMP56:%.*]] = and i64 [[TMP83]], 7
+; CHECK-NEXT: [[TMP57:%.*]] = trunc i64 [[TMP56]] to i8
+; CHECK-NEXT: [[TMP58:%.*]] = icmp sge i8 [[TMP57]], [[TMP53]]
+; CHECK-NEXT: [[TMP59:%.*]] = and i1 [[TMP54]], [[TMP58]]
+; CHECK-NEXT: [[TMP60:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[TMP59]])
+; CHECK-NEXT: [[TMP61:%.*]] = icmp ne i64 [[TMP60]], 0
+; CHECK-NEXT: br i1 [[TMP61]], label [[ASAN_REPORT1:%.*]], label [[TMP64:%.*]], !prof [[PROF2]]
+; CHECK: asan.report1:
+; CHECK-NEXT: br i1 [[TMP59]], label [[TMP62:%.*]], label [[TMP63:%.*]]
+; CHECK: 64:
+; CHECK-NEXT: call void @__asan_report_store1(i64 [[TMP83]]) #[[ATTR7]]
+; CHECK-NEXT: call void @llvm.amdgcn.unreachable()
+; CHECK-NEXT: br label [[TMP63]]
+; CHECK: 65:
+; CHECK-NEXT: br label [[TMP64]]
+; CHECK: 66:
+; CHECK-NEXT: [[TMP84:%.*]] = ptrtoint ptr addrspace(1) [[TMP82]] to i64
+; CHECK-NEXT: [[TMP85:%.*]] = lshr i64 [[TMP84]], 3
+; CHECK-NEXT: [[TMP69:%.*]] = add i64 [[TMP85]], 2147450880
+; CHECK-NEXT: [[TMP70:%.*]] = inttoptr i64 [[TMP69]] to ptr
+; CHECK-NEXT: [[TMP71:%.*]] = load i8, ptr [[TMP70]], align 1
+; CHECK-NEXT: [[TMP72:%.*]] = icmp ne i8 [[TMP71]], 0
+; CHECK-NEXT: [[TMP73:%.*]] = and i64 [[TMP84]], 7
+; CHECK-NEXT: [[TMP74:%.*]] = trunc i64 [[TMP73]] to i8
+; CHECK-NEXT: [[TMP75:%.*]] = icmp sge i8 [[TMP74]], [[TMP71]]
+; CHECK-NEXT: [[TMP76:%.*]] = and i1 [[TMP72]], [[TMP75]]
+; CHECK-NEXT: [[TMP77:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[TMP76]])
+; CHECK-NEXT: [[TMP78:%.*]] = icmp ne i64 [[TMP77]], 0
+; CHECK-NEXT: br i1 [[TMP78]], label [[ASAN_REPORT2:%.*]], label [[TMP81:%.*]], !prof [[PROF2]]
+; CHECK: asan.report2:
+; CHECK-NEXT: br i1 [[TMP76]], label [[TMP79:%.*]], label [[TMP80:%.*]]
+; CHECK: 79:
+; CHECK-NEXT: call void @__asan_report_store1(i64 [[TMP84]]) #[[ATTR7]]
+; CHECK-NEXT: call void @llvm.amdgcn.unreachable()
+; CHECK-NEXT: br label [[TMP80]]
+; CHECK: 80:
+; CHECK-NEXT: br label [[TMP81]]
+; CHECK: 81:
+; CHECK-NEXT: store i32 8, ptr addrspace(1) [[TMP48]], align 2
+; CHECK-NEXT: br label [[CONDFREE1:%.*]]
+; CHECK: CondFree:
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: br i1 [[XYZCOND]], label [[FREE:%.*]], label [[END:%.*]]
+; CHECK: Free:
+; CHECK-NEXT: [[TMP20:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT: [[TMP21:%.*]] = ptrtoint ptr [[TMP20]] to i64
+; CHECK-NEXT: [[TMP22:%.*]] = ptrtoint ptr addrspace(1) [[TMP19]] to i64
+; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP22]], i64 [[TMP21]])
+; CHECK-NEXT: br label [[END]]
+; CHECK: End:
+; CHECK-NEXT: ret void
+;
+ call void @use_variables()
+ store i8 7, ptr addrspace(3) @lds_1, align 1
+ store i32 8, ptr addrspace(3) @lds_2, align 2
+ ret void
+}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 4, !"nosanitize_address", i32 1}
+
+;.
+; CHECK: attributes #[[ATTR0]] = { sanitize_address }
+; CHECK: attributes #[[ATTR1]] = { sanitize_address "amdgpu-lds-size"="8" }
+; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+; CHECK: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) }
+; CHECK: attributes #[[ATTR4:[0-9]+]] = { convergent nocallback nofree nounwind willreturn }
+; CHECK: attributes #[[ATTR5:[0-9]+]] = { convergent nocallback nofree nounwind willreturn memory(none) }
+; CHECK: attributes #[[ATTR6:[0-9]+]] = { convergent nocallback nofree nounwind }
+; CHECK: attributes #[[ATTR7]] = { nomerge }
+;.
+; CHECK: [[META0]] = !{i32 0, i32 1}
+; CHECK: [[META1:![0-9]+]] = !{i32 4, !"nosanitize_address", i32 1}
+; CHECK: [[PROF2]] = !{!"branch_weights", i32 1, i32 1048575}
+; CHECK: [[META3]] = !{i32 0}
+;.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-function-param-asan.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-function-param-asan.ll
new file mode 100644
index 00000000000000..a70db2259cc3f5
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-function-param-asan.ll
@@ -0,0 +1,152 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 4
+; RUN: opt < %s -passes=amdgpu-sw-lower-lds -S -mtriple=amdgcn-amd-amdhsa | FileCheck %s
+
+; Test to check if LDS accesses are lowered correctly when LDS is passed as a function
+; argument to a non-kernel.
+
+ at lds_var = internal addrspace(3) global [1024 x i32] poison, align 4
+
+;.
+; CHECK: @llvm.amdgcn.sw.lds.my_kernel = internal addrspace(3) global ptr poison, no_sanitize_address, align 4, !absolute_symbol [[META0:![0-9]+]]
+; CHECK: @llvm.amdgcn.sw.lds.my_kernel.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.my_kernel.md.type { %llvm.amdgcn.sw.lds.my_kernel.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.my_kernel.md.item { i32 32, i32 4096, i32 5120 } }, no_sanitize_address
+; @llvm.amdgcn.sw.lds.base.table = internal addrspace(1) constant [1 x ptr addrspace(3)] [ptr addrspace(3) @llvm.amdgcn.sw.lds.my_kernel], no_sanitize_address
+;.
+define void @my_function(ptr addrspace(3) %lds_arg) sanitize_address {
+; CHECK-LABEL: define void @my_function(
+; CHECK-SAME: ptr addrspace(3) [[LDS_ARG:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id()
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x ptr addrspace(3)], ptr addrspace(1) @llvm.amdgcn.sw.lds.base.table, i32 0, i32 [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(3), ptr addrspace(1) [[TMP2]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(3) [[TMP3]], align 8
+; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr addrspace(3) [[LDS_ARG]] to i32
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP4]], i32 [[TMP5]]
+; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr addrspace(1) [[TMP6]] to i64
+; CHECK-NEXT: [[TMP8:%.*]] = lshr i64 [[TMP7]], 3
+; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[TMP8]], 2147450880
+; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr
+; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr [[TMP10]], align 1
+; CHECK-NEXT: [[TMP12:%.*]] = icmp ne i8 [[TMP11]], 0
+; CHECK-NEXT: [[TMP13:%.*]] = and i64 [[TMP7]], 7
+; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[TMP13]], 3
+; CHECK-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i8
+; CHECK-NEXT: [[TMP16:%.*]] = icmp sge i8 [[TMP15]], [[TMP11]]
+; CHECK-NEXT: [[TMP17:%.*]] = and i1 [[TMP12]], [[TMP16]]
+; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[TMP17]])
+; CHECK-NEXT: [[TMP19:%.*]] = icmp ne i64 [[TMP18]], 0
+; CHECK-NEXT: br i1 [[TMP19]], label [[ASAN_REPORT:%.*]], label [[TMP22:%.*]], !prof [[PROF1:![0-9]+]]
+; CHECK: asan.report:
+; CHECK-NEXT: br i1 [[TMP17]], label [[TMP20:%.*]], label [[TMP21:%.*]]
+; CHECK: 20:
+; CHECK-NEXT: call void @__asan_report_load4(i64 [[TMP7]]) #[[ATTR7:[0-9]+]]
+; CHECK-NEXT: call void @llvm.amdgcn.unreachable()
+; CHECK-NEXT: br label [[TMP21]]
+; CHECK: 21:
+; CHECK-NEXT: br label [[TMP22]]
+; CHECK: 22:
+; CHECK-NEXT: [[LDS_VAL:%.*]] = load i32, ptr addrspace(1) [[TMP6]], align 4
+; CHECK-NEXT: [[NEW_LDS_VAL:%.*]] = add i32 [[LDS_VAL]], 1
+; CHECK-NEXT: [[TMP24:%.*]] = ptrtoint ptr addrspace(3) [[LDS_ARG]] to i32
+; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP4]], i32 [[TMP24]]
+; CHECK-NEXT: [[TMP26:%.*]] = ptrtoint ptr addrspace(1) [[TMP25]] to i64
+; CHECK-NEXT: [[TMP27:%.*]] = lshr i64 [[TMP26]], 3
+; CHECK-NEXT: [[TMP28:%.*]] = add i64 [[TMP27]], 2147450880
+; CHECK-NEXT: [[TMP29:%.*]] = inttoptr i64 [[TMP28]] to ptr
+; CHECK-NEXT: [[TMP30:%.*]] = load i8, ptr [[TMP29]], align 1
+; CHECK-NEXT: [[TMP31:%.*]] = icmp ne i8 [[TMP30]], 0
+; CHECK-NEXT: [[TMP32:%.*]] = and i64 [[TMP26]], 7
+; CHECK-NEXT: [[TMP33:%.*]] = add i64 [[TMP32]], 3
+; CHECK-NEXT: [[TMP34:%.*]] = trunc i64 [[TMP33]] to i8
+; CHECK-NEXT: [[TMP35:%.*]] = icmp sge i8 [[TMP34]], [[TMP30]]
+; CHECK-NEXT: [[TMP36:%.*]] = and i1 [[TMP31]], [[TMP35]]
+; CHECK-NEXT: [[TMP37:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[TMP36]])
+; CHECK-NEXT: [[TMP38:%.*]] = icmp ne i64 [[TMP37]], 0
+; CHECK-NEXT: br i1 [[TMP38]], label [[ASAN_REPORT1:%.*]], label [[TMP41:%.*]], !prof [[PROF1]]
+; CHECK: asan.report1:
+; CHECK-NEXT: br i1 [[TMP36]], label [[TMP39:%.*]], label [[TMP40:%.*]]
+; CHECK: 39:
+; CHECK-NEXT: call void @__asan_report_store4(i64 [[TMP26]]) #[[ATTR7]]
+; CHECK-NEXT: call void @llvm.amdgcn.unreachable()
+; CHECK-NEXT: br label [[TMP40]]
+; CHECK: 40:
+; CHECK-NEXT: br label [[TMP41]]
+; CHECK: 41:
+; CHECK-NEXT: store i32 [[NEW_LDS_VAL]], ptr addrspace(1) [[TMP25]], align 4
+; CHECK-NEXT: ret void
+;
+ %lds_val = load i32, ptr addrspace(3) %lds_arg, align 4
+ %new_lds_val = add i32 %lds_val, 1
+ store i32 %new_lds_val, ptr addrspace(3) %lds_arg, align 4
+ ret void
+}
+
+define amdgpu_kernel void @my_kernel() sanitize_address {
+; CHECK-LABEL: define amdgpu_kernel void @my_kernel(
+; CHECK-SAME: ) #[[ATTR1:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META2:![0-9]+]] {
+; CHECK-NEXT: WId:
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z()
+; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
+; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP7:%.*]]
+; CHECK: Malloc:
+; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_MY_KERNEL_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.my_kernel.md, i32 0, i32 1, i32 0), align 4
+; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_MY_KERNEL_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.my_kernel.md, i32 0, i32 1, i32 2), align 4
+; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[TMP11]], [[TMP12]]
+; CHECK-NEXT: [[TMP13:%.*]] = zext i32 [[TMP14]] to i64
+; CHECK-NEXT: [[TMP10:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT: [[TMP15:%.*]] = ptrtoint ptr [[TMP10]] to i64
+; CHECK-NEXT: [[TMP16:%.*]] = call i64 @__asan_malloc_impl(i64 [[TMP13]], i64 [[TMP15]])
+; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP16]] to ptr addrspace(1)
+; CHECK-NEXT: store ptr addrspace(1) [[TMP6]], ptr addrspace(3) @llvm.amdgcn.sw.lds.my_kernel, align 8
+; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP6]], i64 8
+; CHECK-NEXT: [[TMP22:%.*]] = ptrtoint ptr addrspace(1) [[TMP21]] to i64
+; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP22]], i64 24)
+; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP6]], i64 4128
+; CHECK-NEXT: [[TMP24:%.*]] = ptrtoint ptr addrspace(1) [[TMP23]] to i64
+; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP24]], i64 1024)
+; CHECK-NEXT: br label [[TMP7]]
+; CHECK: 18:
+; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ]
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: [[TMP17:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.my_kernel, align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_MY_KERNEL_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.my_kernel.md, i32 0, i32 1, i32 0), align 4
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(3) @llvm.amdgcn.sw.lds.my_kernel, i32 [[TMP8]]
+; CHECK-NEXT: [[LDS_PTR:%.*]] = getelementptr [1024 x i32], ptr addrspace(3) [[TMP9]], i32 0, i32 0
+; CHECK-NEXT: call void @my_function(ptr addrspace(3) [[LDS_PTR]])
+; CHECK-NEXT: br label [[CONDFREE:%.*]]
+; CHECK: CondFree:
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: br i1 [[XYZCOND]], label [[FREE:%.*]], label [[END:%.*]]
+; CHECK: Free:
+; CHECK-NEXT: [[TMP18:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT: [[TMP19:%.*]] = ptrtoint ptr [[TMP18]] to i64
+; CHECK-NEXT: [[TMP20:%.*]] = ptrtoint ptr addrspace(1) [[TMP17]] to i64
+; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP20]], i64 [[TMP19]])
+; CHECK-NEXT: br label [[END]]
+; CHECK: End:
+; CHECK-NEXT: ret void
+;
+ %lds_ptr = getelementptr [1024 x i32], ptr addrspace(3) @lds_var, i32 0, i32 0
+ call void @my_function(ptr addrspace(3) %lds_ptr)
+ ret void
+}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 4, !"nosanitize_address", i32 1}
+
+;.
+; CHECK: attributes #[[ATTR0]] = { sanitize_address }
+; CHECK: attributes #[[ATTR1]] = { sanitize_address "amdgpu-lds-size"="8" }
+; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+; CHECK: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) }
+; CHECK: attributes #[[ATTR4:[0-9]+]] = { convergent nocallback nofree nounwind willreturn }
+; CHECK: attributes #[[ATTR5:[0-9]+]] = { convergent nocallback nofree nounwind willreturn memory(none) }
+; CHECK: attributes #[[ATTR6:[0-9]+]] = { convergent nocallback nofree nounwind }
+; CHECK: attributes #[[ATTR7]] = { nomerge }
+;.
+; CHECK: [[META0]] = !{i32 0, i32 1}
+; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 1048575}
+; CHECK: [[META2]] = !{i32 0}
+;.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-function-param.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-function-param.ll
new file mode 100644
index 00000000000000..55a36f85dc73af
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-function-param.ll
@@ -0,0 +1,102 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 4
+; RUN: opt < %s -passes=amdgpu-sw-lower-lds -amdgpu-asan-instrument-lds=false -S -mtriple=amdgcn-amd-amdhsa | FileCheck %s
+
+; Test to check if LDS accesses are lowered correctly when LDS is passed as a function
+; argument to a non-kernel.
+
+ at lds_var = internal addrspace(3) global [1024 x i32] poison, align 4
+
+;.
+; CHECK: @llvm.amdgcn.sw.lds.my_kernel = internal addrspace(3) global ptr poison, no_sanitize_address, align 4, !absolute_symbol [[META0:![0-9]+]]
+; CHECK: @llvm.amdgcn.sw.lds.my_kernel.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.my_kernel.md.type { %llvm.amdgcn.sw.lds.my_kernel.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.my_kernel.md.item { i32 32, i32 4096, i32 5120 } }, no_sanitize_address
+; CHECK: @llvm.amdgcn.sw.lds.base.table = internal addrspace(1) constant [1 x ptr addrspace(3)] [ptr addrspace(3) @llvm.amdgcn.sw.lds.my_kernel], no_sanitize_address
+;.
+define void @my_function(ptr addrspace(3) %lds_arg) sanitize_address {
+; CHECK-LABEL: define void @my_function(
+; CHECK-SAME: ptr addrspace(3) [[LDS_ARG:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id()
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x ptr addrspace(3)], ptr addrspace(1) @llvm.amdgcn.sw.lds.base.table, i32 0, i32 [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(3), ptr addrspace(1) [[TMP2]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(3) [[TMP3]], align 8
+; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr addrspace(3) [[LDS_ARG]] to i32
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP4]], i32 [[TMP5]]
+; CHECK-NEXT: [[LDS_VAL:%.*]] = load i32, ptr addrspace(1) [[TMP6]], align 4
+; CHECK-NEXT: [[NEW_LDS_VAL:%.*]] = add i32 [[LDS_VAL]], 1
+; CHECK-NEXT: [[TMP24:%.*]] = ptrtoint ptr addrspace(3) [[LDS_ARG]] to i32
+; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP4]], i32 [[TMP24]]
+; CHECK-NEXT: store i32 [[NEW_LDS_VAL]], ptr addrspace(1) [[TMP25]], align 4
+; CHECK-NEXT: ret void
+;
+ %lds_val = load i32, ptr addrspace(3) %lds_arg, align 4
+ %new_lds_val = add i32 %lds_val, 1
+ store i32 %new_lds_val, ptr addrspace(3) %lds_arg, align 4
+ ret void
+}
+
+define amdgpu_kernel void @my_kernel() sanitize_address {
+; CHECK-LABEL: define amdgpu_kernel void @my_kernel(
+; CHECK-SAME: ) #[[ATTR1:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META1:![0-9]+]] {
+; CHECK-NEXT: WId:
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z()
+; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
+; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP7:%.*]]
+; CHECK: Malloc:
+; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_MY_KERNEL_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.my_kernel.md, i32 0, i32 1, i32 0), align 4
+; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_MY_KERNEL_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.my_kernel.md, i32 0, i32 1, i32 2), align 4
+; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[TMP11]], [[TMP12]]
+; CHECK-NEXT: [[TMP13:%.*]] = zext i32 [[TMP14]] to i64
+; CHECK-NEXT: [[TMP10:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT: [[TMP15:%.*]] = ptrtoint ptr [[TMP10]] to i64
+; CHECK-NEXT: [[TMP16:%.*]] = call i64 @__asan_malloc_impl(i64 [[TMP13]], i64 [[TMP15]])
+; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP16]] to ptr addrspace(1)
+; CHECK-NEXT: store ptr addrspace(1) [[TMP6]], ptr addrspace(3) @llvm.amdgcn.sw.lds.my_kernel, align 8
+; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP6]], i64 8
+; CHECK-NEXT: [[TMP22:%.*]] = ptrtoint ptr addrspace(1) [[TMP21]] to i64
+; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP22]], i64 24)
+; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP6]], i64 4128
+; CHECK-NEXT: [[TMP24:%.*]] = ptrtoint ptr addrspace(1) [[TMP23]] to i64
+; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP24]], i64 1024)
+; CHECK-NEXT: br label [[TMP7]]
+; CHECK: 18:
+; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ]
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: [[TMP17:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.my_kernel, align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_MY_KERNEL_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.my_kernel.md, i32 0, i32 1, i32 0), align 4
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(3) @llvm.amdgcn.sw.lds.my_kernel, i32 [[TMP8]]
+; CHECK-NEXT: [[LDS_PTR:%.*]] = getelementptr [1024 x i32], ptr addrspace(3) [[TMP9]], i32 0, i32 0
+; CHECK-NEXT: call void @my_function(ptr addrspace(3) [[LDS_PTR]])
+; CHECK-NEXT: br label [[CONDFREE:%.*]]
+; CHECK: CondFree:
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: br i1 [[XYZCOND]], label [[FREE:%.*]], label [[END:%.*]]
+; CHECK: Free:
+; CHECK-NEXT: [[TMP18:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT: [[TMP19:%.*]] = ptrtoint ptr [[TMP18]] to i64
+; CHECK-NEXT: [[TMP20:%.*]] = ptrtoint ptr addrspace(1) [[TMP17]] to i64
+; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP20]], i64 [[TMP19]])
+; CHECK-NEXT: br label [[END]]
+; CHECK: End:
+; CHECK-NEXT: ret void
+;
+ %lds_ptr = getelementptr [1024 x i32], ptr addrspace(3) @lds_var, i32 0, i32 0
+ call void @my_function(ptr addrspace(3) %lds_ptr)
+ ret void
+}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 4, !"nosanitize_address", i32 1}
+
+;.
+; CHECK: attributes #[[ATTR0]] = { sanitize_address }
+; CHECK: attributes #[[ATTR1]] = { sanitize_address "amdgpu-lds-size"="8" }
+; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+; CHECK: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) }
+; CHECK: attributes #[[ATTR4:[0-9]+]] = { convergent nocallback nofree nounwind willreturn }
+;.
+; CHECK: [[META0]] = !{i32 0, i32 1}
+; CHECK: [[META1]] = !{i32 0}
+;.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-nested-asan.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-nested-asan.ll
new file mode 100644
index 00000000000000..1dd391ec6321a7
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-nested-asan.ll
@@ -0,0 +1,279 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt < %s -passes=amdgpu-sw-lower-lds -S -mtriple=amdgcn-amd-amdhsa | FileCheck %s
+
+; Test to check if LDS accesses are lowered correctly when a call is made to a nested non-kernel function.
+
+ at A = external addrspace(3) global [8 x ptr]
+ at B = external addrspace(3) global [0 x i32]
+
+define amdgpu_kernel void @kernel_0() sanitize_address {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_0(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META2:![0-9]+]] {
+; CHECK-NEXT: WId:
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z()
+; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
+; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP7:%.*]]
+; CHECK: Malloc:
+; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_0_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_0.md, i32 0, i32 1, i32 0), align 4
+; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_0.md, i32 0, i32 1, i32 2), align 4
+; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP9]], [[TMP10]]
+; CHECK-NEXT: [[TMP11:%.*]] = zext i32 [[TMP12]] to i64
+; CHECK-NEXT: [[TMP13:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64
+; CHECK-NEXT: [[TMP19:%.*]] = call i64 @__asan_malloc_impl(i64 [[TMP11]], i64 [[TMP14]])
+; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP19]] to ptr addrspace(1)
+; CHECK-NEXT: store ptr addrspace(1) [[TMP6]], ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_0, align 8
+; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP6]], i64 8
+; CHECK-NEXT: [[TMP21:%.*]] = ptrtoint ptr addrspace(1) [[TMP20]] to i64
+; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP21]], i64 24)
+; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP6]], i64 96
+; CHECK-NEXT: [[TMP23:%.*]] = ptrtoint ptr addrspace(1) [[TMP22]] to i64
+; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP23]], i64 32)
+; CHECK-NEXT: br label [[TMP7]]
+; CHECK: 18:
+; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ]
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: [[TMP15:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_0, align 8
+; CHECK-NEXT: call void @call_store_A()
+; CHECK-NEXT: br label [[CONDFREE:%.*]]
+; CHECK: CondFree:
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: br i1 [[XYZCOND]], label [[FREE:%.*]], label [[END:%.*]]
+; CHECK: Free:
+; CHECK-NEXT: [[TMP16:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT: [[TMP17:%.*]] = ptrtoint ptr [[TMP16]] to i64
+; CHECK-NEXT: [[TMP18:%.*]] = ptrtoint ptr addrspace(1) [[TMP15]] to i64
+; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP18]], i64 [[TMP17]])
+; CHECK-NEXT: br label [[END]]
+; CHECK: End:
+; CHECK-NEXT: ret void
+;
+ call void @call_store_A()
+ ret void
+}
+
+define amdgpu_kernel void @kernel_1() sanitize_address {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_1(
+; CHECK-SAME: ) #[[ATTR1:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META3:![0-9]+]] {
+; CHECK-NEXT: WId:
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z()
+; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
+; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP14:%.*]]
+; CHECK: Malloc:
+; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_1.md, align 4
+; CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_1_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_1.md, i32 0, i32 0, i32 2), align 4
+; CHECK-NEXT: [[TMP21:%.*]] = add i32 [[TMP12]], [[TMP20]]
+; CHECK-NEXT: [[TMP6:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds ptr addrspace(4), ptr addrspace(4) [[TMP6]], i64 15
+; CHECK-NEXT: store i32 [[TMP21]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_1_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_1.md, i32 0, i32 1, i32 0), align 4
+; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(4) [[TMP7]], align 4
+; CHECK-NEXT: store i32 [[TMP8]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_1_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_1.md, i32 0, i32 1, i32 1), align 4
+; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[TMP8]], 3
+; CHECK-NEXT: [[TMP10:%.*]] = udiv i32 [[TMP9]], 4
+; CHECK-NEXT: [[TMP11:%.*]] = mul i32 [[TMP10]], 4
+; CHECK-NEXT: store i32 [[TMP11]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_1_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_1.md, i32 0, i32 1, i32 2), align 4
+; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP21]], [[TMP11]]
+; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP15]] to i64
+; CHECK-NEXT: [[TMP17:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT: [[TMP18:%.*]] = ptrtoint ptr [[TMP17]] to i64
+; CHECK-NEXT: [[TMP19:%.*]] = call i64 @__asan_malloc_impl(i64 [[TMP16]], i64 [[TMP18]])
+; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP19]] to ptr addrspace(1)
+; CHECK-NEXT: store ptr addrspace(1) [[TMP13]], ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_1, align 8
+; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP13]], i64 8
+; CHECK-NEXT: [[TMP27:%.*]] = ptrtoint ptr addrspace(1) [[TMP26]] to i64
+; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP27]], i64 24)
+; CHECK-NEXT: br label [[TMP14]]
+; CHECK: 23:
+; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ]
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: [[TMP22:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_1, align 8
+; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel_1.dynlds) ]
+; CHECK-NEXT: [[PTR:%.*]] = call ptr @get_B_ptr()
+; CHECK-NEXT: br label [[CONDFREE:%.*]]
+; CHECK: CondFree:
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: br i1 [[XYZCOND]], label [[FREE:%.*]], label [[END:%.*]]
+; CHECK: Free:
+; CHECK-NEXT: [[TMP23:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT: [[TMP24:%.*]] = ptrtoint ptr [[TMP23]] to i64
+; CHECK-NEXT: [[TMP25:%.*]] = ptrtoint ptr addrspace(1) [[TMP22]] to i64
+; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP25]], i64 [[TMP24]])
+; CHECK-NEXT: br label [[END]]
+; CHECK: End:
+; CHECK-NEXT: ret void
+;
+ %ptr = call ptr @get_B_ptr()
+ ret void
+}
+
+define amdgpu_kernel void @kernel_2() sanitize_address {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_2(
+; CHECK-SAME: ) #[[ATTR0]] !llvm.amdgcn.lds.kernel.id [[META4:![0-9]+]] {
+; CHECK-NEXT: WId:
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z()
+; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
+; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP7:%.*]]
+; CHECK: Malloc:
+; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_2_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_2.md, i32 0, i32 1, i32 0), align 4
+; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_2_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_2.md, i32 0, i32 1, i32 2), align 4
+; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP9]], [[TMP10]]
+; CHECK-NEXT: [[TMP11:%.*]] = zext i32 [[TMP12]] to i64
+; CHECK-NEXT: [[TMP13:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64
+; CHECK-NEXT: [[TMP19:%.*]] = call i64 @__asan_malloc_impl(i64 [[TMP11]], i64 [[TMP14]])
+; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP19]] to ptr addrspace(1)
+; CHECK-NEXT: store ptr addrspace(1) [[TMP6]], ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_2, align 8
+; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP6]], i64 8
+; CHECK-NEXT: [[TMP21:%.*]] = ptrtoint ptr addrspace(1) [[TMP20]] to i64
+; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP21]], i64 24)
+; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP6]], i64 96
+; CHECK-NEXT: [[TMP23:%.*]] = ptrtoint ptr addrspace(1) [[TMP22]] to i64
+; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP23]], i64 32)
+; CHECK-NEXT: br label [[TMP7]]
+; CHECK: 18:
+; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ]
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: [[TMP15:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_2, align 8
+; CHECK-NEXT: call void @store_A()
+; CHECK-NEXT: br label [[CONDFREE:%.*]]
+; CHECK: CondFree:
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: br i1 [[XYZCOND]], label [[FREE:%.*]], label [[END:%.*]]
+; CHECK: Free:
+; CHECK-NEXT: [[TMP16:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT: [[TMP17:%.*]] = ptrtoint ptr [[TMP16]] to i64
+; CHECK-NEXT: [[TMP18:%.*]] = ptrtoint ptr addrspace(1) [[TMP15]] to i64
+; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP18]], i64 [[TMP17]])
+; CHECK-NEXT: br label [[END]]
+; CHECK: End:
+; CHECK-NEXT: ret void
+;
+ call void @store_A()
+ ret void
+}
+
+define amdgpu_kernel void @kernel_3() sanitize_address {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_3(
+; CHECK-SAME: ) #[[ATTR1]] !llvm.amdgcn.lds.kernel.id [[META5:![0-9]+]] {
+; CHECK-NEXT: WId:
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z()
+; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
+; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP14:%.*]]
+; CHECK: Malloc:
+; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_3.md, align 4
+; CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_3_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_3.md, i32 0, i32 0, i32 2), align 4
+; CHECK-NEXT: [[TMP21:%.*]] = add i32 [[TMP12]], [[TMP20]]
+; CHECK-NEXT: [[TMP6:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds ptr addrspace(4), ptr addrspace(4) [[TMP6]], i64 15
+; CHECK-NEXT: store i32 [[TMP21]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_3_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_3.md, i32 0, i32 1, i32 0), align 4
+; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(4) [[TMP7]], align 4
+; CHECK-NEXT: store i32 [[TMP8]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_3_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_3.md, i32 0, i32 1, i32 1), align 4
+; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[TMP8]], 3
+; CHECK-NEXT: [[TMP10:%.*]] = udiv i32 [[TMP9]], 4
+; CHECK-NEXT: [[TMP11:%.*]] = mul i32 [[TMP10]], 4
+; CHECK-NEXT: store i32 [[TMP11]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_3_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_3.md, i32 0, i32 1, i32 2), align 4
+; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP21]], [[TMP11]]
+; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP15]] to i64
+; CHECK-NEXT: [[TMP17:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT: [[TMP18:%.*]] = ptrtoint ptr [[TMP17]] to i64
+; CHECK-NEXT: [[TMP19:%.*]] = call i64 @__asan_malloc_impl(i64 [[TMP16]], i64 [[TMP18]])
+; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP19]] to ptr addrspace(1)
+; CHECK-NEXT: store ptr addrspace(1) [[TMP13]], ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_3, align 8
+; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP13]], i64 8
+; CHECK-NEXT: [[TMP27:%.*]] = ptrtoint ptr addrspace(1) [[TMP26]] to i64
+; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP27]], i64 24)
+; CHECK-NEXT: br label [[TMP14]]
+; CHECK: 23:
+; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ]
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: [[TMP22:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_3, align 8
+; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel_3.dynlds) ]
+; CHECK-NEXT: [[PTR:%.*]] = call ptr @get_B_ptr()
+; CHECK-NEXT: br label [[CONDFREE:%.*]]
+; CHECK: CondFree:
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: br i1 [[XYZCOND]], label [[FREE:%.*]], label [[END:%.*]]
+; CHECK: Free:
+; CHECK-NEXT: [[TMP23:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT: [[TMP24:%.*]] = ptrtoint ptr [[TMP23]] to i64
+; CHECK-NEXT: [[TMP25:%.*]] = ptrtoint ptr addrspace(1) [[TMP22]] to i64
+; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP25]], i64 [[TMP24]])
+; CHECK-NEXT: br label [[END]]
+; CHECK: End:
+; CHECK-NEXT: ret void
+;
+ %ptr = call ptr @get_B_ptr()
+ ret void
+}
+
+define private void @call_store_A() sanitize_address {
+; CHECK-LABEL: define private void @call_store_A(
+; CHECK-SAME: ) #[[ATTR2:[0-9]+]] {
+; CHECK-NEXT: call void @store_A()
+; CHECK-NEXT: ret void
+;
+ call void @store_A()
+ ret void
+}
+
+define private void @store_A() sanitize_address {
+; CHECK-LABEL: define private void @store_A(
+; CHECK-SAME: ) #[[ATTR2]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id()
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [4 x ptr addrspace(3)], ptr addrspace(1) @llvm.amdgcn.sw.lds.base.table, i32 0, i32 [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(3), ptr addrspace(1) [[TMP2]], align 4
+; CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(3) [[TMP4]], align 8
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [4 x [2 x ptr addrspace(1)]], ptr addrspace(1) @llvm.amdgcn.sw.lds.offset.table, i32 0, i32 [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(1) [[TMP6]], align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(1) [[TMP5]], align 4
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP4]], i32 [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = addrspacecast ptr addrspace(3) [[TMP9]] to ptr
+; CHECK-NEXT: store ptr [[TMP10]], ptr null, align 8
+; CHECK-NEXT: ret void
+;
+ store ptr addrspacecast (ptr addrspace(3) @A to ptr), ptr null
+ ret void
+}
+
+define private ptr @get_B_ptr() sanitize_address {
+; CHECK-LABEL: define private ptr @get_B_ptr(
+; CHECK-SAME: ) #[[ATTR2]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id()
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [4 x ptr addrspace(3)], ptr addrspace(1) @llvm.amdgcn.sw.lds.base.table, i32 0, i32 [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(3), ptr addrspace(1) [[TMP2]], align 4
+; CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(3) [[TMP4]], align 8
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [4 x [2 x ptr addrspace(1)]], ptr addrspace(1) @llvm.amdgcn.sw.lds.offset.table, i32 0, i32 [[TMP1]], i32 1
+; CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(1) [[TMP6]], align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(1) [[TMP5]], align 4
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP4]], i32 [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = addrspacecast ptr addrspace(3) [[TMP9]] to ptr
+; CHECK-NEXT: ret ptr [[TMP10]]
+;
+ ret ptr addrspacecast (ptr addrspace(3) @B to ptr)
+}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 4, !"nosanitize_address", i32 1}
+
+;.
+; CHECK: [[META2]] = !{i32 0}
+; CHECK: [[META3]] = !{i32 1}
+; CHECK: [[META4]] = !{i32 2}
+; CHECK: [[META5]] = !{i32 3}
+;.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-nested.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-nested.ll
new file mode 100644
index 00000000000000..ed9107764eb918
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-nested.ll
@@ -0,0 +1,279 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt < %s -passes=amdgpu-sw-lower-lds -amdgpu-asan-instrument-lds=false -S -mtriple=amdgcn-amd-amdhsa | FileCheck %s
+
+; Test to check if LDS accesses are lowered correctly when a call is made to a nested non-kernel function.
+
+ at A = external addrspace(3) global [8 x ptr]
+ at B = external addrspace(3) global [0 x i32]
+
+define amdgpu_kernel void @kernel_0() sanitize_address {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_0(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META2:![0-9]+]] {
+; CHECK-NEXT: WId:
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z()
+; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
+; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP7:%.*]]
+; CHECK: Malloc:
+; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_0_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_0.md, i32 0, i32 1, i32 0), align 4
+; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_0.md, i32 0, i32 1, i32 2), align 4
+; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP9]], [[TMP10]]
+; CHECK-NEXT: [[TMP11:%.*]] = zext i32 [[TMP12]] to i64
+; CHECK-NEXT: [[TMP13:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64
+; CHECK-NEXT: [[TMP19:%.*]] = call i64 @__asan_malloc_impl(i64 [[TMP11]], i64 [[TMP14]])
+; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP19]] to ptr addrspace(1)
+; CHECK-NEXT: store ptr addrspace(1) [[TMP6]], ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_0, align 8
+; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP6]], i64 8
+; CHECK-NEXT: [[TMP21:%.*]] = ptrtoint ptr addrspace(1) [[TMP20]] to i64
+; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP21]], i64 24)
+; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP6]], i64 96
+; CHECK-NEXT: [[TMP23:%.*]] = ptrtoint ptr addrspace(1) [[TMP22]] to i64
+; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP23]], i64 32)
+; CHECK-NEXT: br label [[TMP7]]
+; CHECK: 18:
+; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ]
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: [[TMP15:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_0, align 8
+; CHECK-NEXT: call void @call_store_A()
+; CHECK-NEXT: br label [[CONDFREE:%.*]]
+; CHECK: CondFree:
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: br i1 [[XYZCOND]], label [[FREE:%.*]], label [[END:%.*]]
+; CHECK: Free:
+; CHECK-NEXT: [[TMP16:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT: [[TMP17:%.*]] = ptrtoint ptr [[TMP16]] to i64
+; CHECK-NEXT: [[TMP18:%.*]] = ptrtoint ptr addrspace(1) [[TMP15]] to i64
+; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP18]], i64 [[TMP17]])
+; CHECK-NEXT: br label [[END]]
+; CHECK: End:
+; CHECK-NEXT: ret void
+;
+ call void @call_store_A()
+ ret void
+}
+
+define amdgpu_kernel void @kernel_1() sanitize_address {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_1(
+; CHECK-SAME: ) #[[ATTR1:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META3:![0-9]+]] {
+; CHECK-NEXT: WId:
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z()
+; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
+; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP14:%.*]]
+; CHECK: Malloc:
+; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_1.md, align 4
+; CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_1_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_1.md, i32 0, i32 0, i32 2), align 4
+; CHECK-NEXT: [[TMP21:%.*]] = add i32 [[TMP12]], [[TMP20]]
+; CHECK-NEXT: [[TMP6:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds ptr addrspace(4), ptr addrspace(4) [[TMP6]], i64 15
+; CHECK-NEXT: store i32 [[TMP21]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_1_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_1.md, i32 0, i32 1, i32 0), align 4
+; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(4) [[TMP7]], align 4
+; CHECK-NEXT: store i32 [[TMP8]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_1_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_1.md, i32 0, i32 1, i32 1), align 4
+; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[TMP8]], 3
+; CHECK-NEXT: [[TMP10:%.*]] = udiv i32 [[TMP9]], 4
+; CHECK-NEXT: [[TMP11:%.*]] = mul i32 [[TMP10]], 4
+; CHECK-NEXT: store i32 [[TMP11]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_1_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_1.md, i32 0, i32 1, i32 2), align 4
+; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP21]], [[TMP11]]
+; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP15]] to i64
+; CHECK-NEXT: [[TMP17:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT: [[TMP18:%.*]] = ptrtoint ptr [[TMP17]] to i64
+; CHECK-NEXT: [[TMP19:%.*]] = call i64 @__asan_malloc_impl(i64 [[TMP16]], i64 [[TMP18]])
+; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP19]] to ptr addrspace(1)
+; CHECK-NEXT: store ptr addrspace(1) [[TMP13]], ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_1, align 8
+; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP13]], i64 8
+; CHECK-NEXT: [[TMP27:%.*]] = ptrtoint ptr addrspace(1) [[TMP26]] to i64
+; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP27]], i64 24)
+; CHECK-NEXT: br label [[TMP14]]
+; CHECK: 23:
+; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ]
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: [[TMP22:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_1, align 8
+; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel_1.dynlds) ]
+; CHECK-NEXT: [[PTR:%.*]] = call ptr @get_B_ptr()
+; CHECK-NEXT: br label [[CONDFREE:%.*]]
+; CHECK: CondFree:
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: br i1 [[XYZCOND]], label [[FREE:%.*]], label [[END:%.*]]
+; CHECK: Free:
+; CHECK-NEXT: [[TMP23:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT: [[TMP24:%.*]] = ptrtoint ptr [[TMP23]] to i64
+; CHECK-NEXT: [[TMP25:%.*]] = ptrtoint ptr addrspace(1) [[TMP22]] to i64
+; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP25]], i64 [[TMP24]])
+; CHECK-NEXT: br label [[END]]
+; CHECK: End:
+; CHECK-NEXT: ret void
+;
+ %ptr = call ptr @get_B_ptr()
+ ret void
+}
+
+define amdgpu_kernel void @kernel_2() sanitize_address {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_2(
+; CHECK-SAME: ) #[[ATTR0]] !llvm.amdgcn.lds.kernel.id [[META4:![0-9]+]] {
+; CHECK-NEXT: WId:
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z()
+; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
+; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP7:%.*]]
+; CHECK: Malloc:
+; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_2_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_2.md, i32 0, i32 1, i32 0), align 4
+; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_2_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_2.md, i32 0, i32 1, i32 2), align 4
+; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP9]], [[TMP10]]
+; CHECK-NEXT: [[TMP11:%.*]] = zext i32 [[TMP12]] to i64
+; CHECK-NEXT: [[TMP13:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64
+; CHECK-NEXT: [[TMP19:%.*]] = call i64 @__asan_malloc_impl(i64 [[TMP11]], i64 [[TMP14]])
+; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP19]] to ptr addrspace(1)
+; CHECK-NEXT: store ptr addrspace(1) [[TMP6]], ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_2, align 8
+; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP6]], i64 8
+; CHECK-NEXT: [[TMP21:%.*]] = ptrtoint ptr addrspace(1) [[TMP20]] to i64
+; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP21]], i64 24)
+; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP6]], i64 96
+; CHECK-NEXT: [[TMP23:%.*]] = ptrtoint ptr addrspace(1) [[TMP22]] to i64
+; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP23]], i64 32)
+; CHECK-NEXT: br label [[TMP7]]
+; CHECK: 18:
+; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ]
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: [[TMP15:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_2, align 8
+; CHECK-NEXT: call void @store_A()
+; CHECK-NEXT: br label [[CONDFREE:%.*]]
+; CHECK: CondFree:
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: br i1 [[XYZCOND]], label [[FREE:%.*]], label [[END:%.*]]
+; CHECK: Free:
+; CHECK-NEXT: [[TMP16:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT: [[TMP17:%.*]] = ptrtoint ptr [[TMP16]] to i64
+; CHECK-NEXT: [[TMP18:%.*]] = ptrtoint ptr addrspace(1) [[TMP15]] to i64
+; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP18]], i64 [[TMP17]])
+; CHECK-NEXT: br label [[END]]
+; CHECK: End:
+; CHECK-NEXT: ret void
+;
+ call void @store_A()
+ ret void
+}
+
+define amdgpu_kernel void @kernel_3() sanitize_address {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_3(
+; CHECK-SAME: ) #[[ATTR1]] !llvm.amdgcn.lds.kernel.id [[META5:![0-9]+]] {
+; CHECK-NEXT: WId:
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z()
+; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
+; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP14:%.*]]
+; CHECK: Malloc:
+; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_3.md, align 4
+; CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_3_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_3.md, i32 0, i32 0, i32 2), align 4
+; CHECK-NEXT: [[TMP21:%.*]] = add i32 [[TMP12]], [[TMP20]]
+; CHECK-NEXT: [[TMP6:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds ptr addrspace(4), ptr addrspace(4) [[TMP6]], i64 15
+; CHECK-NEXT: store i32 [[TMP21]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_3_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_3.md, i32 0, i32 1, i32 0), align 4
+; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(4) [[TMP7]], align 4
+; CHECK-NEXT: store i32 [[TMP8]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_3_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_3.md, i32 0, i32 1, i32 1), align 4
+; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[TMP8]], 3
+; CHECK-NEXT: [[TMP10:%.*]] = udiv i32 [[TMP9]], 4
+; CHECK-NEXT: [[TMP11:%.*]] = mul i32 [[TMP10]], 4
+; CHECK-NEXT: store i32 [[TMP11]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_3_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_3.md, i32 0, i32 1, i32 2), align 4
+; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP21]], [[TMP11]]
+; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP15]] to i64
+; CHECK-NEXT: [[TMP17:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT: [[TMP18:%.*]] = ptrtoint ptr [[TMP17]] to i64
+; CHECK-NEXT: [[TMP19:%.*]] = call i64 @__asan_malloc_impl(i64 [[TMP16]], i64 [[TMP18]])
+; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP19]] to ptr addrspace(1)
+; CHECK-NEXT: store ptr addrspace(1) [[TMP13]], ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_3, align 8
+; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP13]], i64 8
+; CHECK-NEXT: [[TMP27:%.*]] = ptrtoint ptr addrspace(1) [[TMP26]] to i64
+; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP27]], i64 24)
+; CHECK-NEXT: br label [[TMP14]]
+; CHECK: 23:
+; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ]
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: [[TMP22:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_3, align 8
+; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel_3.dynlds) ]
+; CHECK-NEXT: [[PTR:%.*]] = call ptr @get_B_ptr()
+; CHECK-NEXT: br label [[CONDFREE:%.*]]
+; CHECK: CondFree:
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: br i1 [[XYZCOND]], label [[FREE:%.*]], label [[END:%.*]]
+; CHECK: Free:
+; CHECK-NEXT: [[TMP23:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT: [[TMP24:%.*]] = ptrtoint ptr [[TMP23]] to i64
+; CHECK-NEXT: [[TMP25:%.*]] = ptrtoint ptr addrspace(1) [[TMP22]] to i64
+; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP25]], i64 [[TMP24]])
+; CHECK-NEXT: br label [[END]]
+; CHECK: End:
+; CHECK-NEXT: ret void
+;
+ %ptr = call ptr @get_B_ptr()
+ ret void
+}
+
+define private void @call_store_A() sanitize_address {
+; CHECK-LABEL: define private void @call_store_A(
+; CHECK-SAME: ) #[[ATTR2:[0-9]+]] {
+; CHECK-NEXT: call void @store_A()
+; CHECK-NEXT: ret void
+;
+ call void @store_A()
+ ret void
+}
+
+define private void @store_A() sanitize_address {
+; CHECK-LABEL: define private void @store_A(
+; CHECK-SAME: ) #[[ATTR2]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id()
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [4 x ptr addrspace(3)], ptr addrspace(1) @llvm.amdgcn.sw.lds.base.table, i32 0, i32 [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(3), ptr addrspace(1) [[TMP2]], align 4
+; CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(3) [[TMP4]], align 8
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [4 x [2 x ptr addrspace(1)]], ptr addrspace(1) @llvm.amdgcn.sw.lds.offset.table, i32 0, i32 [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(1) [[TMP6]], align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(1) [[TMP5]], align 4
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP4]], i32 [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = addrspacecast ptr addrspace(3) [[TMP9]] to ptr
+; CHECK-NEXT: store ptr [[TMP10]], ptr null, align 8
+; CHECK-NEXT: ret void
+;
+ store ptr addrspacecast (ptr addrspace(3) @A to ptr), ptr null
+ ret void
+}
+
+define private ptr @get_B_ptr() sanitize_address {
+; CHECK-LABEL: define private ptr @get_B_ptr(
+; CHECK-SAME: ) #[[ATTR2]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id()
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [4 x ptr addrspace(3)], ptr addrspace(1) @llvm.amdgcn.sw.lds.base.table, i32 0, i32 [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(3), ptr addrspace(1) [[TMP2]], align 4
+; CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(3) [[TMP4]], align 8
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [4 x [2 x ptr addrspace(1)]], ptr addrspace(1) @llvm.amdgcn.sw.lds.offset.table, i32 0, i32 [[TMP1]], i32 1
+; CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(1) [[TMP6]], align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(1) [[TMP5]], align 4
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP4]], i32 [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = addrspacecast ptr addrspace(3) [[TMP9]] to ptr
+; CHECK-NEXT: ret ptr [[TMP10]]
+;
+ ret ptr addrspacecast (ptr addrspace(3) @B to ptr)
+}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 4, !"nosanitize_address", i32 1}
+
+;.
+; CHECK: [[META2]] = !{i32 0}
+; CHECK: [[META3]] = !{i32 1}
+; CHECK: [[META4]] = !{i32 2}
+; CHECK: [[META5]] = !{i32 3}
+;.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access.ll
new file mode 100644
index 00000000000000..11e912287c7f7c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access.ll
@@ -0,0 +1,128 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 4
+; RUN: opt < %s -passes=amdgpu-sw-lower-lds -amdgpu-asan-instrument-lds=false -S -mtriple=amdgcn-amd-amdhsa | FileCheck %s
+
+; Test to check if static LDS is lowered correctly when a non-kernel function with LDS accesses is called from a kernel.
+ at lds_1 = internal addrspace(3) global [1 x i8] poison, align 1
+ at lds_2 = internal addrspace(3) global [1 x i32] poison, align 2
+ at lds_3 = external addrspace(3) global [3 x i8], align 4
+ at lds_4 = external addrspace(3) global [4 x i8], align 8
+
+;.
+; CHECK: @llvm.amdgcn.sw.lds.k0 = internal addrspace(3) global ptr poison, no_sanitize_address, align 8, !absolute_symbol [[META0:![0-9]+]]
+; CHECK: @llvm.amdgcn.sw.lds.k0.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.k0.md.type { %llvm.amdgcn.sw.lds.k0.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 32, i32 1, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 64, i32 4, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 96, i32 3, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 128, i32 4, i32 32 } }, no_sanitize_address
+; CHECK: @llvm.amdgcn.sw.lds.base.table = internal addrspace(1) constant [1 x ptr addrspace(3)] [ptr addrspace(3) @llvm.amdgcn.sw.lds.k0], no_sanitize_address
+; CHECK: @llvm.amdgcn.sw.lds.offset.table = internal addrspace(1) constant [1 x [2 x ptr addrspace(1)]] [[2 x ptr addrspace(1)] [ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.k0.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 3, i32 0), ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.k0.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 4, i32 0)]], no_sanitize_address
+;.
+define void @use_variables() sanitize_address {
+; CHECK-LABEL: define void @use_variables(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id()
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x ptr addrspace(3)], ptr addrspace(1) @llvm.amdgcn.sw.lds.base.table, i32 0, i32 [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(3), ptr addrspace(1) [[TMP2]], align 4
+; CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(3) [[TMP4]], align 8
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x [2 x ptr addrspace(1)]], ptr addrspace(1) @llvm.amdgcn.sw.lds.offset.table, i32 0, i32 [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(1) [[TMP6]], align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(1) [[TMP5]], align 4
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP4]], i32 [[TMP8]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1 x [2 x ptr addrspace(1)]], ptr addrspace(1) @llvm.amdgcn.sw.lds.offset.table, i32 0, i32 [[TMP1]], i32 1
+; CHECK-NEXT: [[TMP12:%.*]] = load ptr addrspace(1), ptr addrspace(1) [[TMP11]], align 8
+; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(1) [[TMP12]], align 4
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP4]], i32 [[TMP10]]
+; CHECK-NEXT: [[X:%.*]] = addrspacecast ptr addrspace(3) [[TMP9]] to ptr
+; CHECK-NEXT: [[TMP16:%.*]] = addrspacecast ptr addrspace(3) [[TMP9]] to ptr
+; CHECK-NEXT: store i8 3, ptr [[TMP16]], align 4
+; CHECK-NEXT: [[TMP14:%.*]] = ptrtoint ptr addrspace(3) [[TMP15]] to i32
+; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP7]], i32 [[TMP14]]
+; CHECK-NEXT: store i8 3, ptr addrspace(1) [[TMP31]], align 8
+; CHECK-NEXT: ret void
+;
+ %X = addrspacecast ptr addrspace(3) @lds_3 to ptr
+ store i8 3, ptr addrspacecast( ptr addrspace(3) @lds_3 to ptr), align 4
+ store i8 3, ptr addrspace(3) @lds_4, align 8
+ ret void
+}
+
+define amdgpu_kernel void @k0() sanitize_address {
+; CHECK-LABEL: define amdgpu_kernel void @k0(
+; CHECK-SAME: ) #[[ATTR1:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META1:![0-9]+]] {
+; CHECK-NEXT: WId:
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z()
+; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
+; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP7:%.*]]
+; CHECK: Malloc:
+; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 4, i32 0), align 4
+; CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 4, i32 2), align 4
+; CHECK-NEXT: [[TMP16:%.*]] = add i32 [[TMP13]], [[TMP14]]
+; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP16]] to i64
+; CHECK-NEXT: [[TMP23:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT: [[TMP24:%.*]] = ptrtoint ptr [[TMP23]] to i64
+; CHECK-NEXT: [[TMP12:%.*]] = call i64 @__asan_malloc_impl(i64 [[TMP15]], i64 [[TMP24]])
+; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP12]] to ptr addrspace(1)
+; CHECK-NEXT: store ptr addrspace(1) [[TMP6]], ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, align 8
+; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP6]], i64 8
+; CHECK-NEXT: [[TMP26:%.*]] = ptrtoint ptr addrspace(1) [[TMP25]] to i64
+; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP26]], i64 24)
+; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP6]], i64 33
+; CHECK-NEXT: [[TMP28:%.*]] = ptrtoint ptr addrspace(1) [[TMP27]] to i64
+; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP28]], i64 31)
+; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP6]], i64 68
+; CHECK-NEXT: [[TMP45:%.*]] = ptrtoint ptr addrspace(1) [[TMP29]] to i64
+; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP45]], i64 28)
+; CHECK-NEXT: [[TMP65:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP6]], i64 99
+; CHECK-NEXT: [[TMP66:%.*]] = ptrtoint ptr addrspace(1) [[TMP65]] to i64
+; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP66]], i64 29)
+; CHECK-NEXT: [[TMP67:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP6]], i64 132
+; CHECK-NEXT: [[TMP68:%.*]] = ptrtoint ptr addrspace(1) [[TMP67]] to i64
+; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP68]], i64 28)
+; CHECK-NEXT: br label [[TMP7]]
+; CHECK: 24:
+; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ]
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: [[TMP19:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, align 8
+; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 1, i32 0), align 4
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, i32 [[TMP10]]
+; CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 2, i32 0), align 4
+; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, i32 [[TMP17]]
+; CHECK-NEXT: call void @use_variables()
+; CHECK-NEXT: [[TMP30:%.*]] = ptrtoint ptr addrspace(3) [[TMP11]] to i32
+; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP19]], i32 [[TMP30]]
+; CHECK-NEXT: store i8 7, ptr addrspace(1) [[TMP31]], align 1
+; CHECK-NEXT: [[TMP47:%.*]] = ptrtoint ptr addrspace(3) [[TMP18]] to i32
+; CHECK-NEXT: [[TMP48:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP19]], i32 [[TMP47]]
+; CHECK-NEXT: store i32 8, ptr addrspace(1) [[TMP48]], align 2
+; CHECK-NEXT: br label [[CONDFREE1:%.*]]
+; CHECK: CondFree:
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: br i1 [[XYZCOND]], label [[FREE:%.*]], label [[END:%.*]]
+; CHECK: Free:
+; CHECK-NEXT: [[TMP20:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT: [[TMP21:%.*]] = ptrtoint ptr [[TMP20]] to i64
+; CHECK-NEXT: [[TMP22:%.*]] = ptrtoint ptr addrspace(1) [[TMP19]] to i64
+; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP22]], i64 [[TMP21]])
+; CHECK-NEXT: br label [[END]]
+; CHECK: End:
+; CHECK-NEXT: ret void
+;
+ call void @use_variables()
+ store i8 7, ptr addrspace(3) @lds_1, align 1
+ store i32 8, ptr addrspace(3) @lds_2, align 2
+ ret void
+}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 4, !"nosanitize_address", i32 1}
+
+;.
+; CHECK: attributes #[[ATTR0]] = { sanitize_address }
+; CHECK: attributes #[[ATTR1]] = { sanitize_address "amdgpu-lds-size"="8" }
+; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+; CHECK: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) }
+; CHECK: attributes #[[ATTR4:[0-9]+]] = { convergent nocallback nofree nounwind willreturn }
+;.
+; CHECK: [[META0]] = !{i32 0, i32 1}
+; CHECK: [[META1]] = !{i32 0}
+;.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-lds-test-asan.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-lds-test-asan.ll
new file mode 100644
index 00000000000000..301bda7e0086e2
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-lds-test-asan.ll
@@ -0,0 +1,159 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 4
+; RUN: opt < %s -passes=amdgpu-sw-lower-lds -S -mtriple=amdgcn-amd-amdhsa | FileCheck %s
+
+; Test to check if static LDS accesses in kernel are lowered correctly.
+ at lds_1 = internal addrspace(3) global [1 x i8] poison, align 4
+ at lds_2 = internal addrspace(3) global [1 x i32] poison, align 8
+
+;.
+; CHECK: @llvm.amdgcn.sw.lds.k0 = internal addrspace(3) global ptr poison, no_sanitize_address, align 8, !absolute_symbol [[META0:![0-9]+]]
+; CHECK: @llvm.amdgcn.sw.lds.k0.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.k0.md.type { %llvm.amdgcn.sw.lds.k0.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 32, i32 1, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 64, i32 4, i32 32 } }, no_sanitize_address
+;.
+define amdgpu_kernel void @k0() sanitize_address {
+; CHECK-LABEL: define amdgpu_kernel void @k0(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: WId:
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z()
+; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
+; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP7:%.*]]
+; CHECK: Malloc:
+; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 2, i32 0), align 4
+; CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 2, i32 2), align 4
+; CHECK-NEXT: [[TMP16:%.*]] = add i32 [[TMP13]], [[TMP14]]
+; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP16]] to i64
+; CHECK-NEXT: [[TMP23:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP23]] to i64
+; CHECK-NEXT: [[TMP12:%.*]] = call i64 @__asan_malloc_impl(i64 [[TMP15]], i64 [[TMP11]])
+; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP12]] to ptr addrspace(1)
+; CHECK-NEXT: store ptr addrspace(1) [[TMP6]], ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, align 8
+; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP6]], i64 8
+; CHECK-NEXT: [[TMP41:%.*]] = ptrtoint ptr addrspace(1) [[TMP25]] to i64
+; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP41]], i64 24)
+; CHECK-NEXT: [[TMP61:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP6]], i64 33
+; CHECK-NEXT: [[TMP62:%.*]] = ptrtoint ptr addrspace(1) [[TMP61]] to i64
+; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP62]], i64 31)
+; CHECK-NEXT: [[TMP63:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP6]], i64 68
+; CHECK-NEXT: [[TMP64:%.*]] = ptrtoint ptr addrspace(1) [[TMP63]] to i64
+; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP64]], i64 28)
+; CHECK-NEXT: br label [[TMP7]]
+; CHECK: 20:
+; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ]
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: [[TMP19:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, align 8
+; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 1, i32 0), align 4
+; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, i32 [[TMP10]]
+; CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 2, i32 0), align 4
+; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, i32 [[TMP17]]
+; CHECK-NEXT: [[TMP26:%.*]] = ptrtoint ptr addrspace(3) [[TMP18]] to i32
+; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP19]], i32 [[TMP26]]
+; CHECK-NEXT: [[TMP28:%.*]] = ptrtoint ptr addrspace(1) [[TMP27]] to i64
+; CHECK-NEXT: [[TMP29:%.*]] = lshr i64 [[TMP28]], 3
+; CHECK-NEXT: [[TMP30:%.*]] = add i64 [[TMP29]], 2147450880
+; CHECK-NEXT: [[TMP31:%.*]] = inttoptr i64 [[TMP30]] to ptr
+; CHECK-NEXT: [[TMP32:%.*]] = load i8, ptr [[TMP31]], align 1
+; CHECK-NEXT: [[TMP33:%.*]] = icmp ne i8 [[TMP32]], 0
+; CHECK-NEXT: [[TMP34:%.*]] = and i64 [[TMP28]], 7
+; CHECK-NEXT: [[TMP35:%.*]] = trunc i64 [[TMP34]] to i8
+; CHECK-NEXT: [[TMP36:%.*]] = icmp sge i8 [[TMP35]], [[TMP32]]
+; CHECK-NEXT: [[TMP37:%.*]] = and i1 [[TMP33]], [[TMP36]]
+; CHECK-NEXT: [[TMP38:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[TMP37]])
+; CHECK-NEXT: [[TMP39:%.*]] = icmp ne i64 [[TMP38]], 0
+; CHECK-NEXT: br i1 [[TMP39]], label [[ASAN_REPORT:%.*]], label [[TMP42:%.*]], !prof [[PROF2:![0-9]+]]
+; CHECK: asan.report:
+; CHECK-NEXT: br i1 [[TMP37]], label [[TMP40:%.*]], label [[CONDFREE:%.*]]
+; CHECK: 40:
+; CHECK-NEXT: call void @__asan_report_store1(i64 [[TMP28]]) #[[ATTR6:[0-9]+]]
+; CHECK-NEXT: call void @llvm.amdgcn.unreachable()
+; CHECK-NEXT: br label [[CONDFREE]]
+; CHECK: 41:
+; CHECK-NEXT: br label [[TMP42]]
+; CHECK: 42:
+; CHECK-NEXT: store i8 7, ptr addrspace(1) [[TMP27]], align 4
+; CHECK-NEXT: [[TMP43:%.*]] = ptrtoint ptr addrspace(3) [[TMP24]] to i32
+; CHECK-NEXT: [[TMP44:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP19]], i32 [[TMP43]]
+; CHECK-NEXT: [[TMP45:%.*]] = ptrtoint ptr addrspace(1) [[TMP44]] to i64
+; CHECK-NEXT: [[TMP51:%.*]] = add i64 [[TMP45]], 3
+; CHECK-NEXT: [[TMP78:%.*]] = inttoptr i64 [[TMP51]] to ptr addrspace(1)
+; CHECK-NEXT: [[TMP79:%.*]] = ptrtoint ptr addrspace(1) [[TMP44]] to i64
+; CHECK-NEXT: [[TMP46:%.*]] = lshr i64 [[TMP79]], 3
+; CHECK-NEXT: [[TMP47:%.*]] = add i64 [[TMP46]], 2147450880
+; CHECK-NEXT: [[TMP48:%.*]] = inttoptr i64 [[TMP47]] to ptr
+; CHECK-NEXT: [[TMP49:%.*]] = load i8, ptr [[TMP48]], align 1
+; CHECK-NEXT: [[TMP50:%.*]] = icmp ne i8 [[TMP49]], 0
+; CHECK-NEXT: [[TMP52:%.*]] = and i64 [[TMP79]], 7
+; CHECK-NEXT: [[TMP53:%.*]] = trunc i64 [[TMP52]] to i8
+; CHECK-NEXT: [[TMP54:%.*]] = icmp sge i8 [[TMP53]], [[TMP49]]
+; CHECK-NEXT: [[TMP55:%.*]] = and i1 [[TMP50]], [[TMP54]]
+; CHECK-NEXT: [[TMP56:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[TMP55]])
+; CHECK-NEXT: [[TMP57:%.*]] = icmp ne i64 [[TMP56]], 0
+; CHECK-NEXT: br i1 [[TMP57]], label [[ASAN_REPORT1:%.*]], label [[TMP60:%.*]], !prof [[PROF2]]
+; CHECK: asan.report1:
+; CHECK-NEXT: br i1 [[TMP55]], label [[TMP58:%.*]], label [[TMP59:%.*]]
+; CHECK: 60:
+; CHECK-NEXT: call void @__asan_report_store1(i64 [[TMP79]]) #[[ATTR6]]
+; CHECK-NEXT: call void @llvm.amdgcn.unreachable()
+; CHECK-NEXT: br label [[TMP59]]
+; CHECK: 61:
+; CHECK-NEXT: br label [[TMP60]]
+; CHECK: 62:
+; CHECK-NEXT: [[TMP80:%.*]] = ptrtoint ptr addrspace(1) [[TMP78]] to i64
+; CHECK-NEXT: [[TMP81:%.*]] = lshr i64 [[TMP80]], 3
+; CHECK-NEXT: [[TMP65:%.*]] = add i64 [[TMP81]], 2147450880
+; CHECK-NEXT: [[TMP66:%.*]] = inttoptr i64 [[TMP65]] to ptr
+; CHECK-NEXT: [[TMP67:%.*]] = load i8, ptr [[TMP66]], align 1
+; CHECK-NEXT: [[TMP68:%.*]] = icmp ne i8 [[TMP67]], 0
+; CHECK-NEXT: [[TMP69:%.*]] = and i64 [[TMP80]], 7
+; CHECK-NEXT: [[TMP70:%.*]] = trunc i64 [[TMP69]] to i8
+; CHECK-NEXT: [[TMP71:%.*]] = icmp sge i8 [[TMP70]], [[TMP67]]
+; CHECK-NEXT: [[TMP72:%.*]] = and i1 [[TMP68]], [[TMP71]]
+; CHECK-NEXT: [[TMP73:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[TMP72]])
+; CHECK-NEXT: [[TMP74:%.*]] = icmp ne i64 [[TMP73]], 0
+; CHECK-NEXT: br i1 [[TMP74]], label [[ASAN_REPORT2:%.*]], label [[TMP77:%.*]], !prof [[PROF2]]
+; CHECK: asan.report2:
+; CHECK-NEXT: br i1 [[TMP72]], label [[TMP75:%.*]], label [[TMP76:%.*]]
+; CHECK: 75:
+; CHECK-NEXT: call void @__asan_report_store1(i64 [[TMP80]]) #[[ATTR6]]
+; CHECK-NEXT: call void @llvm.amdgcn.unreachable()
+; CHECK-NEXT: br label [[TMP76]]
+; CHECK: 76:
+; CHECK-NEXT: br label [[TMP77]]
+; CHECK: 77:
+; CHECK-NEXT: store i32 8, ptr addrspace(1) [[TMP44]], align 2
+; CHECK-NEXT: br label [[CONDFREE1:%.*]]
+; CHECK: CondFree:
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: br i1 [[XYZCOND]], label [[FREE:%.*]], label [[END:%.*]]
+; CHECK: Free:
+; CHECK-NEXT: [[TMP20:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT: [[TMP21:%.*]] = ptrtoint ptr [[TMP20]] to i64
+; CHECK-NEXT: [[TMP22:%.*]] = ptrtoint ptr addrspace(1) [[TMP19]] to i64
+; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP22]], i64 [[TMP21]])
+; CHECK-NEXT: br label [[END]]
+; CHECK: End:
+; CHECK-NEXT: ret void
+;
+ store i8 7, ptr addrspace(3) @lds_1, align 4
+ store i32 8, ptr addrspace(3) @lds_2, align 2
+ ret void
+}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 4, !"nosanitize_address", i32 1}
+
+;.
+; CHECK: attributes #[[ATTR0]] = { sanitize_address "amdgpu-lds-size"="8" }
+; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) }
+; CHECK: attributes #[[ATTR3:[0-9]+]] = { convergent nocallback nofree nounwind willreturn }
+; CHECK: attributes #[[ATTR4:[0-9]+]] = { convergent nocallback nofree nounwind willreturn memory(none) }
+; CHECK: attributes #[[ATTR5:[0-9]+]] = { convergent nocallback nofree nounwind }
+; CHECK: attributes #[[ATTR6]] = { nomerge }
+;.
+; CHECK: [[META0]] = !{i32 0, i32 1}
+; CHECK: [[META1:![0-9]+]] = !{i32 4, !"nosanitize_address", i32 1}
+; CHECK: [[PROF2]] = !{!"branch_weights", i32 1, i32 1048575}
+;.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-lds-test-atomic-cmpxchg-asan.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-lds-test-atomic-cmpxchg-asan.ll
new file mode 100644
index 00000000000000..02a241f947748e
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-lds-test-atomic-cmpxchg-asan.ll
@@ -0,0 +1,132 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 4
+; RUN: opt < %s -passes=amdgpu-sw-lower-lds -S -mtriple=amdgcn-amd-amdhsa | FileCheck %s
+
+ at lds_1 = internal addrspace(3) global [1 x i32] poison, align 4
+
+;.
+; CHECK: @llvm.amdgcn.sw.lds.atomic_xchg_kernel = internal addrspace(3) global ptr poison, no_sanitize_address, align 4, !absolute_symbol [[META0:![0-9]+]]
+; CHECK: @llvm.amdgcn.sw.lds.atomic_xchg_kernel.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.atomic_xchg_kernel.md.type { %llvm.amdgcn.sw.lds.atomic_xchg_kernel.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.atomic_xchg_kernel.md.item { i32 32, i32 4, i32 32 } }, no_sanitize_address
+;.
+define amdgpu_kernel void @atomic_xchg_kernel(ptr addrspace(1) %out, [8 x i32], [8 x i32], i32 %swap) sanitize_address {
+; CHECK-LABEL: define amdgpu_kernel void @atomic_xchg_kernel(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], [8 x i32] [[TMP0:%.*]], [8 x i32] [[TMP1:%.*]], i32 [[SWAP:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: WId:
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.workitem.id.z()
+; CHECK-NEXT: [[TMP5:%.*]] = or i32 [[TMP2]], [[TMP3]]
+; CHECK-NEXT: [[TMP6:%.*]] = or i32 [[TMP5]], [[TMP4]]
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[TMP6]], 0
+; CHECK-NEXT: br i1 [[TMP7]], label [[MALLOC:%.*]], label [[TMP20:%.*]]
+; CHECK: Malloc:
+; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_ATOMIC_XCHG_KERNEL_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.atomic_xchg_kernel.md, i32 0, i32 1, i32 0), align 4
+; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_ATOMIC_XCHG_KERNEL_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.atomic_xchg_kernel.md, i32 0, i32 1, i32 2), align 4
+; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[TMP8]], [[TMP9]]
+; CHECK-NEXT: [[TMP11:%.*]] = zext i32 [[TMP10]] to i64
+; CHECK-NEXT: [[TMP12:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT: [[TMP13:%.*]] = ptrtoint ptr [[TMP12]] to i64
+; CHECK-NEXT: [[TMP14:%.*]] = call i64 @__asan_malloc_impl(i64 [[TMP11]], i64 [[TMP13]])
+; CHECK-NEXT: [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr addrspace(1)
+; CHECK-NEXT: store ptr addrspace(1) [[TMP15]], ptr addrspace(3) @llvm.amdgcn.sw.lds.atomic_xchg_kernel, align 8
+; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP15]], i64 8
+; CHECK-NEXT: [[TMP17:%.*]] = ptrtoint ptr addrspace(1) [[TMP16]] to i64
+; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP17]], i64 24)
+; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP15]], i64 36
+; CHECK-NEXT: [[TMP19:%.*]] = ptrtoint ptr addrspace(1) [[TMP18]] to i64
+; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP19]], i64 28)
+; CHECK-NEXT: br label [[TMP20]]
+; CHECK: 20:
+; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ]
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: [[TMP21:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.atomic_xchg_kernel, align 8
+; CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_ATOMIC_XCHG_KERNEL_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.atomic_xchg_kernel.md, i32 0, i32 1, i32 0), align 4
+; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, ptr addrspace(3) @llvm.amdgcn.sw.lds.atomic_xchg_kernel, i32 [[TMP22]]
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(3) [[TMP23]], i32 4
+; CHECK-NEXT: [[TMP24:%.*]] = ptrtoint ptr addrspace(3) [[GEP]] to i32
+; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP21]], i32 [[TMP24]]
+; CHECK-NEXT: [[TMP26:%.*]] = ptrtoint ptr addrspace(1) [[TMP25]] to i64
+; CHECK-NEXT: [[TMP32:%.*]] = add i64 [[TMP26]], 3
+; CHECK-NEXT: [[TMP59:%.*]] = inttoptr i64 [[TMP32]] to ptr addrspace(1)
+; CHECK-NEXT: [[TMP60:%.*]] = ptrtoint ptr addrspace(1) [[TMP25]] to i64
+; CHECK-NEXT: [[TMP27:%.*]] = lshr i64 [[TMP60]], 3
+; CHECK-NEXT: [[TMP28:%.*]] = add i64 [[TMP27]], 2147450880
+; CHECK-NEXT: [[TMP29:%.*]] = inttoptr i64 [[TMP28]] to ptr
+; CHECK-NEXT: [[TMP30:%.*]] = load i8, ptr [[TMP29]], align 1
+; CHECK-NEXT: [[TMP31:%.*]] = icmp ne i8 [[TMP30]], 0
+; CHECK-NEXT: [[TMP33:%.*]] = and i64 [[TMP60]], 7
+; CHECK-NEXT: [[TMP34:%.*]] = trunc i64 [[TMP33]] to i8
+; CHECK-NEXT: [[TMP35:%.*]] = icmp sge i8 [[TMP34]], [[TMP30]]
+; CHECK-NEXT: [[TMP36:%.*]] = and i1 [[TMP31]], [[TMP35]]
+; CHECK-NEXT: [[TMP37:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[TMP36]])
+; CHECK-NEXT: [[TMP38:%.*]] = icmp ne i64 [[TMP37]], 0
+; CHECK-NEXT: br i1 [[TMP38]], label [[ASAN_REPORT:%.*]], label [[TMP41:%.*]], !prof [[PROF2:![0-9]+]]
+; CHECK: asan.report:
+; CHECK-NEXT: br i1 [[TMP36]], label [[TMP39:%.*]], label [[TMP40:%.*]]
+; CHECK: 41:
+; CHECK-NEXT: call void @__asan_report_store1(i64 [[TMP60]]) #[[ATTR6:[0-9]+]]
+; CHECK-NEXT: call void @llvm.amdgcn.unreachable()
+; CHECK-NEXT: br label [[TMP40]]
+; CHECK: 42:
+; CHECK-NEXT: br label [[TMP41]]
+; CHECK: 43:
+; CHECK-NEXT: [[TMP61:%.*]] = ptrtoint ptr addrspace(1) [[TMP59]] to i64
+; CHECK-NEXT: [[TMP62:%.*]] = lshr i64 [[TMP61]], 3
+; CHECK-NEXT: [[TMP46:%.*]] = add i64 [[TMP62]], 2147450880
+; CHECK-NEXT: [[TMP47:%.*]] = inttoptr i64 [[TMP46]] to ptr
+; CHECK-NEXT: [[TMP48:%.*]] = load i8, ptr [[TMP47]], align 1
+; CHECK-NEXT: [[TMP49:%.*]] = icmp ne i8 [[TMP48]], 0
+; CHECK-NEXT: [[TMP50:%.*]] = and i64 [[TMP61]], 7
+; CHECK-NEXT: [[TMP51:%.*]] = trunc i64 [[TMP50]] to i8
+; CHECK-NEXT: [[TMP52:%.*]] = icmp sge i8 [[TMP51]], [[TMP48]]
+; CHECK-NEXT: [[TMP53:%.*]] = and i1 [[TMP49]], [[TMP52]]
+; CHECK-NEXT: [[TMP54:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[TMP53]])
+; CHECK-NEXT: [[TMP55:%.*]] = icmp ne i64 [[TMP54]], 0
+; CHECK-NEXT: br i1 [[TMP55]], label [[ASAN_REPORT1:%.*]], label [[TMP58:%.*]], !prof [[PROF2]]
+; CHECK: asan.report1:
+; CHECK-NEXT: br i1 [[TMP53]], label [[TMP56:%.*]], label [[TMP57:%.*]]
+; CHECK: 56:
+; CHECK-NEXT: call void @__asan_report_store1(i64 [[TMP61]]) #[[ATTR6]]
+; CHECK-NEXT: call void @llvm.amdgcn.unreachable()
+; CHECK-NEXT: br label [[TMP57]]
+; CHECK: 57:
+; CHECK-NEXT: br label [[TMP58]]
+; CHECK: 58:
+; CHECK-NEXT: [[TMP42:%.*]] = cmpxchg ptr addrspace(1) [[TMP25]], i32 7, i32 [[SWAP]] seq_cst monotonic, align 4
+; CHECK-NEXT: [[RESULT:%.*]] = extractvalue { i32, i1 } [[TMP42]], 0
+; CHECK-NEXT: store i32 [[RESULT]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT: br label [[CONDFREE:%.*]]
+; CHECK: CondFree:
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: br i1 [[XYZCOND]], label [[FREE:%.*]], label [[END:%.*]]
+; CHECK: Free:
+; CHECK-NEXT: [[TMP43:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT: [[TMP44:%.*]] = ptrtoint ptr [[TMP43]] to i64
+; CHECK-NEXT: [[TMP45:%.*]] = ptrtoint ptr addrspace(1) [[TMP21]] to i64
+; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP45]], i64 [[TMP44]])
+; CHECK-NEXT: br label [[END]]
+; CHECK: End:
+; CHECK-NEXT: ret void
+;
+ %gep = getelementptr i32, ptr addrspace(3) @lds_1, i32 4
+ %pair = cmpxchg ptr addrspace(3) %gep, i32 7, i32 %swap seq_cst monotonic
+ %result = extractvalue { i32, i1 } %pair, 0
+ store i32 %result, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 4, !"nosanitize_address", i32 1}
+
+;.
+; CHECK: attributes #[[ATTR0]] = { sanitize_address "amdgpu-lds-size"="8" }
+; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) }
+; CHECK: attributes #[[ATTR3:[0-9]+]] = { convergent nocallback nofree nounwind willreturn }
+; CHECK: attributes #[[ATTR4:[0-9]+]] = { convergent nocallback nofree nounwind willreturn memory(none) }
+; CHECK: attributes #[[ATTR5:[0-9]+]] = { convergent nocallback nofree nounwind }
+; CHECK: attributes #[[ATTR6]] = { nomerge }
+;.
+; CHECK: [[META0]] = !{i32 0, i32 1}
+; CHECK: [[META1:![0-9]+]] = !{i32 4, !"nosanitize_address", i32 1}
+; CHECK: [[PROF2]] = !{!"branch_weights", i32 1, i32 1048575}
+;.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-lds-test-atomicrmw-asan.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-lds-test-atomicrmw-asan.ll
new file mode 100644
index 00000000000000..b87b3fd824dd3d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-lds-test-atomicrmw-asan.ll
@@ -0,0 +1,214 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 4
+; RUN: opt < %s -passes=amdgpu-sw-lower-lds -S -mtriple=amdgcn-amd-amdhsa | FileCheck %s
+
+ at lds_1 = internal addrspace(3) global [1 x i32] poison, align 4
+ at lds_2 = internal addrspace(3) global [1 x i32] poison, align 4
+
+; Test that atomicrmw (umin/umax) accesses to static LDS in a kernel are
+; lowered to global-memory accesses, with asan shadow checks and
+; __asan_report_store* calls emitted around each lowered access.
+;.
+; CHECK: @llvm.amdgcn.sw.lds.atomicrmw_kernel = internal addrspace(3) global ptr poison, no_sanitize_address, align 4, !absolute_symbol [[META0:![0-9]+]]
+; CHECK: @llvm.amdgcn.sw.lds.atomicrmw_kernel.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.atomicrmw_kernel.md.type { %llvm.amdgcn.sw.lds.atomicrmw_kernel.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.atomicrmw_kernel.md.item { i32 32, i32 4, i32 32 }, %llvm.amdgcn.sw.lds.atomicrmw_kernel.md.item { i32 64, i32 4, i32 32 } }, no_sanitize_address
+;.
+define amdgpu_kernel void @atomicrmw_kernel(ptr addrspace(1) %arg0) sanitize_address {
+; CHECK-LABEL: define amdgpu_kernel void @atomicrmw_kernel(
+; CHECK-SAME: ptr addrspace(1) [[ARG0:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: WId:
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[TMP26:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT: [[TMP45:%.*]] = call i32 @llvm.amdgcn.workitem.id.z()
+; CHECK-NEXT: [[TMP64:%.*]] = or i32 [[TMP0]], [[TMP26]]
+; CHECK-NEXT: [[TMP65:%.*]] = or i32 [[TMP64]], [[TMP45]]
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP65]], 0
+; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP20:%.*]]
+; CHECK: Malloc:
+; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_ATOMICRMW_KERNEL_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.atomicrmw_kernel.md, i32 0, i32 2, i32 0), align 4
+; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_ATOMICRMW_KERNEL_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.atomicrmw_kernel.md, i32 0, i32 2, i32 2), align 4
+; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP6]], [[TMP7]]
+; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP8]] to i64
+; CHECK-NEXT: [[TMP10:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64
+; CHECK-NEXT: [[TMP12:%.*]] = call i64 @__asan_malloc_impl(i64 [[TMP9]], i64 [[TMP11]])
+; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr addrspace(1)
+; CHECK-NEXT: store ptr addrspace(1) [[TMP13]], ptr addrspace(3) @llvm.amdgcn.sw.lds.atomicrmw_kernel, align 8
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP13]], i64 8
+; CHECK-NEXT: [[TMP15:%.*]] = ptrtoint ptr addrspace(1) [[TMP14]] to i64
+; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP15]], i64 24)
+; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP13]], i64 36
+; CHECK-NEXT: [[TMP17:%.*]] = ptrtoint ptr addrspace(1) [[TMP16]] to i64
+; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP17]], i64 28)
+; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP13]], i64 68
+; CHECK-NEXT: [[TMP19:%.*]] = ptrtoint ptr addrspace(1) [[TMP18]] to i64
+; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP19]], i64 28)
+; CHECK-NEXT: br label [[TMP20]]
+; CHECK: 20:
+; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ]
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: [[TMP21:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.atomicrmw_kernel, align 8
+; CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_ATOMICRMW_KERNEL_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.atomicrmw_kernel.md, i32 0, i32 1, i32 0), align 4
+; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, ptr addrspace(3) @llvm.amdgcn.sw.lds.atomicrmw_kernel, i32 [[TMP22]]
+; CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_ATOMICRMW_KERNEL_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.atomicrmw_kernel.md, i32 0, i32 2, i32 0), align 4
+; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i8, ptr addrspace(3) @llvm.amdgcn.sw.lds.atomicrmw_kernel, i32 [[TMP24]]
+; CHECK-NEXT: [[TMP1:%.*]] = load volatile i32, ptr addrspace(1) [[ARG0]], align 4
+; CHECK-NEXT: [[TMP27:%.*]] = ptrtoint ptr addrspace(3) [[TMP23]] to i32
+; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP21]], i32 [[TMP27]]
+; CHECK-NEXT: [[TMP29:%.*]] = ptrtoint ptr addrspace(1) [[TMP28]] to i64
+; CHECK-NEXT: [[TMP35:%.*]] = add i64 [[TMP29]], 3
+; CHECK-NEXT: [[TMP98:%.*]] = inttoptr i64 [[TMP35]] to ptr addrspace(1)
+; CHECK-NEXT: [[TMP99:%.*]] = ptrtoint ptr addrspace(1) [[TMP28]] to i64
+; CHECK-NEXT: [[TMP30:%.*]] = lshr i64 [[TMP99]], 3
+; CHECK-NEXT: [[TMP31:%.*]] = add i64 [[TMP30]], 2147450880
+; CHECK-NEXT: [[TMP32:%.*]] = inttoptr i64 [[TMP31]] to ptr
+; CHECK-NEXT: [[TMP33:%.*]] = load i8, ptr [[TMP32]], align 1
+; CHECK-NEXT: [[TMP34:%.*]] = icmp ne i8 [[TMP33]], 0
+; CHECK-NEXT: [[TMP36:%.*]] = and i64 [[TMP99]], 7
+; CHECK-NEXT: [[TMP37:%.*]] = trunc i64 [[TMP36]] to i8
+; CHECK-NEXT: [[TMP38:%.*]] = icmp sge i8 [[TMP37]], [[TMP33]]
+; CHECK-NEXT: [[TMP39:%.*]] = and i1 [[TMP34]], [[TMP38]]
+; CHECK-NEXT: [[TMP40:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[TMP39]])
+; CHECK-NEXT: [[TMP41:%.*]] = icmp ne i64 [[TMP40]], 0
+; CHECK-NEXT: br i1 [[TMP41]], label [[ASAN_REPORT:%.*]], label [[TMP44:%.*]], !prof [[PROF2:![0-9]+]]
+; CHECK: asan.report:
+; CHECK-NEXT: br i1 [[TMP39]], label [[TMP42:%.*]], label [[TMP43:%.*]]
+; CHECK: 44:
+; CHECK-NEXT: call void @__asan_report_store1(i64 [[TMP99]]) #[[ATTR6:[0-9]+]]
+; CHECK-NEXT: call void @llvm.amdgcn.unreachable()
+; CHECK-NEXT: br label [[TMP43]]
+; CHECK: 45:
+; CHECK-NEXT: br label [[TMP44]]
+; CHECK: 46:
+; CHECK-NEXT: [[TMP100:%.*]] = ptrtoint ptr addrspace(1) [[TMP98]] to i64
+; CHECK-NEXT: [[TMP101:%.*]] = lshr i64 [[TMP100]], 3
+; CHECK-NEXT: [[TMP102:%.*]] = add i64 [[TMP101]], 2147450880
+; CHECK-NEXT: [[TMP103:%.*]] = inttoptr i64 [[TMP102]] to ptr
+; CHECK-NEXT: [[TMP104:%.*]] = load i8, ptr [[TMP103]], align 1
+; CHECK-NEXT: [[TMP105:%.*]] = icmp ne i8 [[TMP104]], 0
+; CHECK-NEXT: [[TMP106:%.*]] = and i64 [[TMP100]], 7
+; CHECK-NEXT: [[TMP54:%.*]] = trunc i64 [[TMP106]] to i8
+; CHECK-NEXT: [[TMP107:%.*]] = icmp sge i8 [[TMP54]], [[TMP104]]
+; CHECK-NEXT: [[TMP108:%.*]] = and i1 [[TMP105]], [[TMP107]]
+; CHECK-NEXT: [[TMP109:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[TMP108]])
+; CHECK-NEXT: [[TMP110:%.*]] = icmp ne i64 [[TMP109]], 0
+; CHECK-NEXT: br i1 [[TMP110]], label [[ASAN_REPORT1:%.*]], label [[TMP111:%.*]], !prof [[PROF2]]
+; CHECK: asan.report1:
+; CHECK-NEXT: br i1 [[TMP108]], label [[TMP112:%.*]], label [[TMP113:%.*]]
+; CHECK: 59:
+; CHECK-NEXT: call void @__asan_report_store1(i64 [[TMP100]]) #[[ATTR6]]
+; CHECK-NEXT: call void @llvm.amdgcn.unreachable()
+; CHECK-NEXT: br label [[TMP113]]
+; CHECK: 60:
+; CHECK-NEXT: br label [[TMP111]]
+; CHECK: 61:
+; CHECK-NEXT: [[TMP2:%.*]] = atomicrmw umin ptr addrspace(1) [[TMP28]], i32 [[TMP1]] seq_cst, align 4
+; CHECK-NEXT: [[TMP46:%.*]] = ptrtoint ptr addrspace(3) [[TMP23]] to i32
+; CHECK-NEXT: [[TMP47:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP21]], i32 [[TMP46]]
+; CHECK-NEXT: [[TMP48:%.*]] = ptrtoint ptr addrspace(1) [[TMP47]] to i64
+; CHECK-NEXT: [[TMP114:%.*]] = add i64 [[TMP48]], 3
+; CHECK-NEXT: [[TMP115:%.*]] = inttoptr i64 [[TMP114]] to ptr addrspace(1)
+; CHECK-NEXT: [[TMP116:%.*]] = ptrtoint ptr addrspace(1) [[TMP47]] to i64
+; CHECK-NEXT: [[TMP49:%.*]] = lshr i64 [[TMP116]], 3
+; CHECK-NEXT: [[TMP50:%.*]] = add i64 [[TMP49]], 2147450880
+; CHECK-NEXT: [[TMP51:%.*]] = inttoptr i64 [[TMP50]] to ptr
+; CHECK-NEXT: [[TMP52:%.*]] = load i8, ptr [[TMP51]], align 1
+; CHECK-NEXT: [[TMP53:%.*]] = icmp ne i8 [[TMP52]], 0
+; CHECK-NEXT: [[TMP55:%.*]] = and i64 [[TMP116]], 7
+; CHECK-NEXT: [[TMP56:%.*]] = trunc i64 [[TMP55]] to i8
+; CHECK-NEXT: [[TMP57:%.*]] = icmp sge i8 [[TMP56]], [[TMP52]]
+; CHECK-NEXT: [[TMP58:%.*]] = and i1 [[TMP53]], [[TMP57]]
+; CHECK-NEXT: [[TMP59:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[TMP58]])
+; CHECK-NEXT: [[TMP60:%.*]] = icmp ne i64 [[TMP59]], 0
+; CHECK-NEXT: br i1 [[TMP60]], label [[ASAN_REPORT2:%.*]], label [[TMP63:%.*]], !prof [[PROF2]]
+; CHECK: asan.report2:
+; CHECK-NEXT: br i1 [[TMP58]], label [[TMP61:%.*]], label [[TMP62:%.*]]
+; CHECK: 80:
+; CHECK-NEXT: call void @__asan_report_store1(i64 [[TMP116]]) #[[ATTR6]]
+; CHECK-NEXT: call void @llvm.amdgcn.unreachable()
+; CHECK-NEXT: br label [[TMP62]]
+; CHECK: 81:
+; CHECK-NEXT: br label [[TMP63]]
+; CHECK: 82:
+; CHECK-NEXT: [[TMP117:%.*]] = ptrtoint ptr addrspace(1) [[TMP115]] to i64
+; CHECK-NEXT: [[TMP118:%.*]] = lshr i64 [[TMP117]], 3
+; CHECK-NEXT: [[TMP119:%.*]] = add i64 [[TMP118]], 2147450880
+; CHECK-NEXT: [[TMP120:%.*]] = inttoptr i64 [[TMP119]] to ptr
+; CHECK-NEXT: [[TMP87:%.*]] = load i8, ptr [[TMP120]], align 1
+; CHECK-NEXT: [[TMP88:%.*]] = icmp ne i8 [[TMP87]], 0
+; CHECK-NEXT: [[TMP89:%.*]] = and i64 [[TMP117]], 7
+; CHECK-NEXT: [[TMP90:%.*]] = trunc i64 [[TMP89]] to i8
+; CHECK-NEXT: [[TMP91:%.*]] = icmp sge i8 [[TMP90]], [[TMP87]]
+; CHECK-NEXT: [[TMP92:%.*]] = and i1 [[TMP88]], [[TMP91]]
+; CHECK-NEXT: [[TMP93:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[TMP92]])
+; CHECK-NEXT: [[TMP94:%.*]] = icmp ne i64 [[TMP93]], 0
+; CHECK-NEXT: br i1 [[TMP94]], label [[ASAN_REPORT3:%.*]], label [[TMP97:%.*]], !prof [[PROF2]]
+; CHECK: asan.report3:
+; CHECK-NEXT: br i1 [[TMP92]], label [[TMP95:%.*]], label [[TMP96:%.*]]
+; CHECK: 95:
+; CHECK-NEXT: call void @__asan_report_store1(i64 [[TMP117]]) #[[ATTR6]]
+; CHECK-NEXT: call void @llvm.amdgcn.unreachable()
+; CHECK-NEXT: br label [[TMP96]]
+; CHECK: 96:
+; CHECK-NEXT: br label [[TMP97]]
+; CHECK: 97:
+; CHECK-NEXT: [[TMP3:%.*]] = atomicrmw umax ptr addrspace(1) [[TMP47]], i32 [[TMP1]] seq_cst, align 4
+; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP2]], [[TMP3]]
+; CHECK-NEXT: [[TMP66:%.*]] = ptrtoint ptr addrspace(3) [[TMP25]] to i32
+; CHECK-NEXT: [[TMP67:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP21]], i32 [[TMP66]]
+; CHECK-NEXT: [[TMP68:%.*]] = ptrtoint ptr addrspace(1) [[TMP67]] to i64
+; CHECK-NEXT: [[TMP69:%.*]] = lshr i64 [[TMP68]], 3
+; CHECK-NEXT: [[TMP70:%.*]] = add i64 [[TMP69]], 2147450880
+; CHECK-NEXT: [[TMP71:%.*]] = inttoptr i64 [[TMP70]] to ptr
+; CHECK-NEXT: [[TMP72:%.*]] = load i8, ptr [[TMP71]], align 1
+; CHECK-NEXT: [[TMP73:%.*]] = icmp ne i8 [[TMP72]], 0
+; CHECK-NEXT: [[TMP74:%.*]] = and i64 [[TMP68]], 7
+; CHECK-NEXT: [[TMP75:%.*]] = add i64 [[TMP74]], 3
+; CHECK-NEXT: [[TMP76:%.*]] = trunc i64 [[TMP75]] to i8
+; CHECK-NEXT: [[TMP77:%.*]] = icmp sge i8 [[TMP76]], [[TMP72]]
+; CHECK-NEXT: [[TMP78:%.*]] = and i1 [[TMP73]], [[TMP77]]
+; CHECK-NEXT: [[TMP79:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[TMP78]])
+; CHECK-NEXT: [[TMP80:%.*]] = icmp ne i64 [[TMP79]], 0
+; CHECK-NEXT: br i1 [[TMP80]], label [[ASAN_REPORT4:%.*]], label [[TMP83:%.*]], !prof [[PROF2]]
+; CHECK: asan.report4:
+; CHECK-NEXT: br i1 [[TMP78]], label [[TMP81:%.*]], label [[TMP82:%.*]]
+; CHECK: 115:
+; CHECK-NEXT: call void @__asan_report_store4(i64 [[TMP68]]) #[[ATTR6]]
+; CHECK-NEXT: call void @llvm.amdgcn.unreachable()
+; CHECK-NEXT: br label [[TMP82]]
+; CHECK: 116:
+; CHECK-NEXT: br label [[TMP83]]
+; CHECK: 117:
+; CHECK-NEXT: store i32 [[TMP4]], ptr addrspace(1) [[TMP67]], align 4
+; CHECK-NEXT: br label [[CONDFREE:%.*]]
+; CHECK: CondFree:
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: br i1 [[XYZCOND]], label [[FREE:%.*]], label [[END:%.*]]
+; CHECK: Free:
+; CHECK-NEXT: [[TMP84:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT: [[TMP85:%.*]] = ptrtoint ptr [[TMP84]] to i64
+; CHECK-NEXT: [[TMP86:%.*]] = ptrtoint ptr addrspace(1) [[TMP21]] to i64
+; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP86]], i64 [[TMP85]])
+; CHECK-NEXT: br label [[END]]
+; CHECK: End:
+; CHECK-NEXT: ret void
+;
+ ; Input (pre-lowering) body: atomicrmw umin/umax on @lds_1; the sum of the
+ ; two results is stored to @lds_2.
+ %1 = load volatile i32, ptr addrspace(1) %arg0
+ %2 = atomicrmw umin ptr addrspace(3) @lds_1, i32 %1 seq_cst
+ %3 = atomicrmw umax ptr addrspace(3) @lds_1, i32 %1 seq_cst
+ %4 = add i32 %2, %3
+ store i32 %4, ptr addrspace(3) @lds_2, align 4
+ ret void
+}
+; NOTE(review): module flag presumably keeps asan from re-instrumenting the
+; pass-generated no_sanitize_address globals -- confirm against the pass.
+!llvm.module.flags = !{!0}
+!0 = !{i32 4, !"nosanitize_address", i32 1}
+
+;.
+; CHECK: attributes #[[ATTR0]] = { sanitize_address "amdgpu-lds-size"="8" }
+; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) }
+; CHECK: attributes #[[ATTR3:[0-9]+]] = { convergent nocallback nofree nounwind willreturn }
+; CHECK: attributes #[[ATTR4:[0-9]+]] = { convergent nocallback nofree nounwind willreturn memory(none) }
+; CHECK: attributes #[[ATTR5:[0-9]+]] = { convergent nocallback nofree nounwind }
+; CHECK: attributes #[[ATTR6]] = { nomerge }
+;.
+; CHECK: [[META0]] = !{i32 0, i32 1}
+; CHECK: [[META1:![0-9]+]] = !{i32 4, !"nosanitize_address", i32 1}
+; CHECK: [[PROF2]] = !{!"branch_weights", i32 1, i32 1048575}
+;.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-lds-test.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-lds-test.ll
new file mode 100644
index 00000000000000..806a4aa70edcfa
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-lds-test.ll
@@ -0,0 +1,85 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 4
+; RUN: opt < %s -passes=amdgpu-sw-lower-lds -S -amdgpu-asan-instrument-lds=false -mtriple=amdgcn-amd-amdhsa | FileCheck %s
+; With -amdgpu-asan-instrument-lds=false the lowered LDS accesses themselves
+; carry no shadow-memory checks; only the malloc/poison/free bookkeeping
+; around the kernel remains instrumented.
+
+; Test to check if static LDS accesses in kernel are lowered correctly.
+ at lds_1 = internal addrspace(3) global [1 x i8] poison, align 4
+ at lds_2 = internal addrspace(3) global [1 x i32] poison, align 8
+
+;.
+; CHECK: @llvm.amdgcn.sw.lds.k0 = internal addrspace(3) global ptr poison, no_sanitize_address, align 8, !absolute_symbol [[META0:![0-9]+]]
+; CHECK: @llvm.amdgcn.sw.lds.k0.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.k0.md.type { %llvm.amdgcn.sw.lds.k0.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 32, i32 1, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 64, i32 4, i32 32 } }, no_sanitize_address
+;.
+define amdgpu_kernel void @k0() sanitize_address {
+; CHECK-LABEL: define amdgpu_kernel void @k0(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: WId:
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z()
+; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
+; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP7:%.*]]
+; CHECK: Malloc:
+; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 2, i32 0), align 4
+; CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 2, i32 2), align 4
+; CHECK-NEXT: [[TMP16:%.*]] = add i32 [[TMP13]], [[TMP14]]
+; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP16]] to i64
+; CHECK-NEXT: [[TMP23:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP23]] to i64
+; CHECK-NEXT: [[TMP12:%.*]] = call i64 @__asan_malloc_impl(i64 [[TMP15]], i64 [[TMP11]])
+; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP12]] to ptr addrspace(1)
+; CHECK-NEXT: store ptr addrspace(1) [[TMP6]], ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, align 8
+; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP6]], i64 8
+; CHECK-NEXT: [[TMP41:%.*]] = ptrtoint ptr addrspace(1) [[TMP25]] to i64
+; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP41]], i64 24)
+; CHECK-NEXT: [[TMP61:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP6]], i64 33
+; CHECK-NEXT: [[TMP62:%.*]] = ptrtoint ptr addrspace(1) [[TMP61]] to i64
+; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP62]], i64 31)
+; CHECK-NEXT: [[TMP63:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP6]], i64 68
+; CHECK-NEXT: [[TMP64:%.*]] = ptrtoint ptr addrspace(1) [[TMP63]] to i64
+; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP64]], i64 28)
+; CHECK-NEXT: br label [[TMP7]]
+; CHECK: 20:
+; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ]
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: [[TMP19:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, align 8
+; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 1, i32 0), align 4
+; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, i32 [[TMP10]]
+; CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 2, i32 0), align 4
+; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, i32 [[TMP17]]
+; CHECK-NEXT: [[TMP26:%.*]] = ptrtoint ptr addrspace(3) [[TMP18]] to i32
+; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP19]], i32 [[TMP26]]
+; CHECK-NEXT: store i8 7, ptr addrspace(1) [[TMP27]], align 4
+; CHECK-NEXT: [[TMP43:%.*]] = ptrtoint ptr addrspace(3) [[TMP24]] to i32
+; CHECK-NEXT: [[TMP44:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP19]], i32 [[TMP43]]
+; CHECK-NEXT: store i32 8, ptr addrspace(1) [[TMP44]], align 2
+; CHECK-NEXT: br label [[CONDFREE1:%.*]]
+; CHECK: CondFree:
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: br i1 [[XYZCOND]], label [[FREE:%.*]], label [[END:%.*]]
+; CHECK: Free:
+; CHECK-NEXT: [[TMP20:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT: [[TMP21:%.*]] = ptrtoint ptr [[TMP20]] to i64
+; CHECK-NEXT: [[TMP22:%.*]] = ptrtoint ptr addrspace(1) [[TMP19]] to i64
+; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP22]], i64 [[TMP21]])
+; CHECK-NEXT: br label [[END]]
+; CHECK: End:
+; CHECK-NEXT: ret void
+;
+ ; Input (pre-lowering) body: plain stores to the two static LDS globals.
+ store i8 7, ptr addrspace(3) @lds_1, align 4
+ store i32 8, ptr addrspace(3) @lds_2, align 2
+ ret void
+}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 4, !"nosanitize_address", i32 1}
+
+;.
+; CHECK: attributes #[[ATTR0]] = { sanitize_address "amdgpu-lds-size"="8" }
+; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) }
+; CHECK: attributes #[[ATTR3:[0-9]+]] = { convergent nocallback nofree nounwind willreturn }
+;.
+; CHECK: [[META0]] = !{i32 0, i32 1}
+;.
More information about the llvm-commits
mailing list