[llvm] [AMDGPU][ASAN] Handle special GVs lowering in amdgpu-sw-lower-lds (PR #161827)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Oct 3 08:30:25 PDT 2025
https://github.com/skc7 updated https://github.com/llvm/llvm-project/pull/161827
>From 99a4a7d3bb6b46768e558bbc830574324d85d5a5 Mon Sep 17 00:00:00 2001
From: skc7 <Krishna.Sankisa at amd.com>
Date: Fri, 3 Oct 2025 13:25:55 +0530
Subject: [PATCH 1/2] [AMDGPU][ASAN] Handle special GVs in amdgpu-sw-lower-lds
---
.../AMDGPU/AMDGPULowerModuleLDSPass.cpp | 133 ----------
llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp | 138 ++++++++++
llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.h | 16 ++
llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp | 31 ++-
...w-lower-lds-named-barrier-and-other-lds.ll | 241 ++++++++++++++++++
.../amdgpu-sw-lower-lds-named-barrier.ll | 113 ++++++++
6 files changed, 525 insertions(+), 147 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-named-barrier-and-other-lds.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-named-barrier.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
index f01d5f6726822..3036b992bdcc9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -511,19 +511,6 @@ class AMDGPULowerModuleLDS {
return MostUsed.GV;
}
- static void recordLDSAbsoluteAddress(Module *M, GlobalVariable *GV,
- uint32_t Address) {
- // Write the specified address into metadata where it can be retrieved by
- // the assembler. Format is a half open range, [Address Address+1)
- LLVMContext &Ctx = M->getContext();
- auto *IntTy =
- M->getDataLayout().getIntPtrType(Ctx, AMDGPUAS::LOCAL_ADDRESS);
- auto *MinC = ConstantAsMetadata::get(ConstantInt::get(IntTy, Address));
- auto *MaxC = ConstantAsMetadata::get(ConstantInt::get(IntTy, Address + 1));
- GV->setMetadata(LLVMContext::MD_absolute_symbol,
- MDNode::get(Ctx, {MinC, MaxC}));
- }
-
DenseMap<Function *, Value *> tableKernelIndexCache;
Value *getTableLookupKernelIndex(Module &M, Function *F) {
// Accesses from a function use the amdgcn_lds_kernel_id intrinsic which
@@ -922,126 +909,6 @@ class AMDGPULowerModuleLDS {
return KernelToCreatedDynamicLDS;
}
- static GlobalVariable *uniquifyGVPerKernel(Module &M, GlobalVariable *GV,
- Function *KF) {
- bool NeedsReplacement = false;
- for (Use &U : GV->uses()) {
- if (auto *I = dyn_cast<Instruction>(U.getUser())) {
- Function *F = I->getFunction();
- if (isKernelLDS(F) && F != KF) {
- NeedsReplacement = true;
- break;
- }
- }
- }
- if (!NeedsReplacement)
- return GV;
- // Create a new GV used only by this kernel and its function
- GlobalVariable *NewGV = new GlobalVariable(
- M, GV->getValueType(), GV->isConstant(), GV->getLinkage(),
- GV->getInitializer(), GV->getName() + "." + KF->getName(), nullptr,
- GV->getThreadLocalMode(), GV->getType()->getAddressSpace());
- NewGV->copyAttributesFrom(GV);
- for (Use &U : make_early_inc_range(GV->uses())) {
- if (auto *I = dyn_cast<Instruction>(U.getUser())) {
- Function *F = I->getFunction();
- if (!isKernelLDS(F) || F == KF) {
- U.getUser()->replaceUsesOfWith(GV, NewGV);
- }
- }
- }
- return NewGV;
- }
-
- bool lowerSpecialLDSVariables(
- Module &M, LDSUsesInfoTy &LDSUsesInfo,
- VariableFunctionMap &LDSToKernelsThatNeedToAccessItIndirectly) {
- bool Changed = false;
- const DataLayout &DL = M.getDataLayout();
- // The 1st round: give module-absolute assignments
- int NumAbsolutes = 0;
- std::vector<GlobalVariable *> OrderedGVs;
- for (auto &K : LDSToKernelsThatNeedToAccessItIndirectly) {
- GlobalVariable *GV = K.first;
- if (!isNamedBarrier(*GV))
- continue;
- // give a module-absolute assignment if it is indirectly accessed by
- // multiple kernels. This is not precise, but we don't want to duplicate
- // a function when it is called by multiple kernels.
- if (LDSToKernelsThatNeedToAccessItIndirectly[GV].size() > 1) {
- OrderedGVs.push_back(GV);
- } else {
- // leave it to the 2nd round, which will give a kernel-relative
- // assignment if it is only indirectly accessed by one kernel
- LDSUsesInfo.direct_access[*K.second.begin()].insert(GV);
- }
- LDSToKernelsThatNeedToAccessItIndirectly.erase(GV);
- }
- OrderedGVs = sortByName(std::move(OrderedGVs));
- for (GlobalVariable *GV : OrderedGVs) {
- unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;
- unsigned BarId = NumAbsolutes + 1;
- unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16;
- NumAbsolutes += BarCnt;
-
- // 4 bits for alignment, 5 bits for the barrier num,
- // 3 bits for the barrier scope
- unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4;
- recordLDSAbsoluteAddress(&M, GV, Offset);
- }
- OrderedGVs.clear();
-
- // The 2nd round: give a kernel-relative assignment for GV that
- // either only indirectly accessed by single kernel or only directly
- // accessed by multiple kernels.
- std::vector<Function *> OrderedKernels;
- for (auto &K : LDSUsesInfo.direct_access) {
- Function *F = K.first;
- assert(isKernelLDS(F));
- OrderedKernels.push_back(F);
- }
- OrderedKernels = sortByName(std::move(OrderedKernels));
-
- llvm::DenseMap<Function *, uint32_t> Kernel2BarId;
- for (Function *F : OrderedKernels) {
- for (GlobalVariable *GV : LDSUsesInfo.direct_access[F]) {
- if (!isNamedBarrier(*GV))
- continue;
-
- LDSUsesInfo.direct_access[F].erase(GV);
- if (GV->isAbsoluteSymbolRef()) {
- // already assigned
- continue;
- }
- OrderedGVs.push_back(GV);
- }
- OrderedGVs = sortByName(std::move(OrderedGVs));
- for (GlobalVariable *GV : OrderedGVs) {
- // GV could also be used directly by other kernels. If so, we need to
- // create a new GV used only by this kernel and its function.
- auto NewGV = uniquifyGVPerKernel(M, GV, F);
- Changed |= (NewGV != GV);
- unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;
- unsigned BarId = Kernel2BarId[F];
- BarId += NumAbsolutes + 1;
- unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16;
- Kernel2BarId[F] += BarCnt;
- unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4;
- recordLDSAbsoluteAddress(&M, NewGV, Offset);
- }
- OrderedGVs.clear();
- }
- // Also erase those special LDS variables from indirect_access.
- for (auto &K : LDSUsesInfo.indirect_access) {
- assert(isKernelLDS(K.first));
- for (GlobalVariable *GV : K.second) {
- if (isNamedBarrier(*GV))
- K.second.erase(GV);
- }
- }
- return Changed;
- }
-
bool runOnModule(Module &M) {
CallGraph CG = CallGraph(M);
bool Changed = superAlignLDSGlobals(M);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp
index e17c2113ca398..0a204d2bd0021 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp
@@ -439,4 +439,142 @@ bool isClobberedInFunction(const LoadInst *Load, MemorySSA *MSSA,
return false;
}
+GlobalVariable *uniquifyGVPerKernel(Module &M, GlobalVariable *GV,
+ Function *KF) {
+ bool NeedsReplacement = false;
+ for (Use &U : GV->uses()) {
+ if (auto *I = dyn_cast<Instruction>(U.getUser())) {
+ Function *F = I->getFunction();
+ if (isKernelLDS(F) && F != KF) {
+ NeedsReplacement = true;
+ break;
+ }
+ }
+ }
+ if (!NeedsReplacement)
+ return GV;
+ // Create a new GV used only by this kernel and its function
+ GlobalVariable *NewGV = new GlobalVariable(
+ M, GV->getValueType(), GV->isConstant(), GV->getLinkage(),
+ GV->getInitializer(), GV->getName() + "." + KF->getName(), nullptr,
+ GV->getThreadLocalMode(), GV->getType()->getAddressSpace());
+ NewGV->copyAttributesFrom(GV);
+ for (Use &U : make_early_inc_range(GV->uses())) {
+ if (auto *I = dyn_cast<Instruction>(U.getUser())) {
+ Function *F = I->getFunction();
+ if (!isKernelLDS(F) || F == KF) {
+ U.getUser()->replaceUsesOfWith(GV, NewGV);
+ }
+ }
+ }
+ return NewGV;
+}
+
+template <typename T> std::vector<T> sortByName(std::vector<T> &&V) {
+ llvm::sort(V, [](const auto *L, const auto *R) {
+ return L->getName() < R->getName();
+ });
+ return {std::move(V)};
+}
+
+void recordLDSAbsoluteAddress(Module *M, GlobalVariable *GV, uint32_t Address) {
+ // Write the specified address into metadata where it can be retrieved by
+ // the assembler. Format is a half open range, [Address Address+1)
+ LLVMContext &Ctx = M->getContext();
+ auto *IntTy = M->getDataLayout().getIntPtrType(Ctx, AMDGPUAS::LOCAL_ADDRESS);
+ auto *MinC = ConstantAsMetadata::get(ConstantInt::get(IntTy, Address));
+ auto *MaxC = ConstantAsMetadata::get(ConstantInt::get(IntTy, Address + 1));
+ GV->setMetadata(LLVMContext::MD_absolute_symbol,
+ MDNode::get(Ctx, {MinC, MaxC}));
+}
+
+bool lowerSpecialLDSVariables(
+ Module &M, LDSUsesInfoTy &LDSUsesInfo,
+ VariableFunctionMap &LDSToKernelsThatNeedToAccessItIndirectly) {
+ bool Changed = false;
+ const DataLayout &DL = M.getDataLayout();
+ // The 1st round: give module-absolute assignments
+ int NumAbsolutes = 0;
+ std::vector<GlobalVariable *> OrderedGVs;
+ for (auto &K : LDSToKernelsThatNeedToAccessItIndirectly) {
+ GlobalVariable *GV = K.first;
+ if (!isNamedBarrier(*GV))
+ continue;
+ // give a module-absolute assignment if it is indirectly accessed by
+ // multiple kernels. This is not precise, but we don't want to duplicate
+ // a function when it is called by multiple kernels.
+ if (LDSToKernelsThatNeedToAccessItIndirectly[GV].size() > 1) {
+ OrderedGVs.push_back(GV);
+ } else {
+ // leave it to the 2nd round, which will give a kernel-relative
+ // assignment if it is only indirectly accessed by one kernel
+ LDSUsesInfo.direct_access[*K.second.begin()].insert(GV);
+ }
+ LDSToKernelsThatNeedToAccessItIndirectly.erase(GV);
+ }
+ OrderedGVs = sortByName(std::move(OrderedGVs));
+ for (GlobalVariable *GV : OrderedGVs) {
+ unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;
+ unsigned BarId = NumAbsolutes + 1;
+ unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16;
+ NumAbsolutes += BarCnt;
+
+ // 4 bits for alignment, 5 bits for the barrier num,
+ // 3 bits for the barrier scope
+ unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4;
+ recordLDSAbsoluteAddress(&M, GV, Offset);
+ }
+ OrderedGVs.clear();
+
+ // The 2nd round: give a kernel-relative assignment for GV that
+ // either only indirectly accessed by single kernel or only directly
+ // accessed by multiple kernels.
+ std::vector<Function *> OrderedKernels;
+ for (auto &K : LDSUsesInfo.direct_access) {
+ Function *F = K.first;
+ assert(isKernelLDS(F));
+ OrderedKernels.push_back(F);
+ }
+ OrderedKernels = sortByName(std::move(OrderedKernels));
+
+ llvm::DenseMap<Function *, uint32_t> Kernel2BarId;
+ for (Function *F : OrderedKernels) {
+ for (GlobalVariable *GV : LDSUsesInfo.direct_access[F]) {
+ if (!isNamedBarrier(*GV))
+ continue;
+
+ LDSUsesInfo.direct_access[F].erase(GV);
+ if (GV->isAbsoluteSymbolRef()) {
+ // already assigned
+ continue;
+ }
+ OrderedGVs.push_back(GV);
+ }
+ OrderedGVs = sortByName(std::move(OrderedGVs));
+ for (GlobalVariable *GV : OrderedGVs) {
+ // GV could also be used directly by other kernels. If so, we need to
+ // create a new GV used only by this kernel and its function.
+ auto NewGV = uniquifyGVPerKernel(M, GV, F);
+ Changed |= (NewGV != GV);
+ unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;
+ unsigned BarId = Kernel2BarId[F];
+ BarId += NumAbsolutes + 1;
+ unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16;
+ Kernel2BarId[F] += BarCnt;
+ unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4;
+ recordLDSAbsoluteAddress(&M, NewGV, Offset);
+ }
+ OrderedGVs.clear();
+ }
+ // Also erase those special LDS variables from indirect_access.
+ for (auto &K : LDSUsesInfo.indirect_access) {
+ assert(isKernelLDS(K.first));
+ for (GlobalVariable *GV : K.second) {
+ if (isNamedBarrier(*GV))
+ K.second.erase(GV);
+ }
+ }
+ return Changed;
+}
+
} // end namespace llvm::AMDGPU
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.h b/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.h
index 058e74452573c..f867c695fa77f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.h
@@ -71,6 +71,22 @@ bool isReallyAClobber(const Value *Ptr, MemoryDef *Def, AAResults *AA);
bool isClobberedInFunction(const LoadInst *Load, MemorySSA *MSSA,
AAResults *AA);
+/// Create a new global variable in \p M based on \p GV, but uniquified for
+/// \p KF. The new global variable will have the same properties as \p GV, but
+/// will have a name based on \p GV and \p KF to ensure uniqueness.
+GlobalVariable *uniquifyGVPerKernel(Module &M, GlobalVariable *GV,
+ Function *KF);
+
+/// Record the absolute address of \p GV in the module flag metadata of \p M.
+void recordLDSAbsoluteAddress(Module *M, GlobalVariable *GV, uint32_t Address);
+
+/// Lower special LDS variables i.e named barriers in \p M.
+/// Update \p LDSUsesInfo and \p LDSToKernelsThatNeedToAccessItIndirectly
+/// to reflect any changes made.
+bool lowerSpecialLDSVariables(
+ Module &M, LDSUsesInfoTy &LDSUsesInfo,
+ VariableFunctionMap &LDSToKernelsThatNeedToAccessItIndirectly);
+
} // end namespace AMDGPU
} // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp
index 4a9437b37aa39..c4f37992b6cee 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp
@@ -304,18 +304,6 @@ void AMDGPUSwLowerLDS::getUsesOfLDSByNonKernels() {
}
}
-static void recordLDSAbsoluteAddress(Module &M, GlobalVariable *GV,
- uint32_t Address) {
- // Write the specified address into metadata where it can be retrieved by
- // the assembler. Format is a half open range, [Address Address+1)
- LLVMContext &Ctx = M.getContext();
- auto *IntTy = M.getDataLayout().getIntPtrType(Ctx, AMDGPUAS::LOCAL_ADDRESS);
- MDBuilder MDB(Ctx);
- MDNode *MetadataNode = MDB.createRange(ConstantInt::get(IntTy, Address),
- ConstantInt::get(IntTy, Address + 1));
- GV->setMetadata(LLVMContext::MD_absolute_symbol, MetadataNode);
-}
-
static void addLDSSizeAttribute(Function *Func, uint32_t Offset,
bool IsDynLDS) {
if (Offset != 0) {
@@ -378,10 +366,10 @@ void AMDGPUSwLowerLDS::populateSwLDSAttributeAndMetadata(Function *Func) {
auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
bool IsDynLDSUsed = LDSParams.SwDynLDS;
uint32_t Offset = LDSParams.LDSSize;
- recordLDSAbsoluteAddress(M, LDSParams.SwLDS, 0);
+ recordLDSAbsoluteAddress(&M, LDSParams.SwLDS, 0);
addLDSSizeAttribute(Func, Offset, IsDynLDSUsed);
if (LDSParams.SwDynLDS)
- recordLDSAbsoluteAddress(M, LDSParams.SwDynLDS, Offset);
+ recordLDSAbsoluteAddress(&M, LDSParams.SwDynLDS, Offset);
}
void AMDGPUSwLowerLDS::populateSwMetadataGlobal(Function *Func) {
@@ -1161,6 +1149,21 @@ bool AMDGPUSwLowerLDS::run() {
if (!LowerAllLDS)
return Changed;
+ // Lower special LDS variables like named barriers.
+ if (LDSUsesInfo.HasSpecialGVs) {
+ // For each variable accessed through callees, which kernels access it
+ VariableFunctionMap LDSToKernelsThatNeedToAccessItIndirectly;
+ for (auto &K : LDSUsesInfo.indirect_access) {
+ Function *F = K.first;
+ assert(isKernelLDS(F));
+ for (GlobalVariable *GV : K.second) {
+ LDSToKernelsThatNeedToAccessItIndirectly[GV].insert(F);
+ }
+ }
+ Changed |= lowerSpecialLDSVariables(
+ M, LDSUsesInfo, LDSToKernelsThatNeedToAccessItIndirectly);
+ }
+
// Utility to group LDS access into direct, indirect, static and dynamic.
auto PopulateKernelStaticDynamicLDS = [&](FunctionVariableMap &LDSAccesses,
bool DirectAccess) {
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-named-barrier-and-other-lds.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-named-barrier-and-other-lds.ll
new file mode 100644
index 0000000000000..357f034ef447c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-named-barrier-and-other-lds.ll
@@ -0,0 +1,241 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --check-globals all --version 6
+; RUN: opt < %s -passes=amdgpu-sw-lower-lds -S -amdgpu-asan-instrument-lds=false -mtriple=amdgcn-amd-amdhsa | FileCheck %s
+; RUN: llc < %s -enable-new-pm -stop-after=amdgpu-sw-lower-lds -amdgpu-asan-instrument-lds=false -mtriple=amdgcn-amd-amdhsa | FileCheck %s
+
+; This test checks that named barriers in LDS and other LDS in moduleare lowered correctly by the
+; amdgpu-sw-lower-lds pass.
+ at bar2 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison
+ at bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison
+ at bar1 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison
+ at lds_1 = internal addrspace(3) global [1 x i8] poison, align 4
+
+;.
+; CHECK: @bar2 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol [[META0:![0-9]+]]
+; CHECK: @bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol [[META1:![0-9]+]]
+; CHECK: @bar1 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol [[META2:![0-9]+]]
+; CHECK: @bar1.kernel1 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol [[META2]]
+; CHECK: @llvm.amdgcn.sw.lds.kernel1 = internal addrspace(3) global ptr poison, no_sanitize_address, align 4, !absolute_symbol [[META3:![0-9]+]]
+; CHECK: @llvm.amdgcn.sw.lds.kernel1.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.kernel1.md.type { %llvm.amdgcn.sw.lds.kernel1.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.kernel1.md.item { i32 32, i32 1, i32 32 } }, no_sanitize_address
+; @llvm.amdgcn.sw.lds.kernel2 = internal addrspace(3) global ptr poison, no_sanitize_address, align 4, !absolute_symbol [[META3]]
+; @llvm.amdgcn.sw.lds.kernel2.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.kernel2.md.type { %llvm.amdgcn.sw.lds.kernel2.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.kernel2.md.item { i32 32, i32 1, i32 32 } }, no_sanitize_address
+; @llvm.amdgcn.sw.lds.base.table = internal addrspace(1) constant [2 x ptr addrspace(3)] [ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel1, ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel2], no_sanitize_address
+; @llvm.amdgcn.sw.lds.offset.table = internal addrspace(1) constant [2 x [1 x ptr addrspace(1)]] [[1 x ptr addrspace(1)] [ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.kernel1.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel1.md, i32 0, i32 1, i32 0)], [1 x ptr addrspace(1)] [ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.kernel2.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel2.md, i32 0, i32 1, i32 0)]], no_sanitize_address
+;.
+define void @func1() {
+; CHECK-LABEL: define void @func1() {
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id()
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x ptr addrspace(3)], ptr addrspace(1) @llvm.amdgcn.sw.lds.base.table, i32 0, i32 [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(3), ptr addrspace(1) [[TMP2]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(3) [[TMP3]], align 8
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x [1 x ptr addrspace(1)]], ptr addrspace(1) @llvm.amdgcn.sw.lds.offset.table, i32 0, i32 [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(1) [[TMP5]], align 8
+; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(1) [[TMP6]], align 4
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP3]], i32 [[TMP7]]
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar3, i32 7)
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar3)
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 1)
+; CHECK-NEXT: [[TMP9:%.*]] = ptrtoint ptr addrspace(3) [[TMP8]] to i32
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP4]], i32 [[TMP9]]
+; CHECK-NEXT: store i8 1, ptr addrspace(1) [[TMP10]], align 4
+; CHECK-NEXT: ret void
+;
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar3, i32 7)
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar3)
+ call void @llvm.amdgcn.s.barrier.wait(i16 1)
+ store i8 1, ptr addrspace(3) @lds_1, align 4
+ ret void
+}
+
+define void @func2() {
+; CHECK-LABEL: define void @func2() {
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id()
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x ptr addrspace(3)], ptr addrspace(1) @llvm.amdgcn.sw.lds.base.table, i32 0, i32 [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(3), ptr addrspace(1) [[TMP2]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(3) [[TMP3]], align 8
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x [1 x ptr addrspace(1)]], ptr addrspace(1) @llvm.amdgcn.sw.lds.offset.table, i32 0, i32 [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(1) [[TMP5]], align 8
+; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(1) [[TMP6]], align 4
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP3]], i32 [[TMP7]]
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7)
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2)
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 1)
+; CHECK-NEXT: [[TMP9:%.*]] = ptrtoint ptr addrspace(3) [[TMP8]] to i32
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP4]], i32 [[TMP9]]
+; CHECK-NEXT: store i8 2, ptr addrspace(1) [[TMP10]], align 4
+; CHECK-NEXT: ret void
+;
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7)
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2)
+ call void @llvm.amdgcn.s.barrier.wait(i16 1)
+ store i8 2, ptr addrspace(3) @lds_1, align 4
+ ret void
+}
+
+define amdgpu_kernel void @kernel1() #0 {
+; CHECK: Function Attrs: nounwind sanitize_address
+; CHECK-LABEL: define amdgpu_kernel void @kernel1(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META5:![0-9]+]] {
+; CHECK-NEXT: [[WID:.*]]:
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z()
+; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
+; CHECK-NEXT: br i1 [[TMP5]], label %[[MALLOC:.*]], label %[[BB18:.*]]
+; CHECK: [[MALLOC]]:
+; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL1_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel1.md, i32 0, i32 1, i32 0), align 4
+; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL1_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel1.md, i32 0, i32 1, i32 2), align 4
+; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP6]], [[TMP7]]
+; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP8]] to i64
+; CHECK-NEXT: [[TMP10:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64
+; CHECK-NEXT: [[TMP12:%.*]] = call i64 @__asan_malloc_impl(i64 [[TMP9]], i64 [[TMP11]])
+; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr addrspace(1)
+; CHECK-NEXT: store ptr addrspace(1) [[TMP13]], ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel1, align 8
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP13]], i64 8
+; CHECK-NEXT: [[TMP15:%.*]] = ptrtoint ptr addrspace(1) [[TMP14]] to i64
+; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP15]], i64 24)
+; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP13]], i64 33
+; CHECK-NEXT: [[TMP17:%.*]] = ptrtoint ptr addrspace(1) [[TMP16]] to i64
+; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP17]], i64 31)
+; CHECK-NEXT: br label %[[BB18]]
+; CHECK: [[BB18]]:
+; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, %[[WID:.*]] ], [ true, %[[MALLOC:.*]] ]
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: [[TMP19:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel1, align 8
+; CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL1_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel1.md, i32 0, i32 1, i32 0), align 4
+; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel1, i32 [[TMP20]]
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1.kernel1, i32 11)
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1.kernel1)
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 1)
+; CHECK-NEXT: call void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3) @bar1.kernel1)
+; CHECK-NEXT: [[STATE:%.*]] = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) @bar1.kernel1)
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: call void @func1()
+; CHECK-NEXT: call void @func2()
+; CHECK-NEXT: [[TMP22:%.*]] = ptrtoint ptr addrspace(3) [[TMP21]] to i32
+; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP19]], i32 [[TMP22]]
+; CHECK-NEXT: store i8 7, ptr addrspace(1) [[TMP23]], align 4
+; CHECK-NEXT: br label %[[CONDFREE:.*]]
+; CHECK: [[CONDFREE]]:
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: br i1 [[XYZCOND]], label %[[FREE:.*]], label %[[END:.*]]
+; CHECK: [[FREE]]:
+; CHECK-NEXT: [[TMP24:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT: [[TMP25:%.*]] = ptrtoint ptr [[TMP24]] to i64
+; CHECK-NEXT: [[TMP26:%.*]] = ptrtoint ptr addrspace(1) [[TMP19]] to i64
+; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP26]], i64 [[TMP25]])
+; CHECK-NEXT: br label %[[END]]
+; CHECK: [[END]]:
+; CHECK-NEXT: ret void
+;
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 11)
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1)
+ call void @llvm.amdgcn.s.barrier.wait(i16 1)
+ call void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3) @bar1)
+ %state = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) @bar1)
+ call void @llvm.amdgcn.s.barrier()
+ call void @func1()
+ call void @func2()
+ store i8 7, ptr addrspace(3) @lds_1, align 4
+ ret void
+}
+
+define amdgpu_kernel void @kernel2() #0 {
+; CHECK: Function Attrs: nounwind sanitize_address
+; CHECK-LABEL: define amdgpu_kernel void @kernel2(
+; CHECK-SAME: ) #[[ATTR0]] !llvm.amdgcn.lds.kernel.id [[META6:![0-9]+]] {
+; CHECK-NEXT: [[WID:.*]]:
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z()
+; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
+; CHECK-NEXT: br i1 [[TMP5]], label %[[MALLOC:.*]], label %[[BB18:.*]]
+; CHECK: [[MALLOC]]:
+; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL2_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel2.md, i32 0, i32 1, i32 0), align 4
+; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL2_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel2.md, i32 0, i32 1, i32 2), align 4
+; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP6]], [[TMP7]]
+; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP8]] to i64
+; CHECK-NEXT: [[TMP10:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64
+; CHECK-NEXT: [[TMP12:%.*]] = call i64 @__asan_malloc_impl(i64 [[TMP9]], i64 [[TMP11]])
+; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr addrspace(1)
+; CHECK-NEXT: store ptr addrspace(1) [[TMP13]], ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel2, align 8
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP13]], i64 8
+; CHECK-NEXT: [[TMP15:%.*]] = ptrtoint ptr addrspace(1) [[TMP14]] to i64
+; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP15]], i64 24)
+; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP13]], i64 33
+; CHECK-NEXT: [[TMP17:%.*]] = ptrtoint ptr addrspace(1) [[TMP16]] to i64
+; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP17]], i64 31)
+; CHECK-NEXT: br label %[[BB18]]
+; CHECK: [[BB18]]:
+; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, %[[WID:.*]] ], [ true, %[[MALLOC:.*]] ]
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: [[TMP19:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel2, align 8
+; CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL2_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel2.md, i32 0, i32 1, i32 0), align 4
+; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel2, i32 [[TMP20]]
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 9)
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1)
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 1)
+; CHECK-NEXT: call void @func2()
+; CHECK-NEXT: [[TMP22:%.*]] = ptrtoint ptr addrspace(3) [[TMP21]] to i32
+; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP19]], i32 [[TMP22]]
+; CHECK-NEXT: store i8 8, ptr addrspace(1) [[TMP23]], align 4
+; CHECK-NEXT: br label %[[CONDFREE:.*]]
+; CHECK: [[CONDFREE]]:
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: br i1 [[XYZCOND]], label %[[FREE:.*]], label %[[END:.*]]
+; CHECK: [[FREE]]:
+; CHECK-NEXT: [[TMP24:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT: [[TMP25:%.*]] = ptrtoint ptr [[TMP24]] to i64
+; CHECK-NEXT: [[TMP26:%.*]] = ptrtoint ptr addrspace(1) [[TMP19]] to i64
+; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP26]], i64 [[TMP25]])
+; CHECK-NEXT: br label %[[END]]
+; CHECK: [[END]]:
+; CHECK-NEXT: ret void
+;
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 9)
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1)
+ call void @llvm.amdgcn.s.barrier.wait(i16 1)
+
+ call void @func2()
+ store i8 8, ptr addrspace(3) @lds_1, align 4
+ ret void
+}
+
+declare void @llvm.amdgcn.s.barrier() #1
+declare void @llvm.amdgcn.s.barrier.wait(i16) #1
+declare void @llvm.amdgcn.s.barrier.signal(i32) #1
+declare void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3), i32) #1
+declare i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32) #1
+declare void @llvm.amdgcn.s.barrier.init(ptr addrspace(3), i32) #1
+declare void @llvm.amdgcn.s.barrier.join(ptr addrspace(3)) #1
+declare void @llvm.amdgcn.s.barrier.leave(i16) #1
+declare void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3)) #1
+declare i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3)) #1
+
+attributes #0 = { nounwind sanitize_address }
+attributes #1 = { convergent nounwind }
+attributes #2 = { nounwind readnone }
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 4, !"nosanitize_address", i32 1}
+
+;.
+; CHECK: attributes #[[ATTR0]] = { nounwind sanitize_address "amdgpu-lds-size"="8" }
+; CHECK: attributes #[[ATTR1:[0-9]+]] = { convergent nocallback nofree nounwind willreturn }
+; CHECK: attributes #[[ATTR2:[0-9]+]] = { convergent nounwind }
+; CHECK: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+; CHECK: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) }
+;.
+; CHECK: [[META0]] = !{i32 8396816, i32 8396817}
+; CHECK: [[META1]] = !{i32 8396848, i32 8396849}
+; CHECK: [[META2]] = !{i32 8396832, i32 8396833}
+; CHECK: [[META3]] = !{i32 0, i32 1}
+; CHECK: [[META4:![0-9]+]] = !{i32 4, !"nosanitize_address", i32 1}
+; CHECK: [[META5]] = !{i32 0}
+; CHECK: [[META6]] = !{i32 1}
+;.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-named-barrier.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-named-barrier.ll
new file mode 100644
index 0000000000000..6601fbc97dfc9
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-named-barrier.ll
@@ -0,0 +1,113 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --check-globals all --version 6
+; RUN: opt < %s -passes=amdgpu-sw-lower-lds -S -amdgpu-asan-instrument-lds=false -mtriple=amdgcn-amd-amdhsa | FileCheck %s
+; RUN: llc < %s -enable-new-pm -stop-after=amdgpu-sw-lower-lds -amdgpu-asan-instrument-lds=false -mtriple=amdgcn-amd-amdhsa | FileCheck %s
+
+; This test checks that named barriers in LDS are lowered correctly by the
+; amdgpu-sw-lower-lds pass.
+ at bar2 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison
+ at bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison
+ at bar1 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison
+
+;.
+; CHECK: @bar2 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol [[META0:![0-9]+]]
+; CHECK: @bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol [[META1:![0-9]+]]
+; CHECK: @bar1 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol [[META2:![0-9]+]]
+; CHECK: @bar1.kernel1 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol [[META2]]
+;.
+define void @func1() {
+; CHECK-LABEL: define void @func1() {
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar3, i32 7)
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar3)
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 1)
+; CHECK-NEXT: ret void
+;
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar3, i32 7)
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar3)
+ call void @llvm.amdgcn.s.barrier.wait(i16 1)
+ ret void
+}
+
+define void @func2() {
+; CHECK-LABEL: define void @func2() {
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7)
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2)
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 1)
+; CHECK-NEXT: ret void
+;
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7)
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2)
+ call void @llvm.amdgcn.s.barrier.wait(i16 1)
+ ret void
+}
+
+define amdgpu_kernel void @kernel1() #0 {
+; CHECK: Function Attrs: nounwind sanitize_address
+; CHECK-LABEL: define amdgpu_kernel void @kernel1(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1.kernel1, i32 11)
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1.kernel1)
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 1)
+; CHECK-NEXT: call void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3) @bar1.kernel1)
+; CHECK-NEXT: [[STATE:%.*]] = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) @bar1.kernel1)
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: call void @func1()
+; CHECK-NEXT: call void @func2()
+; CHECK-NEXT: ret void
+;
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 11)
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1)
+ call void @llvm.amdgcn.s.barrier.wait(i16 1)
+ call void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3) @bar1)
+ %state = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) @bar1)
+ call void @llvm.amdgcn.s.barrier()
+ call void @func1()
+ call void @func2()
+ ret void
+}
+
+define amdgpu_kernel void @kernel2() #0 {
+; CHECK: Function Attrs: nounwind sanitize_address
+; CHECK-LABEL: define amdgpu_kernel void @kernel2(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 9)
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1)
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 1)
+; CHECK-NEXT: call void @func2()
+; CHECK-NEXT: ret void
+;
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 9)
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1)
+ call void @llvm.amdgcn.s.barrier.wait(i16 1)
+
+ call void @func2()
+ ret void
+}
+
+declare void @llvm.amdgcn.s.barrier() #1
+declare void @llvm.amdgcn.s.barrier.wait(i16) #1
+declare void @llvm.amdgcn.s.barrier.signal(i32) #1
+declare void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3), i32) #1
+declare i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32) #1
+declare void @llvm.amdgcn.s.barrier.init(ptr addrspace(3), i32) #1
+declare void @llvm.amdgcn.s.barrier.join(ptr addrspace(3)) #1
+declare void @llvm.amdgcn.s.barrier.leave(i16) #1
+declare void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3)) #1
+declare i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3)) #1
+
+attributes #0 = { nounwind sanitize_address }
+attributes #1 = { convergent nounwind }
+attributes #2 = { nounwind readnone }
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 4, !"nosanitize_address", i32 1}
+
+;.
+; CHECK: attributes #[[ATTR0]] = { nounwind sanitize_address }
+; CHECK: attributes #[[ATTR1:[0-9]+]] = { convergent nocallback nofree nounwind willreturn }
+; CHECK: attributes #[[ATTR2:[0-9]+]] = { convergent nounwind }
+;.
+; CHECK: [[META0]] = !{i32 8396816, i32 8396817}
+; CHECK: [[META1]] = !{i32 8396848, i32 8396849}
+; CHECK: [[META2]] = !{i32 8396832, i32 8396833}
+; CHECK: [[META3:![0-9]+]] = !{i32 4, !"nosanitize_address", i32 1}
+;.
>From 2ae4c6e9d1c655f44ac68c2850f370d0848d5872 Mon Sep 17 00:00:00 2001
From: skc7 <Krishna.Sankisa at amd.com>
Date: Fri, 3 Oct 2025 20:48:17 +0530
Subject: [PATCH 2/2] Fix typo in test
---
.../AMDGPU/amdgpu-sw-lower-lds-named-barrier-and-other-lds.ll | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-named-barrier-and-other-lds.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-named-barrier-and-other-lds.ll
index 357f034ef447c..970f101ae5bbe 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-named-barrier-and-other-lds.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-named-barrier-and-other-lds.ll
@@ -2,7 +2,7 @@
; RUN: opt < %s -passes=amdgpu-sw-lower-lds -S -amdgpu-asan-instrument-lds=false -mtriple=amdgcn-amd-amdhsa | FileCheck %s
; RUN: llc < %s -enable-new-pm -stop-after=amdgpu-sw-lower-lds -amdgpu-asan-instrument-lds=false -mtriple=amdgcn-amd-amdhsa | FileCheck %s
-; This test checks that named barriers in LDS and other LDS in moduleare lowered correctly by the
+; This test checks that named barriers in LDS and other LDS in module are lowered correctly by the
; amdgpu-sw-lower-lds pass.
@bar2 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison
@bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison
More information about the llvm-commits
mailing list