[llvm] [AMDGPU] Add amdgpu-lower-special-lds pass to lower named-barrier LDS (PR #165692)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Nov 5 01:51:50 PST 2025
https://github.com/skc7 updated https://github.com/llvm/llvm-project/pull/165692
>From e380c9dce13100b377a40e9d6570ceeb29d3002a Mon Sep 17 00:00:00 2001
From: skc7 <Krishna.Sankisa at amd.com>
Date: Thu, 30 Oct 2025 14:51:45 +0530
Subject: [PATCH 1/3] [AMDGPU] Add amdgpu-lower-special-lds pass to lower
named-barrier LDS
---
llvm/lib/Target/AMDGPU/AMDGPU.h | 9 +
.../Target/AMDGPU/AMDGPULowerSpecialLDS.cpp | 231 ++++++++++++++++++
llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def | 1 +
.../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 1 +
llvm/lib/Target/AMDGPU/CMakeLists.txt | 1 +
.../AMDGPU/amdgpu-lower-special-lds.ll | 67 +++++
6 files changed, 310 insertions(+)
create mode 100644 llvm/lib/Target/AMDGPU/AMDGPULowerSpecialLDS.cpp
create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-lower-special-lds.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index cd8b2495a4250..d878cbfce07f1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -298,6 +298,15 @@ struct AMDGPUAlwaysInlinePass : PassInfoMixin<AMDGPUAlwaysInlinePass> {
bool GlobalOpt;
};
+void initializeAMDGPULowerSpecialLDSLegacyPass(PassRegistry &);
+extern char &AMDGPULowerSpecialLDSLegacyPassID;
+ModulePass *createAMDGPULowerSpecialLDSLegacyPass();
+
+struct AMDGPULowerSpecialLDSPass : PassInfoMixin<AMDGPULowerSpecialLDSPass> {
+ AMDGPULowerSpecialLDSPass() {}
+ PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+};
+
void initializeAMDGPUSwLowerLDSLegacyPass(PassRegistry &);
extern char &AMDGPUSwLowerLDSLegacyPassID;
ModulePass *
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerSpecialLDS.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerSpecialLDS.cpp
new file mode 100644
index 0000000000000..56161dacc49e7
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerSpecialLDS.cpp
@@ -0,0 +1,231 @@
+//===-- AMDGPULowerSpecialLDS.cpp -----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass lowers the named barriers LDS globals which needs
+// special address assignment.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUMemoryUtils.h"
+#include "AMDGPUTargetMachine.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/ReplaceConstant.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+
+#include <algorithm>
+
+#define DEBUG_TYPE "amdgpu-lower-special-lds"
+
+using namespace llvm;
+using namespace AMDGPU;
+
+namespace {
+
+static GlobalVariable *uniquifyGVPerKernel(Module &M, GlobalVariable *GV,
+ Function *KF) {
+ bool NeedsReplacement = false;
+ for (Use &U : GV->uses()) {
+ if (auto *I = dyn_cast<Instruction>(U.getUser())) {
+ Function *F = I->getFunction();
+ if (isKernelLDS(F) && F != KF) {
+ NeedsReplacement = true;
+ break;
+ }
+ }
+ }
+ if (!NeedsReplacement)
+ return GV;
+ // Create a new GV used only by this kernel and its function
+ GlobalVariable *NewGV = new GlobalVariable(
+ M, GV->getValueType(), GV->isConstant(), GV->getLinkage(),
+ GV->getInitializer(), GV->getName() + "." + KF->getName(), nullptr,
+ GV->getThreadLocalMode(), GV->getType()->getAddressSpace());
+ NewGV->copyAttributesFrom(GV);
+ for (Use &U : make_early_inc_range(GV->uses())) {
+ if (auto *I = dyn_cast<Instruction>(U.getUser())) {
+ Function *F = I->getFunction();
+ if (!isKernelLDS(F) || F == KF) {
+ U.getUser()->replaceUsesOfWith(GV, NewGV);
+ }
+ }
+ }
+ return NewGV;
+}
+
+static void recordLDSAbsoluteAddress(Module *M, GlobalVariable *GV,
+ uint32_t Address) {
+ // Write the specified address into metadata where it can be retrieved by
+ // the assembler. Format is a half open range, [Address Address+1)
+ LLVMContext &Ctx = M->getContext();
+ auto *IntTy = M->getDataLayout().getIntPtrType(Ctx, AMDGPUAS::LOCAL_ADDRESS);
+ auto *MinC = ConstantAsMetadata::get(ConstantInt::get(IntTy, Address));
+ auto *MaxC = ConstantAsMetadata::get(ConstantInt::get(IntTy, Address + 1));
+ GV->setMetadata(LLVMContext::MD_absolute_symbol,
+ MDNode::get(Ctx, {MinC, MaxC}));
+}
+
+template <typename T> std::vector<T> sortByName(std::vector<T> &&V) {
+ llvm::sort(V, [](const auto *L, const auto *R) {
+ return L->getName() < R->getName();
+ });
+ return {std::move(V)};
+}
+
+bool lowerSpecialLDSVariables(
+ Module &M, LDSUsesInfoTy &LDSUsesInfo,
+ VariableFunctionMap &LDSToKernelsThatNeedToAccessItIndirectly) {
+ bool Changed = false;
+ const DataLayout &DL = M.getDataLayout();
+ // The 1st round: give module-absolute assignments
+ int NumAbsolutes = 0;
+ std::vector<GlobalVariable *> OrderedGVs;
+ for (auto &K : LDSToKernelsThatNeedToAccessItIndirectly) {
+ GlobalVariable *GV = K.first;
+ if (!isNamedBarrier(*GV))
+ continue;
+ // give a module-absolute assignment if it is indirectly accessed by
+ // multiple kernels. This is not precise, but we don't want to duplicate
+ // a function when it is called by multiple kernels.
+ if (LDSToKernelsThatNeedToAccessItIndirectly[GV].size() > 1) {
+ OrderedGVs.push_back(GV);
+ } else {
+ // leave it to the 2nd round, which will give a kernel-relative
+ // assignment if it is only indirectly accessed by one kernel
+ LDSUsesInfo.direct_access[*K.second.begin()].insert(GV);
+ }
+ LDSToKernelsThatNeedToAccessItIndirectly.erase(GV);
+ }
+ OrderedGVs = sortByName(std::move(OrderedGVs));
+ for (GlobalVariable *GV : OrderedGVs) {
+ unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;
+ unsigned BarId = NumAbsolutes + 1;
+ unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16;
+ NumAbsolutes += BarCnt;
+
+ // 4 bits for alignment, 5 bits for the barrier num,
+ // 3 bits for the barrier scope
+ unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4;
+ recordLDSAbsoluteAddress(&M, GV, Offset);
+ }
+ OrderedGVs.clear();
+
+ // The 2nd round: give a kernel-relative assignment for GV that
+ // either only indirectly accessed by single kernel or only directly
+ // accessed by multiple kernels.
+ std::vector<Function *> OrderedKernels;
+ for (auto &K : LDSUsesInfo.direct_access) {
+ Function *F = K.first;
+ assert(isKernelLDS(F));
+ OrderedKernels.push_back(F);
+ }
+ OrderedKernels = sortByName(std::move(OrderedKernels));
+
+ llvm::DenseMap<Function *, uint32_t> Kernel2BarId;
+ for (Function *F : OrderedKernels) {
+ for (GlobalVariable *GV : LDSUsesInfo.direct_access[F]) {
+ if (!isNamedBarrier(*GV))
+ continue;
+
+ LDSUsesInfo.direct_access[F].erase(GV);
+ if (GV->isAbsoluteSymbolRef()) {
+ // already assigned
+ continue;
+ }
+ OrderedGVs.push_back(GV);
+ }
+ OrderedGVs = sortByName(std::move(OrderedGVs));
+ for (GlobalVariable *GV : OrderedGVs) {
+ // GV could also be used directly by other kernels. If so, we need to
+ // create a new GV used only by this kernel and its function.
+ auto NewGV = uniquifyGVPerKernel(M, GV, F);
+ Changed |= (NewGV != GV);
+ unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;
+ unsigned BarId = Kernel2BarId[F];
+ BarId += NumAbsolutes + 1;
+ unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16;
+ Kernel2BarId[F] += BarCnt;
+ unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4;
+ recordLDSAbsoluteAddress(&M, NewGV, Offset);
+ }
+ OrderedGVs.clear();
+ }
+ // Also erase those special LDS variables from indirect_access.
+ for (auto &K : LDSUsesInfo.indirect_access) {
+ assert(isKernelLDS(K.first));
+ for (GlobalVariable *GV : K.second) {
+ if (isNamedBarrier(*GV))
+ K.second.erase(GV);
+ }
+ }
+ return Changed;
+}
+
+bool runLowerSpecialLDS(Module &M) {
+ CallGraph CG = CallGraph(M);
+ bool Changed = false;
+ Changed |= eliminateConstantExprUsesOfLDSFromAllInstructions(M);
+
+ // For each kernel, what variables does it access directly or through
+ // callees
+ LDSUsesInfoTy LDSUsesInfo = getTransitiveUsesOfLDS(CG, M);
+
+ // For each variable accessed through callees, which kernels access it
+ VariableFunctionMap LDSToKernelsThatNeedToAccessItIndirectly;
+ for (auto &K : LDSUsesInfo.indirect_access) {
+ Function *F = K.first;
+ assert(isKernelLDS(F));
+ for (GlobalVariable *GV : K.second) {
+ LDSToKernelsThatNeedToAccessItIndirectly[GV].insert(F);
+ }
+ }
+
+ if (LDSUsesInfo.HasSpecialGVs) {
+ // Special LDS variables need special address assignment
+ Changed |= lowerSpecialLDSVariables(
+ M, LDSUsesInfo, LDSToKernelsThatNeedToAccessItIndirectly);
+ }
+ return Changed;
+}
+
+class AMDGPULowerSpecialLDSLegacy : public ModulePass {
+public:
+ static char ID;
+ AMDGPULowerSpecialLDSLegacy() : ModulePass(ID) {}
+ bool runOnModule(Module &M) override;
+};
+} // namespace
+
+char AMDGPULowerSpecialLDSLegacy::ID = 0;
+char &llvm::AMDGPULowerSpecialLDSLegacyPassID = AMDGPULowerSpecialLDSLegacy::ID;
+
+INITIALIZE_PASS_BEGIN(AMDGPULowerSpecialLDSLegacy, DEBUG_TYPE,
+ "AMDGPU lowering of special LDS variables", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
+INITIALIZE_PASS_END(AMDGPULowerSpecialLDSLegacy, DEBUG_TYPE,
+ "AMDGPU lowering of special LDS variables", false, false)
+
+bool AMDGPULowerSpecialLDSLegacy::runOnModule(Module &M) {
+ return runLowerSpecialLDS(M);
+}
+
+ModulePass *llvm::createAMDGPULowerSpecialLDSLegacyPass() {
+ return new AMDGPULowerSpecialLDSLegacy();
+}
+
+PreservedAnalyses AMDGPULowerSpecialLDSPass::run(Module &M,
+ ModuleAnalysisManager &AM) {
+ return runLowerSpecialLDS(M) ? PreservedAnalyses::none()
+ : PreservedAnalyses::all();
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index bf6f1a9dbf576..a2fd53ac1b8ef 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -29,6 +29,7 @@ MODULE_PASS("amdgpu-perf-hint",
MODULE_PASS("amdgpu-preload-kernel-arguments", AMDGPUPreloadKernelArgumentsPass(*this))
MODULE_PASS("amdgpu-printf-runtime-binding", AMDGPUPrintfRuntimeBindingPass())
MODULE_PASS("amdgpu-remove-incompatible-functions", AMDGPURemoveIncompatibleFunctionsPass(*this))
+MODULE_PASS("amdgpu-lower-special-lds", AMDGPULowerSpecialLDSPass())
MODULE_PASS("amdgpu-sw-lower-lds", AMDGPUSwLowerLDSPass(*this))
#undef MODULE_PASS
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 75a94ac891819..916826ea169aa 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -567,6 +567,7 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeSILoadStoreOptimizerLegacyPass(*PR);
initializeAMDGPUCtorDtorLoweringLegacyPass(*PR);
initializeAMDGPUAlwaysInlinePass(*PR);
+ initializeAMDGPULowerSpecialLDSLegacyPass(*PR);
initializeAMDGPUSwLowerLDSLegacyPass(*PR);
initializeAMDGPUAnnotateUniformValuesLegacyPass(*PR);
initializeAMDGPUArgumentUsageInfoPass(*PR);
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index a1e0e5293c706..c401926e22a87 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -81,6 +81,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPULowerKernelAttributes.cpp
AMDGPULowerModuleLDSPass.cpp
AMDGPUPrepareAGPRAlloc.cpp
+ AMDGPULowerSpecialLDS.cpp
AMDGPUSwLowerLDS.cpp
AMDGPUMachineFunction.cpp
AMDGPUMachineModuleInfo.cpp
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-lower-special-lds.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-special-lds.ll
new file mode 100644
index 0000000000000..28d94f3d42622
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-special-lds.ll
@@ -0,0 +1,67 @@
+; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-special-lds < %s 2>&1 | FileCheck %s
+
+%class.ExpAmdWorkgroupWaveBarrier = type { target("amdgcn.named.barrier", 0) }
+
+ at bar2 = internal addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] poison
+ at bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison
+ at bar1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison
+
+; CHECK: @bar2 = internal addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] poison, !absolute_symbol !0
+; CHECK-NEXT: @bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol !1
+; CHECK-NEXT: @bar1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison, !absolute_symbol !2
+; CHECK-NEXT: @bar1.kernel1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison, !absolute_symbol !2
+
+define void @func1() {
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar3, i32 7)
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar3)
+ call void @llvm.amdgcn.s.barrier.wait(i16 1)
+ ret void
+}
+
+define void @func2() {
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7)
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2)
+ call void @llvm.amdgcn.s.barrier.wait(i16 1)
+ ret void
+}
+
+define amdgpu_kernel void @kernel1() #0 {
+; CHECK-DAG: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1.kernel1, i32 11)
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 11)
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1)
+ call void @llvm.amdgcn.s.barrier.wait(i16 1)
+ %state = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) @bar1)
+ call void @llvm.amdgcn.s.barrier()
+ call void @func1()
+ call void @func2()
+ ret void
+}
+
+define amdgpu_kernel void @kernel2() #0 {
+; CHECK-DAG: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 9)
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 9)
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1)
+ call void @llvm.amdgcn.s.barrier.wait(i16 1)
+
+ call void @func2()
+ ret void
+}
+
+declare void @llvm.amdgcn.s.barrier() #1
+declare void @llvm.amdgcn.s.barrier.wait(i16) #1
+declare void @llvm.amdgcn.s.barrier.signal(i32) #1
+declare void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3), i32) #1
+declare i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32) #1
+declare void @llvm.amdgcn.s.barrier.init(ptr addrspace(3), i32) #1
+declare void @llvm.amdgcn.s.barrier.join(ptr addrspace(3)) #1
+declare void @llvm.amdgcn.s.barrier.leave(i16) #1
+declare void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3)) #1
+declare i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3)) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { convergent nounwind }
+attributes #2 = { nounwind readnone }
+
+; CHECK: !0 = !{i32 8396816, i32 8396817}
+; CHECK-NEXT: !1 = !{i32 8396912, i32 8396913}
+; CHECK-NEXT: !2 = !{i32 8396848, i32 8396849}
>From b8f7658ed345674560f9aca73b50ada60dfb43f3 Mon Sep 17 00:00:00 2001
From: skc7 <Krishna.Sankisa at amd.com>
Date: Mon, 3 Nov 2025 12:43:02 +0530
Subject: [PATCH 2/3] Add comments
---
llvm/lib/Target/AMDGPU/AMDGPULowerSpecialLDS.cpp | 13 ++++++++-----
1 file changed, 8 insertions(+), 5 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerSpecialLDS.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerSpecialLDS.cpp
index 56161dacc49e7..5534a3ba6382e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerSpecialLDS.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerSpecialLDS.cpp
@@ -18,7 +18,6 @@
#include "llvm/Analysis/CallGraph.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Constants.h"
-#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/ReplaceConstant.h"
#include "llvm/InitializePasses.h"
@@ -33,6 +32,8 @@ using namespace AMDGPU;
namespace {
+// If GV is also used directly by other kernels, create a new GV used only by
+// this kernel and by the non-kernel functions that reference GV.
static GlobalVariable *uniquifyGVPerKernel(Module &M, GlobalVariable *GV,
Function *KF) {
bool NeedsReplacement = false;
@@ -64,10 +65,10 @@ static GlobalVariable *uniquifyGVPerKernel(Module &M, GlobalVariable *GV,
return NewGV;
}
+// Write the specified address into metadata where it can be retrieved by
+// the assembler. Format is a half-open range, [Address, Address+1).
static void recordLDSAbsoluteAddress(Module *M, GlobalVariable *GV,
uint32_t Address) {
- // Write the specified address into metadata where it can be retrieved by
- // the assembler. Format is a half open range, [Address Address+1)
LLVMContext &Ctx = M->getContext();
auto *IntTy = M->getDataLayout().getIntPtrType(Ctx, AMDGPUAS::LOCAL_ADDRESS);
auto *MinC = ConstantAsMetadata::get(ConstantInt::get(IntTy, Address));
@@ -83,7 +84,8 @@ template <typename T> std::vector<T> sortByName(std::vector<T> &&V) {
return {std::move(V)};
}
-bool lowerSpecialLDSVariables(
+// Main utility function for special LDS variables lowering.
+static bool lowerSpecialLDSVariables(
Module &M, LDSUsesInfoTy &LDSUsesInfo,
VariableFunctionMap &LDSToKernelsThatNeedToAccessItIndirectly) {
bool Changed = false;
@@ -172,7 +174,7 @@ bool lowerSpecialLDSVariables(
return Changed;
}
-bool runLowerSpecialLDS(Module &M) {
+static bool runLowerSpecialLDS(Module &M) {
CallGraph CG = CallGraph(M);
bool Changed = false;
Changed |= eliminateConstantExprUsesOfLDSFromAllInstructions(M);
@@ -205,6 +207,7 @@ class AMDGPULowerSpecialLDSLegacy : public ModulePass {
AMDGPULowerSpecialLDSLegacy() : ModulePass(ID) {}
bool runOnModule(Module &M) override;
};
+
} // namespace
char AMDGPULowerSpecialLDSLegacy::ID = 0;
>From 63646d8e24b03441a0dd8fc627e7d6ea406fe430 Mon Sep 17 00:00:00 2001
From: skc7 <Krishna.Sankisa at amd.com>
Date: Wed, 5 Nov 2025 15:19:31 +0530
Subject: [PATCH 3/3] Elaborate description and namespace changes
---
.../Target/AMDGPU/AMDGPULowerSpecialLDS.cpp | 22 +++++++++++--------
1 file changed, 13 insertions(+), 9 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerSpecialLDS.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerSpecialLDS.cpp
index 5534a3ba6382e..ae869a841f7f0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerSpecialLDS.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerSpecialLDS.cpp
@@ -6,8 +6,12 @@
//
//===----------------------------------------------------------------------===//
//
-// This pass lowers the named barriers LDS globals which needs
-// special address assignment.
+// This pass performs lowering of LDS global variables with target extension
+// type "amdgcn.named.barrier" that require specialized address assignment. It
+// assigns a unique barrier identifier to each named-barrier LDS variable and
+// encodes this identifier within the !absolute_symbol metadata of that global.
+// This encoding ensures that subsequent LDS lowering passes can process these
+// barriers correctly without conflicts.
//
//===----------------------------------------------------------------------===//
@@ -77,8 +81,8 @@ static void recordLDSAbsoluteAddress(Module *M, GlobalVariable *GV,
MDNode::get(Ctx, {MinC, MaxC}));
}
-template <typename T> std::vector<T> sortByName(std::vector<T> &&V) {
- llvm::sort(V, [](const auto *L, const auto *R) {
+template <typename T> SmallVector<T> sortByName(SmallVector<T> &&V) {
+ sort(V, [](const auto *L, const auto *R) {
return L->getName() < R->getName();
});
return {std::move(V)};
@@ -92,7 +96,7 @@ static bool lowerSpecialLDSVariables(
const DataLayout &DL = M.getDataLayout();
// The 1st round: give module-absolute assignments
int NumAbsolutes = 0;
- std::vector<GlobalVariable *> OrderedGVs;
+ SmallVector<GlobalVariable *> OrderedGVs;
for (auto &K : LDSToKernelsThatNeedToAccessItIndirectly) {
GlobalVariable *GV = K.first;
if (!isNamedBarrier(*GV))
@@ -111,7 +115,7 @@ static bool lowerSpecialLDSVariables(
}
OrderedGVs = sortByName(std::move(OrderedGVs));
for (GlobalVariable *GV : OrderedGVs) {
- unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;
+ unsigned BarrierScope = AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;
unsigned BarId = NumAbsolutes + 1;
unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16;
NumAbsolutes += BarCnt;
@@ -126,7 +130,7 @@ static bool lowerSpecialLDSVariables(
// The 2nd round: give a kernel-relative assignment for GV that
// either only indirectly accessed by single kernel or only directly
// accessed by multiple kernels.
- std::vector<Function *> OrderedKernels;
+ SmallVector<Function *> OrderedKernels;
for (auto &K : LDSUsesInfo.direct_access) {
Function *F = K.first;
assert(isKernelLDS(F));
@@ -134,7 +138,7 @@ static bool lowerSpecialLDSVariables(
}
OrderedKernels = sortByName(std::move(OrderedKernels));
- llvm::DenseMap<Function *, uint32_t> Kernel2BarId;
+ DenseMap<Function *, uint32_t> Kernel2BarId;
for (Function *F : OrderedKernels) {
for (GlobalVariable *GV : LDSUsesInfo.direct_access[F]) {
if (!isNamedBarrier(*GV))
@@ -153,7 +157,7 @@ static bool lowerSpecialLDSVariables(
// create a new GV used only by this kernel and its function.
auto NewGV = uniquifyGVPerKernel(M, GV, F);
Changed |= (NewGV != GV);
- unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;
+ unsigned BarrierScope = AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;
unsigned BarId = Kernel2BarId[F];
BarId += NumAbsolutes + 1;
unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16;
More information about the llvm-commits
mailing list