[llvm] e8ad2a0 - [amdgpu][nfc] Comment and extract two functions in LowerModuleLDS
Jon Chesterfield via llvm-commits
llvm-commits at lists.llvm.org
Tue Mar 21 16:41:12 PDT 2023
Author: Jon Chesterfield
Date: 2023-03-21T23:39:20Z
New Revision: e8ad2a051c1621032d15973877891c7296603d8b
URL: https://github.com/llvm/llvm-project/commit/e8ad2a051c1621032d15973877891c7296603d8b
DIFF: https://github.com/llvm/llvm-project/commit/e8ad2a051c1621032d15973877891c7296603d8b.diff
LOG: [amdgpu][nfc] Comment and extract two functions in LowerModuleLDS
Added:
Modified:
llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
index d44e280640ee0..455d76b0cecde 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -218,6 +218,13 @@ class AMDGPULowerModuleLDS : public ModulePass {
// llvm.donothing that takes a pointer to the instance and is lowered to a
// no-op after LDS is allocated, but that is not presently necessary.
+ // This intrinsic is eliminated shortly before instruction selection. It
+ // does not suffice to indicate to ISel that a given global which is not
+ // immediately used by the kernel must still be allocated by it. An
+ // equivalent target specific intrinsic which lasts until immediately after
+ // codegen would suffice for that, but one would still need to ensure that
+ // the variables are allocated in the anticipated order.
+
LLVMContext &Ctx = Func->getContext();
Builder.SetInsertPoint(Func->getEntryBlock().getFirstNonPHI());
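[A minimal sketch of the marker this comment describes, assuming the
pointer rides on llvm.donothing in an "ExplicitUse" operand bundle; the
helper name and bundle tag are illustrative, not copied from the pass.]

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
using namespace llvm;

static void markUsedByKernelSketch(IRBuilder<> &Builder, Function *Func,
                                   GlobalVariable *SGV) {
  // Emit a call to llvm.donothing carrying SGV in an operand bundle so
  // the variable appears used by this kernel until the call is erased.
  Module &M = *Func->getParent();
  LLVMContext &Ctx = Func->getContext();
  Builder.SetInsertPoint(Func->getEntryBlock().getFirstNonPHI());
  Function *Decl = Intrinsic::getDeclaration(&M, Intrinsic::donothing, {});
  Value *UseInstance[1] = {Builder.CreateInBoundsGEP(
      SGV->getValueType(), SGV,
      {ConstantInt::get(Type::getInt32Ty(Ctx), 0)})};
  Builder.CreateCall(Decl, {},
                     {OperandBundleDef("ExplicitUse", UseInstance)});
}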
@@ -241,7 +248,7 @@ class AMDGPULowerModuleLDS : public ModulePass {
// This pass specialises LDS variables with respect to the kernel that
// allocates them.
- // This is semantically equivalent to:
+ // This is semantically equivalent to (though not implemented this way,
+ // as it would be slow):
// for (auto &F : M.functions())
// for (auto &BB : F)
// for (auto &I : BB)
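[The pseudocode is cut off by the three-line diff context; spelled out,
the slow formulation would look roughly like this. recordAccess is a
hypothetical placeholder; the pass gets the same answer from use lists
instead of a whole-module instruction walk.]

for (auto &F : M.functions())
  for (auto &BB : F)
    for (auto &I : BB)
      for (Use &Op : I.operands())
        if (auto *GV =
                dyn_cast<GlobalVariable>(Op.get()->stripPointerCasts()))
          if (AMDGPU::isLDSVariableToLower(*GV))
            recordAccess(&F, GV); // hypothetical bookkeeping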
@@ -469,28 +476,6 @@ class AMDGPULowerModuleLDS : public ModulePass {
IRBuilder<> Builder(Ctx);
Type *I32 = Type::getInt32Ty(Ctx);
- // Accesses from a function use the amdgcn_lds_kernel_id intrinsic which
- // lowers to a read from a live in register. Emit it once in the entry
- // block to spare deduplicating it later.
-
- DenseMap<Function *, Value *> tableKernelIndexCache;
- auto getTableKernelIndex = [&](Function *F) -> Value * {
- if (tableKernelIndexCache.count(F) == 0) {
- LLVMContext &Ctx = M.getContext();
- FunctionType *FTy = FunctionType::get(Type::getInt32Ty(Ctx), {});
- Function *Decl =
- Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_lds_kernel_id, {});
-
- BasicBlock::iterator it =
- F->getEntryBlock().getFirstNonPHIOrDbgOrAlloca();
- Instruction &i = *it;
- Builder.SetInsertPoint(&i);
-
- tableKernelIndexCache[F] = Builder.CreateCall(FTy, Decl, {});
- }
-
- return tableKernelIndexCache[F];
- };
for (size_t Index = 0; Index < ModuleScopeVariables.size(); Index++) {
auto *GV = ModuleScopeVariables[Index];
@@ -500,7 +485,8 @@ class AMDGPULowerModuleLDS : public ModulePass {
if (!I)
continue;
- Value *tableKernelIndex = getTableKernelIndex(I->getFunction());
+ Value *tableKernelIndex =
+ getTableLookupKernelIndex(M, I->getFunction());
// So if the phi uses this value multiple times, what does this look
// like?
@@ -517,6 +503,7 @@ class AMDGPULowerModuleLDS : public ModulePass {
ConstantInt::get(I32, Index),
};
+
Value *Address = Builder.CreateInBoundsGEP(
LookupTable->getValueType(), LookupTable, GEPIdx, GV->getName());
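[This hunk stops before Address is consumed. Presumably the table entry
is loaded and turned back into an LDS pointer, roughly as below; the i32
element type is an assumption, not something visible in this diff.]

// Row = which kernel is executing, column = which variable.
Value *Offset = Builder.CreateLoad(I32, Address);
Value *Replacement = Builder.CreateIntToPtr(Offset, GV->getType());
// Uses of GV in this function are then rewritten to Replacement.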
@@ -621,6 +608,78 @@ class AMDGPULowerModuleLDS : public ModulePass {
MDNode::get(Ctx, {MinC, MaxC}));
}
+ DenseMap<Function *, Value *> tableKernelIndexCache;
+ Value *getTableLookupKernelIndex(Module &M, Function *F) {
+ // Accesses from a function use the amdgcn_lds_kernel_id intrinsic which
+ // lowers to a read from a live in register. Emit it once in the entry
+ // block to spare deduplicating it later.
+ if (tableKernelIndexCache.count(F) == 0) {
+ LLVMContext &Ctx = M.getContext();
+ IRBuilder<> Builder(Ctx);
+ FunctionType *FTy = FunctionType::get(Type::getInt32Ty(Ctx), {});
+ Function *Decl =
+ Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_lds_kernel_id, {});
+
+ BasicBlock::iterator it =
+ F->getEntryBlock().getFirstNonPHIOrDbgOrAlloca();
+ Instruction &i = *it;
+ Builder.SetInsertPoint(&i);
+
+ tableKernelIndexCache[F] = Builder.CreateCall(FTy, Decl, {});
+ }
+
+ return tableKernelIndexCache[F];
+ }
+
+ std::vector<Function *> assignLDSKernelIDToEachKernel(
+ Module *M, DenseSet<Function *> const &KernelsThatAllocateTableLDS) {
+ // Associate kernels in the set with an arbitrary but reproducible order
+ // and annotate them with that order in metadata. This metadata is
+ // recognised by the backend and lowered to an SGPR which can be read
+ // from using amdgcn_lds_kernel_id.
+
+ std::vector<Function *> OrderedKernels;
+
+ for (Function &Func : M->functions()) {
+ if (Func.isDeclaration())
+ continue;
+ if (!isKernelLDS(&Func))
+ continue;
+
+ if (KernelsThatAllocateTableLDS.contains(&Func)) {
+ assert(Func.hasName()); // else fatal error earlier
+ OrderedKernels.push_back(&Func);
+ }
+ }
+
+ // Put them in an arbitrary but reproducible order
+ llvm::sort(OrderedKernels.begin(), OrderedKernels.end(),
+ [](const Function *lhs, const Function *rhs) -> bool {
+ return lhs->getName() < rhs->getName();
+ });
+
+ // Annotate the kernels with their order in this vector
+ LLVMContext &Ctx = M->getContext();
+ IRBuilder<> Builder(Ctx);
+
+ if (OrderedKernels.size() > UINT32_MAX) {
+ // 32 bit keeps it in one SGPR. > 2**32 kernels won't fit on the GPU
+ report_fatal_error("Unimplemented LDS lowering for > 2**32 kernels");
+ }
+
+ for (size_t i = 0; i < OrderedKernels.size(); i++) {
+ Metadata *AttrMDArgs[1] = {
+ ConstantAsMetadata::get(Builder.getInt32(i)),
+ };
+ OrderedKernels[i]->setMetadata("llvm.amdgcn.lds.kernel.id",
+ MDNode::get(Ctx, AttrMDArgs));
+
+ }
+
+
+ return OrderedKernels;
+ }
+
bool runOnModule(Module &M) override {
LLVMContext &Ctx = M.getContext();
CallGraph CG = CallGraph(M);
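[For reference: the id written by assignLDSKernelIDToEachKernel above can
be read back with the standard metadata APIs. A small sketch; the helper
name is hypothetical, and this is the IR-side query, not the backend
lowering to an SGPR.]

#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Metadata.h"
using namespace llvm;

// Returns the annotated kernel id, or ~0u if the metadata is absent.
static uint32_t getAssignedLDSKernelId(const Function &F) {
  if (MDNode *MD = F.getMetadata("llvm.amdgcn.lds.kernel.id"))
    return mdconst::extract<ConstantInt>(MD->getOperand(0))->getZExtValue();
  return ~0u;
}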
@@ -644,7 +703,7 @@ class AMDGPULowerModuleLDS : public ModulePass {
}
}
- // Partition variables into the different strategies
+ // Partition variables accessed indirectly into the different strategies
DenseSet<GlobalVariable *> ModuleScopeVariables;
DenseSet<GlobalVariable *> TableLookupVariables;
DenseSet<GlobalVariable *> KernelAccessVariables;
@@ -706,10 +765,12 @@ class AMDGPULowerModuleLDS : public ModulePass {
}
}
+ // All LDS variables accessed indirectly have now been partitioned into
+ // the distinct lowering strategies.
assert(ModuleScopeVariables.size() + TableLookupVariables.size() +
KernelAccessVariables.size() ==
LDSToKernelsThatNeedToAccessItIndirectly.size());
- } // Variables have now been partitioned into the three lowering strategies.
+ }
// If the kernel accesses a variable that is going to be stored in the
// module instance through a call then that kernel needs to allocate the
@@ -787,9 +848,14 @@ class AMDGPULowerModuleLDS : public ModulePass {
continue;
DenseSet<GlobalVariable *> KernelUsedVariables;
+ // Allocating variables that are used directly in this struct to get
+ // alignment aware allocation and predictable frame size.
for (auto &v : LDSUsesInfo.direct_access[&Func]) {
KernelUsedVariables.insert(v);
}
+
+ // Allocating variables that are accessed indirectly so that a lookup of
+ // this struct instance can find them from nested functions.
for (auto &v : LDSUsesInfo.indirect_access[&Func]) {
KernelUsedVariables.insert(v);
}
@@ -803,7 +869,7 @@ class AMDGPULowerModuleLDS : public ModulePass {
}
if (KernelUsedVariables.empty()) {
- // Either used no LDS, or all the LDS it used was also in module
+ // Either used no LDS, or the LDS it used was all in the module struct
continue;
}
@@ -872,53 +938,25 @@ class AMDGPULowerModuleLDS : public ModulePass {
DenseSet<GlobalVariable *> Vec;
Vec.insert(GV);
+ // TODO: Looks like a latent bug: Replacement may not be marked
+ // UsedByKernel here
replaceLDSVariablesWithStruct(M, Vec, Replacement, [](Use &U) {
return isa<Instruction>(U.getUser());
});
}
if (!KernelsThatAllocateTableLDS.empty()) {
- // Collect the kernels that allocate table lookup LDS
- std::vector<Function *> OrderedKernels;
- {
- for (Function &Func : M.functions()) {
- if (Func.isDeclaration())
- continue;
- if (!isKernelLDS(&Func))
- continue;
-
- if (KernelsThatAllocateTableLDS.contains(&Func)) {
- assert(Func.hasName()); // else fatal error earlier
- OrderedKernels.push_back(&Func);
- }
- }
-
- // Put them in an arbitrary but reproducible order
- llvm::sort(OrderedKernels.begin(), OrderedKernels.end(),
- [](const Function *lhs, const Function *rhs) -> bool {
- return lhs->getName() < rhs->getName();
- });
-
- // Annotate the kernels with their order in this vector
LLVMContext &Ctx = M.getContext();
IRBuilder<> Builder(Ctx);
- if (OrderedKernels.size() > UINT32_MAX) {
- // 32 bit keeps it in one SGPR. > 2**32 kernels won't fit on the GPU
- report_fatal_error("Unimplemented LDS lowering for > 2**32 kernels");
- }
-
- for (size_t i = 0; i < OrderedKernels.size(); i++) {
- Metadata *AttrMDArgs[1] = {
- ConstantAsMetadata::get(Builder.getInt32(i)),
- };
- OrderedKernels[i]->setMetadata("llvm.amdgcn.lds.kernel.id",
- MDNode::get(Ctx, AttrMDArgs));
+ // The ith element of this vector is kernel id i
+ std::vector<Function *> OrderedKernels =
+ assignLDSKernelIDToEachKernel(&M, KernelsThatAllocateTableLDS);
- markUsedByKernel(Builder, OrderedKernels[i],
- KernelToReplacement[OrderedKernels[i]].SGV);
- }
- }
+ for (size_t i = 0; i < OrderedKernels.size(); i++) {
+ markUsedByKernel(Builder, OrderedKernels[i],
+ KernelToReplacement[OrderedKernels[i]].SGV);
+ }
// The order must be consistent between lookup table and accesses to
// lookup table
@@ -938,7 +976,6 @@ class AMDGPULowerModuleLDS : public ModulePass {
for (auto &GV : make_early_inc_range(M.globals()))
if (AMDGPU::isLDSVariableToLower(GV)) {
-
// probably want to remove from used lists
GV.removeDeadConstantUsers();
if (GV.use_empty())
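[The hunk is truncated here; the cleanup presumably finishes by erasing
globals whose uses were all rewritten, along these lines:]

for (auto &GV : make_early_inc_range(M.globals()))
  if (AMDGPU::isLDSVariableToLower(GV)) {
    GV.removeDeadConstantUsers();
    if (GV.use_empty())
      GV.eraseFromParent(); // sketch: safe once all uses are gone
  }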