[llvm] [llvm][amdgpu] Handle indirect refs to LDS GVs during LDS lowering (PR #124089)
Kareem Ergawy via llvm-commits
llvm-commits at lists.llvm.org
Thu Jan 23 01:16:39 PST 2025
https://github.com/ergawy updated https://github.com/llvm/llvm-project/pull/124089
>From 3087adbcf491096d89909ecbfd453c25e0f57e28 Mon Sep 17 00:00:00 2001
From: ergawy <kareem.ergawy at amd.com>
Date: Thu, 23 Jan 2025 02:37:18 -0600
Subject: [PATCH] [llvm][amdgpu] Handle indirect refs to LDS GVs during LDS
lowering
Fixes #123800
Extends LDS lowering by allowing it to discover transitive
indirect/escaping references to LDS GVs.
For example, given the following input:
```llvm
@lds_item_to_indirectly_load = internal addrspace(3) global ptr undef, align 8
%store_type = type { i32, ptr }
@place_to_store_indirect_caller = internal addrspace(3) global %store_type undef, align 8
define amdgpu_kernel void @offloading_kernel() {
store ptr @indirectly_load_lds, ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @place_to_store_indirect_caller, i32 0), align 8
call void @call_unknown()
ret void
}
define void @call_unknown() {
%1 = alloca ptr, align 8
%2 = call i32 %1()
ret void
}
define void @indirectly_load_lds() {
call void @directly_load_lds()
ret void
}
define void @directly_load_lds() {
%2 = load ptr, ptr addrspace(3) @lds_item_to_indirectly_load, align 8
ret void
}
```
With the above input, prior to this patch, LDS lowering failed to lower
the reference to `@lds_item_to_indirectly_load` because:
1. it is used by a function that is only reached (transitively) through a
function whose address is taken in the kernel.
2. we did not check if the kernel indirectly makes any calls to unknown
functions (we only checked the direct calls).
Co-authored-by: Jon Chesterfield <jonathan.chesterfield at amd.com>
---
llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp | 52 ++++++++++++++++---
.../AMDGPU/lower-indirect-lds-references.ll | 44 ++++++++++++++++
.../AMDGPU/remove-no-kernel-id-attribute.ll | 2 +-
3 files changed, 90 insertions(+), 8 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/lower-indirect-lds-references.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp
index 0406ba9c68ccd3..a5bfdb7bf6eacd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp
@@ -141,8 +141,8 @@ LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M) {
FunctionVariableMap DirectMapFunction;
getUsesOfLDSByFunction(CG, M, DirectMapKernel, DirectMapFunction);
- // Collect variables that are used by functions whose address has escaped
- DenseSet<GlobalVariable *> VariablesReachableThroughFunctionPointer;
+ // Collect functions whose address has escaped
+ DenseSet<Function *> AddressTakenFuncs;
for (Function &F : M.functions()) {
if (!isKernelLDS(&F))
if (F.hasAddressTaken(nullptr,
@@ -150,11 +150,16 @@ LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M) {
/* IgnoreAssumeLikeCalls */ false,
/* IgnoreLLVMUsed */ true,
/* IgnoreArcAttachedCall */ false)) {
- set_union(VariablesReachableThroughFunctionPointer,
- DirectMapFunction[&F]);
+ AddressTakenFuncs.insert(&F);
}
}
+ // Collect variables that are used by functions whose address has escaped
+ DenseSet<GlobalVariable *> VariablesReachableThroughFunctionPointer;
+ for (Function *F : AddressTakenFuncs) {
+ set_union(VariablesReachableThroughFunctionPointer, DirectMapFunction[F]);
+ }
+
auto FunctionMakesUnknownCall = [&](const Function *F) -> bool {
assert(!F->isDeclaration());
for (const CallGraphNode::CallRecord &R : *CG[F]) {
@@ -206,6 +211,13 @@ LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M) {
}
}
+ // Collect variables that are transitively used by functions whose address has
+ // escaped
+ for (Function *F : AddressTakenFuncs) {
+ set_union(VariablesReachableThroughFunctionPointer,
+ TransitiveMapFunction[F]);
+ }
+
// DirectMapKernel lists which variables are used by the kernel
// find the variables which are used through a function call
FunctionVariableMap IndirectMapKernel;
@@ -218,11 +230,37 @@ LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M) {
Function *Ith = R.second->getFunction();
if (Ith) {
set_union(IndirectMapKernel[&Func], TransitiveMapFunction[Ith]);
- } else {
- set_union(IndirectMapKernel[&Func],
- VariablesReachableThroughFunctionPointer);
}
}
+
+ // Check if the kernel encounters unknown calls, whether directly or
+ // indirectly.
+ bool SeesUnknownCalls = [&]() {
+ SmallVector<Function *> WorkList = {CG[&Func]->getFunction()};
+ SmallPtrSet<Function *, 8> Visited;
+
+ while (!WorkList.empty()) {
+ Function *F = WorkList.pop_back_val();
+
+ for (const CallGraphNode::CallRecord &CallRecord : *CG[F]) {
+ if (!CallRecord.second)
+ continue;
+
+ Function *Callee = CallRecord.second->getFunction();
+ if (!Callee)
+ return true;
+
+ if (Visited.insert(Callee).second)
+ WorkList.push_back(Callee);
+ }
+ }
+ return false;
+ }();
+
+ if (SeesUnknownCalls) {
+ set_union(IndirectMapKernel[&Func],
+ VariablesReachableThroughFunctionPointer);
+ }
}
// Verify that we fall into one of 2 cases:
diff --git a/llvm/test/CodeGen/AMDGPU/lower-indirect-lds-references.ll b/llvm/test/CodeGen/AMDGPU/lower-indirect-lds-references.ll
new file mode 100644
index 00000000000000..f14f8d071af564
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lower-indirect-lds-references.ll
@@ -0,0 +1,44 @@
+; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s
+
+; Tests that the LDS lowering pass handles indirect references to LDS GVs; i.e.
+; that it lowers to accesses into the generated LDS struct if these references
+; are deep in the call graph starting at the kernel.
+
+ at lds_item_to_indirectly_load = internal addrspace(3) global ptr undef, align 8
+
+%store_type = type { i32, ptr }
+ at place_to_store_indirect_caller = internal addrspace(3) global %store_type undef, align 8
+
+define amdgpu_kernel void @offloading_kernel() {
+ store ptr @indirectly_load_lds, ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @place_to_store_indirect_caller, i32 0), align 8
+ call void @call_unknown()
+ ret void
+}
+
+define void @call_unknown() {
+ %1 = alloca ptr, align 8
+ %2 = call i32 %1()
+ ret void
+}
+
+define void @indirectly_load_lds() {
+ call void @directly_load_lds()
+ ret void
+}
+
+define void @directly_load_lds() {
+ %2 = load ptr, ptr addrspace(3) @lds_item_to_indirectly_load, align 8
+ ret void
+}
+
+; CHECK: %[[LDS_STRUCT_TY:.*]] = type { %store_type, ptr }
+; CHECK: @[[LDS_STRUCT:.*]] = {{.*}} %[[LDS_STRUCT_TY]] {{.*}} !absolute_symbol
+
+; CHECK: define amdgpu_kernel void @offloading_kernel() {{.*}} {
+; CHECK: store ptr @indirectly_load_lds, {{.*}} @[[LDS_STRUCT]]
+; CHECK: call void @call_unknown()
+; CHECK: }
+
+; CHECK: define void @directly_load_lds() {
+; CHECK: load ptr, {{.*}} (%[[LDS_STRUCT_TY]], {{.*}} @[[LDS_STRUCT]], i32 0, i32 1)
+; CHECK: }
diff --git a/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll b/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll
index 2850612d700817..1765bd1cfb0086 100644
--- a/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll
+++ b/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll
@@ -196,7 +196,7 @@ define amdgpu_kernel void @kernel_lds_recursion() {
; CHECK: attributes #[[ATTR2]] = { "amdgpu-lds-size"="2" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
; CHECK: attributes #[[ATTR3]] = { "amdgpu-lds-size"="4" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
; CHECK: attributes #[[ATTR4]] = { "amdgpu-lds-size"="2" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR5]] = { "amdgpu-lds-size"="2" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR5]] = { "amdgpu-lds-size"="4" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
; CHECK: attributes #[[ATTR6:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) }
; CHECK: attributes #[[ATTR7:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
;.
More information about the llvm-commits
mailing list