[llvm] [AMDGPU] Add dynamic LDS size implicit kernel argument to CO-v5 (PR #65273)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Dec 28 07:02:35 PST 2023
https://github.com/skc7 updated https://github.com/llvm/llvm-project/pull/65273
>From be26aabf7246391821d6250f000fda4f7308423c Mon Sep 17 00:00:00 2001
From: skc7 <Krishna.Sankisa at amd.com>
Date: Mon, 4 Sep 2023 20:16:51 +0530
Subject: [PATCH 1/3] [AMDGPU] Add dynamic LDS size implicit kernel argument to
CO-v5
The hidden_dynamic_lds_size argument will be added in the reserved
section at offset 120 of the implicit argument layout.
---
llvm/docs/AMDGPUUsage.rst | 3 +++
llvm/lib/BinaryFormat/AMDGPUMetadataVerifier.cpp | 1 +
llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp | 10 +++++++++-
llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp | 6 ++++++
llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h | 7 +++++++
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 1 +
.../test/CodeGen/AMDGPU/hsa-metadata-hidden-args-v5.ll | 7 ++++++-
7 files changed, 33 insertions(+), 2 deletions(-)
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index d49d1cd3812512..51f4b23bab4927 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -4113,6 +4113,9 @@ Code object V5 metadata is the same as
buffer that conforms to the requirements of the malloc/free
device library V1 version implementation.
+ "hidden_dynamic_lds_size"
+ Size of the dynamically allocated LDS memory is passed in the kernarg.
+
"hidden_private_base"
The high 32 bits of the flat addressing private aperture base.
Only used by GFX8 to allow conversion between private segment
diff --git a/llvm/lib/BinaryFormat/AMDGPUMetadataVerifier.cpp b/llvm/lib/BinaryFormat/AMDGPUMetadataVerifier.cpp
index dda3380c04ea9b..33eed07c46292f 100644
--- a/llvm/lib/BinaryFormat/AMDGPUMetadataVerifier.cpp
+++ b/llvm/lib/BinaryFormat/AMDGPUMetadataVerifier.cpp
@@ -134,6 +134,7 @@ bool MetadataVerifier::verifyKernelArgs(msgpack::DocNode &Node) {
.Case("hidden_default_queue", true)
.Case("hidden_completion_action", true)
.Case("hidden_multigrid_sync_arg", true)
+ .Case("hidden_dynamic_lds_size", true)
.Case("hidden_private_base", true)
.Case("hidden_shared_base", true)
.Case("hidden_queue_ptr", true)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
index b51a876750b58b..74e9cd7d09654c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
@@ -646,7 +646,15 @@ void MetadataStreamerMsgPackV5::emitHiddenKernelArgs(
Offset += 8; // Skipped.
}
- Offset += 72; // Reserved.
+ // Emit argument for hidden dynamic lds size
+ if (MFI.isDynamicLDSUsed()) {
+ emitKernelArg(DL, Int32Ty, Align(4), "hidden_dynamic_lds_size", Offset,
+ Args);
+ } else {
+ Offset += 4; // skipped
+ }
+
+ Offset += 68; // Reserved.
// hidden_private_base and hidden_shared_base are only when the subtarget has
// ApertureRegs.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
index 323462e60a29fa..eb31a32933af24 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
@@ -210,3 +210,9 @@ void AMDGPUMachineFunction::setDynLDSAlign(const Function &F,
}
}
}
+
+void AMDGPUMachineFunction::setUsesDynamicLDS(bool DynLDS) {
+ UsesDynamicLDS = DynLDS;
+}
+
+bool AMDGPUMachineFunction::isDynamicLDSUsed() const { return UsesDynamicLDS; }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
index 248ee26a47eb1d..7efb7f825348e3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
@@ -46,6 +46,9 @@ class AMDGPUMachineFunction : public MachineFunctionInfo {
/// stages.
Align DynLDSAlign;
+ // Flag to check dynamic LDS usage by kernel.
+ bool UsesDynamicLDS = false;
+
// Kernels + shaders. i.e. functions called by the hardware and not called
// by other functions.
bool IsEntryFunction = false;
@@ -119,6 +122,10 @@ class AMDGPUMachineFunction : public MachineFunctionInfo {
Align getDynLDSAlign() const { return DynLDSAlign; }
void setDynLDSAlign(const Function &F, const GlobalVariable &GV);
+
+ void setUsesDynamicLDS(bool DynLDS);
+
+ bool isDynamicLDSUsed() const;
};
}
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 4f4bc45e49b43e..8094060e9a59eb 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -6892,6 +6892,7 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
// Adjust alignment for that dynamic shared memory array.
Function &F = DAG.getMachineFunction().getFunction();
MFI->setDynLDSAlign(F, *cast<GlobalVariable>(GV));
+ MFI->setUsesDynamicLDS(true);
return SDValue(
DAG.getMachineNode(AMDGPU::GET_GROUPSTATICSIZE, DL, PtrVT), 0);
}
diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-hidden-args-v5.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-hidden-args-v5.ll
index cb30d668674c31..1a2ce636c733c5 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-hidden-args-v5.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-hidden-args-v5.ll
@@ -81,13 +81,16 @@
; CHECK-NEXT: - .offset: 136
; CHECK-NEXT: .size: 8
; CHECK-NEXT: .value_kind: hidden_completion_action
+; CHECK: - .offset: 144
+; CHECK-NEXT: .size: 4
+; CHECK-NEXT: .value_kind: hidden_dynamic_lds_size
; GFX8-NEXT: - .offset: 216
; GFX8-NEXT: .size: 4
; GFX8-NEXT: .value_kind: hidden_private_base
; GFX8-NEXT: - .offset: 220
; GFX8-NEXT: .size: 4
; GFX8-NEXT: .value_kind: hidden_shared_base
-; CHECK: - .offset: 224
+; CHECK: - .offset: 224
; CHECK-NEXT: .size: 8
; CHECK-NEXT: .value_kind: hidden_queue_ptr
@@ -97,6 +100,7 @@
; CHECK: amdhsa.version:
; CHECK-NEXT: - 1
; CHECK-NEXT: - 2
+ at lds = external hidden addrspace(3) global [0 x i32], align 4
define amdgpu_kernel void @test_v5(
ptr addrspace(1) %r,
ptr addrspace(1) %a,
@@ -106,6 +110,7 @@ entry:
%b.val = load half, ptr addrspace(1) %b
%r.val = fadd half %a.val, %b.val
store half %r.val, ptr addrspace(1) %r
+ store i32 1234, ptr addrspacecast (ptr addrspace(3) @lds to ptr), align 4
ret void
}
>From 2f46c674d169044b74ac6a60337490a6c4828d12 Mon Sep 17 00:00:00 2001
From: skc7 <Krishna.Sankisa at amd.com>
Date: Tue, 26 Dec 2023 22:59:15 +0530
Subject: [PATCH 2/3] [AMDGPU] Update UsesDynamicLDS when kernel calls a
function which uses dyn lds
---
.../Target/AMDGPU/AMDGPUMachineFunction.cpp | 22 ++--
...hsa-metadata-dynlds-func-hidden-args-v5.ll | 124 ++++++++++++++++++
...-metadata-dynlds-funcarg-hidden-args-v5.ll | 124 ++++++++++++++++++
3 files changed, 261 insertions(+), 9 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/hsa-metadata-dynlds-func-hidden-args-v5.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/hsa-metadata-dynlds-funcarg-hidden-args-v5.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
index eb31a32933af24..9e1e4c39915788 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
@@ -19,6 +19,15 @@
using namespace llvm;
+static const GlobalVariable *
+getKernelDynLDSGlobalFromFunction(const Function &F) {
+ const Module *M = F.getParent();
+ std::string KernelDynLDSName = "llvm.amdgcn.";
+ KernelDynLDSName += F.getName();
+ KernelDynLDSName += ".dynlds";
+ return M->getNamedGlobal(KernelDynLDSName);
+}
+
AMDGPUMachineFunction::AMDGPUMachineFunction(const Function &F,
const AMDGPUSubtarget &ST)
: IsEntryFunction(AMDGPU::isEntryFunctionCC(F.getCallingConv())),
@@ -65,6 +74,10 @@ AMDGPUMachineFunction::AMDGPUMachineFunction(const Function &F,
Attribute NSZAttr = F.getFnAttribute("no-signed-zeros-fp-math");
NoSignedZerosFPMath =
NSZAttr.isStringAttribute() && NSZAttr.getValueAsString() == "true";
+
+ const GlobalVariable *DynLdsGlobal = getKernelDynLDSGlobalFromFunction(F);
+ if (DynLdsGlobal)
+ UsesDynamicLDS = true;
}
unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL,
@@ -139,15 +152,6 @@ unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL,
return Offset;
}
-static const GlobalVariable *
-getKernelDynLDSGlobalFromFunction(const Function &F) {
- const Module *M = F.getParent();
- std::string KernelDynLDSName = "llvm.amdgcn.";
- KernelDynLDSName += F.getName();
- KernelDynLDSName += ".dynlds";
- return M->getNamedGlobal(KernelDynLDSName);
-}
-
std::optional<uint32_t>
AMDGPUMachineFunction::getLDSKernelIdMetadata(const Function &F) {
// TODO: Would be more consistent with the abs symbols to use a range
diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-dynlds-func-hidden-args-v5.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-dynlds-func-hidden-args-v5.ll
new file mode 100644
index 00000000000000..cb15ff9fcb1bce
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-dynlds-func-hidden-args-v5.ll
@@ -0,0 +1,124 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=CHECK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=CHECK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=CHECK %s
+
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck --check-prefix=CHECK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck --check-prefix=CHECK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck --check-prefix=CHECK %s
+
+
+; CHECK: amdhsa.kernels:
+; CHECK-NEXT: - .args:
+; CHECK-NEXT: - .address_space: global
+; CHECK-NEXT: .name: r
+; CHECK-NEXT: .offset: 0
+; CHECK-NEXT: .size: 8
+; CHECK-NEXT: .value_kind: global_buffer
+; CHECK-NEXT: - .address_space: global
+; CHECK-NEXT: .name: a
+; CHECK-NEXT: .offset: 8
+; CHECK-NEXT: .size: 8
+; CHECK-NEXT: .value_kind: global_buffer
+; CHECK-NEXT: - .address_space: global
+; CHECK-NEXT: .name: b
+; CHECK-NEXT: .offset: 16
+; CHECK-NEXT: .size: 8
+; CHECK-NEXT: .value_kind: global_buffer
+; CHECK-NEXT: - .offset: 24
+; CHECK-NEXT: .size: 4
+; CHECK-NEXT: .value_kind: hidden_block_count_x
+; CHECK-NEXT: - .offset: 28
+; CHECK-NEXT: .size: 4
+; CHECK-NEXT: .value_kind: hidden_block_count_y
+; CHECK-NEXT: - .offset: 32
+; CHECK-NEXT: .size: 4
+; CHECK-NEXT: .value_kind: hidden_block_count_z
+; CHECK-NEXT: - .offset: 36
+; CHECK-NEXT: .size: 2
+; CHECK-NEXT: .value_kind: hidden_group_size_x
+; CHECK-NEXT: - .offset: 38
+; CHECK-NEXT: .size: 2
+; CHECK-NEXT: .value_kind: hidden_group_size_y
+; CHECK-NEXT: - .offset: 40
+; CHECK-NEXT: .size: 2
+; CHECK-NEXT: .value_kind: hidden_group_size_z
+; CHECK-NEXT: - .offset: 42
+; CHECK-NEXT: .size: 2
+; CHECK-NEXT: .value_kind: hidden_remainder_x
+; CHECK-NEXT: - .offset: 44
+; CHECK-NEXT: .size: 2
+; CHECK-NEXT: .value_kind: hidden_remainder_y
+; CHECK-NEXT: - .offset: 46
+; CHECK-NEXT: .size: 2
+; CHECK-NEXT: .value_kind: hidden_remainder_z
+; CHECK-NEXT: - .offset: 64
+; CHECK-NEXT: .size: 8
+; CHECK-NEXT: .value_kind: hidden_global_offset_x
+; CHECK-NEXT: - .offset: 72
+; CHECK-NEXT: .size: 8
+; CHECK-NEXT: .value_kind: hidden_global_offset_y
+; CHECK-NEXT: - .offset: 80
+; CHECK-NEXT: .size: 8
+; CHECK-NEXT: .value_kind: hidden_global_offset_z
+; CHECK-NEXT: - .offset: 88
+; CHECK-NEXT: .size: 2
+; CHECK-NEXT: .value_kind: hidden_grid_dims
+; CHECK-NEXT: - .offset: 96
+; CHECK-NEXT: .size: 8
+; CHECK-NEXT: .value_kind: hidden_printf_buffer
+; CHECK-NEXT: - .offset: 104
+; CHECK-NEXT: .size: 8
+; CHECK-NEXT: .value_kind: hidden_hostcall_buffer
+; CHECK-NEXT: - .offset: 112
+; CHECK-NEXT: .size: 8
+; CHECK-NEXT: .value_kind: hidden_multigrid_sync_arg
+; CHECK-NEXT: - .offset: 120
+; CHECK-NEXT: .size: 8
+; CHECK-NEXT: .value_kind: hidden_heap_v1
+; CHECK-NEXT: - .offset: 128
+; CHECK-NEXT: .size: 8
+; CHECK-NEXT: .value_kind: hidden_default_queue
+; CHECK-NEXT: - .offset: 136
+; CHECK-NEXT: .size: 8
+; CHECK-NEXT: .value_kind: hidden_completion_action
+; CHECK: - .offset: 144
+; CHECK-NEXT: .size: 4
+; CHECK-NEXT: .value_kind: hidden_dynamic_lds_size
+; CHECK: - .offset: 224
+; CHECK-NEXT: .size: 8
+; CHECK-NEXT: .value_kind: hidden_queue_ptr
+
+; CHECK: .name: test_v5
+; CHECK: .symbol: test_v5.kd
+
+; CHECK: amdhsa.version:
+; CHECK-NEXT: - 1
+; CHECK-NEXT: - 2
+ at lds = external hidden addrspace(3) global [0 x i32], align 4
+
+define void @funcs_dyn_lds() {
+ store i32 1234, ptr addrspacecast (ptr addrspace(3) @lds to ptr), align 4
+ ret void
+}
+
+define amdgpu_kernel void @test_v5(
+ ptr addrspace(1) %r,
+ ptr addrspace(1) %a,
+ ptr addrspace(1) %b) #0 {
+entry:
+ %a.val = load half, ptr addrspace(1) %a
+ %b.val = load half, ptr addrspace(1) %b
+ %r.val = fadd half %a.val, %b.val
+ store half %r.val, ptr addrspace(1) %r
+ call void @funcs_dyn_lds()
+ ret void
+}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"amdgpu_code_object_version", i32 500}
+!llvm.printf.fmts = !{!1, !2}
+!1 = !{!"1:1:4:%d\5Cn"}
+!2 = !{!"2:1:8:%g\5Cn"}
+
+attributes #0 = { optnone noinline }
+
diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-dynlds-funcarg-hidden-args-v5.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-dynlds-funcarg-hidden-args-v5.ll
new file mode 100644
index 00000000000000..16bfe5f0196835
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-dynlds-funcarg-hidden-args-v5.ll
@@ -0,0 +1,124 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=CHECK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=CHECK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=CHECK %s
+
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck --check-prefix=CHECK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck --check-prefix=CHECK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck --check-prefix=CHECK %s
+
+
+; CHECK: amdhsa.kernels:
+; CHECK-NEXT: - .args:
+; CHECK-NEXT: - .address_space: global
+; CHECK-NEXT: .name: r
+; CHECK-NEXT: .offset: 0
+; CHECK-NEXT: .size: 8
+; CHECK-NEXT: .value_kind: global_buffer
+; CHECK-NEXT: - .address_space: global
+; CHECK-NEXT: .name: a
+; CHECK-NEXT: .offset: 8
+; CHECK-NEXT: .size: 8
+; CHECK-NEXT: .value_kind: global_buffer
+; CHECK-NEXT: - .address_space: global
+; CHECK-NEXT: .name: b
+; CHECK-NEXT: .offset: 16
+; CHECK-NEXT: .size: 8
+; CHECK-NEXT: .value_kind: global_buffer
+; CHECK-NEXT: - .offset: 24
+; CHECK-NEXT: .size: 4
+; CHECK-NEXT: .value_kind: hidden_block_count_x
+; CHECK-NEXT: - .offset: 28
+; CHECK-NEXT: .size: 4
+; CHECK-NEXT: .value_kind: hidden_block_count_y
+; CHECK-NEXT: - .offset: 32
+; CHECK-NEXT: .size: 4
+; CHECK-NEXT: .value_kind: hidden_block_count_z
+; CHECK-NEXT: - .offset: 36
+; CHECK-NEXT: .size: 2
+; CHECK-NEXT: .value_kind: hidden_group_size_x
+; CHECK-NEXT: - .offset: 38
+; CHECK-NEXT: .size: 2
+; CHECK-NEXT: .value_kind: hidden_group_size_y
+; CHECK-NEXT: - .offset: 40
+; CHECK-NEXT: .size: 2
+; CHECK-NEXT: .value_kind: hidden_group_size_z
+; CHECK-NEXT: - .offset: 42
+; CHECK-NEXT: .size: 2
+; CHECK-NEXT: .value_kind: hidden_remainder_x
+; CHECK-NEXT: - .offset: 44
+; CHECK-NEXT: .size: 2
+; CHECK-NEXT: .value_kind: hidden_remainder_y
+; CHECK-NEXT: - .offset: 46
+; CHECK-NEXT: .size: 2
+; CHECK-NEXT: .value_kind: hidden_remainder_z
+; CHECK-NEXT: - .offset: 64
+; CHECK-NEXT: .size: 8
+; CHECK-NEXT: .value_kind: hidden_global_offset_x
+; CHECK-NEXT: - .offset: 72
+; CHECK-NEXT: .size: 8
+; CHECK-NEXT: .value_kind: hidden_global_offset_y
+; CHECK-NEXT: - .offset: 80
+; CHECK-NEXT: .size: 8
+; CHECK-NEXT: .value_kind: hidden_global_offset_z
+; CHECK-NEXT: - .offset: 88
+; CHECK-NEXT: .size: 2
+; CHECK-NEXT: .value_kind: hidden_grid_dims
+; CHECK-NEXT: - .offset: 96
+; CHECK-NEXT: .size: 8
+; CHECK-NEXT: .value_kind: hidden_printf_buffer
+; CHECK-NEXT: - .offset: 104
+; CHECK-NEXT: .size: 8
+; CHECK-NEXT: .value_kind: hidden_hostcall_buffer
+; CHECK-NEXT: - .offset: 112
+; CHECK-NEXT: .size: 8
+; CHECK-NEXT: .value_kind: hidden_multigrid_sync_arg
+; CHECK-NEXT: - .offset: 120
+; CHECK-NEXT: .size: 8
+; CHECK-NEXT: .value_kind: hidden_heap_v1
+; CHECK-NEXT: - .offset: 128
+; CHECK-NEXT: .size: 8
+; CHECK-NEXT: .value_kind: hidden_default_queue
+; CHECK-NEXT: - .offset: 136
+; CHECK-NEXT: .size: 8
+; CHECK-NEXT: .value_kind: hidden_completion_action
+; CHECK: - .offset: 144
+; CHECK-NEXT: .size: 4
+; CHECK-NEXT: .value_kind: hidden_dynamic_lds_size
+; CHECK: - .offset: 224
+; CHECK-NEXT: .size: 8
+; CHECK-NEXT: .value_kind: hidden_queue_ptr
+
+; CHECK: .name: test_v5
+; CHECK: .symbol: test_v5.kd
+
+; CHECK: amdhsa.version:
+; CHECK-NEXT: - 1
+; CHECK-NEXT: - 2
+ at lds = external hidden addrspace(3) global [0 x i32], align 4
+
+define void @funcs_dyn_lds(ptr addrspace(3) %lds_ptr) {
+ store i32 1234, ptr addrspace(3) %lds_ptr, align 4
+ ret void
+}
+
+define amdgpu_kernel void @test_v5(
+ ptr addrspace(1) %r,
+ ptr addrspace(1) %a,
+ ptr addrspace(1) %b) #0 {
+entry:
+ %a.val = load half, ptr addrspace(1) %a
+ %b.val = load half, ptr addrspace(1) %b
+ %r.val = fadd half %a.val, %b.val
+ store half %r.val, ptr addrspace(1) %r
+ call void @funcs_dyn_lds(ptr addrspace(3) @lds)
+ ret void
+}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"amdgpu_code_object_version", i32 500}
+!llvm.printf.fmts = !{!1, !2}
+!1 = !{!"1:1:4:%d\5Cn"}
+!2 = !{!"2:1:8:%g\5Cn"}
+
+attributes #0 = { optnone noinline }
+
>From b05f1ef28ec370a7075fee8be03a76b0a21ce991 Mon Sep 17 00:00:00 2001
From: skc7 <Krishna.Sankisa at amd.com>
Date: Thu, 28 Dec 2023 20:30:26 +0530
Subject: [PATCH 3/3] [AMDGPU] Update UsesDynamicLDS when LDS pointer is passed
as kernel argument
---
.../Target/AMDGPU/AMDGPUMachineFunction.cpp | 13 +-
...-metadata-dynlds-kernarg-hidden-args-v5.ll | 125 ++++++++++++++++++
2 files changed, 137 insertions(+), 1 deletion(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/hsa-metadata-dynlds-kernarg-hidden-args-v5.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
index 9e1e4c39915788..36deda8c72e768 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
@@ -28,6 +28,17 @@ getKernelDynLDSGlobalFromFunction(const Function &F) {
return M->getNamedGlobal(KernelDynLDSName);
}
+static bool hasLDSKernelArgument(const Function &F) {
+ for (const Argument &Arg : F.args()) {
+ Type *ArgTy = Arg.getType();
+ if (auto PtrTy = dyn_cast<PointerType>(ArgTy)) {
+ if (PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS)
+ return true;
+ }
+ }
+ return false;
+}
+
AMDGPUMachineFunction::AMDGPUMachineFunction(const Function &F,
const AMDGPUSubtarget &ST)
: IsEntryFunction(AMDGPU::isEntryFunctionCC(F.getCallingConv())),
@@ -76,7 +87,7 @@ AMDGPUMachineFunction::AMDGPUMachineFunction(const Function &F,
NSZAttr.isStringAttribute() && NSZAttr.getValueAsString() == "true";
const GlobalVariable *DynLdsGlobal = getKernelDynLDSGlobalFromFunction(F);
- if (DynLdsGlobal)
+ if (DynLdsGlobal || hasLDSKernelArgument(F))
UsesDynamicLDS = true;
}
diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-dynlds-kernarg-hidden-args-v5.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-dynlds-kernarg-hidden-args-v5.ll
new file mode 100644
index 00000000000000..d457c61b8d4081
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-dynlds-kernarg-hidden-args-v5.ll
@@ -0,0 +1,125 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=CHECK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=CHECK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=CHECK %s
+
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck --check-prefix=CHECK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck --check-prefix=CHECK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck --check-prefix=CHECK %s
+
+
+; CHECK: amdhsa.kernels:
+; CHECK-NEXT: - .args:
+; CHECK-NEXT: - .address_space: global
+; CHECK-NEXT: .name: r
+; CHECK-NEXT: .offset: 0
+; CHECK-NEXT: .size: 8
+; CHECK-NEXT: .value_kind: global_buffer
+; CHECK-NEXT: - .address_space: global
+; CHECK-NEXT: .name: a
+; CHECK-NEXT: .offset: 8
+; CHECK-NEXT: .size: 8
+; CHECK-NEXT: .value_kind: global_buffer
+; CHECK-NEXT: - .address_space: global
+; CHECK-NEXT: .name: b
+; CHECK-NEXT: .offset: 16
+; CHECK-NEXT: .size: 8
+; CHECK-NEXT: .value_kind: global_buffer
+; CHECK-NEXT: - .address_space: local
+; CHECK-NEXT: .name: lds_ptr
+; CHECK-NEXT: .offset: 24
+; CHECK-NEXT: .pointee_align: 1
+; CHECK-NEXT: .size: 4
+; CHECK-NEXT: .value_kind: dynamic_shared_pointer
+; CHECK-NEXT: - .offset: 32
+; CHECK-NEXT: .size: 4
+; CHECK-NEXT: .value_kind: hidden_block_count_x
+; CHECK-NEXT: - .offset: 36
+; CHECK-NEXT: .size: 4
+; CHECK-NEXT: .value_kind: hidden_block_count_y
+; CHECK-NEXT: - .offset: 40
+; CHECK-NEXT: .size: 4
+; CHECK-NEXT: .value_kind: hidden_block_count_z
+; CHECK-NEXT: - .offset: 44
+; CHECK-NEXT: .size: 2
+; CHECK-NEXT: .value_kind: hidden_group_size_x
+; CHECK-NEXT: - .offset: 46
+; CHECK-NEXT: .size: 2
+; CHECK-NEXT: .value_kind: hidden_group_size_y
+; CHECK-NEXT: - .offset: 48
+; CHECK-NEXT: .size: 2
+; CHECK-NEXT: .value_kind: hidden_group_size_z
+; CHECK-NEXT: - .offset: 50
+; CHECK-NEXT: .size: 2
+; CHECK-NEXT: .value_kind: hidden_remainder_x
+; CHECK-NEXT: - .offset: 52
+; CHECK-NEXT: .size: 2
+; CHECK-NEXT: .value_kind: hidden_remainder_y
+; CHECK-NEXT: - .offset: 54
+; CHECK-NEXT: .size: 2
+; CHECK-NEXT: .value_kind: hidden_remainder_z
+; CHECK-NEXT: - .offset: 72
+; CHECK-NEXT: .size: 8
+; CHECK-NEXT: .value_kind: hidden_global_offset_x
+; CHECK-NEXT: - .offset: 80
+; CHECK-NEXT: .size: 8
+; CHECK-NEXT: .value_kind: hidden_global_offset_y
+; CHECK-NEXT: - .offset: 88
+; CHECK-NEXT: .size: 8
+; CHECK-NEXT: .value_kind: hidden_global_offset_z
+; CHECK-NEXT: - .offset: 96
+; CHECK-NEXT: .size: 2
+; CHECK-NEXT: .value_kind: hidden_grid_dims
+; CHECK-NEXT: - .offset: 104
+; CHECK-NEXT: .size: 8
+; CHECK-NEXT: .value_kind: hidden_printf_buffer
+; CHECK-NEXT: - .offset: 112
+; CHECK-NEXT: .size: 8
+; CHECK-NEXT: .value_kind: hidden_hostcall_buffer
+; CHECK-NEXT: - .offset: 120
+; CHECK-NEXT: .size: 8
+; CHECK-NEXT: .value_kind: hidden_multigrid_sync_arg
+; CHECK-NEXT: - .offset: 128
+; CHECK-NEXT: .size: 8
+; CHECK-NEXT: .value_kind: hidden_heap_v1
+; CHECK-NEXT: - .offset: 136
+; CHECK-NEXT: .size: 8
+; CHECK-NEXT: .value_kind: hidden_default_queue
+; CHECK-NEXT: - .offset: 144
+; CHECK-NEXT: .size: 8
+; CHECK-NEXT: .value_kind: hidden_completion_action
+; CHECK: - .offset: 152
+; CHECK-NEXT: .size: 4
+; CHECK-NEXT: .value_kind: hidden_dynamic_lds_size
+; CHECK: - .offset: 232
+; CHECK-NEXT: .size: 8
+; CHECK-NEXT: .value_kind: hidden_queue_ptr
+
+; CHECK: .name: test_v5
+; CHECK: .symbol: test_v5.kd
+
+; CHECK: amdhsa.version:
+; CHECK-NEXT: - 1
+; CHECK-NEXT: - 2
+
+define amdgpu_kernel void @test_v5(
+ ptr addrspace(1) %r,
+ ptr addrspace(1) %a,
+ ptr addrspace(1) %b,
+ ptr addrspace(3) %lds_ptr) #0 {
+entry:
+ %a.val = load half, ptr addrspace(1) %a
+ %b.val = load half, ptr addrspace(1) %b
+ %r.val = fadd half %a.val, %b.val
+ store half %r.val, ptr addrspace(1) %r
+ store i32 1234, ptr addrspace(3) %lds_ptr, align 4
+ ret void
+}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"amdgpu_code_object_version", i32 500}
+!llvm.printf.fmts = !{!1, !2}
+!1 = !{!"1:1:4:%d\5Cn"}
+!2 = !{!"2:1:8:%g\5Cn"}
+
+attributes #0 = { optnone noinline }
+
More information about the llvm-commits
mailing list