[Openmp-commits] [openmp] [Libomptarget] Handle dynamic stack sizes for AMD COV5 (PR #72606)
Joseph Huber via Openmp-commits
openmp-commits at lists.llvm.org
Thu Nov 16 19:27:58 PST 2023
https://github.com/jhuber6 created https://github.com/llvm/llvm-project/pull/72606
Summary:
One of the changes in the AMD code-object version five was that kernels
that use an unknown amount of private stack memory now no longer default
to 16 KBs. Instead it emits a flag that indicates the runtime must
provide a value. This patch checks if we must provide such a stack, and
uses the existing handling of the stack environment variable to
configure it.
>From 4179982f5713e83a3cccaf8eb9dd3fbc47903496 Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Thu, 16 Nov 2023 20:22:38 -0600
Subject: [PATCH] [Libomptarget] Handle dynamic stack sizes for AMD COV5
Summary:
One of the changes in the AMD code-object version five was that kernels
that use an unknown amount of private stack memory now no longer default
to 16 KBs. Instead it emits a flag that indicates the runtime must
provide a value. This patch checks if we must provide such a stack, and
uses the existing handling of the stack environment variable to
configure it.
---
.../plugins-nextgen/amdgpu/dynamic_hsa/hsa.h | 1 +
.../plugins-nextgen/amdgpu/src/rtl.cpp | 28 +++++++++++++++----
.../common/PluginInterface/PluginInterface.h | 3 +-
3 files changed, 25 insertions(+), 7 deletions(-)
diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/dynamic_hsa/hsa.h b/openmp/libomptarget/plugins-nextgen/amdgpu/dynamic_hsa/hsa.h
index 573a2ef8fc2005a..64a1d3308aed0bb 100644
--- a/openmp/libomptarget/plugins-nextgen/amdgpu/dynamic_hsa/hsa.h
+++ b/openmp/libomptarget/plugins-nextgen/amdgpu/dynamic_hsa/hsa.h
@@ -288,6 +288,7 @@ typedef enum {
HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE = 11,
HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE = 13,
HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE = 14,
+ HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK = 15,
} hsa_executable_symbol_info_t;
typedef struct hsa_code_object_s {
diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
index a529c379844e904..73c7986eb728d0f 100644
--- a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -436,6 +436,7 @@ struct AMDGPUKernelTy : public GenericKernelTy {
{HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &KernelObject},
{HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &ArgsSize},
{HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &GroupSize},
+ {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK, &DynamicStack},
{HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &PrivateSize}};
for (auto &Info : RequiredInfos) {
@@ -485,6 +486,9 @@ struct AMDGPUKernelTy : public GenericKernelTy {
/// @return 56 for cov4 and 256 for cov5
uint32_t getImplicitArgsSize() const { return ImplicitArgsSize; }
+ /// Indicates whether or not we need to set up our own private segment size.
+ bool usesDynamicStack() const { return DynamicStack; }
+
private:
/// The kernel object to execute.
uint64_t KernelObject;
@@ -493,6 +497,7 @@ struct AMDGPUKernelTy : public GenericKernelTy {
uint32_t ArgsSize;
uint32_t GroupSize;
uint32_t PrivateSize;
+ bool DynamicStack;
/// The size of implicit kernel arguments.
uint32_t ImplicitArgsSize;
@@ -621,7 +626,8 @@ struct AMDGPUQueueTy {
/// signal and can define an optional input signal (nullptr if none).
Error pushKernelLaunch(const AMDGPUKernelTy &Kernel, void *KernelArgs,
uint32_t NumThreads, uint64_t NumBlocks,
- uint32_t GroupSize, AMDGPUSignalTy *OutputSignal,
+ uint32_t GroupSize, uint64_t StackSize,
+ AMDGPUSignalTy *OutputSignal,
AMDGPUSignalTy *InputSignal) {
assert(OutputSignal && "Invalid kernel output signal");
@@ -658,7 +664,8 @@ struct AMDGPUQueueTy {
Packet->grid_size_x = NumBlocks * NumThreads;
Packet->grid_size_y = 1;
Packet->grid_size_z = 1;
- Packet->private_segment_size = Kernel.getPrivateSize();
+ Packet->private_segment_size =
+ Kernel.usesDynamicStack() ? Kernel.getPrivateSize() : StackSize;
Packet->group_segment_size = GroupSize;
Packet->kernel_object = Kernel.getKernelObject();
Packet->kernarg_address = KernelArgs;
@@ -1124,7 +1131,7 @@ struct AMDGPUStreamTy {
/// the kernel args buffer to the specified memory manager.
Error pushKernelLaunch(const AMDGPUKernelTy &Kernel, void *KernelArgs,
uint32_t NumThreads, uint64_t NumBlocks,
- uint32_t GroupSize,
+ uint32_t GroupSize, uint64_t StackSize,
AMDGPUMemoryManagerTy &MemoryManager) {
if (Queue == nullptr)
return Plugin::error("Target queue was nullptr");
@@ -1147,7 +1154,8 @@ struct AMDGPUStreamTy {
// Push the kernel with the output signal and an input signal (optional)
return Queue->pushKernelLaunch(Kernel, KernelArgs, NumThreads, NumBlocks,
- GroupSize, OutputSignal, InputSignal);
+ GroupSize, StackSize, OutputSignal,
+ InputSignal);
}
/// Push an asynchronous memory copy between pinned memory buffers.
@@ -2574,10 +2582,11 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
/// Getters and setters for stack and heap sizes.
Error getDeviceStackSize(uint64_t &Value) override {
- Value = 0;
+ Value = StackSize;
return Plugin::success();
}
Error setDeviceStackSize(uint64_t Value) override {
+ StackSize = Value;
return Plugin::success();
}
Error getDeviceHeapSize(uint64_t &Value) override {
@@ -2728,6 +2737,10 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
/// The current size of the global device memory pool (managed by us).
uint64_t DeviceMemoryPoolSize = 1L << 29L /* 512MB */;
+
+ /// The current size of the stack that will be used in cases where it could
+ /// not be statically determined.
+ uint64_t StackSize = 1024 /* 1 KB */;
};
Error AMDGPUDeviceImageTy::loadExecutable(const AMDGPUDeviceTy &Device) {
@@ -3100,6 +3113,9 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
GroupSize += MaxDynCGroupMem;
}
+ uint64_t StackSize;
+ GenericDevice.getDeviceStackSize(StackSize);
+
// Initialize implicit arguments.
utils::AMDGPUImplicitArgsTy *ImplArgs =
reinterpret_cast<utils::AMDGPUImplicitArgsTy *>(
@@ -3138,7 +3154,7 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
// Push the kernel launch into the stream.
return Stream->pushKernelLaunch(*this, AllArgs, NumThreads, NumBlocks,
- GroupSize, ArgsMemoryManager);
+ GroupSize, StackSize, ArgsMemoryManager);
}
Error AMDGPUKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
index f09ae24163dfc2b..9174ecaab08ca00 100644
--- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
+++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
@@ -864,6 +864,8 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
return 0;
}
+ virtual Error getDeviceStackSize(uint64_t &V) = 0;
+
private:
/// Register offload entry for global variable.
Error registerGlobalOffloadEntry(DeviceImageTy &DeviceImage,
@@ -882,7 +884,6 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
/// Get and set the stack size and heap size for the device. If not used, the
/// plugin can implement the setters as no-op and setting the output
/// value to zero for the getters.
- virtual Error getDeviceStackSize(uint64_t &V) = 0;
virtual Error setDeviceStackSize(uint64_t V) = 0;
virtual Error getDeviceHeapSize(uint64_t &V) = 0;
virtual Error setDeviceHeapSize(uint64_t V) = 0;
More information about the Openmp-commits
mailing list