[Openmp-commits] [openmp] 47a3ad5 - [Libomptarget] Handle dynamic stack sizes for AMD COV5 (#72606)
via Openmp-commits
openmp-commits at lists.llvm.org
Mon Nov 20 10:48:47 PST 2023
Author: Joseph Huber
Date: 2023-11-20T12:48:42-06:00
New Revision: 47a3ad5be1c60fc0bd40bef5b53907bb1792b6e5
URL: https://github.com/llvm/llvm-project/commit/47a3ad5be1c60fc0bd40bef5b53907bb1792b6e5
DIFF: https://github.com/llvm/llvm-project/commit/47a3ad5be1c60fc0bd40bef5b53907bb1792b6e5.diff
LOG: [Libomptarget] Handle dynamic stack sizes for AMD COV5 (#72606)
Summary:
One of the changes in AMD code-object version 5 (COV5) is that kernels
which use an unknown amount of private stack memory no longer default
to 16 KB. Instead, a flag is emitted indicating that the runtime must
provide a value. This patch checks whether we must provide such a stack
size and, if so, uses the existing handling of the stack environment
variable to configure it.
Added:
Modified:
openmp/libomptarget/plugins-nextgen/amdgpu/dynamic_hsa/hsa.h
openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
Removed:
################################################################################
diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/dynamic_hsa/hsa.h b/openmp/libomptarget/plugins-nextgen/amdgpu/dynamic_hsa/hsa.h
index 573a2ef8fc2005a..64a1d3308aed0bb 100644
--- a/openmp/libomptarget/plugins-nextgen/amdgpu/dynamic_hsa/hsa.h
+++ b/openmp/libomptarget/plugins-nextgen/amdgpu/dynamic_hsa/hsa.h
@@ -288,6 +288,7 @@ typedef enum {
HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE = 11,
HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE = 13,
HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE = 14,
+ HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK = 15,
} hsa_executable_symbol_info_t;
typedef struct hsa_code_object_s {
diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
index 8f64baa22cb39f0..aacc537401bd23e 100644
--- a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -475,6 +475,7 @@ struct AMDGPUKernelTy : public GenericKernelTy {
{HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &KernelObject},
{HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &ArgsSize},
{HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &GroupSize},
+ {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK, &DynamicStack},
{HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &PrivateSize}};
for (auto &Info : RequiredInfos) {
@@ -524,6 +525,9 @@ struct AMDGPUKernelTy : public GenericKernelTy {
/// @return 56 for cov4 and 256 for cov5
uint32_t getImplicitArgsSize() const { return ImplicitArgsSize; }
+ /// Indicates whether or not we need to set up our own private segment size.
+ bool usesDynamicStack() const { return DynamicStack; }
+
private:
/// The kernel object to execute.
uint64_t KernelObject;
@@ -532,6 +536,7 @@ struct AMDGPUKernelTy : public GenericKernelTy {
uint32_t ArgsSize;
uint32_t GroupSize;
uint32_t PrivateSize;
+ bool DynamicStack;
/// The size of implicit kernel arguments.
uint32_t ImplicitArgsSize;
@@ -660,7 +665,8 @@ struct AMDGPUQueueTy {
/// signal and can define an optional input signal (nullptr if none).
Error pushKernelLaunch(const AMDGPUKernelTy &Kernel, void *KernelArgs,
uint32_t NumThreads, uint64_t NumBlocks,
- uint32_t GroupSize, AMDGPUSignalTy *OutputSignal,
+ uint32_t GroupSize, uint64_t StackSize,
+ AMDGPUSignalTy *OutputSignal,
AMDGPUSignalTy *InputSignal) {
assert(OutputSignal && "Invalid kernel output signal");
@@ -697,7 +703,8 @@ struct AMDGPUQueueTy {
Packet->grid_size_x = NumBlocks * NumThreads;
Packet->grid_size_y = 1;
Packet->grid_size_z = 1;
- Packet->private_segment_size = Kernel.getPrivateSize();
+ Packet->private_segment_size =
+ Kernel.usesDynamicStack() ? StackSize : Kernel.getPrivateSize();
Packet->group_segment_size = GroupSize;
Packet->kernel_object = Kernel.getKernelObject();
Packet->kernarg_address = KernelArgs;
@@ -1166,7 +1173,7 @@ struct AMDGPUStreamTy {
/// the kernel args buffer to the specified memory manager.
Error pushKernelLaunch(const AMDGPUKernelTy &Kernel, void *KernelArgs,
uint32_t NumThreads, uint64_t NumBlocks,
- uint32_t GroupSize,
+ uint32_t GroupSize, uint64_t StackSize,
AMDGPUMemoryManagerTy &MemoryManager) {
if (Queue == nullptr)
return Plugin::error("Target queue was nullptr");
@@ -1189,7 +1196,8 @@ struct AMDGPUStreamTy {
// Push the kernel with the output signal and an input signal (optional)
return Queue->pushKernelLaunch(Kernel, KernelArgs, NumThreads, NumBlocks,
- GroupSize, OutputSignal, InputSignal);
+ GroupSize, StackSize, OutputSignal,
+ InputSignal);
}
/// Push an asynchronous memory copy between pinned memory buffers.
@@ -2610,10 +2618,11 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
/// Getters and setters for stack and heap sizes.
Error getDeviceStackSize(uint64_t &Value) override {
- Value = 0;
+ Value = StackSize;
return Plugin::success();
}
Error setDeviceStackSize(uint64_t Value) override {
+ StackSize = Value;
return Plugin::success();
}
Error getDeviceHeapSize(uint64_t &Value) override {
@@ -2769,6 +2778,10 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
/// The current size of the global device memory pool (managed by us).
uint64_t DeviceMemoryPoolSize = 1L << 29L /* 512MB */;
+
+ /// The current size of the stack that will be used in cases where it could
+ /// not be statically determined.
+ uint64_t StackSize = 16 * 1024 /* 16 KB */;
};
Error AMDGPUDeviceImageTy::loadExecutable(const AMDGPUDeviceTy &Device) {
@@ -3142,6 +3155,10 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
GroupSize += MaxDynCGroupMem;
}
+ uint64_t StackSize;
+ if (auto Err = GenericDevice.getDeviceStackSize(StackSize))
+ return Err;
+
// Initialize implicit arguments.
utils::AMDGPUImplicitArgsTy *ImplArgs =
reinterpret_cast<utils::AMDGPUImplicitArgsTy *>(
@@ -3180,7 +3197,7 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
// Push the kernel launch into the stream.
return Stream->pushKernelLaunch(*this, AllArgs, NumThreads, NumBlocks,
- GroupSize, ArgsMemoryManager);
+ GroupSize, StackSize, ArgsMemoryManager);
}
Error AMDGPUKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
index f09ae24163dfc2b..9174ecaab08ca00 100644
--- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
+++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
@@ -864,6 +864,8 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
return 0;
}
+ virtual Error getDeviceStackSize(uint64_t &V) = 0;
+
private:
/// Register offload entry for global variable.
Error registerGlobalOffloadEntry(DeviceImageTy &DeviceImage,
@@ -882,7 +884,6 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
/// Get and set the stack size and heap size for the device. If not used, the
/// plugin can implement the setters as no-op and setting the output
/// value to zero for the getters.
- virtual Error getDeviceStackSize(uint64_t &V) = 0;
virtual Error setDeviceStackSize(uint64_t V) = 0;
virtual Error getDeviceHeapSize(uint64_t &V) = 0;
virtual Error setDeviceHeapSize(uint64_t V) = 0;
More information about the Openmp-commits
mailing list