[Openmp-commits] [openmp] [OpenMP][AMDGPU] Adapt dynamic callstack sizes to HIP behavior (PR #74080)
Michael Halkenhäuser via Openmp-commits
openmp-commits at lists.llvm.org
Fri Dec 1 06:07:55 PST 2023
https://github.com/mhalk created https://github.com/llvm/llvm-project/pull/74080
Added a mechanism to cap values provided via LIBOMPTARGET_STACK_SIZE to a GFX-dependent value.
Changed several minor properties to be in sync with HIP:
1. Default device stack size: 1024 / 1 KiB (hipLimitStackSize).
2. During AQL packet generation in case of a dyn callstack the maximum between user-provided and compiler-default is chosen.
3. Make sure we only allow 32bit values for stack size.
Added testcase where a dynamic stack is required due to recursion.
>From 444628c553af869ca08b19925d681282df30aa79 Mon Sep 17 00:00:00 2001
From: Michael Halkenhaeuser <MichaelGerald.Halkenhauser at amd.com>
Date: Fri, 1 Dec 2023 07:55:01 -0600
Subject: [PATCH] [OpenMP][AMDGPU] Adapt dynamic callstack sizes to HIP
behavior
Added a mechanism to cap values provided via LIBOMPTARGET_STACK_SIZE to a
GFX-dependent value.
Changed several minor properties to be in sync with HIP:
1. Default device stack size: 1024 / 1 KiB (hipLimitStackSize).
2. During AQL packet generation in case of a dyn callstack the maximum
between user-provided and compiler-default is chosen.
3. Make sure we only allow 32bit values for stack size.
Added testcase where a dynamic stack is required due to recursion.
---
.../plugins-nextgen/amdgpu/src/rtl.cpp | 61 ++++++++++++--
.../test/offloading/dynamic_callstack.c | 80 +++++++++++++++++++
2 files changed, 135 insertions(+), 6 deletions(-)
create mode 100644 openmp/libomptarget/test/offloading/dynamic_callstack.c
diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
index 69acfa54e6c96a3..34d5ebbf1173bac 100644
--- a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -666,7 +666,7 @@ struct AMDGPUQueueTy {
/// signal and can define an optional input signal (nullptr if none).
Error pushKernelLaunch(const AMDGPUKernelTy &Kernel, void *KernelArgs,
uint32_t NumThreads, uint64_t NumBlocks,
- uint32_t GroupSize, uint64_t StackSize,
+ uint32_t GroupSize, uint32_t StackSize,
AMDGPUSignalTy *OutputSignal,
AMDGPUSignalTy *InputSignal) {
assert(OutputSignal && "Invalid kernel output signal");
@@ -705,7 +705,8 @@ struct AMDGPUQueueTy {
Packet->grid_size_y = 1;
Packet->grid_size_z = 1;
Packet->private_segment_size =
- Kernel.usesDynamicStack() ? StackSize : Kernel.getPrivateSize();
+ Kernel.usesDynamicStack() ? std::max(Kernel.getPrivateSize(), StackSize)
+ : Kernel.getPrivateSize();
Packet->group_segment_size = GroupSize;
Packet->kernel_object = Kernel.getKernelObject();
Packet->kernarg_address = KernelArgs;
@@ -1174,7 +1175,7 @@ struct AMDGPUStreamTy {
/// the kernel args buffer to the specified memory manager.
Error pushKernelLaunch(const AMDGPUKernelTy &Kernel, void *KernelArgs,
uint32_t NumThreads, uint64_t NumBlocks,
- uint32_t GroupSize, uint64_t StackSize,
+ uint32_t GroupSize, uint32_t StackSize,
AMDGPUMemoryManagerTy &MemoryManager) {
if (Queue == nullptr)
return Plugin::error("Target queue was nullptr");
@@ -1872,6 +1873,38 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
else
return Plugin::error("Unexpected AMDGPU wavefront %d", WavefrontSize);
+ // To determine the correct scratch memory size per thread, we need to check
+ // the device architecure generation. According to AOT_OFFLOADARCHS we may
+ // assume that AMDGPU offload archs are prefixed with "gfx" and suffixed
+ // with a two char arch specialization. In-between is the 1-2 char
+ // generation number we want to extract.
+ std::string CUKind{ComputeUnitKind};
+ for (auto &C : CUKind)
+ C = (char)std::tolower(C);
+
+ int GfxGen = 0;
+ if ((CUKind.find("gfx") == 0) && CUKind.length() > 5 &&
+ CUKind.length() < 8) {
+ // Cut away suffix & prefix.
+ CUKind.erase(CUKind.length() - 2, 2);
+ CUKind.erase(0, 3);
+ // Make sure we only convert digits to a number.
+ if (std::find_if(CUKind.begin(), CUKind.end(), [](unsigned char c) {
+ return !std::isdigit(c);
+ }) == CUKind.end())
+ GfxGen = std::stoi(CUKind);
+ }
+
+ // See: 'getMaxWaveScratchSize' in 'llvm/lib/Target/AMDGPU/GCNSubtarget.h'.
+ // But we need to divide by WavefrontSize.
+ if (GfxGen < 11) {
+ // 13-bit field in units of 256-dword.
+ MaxThreadScratchSize = ((256 * 4) / WavefrontSize) * ((1 << 13) - 1);
+ } else {
+ // 15-bit field in units of 64-dword.
+ MaxThreadScratchSize = ((64 * 4) / WavefrontSize) * ((1 << 15) - 1);
+ }
+
// Get maximum number of workitems per workgroup.
uint16_t WorkgroupMaxDim[3];
if (auto Err =
@@ -2623,7 +2656,17 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
return Plugin::success();
}
Error setDeviceStackSize(uint64_t Value) override {
- StackSize = Value;
+ if (Value > MaxThreadScratchSize) {
+ // Cap device scratch size.
+ MESSAGE("Scratch memory size will be set to %d. Reason: Requested size "
+ "%ld would exceed available resources.",
+ MaxThreadScratchSize, Value);
+ StackSize = MaxThreadScratchSize;
+ } else {
+ // Apply device scratch size, since it is within limits.
+ StackSize = Value;
+ }
+
return Plugin::success();
}
Error getDeviceHeapSize(uint64_t &Value) override {
@@ -2782,7 +2825,12 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
/// The current size of the stack that will be used in cases where it could
/// not be statically determined.
- uint64_t StackSize = 16 * 1024 /* 16 KB */;
+ /// Default: 1024, in conformity to hipLimitStackSize.
+ uint64_t StackSize = 1024 /* 1 KiB */;
+
+ // The maximum scratch memory size per thread.
+ // See COMPUTE_TMPRING_SIZE.WAVESIZE (divided by threads per wave).
+ uint32_t MaxThreadScratchSize;
};
Error AMDGPUDeviceImageTy::loadExecutable(const AMDGPUDeviceTy &Device) {
@@ -3198,7 +3246,8 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
// Push the kernel launch into the stream.
return Stream->pushKernelLaunch(*this, AllArgs, NumThreads, NumBlocks,
- GroupSize, StackSize, ArgsMemoryManager);
+ GroupSize, static_cast<uint32_t>(StackSize),
+ ArgsMemoryManager);
}
Error AMDGPUKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
diff --git a/openmp/libomptarget/test/offloading/dynamic_callstack.c b/openmp/libomptarget/test/offloading/dynamic_callstack.c
new file mode 100644
index 000000000000000..9de30d7b7b690b0
--- /dev/null
+++ b/openmp/libomptarget/test/offloading/dynamic_callstack.c
@@ -0,0 +1,80 @@
+#include <omp.h>
+#include <stdio.h>
+
+// RUN: %libomptarget-compile-amdgcn-amd-amdhsa -O2 -mcode-object-version=5
+
+// RUN: env OMP_TARGET_OFFLOAD=MANDATORY \
+// RUN: env LIBOMPTARGET_STACK_SIZE=4096 \
+// RUN: %libomptarget-run-amdgcn-amd-amdhsa 2>&1 \
+// RUN: | %fcheck-amdgcn-amd-amdhsa
+
+// RUN: env OMP_TARGET_OFFLOAD=MANDATORY \
+// RUN: env LIBOMPTARGET_STACK_SIZE=131073 \
+// RUN: %libomptarget-run-amdgcn-amd-amdhsa 2>&1 \
+// RUN: | %fcheck-amdgcn-amd-amdhsa -check-prefix=LIMIT_EXCEEDED
+
+// TODO: Realize the following run in an acceptable manner.
+// Unfortunately with insufficient scratch mem size programs will hang.
+// Therefore, a timeout mechanism would help tremendously.
+// Additionally, we need to allow empty output / unsuccessful execution.
+
+// RUN?: env OMP_TARGET_OFFLOAD=MANDATORY \
+// RUN?: env LIBOMPTARGET_STACK_SIZE=16 \
+// RUN?: timeout 10 %libomptarget-run-amdgcn-amd-amdhsa 2>&1 \
+// RUN?: | %fcheck-amdgcn-amd-amdhsa -check-prefix=LIMIT_INSUFFICIENT \
+// RUN?: --allow-empty
+
+// REQUIRES: amdgcn-amd-amdhsa
+
+// Cause the compiler to set amdhsa_uses_dynamic_stack to '1' using recursion.
+// That is: stack requirement for main's target region may not be calculated.
+
+// This recursive function will eventually return 0.
+int recursiveFunc(const int Recursions) {
+ if (Recursions < 1)
+ return 0;
+
+ int j[Recursions];
+#pragma omp target private(j)
+ { ; }
+
+ return recursiveFunc(Recursions - 1);
+}
+
+int main() {
+ int N = 256;
+ int a[N];
+ int b[N];
+ int i;
+
+ for (i = 0; i < N; i++)
+ a[i] = 0;
+
+ for (i = 0; i < N; i++)
+ b[i] = i;
+
+#pragma omp target parallel for
+ {
+ for (int j = 0; j < N; j++)
+ a[j] = b[j] + recursiveFunc(j);
+ }
+
+ int rc = 0;
+ for (i = 0; i < N; i++)
+ if (a[i] != b[i]) {
+ rc++;
+ printf("Wrong value: a[%d]=%d\n", i, a[i]);
+ }
+
+ if (!rc)
+ printf("Success\n");
+
+ return rc;
+}
+
+/// CHECK: Success
+
+/// LIMIT_EXCEEDED: Scratch memory size will be set to
+/// LIMIT_EXCEEDED: Success
+
+/// LIMIT_INSUFFICIENT-NOT: Success
More information about the Openmp-commits
mailing list