[Openmp-commits] [openmp] [OpenMP][AMDGPU] Adapt dynamic callstack sizes to HIP behavior (PR #74080)

Fri Dec 1 08:18:28 PST 2023

https://github.com/mhalk updated https://github.com/llvm/llvm-project/pull/74080

>From 9ca5835b386967ac8cdb70b67e57ca10637591df Mon Sep 17 00:00:00 2001
From: Michael Halkenhaeuser <MichaelGerald.Halkenhauser at amd.com>
Date: Fri, 1 Dec 2023 07:55:01 -0600
Subject: [PATCH] [OpenMP][AMDGPU] Adapt dynamic callstack sizes to HIP
 behavior

Added a mechanism to cap values provided via LIBOMPTARGET_STACK_SIZE to a
GFX-dependent value.

Changed several minor properties to be in sync with HIP:
  1. Default device stack size: 1024 / 1 KiB (hipLimitStackSize).
  2. During AQL packet generation in case of a dyn callstack the maximum
     between user-provided and compiler-default is chosen.
  3. Make sure we only allow 32bit values for stack size.

Added testcase where a dynamic stack is required due to recursion.
---
 .../plugins-nextgen/amdgpu/src/rtl.cpp        | 48 +++++++++--
 .../test/offloading/dynamic_callstack.c       | 80 +++++++++++++++++++
 2 files changed, 122 insertions(+), 6 deletions(-)
 create mode 100644 openmp/libomptarget/test/offloading/dynamic_callstack.c

diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
index 69acfa54e6c96a3..bb2693edf0a6963 100644
--- a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -666,7 +666,7 @@ struct AMDGPUQueueTy {
   /// signal and can define an optional input signal (nullptr if none).
   Error pushKernelLaunch(const AMDGPUKernelTy &Kernel, void *KernelArgs,
                          uint32_t NumThreads, uint64_t NumBlocks,
-                         uint32_t GroupSize, uint64_t StackSize,
+                         uint32_t GroupSize, uint32_t StackSize,
                          AMDGPUSignalTy *OutputSignal,
                          AMDGPUSignalTy *InputSignal) {
     assert(OutputSignal && "Invalid kernel output signal");
@@ -705,7 +705,8 @@ struct AMDGPUQueueTy {
     Packet->grid_size_y = 1;
     Packet->grid_size_z = 1;
     Packet->private_segment_size =
-        Kernel.usesDynamicStack() ? StackSize : Kernel.getPrivateSize();
+        Kernel.usesDynamicStack() ? std::max(Kernel.getPrivateSize(), StackSize)
+                                  : Kernel.getPrivateSize();
     Packet->group_segment_size = GroupSize;
     Packet->kernel_object = Kernel.getKernelObject();
     Packet->kernarg_address = KernelArgs;
@@ -1174,7 +1175,7 @@ struct AMDGPUStreamTy {
   /// the kernel args buffer to the specified memory manager.
   Error pushKernelLaunch(const AMDGPUKernelTy &Kernel, void *KernelArgs,
                          uint32_t NumThreads, uint64_t NumBlocks,
-                         uint32_t GroupSize, uint64_t StackSize,
+                         uint32_t GroupSize, uint32_t StackSize,
                          AMDGPUMemoryManagerTy &MemoryManager) {
     if (Queue == nullptr)
       return Plugin::error("Target queue was nullptr");
@@ -1872,6 +1873,25 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
     else
       return Plugin::error("Unexpected AMDGPU wavefront %d", WavefrontSize);
 
+    // To determine the correct scratch memory size per thread, we need to check
+    // the device architecure generation. According to AOT_OFFLOADARCHS we may
+    // assume that AMDGPU offload archs are prefixed with "gfx" and suffixed
+    // with a two char arch specialization. In-between is the 1-2 char
+    // generation number we want to extract.
+    StringRef Arch(ComputeUnitKind);
+    unsigned GfxGen = 0u;
+    if (!llvm::to_integer(Arch.slice(sizeof("gfx") - 1, Arch.size() - 2),
+                          GfxGen))
+      return Plugin::error("Invalid GFX architecture string");
+
+    // See: 'getMaxWaveScratchSize' in 'llvm/lib/Target/AMDGPU/GCNSubtarget.h'.
+    // But we need to divide by WavefrontSize.
+    // For generations pre-gfx11: use 13-bit field in units of 256-dword,
+    // otherwise: 15-bit field in units of 64-dword.
+    MaxThreadScratchSize = (GfxGen < 11)
+                               ? ((256 * 4) / WavefrontSize) * ((1 << 13) - 1)
+                               : ((64 * 4) / WavefrontSize) * ((1 << 15) - 1);
+
     // Get maximum number of workitems per workgroup.
     uint16_t WorkgroupMaxDim[3];
     if (auto Err =
@@ -2623,7 +2643,17 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
     return Plugin::success();
   }
   Error setDeviceStackSize(uint64_t Value) override {
-    StackSize = Value;
+    if (Value > MaxThreadScratchSize) {
+      // Cap device scratch size.
+      MESSAGE("Scratch memory size will be set to %d. Reason: Requested size "
+              "%ld would exceed available resources.",
+              MaxThreadScratchSize, Value);
+      StackSize = MaxThreadScratchSize;
+    } else {
+      // Apply device scratch size, since it is within limits.
+      StackSize = Value;
+    }
+
     return Plugin::success();
   }
   Error getDeviceHeapSize(uint64_t &Value) override {
@@ -2782,7 +2812,12 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
 
   /// The current size of the stack that will be used in cases where it could
   /// not be statically determined.
-  uint64_t StackSize = 16 * 1024 /* 16 KB */;
+  /// Default: 1024, in conformity to hipLimitStackSize.
+  uint64_t StackSize = 1024 /* 1 KiB */;
+
+  // The maximum scratch memory size per thread.
+  // See COMPUTE_TMPRING_SIZE.WAVESIZE (divided by threads per wave).
+  uint32_t MaxThreadScratchSize;
 };
 
 Error AMDGPUDeviceImageTy::loadExecutable(const AMDGPUDeviceTy &Device) {
@@ -3198,7 +3233,8 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
 
   // Push the kernel launch into the stream.
   return Stream->pushKernelLaunch(*this, AllArgs, NumThreads, NumBlocks,
-                                  GroupSize, StackSize, ArgsMemoryManager);
+                                  GroupSize, static_cast<uint32_t>(StackSize),
+                                  ArgsMemoryManager);
 }
 
 Error AMDGPUKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
diff --git a/openmp/libomptarget/test/offloading/dynamic_callstack.c b/openmp/libomptarget/test/offloading/dynamic_callstack.c
new file mode 100644
index 000000000000000..9de30d7b7b690b0
--- /dev/null
+++ b/openmp/libomptarget/test/offloading/dynamic_callstack.c
@@ -0,0 +1,80 @@
+#include <omp.h>
+#include <stdio.h>
+
+// RUN: %libomptarget-compile-amdgcn-amd-amdhsa -O2 -mcode-object-version=5
+
+// RUN: env OMP_TARGET_OFFLOAD=MANDATORY \
+// RUN: env LIBOMPTARGET_STACK_SIZE=4096 \
+// RUN:   %libomptarget-run-amdgcn-amd-amdhsa 2>&1 \
+// RUN:   | %fcheck-amdgcn-amd-amdhsa
+
+// RUN: env OMP_TARGET_OFFLOAD=MANDATORY \
+// RUN: env LIBOMPTARGET_STACK_SIZE=131073 \
+// RUN:   %libomptarget-run-amdgcn-amd-amdhsa 2>&1 \
+// RUN:   | %fcheck-amdgcn-amd-amdhsa -check-prefix=LIMIT_EXCEEDED
+
+// TODO: Realize the following run in an acceptable manner.
+//       Unfortunately with insufficient scratch mem size programs will hang.
+//       Therefore, a timeout mechanism would help tremendously.
+//       Additionally, we need to allow empty output / unsuccessful execution.
+
+// RUN?: env OMP_TARGET_OFFLOAD=MANDATORY \
+// RUN?: env LIBOMPTARGET_STACK_SIZE=16 \
+// RUN?:   timeout 10 %libomptarget-run-amdgcn-amd-amdhsa 2>&1 \
+// RUN?:   | %fcheck-amdgcn-amd-amdhsa -check-prefix=LIMIT_INSUFFICIENT \
+// RUN?:   --allow-empty
+
+// REQUIRES: amdgcn-amd-amdhsa
+
+// Cause the compiler to set amdhsa_uses_dynamic_stack to '1' using recursion.
+// That is: stack requirement for main's target region may not be calculated.
+
+// This recursive function will eventually return 0.
+int recursiveFunc(const int Recursions) {
+  if (Recursions < 1)
+    return 0;
+
+  int j[Recursions];
+#pragma omp target private(j)
+  { ; }
+
+  return recursiveFunc(Recursions - 1);
+}
+
+int main() {
+  int N = 256;
+  int a[N];
+  int b[N];
+  int i;
+
+  for (i = 0; i < N; i++)
+    a[i] = 0;
+
+  for (i = 0; i < N; i++)
+    b[i] = i;
+
+#pragma omp target parallel for
+  {
+    for (int j = 0; j < N; j++)
+      a[j] = b[j] + recursiveFunc(j);
+  }
+
+  int rc = 0;
+  for (i = 0; i < N; i++)
+    if (a[i] != b[i]) {
+      rc++;
+      printf("Wrong value: a[%d]=%d\n", i, a[i]);
+    }
+
+  if (!rc)
+    printf("Success\n");
+
+  return rc;
+}
+
+/// CHECK: Success
+
+/// LIMIT_EXCEEDED: Scratch memory size will be set to
+/// LIMIT_EXCEEDED: Success
+
+/// LIMIT_INSUFFICIENT-NOT: Success