[llvm] [Offload][AMDGPU] Correctly handle variable implicit argument sizes (PR #142199)

Joseph Huber via llvm-commits llvm-commits at lists.llvm.org
Mon Jun 2 04:35:31 PDT 2025


https://github.com/jhuber6 updated https://github.com/llvm/llvm-project/pull/142199

>From 21f93c3df0c680c7b7657c7ea1b922ef0eb5bd8a Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Fri, 30 May 2025 13:46:37 -0500
Subject: [PATCH] [Offload][AMDGPU] Correctly handle variable implicit argument
 sizes

Summary:
The size of the implicit argument struct can vary depending on
optimizations, it is not always the size as listed by the full struct.
Additionally, the implicit arguments are always aligned on a pointer
boundary. This patch updates the handling to use the correctly aligned
offset and only initialize the members if they are contained in the
reported size.

Additionally, we modify the `alloc` and `free` routines to allow
`alloc(0)` and `free(nullptr)` as these are mandated by the C standard
and allow us to easily handle cases where the user calls a kernel with
no arguments.
---
 offload/plugins-nextgen/amdgpu/src/rtl.cpp    | 56 ++++++++++++-------
 .../amdgpu/utils/UtilitiesRTL.h               | 15 ++++-
 2 files changed, 49 insertions(+), 22 deletions(-)

diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index 2733796611d9b..2a40af4a88014 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -420,7 +420,7 @@ struct AMDGPUMemoryManagerTy : public DeviceAllocatorTy {
     assert(PtrStorage && "Invalid pointer storage");
 
     *PtrStorage = MemoryManager->allocate(Size, nullptr);
-    if (*PtrStorage == nullptr)
+    if (Size && *PtrStorage == nullptr)
       return Plugin::error(ErrorCode::OUT_OF_RESOURCES,
                            "failure to allocate from AMDGPU memory manager");
 
@@ -429,8 +429,6 @@ struct AMDGPUMemoryManagerTy : public DeviceAllocatorTy {
 
   /// Release an allocation to be reused.
   Error deallocate(void *Ptr) {
-    assert(Ptr && "Invalid pointer");
-
     if (MemoryManager->free(Ptr))
       return Plugin::error(ErrorCode::UNKNOWN,
                            "failure to deallocate from AMDGPU memory manager");
@@ -3365,7 +3363,7 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
                                  KernelLaunchParamsTy LaunchParams,
                                  AsyncInfoWrapperTy &AsyncInfoWrapper) const {
   if (ArgsSize != LaunchParams.Size &&
-      ArgsSize != LaunchParams.Size + getImplicitArgsSize())
+      ArgsSize > LaunchParams.Size + getImplicitArgsSize())
     return Plugin::error(ErrorCode::INVALID_ARGUMENT,
                          "mismatch of kernel arguments size");
 
@@ -3401,23 +3399,39 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
   if (auto Err = AMDGPUDevice.getStream(AsyncInfoWrapper, Stream))
     return Err;
 
-  hsa_utils::AMDGPUImplicitArgsTy *ImplArgs = nullptr;
-  if (ArgsSize == LaunchParams.Size + getImplicitArgsSize()) {
-    ImplArgs = reinterpret_cast<hsa_utils::AMDGPUImplicitArgsTy *>(
-        utils::advancePtr(AllArgs, LaunchParams.Size));
-
-    // Set the COV5+ implicit arguments to the appropriate values.
-    std::memset(ImplArgs, 0, getImplicitArgsSize());
-    ImplArgs->BlockCountX = NumBlocks[0];
-    ImplArgs->BlockCountY = NumBlocks[1];
-    ImplArgs->BlockCountZ = NumBlocks[2];
-    ImplArgs->GroupSizeX = NumThreads[0];
-    ImplArgs->GroupSizeY = NumThreads[1];
-    ImplArgs->GroupSizeZ = NumThreads[2];
-    ImplArgs->GridDims = NumBlocks[2] * NumThreads[2] > 1
-                             ? 3
-                             : 1 + (NumBlocks[1] * NumThreads[1] != 1);
-    ImplArgs->DynamicLdsSize = KernelArgs.DynCGroupMem;
+  uint64_t ImplArgsOffset = utils::roundUp(
+      LaunchParams.Size, alignof(hsa_utils::AMDGPUImplicitArgsTy));
+  if (ArgsSize > ImplArgsOffset) {
+    hsa_utils::AMDGPUImplicitArgsTy *ImplArgs =
+        reinterpret_cast<hsa_utils::AMDGPUImplicitArgsTy *>(
+            utils::advancePtr(AllArgs, ImplArgsOffset));
+
+    // Set the COV5+ implicit arguments to the appropriate values if present.
+    uint64_t ImplArgsSize = ArgsSize - ImplArgsOffset;
+    std::memset(ImplArgs, 0, ImplArgsSize);
+
+    using ImplArgsTy = hsa_utils::AMDGPUImplicitArgsTy;
+    hsa_utils::initImplArg(ImplArgs, &ImplArgsTy::BlockCountX, ImplArgsSize,
+                           NumBlocks[0]);
+    hsa_utils::initImplArg(ImplArgs, &ImplArgsTy::BlockCountY, ImplArgsSize,
+                           NumBlocks[1]);
+    hsa_utils::initImplArg(ImplArgs, &ImplArgsTy::BlockCountZ, ImplArgsSize,
+                           NumBlocks[2]);
+
+    hsa_utils::initImplArg(ImplArgs, &ImplArgsTy::GroupSizeX, ImplArgsSize,
+                           NumThreads[0]);
+    hsa_utils::initImplArg(ImplArgs, &ImplArgsTy::GroupSizeY, ImplArgsSize,
+                           NumThreads[1]);
+    hsa_utils::initImplArg(ImplArgs, &ImplArgsTy::GroupSizeZ, ImplArgsSize,
+                           NumThreads[2]);
+
+    hsa_utils::initImplArg(ImplArgs, &ImplArgsTy::GridDims, ImplArgsSize,
+                           NumBlocks[2] * NumThreads[2] > 1
+                               ? 3
+                               : 1 + (NumBlocks[1] * NumThreads[1] != 1));
+
+    hsa_utils::initImplArg(ImplArgs, &ImplArgsTy::DynamicLdsSize, ImplArgsSize,
+                           KernelArgs.DynCGroupMem);
   }
 
   // Push the kernel launch into the stream.
diff --git a/offload/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h b/offload/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h
index 609ead942dbb3..77c756e006029 100644
--- a/offload/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h
+++ b/offload/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h
@@ -13,6 +13,7 @@
 #include <cstdint>
 
 #include "Shared/Debug.h"
+#include "Shared/Utils.h"
 #include "Utils/ELF.h"
 
 #include "omptarget.h"
@@ -26,7 +27,7 @@ namespace plugin {
 namespace hsa_utils {
 
 // The implicit arguments of COV5 AMDGPU kernels.
-struct AMDGPUImplicitArgsTy {
+struct alignas(alignof(void *)) AMDGPUImplicitArgsTy {
   uint32_t BlockCountX;
   uint32_t BlockCountY;
   uint32_t BlockCountZ;
@@ -60,6 +61,18 @@ inline Error readAMDGPUMetaDataFromImage(
   return Err;
 }
 
+/// Initializes the HSA implicit argument if the struct size permits it. This is
+/// necessary because optimizations can modify the size of the struct if
+/// portions of it are unused.
+template <typename MemberTy, typename T>
+void initImplArg(AMDGPUImplicitArgsTy *Base,
+                 MemberTy AMDGPUImplicitArgsTy::*Member, size_t AvailableSize,
+                 T Value) {
+  uint64_t Offset = utils::getPtrDiff(&(Base->*Member), Base);
+  if (Offset + sizeof(MemberTy) <= AvailableSize)
+    Base->*Member = static_cast<MemberTy>(Value);
+}
+
 } // namespace hsa_utils
 } // namespace plugin
 } // namespace target



More information about the llvm-commits mailing list