[llvm-branch-commits] [llvm] [openmp] [OpenMP][Offload] Add offload runtime support for dyn_groupprivate clause (PR #152831)
Kevin Sala Penades via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Sat Aug 9 22:56:10 PDT 2025
https://github.com/kevinsala updated https://github.com/llvm/llvm-project/pull/152831
From fa3c7425ae9e5ffea83841f2be61b0f494b99038 Mon Sep 17 00:00:00 2001
From: Kevin Sala <salapenades1 at llnl.gov>
Date: Fri, 8 Aug 2025 11:25:14 -0700
Subject: [PATCH 1/2] [OpenMP][Offload] Add offload runtime support for
dyn_groupprivate clause
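This patch adds the offload runtime entry points behind the dyn_groupprivate
clause: omp_get_dyn_groupprivate_ptr, omp_get_dyn_groupprivate_size, and
omp_get_groupprivate_limit. Below is a minimal usage sketch condensed from the
new test in offload/test/offloading/dyn_groupprivate_strict.cpp; the
dyn_groupprivate clause itself relies on matching compiler support that is not
part of this patch, and the clause/modifier spelling may still change:

    #include <omp.h>
    #include <stdio.h>

    int main() {
      // Host-side query: upper bound on per-team (contention group) memory.
      size_t Limit = omp_get_groupprivate_limit(/*device_num=*/0, omp_access_cgroup);
      printf("groupprivate limit: %zu bytes\n", Limit);

      int Failed = 0;
    #pragma omp target dyn_groupprivate(fallback : 1024) map(tofrom : Failed)
      {
        int IsFallback;
        // Per-team dynamic buffer; IsFallback reports whether the runtime fell
        // back to a global-memory allocation instead of native shared memory.
        char *Buf = (char *)omp_get_dyn_groupprivate_ptr(0, &IsFallback,
                                                         omp_access_cgroup);
        if (!Buf || omp_get_dyn_groupprivate_size(omp_access_cgroup) != 1024)
          ++Failed;
      }
      return Failed;
    }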
---
offload/DeviceRTL/include/DeviceTypes.h | 4 +
offload/DeviceRTL/include/Interface.h | 2 +-
offload/DeviceRTL/include/State.h | 2 +-
offload/DeviceRTL/src/Kernel.cpp | 14 +-
offload/DeviceRTL/src/State.cpp | 48 +++++-
offload/include/Shared/APITypes.h | 6 +-
offload/include/Shared/Environment.h | 4 +-
offload/include/device.h | 3 +
offload/include/omptarget.h | 7 +-
offload/libomptarget/OpenMP/API.cpp | 14 ++
offload/libomptarget/device.cpp | 6 +
offload/libomptarget/exports | 1 +
.../amdgpu/dynamic_hsa/hsa_ext_amd.h | 1 +
offload/plugins-nextgen/amdgpu/src/rtl.cpp | 34 +++--
.../common/include/PluginInterface.h | 33 +++-
.../common/src/PluginInterface.cpp | 86 ++++++++---
.../plugins-nextgen/cuda/dynamic_cuda/cuda.h | 1 +
offload/plugins-nextgen/cuda/src/rtl.cpp | 37 +++--
offload/plugins-nextgen/host/src/rtl.cpp | 4 +-
.../offloading/dyn_groupprivate_strict.cpp | 141 ++++++++++++++++++
openmp/runtime/src/include/omp.h.var | 10 ++
openmp/runtime/src/kmp_csupport.cpp | 9 ++
openmp/runtime/src/kmp_stub.cpp | 16 ++
23 files changed, 418 insertions(+), 65 deletions(-)
create mode 100644 offload/test/offloading/dyn_groupprivate_strict.cpp
diff --git a/offload/DeviceRTL/include/DeviceTypes.h b/offload/DeviceRTL/include/DeviceTypes.h
index 2e5d92380f040..a43b506d6879e 100644
--- a/offload/DeviceRTL/include/DeviceTypes.h
+++ b/offload/DeviceRTL/include/DeviceTypes.h
@@ -163,4 +163,8 @@ typedef enum omp_allocator_handle_t {
///}
+enum omp_access_t {
+ omp_access_cgroup = 0,
+};
+
#endif
diff --git a/offload/DeviceRTL/include/Interface.h b/offload/DeviceRTL/include/Interface.h
index c4bfaaa2404b4..672afea206785 100644
--- a/offload/DeviceRTL/include/Interface.h
+++ b/offload/DeviceRTL/include/Interface.h
@@ -222,7 +222,7 @@ struct KernelEnvironmentTy;
int8_t __kmpc_is_spmd_exec_mode();
int32_t __kmpc_target_init(KernelEnvironmentTy &KernelEnvironment,
- KernelLaunchEnvironmentTy &KernelLaunchEnvironment);
+ KernelLaunchEnvironmentTy *KernelLaunchEnvironment);
void __kmpc_target_deinit();
diff --git a/offload/DeviceRTL/include/State.h b/offload/DeviceRTL/include/State.h
index db396dae6e445..17c3c6f2d3e42 100644
--- a/offload/DeviceRTL/include/State.h
+++ b/offload/DeviceRTL/include/State.h
@@ -116,7 +116,7 @@ extern Local<ThreadStateTy **> ThreadStates;
/// Initialize the state machinery. Must be called by all threads.
void init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
- KernelLaunchEnvironmentTy &KernelLaunchEnvironment);
+ KernelLaunchEnvironmentTy *KernelLaunchEnvironment);
/// Return the kernel and kernel launch environment associated with the current
/// kernel. The former is static and contains compile time information that
diff --git a/offload/DeviceRTL/src/Kernel.cpp b/offload/DeviceRTL/src/Kernel.cpp
index 467e44a65276c..58e9a09105a76 100644
--- a/offload/DeviceRTL/src/Kernel.cpp
+++ b/offload/DeviceRTL/src/Kernel.cpp
@@ -34,8 +34,8 @@ enum OMPTgtExecModeFlags : unsigned char {
};
static void
-inititializeRuntime(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
- KernelLaunchEnvironmentTy &KernelLaunchEnvironment) {
+initializeRuntime(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
+ KernelLaunchEnvironmentTy *KernelLaunchEnvironment) {
// Order is important here.
synchronize::init(IsSPMD);
mapping::init(IsSPMD);
@@ -80,17 +80,17 @@ extern "C" {
/// \param Ident Source location identification, can be NULL.
///
int32_t __kmpc_target_init(KernelEnvironmentTy &KernelEnvironment,
- KernelLaunchEnvironmentTy &KernelLaunchEnvironment) {
+ KernelLaunchEnvironmentTy *KernelLaunchEnvironment) {
ConfigurationEnvironmentTy &Configuration = KernelEnvironment.Configuration;
bool IsSPMD = Configuration.ExecMode & OMP_TGT_EXEC_MODE_SPMD;
bool UseGenericStateMachine = Configuration.UseGenericStateMachine;
if (IsSPMD) {
- inititializeRuntime(/*IsSPMD=*/true, KernelEnvironment,
- KernelLaunchEnvironment);
+ initializeRuntime(/*IsSPMD=*/true, KernelEnvironment,
+ KernelLaunchEnvironment);
synchronize::threadsAligned(atomic::relaxed);
} else {
- inititializeRuntime(/*IsSPMD=*/false, KernelEnvironment,
- KernelLaunchEnvironment);
+ initializeRuntime(/*IsSPMD=*/false, KernelEnvironment,
+ KernelLaunchEnvironment);
// No need to wait since only the main threads will execute user
// code and workers will run into a barrier right away.
}
diff --git a/offload/DeviceRTL/src/State.cpp b/offload/DeviceRTL/src/State.cpp
index 62b03e7bba720..9e2a9999167b4 100644
--- a/offload/DeviceRTL/src/State.cpp
+++ b/offload/DeviceRTL/src/State.cpp
@@ -158,6 +158,34 @@ void SharedMemorySmartStackTy::pop(void *Ptr, uint64_t Bytes) {
memory::freeGlobal(Ptr, "Slow path shared memory deallocation");
}
+struct DynCGroupMemTy {
+ void init(KernelLaunchEnvironmentTy *KLE, void *NativeDynCGroup) {
+ Size = 0;
+ Ptr = nullptr;
+ IsFallback = false;
+ if (KLE) {
+ Size = KLE->DynCGroupMemSize;
+ if (void *Fallback = KLE->DynCGroupMemFallback) {
+ Ptr = static_cast<char *>(Fallback) + Size * omp_get_team_num();
+ IsFallback = true;
+ } else {
+ Ptr = static_cast<char *>(NativeDynCGroup);
+ }
+ }
+ }
+
+ char *getPtr(size_t Offset) const { return Ptr + Offset; }
+ bool isFallback() const { return IsFallback; }
+ size_t getSize() const { return Size; }
+
+private:
+ char *Ptr;
+ size_t Size;
+ bool IsFallback;
+};
+
+[[clang::loader_uninitialized]] static Local<DynCGroupMemTy> DynCGroupMem;
+
} // namespace
void *memory::getDynamicBuffer() { return DynamicSharedBuffer; }
@@ -246,13 +274,18 @@ int returnValIfLevelIsActive(int Level, int Val, int DefaultVal,
} // namespace
void state::init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
- KernelLaunchEnvironmentTy &KernelLaunchEnvironment) {
+ KernelLaunchEnvironmentTy *KLE) {
SharedMemorySmartStack.init(IsSPMD);
+
+ if (KLE == reinterpret_cast<KernelLaunchEnvironmentTy *>(~0))
+ KLE = nullptr;
+
if (mapping::isInitialThreadInLevel0(IsSPMD)) {
+ DynCGroupMem.init(KLE, DynamicSharedBuffer);
TeamState.init(IsSPMD);
ThreadStates = nullptr;
KernelEnvironmentPtr = &KernelEnvironment;
- KernelLaunchEnvironmentPtr = &KernelLaunchEnvironment;
+ KernelLaunchEnvironmentPtr = KLE;
}
}
@@ -430,6 +463,17 @@ int omp_get_team_num() { return mapping::getBlockIdInKernel(); }
int omp_get_initial_device(void) { return -1; }
int omp_is_initial_device(void) { return 0; }
+
+void *omp_get_dyn_groupprivate_ptr(size_t Offset, int *IsFallback,
+ omp_access_t) {
+ if (IsFallback != NULL)
+ *IsFallback = DynCGroupMem.isFallback();
+ return DynCGroupMem.getPtr(Offset);
+}
+
+size_t omp_get_dyn_groupprivate_size(omp_access_t) {
+ return DynCGroupMem.getSize();
+}
}
extern "C" {
diff --git a/offload/include/Shared/APITypes.h b/offload/include/Shared/APITypes.h
index 978b53d5d69b9..0ef2dd162292b 100644
--- a/offload/include/Shared/APITypes.h
+++ b/offload/include/Shared/APITypes.h
@@ -97,8 +97,10 @@ struct KernelArgsTy {
struct {
uint64_t NoWait : 1; // Was this kernel spawned with a `nowait` clause.
uint64_t IsCUDA : 1; // Was this kernel spawned via CUDA.
- uint64_t Unused : 62;
- } Flags = {0, 0, 0};
+ uint64_t AllowDynCGroupMemFallback : 1; // Allow fallback for dynamic
+ // cgroup memory allocation.
+ uint64_t Unused : 61;
+ } Flags = {0, 0, 0, 0};
// The number of teams (for x,y,z dimension).
uint32_t NumTeams[3] = {0, 0, 0};
// The number of threads (for x,y,z dimension).
diff --git a/offload/include/Shared/Environment.h b/offload/include/Shared/Environment.h
index 2a283bd6fa4ed..0670ac1090da4 100644
--- a/offload/include/Shared/Environment.h
+++ b/offload/include/Shared/Environment.h
@@ -93,9 +93,11 @@ struct KernelEnvironmentTy {
};
struct KernelLaunchEnvironmentTy {
+ void *ReductionBuffer = nullptr;
+ void *DynCGroupMemFallback = nullptr;
uint32_t ReductionCnt = 0;
uint32_t ReductionIterCnt = 0;
- void *ReductionBuffer = nullptr;
+ uint32_t DynCGroupMemSize = 0;
};
#endif // OMPTARGET_SHARED_ENVIRONMENT_H
diff --git a/offload/include/device.h b/offload/include/device.h
index f4b10abbaa3fd..0e93cf8ec1a8b 100644
--- a/offload/include/device.h
+++ b/offload/include/device.h
@@ -158,6 +158,9 @@ struct DeviceTy {
/// Indicate that there are pending images for this device or not.
void setHasPendingImages(bool V) { HasPendingImages = V; }
+ /// Get the maximum shared memory per team for any kernel.
+ uint64_t getMaxSharedTeamMemory();
+
private:
/// Deinitialize the device (and plugin).
void deinit();
diff --git a/offload/include/omptarget.h b/offload/include/omptarget.h
index 6971780c7bdb5..45bb74ec367d6 100644
--- a/offload/include/omptarget.h
+++ b/offload/include/omptarget.h
@@ -107,7 +107,7 @@ enum TargetAllocTy : int32_t {
inline KernelArgsTy CTorDTorKernelArgs = {1, 0, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr,
- 0, {0,0,0}, {1, 0, 0}, {1, 0, 0}, 0};
+ 0, {0,0,0,0}, {1, 0, 0}, {1, 0, 0}, 0};
struct DeviceTy;
@@ -273,10 +273,15 @@ struct __tgt_target_non_contig {
extern "C" {
#endif
+typedef enum {
+ omp_access_cgroup = 0,
+} omp_access_t;
+
void ompx_dump_mapping_tables(void);
int omp_get_num_devices(void);
int omp_get_device_num(void);
int omp_get_initial_device(void);
+size_t omp_get_groupprivate_limit(int device_num, omp_access_t access_group = omp_access_cgroup);
void *omp_target_alloc(size_t Size, int DeviceNum);
void omp_target_free(void *DevicePtr, int DeviceNum);
int omp_target_is_present(const void *Ptr, int DeviceNum);
diff --git a/offload/libomptarget/OpenMP/API.cpp b/offload/libomptarget/OpenMP/API.cpp
index 4576f9bd06121..1ed4192157fc8 100644
--- a/offload/libomptarget/OpenMP/API.cpp
+++ b/offload/libomptarget/OpenMP/API.cpp
@@ -98,6 +98,20 @@ EXTERN int omp_get_initial_device(void) {
return HostDevice;
}
+EXTERN size_t omp_get_groupprivate_limit(int DeviceNum,
+ omp_access_t AccessGroup) {
+ TIMESCOPE();
+ OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
+ if (DeviceNum == omp_get_initial_device())
+ return 0;
+
+ auto DeviceOrErr = PM->getDevice(DeviceNum);
+ if (!DeviceOrErr)
+ FATAL_MESSAGE(DeviceNum, "%s", toString(DeviceOrErr.takeError()).c_str());
+
+ return DeviceOrErr->getMaxSharedTeamMemory();
+}
+
EXTERN void *omp_target_alloc(size_t Size, int DeviceNum) {
TIMESCOPE_WITH_DETAILS("dst_dev=" + std::to_string(DeviceNum) +
";size=" + std::to_string(Size));
diff --git a/offload/libomptarget/device.cpp b/offload/libomptarget/device.cpp
index f88e30ae9e76b..31bfc7d092424 100644
--- a/offload/libomptarget/device.cpp
+++ b/offload/libomptarget/device.cpp
@@ -281,3 +281,9 @@ bool DeviceTy::useAutoZeroCopy() {
return false;
return RTL->use_auto_zero_copy(RTLDeviceID);
}
+
+uint64_t DeviceTy::getMaxSharedTeamMemory() {
+ using DeviceQueryKind = llvm::omp::target::plugin::DeviceQueryKind;
+ return RTL->query_device_info(
+ RTLDeviceID, DeviceQueryKind::DEVICE_QUERY_MAX_SHARED_TEAM_MEM);
+}
diff --git a/offload/libomptarget/exports b/offload/libomptarget/exports
index 2406776c1fb5f..b5a1401564d58 100644
--- a/offload/libomptarget/exports
+++ b/offload/libomptarget/exports
@@ -40,6 +40,7 @@ VERS1.0 {
omp_get_num_devices;
omp_get_device_num;
omp_get_initial_device;
+ omp_get_groupprivate_limit;
omp_target_alloc;
omp_target_free;
omp_target_is_present;
diff --git a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h
index 3117763e35896..2cf156e576c5f 100644
--- a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h
+++ b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h
@@ -52,6 +52,7 @@ typedef enum {
HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE = 6,
HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT = 7,
HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL = 15,
+ HSA_AMD_MEMORY_POOL_INFO_ALLOC_MAX_SIZE = 16,
} hsa_amd_memory_pool_info_t;
typedef enum {
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index 12c7cc62905c9..fa373c2029f0c 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -273,7 +273,6 @@ struct AMDGPUMemoryPoolTy {
if (auto Err = getAttr(HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, GlobalFlags))
return Err;
-
return Plugin::success();
}
@@ -543,6 +542,8 @@ struct AMDGPUKernelTy : public GenericKernelTy {
return Err;
}
+ StaticBlockMemSize = GroupSize;
+
// Make sure it is a kernel symbol.
if (SymbolType != HSA_SYMBOL_KIND_KERNEL)
return Plugin::error(ErrorCode::INVALID_BINARY,
@@ -566,8 +567,8 @@ struct AMDGPUKernelTy : public GenericKernelTy {
/// Launch the AMDGPU kernel function.
Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads[3],
- uint32_t NumBlocks[3], KernelArgsTy &KernelArgs,
- KernelLaunchParamsTy LaunchParams,
+ uint32_t NumBlocks[3], uint32_t DynBlockMemSize,
+ KernelArgsTy &KernelArgs, KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
/// Print more elaborate kernel launch info for AMDGPU
@@ -2020,6 +2021,20 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
if (auto Err = checkIfAPU())
return Err;
+ // Retrieve the size of the group memory pool.
+ for (const auto *Pool : AllMemoryPools) {
+ if (Pool->isGroup()) {
+ size_t Size = 0;
+ if (auto Err = Pool->getAttr(HSA_AMD_MEMORY_POOL_INFO_SIZE, Size))
+ return Err;
+ MaxBlockSharedMemSize = Size;
+ break;
+ }
+ }
+
+ // Supports block shared memory natively.
+ HasNativeBlockSharedMem = true;
+
return Plugin::success();
}
@@ -2856,7 +2871,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
KernelArgsTy KernelArgs = {};
uint32_t NumBlocksAndThreads[3] = {1u, 1u, 1u};
if (auto Err = AMDGPUKernel.launchImpl(
- *this, NumBlocksAndThreads, NumBlocksAndThreads, KernelArgs,
+ *this, NumBlocksAndThreads, NumBlocksAndThreads, 0, KernelArgs,
KernelLaunchParamsTy{}, AsyncInfoWrapper))
return Err;
@@ -3357,6 +3372,7 @@ struct AMDGPUPluginTy final : public GenericPluginTy {
Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
uint32_t NumThreads[3], uint32_t NumBlocks[3],
+ uint32_t DynBlockMemSize,
KernelArgsTy &KernelArgs,
KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const {
@@ -3374,13 +3390,6 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
if (auto Err = ArgsMemoryManager.allocate(ArgsSize, &AllArgs))
return Err;
- // Account for user requested dynamic shared memory.
- uint32_t GroupSize = getGroupSize();
- if (uint32_t MaxDynCGroupMem = std::max(
- KernelArgs.DynCGroupMem, GenericDevice.getDynamicMemorySize())) {
- GroupSize += MaxDynCGroupMem;
- }
-
uint64_t StackSize;
if (auto Err = GenericDevice.getDeviceStackSize(StackSize))
return Err;
@@ -3434,7 +3443,8 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
// Push the kernel launch into the stream.
return Stream->pushKernelLaunch(*this, AllArgs, NumThreads, NumBlocks,
- GroupSize, StackSize, ArgsMemoryManager);
+ getStaticBlockMemSize() + DynBlockMemSize,
+ StackSize, ArgsMemoryManager);
}
Error AMDGPUKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index 162b149ab483e..3357ccfe0c9b5 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -226,6 +226,10 @@ struct InfoTreeNode {
}
};
+enum class DeviceQueryKind {
+ DEVICE_QUERY_MAX_SHARED_TEAM_MEM = 0,
+};
+
/// Class wrapping a __tgt_device_image and its offload entry table on a
/// specific device. This class is responsible for storing and managing
/// the offload entries for an image on a device.
@@ -312,13 +316,16 @@ struct GenericKernelTy {
AsyncInfoWrapperTy &AsyncInfoWrapper) const;
virtual Error launchImpl(GenericDeviceTy &GenericDevice,
uint32_t NumThreads[3], uint32_t NumBlocks[3],
- KernelArgsTy &KernelArgs,
+ uint32_t DynBlockMemSize, KernelArgsTy &KernelArgs,
KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const = 0;
/// Get the kernel name.
const char *getName() const { return Name.c_str(); }
+ /// Get the size of the static per-block memory consumed by the kernel.
+ uint32_t getStaticBlockMemSize() const { return StaticBlockMemSize; };
+
/// Get the kernel image.
DeviceImageTy &getImage() const {
assert(ImagePtr && "Kernel is not initialized!");
@@ -331,9 +338,9 @@ struct GenericKernelTy {
}
/// Return a device pointer to a new kernel launch environment.
- Expected<KernelLaunchEnvironmentTy *>
- getKernelLaunchEnvironment(GenericDeviceTy &GenericDevice, uint32_t Version,
- AsyncInfoWrapperTy &AsyncInfo) const;
+ Expected<KernelLaunchEnvironmentTy *> getKernelLaunchEnvironment(
+ GenericDeviceTy &GenericDevice, const KernelArgsTy &KernelArgs,
+ void *FallbackBlockMem, AsyncInfoWrapperTy &AsyncInfo) const;
/// Indicate whether an execution mode is valid.
static bool isValidExecutionMode(OMPTgtExecModeFlags ExecutionMode) {
@@ -425,6 +432,9 @@ struct GenericKernelTy {
/// The maximum number of threads which the kernel could leverage.
uint32_t MaxNumThreads;
+ /// The static memory size per block.
+ uint32_t StaticBlockMemSize = 0;
+
/// The kernel environment, including execution flags.
KernelEnvironmentTy KernelEnvironment;
@@ -731,6 +741,12 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
/// this id is not unique between different plugins; they may overlap.
int32_t getDeviceId() const { return DeviceId; }
+ /// Get the total shared memory per block that can be used in any kernel.
+ uint32_t getMaxBlockSharedMemSize() const { return MaxBlockSharedMemSize; }
+
+ /// Indicate whether the device has native block shared memory.
+ bool hasNativeBlockSharedMem() const { return HasNativeBlockSharedMem; }
+
/// Set the context of the device if needed, before calling device-specific
/// functions. Plugins may implement this function as a no-op if not needed.
virtual Error setContext() = 0;
@@ -1132,6 +1148,12 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
std::atomic<bool> OmptInitialized;
#endif
+ /// The total per-block shared memory that a kernel may use.
+ uint32_t MaxBlockSharedMemSize = 0;
+
+ /// Whether the device has native block shared memory.
+ bool HasNativeBlockSharedMem = false;
+
private:
DeviceMemoryPoolTy DeviceMemoryPool = {nullptr, 0};
DeviceMemoryPoolTrackingTy DeviceMemoryPoolTracking = {0, 0, ~0U, 0};
@@ -1347,6 +1369,9 @@ struct GenericPluginTy {
/// Prints information about the given devices supported by the plugin.
void print_device_info(int32_t DeviceId);
+ /// Retrieve information about the given device.
+ int64_t query_device_info(int32_t DeviceId, DeviceQueryKind Query);
+
/// Creates an event in the given plugin if supported.
int32_t create_event(int32_t DeviceId, void **EventPtr);
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index 81b9d423e13d8..2997585e1660f 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -477,20 +477,20 @@ Error GenericKernelTy::init(GenericDeviceTy &GenericDevice,
Expected<KernelLaunchEnvironmentTy *>
GenericKernelTy::getKernelLaunchEnvironment(
- GenericDeviceTy &GenericDevice, uint32_t Version,
- AsyncInfoWrapperTy &AsyncInfoWrapper) const {
+ GenericDeviceTy &GenericDevice, const KernelArgsTy &KernelArgs,
+ void *FallbackBlockMem, AsyncInfoWrapperTy &AsyncInfoWrapper) const {
// Ctor/Dtor have no arguments, replaying uses the original kernel launch
// environment. Older versions of the compiler do not generate a kernel
// launch environment.
if (GenericDevice.Plugin.getRecordReplay().isReplaying() ||
- Version < OMP_KERNEL_ARG_MIN_VERSION_WITH_DYN_PTR)
+ KernelArgs.Version < OMP_KERNEL_ARG_MIN_VERSION_WITH_DYN_PTR)
return nullptr;
- if (!KernelEnvironment.Configuration.ReductionDataSize ||
- !KernelEnvironment.Configuration.ReductionBufferLength)
+ if ((!KernelEnvironment.Configuration.ReductionDataSize ||
+ !KernelEnvironment.Configuration.ReductionBufferLength) &&
+ KernelArgs.DynCGroupMem == 0)
return reinterpret_cast<KernelLaunchEnvironmentTy *>(~0);
- // TODO: Check if the kernel needs a launch environment.
auto AllocOrErr = GenericDevice.dataAlloc(sizeof(KernelLaunchEnvironmentTy),
/*HostPtr=*/nullptr,
TargetAllocTy::TARGET_ALLOC_DEVICE);
@@ -504,7 +504,9 @@ GenericKernelTy::getKernelLaunchEnvironment(
/// async data transfer.
auto &LocalKLE = (*AsyncInfoWrapper).KernelLaunchEnvironment;
LocalKLE = KernelLaunchEnvironment;
- {
+
+ if (KernelEnvironment.Configuration.ReductionDataSize &&
+ KernelEnvironment.Configuration.ReductionBufferLength) {
auto AllocOrErr = GenericDevice.dataAlloc(
KernelEnvironment.Configuration.ReductionDataSize *
KernelEnvironment.Configuration.ReductionBufferLength,
@@ -514,8 +516,13 @@ GenericKernelTy::getKernelLaunchEnvironment(
LocalKLE.ReductionBuffer = *AllocOrErr;
// Remember to free the memory later.
AsyncInfoWrapper.freeAllocationAfterSynchronization(*AllocOrErr);
+ } else {
+ LocalKLE.ReductionBuffer = nullptr;
}
+ LocalKLE.DynCGroupMemSize = KernelArgs.DynCGroupMem;
+ LocalKLE.DynCGroupMemFallback = FallbackBlockMem;
+
INFO(OMP_INFOTYPE_DATA_TRANSFER, GenericDevice.getDeviceId(),
"Copying data from host to device, HstPtr=" DPxMOD ", TgtPtr=" DPxMOD
", Size=%" PRId64 ", Name=KernelLaunchEnv\n",
@@ -556,8 +563,45 @@ Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs,
llvm::SmallVector<void *, 16> Args;
llvm::SmallVector<void *, 16> Ptrs;
+ uint32_t NumThreads[3] = {KernelArgs.ThreadLimit[0],
+ KernelArgs.ThreadLimit[1],
+ KernelArgs.ThreadLimit[2]};
+ uint32_t NumBlocks[3] = {KernelArgs.NumTeams[0], KernelArgs.NumTeams[1],
+ KernelArgs.NumTeams[2]};
+ if (!isBareMode()) {
+ NumThreads[0] = getNumThreads(GenericDevice, NumThreads);
+ NumBlocks[0] = getNumBlocks(GenericDevice, NumBlocks, KernelArgs.Tripcount,
+ NumThreads[0], KernelArgs.ThreadLimit[0] > 0);
+ }
+
+ uint32_t MaxBlockMemSize = GenericDevice.getMaxBlockSharedMemSize();
+ uint32_t DynBlockMemSize = KernelArgs.DynCGroupMem;
+ uint32_t TotalBlockMemSize = StaticBlockMemSize + DynBlockMemSize;
+ if (StaticBlockMemSize > MaxBlockMemSize)
+ return Plugin::error(ErrorCode::INVALID_ARGUMENT,
+ "Static block memory size exceeds maximum");
+ else if (!KernelArgs.Flags.AllowDynCGroupMemFallback &&
+ TotalBlockMemSize > MaxBlockMemSize)
+ return Plugin::error(
+ ErrorCode::INVALID_ARGUMENT,
+ "Static and dynamic block memory size exceeds maximum");
+
+ void *FallbackBlockMem = nullptr;
+ if (DynBlockMemSize && (!GenericDevice.hasNativeBlockSharedMem() ||
+ TotalBlockMemSize > MaxBlockMemSize)) {
+ auto AllocOrErr = GenericDevice.dataAlloc(
+ NumBlocks[0] * DynBlockMemSize,
+ /*HostPtr=*/nullptr, TargetAllocTy::TARGET_ALLOC_DEVICE);
+ if (!AllocOrErr)
+ return AllocOrErr.takeError();
+
+ FallbackBlockMem = *AllocOrErr;
+ AsyncInfoWrapper.freeAllocationAfterSynchronization(FallbackBlockMem);
+ DynBlockMemSize = 0;
+ }
+
auto KernelLaunchEnvOrErr = getKernelLaunchEnvironment(
- GenericDevice, KernelArgs.Version, AsyncInfoWrapper);
+ GenericDevice, KernelArgs, FallbackBlockMem, AsyncInfoWrapper);
if (!KernelLaunchEnvOrErr)
return KernelLaunchEnvOrErr.takeError();
@@ -573,17 +617,6 @@ Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs,
Args, Ptrs, *KernelLaunchEnvOrErr);
}
- uint32_t NumThreads[3] = {KernelArgs.ThreadLimit[0],
- KernelArgs.ThreadLimit[1],
- KernelArgs.ThreadLimit[2]};
- uint32_t NumBlocks[3] = {KernelArgs.NumTeams[0], KernelArgs.NumTeams[1],
- KernelArgs.NumTeams[2]};
- if (!isBareMode()) {
- NumThreads[0] = getNumThreads(GenericDevice, NumThreads);
- NumBlocks[0] = getNumBlocks(GenericDevice, NumBlocks, KernelArgs.Tripcount,
- NumThreads[0], KernelArgs.ThreadLimit[0] > 0);
- }
-
// Record the kernel description after we modified the argument count and num
// blocks/threads.
RecordReplayTy &RecordReplay = GenericDevice.Plugin.getRecordReplay();
@@ -599,8 +632,8 @@ Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs,
printLaunchInfo(GenericDevice, KernelArgs, NumThreads, NumBlocks))
return Err;
- return launchImpl(GenericDevice, NumThreads, NumBlocks, KernelArgs,
- LaunchParams, AsyncInfoWrapper);
+ return launchImpl(GenericDevice, NumThreads, NumBlocks, DynBlockMemSize,
+ KernelArgs, LaunchParams, AsyncInfoWrapper);
}
KernelLaunchParamsTy GenericKernelTy::prepareArgs(
@@ -2077,6 +2110,17 @@ void GenericPluginTy::print_device_info(int32_t DeviceId) {
toString(std::move(Err)).data());
}
+int64_t GenericPluginTy::query_device_info(int32_t DeviceId,
+ DeviceQueryKind Query) {
+ const GenericDeviceTy &Device = getDevice(DeviceId);
+
+ switch (Query) {
+ case DeviceQueryKind::DEVICE_QUERY_MAX_SHARED_TEAM_MEM:
+ return Device.getMaxBlockSharedMemSize();
+ }
+ return 0;
+}
+
int32_t GenericPluginTy::create_event(int32_t DeviceId, void **EventPtr) {
auto Err = getDevice(DeviceId).createEvent(EventPtr);
if (Err) {
diff --git a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
index b6c022c8e7e8b..b6e087edea876 100644
--- a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
+++ b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
@@ -258,6 +258,7 @@ typedef enum CUdevice_attribute_enum {
typedef enum CUfunction_attribute_enum {
CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0,
+ CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES = 1,
CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES = 8,
} CUfunction_attribute;
diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp
index 15193de6ae430..eda7a85f750f0 100644
--- a/offload/plugins-nextgen/cuda/src/rtl.cpp
+++ b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -148,13 +148,21 @@ struct CUDAKernelTy : public GenericKernelTy {
// The maximum number of threads cannot exceed the maximum of the kernel.
MaxNumThreads = std::min(MaxNumThreads, (uint32_t)MaxThreads);
+ int SharedMemSize;
+ Res = cuFuncGetAttribute(&SharedMemSize,
+ CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, Func);
+ if (auto Err = Plugin::check(Res, "Error in cuFuncGetAttribute: %s"))
+ return Err;
+
+ StaticBlockMemSize = SharedMemSize;
+
return Plugin::success();
}
/// Launch the CUDA kernel function.
Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads[3],
- uint32_t NumBlocks[3], KernelArgsTy &KernelArgs,
- KernelLaunchParamsTy LaunchParams,
+ uint32_t NumBlocks[3], uint32_t DynBlockMemSize,
+ KernelArgsTy &KernelArgs, KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
private:
@@ -162,7 +170,7 @@ struct CUDAKernelTy : public GenericKernelTy {
CUfunction Func;
/// The maximum amount of dynamic shared memory per thread group. By default,
/// this is set to 48 KB.
- mutable uint32_t MaxDynCGroupMemLimit = 49152;
+ mutable uint32_t MaxDynBlockMemSize = 49152;
};
/// Class wrapping a CUDA stream reference. These are the objects handled by the
@@ -358,6 +366,15 @@ struct CUDADeviceTy : public GenericDeviceTy {
return Err;
HardwareParallelism = NumMuliprocessors * (MaxThreadsPerSM / WarpSize);
+ uint32_t MaxSharedMem;
+ if (auto Err = getDeviceAttr(
+ CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, MaxSharedMem))
+ return Err;
+ MaxBlockSharedMemSize = MaxSharedMem;
+
+ // Supports block shared memory natively.
+ HasNativeBlockSharedMem = true;
+
return Plugin::success();
}
@@ -1239,7 +1256,7 @@ struct CUDADeviceTy : public GenericDeviceTy {
KernelArgsTy KernelArgs = {};
uint32_t NumBlocksAndThreads[3] = {1u, 1u, 1u};
if (auto Err = CUDAKernel.launchImpl(
- *this, NumBlocksAndThreads, NumBlocksAndThreads, KernelArgs,
+ *this, NumBlocksAndThreads, NumBlocksAndThreads, 0, KernelArgs,
KernelLaunchParamsTy{}, AsyncInfoWrapper))
return Err;
@@ -1285,6 +1302,7 @@ struct CUDADeviceTy : public GenericDeviceTy {
Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
uint32_t NumThreads[3], uint32_t NumBlocks[3],
+ uint32_t DynBlockMemSize,
KernelArgsTy &KernelArgs,
KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const {
@@ -1294,9 +1312,6 @@ Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
if (auto Err = CUDADevice.getStream(AsyncInfoWrapper, Stream))
return Err;
- uint32_t MaxDynCGroupMem =
- std::max(KernelArgs.DynCGroupMem, GenericDevice.getDynamicMemorySize());
-
void *Config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, LaunchParams.Data,
CU_LAUNCH_PARAM_BUFFER_SIZE,
reinterpret_cast<void *>(&LaunchParams.Size),
@@ -1308,18 +1323,18 @@ Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
GenericDevice.Plugin.getRPCServer().Thread->notify();
// In case we require more memory than the current limit.
- if (MaxDynCGroupMem >= MaxDynCGroupMemLimit) {
+ if (DynBlockMemSize >= MaxDynBlockMemSize) {
CUresult AttrResult = cuFuncSetAttribute(
- Func, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, MaxDynCGroupMem);
+ Func, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, DynBlockMemSize);
Plugin::check(
AttrResult,
"Error in cuLaunchKernel while setting the memory limits: %s");
- MaxDynCGroupMemLimit = MaxDynCGroupMem;
+ MaxDynBlockMemSize = DynBlockMemSize;
}
CUresult Res = cuLaunchKernel(Func, NumBlocks[0], NumBlocks[1], NumBlocks[2],
NumThreads[0], NumThreads[1], NumThreads[2],
- MaxDynCGroupMem, Stream, nullptr, Config);
+ DynBlockMemSize, Stream, nullptr, Config);
// Register a callback to indicate when the kernel is complete.
if (GenericDevice.getRPCServer())
diff --git a/offload/plugins-nextgen/host/src/rtl.cpp b/offload/plugins-nextgen/host/src/rtl.cpp
index d950572265b4c..dc82a2ef16e51 100644
--- a/offload/plugins-nextgen/host/src/rtl.cpp
+++ b/offload/plugins-nextgen/host/src/rtl.cpp
@@ -92,8 +92,8 @@ struct GenELF64KernelTy : public GenericKernelTy {
/// Launch the kernel using the libffi.
Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads[3],
- uint32_t NumBlocks[3], KernelArgsTy &KernelArgs,
- KernelLaunchParamsTy LaunchParams,
+ uint32_t NumBlocks[3], uint32_t DynBlockMemSize,
+ KernelArgsTy &KernelArgs, KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const override {
// Create a vector of ffi_types, one per argument.
SmallVector<ffi_type *, 16> ArgTypes(KernelArgs.NumArgs, &ffi_type_pointer);
diff --git a/offload/test/offloading/dyn_groupprivate_strict.cpp b/offload/test/offloading/dyn_groupprivate_strict.cpp
new file mode 100644
index 0000000000000..a35f8dd2b0595
--- /dev/null
+++ b/offload/test/offloading/dyn_groupprivate_strict.cpp
@@ -0,0 +1,141 @@
+// RUN: %libomptarget-compilexx-run-and-check-generic
+// REQUIRES: gpu
+
+#include <omp.h>
+#include <stdio.h>
+
+#define N 512
+
+int main() {
+ int Result[N], NumThreads;
+
+#pragma omp target teams num_teams(1) thread_limit(N) \
+ dyn_groupprivate(strict : N * sizeof(Result[0])) \
+ map(from : Result, NumThreads)
+ {
+ int Buffer[N];
+#pragma omp parallel
+ {
+ int *DynBuffer = (int *)omp_get_dyn_groupprivate_ptr();
+ int TId = omp_get_thread_num();
+ if (TId == 0)
+ NumThreads = omp_get_num_threads();
+ Buffer[TId] = 7;
+ DynBuffer[TId] = 3;
+#pragma omp barrier
+ int WrappedTId = (TId + 37) % NumThreads;
+ Result[TId] = Buffer[WrappedTId] + DynBuffer[WrappedTId];
+ }
+ }
+
+ if (NumThreads < N / 2 || NumThreads > N) {
+ printf("Expected number of threads to be in [%i:%i], but got: %i", N / 2, N,
+ NumThreads);
+ return -1;
+ }
+
+ int Failed = 0;
+ for (int i = 0; i < NumThreads; ++i) {
+ if (Result[i] != 7 + 3) {
+ printf("Result[%i] is %i, expected %i\n", i, Result[i], 7 + 3);
+ ++Failed;
+ }
+ }
+
+ // Verify that the routines return NULL and zero on the host.
+ if (omp_get_dyn_groupprivate_ptr())
+ ++Failed;
+ if (omp_get_dyn_groupprivate_size())
+ ++Failed;
+
+ size_t MaxSize = omp_get_groupprivate_limit(0, omp_access_cgroup);
+ size_t ExceededSize = MaxSize + 10;
+
+// Verify that the fallback modifier works.
+#pragma omp target dyn_groupprivate(fallback : ExceededSize) \
+ map(tofrom : Failed)
+ {
+ int IsFallback;
+ if (!omp_get_dyn_groupprivate_ptr(0, &IsFallback))
+ ++Failed;
+ if (!omp_get_dyn_groupprivate_size())
+ ++Failed;
+ if (omp_get_dyn_groupprivate_size() != ExceededSize)
+ ++Failed;
+ if (!IsFallback)
+ ++Failed;
+ }
+
+// Verify that the default modifier is fallback.
+#pragma omp target dyn_groupprivate(ExceededSize)
+ {
+ }
+
+// Verify that the strict modifier works.
+#pragma omp target dyn_groupprivate(strict : N) map(tofrom : Failed)
+ {
+ int IsFallback;
+ if (!omp_get_dyn_groupprivate_ptr(0, &IsFallback))
+ ++Failed;
+ if (!omp_get_dyn_groupprivate_size())
+ ++Failed;
+ if (omp_get_dyn_groupprivate_size() != N)
+ ++Failed;
+ if (IsFallback)
+ ++Failed;
+ }
+
+// Verify that the fallback does not trigger when not needed.
+#pragma omp target dyn_groupprivate(fallback : N) map(tofrom : Failed)
+ {
+ int IsFallback;
+ if (!omp_get_dyn_groupprivate_ptr(0, &IsFallback))
+ ++Failed;
+ if (!omp_get_dyn_groupprivate_size())
+ ++Failed;
+ if (omp_get_dyn_groupprivate_size() != N)
+ ++Failed;
+ if (IsFallback)
+ ++Failed;
+ }
+
+// Verify that the clause works when passing a zero size.
+#pragma omp target dyn_groupprivate(strict : 0) map(tofrom : Failed)
+ {
+ int IsFallback;
+ if (omp_get_dyn_groupprivate_ptr(0, &IsFallback))
+ ++Failed;
+ if (omp_get_dyn_groupprivate_size())
+ ++Failed;
+ if (IsFallback)
+ ++Failed;
+ }
+
+// Verify that the clause works when passing a zero size.
+#pragma omp target dyn_groupprivate(fallback : 0) map(tofrom : Failed)
+ {
+ int IsFallback;
+ if (omp_get_dyn_groupprivate_ptr(0, &IsFallback))
+ ++Failed;
+ if (omp_get_dyn_groupprivate_size())
+ ++Failed;
+ if (IsFallback)
+ ++Failed;
+ }
+
+// Verify that omitting the clause is the same as setting zero size.
+#pragma omp target map(tofrom : Failed)
+ {
+ int IsFallback;
+ if (omp_get_dyn_groupprivate_ptr(0, &IsFallback))
+ ++Failed;
+ if (omp_get_dyn_groupprivate_size())
+ ++Failed;
+ if (IsFallback)
+ ++Failed;
+ }
+
+ // CHECK: PASS
+ if (!Failed)
+ printf("PASS\n");
+}
diff --git a/openmp/runtime/src/include/omp.h.var b/openmp/runtime/src/include/omp.h.var
index 74f385feb3ea5..26c3df56a9ce3 100644
--- a/openmp/runtime/src/include/omp.h.var
+++ b/openmp/runtime/src/include/omp.h.var
@@ -380,6 +380,10 @@
omp_uintptr_t value;
} omp_alloctrait_t;
+ typedef enum {
+ omp_access_cgroup = 0,
+ } omp_access_t;
+
# if defined(_WIN32)
// On Windows cl and icl do not support 64-bit enum, let's use integer then.
typedef omp_uintptr_t omp_allocator_handle_t;
@@ -463,6 +467,9 @@
omp_allocator_handle_t allocator = omp_null_allocator,
omp_allocator_handle_t free_allocator = omp_null_allocator);
extern void __KAI_KMPC_CONVENTION omp_free(void * ptr, omp_allocator_handle_t a = omp_null_allocator);
+ extern void *__KAI_KMPC_CONVENTION omp_get_dyn_groupprivate_ptr(size_t offset = 0, int *is_fallback = NULL, omp_access_t access_group = omp_access_cgroup);
+ extern size_t __KAI_KMPC_CONVENTION omp_get_dyn_groupprivate_size(omp_access_t access_group = omp_access_cgroup);
+ extern size_t __KAI_KMPC_CONVENTION omp_get_groupprivate_limit(int device_num, omp_access_t access_group = omp_access_cgroup);
# else
extern void *__KAI_KMPC_CONVENTION omp_alloc(size_t size, omp_allocator_handle_t a);
extern void *__KAI_KMPC_CONVENTION omp_aligned_alloc(size_t align, size_t size,
@@ -473,6 +480,9 @@
extern void *__KAI_KMPC_CONVENTION omp_realloc(void *ptr, size_t size, omp_allocator_handle_t allocator,
omp_allocator_handle_t free_allocator);
extern void __KAI_KMPC_CONVENTION omp_free(void *ptr, omp_allocator_handle_t a);
+ extern void *__KAI_KMPC_CONVENTION omp_get_dyn_groupprivate_ptr(size_t offset, int *is_fallback, omp_access_t access_group);
+ extern size_t __KAI_KMPC_CONVENTION omp_get_dyn_groupprivate_size(omp_access_t access_group);
+ extern size_t __KAI_KMPC_CONVENTION omp_get_groupprivate_limit(int device_num, omp_access_t access_group);
# endif
/* OpenMP TR11 routines to get memory spaces and allocators */
diff --git a/openmp/runtime/src/kmp_csupport.cpp b/openmp/runtime/src/kmp_csupport.cpp
index 3ca32ba583fe2..9605bad457e11 100644
--- a/openmp/runtime/src/kmp_csupport.cpp
+++ b/openmp/runtime/src/kmp_csupport.cpp
@@ -4515,6 +4515,15 @@ void omp_free(void *ptr, omp_allocator_handle_t allocator) {
}
/* end of OpenMP 5.1 Memory Management routines */
+void *omp_get_dyn_groupprivate_ptr(size_t offset, int *is_fallback,
+ omp_access_t access_group) {
+ if (is_fallback != NULL)
+ *is_fallback = 0;
+ return NULL;
+}
+
+size_t omp_get_dyn_groupprivate_size(omp_access_t access_group) { return 0; }
+
int __kmpc_get_target_offload(void) {
if (!__kmp_init_serial) {
__kmp_serial_initialize();
diff --git a/openmp/runtime/src/kmp_stub.cpp b/openmp/runtime/src/kmp_stub.cpp
index 06276d1bed1c7..a099f887b6ba4 100644
--- a/openmp/runtime/src/kmp_stub.cpp
+++ b/openmp/runtime/src/kmp_stub.cpp
@@ -454,6 +454,22 @@ void omp_free(void *ptr, omp_allocator_handle_t allocator) {
#endif
}
+void *omp_get_dyn_groupprivate_ptr(size_t offset, int *is_fallback,
+ omp_access_t access_group) {
+ i;
+ return NULL;
+}
+
+size_t omp_get_dyn_groupprivate_size(omp_access_t access_group) {
+ i;
+ return 0;
+}
+
+size_t omp_get_groupprivate_limit(int device_num, omp_access_t access_group) {
+ i;
+ return 0;
+}
+
/* OpenMP 5.0 Affinity Format */
void omp_set_affinity_format(char const *format) { i; }
size_t omp_get_affinity_format(char *buffer, size_t size) {
From f20f4ba2290c0966e86dae733ad025d2fb0995d2 Mon Sep 17 00:00:00 2001
From: Kevin Sala <salapenades1 at llnl.gov>
Date: Sat, 9 Aug 2025 22:50:23 -0700
Subject: [PATCH 2/2] Add fixes
---
offload/DeviceRTL/include/DeviceTypes.h | 5 +++++
offload/DeviceRTL/src/State.cpp | 19 ++++++++++---------
offload/include/omptarget.h | 15 +++++++++++----
offload/plugins-nextgen/amdgpu/src/rtl.cpp | 4 ++++
offload/plugins-nextgen/cuda/src/rtl.cpp | 4 ++++
openmp/runtime/src/kmp_csupport.cpp | 2 +-
6 files changed, 35 insertions(+), 14 deletions(-)
diff --git a/offload/DeviceRTL/include/DeviceTypes.h b/offload/DeviceRTL/include/DeviceTypes.h
index a43b506d6879e..042fef45917b0 100644
--- a/offload/DeviceRTL/include/DeviceTypes.h
+++ b/offload/DeviceRTL/include/DeviceTypes.h
@@ -163,8 +163,13 @@ typedef enum omp_allocator_handle_t {
///}
+/// The OpenMP access group type: the criterion for grouping tasks using a
+/// specific grouping property.
enum omp_access_t {
+ /// Groups the tasks based on the contention group to which they belong.
omp_access_cgroup = 0,
+ /// Groups the tasks based on the parallel region to which they bind.
+ omp_access_pteam = 1,
};
#endif
diff --git a/offload/DeviceRTL/src/State.cpp b/offload/DeviceRTL/src/State.cpp
index 9e2a9999167b4..c6bc6a140f5f2 100644
--- a/offload/DeviceRTL/src/State.cpp
+++ b/offload/DeviceRTL/src/State.cpp
@@ -163,14 +163,15 @@ struct DynCGroupMemTy {
Size = 0;
Ptr = nullptr;
IsFallback = false;
- if (KLE) {
- Size = KLE->DynCGroupMemSize;
- if (void *Fallback = KLE->DynCGroupMemFallback) {
- Ptr = static_cast<char *>(Fallback) + Size * omp_get_team_num();
- IsFallback = true;
- } else {
- Ptr = static_cast<char *>(NativeDynCGroup);
- }
+ if (!KLE)
+ return;
+
+ Size = KLE->DynCGroupMemSize;
+ if (void *Fallback = KLE->DynCGroupMemFallback) {
+ Ptr = static_cast<char *>(Fallback) + Size * omp_get_team_num();
+ IsFallback = true;
+ } else {
+ Ptr = static_cast<char *>(NativeDynCGroup);
}
}
@@ -466,7 +467,7 @@ int omp_is_initial_device(void) { return 0; }
void *omp_get_dyn_groupprivate_ptr(size_t Offset, int *IsFallback,
omp_access_t) {
- if (IsFallback != NULL)
+ if (IsFallback != nullptr)
*IsFallback = DynCGroupMem.isFallback();
return DynCGroupMem.getPtr(Offset);
}
diff --git a/offload/include/omptarget.h b/offload/include/omptarget.h
index 45bb74ec367d6..ddb0f7f88d2e0 100644
--- a/offload/include/omptarget.h
+++ b/offload/include/omptarget.h
@@ -273,15 +273,22 @@ struct __tgt_target_non_contig {
extern "C" {
#endif
-typedef enum {
- omp_access_cgroup = 0,
-} omp_access_t;
+/// The OpenMP access group type: the criterion for grouping tasks using a
+/// specific grouping property.
+enum omp_access_t {
+ /// Groups the tasks based on the contention group to which they belong.
+ omp_access_cgroup = 0,
+ /// Groups the tasks based on the parallel region to which they bind.
+ omp_access_pteam = 1,
+};
void ompx_dump_mapping_tables(void);
int omp_get_num_devices(void);
int omp_get_device_num(void);
int omp_get_initial_device(void);
-size_t omp_get_groupprivate_limit(int device_num, omp_access_t access_group = omp_access_cgroup);
+size_t
+omp_get_groupprivate_limit(int device_num,
+ omp_access_t access_group = omp_access_cgroup);
void *omp_target_alloc(size_t Size, int DeviceNum);
void omp_target_free(void *DevicePtr, int DeviceNum);
int omp_target_is_present(const void *Ptr, int DeviceNum);
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index fa373c2029f0c..9751169b09c60 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -3441,6 +3441,10 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
KernelArgs.DynCGroupMem);
}
+ // Increase the dynamic memory size to the device-requested value if needed.
+ DynBlockMemSize =
+ std::max(DynBlockMemSize, GenericDevice.getDynamicMemorySize());
+
// Push the kernel launch into the stream.
return Stream->pushKernelLaunch(*this, AllArgs, NumThreads, NumBlocks,
getStaticBlockMemSize() + DynBlockMemSize,
diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp
index eda7a85f750f0..b052197e2aa6a 100644
--- a/offload/plugins-nextgen/cuda/src/rtl.cpp
+++ b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -1322,6 +1322,10 @@ Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
if (GenericDevice.getRPCServer())
GenericDevice.Plugin.getRPCServer().Thread->notify();
+ // Increase the dynamic memory size to the device-requested value if needed.
+ DynBlockMemSize =
+ std::max(DynBlockMemSize, GenericDevice.getDynamicMemorySize());
+
// In case we require more memory than the current limit.
if (DynBlockMemSize >= MaxDynBlockMemSize) {
CUresult AttrResult = cuFuncSetAttribute(
diff --git a/openmp/runtime/src/kmp_csupport.cpp b/openmp/runtime/src/kmp_csupport.cpp
index 9605bad457e11..3ac62e5893f8b 100644
--- a/openmp/runtime/src/kmp_csupport.cpp
+++ b/openmp/runtime/src/kmp_csupport.cpp
@@ -4517,7 +4517,7 @@ void omp_free(void *ptr, omp_allocator_handle_t allocator) {
void *omp_get_dyn_groupprivate_ptr(size_t offset, int *is_fallback,
omp_access_t access_group) {
- if (is_fallback != NULL)
+ if (is_fallback != nullptr)
*is_fallback = 0;
return NULL;
}