[Openmp-commits] [openmp] 1f583c6 - [OpenMP][Offload] Add offload runtime support for dyn_groupprivate clause (#152831)
via Openmp-commits
openmp-commits at lists.llvm.org
Thu Mar 12 01:13:13 PDT 2026
Author: Kevin Sala Penades
Date: 2026-03-12T01:13:06-07:00
New Revision: 1f583c6dee360b0f5837a1026f2c594643cf885c
URL: https://github.com/llvm/llvm-project/commit/1f583c6dee360b0f5837a1026f2c594643cf885c
DIFF: https://github.com/llvm/llvm-project/commit/1f583c6dee360b0f5837a1026f2c594643cf885c.diff
LOG: [OpenMP][Offload] Add offload runtime support for dyn_groupprivate clause (#152831)
Part 3 adding offload runtime support. See
https://github.com/llvm/llvm-project/pull/152651.
---------
Co-authored-by: Krzysztof Parzyszek <Krzysztof.Parzyszek at amd.com>
Added:
offload/test/offloading/dyn_groupprivate.cpp
Modified:
llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
offload/include/Shared/APITypes.h
offload/include/Shared/Environment.h
offload/include/device.h
offload/include/omptarget.h
offload/libomptarget/OpenMP/API.cpp
offload/libomptarget/exports
offload/plugins-nextgen/amdgpu/src/rtl.cpp
offload/plugins-nextgen/common/include/PluginInterface.h
offload/plugins-nextgen/common/src/PluginInterface.cpp
offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
offload/plugins-nextgen/cuda/src/rtl.cpp
offload/plugins-nextgen/host/src/rtl.cpp
offload/plugins-nextgen/level_zero/include/L0Kernel.h
offload/plugins-nextgen/level_zero/src/L0Kernel.cpp
openmp/device/include/DeviceTypes.h
openmp/device/include/Interface.h
openmp/device/include/State.h
openmp/device/src/Kernel.cpp
openmp/device/src/State.cpp
openmp/runtime/src/dllexports
openmp/runtime/src/include/omp.h.var
openmp/runtime/src/kmp.h
openmp/runtime/src/kmp_csupport.cpp
openmp/runtime/src/kmp_global.cpp
openmp/runtime/src/kmp_stub.cpp
Removed:
################################################################################
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
index 152a8f727310a..5fe7ee8997243 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
@@ -101,7 +101,7 @@ __OMP_STRUCT_TYPE(DynamicEnvironment, DynamicEnvironmentTy, false, Int16)
__OMP_STRUCT_TYPE(KernelEnvironment, KernelEnvironmentTy, false,
ConfigurationEnvironment, IdentPtr, DynamicEnvironmentPtr)
__OMP_STRUCT_TYPE(KernelLaunchEnvironment, KernelLaunchEnvironmentTy, false,
- Int32, Int32)
+ VoidPtr, VoidPtr, Int32, Int32, Int32, Int8)
#undef __OMP_STRUCT_TYPE
#undef OMP_STRUCT_TYPE
diff --git a/offload/include/Shared/APITypes.h b/offload/include/Shared/APITypes.h
index 8c150b6bfc2d4..6183686290bd4 100644
--- a/offload/include/Shared/APITypes.h
+++ b/offload/include/Shared/APITypes.h
@@ -102,8 +102,9 @@ struct KernelArgsTy {
struct {
uint64_t NoWait : 1; // Was this kernel spawned with a `nowait` clause.
uint64_t IsCUDA : 1; // Was this kernel spawned via CUDA.
- uint64_t Unused : 62;
- } Flags = {0, 0, 0};
+ uint64_t DynCGroupMemFallback : 2; // The fallback for dynamic cgroup mem.
+ uint64_t Unused : 60;
+ } Flags = {0, 0, 0, 0};
// The number of teams (for x,y,z dimension).
uint32_t NumTeams[3] = {0, 0, 0};
// The number of threads (for x,y,z dimension).
diff --git a/offload/include/Shared/Environment.h b/offload/include/Shared/Environment.h
index 79e45fd8e082d..142fba40340e6 100644
--- a/offload/include/Shared/Environment.h
+++ b/offload/include/Shared/Environment.h
@@ -70,10 +70,25 @@ struct KernelEnvironmentTy {
DynamicEnvironmentTy *DynamicEnv = nullptr;
};
+/// The fallback types for the dynamic cgroup memory.
+enum class DynCGroupMemFallbackType : uint8_t {
+ /// None. Used for indicating that no fallback was triggered.
+ None = 0,
+ /// Abort the execution.
+ Abort = None,
+ /// Return null pointer.
+ Null = 1,
+ /// Allocate from a implementation defined memory space.
+ DefaultMem = 2
+};
+
struct KernelLaunchEnvironmentTy {
+ void *ReductionBuffer = nullptr;
+ void *DynCGroupMemFbPtr = nullptr;
uint32_t ReductionCnt = 0;
uint32_t ReductionIterCnt = 0;
- void *ReductionBuffer = nullptr;
+ uint32_t DynCGroupMemSize = 0;
+ DynCGroupMemFallbackType DynCGroupMemFb = DynCGroupMemFallbackType::None;
};
#endif // OMPTARGET_SHARED_ENVIRONMENT_H
diff --git a/offload/include/device.h b/offload/include/device.h
index 4e27943d1dbc1..06d21397c7377 100644
--- a/offload/include/device.h
+++ b/offload/include/device.h
@@ -37,6 +37,8 @@
#include "PluginInterface.h"
using GenericPluginTy = llvm::omp::target::plugin::GenericPluginTy;
+using DeviceInfo = llvm::omp::target::plugin::DeviceInfo;
+using InfoTreeNode = llvm::omp::target::plugin::InfoTreeNode;
// Forward declarations.
struct __tgt_bin_desc;
@@ -167,6 +169,20 @@ struct DeviceTy {
/// Indicate that there are pending images for this device or not.
void setHasPendingImages(bool V) { HasPendingImages = V; }
+ /// Get information from the device.
+ template <typename T> T getInfo(DeviceInfo Info) const {
+ InfoTreeNode DevInfo = RTL->obtain_device_info(RTLDeviceID);
+
+ auto EntryOpt = DevInfo.get(Info);
+ if (!EntryOpt)
+ return 0;
+
+ auto Entry = *EntryOpt;
+ if (!std::holds_alternative<T>(Entry->Value))
+ return T{};
+ return std::get<T>(Entry->Value);
+ }
+
private:
/// Deinitialize the device (and plugin).
void deinit();
diff --git a/offload/include/omptarget.h b/offload/include/omptarget.h
index 867ba8d5e9f1e..40c16a4a7580f 100644
--- a/offload/include/omptarget.h
+++ b/offload/include/omptarget.h
@@ -274,12 +274,23 @@ struct __tgt_target_non_contig {
extern "C" {
#endif
+/// The OpenMP access group type. The criterion for grouping tasks using a
+/// specific grouping property.
+enum omp_access_t {
+ /// Groups the tasks based on the contention group to which they belong.
+ omp_access_cgroup = 0,
+ /// Groups the tasks based on the parallel region to which they bind.
+ omp_access_pteam = 1,
+};
+
void ompx_dump_mapping_tables(void);
int omp_get_num_devices(void);
int omp_get_device_num(void);
int omp_get_device_from_uid(const char *DeviceUid);
const char *omp_get_uid_from_device(int DeviceNum);
int omp_get_initial_device(void);
+size_t omp_get_gprivate_limit(int DeviceNum,
+ omp_access_t AccessGroup = omp_access_cgroup);
void *omp_target_alloc(size_t Size, int DeviceNum);
void omp_target_free(void *DevicePtr, int DeviceNum);
int omp_target_is_present(const void *Ptr, int DeviceNum);
diff --git a/offload/libomptarget/OpenMP/API.cpp b/offload/libomptarget/OpenMP/API.cpp
index dddd494fa7aab..6dcd94e48e987 100644
--- a/offload/libomptarget/OpenMP/API.cpp
+++ b/offload/libomptarget/OpenMP/API.cpp
@@ -138,6 +138,22 @@ EXTERN int omp_get_initial_device(void) {
return HostDevice;
}
+EXTERN size_t omp_get_gprivate_limit(int DeviceNum, omp_access_t AccessGroup) {
+ TIMESCOPE();
+ OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
+ if (DeviceNum == omp_get_initial_device())
+ return 0;
+
+ if (AccessGroup != omp_access_cgroup)
+ return 0;
+
+ auto DeviceOrErr = PM->getDevice(DeviceNum);
+ if (!DeviceOrErr)
+ FATAL_MESSAGE(DeviceNum, "%s", toString(DeviceOrErr.takeError()).c_str());
+
+ return DeviceOrErr->getInfo<uint64_t>(DeviceInfo::WORK_GROUP_LOCAL_MEM_SIZE);
+}
+
EXTERN void *omp_target_alloc(size_t Size, int DeviceNum) {
TIMESCOPE_WITH_DETAILS("dst_dev=" + std::to_string(DeviceNum) +
";size=" + std::to_string(Size));
diff --git a/offload/libomptarget/exports b/offload/libomptarget/exports
index fccf57683b5b8..1831c43cc5f29 100644
--- a/offload/libomptarget/exports
+++ b/offload/libomptarget/exports
@@ -43,6 +43,7 @@ VERS1.0 {
omp_get_device_from_uid;
omp_get_uid_from_device;
omp_get_initial_device;
+ omp_get_gprivate_limit;
omp_target_alloc;
omp_target_free;
omp_target_is_accessible;
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index bfd07b0919d08..37d7c6345f02e 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -559,6 +559,9 @@ struct AMDGPUKernelTy : public GenericKernelTy {
return Err;
}
+ // Set the static block memory size required by the kernel.
+ StaticBlockMemSize = GroupSize;
+
// Make sure it is a kernel symbol.
if (SymbolType != HSA_SYMBOL_KIND_KERNEL)
return Plugin::error(ErrorCode::INVALID_BINARY,
@@ -582,8 +585,8 @@ struct AMDGPUKernelTy : public GenericKernelTy {
/// Launch the AMDGPU kernel function.
Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads[3],
- uint32_t NumBlocks[3], KernelArgsTy &KernelArgs,
- KernelLaunchParamsTy LaunchParams,
+ uint32_t NumBlocks[3], uint32_t DynBlockMemSize,
+ KernelArgsTy &KernelArgs, KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
/// Return maximum block size for maximum occupancy
@@ -3220,7 +3223,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
KernelArgsTy KernelArgs = {};
uint32_t NumBlocksAndThreads[3] = {1u, 1u, 1u};
if (auto Err = AMDGPUKernel.launchImpl(
- *this, NumBlocksAndThreads, NumBlocksAndThreads, KernelArgs,
+ *this, NumBlocksAndThreads, NumBlocksAndThreads, 0, KernelArgs,
KernelLaunchParamsTy{}, AsyncInfoWrapper))
return Err;
@@ -3755,6 +3758,7 @@ struct AMDGPUPluginTy final : public GenericPluginTy {
Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
uint32_t NumThreads[3], uint32_t NumBlocks[3],
+ uint32_t DynBlockMemSize,
KernelArgsTy &KernelArgs,
KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const {
@@ -3767,13 +3771,6 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
if (auto Err = ArgsMemoryManager.allocate(ArgsSize, &AllArgs))
return Err;
- // Account for user requested dynamic shared memory.
- uint32_t GroupSize = getGroupSize();
- if (uint32_t MaxDynCGroupMem = std::max(
- KernelArgs.DynCGroupMem, GenericDevice.getDynamicMemorySize())) {
- GroupSize += MaxDynCGroupMem;
- }
-
uint64_t StackSize;
if (auto Err = GenericDevice.getDeviceStackSize(StackSize))
return Err;
@@ -3825,9 +3822,17 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
KernelArgs.DynCGroupMem);
}
+ // Increase to the requested dynamic memory size for the device if needed.
+ DynBlockMemSize =
+ std::max(DynBlockMemSize, GenericDevice.getDynamicMemorySize());
+
+ // HSA requires the group segment size to include both static and dynamic.
+ uint32_t TotalBlockMemSize = getStaticBlockMemSize() + DynBlockMemSize;
+
// Push the kernel launch into the stream.
return Stream->pushKernelLaunch(*this, AllArgs, NumThreads, NumBlocks,
- GroupSize, StackSize, ArgsMemoryManager);
+ TotalBlockMemSize, StackSize,
+ ArgsMemoryManager);
}
Error AMDGPUKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index 83d20c620b96e..5ed3b57704da6 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -301,6 +301,18 @@ struct InfoTreeNode {
}
};
+/// Configuration of dynamic block memory needed for launching a kernel.
+struct DynBlockMemConfTy {
+ /// The size of the dynamic block memory buffer.
+ uint32_t Size = 0;
+ /// The size of dynamic shared memory natively provided by the device.
+ uint32_t NativeSize = 0;
+ /// The fallback that was triggered (if any).
+ DynCGroupMemFallbackType Fallback = DynCGroupMemFallbackType::None;
+ /// The fallback pointer if global memory was used as alternative.
+ void *FallbackPtr = nullptr;
+};
+
/// Class wrapping a __tgt_device_image and its offload entry table on a
/// specific device. This class is responsible for storing and managing
/// the offload entries for an image on a device.
@@ -363,7 +375,7 @@ struct GenericKernelTy {
AsyncInfoWrapperTy &AsyncInfoWrapper) const;
virtual Error launchImpl(GenericDeviceTy &GenericDevice,
uint32_t NumThreads[3], uint32_t NumBlocks[3],
- KernelArgsTy &KernelArgs,
+ uint32_t DynBlockMemSize, KernelArgsTy &KernelArgs,
KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const = 0;
@@ -373,6 +385,9 @@ struct GenericKernelTy {
/// Get the kernel name.
const char *getName() const { return Name.c_str(); }
+ /// Get the size of the static per-block memory consumed by the kernel.
+ uint32_t getStaticBlockMemSize() const { return StaticBlockMemSize; };
+
/// Get the kernel image.
DeviceImageTy &getImage() const {
assert(ImagePtr && "Kernel is not initialized!");
@@ -386,8 +401,10 @@ struct GenericKernelTy {
/// Return a device pointer to a new kernel launch environment.
Expected<KernelLaunchEnvironmentTy *>
- getKernelLaunchEnvironment(GenericDeviceTy &GenericDevice, uint32_t Version,
- AsyncInfoWrapperTy &AsyncInfo) const;
+ getKernelLaunchEnvironment(GenericDeviceTy &GenericDevice,
+ const KernelArgsTy &KernelArgs,
+ const DynBlockMemConfTy &DynBlockMemConf,
+ AsyncInfoWrapperTy &AsyncInfoWrapper) const;
/// Indicate whether an execution mode is valid.
static bool isValidExecutionMode(OMPTgtExecModeFlags ExecutionMode) {
@@ -433,6 +450,12 @@ struct GenericKernelTy {
uint32_t NumBlocks[3]) const;
private:
+ /// Prepare the block memory buffer requested for the kernel and execute the
+ /// specified fallback if necessary.
+ Expected<DynBlockMemConfTy> prepareBlockMemory(GenericDeviceTy &GenericDevice,
+ KernelArgsTy &KernelArgs,
+ uint32_t NumBlocks) const;
+
/// Prepare the arguments before launching the kernel.
KernelLaunchParamsTy
prepareArgs(GenericDeviceTy &GenericDevice, void **ArgPtrs,
@@ -487,6 +510,9 @@ struct GenericKernelTy {
/// The maximum number of threads which the kernel could leverage.
uint32_t MaxNumThreads;
+ /// The static memory sized per block.
+ uint32_t StaticBlockMemSize = 0;
+
/// The kernel environment, including execution flags.
KernelEnvironmentTy KernelEnvironment;
@@ -1498,6 +1524,9 @@ struct GenericPluginTy {
/// Query the current state of an asynchronous queue.
int32_t query_async(int32_t DeviceId, __tgt_async_info *AsyncInfoPtr);
+ /// Obtain information about the given device.
+ InfoTreeNode obtain_device_info(int32_t DeviceId);
+
/// Prints information about the given devices supported by the plugin.
void print_device_info(int32_t DeviceId);
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index 21ba9db292c4c..a9af92826e633 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -435,20 +435,21 @@ Error GenericKernelTy::init(GenericDeviceTy &GenericDevice,
Expected<KernelLaunchEnvironmentTy *>
GenericKernelTy::getKernelLaunchEnvironment(
- GenericDeviceTy &GenericDevice, uint32_t Version,
+ GenericDeviceTy &GenericDevice, const KernelArgsTy &KernelArgs,
+ const DynBlockMemConfTy &DynBlockMemConf,
AsyncInfoWrapperTy &AsyncInfoWrapper) const {
// Ctor/Dtor have no arguments, replaying uses the original kernel launch
// environment. Older versions of the compiler do not generate a kernel
// launch environment.
if (GenericDevice.Plugin.getRecordReplay().isReplaying() ||
- Version < OMP_KERNEL_ARG_MIN_VERSION_WITH_DYN_PTR)
+ KernelArgs.Version < OMP_KERNEL_ARG_MIN_VERSION_WITH_DYN_PTR)
return nullptr;
- if (!KernelEnvironment.Configuration.ReductionDataSize ||
- !KernelEnvironment.Configuration.ReductionBufferLength)
+ if ((!KernelEnvironment.Configuration.ReductionDataSize ||
+ !KernelEnvironment.Configuration.ReductionBufferLength) &&
+ KernelArgs.DynCGroupMem == 0)
return reinterpret_cast<KernelLaunchEnvironmentTy *>(~0);
- // TODO: Check if the kernel needs a launch environment.
auto AllocOrErr = GenericDevice.dataAlloc(sizeof(KernelLaunchEnvironmentTy),
/*HostPtr=*/nullptr,
TargetAllocTy::TARGET_ALLOC_DEVICE);
@@ -462,7 +463,14 @@ GenericKernelTy::getKernelLaunchEnvironment(
/// async data transfer.
auto &LocalKLE = (*AsyncInfoWrapper).KernelLaunchEnvironment;
LocalKLE = KernelLaunchEnvironment;
- {
+
+ LocalKLE.DynCGroupMemSize = DynBlockMemConf.Size;
+ LocalKLE.DynCGroupMemFbPtr = DynBlockMemConf.FallbackPtr;
+ LocalKLE.DynCGroupMemFb = DynBlockMemConf.Fallback;
+ LocalKLE.ReductionBuffer = nullptr;
+
+ if (KernelEnvironment.Configuration.ReductionDataSize &&
+ KernelEnvironment.Configuration.ReductionBufferLength) {
auto AllocOrErr = GenericDevice.dataAlloc(
KernelEnvironment.Configuration.ReductionDataSize *
KernelEnvironment.Configuration.ReductionBufferLength,
@@ -508,14 +516,81 @@ Error GenericKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
return Plugin::success();
}
+Expected<DynBlockMemConfTy>
+GenericKernelTy::prepareBlockMemory(GenericDeviceTy &GenericDevice,
+ KernelArgsTy &KernelArgs,
+ uint32_t NumBlocks) const {
+ uint32_t MaxBlockMemSize = GenericDevice.getMaxBlockSharedMemSize();
+ uint32_t DynBlockMemSize = KernelArgs.DynCGroupMem;
+ uint32_t TotalBlockMemSize = StaticBlockMemSize + DynBlockMemSize;
+ uint32_t DynNativeBlockMemSize = DynBlockMemSize;
+ void *DynFallbackPtr = nullptr;
+
+ // No enough block memory to cover the static one. Cannot run the kernel.
+ if (StaticBlockMemSize > MaxBlockMemSize)
+ return Plugin::error(ErrorCode::INVALID_ARGUMENT,
+ "Static block memory size exceeds maximum");
+ // No enough block memory to cover dynamic one, and the fallback is aborting.
+ if (static_cast<DynCGroupMemFallbackType>(
+ KernelArgs.Flags.DynCGroupMemFallback) ==
+ DynCGroupMemFallbackType::Abort &&
+ TotalBlockMemSize > MaxBlockMemSize)
+ return Plugin::error(
+ ErrorCode::INVALID_ARGUMENT,
+ "Requested block memory size (static + dynamic) exceeds maximum");
+
+ DynCGroupMemFallbackType DynFallback = DynCGroupMemFallbackType::None;
+ if (DynBlockMemSize && TotalBlockMemSize > MaxBlockMemSize) {
+ // Launch without native dynamic block memory.
+ DynNativeBlockMemSize = 0;
+ DynFallback = static_cast<DynCGroupMemFallbackType>(
+ KernelArgs.Flags.DynCGroupMemFallback);
+ if (DynFallback != DynCGroupMemFallbackType::DefaultMem) {
+ // Do not provide any memory as fallback.
+ DynBlockMemSize = 0;
+ } else {
+ // Get global memory as fallback.
+ auto AllocOrErr = GenericDevice.dataAlloc(
+ NumBlocks * DynBlockMemSize,
+ /*HostPtr=*/nullptr, TargetAllocTy::TARGET_ALLOC_DEVICE);
+ if (!AllocOrErr)
+ return AllocOrErr.takeError();
+ DynFallbackPtr = *AllocOrErr;
+ }
+ }
+ return DynBlockMemConfTy{DynBlockMemSize, DynNativeBlockMemSize, DynFallback,
+ DynFallbackPtr};
+}
+
Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs,
ptr
diff _t *ArgOffsets, KernelArgsTy &KernelArgs,
AsyncInfoWrapperTy &AsyncInfoWrapper) const {
llvm::SmallVector<void *, 16> Args;
llvm::SmallVector<void *, 16> Ptrs;
+ uint32_t NumThreads[3] = {KernelArgs.ThreadLimit[0],
+ KernelArgs.ThreadLimit[1],
+ KernelArgs.ThreadLimit[2]};
+ uint32_t NumBlocks[3] = {KernelArgs.NumTeams[0], KernelArgs.NumTeams[1],
+ KernelArgs.NumTeams[2]};
+ if (!isBareMode()) {
+ NumThreads[0] = getNumThreads(GenericDevice, NumThreads);
+ NumBlocks[0] = getNumBlocks(GenericDevice, NumBlocks, KernelArgs.Tripcount,
+ NumThreads[0], KernelArgs.ThreadLimit[0] > 0);
+ }
+
+ auto DynBlockMemConfOrErr =
+ prepareBlockMemory(GenericDevice, KernelArgs, NumBlocks[0]);
+ if (!DynBlockMemConfOrErr)
+ return DynBlockMemConfOrErr.takeError();
+
+ DynBlockMemConfTy &DynBlockMemConf = *DynBlockMemConfOrErr;
+ if (DynBlockMemConf.FallbackPtr)
+ AsyncInfoWrapper.freeAllocationAfterSynchronization(
+ DynBlockMemConf.FallbackPtr);
+
auto KernelLaunchEnvOrErr = getKernelLaunchEnvironment(
- GenericDevice, KernelArgs.Version, AsyncInfoWrapper);
+ GenericDevice, KernelArgs, DynBlockMemConf, AsyncInfoWrapper);
if (!KernelLaunchEnvOrErr)
return KernelLaunchEnvOrErr.takeError();
@@ -531,17 +606,6 @@ Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs,
Args, Ptrs, *KernelLaunchEnvOrErr);
}
- uint32_t NumThreads[3] = {KernelArgs.ThreadLimit[0],
- KernelArgs.ThreadLimit[1],
- KernelArgs.ThreadLimit[2]};
- uint32_t NumBlocks[3] = {KernelArgs.NumTeams[0], KernelArgs.NumTeams[1],
- KernelArgs.NumTeams[2]};
- if (!isBareMode()) {
- NumThreads[0] = getNumThreads(GenericDevice, NumThreads);
- NumBlocks[0] = getNumBlocks(GenericDevice, NumBlocks, KernelArgs.Tripcount,
- NumThreads[0], KernelArgs.ThreadLimit[0] > 0);
- }
-
// Record the kernel description after we modified the argument count and num
// blocks/threads.
RecordReplayTy &RecordReplay = GenericDevice.Plugin.getRecordReplay();
@@ -557,8 +621,9 @@ Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs,
printLaunchInfo(GenericDevice, KernelArgs, NumThreads, NumBlocks))
return Err;
- return launchImpl(GenericDevice, NumThreads, NumBlocks, KernelArgs,
- LaunchParams, AsyncInfoWrapper);
+ return launchImpl(GenericDevice, NumThreads, NumBlocks,
+ DynBlockMemConf.NativeSize, KernelArgs, LaunchParams,
+ AsyncInfoWrapper);
}
KernelLaunchParamsTy GenericKernelTy::prepareArgs(
@@ -1954,6 +2019,16 @@ int32_t GenericPluginTy::query_async(int32_t DeviceId,
return OFFLOAD_SUCCESS;
}
+InfoTreeNode GenericPluginTy::obtain_device_info(int32_t DeviceId) {
+ auto InfoOrErr = getDevice(DeviceId).obtainInfo();
+ if (auto Err = InfoOrErr.takeError()) {
+ REPORT() << "Failure to obtain device " << DeviceId
+ << " info: " << toString(std::move(Err));
+ return InfoTreeNode{};
+ }
+ return std::move(*InfoOrErr);
+}
+
void GenericPluginTy::print_device_info(int32_t DeviceId) {
if (auto Err = getDevice(DeviceId).printInfo())
REPORT() << "Failure to print device " << DeviceId
diff --git a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
index 7e42c66dddabb..fa4f4634ecec3 100644
--- a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
+++ b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
@@ -261,6 +261,7 @@ typedef enum CUdevice_attribute_enum {
typedef enum CUfunction_attribute_enum {
CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0,
+ CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES = 1,
CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES = 8,
} CUfunction_attribute;
diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp
index c50c70a4456fa..c96cf3d89d3d4 100644
--- a/offload/plugins-nextgen/cuda/src/rtl.cpp
+++ b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -150,14 +150,23 @@ struct CUDAKernelTy : public GenericKernelTy {
// The maximum number of threads cannot exceed the maximum of the kernel.
MaxNumThreads = std::min(MaxNumThreads, (uint32_t)MaxThreads);
+ int SharedMemSize;
+ Res = cuFuncGetAttribute(&SharedMemSize,
+ CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, Func);
+ if (auto Err = Plugin::check(Res, "Error in cuFuncGetAttribute: %s"))
+ return Err;
+
+ // Set the static block memory size required by the kernel.
+ StaticBlockMemSize = SharedMemSize;
+
// Retrieve the size of the arguments.
return initArgsSize();
}
/// Launch the CUDA kernel function.
Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads[3],
- uint32_t NumBlocks[3], KernelArgsTy &KernelArgs,
- KernelLaunchParamsTy LaunchParams,
+ uint32_t NumBlocks[3], uint32_t DynBlockMemSize,
+ KernelArgsTy &KernelArgs, KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
/// Return maximum block size for maximum occupancy
@@ -197,7 +206,7 @@ struct CUDAKernelTy : public GenericKernelTy {
CUfunction Func;
/// The maximum amount of dynamic shared memory per thread group. By default,
/// this is set to 48 KB.
- mutable uint32_t MaxDynCGroupMemLimit = 49152;
+ mutable uint32_t MaxDynBlockMemSize = 49152;
/// The size of the kernel arguments.
size_t ArgsSize;
@@ -1411,7 +1420,7 @@ struct CUDADeviceTy : public GenericDeviceTy {
KernelArgsTy KernelArgs = {};
uint32_t NumBlocksAndThreads[3] = {1u, 1u, 1u};
if (auto Err = CUDAKernel.launchImpl(
- *this, NumBlocksAndThreads, NumBlocksAndThreads, KernelArgs,
+ *this, NumBlocksAndThreads, NumBlocksAndThreads, 0, KernelArgs,
KernelLaunchParamsTy{}, AsyncInfoWrapper))
return Err;
@@ -1455,6 +1464,7 @@ struct CUDADeviceTy : public GenericDeviceTy {
Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
uint32_t NumThreads[3], uint32_t NumBlocks[3],
+ uint32_t DynBlockMemSize,
KernelArgsTy &KernelArgs,
KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const {
@@ -1470,9 +1480,6 @@ Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
if (auto Err = CUDADevice.getStream(AsyncInfoWrapper, Stream))
return Err;
- uint32_t MaxDynCGroupMem =
- std::max(KernelArgs.DynCGroupMem, GenericDevice.getDynamicMemorySize());
-
size_t ConfigArgsSize = ArgsSize;
void *Config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, LaunchParams.Data,
CU_LAUNCH_PARAM_BUFFER_SIZE,
@@ -1484,20 +1491,24 @@ Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
if (GenericDevice.getRPCServer())
GenericDevice.Plugin.getRPCServer().Thread->notify();
+ // Increase to the requested dynamic memory size for the device if needed.
+ DynBlockMemSize =
+ std::max(DynBlockMemSize, GenericDevice.getDynamicMemorySize());
+
// In case we require more memory than the current limit.
- if (MaxDynCGroupMem >= MaxDynCGroupMemLimit) {
+ if (DynBlockMemSize >= MaxDynBlockMemSize) {
CUresult AttrResult = cuFuncSetAttribute(
- Func, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, MaxDynCGroupMem);
+ Func, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, DynBlockMemSize);
if (auto Err = Plugin::check(
AttrResult,
"error in cuFuncSetAttribute while setting the memory limits: %s"))
return Err;
- MaxDynCGroupMemLimit = MaxDynCGroupMem;
+ MaxDynBlockMemSize = DynBlockMemSize;
}
CUresult Res = cuLaunchKernel(Func, NumBlocks[0], NumBlocks[1], NumBlocks[2],
NumThreads[0], NumThreads[1], NumThreads[2],
- MaxDynCGroupMem, Stream, nullptr, Config);
+ DynBlockMemSize, Stream, nullptr, Config);
// Register a callback to indicate when the kernel is complete.
if (GenericDevice.getRPCServer())
diff --git a/offload/plugins-nextgen/host/src/rtl.cpp b/offload/plugins-nextgen/host/src/rtl.cpp
index df2b6f2c1dba9..d1c9af92a9fb0 100644
--- a/offload/plugins-nextgen/host/src/rtl.cpp
+++ b/offload/plugins-nextgen/host/src/rtl.cpp
@@ -97,8 +97,8 @@ struct GenELF64KernelTy : public GenericKernelTy {
/// Launch the kernel using the libffi.
Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads[3],
- uint32_t NumBlocks[3], KernelArgsTy &KernelArgs,
- KernelLaunchParamsTy LaunchParams,
+ uint32_t NumBlocks[3], uint32_t DynBlockMemSize,
+ KernelArgsTy &KernelArgs, KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const override {
if (!SupportsFFI)
return Plugin::error(ErrorCode::UNSUPPORTED,
diff --git a/offload/plugins-nextgen/level_zero/include/L0Kernel.h b/offload/plugins-nextgen/level_zero/include/L0Kernel.h
index 1d5a014d9d0a5..50cdbd8390a9d 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Kernel.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Kernel.h
@@ -124,8 +124,8 @@ class L0KernelTy : public GenericKernelTy {
Error initImpl(GenericDeviceTy &GenericDevice, DeviceImageTy &Image) override;
/// Launch the L0 kernel function.
Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads[3],
- uint32_t NumBlocks[3], KernelArgsTy &KernelArgs,
- KernelLaunchParamsTy LaunchParams,
+ uint32_t NumBlocks[3], uint32_t DynBlockMemSize,
+ KernelArgsTy &KernelArgs, KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
Error deinit() {
CALL_ZE_RET_ERROR(zeKernelDestroy, zeKernel);
diff --git a/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp b/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp
index b608e6ffe7931..1bffbbcd2fe92 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp
@@ -413,9 +413,13 @@ Error L0KernelTy::setIndirectFlags(L0DeviceTy &l0Device,
Error L0KernelTy::launchImpl(GenericDeviceTy &GenericDevice,
uint32_t NumThreads[3], uint32_t NumBlocks[3],
- KernelArgsTy &KernelArgs,
+ uint32_t DynBlockMemSize, KernelArgsTy &KernelArgs,
KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const {
+ if (DynBlockMemSize > 0)
+ return Plugin::error(ErrorCode::UNSUPPORTED,
+ "dynamic shared memory is unsupported in L0 plugin");
+
auto &l0Device = L0DeviceTy::makeL0Device(GenericDevice);
__tgt_async_info *AsyncInfo = AsyncInfoWrapper;
diff --git a/offload/test/offloading/dyn_groupprivate.cpp b/offload/test/offloading/dyn_groupprivate.cpp
new file mode 100644
index 0000000000000..fd0c3de0c8c5d
--- /dev/null
+++ b/offload/test/offloading/dyn_groupprivate.cpp
@@ -0,0 +1,199 @@
+// RUN: %libomptarget-compilexx-generic -fopenmp-version=61
+// RUN: %libomptarget-run-generic | %fcheck-generic
+// RUN: %libomptarget-compileoptxx-generic -fopenmp-version=61
+// RUN: %libomptarget-run-generic | %fcheck-generic
+// REQUIRES: gpu
+// UNSUPPORTED: intelgpu
+
+#include <omp.h>
+#include <stdio.h>
+
+#define N 512
+
+int main() {
+ int Result[N], NumThreads;
+
+// Verify the groupprivate buffer works as expected.
+#pragma omp target teams num_teams(1) thread_limit(N) \
+ dyn_groupprivate(fallback(abort) : N * sizeof(Result[0])) \
+ map(from : Result, NumThreads)
+ {
+ int Buffer[N];
+#pragma omp parallel
+ {
+ int *DynBuffer = (int *)omp_get_dyn_gprivate_nofb_ptr();
+ int TId = omp_get_thread_num();
+ if (TId == 0)
+ NumThreads = omp_get_num_threads();
+ Buffer[TId] = 7;
+ DynBuffer[TId] = 3;
+#pragma omp barrier
+ int WrappedTId = (TId + 37) % NumThreads;
+ Result[TId] = Buffer[WrappedTId] + DynBuffer[WrappedTId];
+ }
+ }
+
+ if (NumThreads < N / 2 || NumThreads > N) {
+ printf("Expected number of threads to be in [%i:%i], but got: %i", N / 2, N,
+ NumThreads);
+ return -1;
+ }
+
+ int Failed = 0;
+ for (int i = 0; i < NumThreads; ++i) {
+ if (Result[i] != 7 + 3) {
+ printf("Result[%i] is %i, expected %i\n", i, Result[i], 7 + 3);
+ ++Failed;
+ }
+ }
+
+ // Verify that the routines in the host returns NULL and zero.
+ if (omp_get_dyn_gprivate_ptr())
+ ++Failed;
+ if (omp_get_dyn_gprivate_nofb_ptr())
+ ++Failed;
+ if (omp_get_dyn_gprivate_size())
+ ++Failed;
+
+ size_t MaxSize = omp_get_gprivate_limit(0, omp_access_cgroup);
+ size_t ExceededSize = MaxSize + 10;
+
+// Verify that the fallback(default_mem) modifier works.
+#pragma omp target dyn_groupprivate(fallback(default_mem) : ExceededSize) \
+ map(tofrom : Failed)
+ {
+ if (!omp_get_dyn_gprivate_ptr(0))
+ ++Failed;
+ if (omp_get_dyn_gprivate_nofb_ptr(0))
+ ++Failed;
+ if (omp_get_dyn_gprivate_ptr(0) == omp_get_dyn_gprivate_nofb_ptr(0))
+ ++Failed;
+ if (!omp_get_dyn_gprivate_size())
+ ++Failed;
+ if (omp_get_dyn_gprivate_size() != ExceededSize)
+ ++Failed;
+ if (omp_get_dyn_gprivate_memspace() != omp_default_mem_space)
+ ++Failed;
+ }
+
+// Verify that the fallback(null) modifier works.
+#pragma omp target dyn_groupprivate(fallback(null) : ExceededSize) \
+ map(tofrom : Failed)
+ {
+ if (omp_get_dyn_gprivate_ptr(0))
+ ++Failed;
+ if (omp_get_dyn_gprivate_nofb_ptr(0))
+ ++Failed;
+ if (omp_get_dyn_gprivate_ptr(0) != omp_get_dyn_gprivate_nofb_ptr(0))
+ ++Failed;
+ if (omp_get_dyn_gprivate_size())
+ ++Failed;
+ if (omp_get_dyn_gprivate_memspace() != omp_null_mem_space)
+ ++Failed;
+ }
+
+// Verify that the default modifier is fallback(default_mem).
+#pragma omp target dyn_groupprivate(ExceededSize)
+ {
+ if (!omp_get_dyn_gprivate_ptr(0))
+ ++Failed;
+ if (omp_get_dyn_gprivate_nofb_ptr(0))
+ ++Failed;
+ if (omp_get_dyn_gprivate_ptr(0) == omp_get_dyn_gprivate_nofb_ptr(0))
+ ++Failed;
+ if (!omp_get_dyn_gprivate_size())
+ ++Failed;
+ if (omp_get_dyn_gprivate_size() != ExceededSize)
+ ++Failed;
+ if (omp_get_dyn_gprivate_memspace() != omp_default_mem_space)
+ ++Failed;
+ }
+
+// Verify that the fallback(abort) modifier works.
+#pragma omp target dyn_groupprivate(fallback(abort) : N) map(tofrom : Failed)
+ {
+ if (!omp_get_dyn_gprivate_ptr(0))
+ ++Failed;
+ if (!omp_get_dyn_gprivate_nofb_ptr(0))
+ ++Failed;
+ if (omp_get_dyn_gprivate_ptr(0) != omp_get_dyn_gprivate_nofb_ptr(0))
+ ++Failed;
+ if (omp_get_dyn_gprivate_ptr(5) != omp_get_dyn_gprivate_nofb_ptr(5))
+ ++Failed;
+ if (!omp_get_dyn_gprivate_size())
+ ++Failed;
+ if (omp_get_dyn_gprivate_size() != N)
+ ++Failed;
+ if (omp_get_dyn_gprivate_memspace() != omp_cgroup_mem_space)
+ ++Failed;
+ }
+
+// Verify that the fallback(default_mem) does not trigger when not needed.
+#pragma omp target dyn_groupprivate(fallback(default_mem) : N) \
+ map(tofrom : Failed)
+ {
+ if (!omp_get_dyn_gprivate_ptr(0))
+ ++Failed;
+ if (!omp_get_dyn_gprivate_nofb_ptr(0))
+ ++Failed;
+ if (omp_get_dyn_gprivate_ptr(0) != omp_get_dyn_gprivate_nofb_ptr(0))
+ ++Failed;
+ if (!omp_get_dyn_gprivate_size())
+ ++Failed;
+ if (omp_get_dyn_gprivate_size() != N)
+ ++Failed;
+ if (omp_get_dyn_gprivate_memspace() != omp_cgroup_mem_space)
+ ++Failed;
+ }
+
+// Verify that the clause works when passing a zero size.
+#pragma omp target dyn_groupprivate(fallback(abort) : 0) map(tofrom : Failed)
+ {
+ if (omp_get_dyn_gprivate_ptr(0))
+ ++Failed;
+ if (omp_get_dyn_gprivate_nofb_ptr(0))
+ ++Failed;
+ if (omp_get_dyn_gprivate_ptr(0) != omp_get_dyn_gprivate_nofb_ptr(0))
+ ++Failed;
+ if (omp_get_dyn_gprivate_size())
+ ++Failed;
+ if (omp_get_dyn_gprivate_memspace() != omp_null_mem_space)
+ ++Failed;
+ }
+
+// Verify that the clause works when passing a zero size and
+// fallback(default_mem).
+#pragma omp target dyn_groupprivate(fallback(default_mem) : 0) \
+ map(tofrom : Failed)
+ {
+ if (omp_get_dyn_gprivate_ptr(0))
+ ++Failed;
+ if (omp_get_dyn_gprivate_nofb_ptr(0))
+ ++Failed;
+ if (omp_get_dyn_gprivate_ptr(0) != omp_get_dyn_gprivate_nofb_ptr(0))
+ ++Failed;
+ if (omp_get_dyn_gprivate_size())
+ ++Failed;
+ if (omp_get_dyn_gprivate_memspace() != omp_null_mem_space)
+ ++Failed;
+ }
+
+// Verify that omitting the clause is the same as setting zero size.
+#pragma omp target map(tofrom : Failed)
+ {
+ if (omp_get_dyn_gprivate_ptr(0))
+ ++Failed;
+ if (omp_get_dyn_gprivate_nofb_ptr(0))
+ ++Failed;
+ if (omp_get_dyn_gprivate_ptr(0) != omp_get_dyn_gprivate_nofb_ptr(0))
+ ++Failed;
+ if (omp_get_dyn_gprivate_size())
+ ++Failed;
+ if (omp_get_dyn_gprivate_memspace() != omp_null_mem_space)
+ ++Failed;
+ }
+
+ // CHECK: PASS
+ if (!Failed)
+ printf("PASS\n");
+}
diff --git a/openmp/device/include/DeviceTypes.h b/openmp/device/include/DeviceTypes.h
index fab6dbde5260d..57fb945b5a647 100644
--- a/openmp/device/include/DeviceTypes.h
+++ b/openmp/device/include/DeviceTypes.h
@@ -171,9 +171,29 @@ typedef enum omp_allocator_handle_t {
KMP_ALLOCATOR_MAX_HANDLE = ~(0LU)
} omp_allocator_handle_t;
+typedef enum omp_memspace_handle_t {
+ omp_null_mem_space = 0,
+ omp_default_mem_space = 99,
+ omp_large_cap_mem_space = 1,
+ omp_const_mem_space = 2,
+ omp_high_bw_mem_space = 3,
+ omp_low_lat_mem_space = 4,
+ omp_cgroup_mem_space = 5,
+ KMP_MEMSPACE_MAX_HANDLE = ~(0LU)
+} omp_memspace_handle_t;
+
#define __PRAGMA(STR) _Pragma(#STR)
#define OMP_PRAGMA(STR) __PRAGMA(omp STR)
///}
+/// The OpenMP access group type. The criterion for grupping tasks using a
+/// specific grouping property.
+enum omp_access_t {
+ /// Groups the tasks based on the contention group to which they belong.
+ omp_access_cgroup = 0,
+ /// Groups the tasks based on the parallel region to which they bind.
+ omp_access_pteam = 1,
+};
+
#endif
diff --git a/openmp/device/include/Interface.h b/openmp/device/include/Interface.h
index 71c3b1fc06d40..6a33ea2432c89 100644
--- a/openmp/device/include/Interface.h
+++ b/openmp/device/include/Interface.h
@@ -226,7 +226,7 @@ struct KernelEnvironmentTy;
int8_t __kmpc_is_spmd_exec_mode();
int32_t __kmpc_target_init(KernelEnvironmentTy &KernelEnvironment,
- KernelLaunchEnvironmentTy &KernelLaunchEnvironment);
+ KernelLaunchEnvironmentTy *KernelLaunchEnvironment);
void __kmpc_target_deinit();
diff --git a/openmp/device/include/State.h b/openmp/device/include/State.h
index 31dc1540d7dd4..d3cd3d981e29d 100644
--- a/openmp/device/include/State.h
+++ b/openmp/device/include/State.h
@@ -116,7 +116,7 @@ extern Local<ThreadStateTy **> ThreadStates;
/// Initialize the state machinery. Must be called by all threads.
void init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
- KernelLaunchEnvironmentTy &KernelLaunchEnvironment);
+ KernelLaunchEnvironmentTy *KernelLaunchEnvironment);
/// Return the kernel and kernel launch environment associated with the current
/// kernel. The former is static and contains compile time information that
diff --git a/openmp/device/src/Kernel.cpp b/openmp/device/src/Kernel.cpp
index 05af35d242ac5..a180df7b982e3 100644
--- a/openmp/device/src/Kernel.cpp
+++ b/openmp/device/src/Kernel.cpp
@@ -35,8 +35,8 @@ enum OMPTgtExecModeFlags : unsigned char {
};
static void
-inititializeRuntime(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
- KernelLaunchEnvironmentTy &KernelLaunchEnvironment) {
+initializeRuntime(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
+ KernelLaunchEnvironmentTy *KernelLaunchEnvironment) {
// Order is important here.
synchronize::init(IsSPMD);
mapping::init(IsSPMD);
@@ -80,17 +80,17 @@ extern "C" {
/// \param Ident Source location identification, can be NULL.
///
int32_t __kmpc_target_init(KernelEnvironmentTy &KernelEnvironment,
- KernelLaunchEnvironmentTy &KernelLaunchEnvironment) {
+ KernelLaunchEnvironmentTy *KernelLaunchEnvironment) {
ConfigurationEnvironmentTy &Configuration = KernelEnvironment.Configuration;
bool IsSPMD = Configuration.ExecMode & OMP_TGT_EXEC_MODE_SPMD;
bool UseGenericStateMachine = Configuration.UseGenericStateMachine;
if (IsSPMD) {
- inititializeRuntime(/*IsSPMD=*/true, KernelEnvironment,
- KernelLaunchEnvironment);
+ initializeRuntime(/*IsSPMD=*/true, KernelEnvironment,
+ KernelLaunchEnvironment);
synchronize::threadsAligned(atomic::relaxed);
} else {
- inititializeRuntime(/*IsSPMD=*/false, KernelEnvironment,
- KernelLaunchEnvironment);
+ initializeRuntime(/*IsSPMD=*/false, KernelEnvironment,
+ KernelLaunchEnvironment);
// No need to wait since only the main threads will execute user
// code and workers will run into a barrier right away.
}
diff --git a/openmp/device/src/State.cpp b/openmp/device/src/State.cpp
index 985e6b169137f..243af1f2cb5e2 100644
--- a/openmp/device/src/State.cpp
+++ b/openmp/device/src/State.cpp
@@ -40,6 +40,10 @@ using namespace ompx;
[[clang::loader_uninitialized]] static Local<KernelLaunchEnvironmentTy *>
KernelLaunchEnvironmentPtr;
+/// The pointer type for dynamic shared memory. This is important to keep
+/// the alignment and address space information.
+using SharedMemPtrTy = decltype(&DynamicSharedBuffer[0]);
+
///}
namespace {
@@ -138,6 +142,60 @@ void SharedMemorySmartStackTy::pop(void *Ptr, uint64_t Bytes) {
memory::freeGlobal(Ptr, "Slow path shared memory deallocation");
}
+/// Manager of the dynamic cgroup memory buffer.
+struct DynCGroupMemTy {
+ /// Initialize the manager with the information from the kernel launch
+ /// enviornment and the pointer to the native shared memory buffer.
+ void init(KernelLaunchEnvironmentTy *KLE, SharedMemPtrTy NativePtr) {
+ // Initialize default values.
+ NativeOrNullPtr = nullptr;
+ FallbackPtr = nullptr;
+ Size = 0;
+ Fallback = DynCGroupMemFallbackType::None;
+ if (!KLE)
+ return;
+
+ // Initialize values using the kernel launch environment.
+ Size = KLE->DynCGroupMemSize;
+ Fallback = KLE->DynCGroupMemFb;
+ if (Size && Fallback == DynCGroupMemFallbackType::None)
+ NativeOrNullPtr = NativePtr;
+ if (Fallback == DynCGroupMemFallbackType::DefaultMem)
+ FallbackPtr = static_cast<unsigned char *>(KLE->DynCGroupMemFbPtr) +
+ Size * mapping::getBlockIdInKernel();
+ }
+
+ /// Get the memory space of the buffer.
+ omp_memspace_handle_t getMemSpace() const {
+ if (Size == 0)
+ return omp_null_mem_space;
+ if (Fallback == DynCGroupMemFallbackType::None)
+ return omp_cgroup_mem_space;
+ return omp_default_mem_space;
+ }
+
+ /// Get the size of the buffer.
+ size_t getSize() const { return Size; }
+
+ /// Get the native pointer or null if it was a fallback.
+ SharedMemPtrTy getNativeOrNullPtr() const { return NativeOrNullPtr; }
+
+ /// Get the native pointer or the fallback pointer.
+ unsigned char *getNativeOrFallbackPtr() const {
+ return (Fallback == DynCGroupMemFallbackType::DefaultMem)
+ ? FallbackPtr
+ : getNativeOrNullPtr();
+ }
+
+private:
+ SharedMemPtrTy NativeOrNullPtr;
+ unsigned char *FallbackPtr;
+ size_t Size;
+ DynCGroupMemFallbackType Fallback;
+};
+
+[[clang::loader_uninitialized]] static Local<DynCGroupMemTy> DynCGroupMem;
+
} // namespace
void *memory::getDynamicBuffer() { return DynamicSharedBuffer; }
@@ -226,13 +284,18 @@ int returnValIfLevelIsActive(int Level, int Val, int DefaultVal,
} // namespace
void state::init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
- KernelLaunchEnvironmentTy &KernelLaunchEnvironment) {
+ KernelLaunchEnvironmentTy *KLE) {
SharedMemorySmartStack.init(IsSPMD);
+
+ if (KLE == reinterpret_cast<KernelLaunchEnvironmentTy *>(~0))
+ KLE = nullptr;
+
if (mapping::isInitialThreadInLevel0(IsSPMD)) {
+ DynCGroupMem.init(KLE, DynamicSharedBuffer);
TeamState.init(IsSPMD);
ThreadStates = nullptr;
KernelEnvironmentPtr = &KernelEnvironment;
- KernelLaunchEnvironmentPtr = &KernelLaunchEnvironment;
+ KernelLaunchEnvironmentPtr = KLE;
}
}
@@ -416,6 +479,25 @@ int omp_get_team_num() { return mapping::getBlockIdInKernel(); }
int omp_get_initial_device(void) { return -1; }
int omp_is_initial_device(void) { return 0; }
+
+void *omp_get_dyn_gprivate_ptr(size_t Offset, omp_access_t) {
+ return DynCGroupMem.getNativeOrFallbackPtr() + Offset;
+}
+
+void *omp_get_dyn_gprivate_nofb_ptr(size_t Offset, omp_access_t) {
+ unsigned char *Ptr = DynCGroupMem.getNativeOrNullPtr();
+ // Ensure the alignment and address space information is kept.
+ Ptr = (unsigned char *)__builtin_assume_aligned(Ptr, allocator::ALIGNMENT);
+ return (SharedMemPtrTy)(Ptr + Offset);
+}
+
+size_t omp_get_dyn_gprivate_size(omp_access_t) {
+ return DynCGroupMem.getSize();
+}
+
+omp_memspace_handle_t omp_get_dyn_gprivate_memspace(omp_access_t) {
+ return DynCGroupMem.getMemSpace();
+}
}
extern "C" {
diff --git a/openmp/runtime/src/dllexports b/openmp/runtime/src/dllexports
index 00becd1a657fd..8a70f8bc6d20c 100644
--- a/openmp/runtime/src/dllexports
+++ b/openmp/runtime/src/dllexports
@@ -607,6 +607,7 @@ kmp_set_disp_num_buffers 890
llvm_omp_target_shared_mem_space DATA
llvm_omp_target_device_mem_space DATA
omp_null_mem_space DATA
+ omp_cgroup_mem_space DATA
%ifndef stub
# Ordinals between 900 and 999 are reserved
diff --git a/openmp/runtime/src/include/omp.h.var b/openmp/runtime/src/include/omp.h.var
index e98df731ad888..be309727ba090 100644
--- a/openmp/runtime/src/include/omp.h.var
+++ b/openmp/runtime/src/include/omp.h.var
@@ -380,6 +380,11 @@
omp_uintptr_t value;
} omp_alloctrait_t;
+ typedef enum {
+ omp_access_cgroup = 0,
+ omp_access_pteam = 1
+ } omp_access_t;
+
# if defined(_WIN32)
// On Windows cl and icl do not support 64-bit enum, let's use integer then.
typedef omp_uintptr_t omp_allocator_handle_t;
@@ -403,6 +408,7 @@
extern __KMP_IMP omp_memspace_handle_t const omp_const_mem_space;
extern __KMP_IMP omp_memspace_handle_t const omp_high_bw_mem_space;
extern __KMP_IMP omp_memspace_handle_t const omp_low_lat_mem_space;
+ extern __KMP_IMP omp_memspace_handle_t const omp_cgroup_mem_space;
extern __KMP_IMP omp_memspace_handle_t const llvm_omp_target_host_mem_space;
extern __KMP_IMP omp_memspace_handle_t const llvm_omp_target_shared_mem_space;
extern __KMP_IMP omp_memspace_handle_t const llvm_omp_target_device_mem_space;
@@ -439,6 +445,7 @@
omp_const_mem_space = 2,
omp_high_bw_mem_space = 3,
omp_low_lat_mem_space = 4,
+ omp_cgroup_mem_space = 5,
llvm_omp_target_host_mem_space = 100,
llvm_omp_target_shared_mem_space = 101,
llvm_omp_target_device_mem_space = 102,
@@ -463,6 +470,11 @@
omp_allocator_handle_t allocator = omp_null_allocator,
omp_allocator_handle_t free_allocator = omp_null_allocator);
extern void __KAI_KMPC_CONVENTION omp_free(void * ptr, omp_allocator_handle_t a = omp_null_allocator);
+ extern void *__KAI_KMPC_CONVENTION omp_get_dyn_gprivate_ptr(size_t offset = 0, omp_access_t access_group = omp_access_cgroup);
+ extern void *__KAI_KMPC_CONVENTION omp_get_dyn_gprivate_nofb_ptr(size_t offset = 0, omp_access_t access_group = omp_access_cgroup);
+ extern size_t __KAI_KMPC_CONVENTION omp_get_dyn_gprivate_size(omp_access_t access_group = omp_access_cgroup);
+ extern omp_memspace_handle_t __KAI_KMPC_CONVENTION omp_get_dyn_gprivate_memspace(omp_access_t access_group = omp_access_cgroup);
+ extern size_t __KAI_KMPC_CONVENTION omp_get_gprivate_limit(int device_num, omp_access_t access_group = omp_access_cgroup);
# else
extern void *__KAI_KMPC_CONVENTION omp_alloc(size_t size, omp_allocator_handle_t a);
extern void *__KAI_KMPC_CONVENTION omp_aligned_alloc(size_t align, size_t size,
@@ -473,6 +485,11 @@
extern void *__KAI_KMPC_CONVENTION omp_realloc(void *ptr, size_t size, omp_allocator_handle_t allocator,
omp_allocator_handle_t free_allocator);
extern void __KAI_KMPC_CONVENTION omp_free(void *ptr, omp_allocator_handle_t a);
+ extern void *__KAI_KMPC_CONVENTION omp_get_dyn_gprivate_ptr(size_t offset, omp_access_t access_group);
+ extern void *__KAI_KMPC_CONVENTION omp_get_dyn_gprivate_nofb_ptr(size_t offset, omp_access_t access_group);
+ extern size_t __KAI_KMPC_CONVENTION omp_get_dyn_gprivate_size(omp_access_t access_group);
+ extern omp_memspace_handle_t __KAI_KMPC_CONVENTION omp_get_dyn_gprivate_memspace(omp_access_t access_group);
+ extern size_t __KAI_KMPC_CONVENTION omp_get_gprivate_limit(int device_num, omp_access_t access_group);
# endif
/* OpenMP TR11 routines to get memory spaces and allocators */
diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h
index 36c40abaf1ef4..19deaef75415d 100644
--- a/openmp/runtime/src/kmp.h
+++ b/openmp/runtime/src/kmp.h
@@ -1072,6 +1072,7 @@ extern omp_memspace_handle_t const omp_large_cap_mem_space;
extern omp_memspace_handle_t const omp_const_mem_space;
extern omp_memspace_handle_t const omp_high_bw_mem_space;
extern omp_memspace_handle_t const omp_low_lat_mem_space;
+extern omp_memspace_handle_t const omp_cgroup_mem_space;
extern omp_memspace_handle_t const llvm_omp_target_host_mem_space;
extern omp_memspace_handle_t const llvm_omp_target_shared_mem_space;
extern omp_memspace_handle_t const llvm_omp_target_device_mem_space;
diff --git a/openmp/runtime/src/kmp_csupport.cpp b/openmp/runtime/src/kmp_csupport.cpp
index a92fc46374c27..8aa9a9caa924b 100644
--- a/openmp/runtime/src/kmp_csupport.cpp
+++ b/openmp/runtime/src/kmp_csupport.cpp
@@ -4515,6 +4515,20 @@ void omp_free(void *ptr, omp_allocator_handle_t allocator) {
}
/* end of OpenMP 5.1 Memory Management routines */
+void *omp_get_dyn_gprivate_ptr(size_t offset, omp_access_t access_group) {
+ return NULL;
+}
+
+void *omp_get_dyn_gprivate_nofb_ptr(size_t offset, omp_access_t access_group) {
+ return NULL;
+}
+
+size_t omp_get_dyn_gprivate_size(omp_access_t access_group) { return 0; }
+
+omp_memspace_handle_t omp_get_dyn_gprivate_memspace(omp_access_t access_group) {
+ return omp_null_mem_space;
+}
+
int __kmpc_get_target_offload(void) {
if (!__kmp_init_serial) {
__kmp_serial_initialize();
diff --git a/openmp/runtime/src/kmp_global.cpp b/openmp/runtime/src/kmp_global.cpp
index 6c3b576cab405..c6fdcf824af92 100644
--- a/openmp/runtime/src/kmp_global.cpp
+++ b/openmp/runtime/src/kmp_global.cpp
@@ -333,6 +333,8 @@ omp_memspace_handle_t const omp_high_bw_mem_space =
(omp_memspace_handle_t const)3;
omp_memspace_handle_t const omp_low_lat_mem_space =
(omp_memspace_handle_t const)4;
+omp_memspace_handle_t const omp_cgroup_mem_space =
+ (omp_memspace_handle_t const)5;
omp_memspace_handle_t const llvm_omp_target_host_mem_space =
(omp_memspace_handle_t const)100;
omp_memspace_handle_t const llvm_omp_target_shared_mem_space =
diff --git a/openmp/runtime/src/kmp_stub.cpp b/openmp/runtime/src/kmp_stub.cpp
index 06276d1bed1c7..4c1e6099574a6 100644
--- a/openmp/runtime/src/kmp_stub.cpp
+++ b/openmp/runtime/src/kmp_stub.cpp
@@ -368,6 +368,8 @@ omp_memspace_handle_t const omp_high_bw_mem_space =
(omp_memspace_handle_t const)3;
omp_memspace_handle_t const omp_low_lat_mem_space =
(omp_memspace_handle_t const)4;
+omp_memspace_handle_t const omp_cgroup_mem_space =
+ (omp_memspace_handle_t const)5;
omp_memspace_handle_t const llvm_omp_target_host_mem_space =
(omp_memspace_handle_t const)100;
omp_memspace_handle_t const llvm_omp_target_shared_mem_space =
@@ -454,6 +456,31 @@ void omp_free(void *ptr, omp_allocator_handle_t allocator) {
#endif
}
+void *omp_get_dyn_gprivate_ptr(size_t offset, omp_access_t access_group) {
+ i;
+ return NULL;
+}
+
+void *omp_get_dyn_gprivate_nofb_ptr(size_t offset, omp_access_t access_group) {
+ i;
+ return NULL;
+}
+
+size_t omp_get_dyn_gprivate_size(omp_access_t access_group) {
+ i;
+ return 0;
+}
+
+omp_memspace_handle_t omp_get_dyn_gprivate_memspace(omp_access_t access_group) {
+ i;
+ return omp_null_mem_space;
+}
+
+size_t omp_get_gprivate_limit(int device_num, omp_access_t access_group) {
+ i;
+ return 0;
+}
+
/* OpenMP 5.0 Affinity Format */
void omp_set_affinity_format(char const *format) { i; }
size_t omp_get_affinity_format(char *buffer, size_t size) {
More information about the Openmp-commits
mailing list