[llvm-branch-commits] [llvm] [openmp] [OpenMP][Offload] Add offload runtime support for dyn_groupprivate clause (PR #152831)
Kevin Sala Penades via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Sat Aug 9 22:56:10 PDT 2025
https://github.com/kevinsala updated https://github.com/llvm/llvm-project/pull/152831
From fa3c7425ae9e5ffea83841f2be61b0f494b99038 Mon Sep 17 00:00:00 2001
From: Kevin Sala <salapenades1 at llnl.gov>
Date: Fri, 8 Aug 2025 11:25:14 -0700
Subject: [PATCH 1/2] [OpenMP][Offload] Add offload runtime support for
dyn_groupprivate clause
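This patch adds the offload runtime entry points behind the dyn_groupprivate
clause: omp_get_dyn_groupprivate_ptr, omp_get_dyn_groupprivate_size, and
omp_get_groupprivate_limit. Below is a minimal usage sketch condensed from the
new test in offload/test/offloading/dyn_groupprivate_strict.cpp; the
dyn_groupprivate clause itself relies on matching compiler support that is not
part of this patch, and the clause/modifier spelling may still change:

    #include <omp.h>
    #include <stdio.h>

    int main() {
      // Host-side query: upper bound on per-team (contention group) memory.
      size_t Limit = omp_get_groupprivate_limit(/*device_num=*/0, omp_access_cgroup);
      printf("groupprivate limit: %zu bytes\n", Limit);

      int Failed = 0;
    #pragma omp target dyn_groupprivate(fallback : 1024) map(tofrom : Failed)
      {
        int IsFallback;
        // Per-team dynamic buffer; IsFallback reports whether the runtime fell
        // back to a global-memory allocation instead of native shared memory.
        char *Buf = (char *)omp_get_dyn_groupprivate_ptr(0, &IsFallback,
                                                         omp_access_cgroup);
        if (!Buf || omp_get_dyn_groupprivate_size(omp_access_cgroup) != 1024)
          ++Failed;
      }
      return Failed;
    }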
---
offload/DeviceRTL/include/DeviceTypes.h | 4 +
offload/DeviceRTL/include/Interface.h | 2 +-
offload/DeviceRTL/include/State.h | 2 +-
offload/DeviceRTL/src/Kernel.cpp | 14 +-
offload/DeviceRTL/src/State.cpp | 48 +++++-
offload/include/Shared/APITypes.h | 6 +-
offload/include/Shared/Environment.h | 4 +-
offload/include/device.h | 3 +
offload/include/omptarget.h | 7 +-
offload/libomptarget/OpenMP/API.cpp | 14 ++
offload/libomptarget/device.cpp | 6 +
offload/libomptarget/exports | 1 +
.../amdgpu/dynamic_hsa/hsa_ext_amd.h | 1 +
offload/plugins-nextgen/amdgpu/src/rtl.cpp | 34 +++--
.../common/include/PluginInterface.h | 33 +++-
.../common/src/PluginInterface.cpp | 86 ++++++++---
.../plugins-nextgen/cuda/dynamic_cuda/cuda.h | 1 +
offload/plugins-nextgen/cuda/src/rtl.cpp | 37 +++--
offload/plugins-nextgen/host/src/rtl.cpp | 4 +-
.../offloading/dyn_groupprivate_strict.cpp | 141 ++++++++++++++++++
openmp/runtime/src/include/omp.h.var | 10 ++
openmp/runtime/src/kmp_csupport.cpp | 9 ++
openmp/runtime/src/kmp_stub.cpp | 16 ++
23 files changed, 418 insertions(+), 65 deletions(-)
create mode 100644 offload/test/offloading/dyn_groupprivate_strict.cpp
diff --git a/offload/DeviceRTL/include/DeviceTypes.h b/offload/DeviceRTL/include/DeviceTypes.h
index 2e5d92380f040..a43b506d6879e 100644
--- a/offload/DeviceRTL/include/DeviceTypes.h
+++ b/offload/DeviceRTL/include/DeviceTypes.h
@@ -163,4 +163,8 @@ typedef enum omp_allocator_handle_t {
///}
+enum omp_access_t {
+ omp_access_cgroup = 0,
+};
+
#endif
diff --git a/offload/DeviceRTL/include/Interface.h b/offload/DeviceRTL/include/Interface.h
index c4bfaaa2404b4..672afea206785 100644
--- a/offload/DeviceRTL/include/Interface.h
+++ b/offload/DeviceRTL/include/Interface.h
@@ -222,7 +222,7 @@ struct KernelEnvironmentTy;
int8_t __kmpc_is_spmd_exec_mode();
int32_t __kmpc_target_init(KernelEnvironmentTy &KernelEnvironment,
- KernelLaunchEnvironmentTy &KernelLaunchEnvironment);
+ KernelLaunchEnvironmentTy *KernelLaunchEnvironment);
void __kmpc_target_deinit();
diff --git a/offload/DeviceRTL/include/State.h b/offload/DeviceRTL/include/State.h
index db396dae6e445..17c3c6f2d3e42 100644
--- a/offload/DeviceRTL/include/State.h
+++ b/offload/DeviceRTL/include/State.h
@@ -116,7 +116,7 @@ extern Local<ThreadStateTy **> ThreadStates;
/// Initialize the state machinery. Must be called by all threads.
void init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
- KernelLaunchEnvironmentTy &KernelLaunchEnvironment);
+ KernelLaunchEnvironmentTy *KernelLaunchEnvironment);
/// Return the kernel and kernel launch environment associated with the current
/// kernel. The former is static and contains compile time information that
diff --git a/offload/DeviceRTL/src/Kernel.cpp b/offload/DeviceRTL/src/Kernel.cpp
index 467e44a65276c..58e9a09105a76 100644
--- a/offload/DeviceRTL/src/Kernel.cpp
+++ b/offload/DeviceRTL/src/Kernel.cpp
@@ -34,8 +34,8 @@ enum OMPTgtExecModeFlags : unsigned char {
};
static void
-inititializeRuntime(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
- KernelLaunchEnvironmentTy &KernelLaunchEnvironment) {
+initializeRuntime(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
+ KernelLaunchEnvironmentTy *KernelLaunchEnvironment) {
// Order is important here.
synchronize::init(IsSPMD);
mapping::init(IsSPMD);
@@ -80,17 +80,17 @@ extern "C" {
/// \param Ident Source location identification, can be NULL.
///
int32_t __kmpc_target_init(KernelEnvironmentTy &KernelEnvironment,
- KernelLaunchEnvironmentTy &KernelLaunchEnvironment) {
+ KernelLaunchEnvironmentTy *KernelLaunchEnvironment) {
ConfigurationEnvironmentTy &Configuration = KernelEnvironment.Configuration;
bool IsSPMD = Configuration.ExecMode & OMP_TGT_EXEC_MODE_SPMD;
bool UseGenericStateMachine = Configuration.UseGenericStateMachine;
if (IsSPMD) {
- inititializeRuntime(/*IsSPMD=*/true, KernelEnvironment,
- KernelLaunchEnvironment);
+ initializeRuntime(/*IsSPMD=*/true, KernelEnvironment,
+ KernelLaunchEnvironment);
synchronize::threadsAligned(atomic::relaxed);
} else {
- inititializeRuntime(/*IsSPMD=*/false, KernelEnvironment,
- KernelLaunchEnvironment);
+ initializeRuntime(/*IsSPMD=*/false, KernelEnvironment,
+ KernelLaunchEnvironment);
// No need to wait since only the main threads will execute user
// code and workers will run into a barrier right away.
}
diff --git a/offload/DeviceRTL/src/State.cpp b/offload/DeviceRTL/src/State.cpp
index 62b03e7bba720..9e2a9999167b4 100644
--- a/offload/DeviceRTL/src/State.cpp
+++ b/offload/DeviceRTL/src/State.cpp
@@ -158,6 +158,34 @@ void SharedMemorySmartStackTy::pop(void *Ptr, uint64_t Bytes) {
memory::freeGlobal(Ptr, "Slow path shared memory deallocation");
}
+struct DynCGroupMemTy {
+ void init(KernelLaunchEnvironmentTy *KLE, void *NativeDynCGroup) {
+ Size = 0;
+ Ptr = nullptr;
+ IsFallback = false;
+ if (KLE) {
+ Size = KLE->DynCGroupMemSize;
+ if (void *Fallback = KLE->DynCGroupMemFallback) {
+ Ptr = static_cast<char *>(Fallback) + Size * omp_get_team_num();
+ IsFallback = true;
+ } else {
+ Ptr = static_cast<char *>(NativeDynCGroup);
+ }
+ }
+ }
+
+ char *getPtr(size_t Offset) const { return Ptr + Offset; }
+ bool isFallback() const { return IsFallback; }
+ size_t getSize() const { return Size; }
+
+private:
+ char *Ptr;
+ size_t Size;
+ bool IsFallback;
+};
+
+[[clang::loader_uninitialized]] static Local<DynCGroupMemTy> DynCGroupMem;
+
} // namespace
void *memory::getDynamicBuffer() { return DynamicSharedBuffer; }
@@ -246,13 +274,18 @@ int returnValIfLevelIsActive(int Level, int Val, int DefaultVal,
} // namespace
void state::init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
- KernelLaunchEnvironmentTy &KernelLaunchEnvironment) {
+ KernelLaunchEnvironmentTy *KLE) {
SharedMemorySmartStack.init(IsSPMD);
+
+ if (KLE == reinterpret_cast<KernelLaunchEnvironmentTy *>(~0))
+ KLE = nullptr;
+
if (mapping::isInitialThreadInLevel0(IsSPMD)) {
+ DynCGroupMem.init(KLE, DynamicSharedBuffer);
TeamState.init(IsSPMD);
ThreadStates = nullptr;
KernelEnvironmentPtr = &KernelEnvironment;
- KernelLaunchEnvironmentPtr = &KernelLaunchEnvironment;
+ KernelLaunchEnvironmentPtr = KLE;
}
}
@@ -430,6 +463,17 @@ int omp_get_team_num() { return mapping::getBlockIdInKernel(); }
int omp_get_initial_device(void) { return -1; }
int omp_is_initial_device(void) { return 0; }
+
+void *omp_get_dyn_groupprivate_ptr(size_t Offset, int *IsFallback,
+ omp_access_t) {
+ if (IsFallback != NULL)
+ *IsFallback = DynCGroupMem.isFallback();
+ return DynCGroupMem.getPtr(Offset);
+}
+
+size_t omp_get_dyn_groupprivate_size(omp_access_t) {
+ return DynCGroupMem.getSize();
+}
}
extern "C" {
diff --git a/offload/include/Shared/APITypes.h b/offload/include/Shared/APITypes.h
index 978b53d5d69b9..0ef2dd162292b 100644
--- a/offload/include/Shared/APITypes.h
+++ b/offload/include/Shared/APITypes.h
@@ -97,8 +97,10 @@ struct KernelArgsTy {
struct {
uint64_t NoWait : 1; // Was this kernel spawned with a `nowait` clause.
uint64_t IsCUDA : 1; // Was this kernel spawned via CUDA.
- uint64_t Unused : 62;
- } Flags = {0, 0, 0};
+ uint64_t AllowDynCGroupMemFallback : 1; // Allow fallback for dynamic
+ // cgroup memory allocation.
+ uint64_t Unused : 61;
+ } Flags = {0, 0, 0, 0};
// The number of teams (for x,y,z dimension).
uint32_t NumTeams[3] = {0, 0, 0};
// The number of threads (for x,y,z dimension).
diff --git a/offload/include/Shared/Environment.h b/offload/include/Shared/Environment.h
index 2a283bd6fa4ed..0670ac1090da4 100644
--- a/offload/include/Shared/Environment.h
+++ b/offload/include/Shared/Environment.h
@@ -93,9 +93,11 @@ struct KernelEnvironmentTy {
};
struct KernelLaunchEnvironmentTy {
+ void *ReductionBuffer = nullptr;
+ void *DynCGroupMemFallback = nullptr;
uint32_t ReductionCnt = 0;
uint32_t ReductionIterCnt = 0;
- void *ReductionBuffer = nullptr;
+ uint32_t DynCGroupMemSize = 0;
};
#endif // OMPTARGET_SHARED_ENVIRONMENT_H
diff --git a/offload/include/device.h b/offload/include/device.h
index f4b10abbaa3fd..0e93cf8ec1a8b 100644
--- a/offload/include/device.h
+++ b/offload/include/device.h
@@ -158,6 +158,9 @@ struct DeviceTy {
/// Indicate that there are pending images for this device or not.
void setHasPendingImages(bool V) { HasPendingImages = V; }
+ /// Get the maximum shared memory per team for any kernel.
+ uint64_t getMaxSharedTeamMemory();
+
private:
/// Deinitialize the device (and plugin).
void deinit();
diff --git a/offload/include/omptarget.h b/offload/include/omptarget.h
index 6971780c7bdb5..45bb74ec367d6 100644
--- a/offload/include/omptarget.h
+++ b/offload/include/omptarget.h
@@ -107,7 +107,7 @@ enum TargetAllocTy : int32_t {
inline KernelArgsTy CTorDTorKernelArgs = {1, 0, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr,
- 0, {0,0,0}, {1, 0, 0}, {1, 0, 0}, 0};
+ 0, {0,0,0,0}, {1, 0, 0}, {1, 0, 0}, 0};
struct DeviceTy;
@@ -273,10 +273,15 @@ struct __tgt_target_non_contig {
extern "C" {
#endif
+typedef enum {
+ omp_access_cgroup = 0,
+} omp_access_t;
+
void ompx_dump_mapping_tables(void);
int omp_get_num_devices(void);
int omp_get_device_num(void);
int omp_get_initial_device(void);
+size_t omp_get_groupprivate_limit(int device_num, omp_access_t access_group = omp_access_cgroup);
void *omp_target_alloc(size_t Size, int DeviceNum);
void omp_target_free(void *DevicePtr, int DeviceNum);
int omp_target_is_present(const void *Ptr, int DeviceNum);
diff --git a/offload/libomptarget/OpenMP/API.cpp b/offload/libomptarget/OpenMP/API.cpp
index 4576f9bd06121..1ed4192157fc8 100644
--- a/offload/libomptarget/OpenMP/API.cpp
+++ b/offload/libomptarget/OpenMP/API.cpp
@@ -98,6 +98,20 @@ EXTERN int omp_get_initial_device(void) {
return HostDevice;
}
+EXTERN size_t omp_get_groupprivate_limit(int DeviceNum,
+ omp_access_t AccessGroup) {
+ TIMESCOPE();
+ OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
+ if (DeviceNum == omp_get_initial_device())
+ return 0;
+
+ auto DeviceOrErr = PM->getDevice(DeviceNum);
+ if (!DeviceOrErr)
+ FATAL_MESSAGE(DeviceNum, "%s", toString(DeviceOrErr.takeError()).c_str());
+
+ return DeviceOrErr->getMaxSharedTeamMemory();
+}
+
EXTERN void *omp_target_alloc(size_t Size, int DeviceNum) {
TIMESCOPE_WITH_DETAILS("dst_dev=" + std::to_string(DeviceNum) +
";size=" + std::to_string(Size));
diff --git a/offload/libomptarget/device.cpp b/offload/libomptarget/device.cpp
index f88e30ae9e76b..31bfc7d092424 100644
--- a/offload/libomptarget/device.cpp
+++ b/offload/libomptarget/device.cpp
@@ -281,3 +281,9 @@ bool DeviceTy::useAutoZeroCopy() {
return false;
return RTL->use_auto_zero_copy(RTLDeviceID);
}
+
+uint64_t DeviceTy::getMaxSharedTeamMemory() {
+ using DeviceQueryKind = llvm::omp::target::plugin::DeviceQueryKind;
+ return RTL->query_device_info(
+ RTLDeviceID, DeviceQueryKind::DEVICE_QUERY_MAX_SHARED_TEAM_MEM);
+}
diff --git a/offload/libomptarget/exports b/offload/libomptarget/exports
index 2406776c1fb5f..b5a1401564d58 100644
--- a/offload/libomptarget/exports
+++ b/offload/libomptarget/exports
@@ -40,6 +40,7 @@ VERS1.0 {
omp_get_num_devices;
omp_get_device_num;
omp_get_initial_device;
+ omp_get_groupprivate_limit;
omp_target_alloc;
omp_target_free;
omp_target_is_present;
diff --git a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h
index 3117763e35896..2cf156e576c5f 100644
--- a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h
+++ b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h
@@ -52,6 +52,7 @@ typedef enum {
HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE = 6,
HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT = 7,
HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL = 15,
+ HSA_AMD_MEMORY_POOL_INFO_ALLOC_MAX_SIZE = 16,
} hsa_amd_memory_pool_info_t;
typedef enum {
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index 12c7cc62905c9..fa373c2029f0c 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -273,7 +273,6 @@ struct AMDGPUMemoryPoolTy {
if (auto Err = getAttr(HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, GlobalFlags))
return Err;
-
return Plugin::success();
}
@@ -543,6 +542,8 @@ struct AMDGPUKernelTy : public GenericKernelTy {
return Err;
}
+ StaticBlockMemSize = GroupSize;
+
// Make sure it is a kernel symbol.
if (SymbolType != HSA_SYMBOL_KIND_KERNEL)
return Plugin::error(ErrorCode::INVALID_BINARY,
@@ -566,8 +567,8 @@ struct AMDGPUKernelTy : public GenericKernelTy {
/// Launch the AMDGPU kernel function.
Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads[3],
- uint32_t NumBlocks[3], KernelArgsTy &KernelArgs,
- KernelLaunchParamsTy LaunchParams,
+ uint32_t NumBlocks[3], uint32_t DynBlockMemSize,
+ KernelArgsTy &KernelArgs, KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
/// Print more elaborate kernel launch info for AMDGPU
@@ -2020,6 +2021,20 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
if (auto Err = checkIfAPU())
return Err;
+ // Retrieve the size of the group memory pool.
+ for (const auto *Pool : AllMemoryPools) {
+ if (Pool->isGroup()) {
+ size_t Size = 0;
+ if (auto Err = Pool->getAttr(HSA_AMD_MEMORY_POOL_INFO_SIZE, Size))
+ return Err;
+ MaxBlockSharedMemSize = Size;
+ break;
+ }
+ }
+
+ // Supports block shared memory natively.
+ HasNativeBlockSharedMem = true;
+
return Plugin::success();
}
@@ -2856,7 +2871,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
KernelArgsTy KernelArgs = {};
uint32_t NumBlocksAndThreads[3] = {1u, 1u, 1u};
if (auto Err = AMDGPUKernel.launchImpl(
- *this, NumBlocksAndThreads, NumBlocksAndThreads, KernelArgs,
+ *this, NumBlocksAndThreads, NumBlocksAndThreads, 0, KernelArgs,
KernelLaunchParamsTy{}, AsyncInfoWrapper))
return Err;
@@ -3357,6 +3372,7 @@ struct AMDGPUPluginTy final : public GenericPluginTy {
Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
uint32_t NumThreads[3], uint32_t NumBlocks[3],
+ uint32_t DynBlockMemSize,
KernelArgsTy &KernelArgs,
KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const {
@@ -3374,13 +3390,6 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
if (auto Err = ArgsMemoryManager.allocate(ArgsSize, &AllArgs))
return Err;
- // Account for user requested dynamic shared memory.
- uint32_t GroupSize = getGroupSize();
- if (uint32_t MaxDynCGroupMem = std::max(
- KernelArgs.DynCGroupMem, GenericDevice.getDynamicMemorySize())) {
- GroupSize += MaxDynCGroupMem;
- }
-
uint64_t StackSize;
if (auto Err = GenericDevice.getDeviceStackSize(StackSize))
return Err;
@@ -3434,7 +3443,8 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
// Push the kernel launch into the stream.
return Stream->pushKernelLaunch(*this, AllArgs, NumThreads, NumBlocks,
- GroupSize, StackSize, ArgsMemoryManager);
+ getStaticBlockMemSize() + DynBlockMemSize,
+ StackSize, ArgsMemoryManager);
}
Error AMDGPUKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index 162b149ab483e..3357ccfe0c9b5 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -226,6 +226,10 @@ struct InfoTreeNode {
}
};
+enum class DeviceQueryKind {
+ DEVICE_QUERY_MAX_SHARED_TEAM_MEM = 0,
+};
+
/// Class wrapping a __tgt_device_image and its offload entry table on a
/// specific device. This class is responsible for storing and managing
/// the offload entries for an image on a device.
@@ -312,13 +316,16 @@ struct GenericKernelTy {
AsyncInfoWrapperTy &AsyncInfoWrapper) const;
virtual Error launchImpl(GenericDeviceTy &GenericDevice,
uint32_t NumThreads[3], uint32_t NumBlocks[3],
- KernelArgsTy &KernelArgs,
+ uint32_t DynBlockMemSize, KernelArgsTy &KernelArgs,
KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const = 0;
/// Get the kernel name.
const char *getName() const { return Name.c_str(); }
+ /// Get the size of the static per-block memory consumed by the kernel.
+ uint32_t getStaticBlockMemSize() const { return StaticBlockMemSize; };
+
/// Get the kernel image.
DeviceImageTy &getImage() const {
assert(ImagePtr && "Kernel is not initialized!");
@@ -331,9 +338,9 @@ struct GenericKernelTy {
}
/// Return a device pointer to a new kernel launch environment.
- Expected<KernelLaunchEnvironmentTy *>
- getKernelLaunchEnvironment(GenericDeviceTy &GenericDevice, uint32_t Version,
- AsyncInfoWrapperTy &AsyncInfo) const;
+ Expected<KernelLaunchEnvironmentTy *> getKernelLaunchEnvironment(
+ GenericDeviceTy &GenericDevice, const KernelArgsTy &KernelArgs,
+ void *FallbackBlockMem, AsyncInfoWrapperTy &AsyncInfo) const;
/// Indicate whether an execution mode is valid.
static bool isValidExecutionMode(OMPTgtExecModeFlags ExecutionMode) {
@@ -425,6 +432,9 @@ struct GenericKernelTy {
/// The maximum number of threads which the kernel could leverage.
uint32_t MaxNumThreads;
+ /// The static memory size per block.
+ uint32_t StaticBlockMemSize = 0;
+
/// The kernel environment, including execution flags.
KernelEnvironmentTy KernelEnvironment;
@@ -731,6 +741,12 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
/// this id is not unique between different plugins; they may overlap.
int32_t getDeviceId() const { return DeviceId; }
+ /// Get the total shared memory per block that can be used in any kernel.
+ uint32_t getMaxBlockSharedMemSize() const { return MaxBlockSharedMemSize; }
+
+ /// Indicate whether the device has native block shared memory.
+ bool hasNativeBlockSharedMem() const { return HasNativeBlockSharedMem; }
+
/// Set the context of the device if needed, before calling device-specific
/// functions. Plugins may implement this function as a no-op if not needed.
virtual Error setContext() = 0;
@@ -1132,6 +1148,12 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
std::atomic<bool> OmptInitialized;
#endif
+ /// The total per-block shared memory that a kernel may use.
+ uint32_t MaxBlockSharedMemSize = 0;
+
+ /// Whether the device has native block shared memory.
+ bool HasNativeBlockSharedMem = false;
+
private:
DeviceMemoryPoolTy DeviceMemoryPool = {nullptr, 0};
DeviceMemoryPoolTrackingTy DeviceMemoryPoolTracking = {0, 0, ~0U, 0};
@@ -1347,6 +1369,9 @@ struct GenericPluginTy {
/// Prints information about the given devices supported by the plugin.
void print_device_info(int32_t DeviceId);
+ /// Retrieve information about the given device.
+ int64_t query_device_info(int32_t DeviceId, DeviceQueryKind Query);
+
/// Creates an event in the given plugin if supported.
int32_t create_event(int32_t DeviceId, void **EventPtr);
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index 81b9d423e13d8..2997585e1660f 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -477,20 +477,20 @@ Error GenericKernelTy::init(GenericDeviceTy &GenericDevice,
Expected<KernelLaunchEnvironmentTy *>
GenericKernelTy::getKernelLaunchEnvironment(
- GenericDeviceTy &GenericDevice, uint32_t Version,
- AsyncInfoWrapperTy &AsyncInfoWrapper) const {
+ GenericDeviceTy &GenericDevice, const KernelArgsTy &KernelArgs,
+ void *FallbackBlockMem, AsyncInfoWrapperTy &AsyncInfoWrapper) const {
// Ctor/Dtor have no arguments, replaying uses the original kernel launch
// environment. Older versions of the compiler do not generate a kernel
// launch environment.
if (GenericDevice.Plugin.getRecordReplay().isReplaying() ||
- Version < OMP_KERNEL_ARG_MIN_VERSION_WITH_DYN_PTR)
+ KernelArgs.Version < OMP_KERNEL_ARG_MIN_VERSION_WITH_DYN_PTR)
return nullptr;
- if (!KernelEnvironment.Configuration.ReductionDataSize ||
- !KernelEnvironment.Configuration.ReductionBufferLength)
+ if ((!KernelEnvironment.Configuration.ReductionDataSize ||
+ !KernelEnvironment.Configuration.ReductionBufferLength) &&
+ KernelArgs.DynCGroupMem == 0)
return reinterpret_cast<KernelLaunchEnvironmentTy *>(~0);
- // TODO: Check if the kernel needs a launch environment.
auto AllocOrErr = GenericDevice.dataAlloc(sizeof(KernelLaunchEnvironmentTy),
/*HostPtr=*/nullptr,
TargetAllocTy::TARGET_ALLOC_DEVICE);
@@ -504,7 +504,9 @@ GenericKernelTy::getKernelLaunchEnvironment(
/// async data transfer.
auto &LocalKLE = (*AsyncInfoWrapper).KernelLaunchEnvironment;
LocalKLE = KernelLaunchEnvironment;
- {
+
+ if (KernelEnvironment.Configuration.ReductionDataSize &&
+ KernelEnvironment.Configuration.ReductionBufferLength) {
auto AllocOrErr = GenericDevice.dataAlloc(
KernelEnvironment.Configuration.ReductionDataSize *
KernelEnvironment.Configuration.ReductionBufferLength,
@@ -514,8 +516,13 @@ GenericKernelTy::getKernelLaunchEnvironment(
LocalKLE.ReductionBuffer = *AllocOrErr;
// Remember to free the memory later.
AsyncInfoWrapper.freeAllocationAfterSynchronization(*AllocOrErr);
+ } else {
+ LocalKLE.ReductionBuffer = nullptr;
}
+ LocalKLE.DynCGroupMemSize = KernelArgs.DynCGroupMem;
+ LocalKLE.DynCGroupMemFallback = FallbackBlockMem;
+
INFO(OMP_INFOTYPE_DATA_TRANSFER, GenericDevice.getDeviceId(),
"Copying data from host to device, HstPtr=" DPxMOD ", TgtPtr=" DPxMOD
", Size=%" PRId64 ", Name=KernelLaunchEnv\n",
@@ -556,8 +563,45 @@ Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs,
llvm::SmallVector<void *, 16> Args;
llvm::SmallVector<void *, 16> Ptrs;
+ uint32_t NumThreads[3] = {KernelArgs.ThreadLimit[0],
+ KernelArgs.ThreadLimit[1],
+ KernelArgs.ThreadLimit[2]};
+ uint32_t NumBlocks[3] = {KernelArgs.NumTeams[0], KernelArgs.NumTeams[1],
+ KernelArgs.NumTeams[2]};
+ if (!isBareMode()) {
+ NumThreads[0] = getNumThreads(GenericDevice, NumThreads);
+ NumBlocks[0] = getNumBlocks(GenericDevice, NumBlocks, KernelArgs.Tripcount,
+ NumThreads[0], KernelArgs.ThreadLimit[0] > 0);
+ }
+
+ uint32_t MaxBlockMemSize = GenericDevice.getMaxBlockSharedMemSize();
+ uint32_t DynBlockMemSize = KernelArgs.DynCGroupMem;
+ uint32_t TotalBlockMemSize = StaticBlockMemSize + DynBlockMemSize;
+ if (StaticBlockMemSize > MaxBlockMemSize)
+ return Plugin::error(ErrorCode::INVALID_ARGUMENT,
+ "Static block memory size exceeds maximum");
+ else if (!KernelArgs.Flags.AllowDynCGroupMemFallback &&
+ TotalBlockMemSize > MaxBlockMemSize)
+ return Plugin::error(
+ ErrorCode::INVALID_ARGUMENT,
+ "Static and dynamic block memory size exceeds maximum");
+
+ void *FallbackBlockMem = nullptr;
+ if (DynBlockMemSize && (!GenericDevice.hasNativeBlockSharedMem() ||
+ TotalBlockMemSize > MaxBlockMemSize)) {
+ auto AllocOrErr = GenericDevice.dataAlloc(
+ NumBlocks[0] * DynBlockMemSize,
+ /*HostPtr=*/nullptr, TargetAllocTy::TARGET_ALLOC_DEVICE);
+ if (!AllocOrErr)
+ return AllocOrErr.takeError();
+
+ FallbackBlockMem = *AllocOrErr;
+ AsyncInfoWrapper.freeAllocationAfterSynchronization(FallbackBlockMem);
+ DynBlockMemSize = 0;
+ }
+
auto KernelLaunchEnvOrErr = getKernelLaunchEnvironment(
- GenericDevice, KernelArgs.Version, AsyncInfoWrapper);
+ GenericDevice, KernelArgs, FallbackBlockMem, AsyncInfoWrapper);
if (!KernelLaunchEnvOrErr)
return KernelLaunchEnvOrErr.takeError();
@@ -573,17 +617,6 @@ Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs,
Args, Ptrs, *KernelLaunchEnvOrErr);
}
- uint32_t NumThreads[3] = {KernelArgs.ThreadLimit[0],
- KernelArgs.ThreadLimit[1],
- KernelArgs.ThreadLimit[2]};
- uint32_t NumBlocks[3] = {KernelArgs.NumTeams[0], KernelArgs.NumTeams[1],
- KernelArgs.NumTeams[2]};
- if (!isBareMode()) {
- NumThreads[0] = getNumThreads(GenericDevice, NumThreads);
- NumBlocks[0] = getNumBlocks(GenericDevice, NumBlocks, KernelArgs.Tripcount,
- NumThreads[0], KernelArgs.ThreadLimit[0] > 0);
- }
-
// Record the kernel description after we modified the argument count and num
// blocks/threads.
RecordReplayTy &RecordReplay = GenericDevice.Plugin.getRecordReplay();
@@ -599,8 +632,8 @@ Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs,
printLaunchInfo(GenericDevice, KernelArgs, NumThreads, NumBlocks))
return Err;
- return launchImpl(GenericDevice, NumThreads, NumBlocks, KernelArgs,
- LaunchParams, AsyncInfoWrapper);
+ return launchImpl(GenericDevice, NumThreads, NumBlocks, DynBlockMemSize,
+ KernelArgs, LaunchParams, AsyncInfoWrapper);
}
KernelLaunchParamsTy GenericKernelTy::prepareArgs(
@@ -2077,6 +2110,17 @@ void GenericPluginTy::print_device_info(int32_t DeviceId) {
toString(std::move(Err)).data());
}
+int64_t GenericPluginTy::query_device_info(int32_t DeviceId,
+ DeviceQueryKind Query) {
+ const GenericDeviceTy &Device = getDevice(DeviceId);
+
+ switch (Query) {
+ case DeviceQueryKind::DEVICE_QUERY_MAX_SHARED_TEAM_MEM:
+ return Device.getMaxBlockSharedMemSize();
+ }
+ return 0;
+}
+
int32_t GenericPluginTy::create_event(int32_t DeviceId, void **EventPtr) {
auto Err = getDevice(DeviceId).createEvent(EventPtr);
if (Err) {
diff --git a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
index b6c022c8e7e8b..b6e087edea876 100644
--- a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
+++ b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
@@ -258,6 +258,7 @@ typedef enum CUdevice_attribute_enum {
typedef enum CUfunction_attribute_enum {
CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0,
+ CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES = 1,
CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES = 8,
} CUfunction_attribute;
diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp
index 15193de6ae430..eda7a85f750f0 100644
--- a/offload/plugins-nextgen/cuda/src/rtl.cpp
+++ b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -148,13 +148,21 @@ struct CUDAKernelTy : public GenericKernelTy {
// The maximum number of threads cannot exceed the maximum of the kernel.
MaxNumThreads = std::min(MaxNumThreads, (uint32_t)MaxThreads);
+ int SharedMemSize;
+ Res = cuFuncGetAttribute(&SharedMemSize,
+ CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, Func);
+ if (auto Err = Plugin::check(Res, "Error in cuFuncGetAttribute: %s"))
+ return Err;
+
+ StaticBlockMemSize = SharedMemSize;
+
return Plugin::success();
}
/// Launch the CUDA kernel function.
Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads[3],
- uint32_t NumBlocks[3], KernelArgsTy &KernelArgs,
- KernelLaunchParamsTy LaunchParams,
+ uint32_t NumBlocks[3], uint32_t DynBlockMemSize,
+ KernelArgsTy &KernelArgs, KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
private:
@@ -162,7 +170,7 @@ struct CUDAKernelTy : public GenericKernelTy {
CUfunction Func;
/// The maximum amount of dynamic shared memory per thread group. By default,
/// this is set to 48 KB.
- mutable uint32_t MaxDynCGroupMemLimit = 49152;
+ mutable uint32_t MaxDynBlockMemSize = 49152;
};
/// Class wrapping a CUDA stream reference. These are the objects handled by the
@@ -358,6 +366,15 @@ struct CUDADeviceTy : public GenericDeviceTy {
return Err;
HardwareParallelism = NumMuliprocessors * (MaxThreadsPerSM / WarpSize);
+ uint32_t MaxSharedMem;
+ if (auto Err = getDeviceAttr(
+ CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, MaxSharedMem))
+ return Err;
+ MaxBlockSharedMemSize = MaxSharedMem;
+
+ // Supports block shared memory natively.
+ HasNativeBlockSharedMem = true;
+
return Plugin::success();
}
@@ -1239,7 +1256,7 @@ struct CUDADeviceTy : public GenericDeviceTy {
KernelArgsTy KernelArgs = {};
uint32_t NumBlocksAndThreads[3] = {1u, 1u, 1u};
if (auto Err = CUDAKernel.launchImpl(
- *this, NumBlocksAndThreads, NumBlocksAndThreads, KernelArgs,
+ *this, NumBlocksAndThreads, NumBlocksAndThreads, 0, KernelArgs,
KernelLaunchParamsTy{}, AsyncInfoWrapper))
return Err;
@@ -1285,6 +1302,7 @@ struct CUDADeviceTy : public GenericDeviceTy {
Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
uint32_t NumThreads[3], uint32_t NumBlocks[3],
+ uint32_t DynBlockMemSize,
KernelArgsTy &KernelArgs,
KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const {
@@ -1294,9 +1312,6 @@ Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
if (auto Err = CUDADevice.getStream(AsyncInfoWrapper, Stream))
return Err;
- uint32_t MaxDynCGroupMem =
- std::max(KernelArgs.DynCGroupMem, GenericDevice.getDynamicMemorySize());
-
void *Config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, LaunchParams.Data,
CU_LAUNCH_PARAM_BUFFER_SIZE,
reinterpret_cast<void *>(&LaunchParams.Size),
@@ -1308,18 +1323,18 @@ Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
GenericDevice.Plugin.getRPCServer().Thread->notify();
// In case we require more memory than the current limit.
- if (MaxDynCGroupMem >= MaxDynCGroupMemLimit) {
+ if (DynBlockMemSize >= MaxDynBlockMemSize) {
CUresult AttrResult = cuFuncSetAttribute(
- Func, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, MaxDynCGroupMem);
+ Func, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, DynBlockMemSize);
Plugin::check(
AttrResult,
"Error in cuLaunchKernel while setting the memory limits: %s");
- MaxDynCGroupMemLimit = MaxDynCGroupMem;
+ MaxDynBlockMemSize = DynBlockMemSize;
}
CUresult Res = cuLaunchKernel(Func, NumBlocks[0], NumBlocks[1], NumBlocks[2],
NumThreads[0], NumThreads[1], NumThreads[2],
- MaxDynCGroupMem, Stream, nullptr, Config);
+ DynBlockMemSize, Stream, nullptr, Config);
// Register a callback to indicate when the kernel is complete.
if (GenericDevice.getRPCServer())
diff --git a/offload/plugins-nextgen/host/src/rtl.cpp b/offload/plugins-nextgen/host/src/rtl.cpp
index d950572265b4c..dc82a2ef16e51 100644
--- a/offload/plugins-nextgen/host/src/rtl.cpp
+++ b/offload/plugins-nextgen/host/src/rtl.cpp
@@ -92,8 +92,8 @@ struct GenELF64KernelTy : public GenericKernelTy {
/// Launch the kernel using the libffi.
Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads[3],
- uint32_t NumBlocks[3], KernelArgsTy &KernelArgs,
- KernelLaunchParamsTy LaunchParams,
+ uint32_t NumBlocks[3], uint32_t DynBlockMemSize,
+ KernelArgsTy &KernelArgs, KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const override {
// Create a vector of ffi_types, one per argument.
SmallVector<ffi_type *, 16> ArgTypes(KernelArgs.NumArgs, &ffi_type_pointer);
diff --git a/offload/test/offloading/dyn_groupprivate_strict.cpp b/offload/test/offloading/dyn_groupprivate_strict.cpp
new file mode 100644
index 0000000000000..a35f8dd2b0595
--- /dev/null
+++ b/offload/test/offloading/dyn_groupprivate_strict.cpp
@@ -0,0 +1,141 @@
+// RUN: %libomptarget-compilexx-run-and-check-generic
+// REQUIRES: gpu
+
+#include <omp.h>
+#include <stdio.h>
+
+#define N 512
+
+int main() {
+ int Result[N], NumThreads;
+
+#pragma omp target teams num_teams(1) thread_limit(N) \
+ dyn_groupprivate(strict : N * sizeof(Result[0])) \
+ map(from : Result, NumThreads)
+ {
+ int Buffer[N];
+#pragma omp parallel
+ {
+ int *DynBuffer = (int *)omp_get_dyn_groupprivate_ptr();
+ int TId = omp_get_thread_num();
+ if (TId == 0)
+ NumThreads = omp_get_num_threads();
+ Buffer[TId] = 7;
+ DynBuffer[TId] = 3;
+#pragma omp barrier
+ int WrappedTId = (TId + 37) % NumThreads;
+ Result[TId] = Buffer[WrappedTId] + DynBuffer[WrappedTId];
+ }
+ }
+
+ if (NumThreads < N / 2 || NumThreads > N) {
+ printf("Expected number of threads to be in [%i:%i], but got: %i", N / 2, N,
+ NumThreads);
+ return -1;
+ }
+
+ int Failed = 0;
+ for (int i = 0; i < NumThreads; ++i) {
+ if (Result[i] != 7 + 3) {
+ printf("Result[%i] is %i, expected %i\n", i, Result[i], 7 + 3);
+ ++Failed;
+ }
+ }
+
+ // Verify that the routines return NULL and zero on the host.
+ if (omp_get_dyn_groupprivate_ptr())
+ ++Failed;
+ if (omp_get_dyn_groupprivate_size())
+ ++Failed;
+
+ size_t MaxSize = omp_get_groupprivate_limit(0, omp_access_cgroup);
+ size_t ExceededSize = MaxSize + 10;
+
+// Verify that the fallback modifier works.
+#pragma omp target dyn_groupprivate(fallback : ExceededSize) \
+ map(tofrom : Failed)
+ {
+ int IsFallback;
+ if (!omp_get_dyn_groupprivate_ptr(0, &IsFallback))
+ ++Failed;
+ if (!omp_get_dyn_groupprivate_size())
+ ++Failed;
+ if (omp_get_dyn_groupprivate_size() != ExceededSize)
+ ++Failed;
+ if (!IsFallback)
+ ++Failed;
+ }
+
+// Verify that the default modifier is fallback.
+#pragma omp target dyn_groupprivate(ExceededSize)
+ {
+ }
+
+// Verify that the strict modifier works.
+#pragma omp target dyn_groupprivate(strict : N) map(tofrom : Failed)
+ {
+ int IsFallback;
+ if (!omp_get_dyn_groupprivate_ptr(0, &IsFallback))
+ ++Failed;
+ if (!omp_get_dyn_groupprivate_size())
+ ++Failed;
+ if (omp_get_dyn_groupprivate_size() != N)
+ ++Failed;
+ if (IsFallback)
+ ++Failed;
+ }
+
+// Verify that the fallback does not trigger when not needed.
+#pragma omp target dyn_groupprivate(fallback : N) map(tofrom : Failed)
+ {
+ int IsFallback;
+ if (!omp_get_dyn_groupprivate_ptr(0, &IsFallback))
+ ++Failed;
+ if (!omp_get_dyn_groupprivate_size())
+ ++Failed;
+ if (omp_get_dyn_groupprivate_size() != N)
+ ++Failed;
+ if (IsFallback)
+ ++Failed;
+ }
+
+// Verify that the clause works when passing a zero size.
+#pragma omp target dyn_groupprivate(strict : 0) map(tofrom : Failed)
+ {
+ int IsFallback;
+ if (omp_get_dyn_groupprivate_ptr(0, &IsFallback))
+ ++Failed;
+ if (omp_get_dyn_groupprivate_size())
+ ++Failed;
+ if (IsFallback)
+ ++Failed;
+ }
+
+// Verify that the clause works when passing a zero size.
+#pragma omp target dyn_groupprivate(fallback : 0) map(tofrom : Failed)
+ {
+ int IsFallback;
+ if (omp_get_dyn_groupprivate_ptr(0, &IsFallback))
+ ++Failed;
+ if (omp_get_dyn_groupprivate_size())
+ ++Failed;
+ if (IsFallback)
+ ++Failed;
+ }
+
+// Verify that omitting the clause is the same as setting zero size.
+#pragma omp target map(tofrom : Failed)
+ {
+ int IsFallback;
+ if (omp_get_dyn_groupprivate_ptr(0, &IsFallback))
+ ++Failed;
+ if (omp_get_dyn_groupprivate_size())
+ ++Failed;
+ if (IsFallback)
+ ++Failed;
+ }
+
+ // CHECK: PASS
+ if (!Failed)
+ printf("PASS\n");
+}
diff --git a/openmp/runtime/src/include/omp.h.var b/openmp/runtime/src/include/omp.h.var
index 74f385feb3ea5..26c3df56a9ce3 100644
--- a/openmp/runtime/src/include/omp.h.var
+++ b/openmp/runtime/src/include/omp.h.var
@@ -380,6 +380,10 @@
omp_uintptr_t value;
} omp_alloctrait_t;
+ typedef enum {
+ omp_access_cgroup = 0,
+ } omp_access_t;
+
# if defined(_WIN32)
// On Windows cl and icl do not support 64-bit enum, let's use integer then.
typedef omp_uintptr_t omp_allocator_handle_t;
@@ -463,6 +467,9 @@
omp_allocator_handle_t allocator = omp_null_allocator,
omp_allocator_handle_t free_allocator = omp_null_allocator);
extern void __KAI_KMPC_CONVENTION omp_free(void * ptr, omp_allocator_handle_t a = omp_null_allocator);
+ extern void *__KAI_KMPC_CONVENTION omp_get_dyn_groupprivate_ptr(size_t offset = 0, int *is_fallback = NULL, omp_access_t access_group = omp_access_cgroup);
+ extern size_t __KAI_KMPC_CONVENTION omp_get_dyn_groupprivate_size(omp_access_t access_group = omp_access_cgroup);
+ extern size_t __KAI_KMPC_CONVENTION omp_get_groupprivate_limit(int device_num, omp_access_t access_group = omp_access_cgroup);
# else
extern void *__KAI_KMPC_CONVENTION omp_alloc(size_t size, omp_allocator_handle_t a);
extern void *__KAI_KMPC_CONVENTION omp_aligned_alloc(size_t align, size_t size,
@@ -473,6 +480,9 @@
extern void *__KAI_KMPC_CONVENTION omp_realloc(void *ptr, size_t size, omp_allocator_handle_t allocator,
omp_allocator_handle_t free_allocator);
extern void __KAI_KMPC_CONVENTION omp_free(void *ptr, omp_allocator_handle_t a);
+ extern void *__KAI_KMPC_CONVENTION omp_get_dyn_groupprivate_ptr(size_t offset, int *is_fallback, omp_access_t access_group);
+ extern size_t __KAI_KMPC_CONVENTION omp_get_dyn_groupprivate_size(omp_access_t access_group);
+ extern size_t __KAI_KMPC_CONVENTION omp_get_groupprivate_limit(int device_num, omp_access_t access_group);
# endif
/* OpenMP TR11 routines to get memory spaces and allocators */
diff --git a/openmp/runtime/src/kmp_csupport.cpp b/openmp/runtime/src/kmp_csupport.cpp
index 3ca32ba583fe2..9605bad457e11 100644
--- a/openmp/runtime/src/kmp_csupport.cpp
+++ b/openmp/runtime/src/kmp_csupport.cpp
@@ -4515,6 +4515,15 @@ void omp_free(void *ptr, omp_allocator_handle_t allocator) {
}
/* end of OpenMP 5.1 Memory Management routines */
+void *omp_get_dyn_groupprivate_ptr(size_t offset, int *is_fallback,
+ omp_access_t access_group) {
+ if (is_fallback != NULL)
+ *is_fallback = 0;
+ return NULL;
+}
+
+size_t omp_get_dyn_groupprivate_size(omp_access_t access_group) { return 0; }
+
int __kmpc_get_target_offload(void) {
if (!__kmp_init_serial) {
__kmp_serial_initialize();
diff --git a/openmp/runtime/src/kmp_stub.cpp b/openmp/runtime/src/kmp_stub.cpp
index 06276d1bed1c7..a099f887b6ba4 100644
--- a/openmp/runtime/src/kmp_stub.cpp
+++ b/openmp/runtime/src/kmp_stub.cpp
@@ -454,6 +454,22 @@ void omp_free(void *ptr, omp_allocator_handle_t allocator) {
#endif
}
+void *omp_get_dyn_groupprivate_ptr(size_t offset, int *is_fallback,
+ omp_access_t access_group) {
+ i;
+ return NULL;
+}
+
+size_t omp_get_dyn_groupprivate_size(omp_access_t access_group) {
+ i;
+ return 0;
+}
+
+size_t omp_get_groupprivate_limit(int device_num, omp_access_t access_group) {
+ i;
+ return 0;
+}
+
/* OpenMP 5.0 Affinity Format */
void omp_set_affinity_format(char const *format) { i; }
size_t omp_get_affinity_format(char *buffer, size_t size) {
From f20f4ba2290c0966e86dae733ad025d2fb0995d2 Mon Sep 17 00:00:00 2001
From: Kevin Sala <salapenades1 at llnl.gov>
Date: Sat, 9 Aug 2025 22:50:23 -0700
Subject: [PATCH 2/2] Add fixes
---
offload/DeviceRTL/include/DeviceTypes.h | 5 +++++
offload/DeviceRTL/src/State.cpp | 19 ++++++++++---------
offload/include/omptarget.h | 15 +++++++++++----
offload/plugins-nextgen/amdgpu/src/rtl.cpp | 4 ++++
offload/plugins-nextgen/cuda/src/rtl.cpp | 4 ++++
openmp/runtime/src/kmp_csupport.cpp | 2 +-
6 files changed, 35 insertions(+), 14 deletions(-)
diff --git a/offload/DeviceRTL/include/DeviceTypes.h b/offload/DeviceRTL/include/DeviceTypes.h
index a43b506d6879e..042fef45917b0 100644
--- a/offload/DeviceRTL/include/DeviceTypes.h
+++ b/offload/DeviceRTL/include/DeviceTypes.h
@@ -163,8 +163,13 @@ typedef enum omp_allocator_handle_t {
///}
+/// The OpenMP access group type: the criterion for grouping tasks using a
+/// specific grouping property.
enum omp_access_t {
+ /// Groups the tasks based on the contention group to which they belong.
omp_access_cgroup = 0,
+ /// Groups the tasks based on the parallel region to which they bind.
+ omp_access_pteam = 1,
};
#endif
diff --git a/offload/DeviceRTL/src/State.cpp b/offload/DeviceRTL/src/State.cpp
index 9e2a9999167b4..c6bc6a140f5f2 100644
--- a/offload/DeviceRTL/src/State.cpp
+++ b/offload/DeviceRTL/src/State.cpp
@@ -163,14 +163,15 @@ struct DynCGroupMemTy {
Size = 0;
Ptr = nullptr;
IsFallback = false;
- if (KLE) {
- Size = KLE->DynCGroupMemSize;
- if (void *Fallback = KLE->DynCGroupMemFallback) {
- Ptr = static_cast<char *>(Fallback) + Size * omp_get_team_num();
- IsFallback = true;
- } else {
- Ptr = static_cast<char *>(NativeDynCGroup);
- }
+ if (!KLE)
+ return;
+
+ Size = KLE->DynCGroupMemSize;
+ if (void *Fallback = KLE->DynCGroupMemFallback) {
+ Ptr = static_cast<char *>(Fallback) + Size * omp_get_team_num();
+ IsFallback = true;
+ } else {
+ Ptr = static_cast<char *>(NativeDynCGroup);
}
}
@@ -466,7 +467,7 @@ int omp_is_initial_device(void) { return 0; }
void *omp_get_dyn_groupprivate_ptr(size_t Offset, int *IsFallback,
omp_access_t) {
- if (IsFallback != NULL)
+ if (IsFallback != nullptr)
*IsFallback = DynCGroupMem.isFallback();
return DynCGroupMem.getPtr(Offset);
}
diff --git a/offload/include/omptarget.h b/offload/include/omptarget.h
index 45bb74ec367d6..ddb0f7f88d2e0 100644
--- a/offload/include/omptarget.h
+++ b/offload/include/omptarget.h
@@ -273,15 +273,22 @@ struct __tgt_target_non_contig {
extern "C" {
#endif
-typedef enum {
- omp_access_cgroup = 0,
-} omp_access_t;
+/// The OpenMP access group type: the criterion for grouping tasks using a
+/// specific grouping property.
+enum omp_access_t {
+ /// Groups the tasks based on the contention group to which they belong.
+ omp_access_cgroup = 0,
+ /// Groups the tasks based on the parallel region to which they bind.
+ omp_access_pteam = 1,
+};
void ompx_dump_mapping_tables(void);
int omp_get_num_devices(void);
int omp_get_device_num(void);
int omp_get_initial_device(void);
-size_t omp_get_groupprivate_limit(int device_num, omp_access_t access_group = omp_access_cgroup);
+size_t
+omp_get_groupprivate_limit(int device_num,
+ omp_access_t access_group = omp_access_cgroup);
void *omp_target_alloc(size_t Size, int DeviceNum);
void omp_target_free(void *DevicePtr, int DeviceNum);
int omp_target_is_present(const void *Ptr, int DeviceNum);
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index fa373c2029f0c..9751169b09c60 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -3441,6 +3441,10 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
KernelArgs.DynCGroupMem);
}
+ // Increase the dynamic memory size to the device-requested value if needed.
+ DynBlockMemSize =
+ std::max(DynBlockMemSize, GenericDevice.getDynamicMemorySize());
+
// Push the kernel launch into the stream.
return Stream->pushKernelLaunch(*this, AllArgs, NumThreads, NumBlocks,
getStaticBlockMemSize() + DynBlockMemSize,
diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp
index eda7a85f750f0..b052197e2aa6a 100644
--- a/offload/plugins-nextgen/cuda/src/rtl.cpp
+++ b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -1322,6 +1322,10 @@ Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
if (GenericDevice.getRPCServer())
GenericDevice.Plugin.getRPCServer().Thread->notify();
+ // Increase the dynamic memory size to the device-requested value if needed.
+ DynBlockMemSize =
+ std::max(DynBlockMemSize, GenericDevice.getDynamicMemorySize());
+
// In case we require more memory than the current limit.
if (DynBlockMemSize >= MaxDynBlockMemSize) {
CUresult AttrResult = cuFuncSetAttribute(
diff --git a/openmp/runtime/src/kmp_csupport.cpp b/openmp/runtime/src/kmp_csupport.cpp
index 9605bad457e11..3ac62e5893f8b 100644
--- a/openmp/runtime/src/kmp_csupport.cpp
+++ b/openmp/runtime/src/kmp_csupport.cpp
@@ -4517,7 +4517,7 @@ void omp_free(void *ptr, omp_allocator_handle_t allocator) {
void *omp_get_dyn_groupprivate_ptr(size_t offset, int *is_fallback,
omp_access_t access_group) {
- if (is_fallback != NULL)
+ if (is_fallback != nullptr)
*is_fallback = 0;
return NULL;
}