[llvm-branch-commits] [llvm] [openmp] [OpenMP][Offload] Add offload runtime support for dyn_groupprivate clause (PR #152831)
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Fri Aug 8 22:48:57 PDT 2025
llvmbot wrote:
@llvm/pr-subscribers-offload
Author: Kevin Sala Penades (kevinsala)
Part 2: adds offload runtime support for the `dyn_groupprivate` clause. See https://github.com/llvm/llvm-project/pull/152651.
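For context, the user-facing entry points this patch wires up can be exercised roughly like the sketch below. This is a minimal illustration based on the declarations visible in the diff (`omp_get_groupprivate_limit`, `omp_get_dyn_groupprivate_ptr`, `omp_get_dyn_groupprivate_size`, `omp_access_cgroup`); the exact `omp.h` prototypes may differ, and requesting the dynamic memory itself goes through the `dyn_groupprivate` clause handled in the companion PR.

```cpp
#include <omp.h>
#include <cstdio>

int main() {
  int Dev = omp_get_default_device();

  // Host side: upper bound on the group-shared memory any team may use.
  size_t Limit = omp_get_groupprivate_limit(Dev, omp_access_cgroup);
  std::printf("groupprivate limit on device %d: %zu bytes\n", Dev, Limit);

#pragma omp target device(Dev)
  {
    int IsFallback = 0;
    // Device side: base pointer and size of this team's dynamic buffer.
    void *Buf = omp_get_dyn_groupprivate_ptr(/*Offset=*/0, &IsFallback,
                                             omp_access_cgroup);
    size_t Size = omp_get_dyn_groupprivate_size(omp_access_cgroup);
    // Size is 0 when no dynamic cgroup memory was requested for this launch;
    // IsFallback reports whether the buffer lives in global-memory fallback
    // storage rather than native shared memory.
    if (Size != 0)
      ((char *)Buf)[0] = 0; // use Buf[0..Size) as team-local scratch
  }
  return 0;
}
```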
---
Patch is 39.36 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/152831.diff
23 Files Affected:
- (modified) offload/DeviceRTL/include/DeviceTypes.h (+4)
- (modified) offload/DeviceRTL/include/Interface.h (+1-1)
- (modified) offload/DeviceRTL/include/State.h (+1-1)
- (modified) offload/DeviceRTL/src/Kernel.cpp (+7-7)
- (modified) offload/DeviceRTL/src/State.cpp (+46-2)
- (modified) offload/include/Shared/APITypes.h (+4-2)
- (modified) offload/include/Shared/Environment.h (+3-1)
- (modified) offload/include/device.h (+3)
- (modified) offload/include/omptarget.h (+6-1)
- (modified) offload/libomptarget/OpenMP/API.cpp (+14)
- (modified) offload/libomptarget/device.cpp (+6)
- (modified) offload/libomptarget/exports (+1)
- (modified) offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h (+1)
- (modified) offload/plugins-nextgen/amdgpu/src/rtl.cpp (+22-12)
- (modified) offload/plugins-nextgen/common/include/PluginInterface.h (+29-4)
- (modified) offload/plugins-nextgen/common/src/PluginInterface.cpp (+65-21)
- (modified) offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h (+1)
- (modified) offload/plugins-nextgen/cuda/src/rtl.cpp (+26-11)
- (modified) offload/plugins-nextgen/host/src/rtl.cpp (+2-2)
- (added) offload/test/offloading/dyn_groupprivate_strict.cpp (+141)
- (modified) openmp/runtime/src/include/omp.h.var (+10)
- (modified) openmp/runtime/src/kmp_csupport.cpp (+9)
- (modified) openmp/runtime/src/kmp_stub.cpp (+16)
``````````diff
diff --git a/offload/DeviceRTL/include/DeviceTypes.h b/offload/DeviceRTL/include/DeviceTypes.h
index 2e5d92380f040..a43b506d6879e 100644
--- a/offload/DeviceRTL/include/DeviceTypes.h
+++ b/offload/DeviceRTL/include/DeviceTypes.h
@@ -163,4 +163,8 @@ typedef enum omp_allocator_handle_t {
///}
+enum omp_access_t {
+ omp_access_cgroup = 0,
+};
+
#endif
diff --git a/offload/DeviceRTL/include/Interface.h b/offload/DeviceRTL/include/Interface.h
index c4bfaaa2404b4..672afea206785 100644
--- a/offload/DeviceRTL/include/Interface.h
+++ b/offload/DeviceRTL/include/Interface.h
@@ -222,7 +222,7 @@ struct KernelEnvironmentTy;
int8_t __kmpc_is_spmd_exec_mode();
int32_t __kmpc_target_init(KernelEnvironmentTy &KernelEnvironment,
- KernelLaunchEnvironmentTy &KernelLaunchEnvironment);
+ KernelLaunchEnvironmentTy *KernelLaunchEnvironment);
void __kmpc_target_deinit();
diff --git a/offload/DeviceRTL/include/State.h b/offload/DeviceRTL/include/State.h
index db396dae6e445..17c3c6f2d3e42 100644
--- a/offload/DeviceRTL/include/State.h
+++ b/offload/DeviceRTL/include/State.h
@@ -116,7 +116,7 @@ extern Local<ThreadStateTy **> ThreadStates;
/// Initialize the state machinery. Must be called by all threads.
void init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
- KernelLaunchEnvironmentTy &KernelLaunchEnvironment);
+ KernelLaunchEnvironmentTy *KernelLaunchEnvironment);
/// Return the kernel and kernel launch environment associated with the current
/// kernel. The former is static and contains compile time information that
diff --git a/offload/DeviceRTL/src/Kernel.cpp b/offload/DeviceRTL/src/Kernel.cpp
index 467e44a65276c..58e9a09105a76 100644
--- a/offload/DeviceRTL/src/Kernel.cpp
+++ b/offload/DeviceRTL/src/Kernel.cpp
@@ -34,8 +34,8 @@ enum OMPTgtExecModeFlags : unsigned char {
};
static void
-inititializeRuntime(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
- KernelLaunchEnvironmentTy &KernelLaunchEnvironment) {
+initializeRuntime(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
+ KernelLaunchEnvironmentTy *KernelLaunchEnvironment) {
// Order is important here.
synchronize::init(IsSPMD);
mapping::init(IsSPMD);
@@ -80,17 +80,17 @@ extern "C" {
/// \param Ident Source location identification, can be NULL.
///
int32_t __kmpc_target_init(KernelEnvironmentTy &KernelEnvironment,
- KernelLaunchEnvironmentTy &KernelLaunchEnvironment) {
+ KernelLaunchEnvironmentTy *KernelLaunchEnvironment) {
ConfigurationEnvironmentTy &Configuration = KernelEnvironment.Configuration;
bool IsSPMD = Configuration.ExecMode & OMP_TGT_EXEC_MODE_SPMD;
bool UseGenericStateMachine = Configuration.UseGenericStateMachine;
if (IsSPMD) {
- inititializeRuntime(/*IsSPMD=*/true, KernelEnvironment,
- KernelLaunchEnvironment);
+ initializeRuntime(/*IsSPMD=*/true, KernelEnvironment,
+ KernelLaunchEnvironment);
synchronize::threadsAligned(atomic::relaxed);
} else {
- inititializeRuntime(/*IsSPMD=*/false, KernelEnvironment,
- KernelLaunchEnvironment);
+ initializeRuntime(/*IsSPMD=*/false, KernelEnvironment,
+ KernelLaunchEnvironment);
// No need to wait since only the main threads will execute user
// code and workers will run into a barrier right away.
}
diff --git a/offload/DeviceRTL/src/State.cpp b/offload/DeviceRTL/src/State.cpp
index 62b03e7bba720..9e2a9999167b4 100644
--- a/offload/DeviceRTL/src/State.cpp
+++ b/offload/DeviceRTL/src/State.cpp
@@ -158,6 +158,34 @@ void SharedMemorySmartStackTy::pop(void *Ptr, uint64_t Bytes) {
memory::freeGlobal(Ptr, "Slow path shared memory deallocation");
}
+struct DynCGroupMemTy {
+ void init(KernelLaunchEnvironmentTy *KLE, void *NativeDynCGroup) {
+ Size = 0;
+ Ptr = nullptr;
+ IsFallback = false;
+ if (KLE) {
+ Size = KLE->DynCGroupMemSize;
+ if (void *Fallback = KLE->DynCGroupMemFallback) {
+ Ptr = static_cast<char *>(Fallback) + Size * omp_get_team_num();
+ IsFallback = true;
+ } else {
+ Ptr = static_cast<char *>(NativeDynCGroup);
+ }
+ }
+ }
+
+ char *getPtr(size_t Offset) const { return Ptr + Offset; }
+ bool isFallback() const { return IsFallback; }
+ size_t getSize() const { return Size; }
+
+private:
+ char *Ptr;
+ size_t Size;
+ bool IsFallback;
+};
+
+[[clang::loader_uninitialized]] static Local<DynCGroupMemTy> DynCGroupMem;
+
} // namespace
void *memory::getDynamicBuffer() { return DynamicSharedBuffer; }
@@ -246,13 +274,18 @@ int returnValIfLevelIsActive(int Level, int Val, int DefaultVal,
} // namespace
void state::init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
- KernelLaunchEnvironmentTy &KernelLaunchEnvironment) {
+ KernelLaunchEnvironmentTy *KLE) {
SharedMemorySmartStack.init(IsSPMD);
+
+ if (KLE == reinterpret_cast<KernelLaunchEnvironmentTy *>(~0))
+ KLE = nullptr;
+
if (mapping::isInitialThreadInLevel0(IsSPMD)) {
+ DynCGroupMem.init(KLE, DynamicSharedBuffer);
TeamState.init(IsSPMD);
ThreadStates = nullptr;
KernelEnvironmentPtr = &KernelEnvironment;
- KernelLaunchEnvironmentPtr = &KernelLaunchEnvironment;
+ KernelLaunchEnvironmentPtr = KLE;
}
}
@@ -430,6 +463,17 @@ int omp_get_team_num() { return mapping::getBlockIdInKernel(); }
int omp_get_initial_device(void) { return -1; }
int omp_is_initial_device(void) { return 0; }
+
+void *omp_get_dyn_groupprivate_ptr(size_t Offset, int *IsFallback,
+ omp_access_t) {
+ if (IsFallback != NULL)
+ *IsFallback = DynCGroupMem.isFallback();
+ return DynCGroupMem.getPtr(Offset);
+}
+
+size_t omp_get_dyn_groupprivate_size(omp_access_t) {
+ return DynCGroupMem.getSize();
+}
}
extern "C" {
diff --git a/offload/include/Shared/APITypes.h b/offload/include/Shared/APITypes.h
index 978b53d5d69b9..0ef2dd162292b 100644
--- a/offload/include/Shared/APITypes.h
+++ b/offload/include/Shared/APITypes.h
@@ -97,8 +97,10 @@ struct KernelArgsTy {
struct {
uint64_t NoWait : 1; // Was this kernel spawned with a `nowait` clause.
uint64_t IsCUDA : 1; // Was this kernel spawned via CUDA.
- uint64_t Unused : 62;
- } Flags = {0, 0, 0};
+ uint64_t AllowDynCGroupMemFallback : 1; // Allow fallback for dynamic cgroup
+ // mem fallback.
+ uint64_t Unused : 61;
+ } Flags = {0, 0, 0, 0};
// The number of teams (for x,y,z dimension).
uint32_t NumTeams[3] = {0, 0, 0};
// The number of threads (for x,y,z dimension).
diff --git a/offload/include/Shared/Environment.h b/offload/include/Shared/Environment.h
index 2a283bd6fa4ed..0670ac1090da4 100644
--- a/offload/include/Shared/Environment.h
+++ b/offload/include/Shared/Environment.h
@@ -93,9 +93,11 @@ struct KernelEnvironmentTy {
};
struct KernelLaunchEnvironmentTy {
+ void *ReductionBuffer = nullptr;
+ void *DynCGroupMemFallback = nullptr;
uint32_t ReductionCnt = 0;
uint32_t ReductionIterCnt = 0;
- void *ReductionBuffer = nullptr;
+ uint32_t DynCGroupMemSize = 0;
};
#endif // OMPTARGET_SHARED_ENVIRONMENT_H
diff --git a/offload/include/device.h b/offload/include/device.h
index f4b10abbaa3fd..0e93cf8ec1a8b 100644
--- a/offload/include/device.h
+++ b/offload/include/device.h
@@ -158,6 +158,9 @@ struct DeviceTy {
/// Indicate that there are pending images for this device or not.
void setHasPendingImages(bool V) { HasPendingImages = V; }
+ /// Get the maximum shared memory per team for any kernel.
+ uint64_t getMaxSharedTeamMemory();
+
private:
/// Deinitialize the device (and plugin).
void deinit();
diff --git a/offload/include/omptarget.h b/offload/include/omptarget.h
index 6971780c7bdb5..45bb74ec367d6 100644
--- a/offload/include/omptarget.h
+++ b/offload/include/omptarget.h
@@ -107,7 +107,7 @@ enum TargetAllocTy : int32_t {
inline KernelArgsTy CTorDTorKernelArgs = {1, 0, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr,
- 0, {0,0,0}, {1, 0, 0}, {1, 0, 0}, 0};
+ 0, {0,0,0,0}, {1, 0, 0}, {1, 0, 0}, 0};
struct DeviceTy;
@@ -273,10 +273,15 @@ struct __tgt_target_non_contig {
extern "C" {
#endif
+typedef enum {
+ omp_access_cgroup = 0,
+} omp_access_t;
+
void ompx_dump_mapping_tables(void);
int omp_get_num_devices(void);
int omp_get_device_num(void);
int omp_get_initial_device(void);
+size_t omp_get_groupprivate_limit(int device_num, omp_access_t access_group = omp_access_cgroup);
void *omp_target_alloc(size_t Size, int DeviceNum);
void omp_target_free(void *DevicePtr, int DeviceNum);
int omp_target_is_present(const void *Ptr, int DeviceNum);
diff --git a/offload/libomptarget/OpenMP/API.cpp b/offload/libomptarget/OpenMP/API.cpp
index 4576f9bd06121..1ed4192157fc8 100644
--- a/offload/libomptarget/OpenMP/API.cpp
+++ b/offload/libomptarget/OpenMP/API.cpp
@@ -98,6 +98,20 @@ EXTERN int omp_get_initial_device(void) {
return HostDevice;
}
+EXTERN size_t omp_get_groupprivate_limit(int DeviceNum,
+ omp_access_t AccessGroup) {
+ TIMESCOPE();
+ OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
+ if (DeviceNum == omp_get_initial_device())
+ return 0;
+
+ auto DeviceOrErr = PM->getDevice(DeviceNum);
+ if (!DeviceOrErr)
+ FATAL_MESSAGE(DeviceNum, "%s", toString(DeviceOrErr.takeError()).c_str());
+
+ return DeviceOrErr->getMaxSharedTeamMemory();
+}
+
EXTERN void *omp_target_alloc(size_t Size, int DeviceNum) {
TIMESCOPE_WITH_DETAILS("dst_dev=" + std::to_string(DeviceNum) +
";size=" + std::to_string(Size));
diff --git a/offload/libomptarget/device.cpp b/offload/libomptarget/device.cpp
index f88e30ae9e76b..31bfc7d092424 100644
--- a/offload/libomptarget/device.cpp
+++ b/offload/libomptarget/device.cpp
@@ -281,3 +281,9 @@ bool DeviceTy::useAutoZeroCopy() {
return false;
return RTL->use_auto_zero_copy(RTLDeviceID);
}
+
+uint64_t DeviceTy::getMaxSharedTeamMemory() {
+ using DeviceQueryKind = llvm::omp::target::plugin::DeviceQueryKind;
+ return RTL->query_device_info(
+ RTLDeviceID, DeviceQueryKind::DEVICE_QUERY_MAX_SHARED_TEAM_MEM);
+}
diff --git a/offload/libomptarget/exports b/offload/libomptarget/exports
index 2406776c1fb5f..b5a1401564d58 100644
--- a/offload/libomptarget/exports
+++ b/offload/libomptarget/exports
@@ -40,6 +40,7 @@ VERS1.0 {
omp_get_num_devices;
omp_get_device_num;
omp_get_initial_device;
+ omp_get_groupprivate_limit;
omp_target_alloc;
omp_target_free;
omp_target_is_present;
diff --git a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h
index 3117763e35896..2cf156e576c5f 100644
--- a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h
+++ b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h
@@ -52,6 +52,7 @@ typedef enum {
HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE = 6,
HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT = 7,
HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL = 15,
+ HSA_AMD_MEMORY_POOL_INFO_ALLOC_MAX_SIZE = 16,
} hsa_amd_memory_pool_info_t;
typedef enum {
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index 12c7cc62905c9..fa373c2029f0c 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -273,7 +273,6 @@ struct AMDGPUMemoryPoolTy {
if (auto Err = getAttr(HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, GlobalFlags))
return Err;
-
return Plugin::success();
}
@@ -543,6 +542,8 @@ struct AMDGPUKernelTy : public GenericKernelTy {
return Err;
}
+ StaticBlockMemSize = GroupSize;
+
// Make sure it is a kernel symbol.
if (SymbolType != HSA_SYMBOL_KIND_KERNEL)
return Plugin::error(ErrorCode::INVALID_BINARY,
@@ -566,8 +567,8 @@ struct AMDGPUKernelTy : public GenericKernelTy {
/// Launch the AMDGPU kernel function.
Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads[3],
- uint32_t NumBlocks[3], KernelArgsTy &KernelArgs,
- KernelLaunchParamsTy LaunchParams,
+ uint32_t NumBlocks[3], uint32_t DynBlockMemSize,
+ KernelArgsTy &KernelArgs, KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
/// Print more elaborate kernel launch info for AMDGPU
@@ -2020,6 +2021,20 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
if (auto Err = checkIfAPU())
return Err;
+ // Retrieve the size of the group memory.
+ for (const auto *Pool : AllMemoryPools) {
+ if (Pool->isGroup()) {
+ size_t Size = 0;
+ if (auto Err = Pool->getAttr(HSA_AMD_MEMORY_POOL_INFO_SIZE, Size))
+ return Err;
+ MaxBlockSharedMemSize = Size;
+ break;
+ }
+ }
+
+ // Supports block shared memory natively.
+ HasNativeBlockSharedMem = true;
+
return Plugin::success();
}
@@ -2856,7 +2871,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
KernelArgsTy KernelArgs = {};
uint32_t NumBlocksAndThreads[3] = {1u, 1u, 1u};
if (auto Err = AMDGPUKernel.launchImpl(
- *this, NumBlocksAndThreads, NumBlocksAndThreads, KernelArgs,
+ *this, NumBlocksAndThreads, NumBlocksAndThreads, 0, KernelArgs,
KernelLaunchParamsTy{}, AsyncInfoWrapper))
return Err;
@@ -3357,6 +3372,7 @@ struct AMDGPUPluginTy final : public GenericPluginTy {
Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
uint32_t NumThreads[3], uint32_t NumBlocks[3],
+ uint32_t DynBlockMemSize,
KernelArgsTy &KernelArgs,
KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const {
@@ -3374,13 +3390,6 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
if (auto Err = ArgsMemoryManager.allocate(ArgsSize, &AllArgs))
return Err;
- // Account for user requested dynamic shared memory.
- uint32_t GroupSize = getGroupSize();
- if (uint32_t MaxDynCGroupMem = std::max(
- KernelArgs.DynCGroupMem, GenericDevice.getDynamicMemorySize())) {
- GroupSize += MaxDynCGroupMem;
- }
-
uint64_t StackSize;
if (auto Err = GenericDevice.getDeviceStackSize(StackSize))
return Err;
@@ -3434,7 +3443,8 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
// Push the kernel launch into the stream.
return Stream->pushKernelLaunch(*this, AllArgs, NumThreads, NumBlocks,
- GroupSize, StackSize, ArgsMemoryManager);
+ getStaticBlockMemSize() + DynBlockMemSize,
+ StackSize, ArgsMemoryManager);
}
Error AMDGPUKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index 162b149ab483e..3357ccfe0c9b5 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -226,6 +226,10 @@ struct InfoTreeNode {
}
};
+enum class DeviceQueryKind {
+ DEVICE_QUERY_MAX_SHARED_TEAM_MEM = 0,
+};
+
/// Class wrapping a __tgt_device_image and its offload entry table on a
/// specific device. This class is responsible for storing and managing
/// the offload entries for an image on a device.
@@ -312,13 +316,16 @@ struct GenericKernelTy {
AsyncInfoWrapperTy &AsyncInfoWrapper) const;
virtual Error launchImpl(GenericDeviceTy &GenericDevice,
uint32_t NumThreads[3], uint32_t NumBlocks[3],
- KernelArgsTy &KernelArgs,
+ uint32_t DynBlockMemSize, KernelArgsTy &KernelArgs,
KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const = 0;
/// Get the kernel name.
const char *getName() const { return Name.c_str(); }
+ /// Get the size of the static per-block memory consumed by the kernel.
+ uint32_t getStaticBlockMemSize() const { return StaticBlockMemSize; };
+
/// Get the kernel image.
DeviceImageTy &getImage() const {
assert(ImagePtr && "Kernel is not initialized!");
@@ -331,9 +338,9 @@ struct GenericKernelTy {
}
/// Return a device pointer to a new kernel launch environment.
- Expected<KernelLaunchEnvironmentTy *>
- getKernelLaunchEnvironment(GenericDeviceTy &GenericDevice, uint32_t Version,
- AsyncInfoWrapperTy &AsyncInfo) const;
+ Expected<KernelLaunchEnvironmentTy *> getKernelLaunchEnvironment(
+ GenericDeviceTy &GenericDevice, const KernelArgsTy &KernelArgs,
+ void *FallbackBlockMem, AsyncInfoWrapperTy &AsyncInfo) const;
/// Indicate whether an execution mode is valid.
static bool isValidExecutionMode(OMPTgtExecModeFlags ExecutionMode) {
@@ -425,6 +432,9 @@ struct GenericKernelTy {
/// The maximum number of threads which the kernel could leverage.
uint32_t MaxNumThreads;
+ /// The static memory sized per block.
+ uint32_t StaticBlockMemSize = 0;
+
/// The kernel environment, including execution flags.
KernelEnvironmentTy KernelEnvironment;
@@ -731,6 +741,12 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
/// this id is not unique between different plugins; they may overlap.
int32_t getDeviceId() const { return DeviceId; }
+ /// Get the total shared memory per block that can be used in any kernel.
+ uint32_t getMaxBlockSharedMemSize() const { return MaxBlockSharedMemSize; }
+
+ /// Indicate whether the device has native block shared memory.
+ bool hasNativeBlockSharedMem() const { return HasNativeBlockSharedMem; }
+
/// Set the context of the device if needed, before calling device-specific
/// functions. Plugins may implement this function as a no-op if not needed.
virtual Error setContext() = 0;
@@ -1132,6 +1148,12 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
std::atomic<bool> OmptInitialized;
#endif
+ /// The total per-block shared memory that a kernel may use.
+ uint32_t MaxBlockSharedMemSize = 0;
+
+ /// Whether the device has native block shared memory.
+ bool HasNativeBlockSharedMem = false;
+
private:
DeviceMemoryPoolTy DeviceMemoryPool = {nullptr, 0};
DeviceMemoryPoolTrackingTy DeviceMemoryPoolTracking = {0, 0, ~0U, 0};
@@ -1347,6 +1369,9 @@ struct GenericPluginTy {
/// Prints information about the given devices supported by the plugin.
void print_device_info(int32_t DeviceId);
+ /// Retrieve information about the given device.
+ int64_t query_device_info(int32_t DeviceId, DeviceQueryKind Query);
+
/// Creates an event in the given plugin if supported.
int32_t create_event(int32_t DeviceId, void **EventPtr);
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index 81b9d423e13d8..2997585e1660f 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -477,20 +477,20 @@ Error GenericKernelTy::init(GenericDeviceTy &GenericDevice,
Expected<KernelLaunchEnvironmentTy *>
GenericKernelTy::getKernelLaunchEnvironment(
- GenericDeviceTy &GenericDevice, uint32_t Version,
- AsyncInfoWrapperTy &AsyncInfoWrapper) const {
+ GenericDeviceTy &GenericDevice, const KernelArgsTy &KernelArgs,
+ void *FallbackBlockMem, AsyncInfoWrapperTy &AsyncInfoWrapper) const {
// Ctor/Dtor have no arguments, replaying uses the original kernel launch
// environment. Older versions of the compiler do not generate a kernel
// launch environment.
if (GenericDevice.Plugin.getRecordReplay().isReplaying() ||
- Version < OMP_KERNEL_ARG_MIN_VERSION_WITH_DYN_PTR)
+ KernelArgs.Version < OMP_KERNEL_ARG_MIN_VERSION_WITH_DYN_PTR)
return nullptr;
- if (!KernelEnvironment....
[truncated]
``````````
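One detail worth highlighting from the State.cpp hunk above: when the global-memory fallback is used, each team addresses a disjoint slice of a single allocation, at `FallbackBase + DynCGroupMemSize * omp_get_team_num()`. A standalone illustration of that addressing (not the runtime's code, just the arithmetic it performs):

```cpp
#include <cstddef>
#include <cstdio>

// Mirrors the per-team slicing in DynCGroupMemTy::init: team N starts at
// FallbackBase + DynCGroupMemSize * N, so team slices never overlap.
static char *teamSlice(char *FallbackBase, size_t DynCGroupMemSize, int TeamNum) {
  return FallbackBase + DynCGroupMemSize * TeamNum;
}

int main() {
  constexpr size_t SizePerTeam = 1024; // e.g. 1 KiB requested per team
  char Buffer[4 * SizePerTeam];        // fallback allocation for 4 teams
  for (int Team = 0; Team < 4; ++Team)
    std::printf("team %d slice starts at offset %zu\n", Team,
                (size_t)(teamSlice(Buffer, SizePerTeam, Team) - Buffer));
  return 0;
}
```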
https://github.com/llvm/llvm-project/pull/152831