[llvm] [OpenMP][Offload] Add SPMD-No-Loop mode to OpenMP offload runtime (PR #154105)

Mon Aug 18 05:27:45 PDT 2025

https://github.com/DominikAdamski updated https://github.com/llvm/llvm-project/pull/154105

>From a991f195e8e7f37a8cc0f20912da9aed09a37dd1 Mon Sep 17 00:00:00 2001
From: Dominik Adamski <dominik.adamski at amd.com>
Date: Mon, 18 Aug 2025 05:55:28 -0500
Subject: [PATCH 1/2] [OpenMP] Add SPMD-No-Loop mode to OpenMP offload runtime

Kernels which are marked as SPMD-No-Loop should be launched
with a sufficient number of teams and threads to cover the loop
iteration space.

No-Loop mode is described in the RFC:
https://discourse.llvm.org/t/rfc-no-loop-mode-for-openmp-gpu-kernels/87517/
---
 .../llvm/Frontend/OpenMP/OMPDeviceConstants.h |  3 ++-
 .../common/include/PluginInterface.h          | 12 +++++++++-
 .../common/src/PluginInterface.cpp            | 22 +++++++++++++++++++
 3 files changed, 35 insertions(+), 2 deletions(-)

diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPDeviceConstants.h b/llvm/include/llvm/Frontend/OpenMP/OMPDeviceConstants.h
index 3ae447b14f320..c41b4d1e9844c 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPDeviceConstants.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPDeviceConstants.h
@@ -23,7 +23,8 @@ enum OMPTgtExecModeFlags : unsigned char {
   OMP_TGT_EXEC_MODE_GENERIC = 1 << 0,
   OMP_TGT_EXEC_MODE_SPMD = 1 << 1,
   OMP_TGT_EXEC_MODE_GENERIC_SPMD =
-      OMP_TGT_EXEC_MODE_GENERIC | OMP_TGT_EXEC_MODE_SPMD
+      OMP_TGT_EXEC_MODE_GENERIC | OMP_TGT_EXEC_MODE_SPMD,
+  OMP_TGT_EXEC_MODE_SPMD_NO_LOOP = 1 << 2 | OMP_TGT_EXEC_MODE_SPMD
 };
 
 } // end namespace omp
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index a448721755a6f..47e72147b1cc3 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -431,6 +431,8 @@ struct GenericKernelTy {
       return "Generic";
     case OMP_TGT_EXEC_MODE_GENERIC_SPMD:
       return "Generic-SPMD";
+    case OMP_TGT_EXEC_MODE_SPMD_NO_LOOP:
+      return "SPMD-No-Loop";
     }
     llvm_unreachable("Unknown execution mode!");
   }
@@ -468,7 +470,8 @@ struct GenericKernelTy {
                         uint32_t BlockLimitClause[3], uint64_t LoopTripCount,
                         uint32_t &NumThreads, bool IsNumThreadsFromUser) const;
 
-  /// Indicate if the kernel works in Generic SPMD, Generic or SPMD mode.
+  /// Indicate if the kernel works in Generic SPMD, Generic, No-Loop
+  /// or SPMD mode.
   bool isGenericSPMDMode() const {
     return KernelEnvironment.Configuration.ExecMode ==
            OMP_TGT_EXEC_MODE_GENERIC_SPMD;
@@ -483,6 +486,10 @@ struct GenericKernelTy {
   bool isBareMode() const {
     return KernelEnvironment.Configuration.ExecMode == OMP_TGT_EXEC_MODE_BARE;
   }
+  bool isNoLoopMode() const {
+    return KernelEnvironment.Configuration.ExecMode ==
+           OMP_TGT_EXEC_MODE_SPMD_NO_LOOP;
+  }
 
   /// The kernel name.
   std::string Name;
@@ -1152,6 +1159,9 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
   /// deallocated by the allocator.
   llvm::SmallVector<DeviceImageTy *> LoadedImages;
 
+  /// Return the value of the OMP_TEAMS_THREAD_LIMIT environment variable.
+  int32_t getOMPTeamsThreadLimit() const { return OMP_TeamsThreadLimit; }
+
 private:
   /// Get and set the stack size and heap size for the device. If not used, the
   /// plugin can implement the setters as no-op and setting the output
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index c06c35e1e6a5b..72d75010d9657 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -640,6 +640,18 @@ uint32_t GenericKernelTy::getNumThreads(GenericDeviceTy &GenericDevice,
   if (ThreadLimitClause[0] > 0 && isGenericMode())
     ThreadLimitClause[0] += GenericDevice.getWarpSize();
 
+  // Honor OMP_TEAMS_THREAD_LIMIT environment variable and
+  // num_threads/thread_limit clause for NoLoop kernel types.
+  int32_t TeamsThreadLimitEnvVar = GenericDevice.getOMPTeamsThreadLimit();
+  uint16_t ConstWGSize = GenericDevice.getDefaultNumThreads();
+  if (isNoLoopMode()) {
+    if (TeamsThreadLimitEnvVar > 0)
+      return std::min(static_cast<int32_t>(ConstWGSize),
+                      TeamsThreadLimitEnvVar);
+    if ((ThreadLimitClause[0] > 0) && (ThreadLimitClause[0] != (uint32_t)-1))
+      return std::min(static_cast<uint32_t>(ConstWGSize), ThreadLimitClause[0]);
+    return ConstWGSize;
+  }
   return std::min(MaxNumThreads, (ThreadLimitClause[0] > 0)
                                      ? ThreadLimitClause[0]
                                      : PreferredNumThreads);
@@ -662,6 +674,16 @@ uint32_t GenericKernelTy::getNumBlocks(GenericDeviceTy &GenericDevice,
     return std::min(NumTeamsClause[0], GenericDevice.getBlockLimit());
   }
 
+  const auto getNumGroupsFromThreadsAndTripCount =
+      [](const uint64_t TripCount, const uint32_t NumThreads) {
+        return ((TripCount - 1) / NumThreads) + 1;
+      };
+  if (isNoLoopMode()) {
+    return LoopTripCount > 0
+               ? getNumGroupsFromThreadsAndTripCount(LoopTripCount, NumThreads)
+               : 1;
+  }
+
   uint64_t DefaultNumBlocks = GenericDevice.getDefaultNumBlocks();
   uint64_t TripCountNumBlocks = std::numeric_limits<uint64_t>::max();
   if (LoopTripCount > 0) {

>From 2beb3b499cd2ad266cbec19294c77da6323d387d Mon Sep 17 00:00:00 2001
From: Dominik Adamski <dominik.adamski at amd.com>
Date: Mon, 18 Aug 2025 07:27:02 -0500
Subject: [PATCH 2/2] Applied remarks

---
 offload/DeviceRTL/src/Kernel.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/offload/DeviceRTL/src/Kernel.cpp b/offload/DeviceRTL/src/Kernel.cpp
index 467e44a65276c..8c2828b270419 100644
--- a/offload/DeviceRTL/src/Kernel.cpp
+++ b/offload/DeviceRTL/src/Kernel.cpp
@@ -30,7 +30,8 @@ enum OMPTgtExecModeFlags : unsigned char {
   OMP_TGT_EXEC_MODE_GENERIC = 1 << 0,
   OMP_TGT_EXEC_MODE_SPMD = 1 << 1,
   OMP_TGT_EXEC_MODE_GENERIC_SPMD =
-      OMP_TGT_EXEC_MODE_GENERIC | OMP_TGT_EXEC_MODE_SPMD
+      OMP_TGT_EXEC_MODE_GENERIC | OMP_TGT_EXEC_MODE_SPMD,
+  OMP_TGT_EXEC_MODE_SPMD_NO_LOOP = 1 << 2 | OMP_TGT_EXEC_MODE_SPMD
 };
 
 static void