[llvm] [OpenMP][Offload] Add SPMD-No-Loop mode to OpenMP offload runtime (PR #154105)
Dominik Adamski via llvm-commits
llvm-commits at lists.llvm.org
Mon Aug 18 05:27:45 PDT 2025
https://github.com/DominikAdamski updated https://github.com/llvm/llvm-project/pull/154105
>From a991f195e8e7f37a8cc0f20912da9aed09a37dd1 Mon Sep 17 00:00:00 2001
From: Dominik Adamski <dominik.adamski at amd.com>
Date: Mon, 18 Aug 2025 05:55:28 -0500
Subject: [PATCH 1/2] [OpenMP] Add SPMD-No-Loop mode to OpenMP offload runtime
Kernels which are marked as SPMD-No-Loop should be launched
with a sufficient number of teams and threads to cover the loop
iteration space.
No-Loop mode is described in the RFC:
https://discourse.llvm.org/t/rfc-no-loop-mode-for-openmp-gpu-kernels/87517/
---
.../llvm/Frontend/OpenMP/OMPDeviceConstants.h | 3 ++-
.../common/include/PluginInterface.h | 12 +++++++++-
.../common/src/PluginInterface.cpp | 22 +++++++++++++++++++
3 files changed, 35 insertions(+), 2 deletions(-)
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPDeviceConstants.h b/llvm/include/llvm/Frontend/OpenMP/OMPDeviceConstants.h
index 3ae447b14f320..c41b4d1e9844c 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPDeviceConstants.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPDeviceConstants.h
@@ -23,7 +23,8 @@ enum OMPTgtExecModeFlags : unsigned char {
OMP_TGT_EXEC_MODE_GENERIC = 1 << 0,
OMP_TGT_EXEC_MODE_SPMD = 1 << 1,
OMP_TGT_EXEC_MODE_GENERIC_SPMD =
- OMP_TGT_EXEC_MODE_GENERIC | OMP_TGT_EXEC_MODE_SPMD
+ OMP_TGT_EXEC_MODE_GENERIC | OMP_TGT_EXEC_MODE_SPMD,
+ OMP_TGT_EXEC_MODE_SPMD_NO_LOOP = 1 << 2 | OMP_TGT_EXEC_MODE_SPMD
};
} // end namespace omp
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index a448721755a6f..47e72147b1cc3 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -431,6 +431,8 @@ struct GenericKernelTy {
return "Generic";
case OMP_TGT_EXEC_MODE_GENERIC_SPMD:
return "Generic-SPMD";
+ case OMP_TGT_EXEC_MODE_SPMD_NO_LOOP:
+ return "SPMD-No-Loop";
}
llvm_unreachable("Unknown execution mode!");
}
@@ -468,7 +470,8 @@ struct GenericKernelTy {
uint32_t BlockLimitClause[3], uint64_t LoopTripCount,
uint32_t &NumThreads, bool IsNumThreadsFromUser) const;
- /// Indicate if the kernel works in Generic SPMD, Generic or SPMD mode.
+ /// Indicate if the kernel works in Generic SPMD, Generic, No-Loop
+ /// or SPMD mode.
bool isGenericSPMDMode() const {
return KernelEnvironment.Configuration.ExecMode ==
OMP_TGT_EXEC_MODE_GENERIC_SPMD;
@@ -483,6 +486,10 @@ struct GenericKernelTy {
bool isBareMode() const {
return KernelEnvironment.Configuration.ExecMode == OMP_TGT_EXEC_MODE_BARE;
}
+ bool isNoLoopMode() const {
+ return KernelEnvironment.Configuration.ExecMode ==
+ OMP_TGT_EXEC_MODE_SPMD_NO_LOOP;
+ }
/// The kernel name.
std::string Name;
@@ -1152,6 +1159,9 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
/// deallocated by the allocator.
llvm::SmallVector<DeviceImageTy *> LoadedImages;
+ /// Return the value of the OMP_TEAMS_THREAD_LIMIT environment variable.
+ int32_t getOMPTeamsThreadLimit() const { return OMP_TeamsThreadLimit; }
+
private:
/// Get and set the stack size and heap size for the device. If not used, the
/// plugin can implement the setters as no-op and setting the output
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index c06c35e1e6a5b..72d75010d9657 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -640,6 +640,18 @@ uint32_t GenericKernelTy::getNumThreads(GenericDeviceTy &GenericDevice,
if (ThreadLimitClause[0] > 0 && isGenericMode())
ThreadLimitClause[0] += GenericDevice.getWarpSize();
+ // Honor OMP_TEAMS_THREAD_LIMIT environment variable and
+ // num_threads/thread_limit clause for NoLoop kernel types.
+ int32_t TeamsThreadLimitEnvVar = GenericDevice.getOMPTeamsThreadLimit();
+ uint16_t ConstWGSize = GenericDevice.getDefaultNumThreads();
+ if (isNoLoopMode()) {
+ if (TeamsThreadLimitEnvVar > 0)
+ return std::min(static_cast<int32_t>(ConstWGSize),
+ TeamsThreadLimitEnvVar);
+ if ((ThreadLimitClause[0] > 0) && (ThreadLimitClause[0] != (uint32_t)-1))
+ return std::min(static_cast<uint32_t>(ConstWGSize), ThreadLimitClause[0]);
+ return ConstWGSize;
+ }
return std::min(MaxNumThreads, (ThreadLimitClause[0] > 0)
? ThreadLimitClause[0]
: PreferredNumThreads);
@@ -662,6 +674,16 @@ uint32_t GenericKernelTy::getNumBlocks(GenericDeviceTy &GenericDevice,
return std::min(NumTeamsClause[0], GenericDevice.getBlockLimit());
}
+ const auto getNumGroupsFromThreadsAndTripCount =
+ [](const uint64_t TripCount, const uint32_t NumThreads) {
+ return ((TripCount - 1) / NumThreads) + 1;
+ };
+ if (isNoLoopMode()) {
+ return LoopTripCount > 0
+ ? getNumGroupsFromThreadsAndTripCount(LoopTripCount, NumThreads)
+ : 1;
+ }
+
uint64_t DefaultNumBlocks = GenericDevice.getDefaultNumBlocks();
uint64_t TripCountNumBlocks = std::numeric_limits<uint64_t>::max();
if (LoopTripCount > 0) {
>From 2beb3b499cd2ad266cbec19294c77da6323d387d Mon Sep 17 00:00:00 2001
From: Dominik Adamski <dominik.adamski at amd.com>
Date: Mon, 18 Aug 2025 07:27:02 -0500
Subject: [PATCH 2/2] Applied remarks
---
offload/DeviceRTL/src/Kernel.cpp | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/offload/DeviceRTL/src/Kernel.cpp b/offload/DeviceRTL/src/Kernel.cpp
index 467e44a65276c..8c2828b270419 100644
--- a/offload/DeviceRTL/src/Kernel.cpp
+++ b/offload/DeviceRTL/src/Kernel.cpp
@@ -30,7 +30,8 @@ enum OMPTgtExecModeFlags : unsigned char {
OMP_TGT_EXEC_MODE_GENERIC = 1 << 0,
OMP_TGT_EXEC_MODE_SPMD = 1 << 1,
OMP_TGT_EXEC_MODE_GENERIC_SPMD =
- OMP_TGT_EXEC_MODE_GENERIC | OMP_TGT_EXEC_MODE_SPMD
+ OMP_TGT_EXEC_MODE_GENERIC | OMP_TGT_EXEC_MODE_SPMD,
+ OMP_TGT_EXEC_MODE_SPMD_NO_LOOP = 1 << 2 | OMP_TGT_EXEC_MODE_SPMD
};
static void
More information about the llvm-commits
mailing list