[llvm] [Offload] Add oneIterationPerThread param to loop device RTL (PR #151959)
Dominik Adamski via llvm-commits
llvm-commits at lists.llvm.org
Mon Aug 4 06:25:34 PDT 2025
https://github.com/DominikAdamski updated https://github.com/llvm/llvm-project/pull/151959
>From 62bd53a6d4e7399415503c1ecdf2efbc643ee036 Mon Sep 17 00:00:00 2001
From: Dominik Adamski <dominik.adamski at amd.com>
Date: Mon, 4 Aug 2025 07:19:08 -0500
Subject: [PATCH 1/2] [Offload] Add oneIterationPerThread param to loop device
RTL
Currently, Flang can generate no-loop kernels for all OpenMP
kernels in the program if the flags
-fopenmp-assume-teams-oversubscription or
-fopenmp-assume-threads-oversubscription are set.
Adding an extra parameter to the loop device RTL entry points
will let us choose, in the future, which individual OpenMP
kernels should be generated as no-loop kernels.
This PR does not change the current behaviour of the
oversubscription flags: the OMPIRBuilder always passes 0 for the
new parameter, and the device RTL still enables the no-loop path
when the flags are set.
---
.../include/llvm/Frontend/OpenMP/OMPKinds.def | 36 ++++++++--------
llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 1 +
offload/DeviceRTL/src/Workshare.cpp | 43 +++++++++++--------
3 files changed, 44 insertions(+), 36 deletions(-)
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
index f974cfc78c8dd..85f42dc9db60e 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
@@ -470,18 +470,18 @@ __OMP_RTL(__kmpc_target_deinit, false, Void,)
__OMP_RTL(__kmpc_kernel_prepare_parallel, false, Void, VoidPtr)
__OMP_RTL(__kmpc_parallel_51, false, Void, IdentPtr, Int32, Int32, Int32, Int32,
VoidPtr, VoidPtr, VoidPtrPtr, SizeTy)
-__OMP_RTL(__kmpc_for_static_loop_4, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32)
-__OMP_RTL(__kmpc_for_static_loop_4u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32)
-__OMP_RTL(__kmpc_for_static_loop_8, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int64)
-__OMP_RTL(__kmpc_for_static_loop_8u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int64)
-__OMP_RTL(__kmpc_distribute_static_loop_4, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32)
-__OMP_RTL(__kmpc_distribute_static_loop_4u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32)
-__OMP_RTL(__kmpc_distribute_static_loop_8, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64)
-__OMP_RTL(__kmpc_distribute_static_loop_8u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64)
-__OMP_RTL(__kmpc_distribute_for_static_loop_4, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32, Int32)
-__OMP_RTL(__kmpc_distribute_for_static_loop_4u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32, Int32)
-__OMP_RTL(__kmpc_distribute_for_static_loop_8, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int64, Int64)
-__OMP_RTL(__kmpc_distribute_for_static_loop_8u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int64, Int64)
+__OMP_RTL(__kmpc_for_static_loop_4, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32, Int32)
+__OMP_RTL(__kmpc_for_static_loop_4u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32, Int32)
+__OMP_RTL(__kmpc_for_static_loop_8, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int64, Int64)
+__OMP_RTL(__kmpc_for_static_loop_8u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int64, Int64)
+__OMP_RTL(__kmpc_distribute_static_loop_4, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32)
+__OMP_RTL(__kmpc_distribute_static_loop_4u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32)
+__OMP_RTL(__kmpc_distribute_static_loop_8, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int64)
+__OMP_RTL(__kmpc_distribute_static_loop_8u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int64)
+__OMP_RTL(__kmpc_distribute_for_static_loop_4, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32, Int32, Int32)
+__OMP_RTL(__kmpc_distribute_for_static_loop_4u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32, Int32, Int32)
+__OMP_RTL(__kmpc_distribute_for_static_loop_8, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int64, Int64, Int64)
+__OMP_RTL(__kmpc_distribute_for_static_loop_8u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int64, Int64, Int64)
__OMP_RTL(__kmpc_kernel_parallel, false, Int1, VoidPtrPtr)
__OMP_RTL(__kmpc_kernel_end_parallel, false, Void, )
__OMP_RTL(__kmpc_serialized_parallel, false, Void, IdentPtr, Int32)
@@ -674,22 +674,22 @@ __OMP_RTL_ATTRS(__kmpc_cancel_barrier, BarrierAttrs, SExt,
ParamAttrs(ReadOnlyPtrAttrs, SExt))
__OMP_RTL_ATTRS(__kmpc_distribute_for_static_loop_4, AlwaysInlineAttrs, AttributeSet(),
ParamAttrs(ReadOnlyPtrAttrs, AttributeSet(), AttributeSet(),
- SExt, SExt, SExt, SExt))
+ SExt, SExt, SExt, SExt, SExt))
__OMP_RTL_ATTRS(__kmpc_distribute_for_static_loop_4u, AlwaysInlineAttrs, AttributeSet(),
ParamAttrs(ReadOnlyPtrAttrs, AttributeSet(), AttributeSet(),
- ZExt, ZExt, ZExt, ZExt))
+ ZExt, ZExt, ZExt, ZExt, ZExt))
__OMP_RTL_ATTRS(__kmpc_distribute_static_loop_4, AlwaysInlineAttrs, AttributeSet(),
ParamAttrs(ReadOnlyPtrAttrs, AttributeSet(), AttributeSet(),
- SExt, SExt))
+ SExt, SExt, SExt))
__OMP_RTL_ATTRS(__kmpc_distribute_static_loop_4u, AlwaysInlineAttrs, AttributeSet(),
ParamAttrs(ReadOnlyPtrAttrs, AttributeSet(), AttributeSet(),
- ZExt, ZExt))
+ ZExt, ZExt, ZExt))
__OMP_RTL_ATTRS(__kmpc_for_static_loop_4, AlwaysInlineAttrs, AttributeSet(),
ParamAttrs(ReadOnlyPtrAttrs, AttributeSet(), AttributeSet(),
- SExt, SExt, SExt))
+ SExt, SExt, SExt, SExt))
__OMP_RTL_ATTRS(__kmpc_for_static_loop_4u, AlwaysInlineAttrs, AttributeSet(),
ParamAttrs(ReadOnlyPtrAttrs, AttributeSet(), AttributeSet(),
- ZExt, ZExt, ZExt))
+ ZExt, ZExt, ZExt, ZExt))
__OMP_RTL_ATTRS(__kmpc_error, AttributeSet(), AttributeSet(),
ParamAttrs(AttributeSet(), SExt))
__OMP_RTL_ATTRS(__kmpc_flush, BarrierAttrs, AttributeSet(),
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 3aa4f7ae04c33..b5feee34d884d 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -4565,6 +4565,7 @@ static void createTargetLoopWorkshareCall(OpenMPIRBuilder *OMPBuilder,
if (LoopType == WorksharingLoopType::DistributeForStaticLoop) {
RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
}
+ RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
Builder.CreateCall(RTLFn, RealArgs);
}
diff --git a/offload/DeviceRTL/src/Workshare.cpp b/offload/DeviceRTL/src/Workshare.cpp
index a8759307b42bd..c5791ca3f11c3 100644
--- a/offload/DeviceRTL/src/Workshare.cpp
+++ b/offload/DeviceRTL/src/Workshare.cpp
@@ -768,7 +768,8 @@ template <typename Ty> class StaticLoopChunker {
public:
/// Worksharing `for`-loop.
static void For(IdentTy *Loc, void (*LoopBody)(Ty, void *), void *Arg,
- Ty NumIters, Ty NumThreads, Ty ThreadChunk) {
+ Ty NumIters, Ty NumThreads, Ty ThreadChunk,
+ Ty OneIterationPerThread) {
ASSERT(NumIters >= 0, "Bad iteration count");
ASSERT(ThreadChunk >= 0, "Bad thread count");
@@ -790,12 +791,13 @@ template <typename Ty> class StaticLoopChunker {
// If we know we have more threads than iterations we can indicate that to
// avoid an outer loop.
- bool OneIterationPerThread = false;
if (config::getAssumeThreadsOversubscription()) {
- ASSERT(NumThreads >= NumIters, "Broken assumption");
OneIterationPerThread = true;
}
+ if (OneIterationPerThread)
+ ASSERT(NumThreads >= NumIters, "Broken assumption");
+
if (ThreadChunk != 1)
NormalizedLoopNestChunked(LoopBody, Arg, BlockChunk, NumBlocks, BId,
ThreadChunk, NumThreads, TId, NumIters,
@@ -807,7 +809,7 @@ template <typename Ty> class StaticLoopChunker {
/// Worksharing `distribute`-loop.
static void Distribute(IdentTy *Loc, void (*LoopBody)(Ty, void *), void *Arg,
- Ty NumIters, Ty BlockChunk) {
+ Ty NumIters, Ty BlockChunk, Ty OneIterationPerThread) {
ASSERT(icv::Level == 0, "Bad distribute");
ASSERT(icv::ActiveLevel == 0, "Bad distribute");
ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute");
@@ -831,12 +833,13 @@ template <typename Ty> class StaticLoopChunker {
// If we know we have more blocks than iterations we can indicate that to
// avoid an outer loop.
- bool OneIterationPerThread = false;
if (config::getAssumeTeamsOversubscription()) {
- ASSERT(NumBlocks >= NumIters, "Broken assumption");
OneIterationPerThread = true;
}
+ if (OneIterationPerThread)
+ ASSERT(NumBlocks >= NumIters, "Broken assumption");
+
if (BlockChunk != NumThreads)
NormalizedLoopNestChunked(LoopBody, Arg, BlockChunk, NumBlocks, BId,
ThreadChunk, NumThreads, TId, NumIters,
@@ -854,7 +857,8 @@ template <typename Ty> class StaticLoopChunker {
/// Worksharing `distribute parallel for`-loop.
static void DistributeFor(IdentTy *Loc, void (*LoopBody)(Ty, void *),
void *Arg, Ty NumIters, Ty NumThreads,
- Ty BlockChunk, Ty ThreadChunk) {
+ Ty BlockChunk, Ty ThreadChunk,
+ Ty OneIterationPerThread) {
ASSERT(icv::Level == 1, "Bad distribute");
ASSERT(icv::ActiveLevel == 1, "Bad distribute");
ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute");
@@ -882,13 +886,14 @@ template <typename Ty> class StaticLoopChunker {
// If we know we have more threads (across all blocks) than iterations we
// can indicate that to avoid an outer loop.
- bool OneIterationPerThread = false;
if (config::getAssumeTeamsOversubscription() &
config::getAssumeThreadsOversubscription()) {
OneIterationPerThread = true;
- ASSERT(NumBlocks * NumThreads >= NumIters, "Broken assumption");
}
+ if (OneIterationPerThread)
+ ASSERT(NumBlocks * NumThreads >= NumIters, "Broken assumption");
+
if (BlockChunk != NumThreads || ThreadChunk != 1)
NormalizedLoopNestChunked(LoopBody, Arg, BlockChunk, NumBlocks, BId,
ThreadChunk, NumThreads, TId, NumIters,
@@ -909,22 +914,24 @@ template <typename Ty> class StaticLoopChunker {
[[gnu::flatten, clang::always_inline]] void \
__kmpc_distribute_for_static_loop##BW( \
IdentTy *loc, void (*fn)(TY, void *), void *arg, TY num_iters, \
- TY num_threads, TY block_chunk, TY thread_chunk) { \
+ TY num_threads, TY block_chunk, TY thread_chunk, \
+ TY one_iteration_per_thread) { \
ompx::StaticLoopChunker<TY>::DistributeFor( \
- loc, fn, arg, num_iters, num_threads, block_chunk, thread_chunk); \
+ loc, fn, arg, num_iters, num_threads, block_chunk, thread_chunk, \
+ one_iteration_per_thread); \
} \
[[gnu::flatten, clang::always_inline]] void \
- __kmpc_distribute_static_loop##BW(IdentTy *loc, void (*fn)(TY, void *), \
- void *arg, TY num_iters, \
- TY block_chunk) { \
- ompx::StaticLoopChunker<TY>::Distribute(loc, fn, arg, num_iters, \
- block_chunk); \
+ __kmpc_distribute_static_loop##BW( \
+ IdentTy *loc, void (*fn)(TY, void *), void *arg, TY num_iters, \
+ TY block_chunk, TY one_iteration_per_thread) { \
+ ompx::StaticLoopChunker<TY>::Distribute( \
+ loc, fn, arg, num_iters, block_chunk, one_iteration_per_thread); \
} \
[[gnu::flatten, clang::always_inline]] void __kmpc_for_static_loop##BW( \
IdentTy *loc, void (*fn)(TY, void *), void *arg, TY num_iters, \
- TY num_threads, TY thread_chunk) { \
+ TY num_threads, TY thread_chunk, TY one_iteration_per_thread) { \
ompx::StaticLoopChunker<TY>::For(loc, fn, arg, num_iters, num_threads, \
- thread_chunk); \
+ thread_chunk, one_iteration_per_thread); \
}
extern "C" {
>From 9e8d671c84255b0262a34821d53f6987450f77ea Mon Sep 17 00:00:00 2001
From: Dominik Adamski <dominik.adamski at amd.com>
Date: Mon, 4 Aug 2025 08:25:16 -0500
Subject: [PATCH 2/2] Fixed format
---
offload/DeviceRTL/src/Workshare.cpp | 14 +++++++-------
1 file changed, 7 insertions(+), 7 deletions(-)
diff --git a/offload/DeviceRTL/src/Workshare.cpp b/offload/DeviceRTL/src/Workshare.cpp
index c5791ca3f11c3..a5f3ef2f3d940 100644
--- a/offload/DeviceRTL/src/Workshare.cpp
+++ b/offload/DeviceRTL/src/Workshare.cpp
@@ -912,18 +912,18 @@ template <typename Ty> class StaticLoopChunker {
#define OMP_LOOP_ENTRY(BW, TY) \
[[gnu::flatten, clang::always_inline]] void \
- __kmpc_distribute_for_static_loop##BW( \
- IdentTy *loc, void (*fn)(TY, void *), void *arg, TY num_iters, \
- TY num_threads, TY block_chunk, TY thread_chunk, \
- TY one_iteration_per_thread) { \
+ __kmpc_distribute_for_static_loop##BW( \
+ IdentTy *loc, void (*fn)(TY, void *), void *arg, TY num_iters, \
+ TY num_threads, TY block_chunk, TY thread_chunk, \
+ TY one_iteration_per_thread) { \
ompx::StaticLoopChunker<TY>::DistributeFor( \
loc, fn, arg, num_iters, num_threads, block_chunk, thread_chunk, \
one_iteration_per_thread); \
} \
[[gnu::flatten, clang::always_inline]] void \
- __kmpc_distribute_static_loop##BW( \
- IdentTy *loc, void (*fn)(TY, void *), void *arg, TY num_iters, \
- TY block_chunk, TY one_iteration_per_thread) { \
+ __kmpc_distribute_static_loop##BW(IdentTy *loc, void (*fn)(TY, void *), \
+ void *arg, TY num_iters, TY block_chunk, \
+ TY one_iteration_per_thread) { \
ompx::StaticLoopChunker<TY>::Distribute( \
loc, fn, arg, num_iters, block_chunk, one_iteration_per_thread); \
} \
More information about the llvm-commits
mailing list