[llvm] [mlir] [Offload] Add oneIterationPerThread param to loop device RTL (PR #151959)
Dominik Adamski via llvm-commits
llvm-commits at lists.llvm.org
Mon Aug 11 05:16:17 PDT 2025
https://github.com/DominikAdamski updated https://github.com/llvm/llvm-project/pull/151959
>From 62bd53a6d4e7399415503c1ecdf2efbc643ee036 Mon Sep 17 00:00:00 2001
From: Dominik Adamski <dominik.adamski at amd.com>
Date: Mon, 4 Aug 2025 07:19:08 -0500
Subject: [PATCH 1/5] [Offload] Add oneIterationPerThread param to loop device
 RTL
Currently, Flang generates no-loop kernels for every OpenMP
kernel in the program whenever the
-fopenmp-assume-teams-oversubscription or
-fopenmp-assume-threads-oversubscription flag is set.
Adding an extra parameter to the loop device RTL entry points
lets us choose, per kernel, which OpenMP kernels should be
generated as no-loop kernels in the future.
This PR does not modify the current behaviour of the
oversubscription flags.
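
To illustrate the intent (this sketch is not part of the patch and its
names are hypothetical): when the new trailing flag is nonzero, the
runtime may assume there are at least as many threads as iterations and
drop the outer grid-stride loop. A minimal, host-side C++ approximation
with chunking elided:

#include <cstdint>
#include <cstdio>

// Host-side stand-in for the device RTL loop entry point; the real one
// lives in offload/DeviceRTL/src/Workshare.cpp and also takes an
// IdentTy *loc and a chunk size.
using LoopBodyFn = void (*)(int32_t, void *);

void for_static_loop_sketch(LoopBodyFn body, void *arg, int32_t num_iters,
                            int32_t num_threads,
                            uint8_t one_iteration_per_thread) {
  for (int32_t tid = 0; tid < num_threads; ++tid) {
    if (one_iteration_per_thread) {
      // No-loop path: every thread runs at most one iteration.
      if (tid < num_iters)
        body(tid, arg);
    } else {
      // General path: grid-stride loop so fewer threads still cover all
      // iterations.
      for (int32_t i = tid; i < num_iters; i += num_threads)
        body(i, arg);
    }
  }
}

int main() {
  int sum = 0;
  auto body = [](int32_t i, void *arg) { *static_cast<int *>(arg) += i; };
  // 16 "threads", 10 iterations: the oversubscription assumption holds.
  for_static_loop_sketch(body, &sum, /*num_iters=*/10, /*num_threads=*/16,
                         /*one_iteration_per_thread=*/1);
  std::printf("sum = %d\n", sum); // prints 45
}

In the real device RTL the entry points dispatch into StaticLoopChunker,
as the diffs below show.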
---
.../include/llvm/Frontend/OpenMP/OMPKinds.def | 36 ++++++++--------
llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 1 +
offload/DeviceRTL/src/Workshare.cpp | 43 +++++++++++--------
3 files changed, 44 insertions(+), 36 deletions(-)
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
index f974cfc78c8dd..85f42dc9db60e 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
@@ -470,18 +470,18 @@ __OMP_RTL(__kmpc_target_deinit, false, Void,)
__OMP_RTL(__kmpc_kernel_prepare_parallel, false, Void, VoidPtr)
__OMP_RTL(__kmpc_parallel_51, false, Void, IdentPtr, Int32, Int32, Int32, Int32,
VoidPtr, VoidPtr, VoidPtrPtr, SizeTy)
-__OMP_RTL(__kmpc_for_static_loop_4, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32)
-__OMP_RTL(__kmpc_for_static_loop_4u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32)
-__OMP_RTL(__kmpc_for_static_loop_8, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int64)
-__OMP_RTL(__kmpc_for_static_loop_8u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int64)
-__OMP_RTL(__kmpc_distribute_static_loop_4, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32)
-__OMP_RTL(__kmpc_distribute_static_loop_4u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32)
-__OMP_RTL(__kmpc_distribute_static_loop_8, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64)
-__OMP_RTL(__kmpc_distribute_static_loop_8u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64)
-__OMP_RTL(__kmpc_distribute_for_static_loop_4, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32, Int32)
-__OMP_RTL(__kmpc_distribute_for_static_loop_4u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32, Int32)
-__OMP_RTL(__kmpc_distribute_for_static_loop_8, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int64, Int64)
-__OMP_RTL(__kmpc_distribute_for_static_loop_8u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int64, Int64)
+__OMP_RTL(__kmpc_for_static_loop_4, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32, Int32)
+__OMP_RTL(__kmpc_for_static_loop_4u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32, Int32)
+__OMP_RTL(__kmpc_for_static_loop_8, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int64, Int64)
+__OMP_RTL(__kmpc_for_static_loop_8u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int64, Int64)
+__OMP_RTL(__kmpc_distribute_static_loop_4, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32)
+__OMP_RTL(__kmpc_distribute_static_loop_4u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32)
+__OMP_RTL(__kmpc_distribute_static_loop_8, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int64)
+__OMP_RTL(__kmpc_distribute_static_loop_8u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int64)
+__OMP_RTL(__kmpc_distribute_for_static_loop_4, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32, Int32, Int32)
+__OMP_RTL(__kmpc_distribute_for_static_loop_4u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32, Int32, Int32)
+__OMP_RTL(__kmpc_distribute_for_static_loop_8, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int64, Int64, Int64)
+__OMP_RTL(__kmpc_distribute_for_static_loop_8u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int64, Int64, Int64)
__OMP_RTL(__kmpc_kernel_parallel, false, Int1, VoidPtrPtr)
__OMP_RTL(__kmpc_kernel_end_parallel, false, Void, )
__OMP_RTL(__kmpc_serialized_parallel, false, Void, IdentPtr, Int32)
@@ -674,22 +674,22 @@ __OMP_RTL_ATTRS(__kmpc_cancel_barrier, BarrierAttrs, SExt,
ParamAttrs(ReadOnlyPtrAttrs, SExt))
__OMP_RTL_ATTRS(__kmpc_distribute_for_static_loop_4, AlwaysInlineAttrs, AttributeSet(),
ParamAttrs(ReadOnlyPtrAttrs, AttributeSet(), AttributeSet(),
- SExt, SExt, SExt, SExt))
+ SExt, SExt, SExt, SExt, SExt))
__OMP_RTL_ATTRS(__kmpc_distribute_for_static_loop_4u, AlwaysInlineAttrs, AttributeSet(),
ParamAttrs(ReadOnlyPtrAttrs, AttributeSet(), AttributeSet(),
- ZExt, ZExt, ZExt, ZExt))
+ ZExt, ZExt, ZExt, ZExt, ZExt))
__OMP_RTL_ATTRS(__kmpc_distribute_static_loop_4, AlwaysInlineAttrs, AttributeSet(),
ParamAttrs(ReadOnlyPtrAttrs, AttributeSet(), AttributeSet(),
- SExt, SExt))
+ SExt, SExt, SExt))
__OMP_RTL_ATTRS(__kmpc_distribute_static_loop_4u, AlwaysInlineAttrs, AttributeSet(),
ParamAttrs(ReadOnlyPtrAttrs, AttributeSet(), AttributeSet(),
- ZExt, ZExt))
+ ZExt, ZExt, ZExt))
__OMP_RTL_ATTRS(__kmpc_for_static_loop_4, AlwaysInlineAttrs, AttributeSet(),
ParamAttrs(ReadOnlyPtrAttrs, AttributeSet(), AttributeSet(),
- SExt, SExt, SExt))
+ SExt, SExt, SExt, SExt))
__OMP_RTL_ATTRS(__kmpc_for_static_loop_4u, AlwaysInlineAttrs, AttributeSet(),
ParamAttrs(ReadOnlyPtrAttrs, AttributeSet(), AttributeSet(),
- ZExt, ZExt, ZExt))
+ ZExt, ZExt, ZExt, ZExt))
__OMP_RTL_ATTRS(__kmpc_error, AttributeSet(), AttributeSet(),
ParamAttrs(AttributeSet(), SExt))
__OMP_RTL_ATTRS(__kmpc_flush, BarrierAttrs, AttributeSet(),
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 3aa4f7ae04c33..b5feee34d884d 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -4565,6 +4565,7 @@ static void createTargetLoopWorkshareCall(OpenMPIRBuilder *OMPBuilder,
if (LoopType == WorksharingLoopType::DistributeForStaticLoop) {
RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
}
+ RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
Builder.CreateCall(RTLFn, RealArgs);
}
diff --git a/offload/DeviceRTL/src/Workshare.cpp b/offload/DeviceRTL/src/Workshare.cpp
index a8759307b42bd..c5791ca3f11c3 100644
--- a/offload/DeviceRTL/src/Workshare.cpp
+++ b/offload/DeviceRTL/src/Workshare.cpp
@@ -768,7 +768,8 @@ template <typename Ty> class StaticLoopChunker {
public:
/// Worksharing `for`-loop.
static void For(IdentTy *Loc, void (*LoopBody)(Ty, void *), void *Arg,
- Ty NumIters, Ty NumThreads, Ty ThreadChunk) {
+ Ty NumIters, Ty NumThreads, Ty ThreadChunk,
+ Ty OneIterationPerThread) {
ASSERT(NumIters >= 0, "Bad iteration count");
ASSERT(ThreadChunk >= 0, "Bad thread count");
@@ -790,12 +791,13 @@ template <typename Ty> class StaticLoopChunker {
// If we know we have more threads than iterations we can indicate that to
// avoid an outer loop.
- bool OneIterationPerThread = false;
if (config::getAssumeThreadsOversubscription()) {
- ASSERT(NumThreads >= NumIters, "Broken assumption");
OneIterationPerThread = true;
}
+ if (OneIterationPerThread)
+ ASSERT(NumThreads >= NumIters, "Broken assumption");
+
if (ThreadChunk != 1)
NormalizedLoopNestChunked(LoopBody, Arg, BlockChunk, NumBlocks, BId,
ThreadChunk, NumThreads, TId, NumIters,
@@ -807,7 +809,7 @@ template <typename Ty> class StaticLoopChunker {
/// Worksharing `distribute`-loop.
static void Distribute(IdentTy *Loc, void (*LoopBody)(Ty, void *), void *Arg,
- Ty NumIters, Ty BlockChunk) {
+ Ty NumIters, Ty BlockChunk, Ty OneIterationPerThread) {
ASSERT(icv::Level == 0, "Bad distribute");
ASSERT(icv::ActiveLevel == 0, "Bad distribute");
ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute");
@@ -831,12 +833,13 @@ template <typename Ty> class StaticLoopChunker {
// If we know we have more blocks than iterations we can indicate that to
// avoid an outer loop.
- bool OneIterationPerThread = false;
if (config::getAssumeTeamsOversubscription()) {
- ASSERT(NumBlocks >= NumIters, "Broken assumption");
OneIterationPerThread = true;
}
+ if (OneIterationPerThread)
+ ASSERT(NumBlocks >= NumIters, "Broken assumption");
+
if (BlockChunk != NumThreads)
NormalizedLoopNestChunked(LoopBody, Arg, BlockChunk, NumBlocks, BId,
ThreadChunk, NumThreads, TId, NumIters,
@@ -854,7 +857,8 @@ template <typename Ty> class StaticLoopChunker {
/// Worksharing `distribute parallel for`-loop.
static void DistributeFor(IdentTy *Loc, void (*LoopBody)(Ty, void *),
void *Arg, Ty NumIters, Ty NumThreads,
- Ty BlockChunk, Ty ThreadChunk) {
+ Ty BlockChunk, Ty ThreadChunk,
+ Ty OneIterationPerThread) {
ASSERT(icv::Level == 1, "Bad distribute");
ASSERT(icv::ActiveLevel == 1, "Bad distribute");
ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute");
@@ -882,13 +886,14 @@ template <typename Ty> class StaticLoopChunker {
// If we know we have more threads (across all blocks) than iterations we
// can indicate that to avoid an outer loop.
- bool OneIterationPerThread = false;
if (config::getAssumeTeamsOversubscription() &
config::getAssumeThreadsOversubscription()) {
OneIterationPerThread = true;
- ASSERT(NumBlocks * NumThreads >= NumIters, "Broken assumption");
}
+ if (OneIterationPerThread)
+ ASSERT(NumBlocks * NumThreads >= NumIters, "Broken assumption");
+
if (BlockChunk != NumThreads || ThreadChunk != 1)
NormalizedLoopNestChunked(LoopBody, Arg, BlockChunk, NumBlocks, BId,
ThreadChunk, NumThreads, TId, NumIters,
@@ -909,22 +914,24 @@ template <typename Ty> class StaticLoopChunker {
[[gnu::flatten, clang::always_inline]] void \
__kmpc_distribute_for_static_loop##BW( \
IdentTy *loc, void (*fn)(TY, void *), void *arg, TY num_iters, \
- TY num_threads, TY block_chunk, TY thread_chunk) { \
+ TY num_threads, TY block_chunk, TY thread_chunk, \
+ TY one_iteration_per_thread) { \
ompx::StaticLoopChunker<TY>::DistributeFor( \
- loc, fn, arg, num_iters, num_threads, block_chunk, thread_chunk); \
+ loc, fn, arg, num_iters, num_threads, block_chunk, thread_chunk, \
+ one_iteration_per_thread); \
} \
[[gnu::flatten, clang::always_inline]] void \
- __kmpc_distribute_static_loop##BW(IdentTy *loc, void (*fn)(TY, void *), \
- void *arg, TY num_iters, \
- TY block_chunk) { \
- ompx::StaticLoopChunker<TY>::Distribute(loc, fn, arg, num_iters, \
- block_chunk); \
+ __kmpc_distribute_static_loop##BW( \
+ IdentTy *loc, void (*fn)(TY, void *), void *arg, TY num_iters, \
+ TY block_chunk, TY one_iteration_per_thread) { \
+ ompx::StaticLoopChunker<TY>::Distribute( \
+ loc, fn, arg, num_iters, block_chunk, one_iteration_per_thread); \
} \
[[gnu::flatten, clang::always_inline]] void __kmpc_for_static_loop##BW( \
IdentTy *loc, void (*fn)(TY, void *), void *arg, TY num_iters, \
- TY num_threads, TY thread_chunk) { \
+ TY num_threads, TY thread_chunk, TY one_iteration_per_thread) { \
ompx::StaticLoopChunker<TY>::For(loc, fn, arg, num_iters, num_threads, \
- thread_chunk); \
+ thread_chunk, one_iteration_per_thread); \
}
extern "C" {
>From 9e8d671c84255b0262a34821d53f6987450f77ea Mon Sep 17 00:00:00 2001
From: Dominik Adamski <dominik.adamski at amd.com>
Date: Mon, 4 Aug 2025 08:25:16 -0500
Subject: [PATCH 2/5] Fixed format
---
offload/DeviceRTL/src/Workshare.cpp | 14 +++++++-------
1 file changed, 7 insertions(+), 7 deletions(-)
diff --git a/offload/DeviceRTL/src/Workshare.cpp b/offload/DeviceRTL/src/Workshare.cpp
index c5791ca3f11c3..a5f3ef2f3d940 100644
--- a/offload/DeviceRTL/src/Workshare.cpp
+++ b/offload/DeviceRTL/src/Workshare.cpp
@@ -912,18 +912,18 @@ template <typename Ty> class StaticLoopChunker {
#define OMP_LOOP_ENTRY(BW, TY) \
[[gnu::flatten, clang::always_inline]] void \
- __kmpc_distribute_for_static_loop##BW( \
- IdentTy *loc, void (*fn)(TY, void *), void *arg, TY num_iters, \
- TY num_threads, TY block_chunk, TY thread_chunk, \
- TY one_iteration_per_thread) { \
+ __kmpc_distribute_for_static_loop##BW( \
+ IdentTy *loc, void (*fn)(TY, void *), void *arg, TY num_iters, \
+ TY num_threads, TY block_chunk, TY thread_chunk, \
+ TY one_iteration_per_thread) { \
ompx::StaticLoopChunker<TY>::DistributeFor( \
loc, fn, arg, num_iters, num_threads, block_chunk, thread_chunk, \
one_iteration_per_thread); \
} \
[[gnu::flatten, clang::always_inline]] void \
- __kmpc_distribute_static_loop##BW( \
- IdentTy *loc, void (*fn)(TY, void *), void *arg, TY num_iters, \
- TY block_chunk, TY one_iteration_per_thread) { \
+ __kmpc_distribute_static_loop##BW(IdentTy *loc, void (*fn)(TY, void *), \
+ void *arg, TY num_iters, TY block_chunk, \
+ TY one_iteration_per_thread) { \
ompx::StaticLoopChunker<TY>::Distribute( \
loc, fn, arg, num_iters, block_chunk, one_iteration_per_thread); \
} \
>From 54b77baeaea58647cf09399c14c5470e51ec925d Mon Sep 17 00:00:00 2001
From: Dominik Adamski <dominik.adamski at amd.com>
Date: Mon, 4 Aug 2025 09:23:34 -0500
Subject: [PATCH 3/5] Fixes
---
llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 1 +
mlir/test/Target/LLVMIR/omptarget-wsloop.mlir | 2 +-
2 files changed, 2 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index b5feee34d884d..862d511b8c819 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -4549,6 +4549,7 @@ static void createTargetLoopWorkshareCall(OpenMPIRBuilder *OMPBuilder,
RealArgs.push_back(LoopBodyArg);
RealArgs.push_back(TripCount);
if (LoopType == WorksharingLoopType::DistributeStaticLoop) {
+ RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
Builder.restoreIP({InsertBlock, std::prev(InsertBlock->end())});
Builder.CreateCall(RTLFn, RealArgs);
diff --git a/mlir/test/Target/LLVMIR/omptarget-wsloop.mlir b/mlir/test/Target/LLVMIR/omptarget-wsloop.mlir
index a9f913b744489..d5e79f340d338 100644
--- a/mlir/test/Target/LLVMIR/omptarget-wsloop.mlir
+++ b/mlir/test/Target/LLVMIR/omptarget-wsloop.mlir
@@ -37,7 +37,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memo
// CHECK: %[[GEP:.*]] = getelementptr { ptr }, ptr addrspace(5) %[[STRUCTARG]], i32 0, i32 0
// CHECK: store ptr %[[ARG0]], ptr addrspace(5) %[[GEP]], align 8
// CHECK: %[[NUM_THREADS:.*]] = call i32 @omp_get_num_threads()
-// CHECK: call void @__kmpc_for_static_loop_4u(ptr addrspacecast (ptr addrspace(1) @[[GLOB1:[0-9]+]] to ptr), ptr @[[LOOP_BODY_FN:.*]], ptr %[[STRUCTARG_ASCAST]], i32 10, i32 %[[NUM_THREADS]], i32 0)
+// CHECK: call void @__kmpc_for_static_loop_4u(ptr addrspacecast (ptr addrspace(1) @[[GLOB1:[0-9]+]] to ptr), ptr @[[LOOP_BODY_FN:.*]], ptr %[[STRUCTARG_ASCAST]], i32 10, i32 %[[NUM_THREADS]], i32 0, i32 0)
// CHECK: define internal void @[[LOOP_BODY_FN]](i32 %[[LOOP_CNT:.*]], ptr %[[LOOP_BODY_ARG:.*]])
// CHECK: %[[GEP2:.*]] = getelementptr { ptr }, ptr %[[LOOP_BODY_ARG]], i32 0, i32 0
>From cde647cae516df50a89c2ffff49e9a92b6d86d75 Mon Sep 17 00:00:00 2001
From: Dominik Adamski <dominik.adamski at amd.com>
Date: Mon, 11 Aug 2025 06:00:43 -0500
Subject: [PATCH 4/5] Applied fixes
---
.../include/llvm/Frontend/OpenMP/OMPKinds.def | 24 +++++++-------
llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 4 +--
.../LLVMIR/omptarget-parallel-wsloop.mlir | 2 +-
.../LLVMIR/omptarget-wsloop-collapsed.mlir | 2 +-
mlir/test/Target/LLVMIR/omptarget-wsloop.mlir | 4 +--
offload/DeviceRTL/src/Workshare.cpp | 32 ++++++++++++++++---
6 files changed, 46 insertions(+), 22 deletions(-)
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
index 85f42dc9db60e..e388f13c2c4d6 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
@@ -470,18 +470,18 @@ __OMP_RTL(__kmpc_target_deinit, false, Void,)
__OMP_RTL(__kmpc_kernel_prepare_parallel, false, Void, VoidPtr)
__OMP_RTL(__kmpc_parallel_51, false, Void, IdentPtr, Int32, Int32, Int32, Int32,
VoidPtr, VoidPtr, VoidPtrPtr, SizeTy)
-__OMP_RTL(__kmpc_for_static_loop_4, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32, Int32)
-__OMP_RTL(__kmpc_for_static_loop_4u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32, Int32)
-__OMP_RTL(__kmpc_for_static_loop_8, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int64, Int64)
-__OMP_RTL(__kmpc_for_static_loop_8u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int64, Int64)
-__OMP_RTL(__kmpc_distribute_static_loop_4, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32)
-__OMP_RTL(__kmpc_distribute_static_loop_4u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32)
-__OMP_RTL(__kmpc_distribute_static_loop_8, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int64)
-__OMP_RTL(__kmpc_distribute_static_loop_8u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int64)
-__OMP_RTL(__kmpc_distribute_for_static_loop_4, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32, Int32, Int32)
-__OMP_RTL(__kmpc_distribute_for_static_loop_4u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32, Int32, Int32)
-__OMP_RTL(__kmpc_distribute_for_static_loop_8, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int64, Int64, Int64)
-__OMP_RTL(__kmpc_distribute_for_static_loop_8u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int64, Int64, Int64)
+__OMP_RTL(__kmpc_for_static_loop_4, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32, Int8)
+__OMP_RTL(__kmpc_for_static_loop_4u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32, Int8)
+__OMP_RTL(__kmpc_for_static_loop_8, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int64, Int8)
+__OMP_RTL(__kmpc_for_static_loop_8u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int64, Int8)
+__OMP_RTL(__kmpc_distribute_static_loop_4, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int8)
+__OMP_RTL(__kmpc_distribute_static_loop_4u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int8)
+__OMP_RTL(__kmpc_distribute_static_loop_8, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int8)
+__OMP_RTL(__kmpc_distribute_static_loop_8u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int8)
+__OMP_RTL(__kmpc_distribute_for_static_loop_4, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32, Int32, Int8)
+__OMP_RTL(__kmpc_distribute_for_static_loop_4u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32, Int32, Int8)
+__OMP_RTL(__kmpc_distribute_for_static_loop_8, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int64, Int64, Int8)
+__OMP_RTL(__kmpc_distribute_for_static_loop_8u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int64, Int64, Int8)
__OMP_RTL(__kmpc_kernel_parallel, false, Int1, VoidPtrPtr)
__OMP_RTL(__kmpc_kernel_end_parallel, false, Void, )
__OMP_RTL(__kmpc_serialized_parallel, false, Void, IdentPtr, Int32)
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 862d511b8c819..c7c9656194550 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -4550,7 +4550,7 @@ static void createTargetLoopWorkshareCall(OpenMPIRBuilder *OMPBuilder,
RealArgs.push_back(TripCount);
if (LoopType == WorksharingLoopType::DistributeStaticLoop) {
RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
- RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
+ RealArgs.push_back(ConstantInt::get(Builder.getInt8Ty(), 0));
Builder.restoreIP({InsertBlock, std::prev(InsertBlock->end())});
Builder.CreateCall(RTLFn, RealArgs);
return;
@@ -4566,7 +4566,7 @@ static void createTargetLoopWorkshareCall(OpenMPIRBuilder *OMPBuilder,
if (LoopType == WorksharingLoopType::DistributeForStaticLoop) {
RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
}
- RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
+ RealArgs.push_back(ConstantInt::get(Builder.getInt8Ty(), 0));
Builder.CreateCall(RTLFn, RealArgs);
}
diff --git a/mlir/test/Target/LLVMIR/omptarget-parallel-wsloop.mlir b/mlir/test/Target/LLVMIR/omptarget-parallel-wsloop.mlir
index 830610f12a5d2..5d2861a5d0f35 100644
--- a/mlir/test/Target/LLVMIR/omptarget-parallel-wsloop.mlir
+++ b/mlir/test/Target/LLVMIR/omptarget-parallel-wsloop.mlir
@@ -37,7 +37,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memo
// CHECK-SAME: #[[ATTRS1:[0-9]+]]
// CHECK: call void @__kmpc_for_static_loop_4u(ptr addrspacecast (ptr addrspace(1) @[[GLOB]] to ptr),
// CHECK-SAME: ptr @[[LOOP_BODY_FUNC:.*]], ptr %[[LOO_BODY_FUNC_ARG:.*]], i32 10,
-// CHECK-SAME: i32 %[[THREAD_NUM:.*]], i32 0)
+// CHECK-SAME: i32 %[[THREAD_NUM:.*]], i8 0)
// CHECK: define internal void @[[LOOP_BODY_FUNC]](i32 %[[CNT:.*]], ptr %[[LOOP_BODY_ARG_PTR:.*]]) #[[ATTRS2:[0-9]+]] {
diff --git a/mlir/test/Target/LLVMIR/omptarget-wsloop-collapsed.mlir b/mlir/test/Target/LLVMIR/omptarget-wsloop-collapsed.mlir
index 0ebcec0e0ec31..b42e387acbb11 100644
--- a/mlir/test/Target/LLVMIR/omptarget-wsloop-collapsed.mlir
+++ b/mlir/test/Target/LLVMIR/omptarget-wsloop-collapsed.mlir
@@ -25,7 +25,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memo
// CHECK: define void @[[FUNC_COLLAPSED_WSLOOP:.*]](ptr %[[ARG0:.*]])
// CHECK: call void @__kmpc_for_static_loop_4u(ptr addrspacecast (ptr addrspace(1) @[[GLOB2:[0-9]+]] to ptr),
// CHECK-SAME: ptr @[[COLLAPSED_WSLOOP_BODY_FN:.*]], ptr %[[STRUCT_ARG:.*]], i32 10000,
-// CHECK-SAME: i32 %[[NUM_THREADS:.*]], i32 0)
+// CHECK-SAME: i32 %[[NUM_THREADS:.*]], i8 0)
// CHECK: define internal void @[[COLLAPSED_WSLOOP_BODY_FN]](i32 %[[LOOP_CNT:.*]], ptr %[[LOOP_BODY_ARG:.*]])
// CHECK: %[[TMP0:.*]] = urem i32 %[[LOOP_CNT]], 100
diff --git a/mlir/test/Target/LLVMIR/omptarget-wsloop.mlir b/mlir/test/Target/LLVMIR/omptarget-wsloop.mlir
index d5e79f340d338..7be635f46111b 100644
--- a/mlir/test/Target/LLVMIR/omptarget-wsloop.mlir
+++ b/mlir/test/Target/LLVMIR/omptarget-wsloop.mlir
@@ -37,7 +37,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memo
// CHECK: %[[GEP:.*]] = getelementptr { ptr }, ptr addrspace(5) %[[STRUCTARG]], i32 0, i32 0
// CHECK: store ptr %[[ARG0]], ptr addrspace(5) %[[GEP]], align 8
// CHECK: %[[NUM_THREADS:.*]] = call i32 @omp_get_num_threads()
-// CHECK: call void @__kmpc_for_static_loop_4u(ptr addrspacecast (ptr addrspace(1) @[[GLOB1:[0-9]+]] to ptr), ptr @[[LOOP_BODY_FN:.*]], ptr %[[STRUCTARG_ASCAST]], i32 10, i32 %[[NUM_THREADS]], i32 0, i32 0)
+// CHECK: call void @__kmpc_for_static_loop_4u(ptr addrspacecast (ptr addrspace(1) @[[GLOB1:[0-9]+]] to ptr), ptr @[[LOOP_BODY_FN:.*]], ptr %[[STRUCTARG_ASCAST]], i32 10, i32 %[[NUM_THREADS]], i32 0, i8 0)
// CHECK: define internal void @[[LOOP_BODY_FN]](i32 %[[LOOP_CNT:.*]], ptr %[[LOOP_BODY_ARG:.*]])
// CHECK: %[[GEP2:.*]] = getelementptr { ptr }, ptr %[[LOOP_BODY_ARG]], i32 0, i32 0
@@ -46,6 +46,6 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memo
// CHECK: store i32 %[[VAL0:.*]], ptr %[[GEP3]], align 4
// CHECK: define void @[[FUNC_EMPTY_WSLOOP:.*]]()
-// CHECK: call void @__kmpc_for_static_loop_4u(ptr addrspacecast (ptr addrspace(1) @[[GLOB2:[0-9]+]] to ptr), ptr @[[LOOP_EMPTY_BODY_FN:.*]], ptr null, i32 10, i32 %[[NUM_THREADS:.*]], i32 0)
+// CHECK: call void @__kmpc_for_static_loop_4u(ptr addrspacecast (ptr addrspace(1) @[[GLOB2:[0-9]+]] to ptr), ptr @[[LOOP_EMPTY_BODY_FN:.*]], ptr null, i32 10, i32 %[[NUM_THREADS:.*]], i32 0, i8 0)
// CHECK: define internal void @[[LOOP_EMPTY_BODY_FN]](i32 %[[LOOP_CNT:.*]])
diff --git a/offload/DeviceRTL/src/Workshare.cpp b/offload/DeviceRTL/src/Workshare.cpp
index a5f3ef2f3d940..f35b6fe94ab1b 100644
--- a/offload/DeviceRTL/src/Workshare.cpp
+++ b/offload/DeviceRTL/src/Workshare.cpp
@@ -767,9 +767,17 @@ template <typename Ty> class StaticLoopChunker {
public:
/// Worksharing `for`-loop.
+ /// \param[in] Loc Description of source location
+ /// \param[in] LoopBody Function which corresponds to loop body
+ /// \param[in] Arg Pointer to struct which contains loop body args
+ /// \param[in] NumIters Number of loop iterations
+ /// \param[in] NumThreads Number of GPU threads
+ /// \param[in] ThreadChunk Size of thread chunk
+ /// \param[in] OneIterationPerThread Assume that one thread executes
+ /// only one loop iter or one thread chunk
static void For(IdentTy *Loc, void (*LoopBody)(Ty, void *), void *Arg,
Ty NumIters, Ty NumThreads, Ty ThreadChunk,
- Ty OneIterationPerThread) {
+ int8_t OneIterationPerThread) {
ASSERT(NumIters >= 0, "Bad iteration count");
ASSERT(ThreadChunk >= 0, "Bad thread count");
@@ -808,6 +816,13 @@ template <typename Ty> class StaticLoopChunker {
}
/// Worksharing `distribute`-loop.
+ /// \param[in] Loc Description of source location
+ /// \param[in] LoopBody Function which corresponds to loop body
+ /// \param[in] Arg Pointer to struct which contains loop body args
+ /// \param[in] NumIters Number of loop iterations
+ /// \param[in] BlockChunk Size of block chunk
+ /// \param[in] OneIterationPerThread Assume that one thread executes
+ /// only one loop iter or one block chunk
static void Distribute(IdentTy *Loc, void (*LoopBody)(Ty, void *), void *Arg,
Ty NumIters, Ty BlockChunk, Ty OneIterationPerThread) {
ASSERT(icv::Level == 0, "Bad distribute");
@@ -855,6 +870,15 @@ template <typename Ty> class StaticLoopChunker {
}
/// Worksharing `distribute parallel for`-loop.
+ /// \param[in] Loc Description of source location
+ /// \param[in] LoopBody Function which corresponds to loop body
+ /// \param[in] Arg Pointer to struct which contains loop body args
+ /// \param[in] NumIters Number of loop iterations
+ /// \param[in] NumThreads Number of GPU threads
+ /// \param[in] BlockChunk Size of block chunk
+ /// \param[in] ThreadChunk Size of thread chunk
+ /// \param[in] OneIterationPerThread Assume that one thread executes
+ /// only one loop iter or one block chunk
static void DistributeFor(IdentTy *Loc, void (*LoopBody)(Ty, void *),
void *Arg, Ty NumIters, Ty NumThreads,
Ty BlockChunk, Ty ThreadChunk,
@@ -915,7 +939,7 @@ template <typename Ty> class StaticLoopChunker {
__kmpc_distribute_for_static_loop##BW( \
IdentTy *loc, void (*fn)(TY, void *), void *arg, TY num_iters, \
TY num_threads, TY block_chunk, TY thread_chunk, \
- TY one_iteration_per_thread) { \
+ int8_t one_iteration_per_thread) { \
ompx::StaticLoopChunker<TY>::DistributeFor( \
loc, fn, arg, num_iters, num_threads, block_chunk, thread_chunk, \
one_iteration_per_thread); \
@@ -923,13 +947,13 @@ template <typename Ty> class StaticLoopChunker {
[[gnu::flatten, clang::always_inline]] void \
__kmpc_distribute_static_loop##BW(IdentTy *loc, void (*fn)(TY, void *), \
void *arg, TY num_iters, TY block_chunk, \
- TY one_iteration_per_thread) { \
+ int8_t one_iteration_per_thread) { \
ompx::StaticLoopChunker<TY>::Distribute( \
loc, fn, arg, num_iters, block_chunk, one_iteration_per_thread); \
} \
[[gnu::flatten, clang::always_inline]] void __kmpc_for_static_loop##BW( \
IdentTy *loc, void (*fn)(TY, void *), void *arg, TY num_iters, \
- TY num_threads, TY thread_chunk, TY one_iteration_per_thread) { \
+ TY num_threads, TY thread_chunk, int8_t one_iteration_per_thread) { \
ompx::StaticLoopChunker<TY>::For(loc, fn, arg, num_iters, num_threads, \
thread_chunk, one_iteration_per_thread); \
}
>From 0407c7d878740a416ed109caa3b6bbb0a43feb79 Mon Sep 17 00:00:00 2001
From: Dominik Adamski <dominik.adamski at amd.com>
Date: Mon, 11 Aug 2025 07:15:36 -0500
Subject: [PATCH 5/5] use zext
---
llvm/include/llvm/Frontend/OpenMP/OMPKinds.def | 6 +++---
offload/DeviceRTL/src/Workshare.cpp | 16 ++++++++--------
2 files changed, 11 insertions(+), 11 deletions(-)
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
index e388f13c2c4d6..5f0b6e9521395 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
@@ -674,19 +674,19 @@ __OMP_RTL_ATTRS(__kmpc_cancel_barrier, BarrierAttrs, SExt,
ParamAttrs(ReadOnlyPtrAttrs, SExt))
__OMP_RTL_ATTRS(__kmpc_distribute_for_static_loop_4, AlwaysInlineAttrs, AttributeSet(),
ParamAttrs(ReadOnlyPtrAttrs, AttributeSet(), AttributeSet(),
- SExt, SExt, SExt, SExt, SExt))
+ SExt, SExt, SExt, SExt, ZExt))
__OMP_RTL_ATTRS(__kmpc_distribute_for_static_loop_4u, AlwaysInlineAttrs, AttributeSet(),
ParamAttrs(ReadOnlyPtrAttrs, AttributeSet(), AttributeSet(),
ZExt, ZExt, ZExt, ZExt, ZExt))
__OMP_RTL_ATTRS(__kmpc_distribute_static_loop_4, AlwaysInlineAttrs, AttributeSet(),
ParamAttrs(ReadOnlyPtrAttrs, AttributeSet(), AttributeSet(),
- SExt, SExt, SExt))
+ SExt, SExt, ZExt))
__OMP_RTL_ATTRS(__kmpc_distribute_static_loop_4u, AlwaysInlineAttrs, AttributeSet(),
ParamAttrs(ReadOnlyPtrAttrs, AttributeSet(), AttributeSet(),
ZExt, ZExt, ZExt))
__OMP_RTL_ATTRS(__kmpc_for_static_loop_4, AlwaysInlineAttrs, AttributeSet(),
ParamAttrs(ReadOnlyPtrAttrs, AttributeSet(), AttributeSet(),
- SExt, SExt, SExt, SExt))
+ SExt, SExt, SExt, ZExt))
__OMP_RTL_ATTRS(__kmpc_for_static_loop_4u, AlwaysInlineAttrs, AttributeSet(),
ParamAttrs(ReadOnlyPtrAttrs, AttributeSet(), AttributeSet(),
ZExt, ZExt, ZExt, ZExt))
diff --git a/offload/DeviceRTL/src/Workshare.cpp b/offload/DeviceRTL/src/Workshare.cpp
index f35b6fe94ab1b..1355f35a1f462 100644
--- a/offload/DeviceRTL/src/Workshare.cpp
+++ b/offload/DeviceRTL/src/Workshare.cpp
@@ -698,7 +698,7 @@ template <typename Ty> class StaticLoopChunker {
static void NormalizedLoopNestNoChunk(void (*LoopBody)(Ty, void *), void *Arg,
Ty NumBlocks, Ty BId, Ty NumThreads,
Ty TId, Ty NumIters,
- bool OneIterationPerThread) {
+ uint8_t OneIterationPerThread) {
Ty KernelIteration = NumBlocks * NumThreads;
// Start index in the normalized space.
@@ -729,7 +729,7 @@ template <typename Ty> class StaticLoopChunker {
Ty BlockChunk, Ty NumBlocks, Ty BId,
Ty ThreadChunk, Ty NumThreads, Ty TId,
Ty NumIters,
- bool OneIterationPerThread) {
+ uint8_t OneIterationPerThread) {
Ty KernelIteration = NumBlocks * BlockChunk;
// Start index in the chunked space.
@@ -777,7 +777,7 @@ template <typename Ty> class StaticLoopChunker {
/// only one loop iter or one thread chunk
static void For(IdentTy *Loc, void (*LoopBody)(Ty, void *), void *Arg,
Ty NumIters, Ty NumThreads, Ty ThreadChunk,
- int8_t OneIterationPerThread) {
+ uint8_t OneIterationPerThread) {
ASSERT(NumIters >= 0, "Bad iteration count");
ASSERT(ThreadChunk >= 0, "Bad thread count");
@@ -824,7 +824,7 @@ template <typename Ty> class StaticLoopChunker {
/// \param[in] OneIterationPerThread Assume that one thread executes
/// only one loop iter or one block chunk
static void Distribute(IdentTy *Loc, void (*LoopBody)(Ty, void *), void *Arg,
- Ty NumIters, Ty BlockChunk, Ty OneIterationPerThread) {
+ Ty NumIters, Ty BlockChunk, uint8_t OneIterationPerThread) {
ASSERT(icv::Level == 0, "Bad distribute");
ASSERT(icv::ActiveLevel == 0, "Bad distribute");
ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute");
@@ -882,7 +882,7 @@ template <typename Ty> class StaticLoopChunker {
static void DistributeFor(IdentTy *Loc, void (*LoopBody)(Ty, void *),
void *Arg, Ty NumIters, Ty NumThreads,
Ty BlockChunk, Ty ThreadChunk,
- Ty OneIterationPerThread) {
+ uint8_t OneIterationPerThread) {
ASSERT(icv::Level == 1, "Bad distribute");
ASSERT(icv::ActiveLevel == 1, "Bad distribute");
ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute");
@@ -939,7 +939,7 @@ template <typename Ty> class StaticLoopChunker {
__kmpc_distribute_for_static_loop##BW( \
IdentTy *loc, void (*fn)(TY, void *), void *arg, TY num_iters, \
TY num_threads, TY block_chunk, TY thread_chunk, \
- int8_t one_iteration_per_thread) { \
+ uint8_t one_iteration_per_thread) { \
ompx::StaticLoopChunker<TY>::DistributeFor( \
loc, fn, arg, num_iters, num_threads, block_chunk, thread_chunk, \
one_iteration_per_thread); \
@@ -947,13 +947,13 @@ template <typename Ty> class StaticLoopChunker {
[[gnu::flatten, clang::always_inline]] void \
__kmpc_distribute_static_loop##BW(IdentTy *loc, void (*fn)(TY, void *), \
void *arg, TY num_iters, TY block_chunk, \
- int8_t one_iteration_per_thread) { \
+ uint8_t one_iteration_per_thread) { \
ompx::StaticLoopChunker<TY>::Distribute( \
loc, fn, arg, num_iters, block_chunk, one_iteration_per_thread); \
} \
[[gnu::flatten, clang::always_inline]] void __kmpc_for_static_loop##BW( \
IdentTy *loc, void (*fn)(TY, void *), void *arg, TY num_iters, \
- TY num_threads, TY thread_chunk, int8_t one_iteration_per_thread) { \
+ TY num_threads, TY thread_chunk, uint8_t one_iteration_per_thread) { \
ompx::StaticLoopChunker<TY>::For(loc, fn, arg, num_iters, num_threads, \
thread_chunk, one_iteration_per_thread); \
}