[Openmp-commits] [llvm] [openmp] [OpenMP] New Openmp device RTL functions (PR #73225)
Dominik Adamski via Openmp-commits
openmp-commits at lists.llvm.org
Thu Nov 23 02:00:32 PST 2023
https://github.com/DominikAdamski created https://github.com/llvm/llvm-project/pull/73225
Add new implementation of workshare loop functions.
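For context, a minimal sketch of the kind of user code these entry points are meant to serve. The lowering described in the comment is an assumption about how a compiler could use the new functions, not something prescribed by this patch:

// Hypothetical example: a compiler targeting the new DeviceRTL entry points
// could outline the loop body and emit a call to
// __kmpc_distribute_for_static_loop_4 (or one of its siblings) for this loop
// instead of the classic __kmpc_for_static_init/fini pair.
void saxpy(float a, const float *x, float *y, int n) {
#pragma omp target teams distribute parallel for map(to : x[0:n]) map(tofrom : y[0:n])
  for (int i = 0; i < n; ++i)
    y[i] = a * x[i] + y[i];
}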
From 72aa7101443d0a841a7982d50ea407ab78bcbb3b Mon Sep 17 00:00:00 2001
From: Johannes Doerfert <johannes at jdoerfert.de>
Date: Tue, 5 Sep 2023 02:43:46 -0400
Subject: [PATCH 1/3] [OpenMP] Implement new Worksharing and Distribute OpenMP
DeviceRTL functions
---
.../libomptarget/DeviceRTL/src/Workshare.cpp | 252 ++++++++++++++++++
1 file changed, 252 insertions(+)
diff --git a/openmp/libomptarget/DeviceRTL/src/Workshare.cpp b/openmp/libomptarget/DeviceRTL/src/Workshare.cpp
index 0dbfafc4d699e7e4..20eefdc9ce40c325 100644
--- a/openmp/libomptarget/DeviceRTL/src/Workshare.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Workshare.cpp
@@ -45,6 +45,9 @@ struct DynamicScheduleTracker {
#pragma omp begin declare target device_type(nohost)
+extern int32_t __omp_rtl_assume_teams_oversubscription;
+extern int32_t __omp_rtl_assume_threads_oversubscription;
+
// TODO: This variable is a hack inherited from the old runtime.
static uint64_t SHARED(Cnt);
@@ -636,4 +639,253 @@ void __kmpc_for_static_fini(IdentTy *loc, int32_t global_tid) {}
void __kmpc_distribute_static_fini(IdentTy *loc, int32_t global_tid) {}
}
+namespace ompx {
+
+/// Helper class to hide the generic loop nest and provide the template argument
+/// throughout.
+template <typename Ty> class StaticLoopChunker {
+
+ /// Generic loop nest that handles block and/or thread distribution in the
+ /// absence of user-specified chunk sizes. This implicitly picks a block chunk
+ /// size equal to the number of threads in the block and a thread chunk size
+ /// equal to one. In contrast to the chunked version, we can get away with a
+ /// single loop in this case.
+ static void NormalizedLoopNestNoChunk(void (*LoopBody)(Ty, void *), void *Arg,
+ Ty NumBlocks, Ty BId, Ty NumThreads,
+ Ty TId, Ty NumIters,
+ bool OneIterationPerThread) {
+ Ty KernelIteration = NumBlocks * NumThreads;
+
+ // Start index in the normalized space.
+ Ty IV = BId * NumThreads + TId;
+ ASSERT(IV >= 0, "Bad index");
+
+ // Cover the entire iteration space; assumptions in the caller might allow
+ // simplifying this loop to a conditional.
+ if (IV < NumIters) {
+ do {
+
+ // Execute the loop body.
+ LoopBody(IV, Arg);
+
+ // Every thread has executed one block and thread chunk now.
+ IV += KernelIteration;
+
+ if (OneIterationPerThread)
+ return;
+
+ } while (IV < NumIters);
+ }
+ }
+
+ /// Generic loop nest that handles block and/or thread distribution in the
+ /// presence of user-specified chunk sizes (for at least one of them).
+ static void NormalizedLoopNestChunked(void (*LoopBody)(Ty, void *), void *Arg,
+ Ty BlockChunk, Ty NumBlocks, Ty BId,
+ Ty ThreadChunk, Ty NumThreads, Ty TId,
+ Ty NumIters,
+ bool OneIterationPerThread) {
+ Ty KernelIteration = NumBlocks * BlockChunk;
+
+ // Start index in the chunked space.
+ Ty IV = BId * BlockChunk + TId;
+ ASSERT(IV >= 0, "Bad index");
+
+ // Cover the entire iteration space; assumptions in the caller might allow
+ // simplifying this loop to a conditional.
+ do {
+
+ Ty BlockChunkLeft =
+ BlockChunk >= TId * ThreadChunk ? BlockChunk - TId * ThreadChunk : 0;
+ Ty ThreadChunkLeft =
+ ThreadChunk <= BlockChunkLeft ? ThreadChunk : BlockChunkLeft;
+
+ while (ThreadChunkLeft--) {
+
+ // Given the blocking, it's hard to keep track of what to execute.
+ if (IV >= NumIters)
+ return;
+
+ // Execute the loop body.
+ LoopBody(IV, Arg);
+
+ if (OneIterationPerThread)
+ return;
+
+ ++IV;
+ }
+
+ IV += KernelIteration;
+
+ } while (IV < NumIters);
+ }
+
+public:
+ /// Worksharing `for`-loop.
+ static void For(IdentTy *Loc, void (*LoopBody)(Ty, void *), void *Arg,
+ Ty NumIters, Ty NumThreads, Ty ThreadChunk) {
+ ASSERT(NumIters >= 0, "Bad iteration count");
+ ASSERT(ThreadChunk >= 0, "Bad thread count");
+
+ // All threads need to participate, but we don't know if we are in a
+ // parallel region at all, or if the user might have used a `num_threads`
+ // clause on the parallel and reduced the number compared to the block
+ // size. Since nested parallels are possible too, we need to get the
+ // thread id from the `omp` getter and not from the mapping directly.
+ Ty TId = omp_get_thread_num();
+
+ // There are no blocks involved here.
+ Ty BlockChunk = 0;
+ Ty NumBlocks = 1;
+ Ty BId = 0;
+
+ // If the thread chunk is not specified we pick a default now.
+ if (ThreadChunk == 0)
+ ThreadChunk = 1;
+
+ // If we know we have more threads than iterations we can indicate that to
+ // avoid an outer loop.
+ bool OneIterationPerThread = false;
+ if (__omp_rtl_assume_threads_oversubscription) {
+ ASSERT(NumThreads >= NumIters, "Broken assumption");
+ OneIterationPerThread = true;
+ }
+
+ if (ThreadChunk != 1)
+ NormalizedLoopNestChunked(LoopBody, Arg, BlockChunk, NumBlocks, BId,
+ ThreadChunk, NumThreads, TId, NumIters,
+ OneIterationPerThread);
+ else
+ NormalizedLoopNestNoChunk(LoopBody, Arg, NumBlocks, BId, NumThreads, TId,
+ NumIters, OneIterationPerThread);
+ }
+
+ /// Worksharing `distribute`-loop.
+ static void Distribute(IdentTy *Loc, void (*LoopBody)(Ty, void *), void *Arg,
+ Ty NumIters, Ty BlockChunk) {
+ ASSERT(icv::Level == 0, "Bad distribute");
+ ASSERT(icv::ActiveLevel == 0, "Bad distribute");
+ ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute");
+ ASSERT(state::ParallelTeamSize == 1, "Bad distribute");
+
+ ASSERT(NumIters >= 0, "Bad iteration count");
+ ASSERT(BlockChunk >= 0, "Bad block count");
+
+ // There are no threads involved here.
+ Ty ThreadChunk = 0;
+ Ty NumThreads = 1;
+ Ty TId = 0;
+ ASSERT(TId == mapping::getThreadIdInBlock(), "Bad thread id");
+
+ // All teams need to participate.
+ Ty NumBlocks = mapping::getNumberOfBlocks();
+ Ty BId = mapping::getBlockId();
+
+ // If the block chunk is not specified we pick a default now.
+ if (BlockChunk == 0)
+ BlockChunk = NumThreads;
+
+ // If we know we have more blocks than iterations we can indicate that to
+ // avoid an outer loop.
+ bool OneIterationPerThread = false;
+ if (__omp_rtl_assume_teams_oversubscription) {
+ ASSERT(NumBlocks >= NumIters, "Broken assumption");
+ OneIterationPerThread = true;
+ }
+
+ if (BlockChunk != NumThreads)
+ NormalizedLoopNestChunked(LoopBody, Arg, BlockChunk, NumBlocks, BId,
+ ThreadChunk, NumThreads, TId, NumIters,
+ OneIterationPerThread);
+ else
+ NormalizedLoopNestNoChunk(LoopBody, Arg, NumBlocks, BId, NumThreads, TId,
+ NumIters, OneIterationPerThread);
+
+ ASSERT(icv::Level == 0, "Bad distribute");
+ ASSERT(icv::ActiveLevel == 0, "Bad distribute");
+ ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute");
+ ASSERT(state::ParallelTeamSize == 1, "Bad distribute");
+ }
+
+ /// Worksharing `distribute parallel for`-loop.
+ static void DistributeFor(IdentTy *Loc, void (*LoopBody)(Ty, void *),
+ void *Arg, Ty NumIters, Ty NumThreads,
+ Ty BlockChunk, Ty ThreadChunk) {
+ ASSERT(icv::Level == 1, "Bad distribute");
+ ASSERT(icv::ActiveLevel == 1, "Bad distribute");
+ ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute");
+
+ ASSERT(NumIters >= 0, "Bad iteration count");
+ ASSERT(BlockChunk >= 0, "Bad block count");
+ ASSERT(ThreadChunk >= 0, "Bad thread count");
+
+ // All threads need to participate, but the user might have used a
+ // `num_threads` clause on the parallel and reduced the number compared
+ // to the block size.
+ Ty TId = mapping::getThreadIdInBlock();
+
+ // All teams need to participate.
+ Ty NumBlocks = mapping::getNumberOfBlocks();
+ Ty BId = mapping::getBlockId();
+
+ // If the block chunk is not specified we pick a default now.
+ if (BlockChunk == 0)
+ BlockChunk = NumThreads;
+
+ // If the thread chunk is not specified we pick a default now.
+ if (ThreadChunk == 0)
+ ThreadChunk = 1;
+
+ // If we know we have more threads (across all blocks) than iterations we
+ // can indicate that to avoid an outer loop.
+ bool OneIterationPerThread = false;
+ if (__omp_rtl_assume_teams_oversubscription &
+ __omp_rtl_assume_threads_oversubscription) {
+ OneIterationPerThread = true;
+ ASSERT(NumBlocks * NumThreads >= NumIters, "Broken assumption");
+ }
+
+ if (BlockChunk != NumThreads || ThreadChunk != 1)
+ NormalizedLoopNestChunked(LoopBody, Arg, BlockChunk, NumBlocks, BId,
+ ThreadChunk, NumThreads, TId, NumIters,
+ OneIterationPerThread);
+ else
+ NormalizedLoopNestNoChunk(LoopBody, Arg, NumBlocks, BId, NumThreads, TId,
+ NumIters, OneIterationPerThread);
+
+ ASSERT(icv::Level == 1, "Bad distribute");
+ ASSERT(icv::ActiveLevel == 1, "Bad distribute");
+ ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute");
+ }
+};
+
+} // namespace ompx
+
+#define OMP_LOOP_ENTRY(BW, TY) \
+ __attribute__((flatten)) void __kmpc_distribute_for_static_loop##BW( \
+ IdentTy *loc, void (*fn)(TY, void *), void *arg, TY num_iters, \
+ TY num_threads, TY block_chunk, TY thread_chunk) { \
+ ompx::StaticLoopChunker<TY>::DistributeFor( \
+ loc, fn, arg, num_iters + 1, num_threads, block_chunk, thread_chunk); \
+ } \
+ __attribute__((flatten)) void __kmpc_distribute_static_loop##BW( \
+ IdentTy *loc, void (*fn)(TY, void *), void *arg, TY num_iters, \
+ TY block_chunk) { \
+ ompx::StaticLoopChunker<TY>::Distribute(loc, fn, arg, num_iters + 1, \
+ block_chunk); \
+ } \
+ __attribute__((flatten)) void __kmpc_for_static_loop##BW( \
+ IdentTy *loc, void (*fn)(TY, void *), void *arg, TY num_iters, \
+ TY num_threads, TY thread_chunk) { \
+ ompx::StaticLoopChunker<TY>::For(loc, fn, arg, num_iters + 1, num_threads, \
+ thread_chunk); \
+ }
+
+extern "C" {
+OMP_LOOP_ENTRY(_4, int32_t)
+OMP_LOOP_ENTRY(_4u, uint32_t)
+OMP_LOOP_ENTRY(_8, int64_t)
+OMP_LOOP_ENTRY(_8u, uint64_t)
+}
+
#pragma omp end declare target
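For readers skimming the macro above, this is roughly what OMP_LOOP_ENTRY(_4, int32_t) expands to for the plain worksharing entry point (hand-expanded for illustration; the distribute and distribute-for entry points follow the same pattern):

// Hand expansion of one instantiation of OMP_LOOP_ENTRY; illustration only.
__attribute__((flatten)) void __kmpc_for_static_loop_4(
    IdentTy *loc, void (*fn)(int32_t, void *), void *arg, int32_t num_iters,
    int32_t num_threads, int32_t thread_chunk) {
  // Forwards to the templated chunker; note the `num_iters + 1` as written
  // in this patch.
  ompx::StaticLoopChunker<int32_t>::For(loc, fn, arg, num_iters + 1,
                                        num_threads, thread_chunk);
}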
From 8b87ac99c6503d40dcbccb83628395a7997f20fa Mon Sep 17 00:00:00 2001
From: Dominik Adamski <dominik.adamski at amd.com>
Date: Wed, 22 Nov 2023 07:20:49 -0600
Subject: [PATCH 2/3] [OpenMP] Fix build issues
---
openmp/libomptarget/DeviceRTL/src/Workshare.cpp | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/openmp/libomptarget/DeviceRTL/src/Workshare.cpp b/openmp/libomptarget/DeviceRTL/src/Workshare.cpp
index 20eefdc9ce40c325..da743884ccf7ced1 100644
--- a/openmp/libomptarget/DeviceRTL/src/Workshare.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Workshare.cpp
@@ -778,8 +778,8 @@ template <typename Ty> class StaticLoopChunker {
ASSERT(TId == mapping::getThreadIdInBlock(), "Bad thread id");
// All teams need to participate.
- Ty NumBlocks = mapping::getNumberOfBlocks();
- Ty BId = mapping::getBlockId();
+ Ty NumBlocks = mapping::getNumberOfBlocksInKernel();
+ Ty BId = mapping::getBlockIdInKernel();
// If the block chunk is not specified we pick a default now.
if (BlockChunk == 0)
@@ -825,8 +825,8 @@ template <typename Ty> class StaticLoopChunker {
Ty TId = mapping::getThreadIdInBlock();
// All teams need to participate.
- Ty NumBlocks = mapping::getNumberOfBlocks();
- Ty BId = mapping::getBlockId();
+ Ty NumBlocks = mapping::getNumberOfBlocksInKernel();
+ Ty BId = mapping::getBlockIdInKernel();
// If the block chunk is not specified we pick a default now.
if (BlockChunk == 0)
From e5eaff37ef78460ce5182ab0258fef0ab56e9d89 Mon Sep 17 00:00:00 2001
From: Dominik Adamski <dominik.adamski at amd.com>
Date: Wed, 22 Nov 2023 07:21:10 -0600
Subject: [PATCH 3/3] [OpenMP] Add new OpenMP function definitions
---
llvm/include/llvm/Frontend/OpenMP/OMPKinds.def | 12 ++++++++++++
1 file changed, 12 insertions(+)
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
index 6a719d2311422306..04c926004f72ef97 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
@@ -464,6 +464,18 @@ __OMP_RTL(__kmpc_target_deinit, false, Void,)
__OMP_RTL(__kmpc_kernel_prepare_parallel, false, Void, VoidPtr)
__OMP_RTL(__kmpc_parallel_51, false, Void, IdentPtr, Int32, Int32, Int32, Int32,
VoidPtr, VoidPtr, VoidPtrPtr, SizeTy)
+__OMP_RTL(__kmpc_for_static_loop_4, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32)
+__OMP_RTL(__kmpc_for_static_loop_4u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32)
+__OMP_RTL(__kmpc_for_static_loop_8, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int64)
+__OMP_RTL(__kmpc_for_static_loop_8u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int64)
+__OMP_RTL(__kmpc_distribute_static_loop_4, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32)
+__OMP_RTL(__kmpc_distribute_static_loop_4u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32)
+__OMP_RTL(__kmpc_distribute_static_loop_8, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64)
+__OMP_RTL(__kmpc_distribute_static_loop_8u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64)
+__OMP_RTL(__kmpc_distribute_for_static_loop_4, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32, Int32)
+__OMP_RTL(__kmpc_distribute_for_static_loop_4u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32, Int32)
+__OMP_RTL(__kmpc_distribute_for_static_loop_8, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int64, Int64)
+__OMP_RTL(__kmpc_distribute_for_static_loop_8u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int64, Int64)
__OMP_RTL(__kmpc_kernel_parallel, false, Int1, VoidPtrPtr)
__OMP_RTL(__kmpc_kernel_end_parallel, false, Void, )
__OMP_RTL(__kmpc_serialized_parallel, false, Void, IdentPtr, Int32)
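In C terms, the new entries correspond roughly to the declarations below. This is a sketch inferred from the __OMP_RTL argument lists and the DeviceRTL definitions in the first patch; OMPKinds.def registers the outlined-body parameter as a plain VoidPtr, so the typed function pointer shown here is illustrative:

// Illustrative prototypes for the signed 32-bit variants; the _4u, _8 and
// _8u variants differ only in the induction-variable type.
void __kmpc_for_static_loop_4(IdentTy *loc, void (*fn)(int32_t, void *),
                              void *arg, int32_t num_iters,
                              int32_t num_threads, int32_t thread_chunk);
void __kmpc_distribute_static_loop_4(IdentTy *loc, void (*fn)(int32_t, void *),
                                     void *arg, int32_t num_iters,
                                     int32_t block_chunk);
void __kmpc_distribute_for_static_loop_4(
    IdentTy *loc, void (*fn)(int32_t, void *), void *arg, int32_t num_iters,
    int32_t num_threads, int32_t block_chunk, int32_t thread_chunk);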