[Mlir-commits] [llvm] [mlir] [openmp] [Flang][OpenMP] Add support for schedule clause for GPU (PR #81618)

llvmlistbot at llvm.org llvmlistbot at llvm.org
Tue Feb 13 08:24:27 PST 2024


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-flang-openmp

Author: Dominik Adamski (DominikAdamski)

<details>
<summary>Changes</summary>

Scope of changes:
1) Fixed the handling of loop chunking in the OpenMP device runtime.
2) Pass the chunk value from the MLIR code to the OpenMP runtime.
3) Added an explicit check that only the static schedule is supported for target loops.

---
Full diff: https://github.com/llvm/llvm-project/pull/81618.diff


5 Files Affected:

- (modified) llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h (+5-1) 
- (modified) llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp (+37-12) 
- (modified) mlir/test/Target/LLVMIR/omptarget-wsloop.mlir (+18) 
- (modified) openmp/libomptarget/DeviceRTL/src/Workshare.cpp (+11-6) 
- (added) openmp/libomptarget/test/offloading/fortran/target-parallel-do-schedule-static-chunk.f90 (+33) 


``````````diff
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index 2288969ecc95c4..8d74b12dbc4ba1 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -925,11 +925,15 @@ class OpenMPIRBuilder {
   ///                 preheader of the loop.
   /// \param LoopType Information about type of loop worksharing.
   ///                 It corresponds to type of loop workshare OpenMP pragma.
+  /// \param ScheduleType Information about scheduling type.
+  /// \param ChunkSize    Value of chunk size for static schedule.
   ///
   /// \returns Point where to insert code after the workshare construct.
   InsertPointTy applyWorkshareLoopTarget(DebugLoc DL, CanonicalLoopInfo *CLI,
                                          InsertPointTy AllocaIP,
-                                         omp::WorksharingLoopType LoopType);
+                                         omp::WorksharingLoopType LoopType,
+                                         omp::OMPScheduleType ScheduleType,
+                                         Value *ChunkSize);
 
   /// Modifies the canonical loop to be a statically-scheduled workshare loop.
   ///
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 02b333e9ccd567..f9cbc39a24016d 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -2728,7 +2728,8 @@ getKmpcForStaticLoopForType(Type *Ty, OpenMPIRBuilder *OMPBuilder,
 static void createTargetLoopWorkshareCall(
     OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType,
     BasicBlock *InsertBlock, Value *Ident, Value *LoopBodyArg,
-    Type *ParallelTaskPtr, Value *TripCount, Function &LoopBodyFn) {
+    Type *ParallelTaskPtr, Value *TripCount, Function &LoopBodyFn,
+    Value *ThreadChunkSize) {
   Type *TripCountTy = TripCount->getType();
   Module &M = OMPBuilder->M;
   IRBuilder<> &Builder = OMPBuilder->Builder;
@@ -2751,9 +2752,21 @@ static void createTargetLoopWorkshareCall(
 
   RealArgs.push_back(
       Builder.CreateZExtOrTrunc(NumThreads, TripCountTy, "num.threads.cast"));
-  RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
-  if (LoopType == WorksharingLoopType::DistributeForStaticLoop) {
+  switch (LoopType) {
+  case WorksharingLoopType::DistributeForStaticLoop:
+    RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
+    ThreadChunkSize ? RealArgs.push_back(Builder.CreateZExtOrTrunc(
+                          ThreadChunkSize, TripCountTy))
+                    : RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
+    break;
+  case WorksharingLoopType::DistributeStaticLoop:
     RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
+    break;
+  case WorksharingLoopType::ForStaticLoop:
+    ThreadChunkSize ? RealArgs.push_back(Builder.CreateZExtOrTrunc(
+                          ThreadChunkSize, TripCountTy))
+                    : RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
+    break;
   }
 
   Builder.CreateCall(RTLFn, RealArgs);
@@ -2764,7 +2777,7 @@ workshareLoopTargetCallback(OpenMPIRBuilder *OMPIRBuilder,
                             CanonicalLoopInfo *CLI, Value *Ident,
                             Function &OutlinedFn, Type *ParallelTaskPtr,
                             const SmallVector<Instruction *, 4> &ToBeDeleted,
-                            WorksharingLoopType LoopType) {
+                            WorksharingLoopType LoopType, Value *ChunkSize) {
   IRBuilder<> &Builder = OMPIRBuilder->Builder;
   BasicBlock *Preheader = CLI->getPreheader();
   Value *TripCount = CLI->getTripCount();
@@ -2811,17 +2824,18 @@ workshareLoopTargetCallback(OpenMPIRBuilder *OMPIRBuilder,
 
   createTargetLoopWorkshareCall(OMPIRBuilder, LoopType, Preheader, Ident,
                                 LoopBodyArg, ParallelTaskPtr, TripCount,
-                                OutlinedFn);
+                                OutlinedFn, ChunkSize);
 
   for (auto &ToBeDeletedItem : ToBeDeleted)
     ToBeDeletedItem->eraseFromParent();
   CLI->invalidate();
 }
 
-OpenMPIRBuilder::InsertPointTy
-OpenMPIRBuilder::applyWorkshareLoopTarget(DebugLoc DL, CanonicalLoopInfo *CLI,
-                                          InsertPointTy AllocaIP,
-                                          WorksharingLoopType LoopType) {
+OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyWorkshareLoopTarget(
+    DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
+    WorksharingLoopType LoopType, OMPScheduleType EffectiveScheduleType,
+    Value *ChunkSize) {
+
   uint32_t SrcLocStrSize;
   Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
   Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
@@ -2833,6 +2847,16 @@ OpenMPIRBuilder::applyWorkshareLoopTarget(DebugLoc DL, CanonicalLoopInfo *CLI,
   // Instructions which need to be deleted at the end of code generation
   SmallVector<Instruction *, 4> ToBeDeleted;
 
+  // TODO: Add support for dynamic scheduling
+  switch (EffectiveScheduleType & ~OMPScheduleType::ModifierMask) {
+  case OMPScheduleType::BaseStatic:
+  case OMPScheduleType::BaseStaticChunked:
+    break;
+  default:
+    report_fatal_error(
+        "Unknown/unimplemented schedule kind for target workshare loop", false);
+  }
+
   OI.OuterAllocaBB = AllocaIP.getBlock();
 
   // Mark the body loop as region which needs to be extracted
@@ -2906,7 +2930,7 @@ OpenMPIRBuilder::applyWorkshareLoopTarget(DebugLoc DL, CanonicalLoopInfo *CLI,
   OI.PostOutlineCB = [=, ToBeDeletedVec =
                              std::move(ToBeDeleted)](Function &OutlinedFn) {
     workshareLoopTargetCallback(this, CLI, Ident, OutlinedFn, ParallelTaskPtr,
-                                ToBeDeletedVec, LoopType);
+                                ToBeDeletedVec, LoopType, ChunkSize);
   };
   addOutlineInfo(std::move(OI));
   return CLI->getAfterIP();
@@ -2918,11 +2942,12 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyWorkshareLoop(
     bool HasSimdModifier, bool HasMonotonicModifier,
     bool HasNonmonotonicModifier, bool HasOrderedClause,
     WorksharingLoopType LoopType) {
-  if (Config.isTargetDevice())
-    return applyWorkshareLoopTarget(DL, CLI, AllocaIP, LoopType);
   OMPScheduleType EffectiveScheduleType = computeOpenMPScheduleType(
       SchedKind, ChunkSize, HasSimdModifier, HasMonotonicModifier,
       HasNonmonotonicModifier, HasOrderedClause);
+  if (Config.isTargetDevice())
+    return applyWorkshareLoopTarget(DL, CLI, AllocaIP, LoopType,
+                                    EffectiveScheduleType, ChunkSize);
 
   bool IsOrdered = (EffectiveScheduleType & OMPScheduleType::ModifierOrdered) ==
                    OMPScheduleType::ModifierOrdered;
diff --git a/mlir/test/Target/LLVMIR/omptarget-wsloop.mlir b/mlir/test/Target/LLVMIR/omptarget-wsloop.mlir
index 220eb85b3483ec..a5f5d07262c8d9 100644
--- a/mlir/test/Target/LLVMIR/omptarget-wsloop.mlir
+++ b/mlir/test/Target/LLVMIR/omptarget-wsloop.mlir
@@ -25,6 +25,19 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memo
       }
     llvm.return
   }
+
+  llvm.func @target_wsloop_schedule_static_chunked(%arg0: !llvm.ptr ){
+      %loop_ub = llvm.mlir.constant(9 : i32) : i32
+      %loop_lb = llvm.mlir.constant(0 : i32) : i32
+      %loop_step = llvm.mlir.constant(1 : i32) : i32
+      %chunk = llvm.mlir.constant(2 : i32) : i32
+      omp.wsloop schedule(static = %chunk : i32) for  (%loop_cnt) : i32 = (%loop_lb) to (%loop_ub) inclusive step (%loop_step) {
+        %gep = llvm.getelementptr %arg0[0, %loop_cnt] : (!llvm.ptr, i32) -> !llvm.ptr, !llvm.array<10 x i32>
+        llvm.store %loop_cnt, %gep : i32, !llvm.ptr
+        omp.yield
+      }
+    llvm.return
+  }
 }
 
 // CHECK: define void @[[FUNC0:.*]](ptr %[[ARG0:.*]])
@@ -45,3 +58,8 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memo
 // CHECK:   call void @__kmpc_for_static_loop_4u(ptr addrspacecast (ptr addrspace(1) @[[GLOB2:[0-9]+]] to ptr), ptr @[[LOOP_EMPTY_BODY_FN:.*]], ptr null, i32 10, i32 %[[NUM_THREADS:.*]], i32 0)
 
 // CHECK: define internal void @[[LOOP_EMPTY_BODY_FN]](i32 %[[LOOP_CNT:.*]])
+
+// CHECK: define void @[[FUNC_SCHEDULE_STATIC_WSLOOP:.*]](ptr %[[ARG1:.*]])
+// CHECK:   call void @__kmpc_for_static_loop_4u(ptr addrspacecast (ptr addrspace(1) @[[GLOB3:[0-9]+]] to ptr), ptr @[[LOOP_BODY_SCHEDULE_STATIC_FN:.*]], ptr %[[SCHEDULE_LOOP_ARGS:.*]], i32 10, i32 %[[NUM_THREADS:.*]], i32 2)
+
+// CHECK: define internal void @[[LOOP_BODY_SCHEDULE_STATIC_FN]](i32 %[[LOOP_CNT:.*]], ptr %[[LOOP_BODY_ARG:.*]])
diff --git a/openmp/libomptarget/DeviceRTL/src/Workshare.cpp b/openmp/libomptarget/DeviceRTL/src/Workshare.cpp
index bcb7c5ad50a185..836d4f7f4934b4 100644
--- a/openmp/libomptarget/DeviceRTL/src/Workshare.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Workshare.cpp
@@ -685,17 +685,22 @@ template <typename Ty> class StaticLoopChunker {
     Ty KernelIteration = NumBlocks * BlockChunk;
 
     // Start index in the chunked space.
-    Ty IV = BId * BlockChunk + TId;
+    Ty IV = BId * BlockChunk + TId * ThreadChunk;
     ASSERT(IV >= 0, "Bad index");
 
+    // Make sure the starting index is within the kernel iteration boundaries.
+    if (IV >= KernelIteration)
+      return;
+
     // Cover the entire iteration space, assumptions in the caller might allow
     // to simplify this loop to a conditional.
     do {
 
       Ty BlockChunkLeft =
           BlockChunk >= TId * ThreadChunk ? BlockChunk - TId * ThreadChunk : 0;
-      Ty ThreadChunkLeft =
+      Ty EffectiveThreadChunk =
           ThreadChunk <= BlockChunkLeft ? ThreadChunk : BlockChunkLeft;
+      Ty ThreadChunkLeft = EffectiveThreadChunk;
 
       while (ThreadChunkLeft--) {
 
@@ -711,8 +716,8 @@ template <typename Ty> class StaticLoopChunker {
 
         ++IV;
       }
-
-      IV += KernelIteration;
+      // Start the new kernel iteration before the first thread chunk
+      IV += (KernelIteration - EffectiveThreadChunk);
 
     } while (IV < NumIters);
   }
@@ -731,8 +736,8 @@ template <typename Ty> class StaticLoopChunker {
     // from the `omp` getter and not the mapping directly.
     Ty TId = omp_get_thread_num();
 
-    // There are no blocks involved here.
-    Ty BlockChunk = 0;
+    // There is only one block for the whole iteration space.
+    Ty BlockChunk = NumIters;
     Ty NumBlocks = 1;
     Ty BId = 0;
 
diff --git a/openmp/libomptarget/test/offloading/fortran/target-parallel-do-schedule-static-chunk.f90 b/openmp/libomptarget/test/offloading/fortran/target-parallel-do-schedule-static-chunk.f90
new file mode 100644
index 00000000000000..f0b444f6ddc66f
--- /dev/null
+++ b/openmp/libomptarget/test/offloading/fortran/target-parallel-do-schedule-static-chunk.f90
@@ -0,0 +1,33 @@
+! Basic offloading test with a target region
+! REQUIRES: flang
+! UNSUPPORTED: nvptx64-nvidia-cuda-LTO
+! UNSUPPORTED: aarch64-unknown-linux-gnu
+! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+! UNSUPPORTED: x86_64-pc-linux-gnu
+! UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+
+! RUN: %libomptarget-compile-fortran-generic
+! RUN: env LIBOMPTARGET_INFO=16 %libomptarget-run-generic 2>&1 | %fcheck-generic
+program main
+   use omp_lib
+   integer :: x(100)
+   integer :: errors = 0
+   integer :: i
+
+   !$omp target parallel do schedule(static, 5) map(from: x)
+   do i = 1, 100
+       x(i) = i
+   end do
+   !$omp end target parallel do
+   do i = 1, 100
+       if ( x(i) .ne. i ) then
+           errors = errors + 1
+       end if
+   end do
+
+   print *,"number of errors: ", errors
+
+end program main
+
+! CHECK:  "PluginInterface" device {{[0-9]+}} info: Launching kernel {{.*}}
+! CHECK:  number of errors: 0

``````````

</details>


https://github.com/llvm/llvm-project/pull/81618


More information about the Mlir-commits mailing list