[llvm] [mlir] [openmp] [Flang][OpenMP] Add support for schedule clause for GPU (PR #81618)

Dominik Adamski via llvm-commits llvm-commits at lists.llvm.org
Tue Feb 13 08:23:59 PST 2024


https://github.com/DominikAdamski created https://github.com/llvm/llvm-project/pull/81618

Scope of changes:
1) Fixed handling of loop chunking in OpenMP runtime.
2) Pass the chunk value from MLIR code to the OpenMP runtime.
3) Added an explicit check that only the static schedule is supported for target loops.

>From df9980af2f70634b4de13ec505b4af607bf24a90 Mon Sep 17 00:00:00 2001
From: Dominik Adamski <dominik.adamski at amd.com>
Date: Fri, 9 Feb 2024 08:16:06 -0600
Subject: [PATCH] [Flang][OpenMP] Add support for schedule clause for GPU

Scope of changes:
1) Fixed handling of loop chunking in OpenMP runtime.
2) Pass chunk value from MLIR to OpenMP runtime.
3) Added explicit check that only static schedule is supported
   for target loops.
---
 .../llvm/Frontend/OpenMP/OMPIRBuilder.h       |  6 ++-
 llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp     | 49 ++++++++++++++-----
 mlir/test/Target/LLVMIR/omptarget-wsloop.mlir | 18 +++++++
 .../libomptarget/DeviceRTL/src/Workshare.cpp  | 17 ++++---
 ...rget-parallel-do-schedule-static-chunk.f90 | 33 +++++++++++++
 5 files changed, 104 insertions(+), 19 deletions(-)
 create mode 100644 openmp/libomptarget/test/offloading/fortran/target-parallel-do-schedule-static-chunk.f90

diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index 2288969ecc95c4..8d74b12dbc4ba1 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -925,11 +925,15 @@ class OpenMPIRBuilder {
   ///                 preheader of the loop.
   /// \param LoopType Information about type of loop worksharing.
   ///                 It corresponds to type of loop workshare OpenMP pragma.
+  /// \param ScheduleType Information about scheduling type.
+  /// \param ChunkSize    Value of chunk size for static schedule.
   ///
   /// \returns Point where to insert code after the workshare construct.
   InsertPointTy applyWorkshareLoopTarget(DebugLoc DL, CanonicalLoopInfo *CLI,
                                          InsertPointTy AllocaIP,
-                                         omp::WorksharingLoopType LoopType);
+                                         omp::WorksharingLoopType LoopType,
+                                         omp::OMPScheduleType ScheduleType,
+                                         Value *ChunkSize);
 
   /// Modifies the canonical loop to be a statically-scheduled workshare loop.
   ///
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 02b333e9ccd567..f9cbc39a24016d 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -2728,7 +2728,8 @@ getKmpcForStaticLoopForType(Type *Ty, OpenMPIRBuilder *OMPBuilder,
 static void createTargetLoopWorkshareCall(
     OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType,
     BasicBlock *InsertBlock, Value *Ident, Value *LoopBodyArg,
-    Type *ParallelTaskPtr, Value *TripCount, Function &LoopBodyFn) {
+    Type *ParallelTaskPtr, Value *TripCount, Function &LoopBodyFn,
+    Value *ThreadChunkSize) {
   Type *TripCountTy = TripCount->getType();
   Module &M = OMPBuilder->M;
   IRBuilder<> &Builder = OMPBuilder->Builder;
@@ -2751,9 +2752,21 @@ static void createTargetLoopWorkshareCall(
 
   RealArgs.push_back(
       Builder.CreateZExtOrTrunc(NumThreads, TripCountTy, "num.threads.cast"));
-  RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
-  if (LoopType == WorksharingLoopType::DistributeForStaticLoop) {
+  switch (LoopType) {
+  case WorksharingLoopType::DistributeForStaticLoop:
+    RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
+    ThreadChunkSize ? RealArgs.push_back(Builder.CreateZExtOrTrunc(
+                          ThreadChunkSize, TripCountTy))
+                    : RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
+    break;
+  case WorksharingLoopType::DistributeStaticLoop:
     RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
+    break;
+  case WorksharingLoopType::ForStaticLoop:
+    ThreadChunkSize ? RealArgs.push_back(Builder.CreateZExtOrTrunc(
+                          ThreadChunkSize, TripCountTy))
+                    : RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
+    break;
   }
 
   Builder.CreateCall(RTLFn, RealArgs);
@@ -2764,7 +2777,7 @@ workshareLoopTargetCallback(OpenMPIRBuilder *OMPIRBuilder,
                             CanonicalLoopInfo *CLI, Value *Ident,
                             Function &OutlinedFn, Type *ParallelTaskPtr,
                             const SmallVector<Instruction *, 4> &ToBeDeleted,
-                            WorksharingLoopType LoopType) {
+                            WorksharingLoopType LoopType, Value *ChunkSize) {
   IRBuilder<> &Builder = OMPIRBuilder->Builder;
   BasicBlock *Preheader = CLI->getPreheader();
   Value *TripCount = CLI->getTripCount();
@@ -2811,17 +2824,18 @@ workshareLoopTargetCallback(OpenMPIRBuilder *OMPIRBuilder,
 
   createTargetLoopWorkshareCall(OMPIRBuilder, LoopType, Preheader, Ident,
                                 LoopBodyArg, ParallelTaskPtr, TripCount,
-                                OutlinedFn);
+                                OutlinedFn, ChunkSize);
 
   for (auto &ToBeDeletedItem : ToBeDeleted)
     ToBeDeletedItem->eraseFromParent();
   CLI->invalidate();
 }
 
-OpenMPIRBuilder::InsertPointTy
-OpenMPIRBuilder::applyWorkshareLoopTarget(DebugLoc DL, CanonicalLoopInfo *CLI,
-                                          InsertPointTy AllocaIP,
-                                          WorksharingLoopType LoopType) {
+OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyWorkshareLoopTarget(
+    DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
+    WorksharingLoopType LoopType, OMPScheduleType EffectiveScheduleType,
+    Value *ChunkSize) {
+
   uint32_t SrcLocStrSize;
   Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
   Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
@@ -2833,6 +2847,16 @@ OpenMPIRBuilder::applyWorkshareLoopTarget(DebugLoc DL, CanonicalLoopInfo *CLI,
   // Instructions which need to be deleted at the end of code generation
   SmallVector<Instruction *, 4> ToBeDeleted;
 
+  // TODO: Add support for dynamic scheduling
+  switch (EffectiveScheduleType & ~OMPScheduleType::ModifierMask) {
+  case OMPScheduleType::BaseStatic:
+  case OMPScheduleType::BaseStaticChunked:
+    break;
+  default:
+    report_fatal_error(
+        "Unknown/unimplemented schedule kind for target workshare loop", false);
+  }
+
   OI.OuterAllocaBB = AllocaIP.getBlock();
 
   // Mark the body loop as region which needs to be extracted
@@ -2906,7 +2930,7 @@ OpenMPIRBuilder::applyWorkshareLoopTarget(DebugLoc DL, CanonicalLoopInfo *CLI,
   OI.PostOutlineCB = [=, ToBeDeletedVec =
                              std::move(ToBeDeleted)](Function &OutlinedFn) {
     workshareLoopTargetCallback(this, CLI, Ident, OutlinedFn, ParallelTaskPtr,
-                                ToBeDeletedVec, LoopType);
+                                ToBeDeletedVec, LoopType, ChunkSize);
   };
   addOutlineInfo(std::move(OI));
   return CLI->getAfterIP();
@@ -2918,11 +2942,12 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyWorkshareLoop(
     bool HasSimdModifier, bool HasMonotonicModifier,
     bool HasNonmonotonicModifier, bool HasOrderedClause,
     WorksharingLoopType LoopType) {
-  if (Config.isTargetDevice())
-    return applyWorkshareLoopTarget(DL, CLI, AllocaIP, LoopType);
   OMPScheduleType EffectiveScheduleType = computeOpenMPScheduleType(
       SchedKind, ChunkSize, HasSimdModifier, HasMonotonicModifier,
       HasNonmonotonicModifier, HasOrderedClause);
+  if (Config.isTargetDevice())
+    return applyWorkshareLoopTarget(DL, CLI, AllocaIP, LoopType,
+                                    EffectiveScheduleType, ChunkSize);
 
   bool IsOrdered = (EffectiveScheduleType & OMPScheduleType::ModifierOrdered) ==
                    OMPScheduleType::ModifierOrdered;
diff --git a/mlir/test/Target/LLVMIR/omptarget-wsloop.mlir b/mlir/test/Target/LLVMIR/omptarget-wsloop.mlir
index 220eb85b3483ec..a5f5d07262c8d9 100644
--- a/mlir/test/Target/LLVMIR/omptarget-wsloop.mlir
+++ b/mlir/test/Target/LLVMIR/omptarget-wsloop.mlir
@@ -25,6 +25,19 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memo
       }
     llvm.return
   }
+
+  llvm.func @target_wsloop_schedule_static_chunked(%arg0: !llvm.ptr ){
+      %loop_ub = llvm.mlir.constant(9 : i32) : i32
+      %loop_lb = llvm.mlir.constant(0 : i32) : i32
+      %loop_step = llvm.mlir.constant(1 : i32) : i32
+      %chunk = llvm.mlir.constant(2 : i32) : i32
+      omp.wsloop schedule(static = %chunk : i32) for  (%loop_cnt) : i32 = (%loop_lb) to (%loop_ub) inclusive step (%loop_step) {
+        %gep = llvm.getelementptr %arg0[0, %loop_cnt] : (!llvm.ptr, i32) -> !llvm.ptr, !llvm.array<10 x i32>
+        llvm.store %loop_cnt, %gep : i32, !llvm.ptr
+        omp.yield
+      }
+    llvm.return
+  }
 }
 
 // CHECK: define void @[[FUNC0:.*]](ptr %[[ARG0:.*]])
@@ -45,3 +58,8 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memo
 // CHECK:   call void @__kmpc_for_static_loop_4u(ptr addrspacecast (ptr addrspace(1) @[[GLOB2:[0-9]+]] to ptr), ptr @[[LOOP_EMPTY_BODY_FN:.*]], ptr null, i32 10, i32 %[[NUM_THREADS:.*]], i32 0)
 
 // CHECK: define internal void @[[LOOP_EMPTY_BODY_FN]](i32 %[[LOOP_CNT:.*]])
+
+// CHECK: define void @[[FUNC_SCHEDULE_STATIC_WSLOOP:.*]](ptr %[[ARG1:.*]])
+// CHECK:   call void @__kmpc_for_static_loop_4u(ptr addrspacecast (ptr addrspace(1) @[[GLOB3:[0-9]+]] to ptr), ptr @[[LOOP_BODY_SCHEDULE_STATIC_FN:.*]], ptr %[[SCHEDULE_LOOP_ARGS:.*]], i32 10, i32 %[[NUM_THREADS:.*]], i32 2)
+
+// CHECK: define internal void @[[LOOP_BODY_SCHEDULE_STATIC_FN]](i32 %[[LOOP_CNT:.*]], ptr %[[LOOP_BODY_ARG:.*]])
diff --git a/openmp/libomptarget/DeviceRTL/src/Workshare.cpp b/openmp/libomptarget/DeviceRTL/src/Workshare.cpp
index bcb7c5ad50a185..836d4f7f4934b4 100644
--- a/openmp/libomptarget/DeviceRTL/src/Workshare.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Workshare.cpp
@@ -685,17 +685,22 @@ template <typename Ty> class StaticLoopChunker {
     Ty KernelIteration = NumBlocks * BlockChunk;
 
     // Start index in the chunked space.
-    Ty IV = BId * BlockChunk + TId;
+    Ty IV = BId * BlockChunk + TId * ThreadChunk;
     ASSERT(IV >= 0, "Bad index");
 
+    // Make sure the starting index is within the kernel iteration boundaries.
+    if (IV >= KernelIteration)
+      return;
+
     // Cover the entire iteration space, assumptions in the caller might allow
     // to simplify this loop to a conditional.
     do {
 
       Ty BlockChunkLeft =
           BlockChunk >= TId * ThreadChunk ? BlockChunk - TId * ThreadChunk : 0;
-      Ty ThreadChunkLeft =
+      Ty EffectiveThreadChunk =
           ThreadChunk <= BlockChunkLeft ? ThreadChunk : BlockChunkLeft;
+      Ty ThreadChunkLeft = EffectiveThreadChunk;
 
       while (ThreadChunkLeft--) {
 
@@ -711,8 +716,8 @@ template <typename Ty> class StaticLoopChunker {
 
         ++IV;
       }
-
-      IV += KernelIteration;
+      // Start the new kernel iteration before the first thread chunk
+      IV += (KernelIteration - EffectiveThreadChunk);
 
     } while (IV < NumIters);
   }
@@ -731,8 +736,8 @@ template <typename Ty> class StaticLoopChunker {
     // from the `omp` getter and not the mapping directly.
     Ty TId = omp_get_thread_num();
 
-    // There are no blocks involved here.
-    Ty BlockChunk = 0;
+    // There is only one block for the whole iteration space.
+    Ty BlockChunk = NumIters;
     Ty NumBlocks = 1;
     Ty BId = 0;
 
diff --git a/openmp/libomptarget/test/offloading/fortran/target-parallel-do-schedule-static-chunk.f90 b/openmp/libomptarget/test/offloading/fortran/target-parallel-do-schedule-static-chunk.f90
new file mode 100644
index 00000000000000..f0b444f6ddc66f
--- /dev/null
+++ b/openmp/libomptarget/test/offloading/fortran/target-parallel-do-schedule-static-chunk.f90
@@ -0,0 +1,33 @@
+! Basic offloading test with a target region
+! REQUIRES: flang
+! UNSUPPORTED: nvptx64-nvidia-cuda-LTO
+! UNSUPPORTED: aarch64-unknown-linux-gnu
+! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+! UNSUPPORTED: x86_64-pc-linux-gnu
+! UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+
+! RUN: %libomptarget-compile-fortran-generic
+! RUN: env LIBOMPTARGET_INFO=16 %libomptarget-run-generic 2>&1 | %fcheck-generic
+program main
+   use omp_lib
+   integer :: x(100)
+   integer :: errors = 0
+   integer :: i
+
+   !$omp target parallel do schedule(static, 5) map(from: x)
+   do i = 1, 100
+       x(i) = i
+   end do
+   !$omp end target parallel do
+   do i = 1, 100
+       if ( x(i) .ne. i ) then
+           errors = errors + 1
+       end if
+   end do
+
+   print *,"number of errors: ", errors
+
+end program main
+
+! CHECK:  "PluginInterface" device {{[0-9]+}} info: Launching kernel {{.*}}
+! CHECK:  number of errors: 0



More information about the llvm-commits mailing list