[Mlir-commits] [llvm] [mlir] [openmp] [Flang][OpenMP] Add support for schedule clause for GPU (PR #81618)
Dominik Adamski
llvmlistbot at llvm.org
Fri Mar 1 11:01:44 PST 2024
https://github.com/DominikAdamski updated https://github.com/llvm/llvm-project/pull/81618
>From df9980af2f70634b4de13ec505b4af607bf24a90 Mon Sep 17 00:00:00 2001
From: Dominik Adamski <dominik.adamski at amd.com>
Date: Fri, 9 Feb 2024 08:16:06 -0600
Subject: [PATCH 1/2] [Flang][OpenMP] Add support for schedule clause for GPU
Scope of changes:
1) Fixed handling of loop chunking in OpenMP runtime.
2) Pass chunk value from MLIR to OpenMP runtime.
3) Added an explicit check that only a static schedule is supported
for target loops.
---
.../llvm/Frontend/OpenMP/OMPIRBuilder.h | 6 ++-
llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 49 ++++++++++++++-----
mlir/test/Target/LLVMIR/omptarget-wsloop.mlir | 18 +++++++
.../libomptarget/DeviceRTL/src/Workshare.cpp | 17 ++++---
...rget-parallel-do-schedule-static-chunk.f90 | 33 +++++++++++++
5 files changed, 104 insertions(+), 19 deletions(-)
create mode 100644 openmp/libomptarget/test/offloading/fortran/target-parallel-do-schedule-static-chunk.f90
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index 2288969ecc95c4..8d74b12dbc4ba1 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -925,11 +925,15 @@ class OpenMPIRBuilder {
/// preheader of the loop.
/// \param LoopType Information about type of loop worksharing.
/// It corresponds to type of loop workshare OpenMP pragma.
+ /// \param ScheduleType Information about scheduling type.
+ /// \param ChunkSize Value of chunk size for static schedule.
///
/// \returns Point where to insert code after the workshare construct.
InsertPointTy applyWorkshareLoopTarget(DebugLoc DL, CanonicalLoopInfo *CLI,
InsertPointTy AllocaIP,
- omp::WorksharingLoopType LoopType);
+ omp::WorksharingLoopType LoopType,
+ omp::OMPScheduleType ScheduleType,
+ Value *ChunkSize);
/// Modifies the canonical loop to be a statically-scheduled workshare loop.
///
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 02b333e9ccd567..f9cbc39a24016d 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -2728,7 +2728,8 @@ getKmpcForStaticLoopForType(Type *Ty, OpenMPIRBuilder *OMPBuilder,
static void createTargetLoopWorkshareCall(
OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType,
BasicBlock *InsertBlock, Value *Ident, Value *LoopBodyArg,
- Type *ParallelTaskPtr, Value *TripCount, Function &LoopBodyFn) {
+ Type *ParallelTaskPtr, Value *TripCount, Function &LoopBodyFn,
+ Value *ThreadChunkSize) {
Type *TripCountTy = TripCount->getType();
Module &M = OMPBuilder->M;
IRBuilder<> &Builder = OMPBuilder->Builder;
@@ -2751,9 +2752,21 @@ static void createTargetLoopWorkshareCall(
RealArgs.push_back(
Builder.CreateZExtOrTrunc(NumThreads, TripCountTy, "num.threads.cast"));
- RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
- if (LoopType == WorksharingLoopType::DistributeForStaticLoop) {
+ switch (LoopType) {
+ case WorksharingLoopType::DistributeForStaticLoop:
+ RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
+ ThreadChunkSize ? RealArgs.push_back(Builder.CreateZExtOrTrunc(
+ ThreadChunkSize, TripCountTy))
+ : RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
+ break;
+ case WorksharingLoopType::DistributeStaticLoop:
RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
+ break;
+ case WorksharingLoopType::ForStaticLoop:
+ ThreadChunkSize ? RealArgs.push_back(Builder.CreateZExtOrTrunc(
+ ThreadChunkSize, TripCountTy))
+ : RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
+ break;
}
Builder.CreateCall(RTLFn, RealArgs);
@@ -2764,7 +2777,7 @@ workshareLoopTargetCallback(OpenMPIRBuilder *OMPIRBuilder,
CanonicalLoopInfo *CLI, Value *Ident,
Function &OutlinedFn, Type *ParallelTaskPtr,
const SmallVector<Instruction *, 4> &ToBeDeleted,
- WorksharingLoopType LoopType) {
+ WorksharingLoopType LoopType, Value *ChunkSize) {
IRBuilder<> &Builder = OMPIRBuilder->Builder;
BasicBlock *Preheader = CLI->getPreheader();
Value *TripCount = CLI->getTripCount();
@@ -2811,17 +2824,18 @@ workshareLoopTargetCallback(OpenMPIRBuilder *OMPIRBuilder,
createTargetLoopWorkshareCall(OMPIRBuilder, LoopType, Preheader, Ident,
LoopBodyArg, ParallelTaskPtr, TripCount,
- OutlinedFn);
+ OutlinedFn, ChunkSize);
for (auto &ToBeDeletedItem : ToBeDeleted)
ToBeDeletedItem->eraseFromParent();
CLI->invalidate();
}
-OpenMPIRBuilder::InsertPointTy
-OpenMPIRBuilder::applyWorkshareLoopTarget(DebugLoc DL, CanonicalLoopInfo *CLI,
- InsertPointTy AllocaIP,
- WorksharingLoopType LoopType) {
+OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyWorkshareLoopTarget(
+ DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
+ WorksharingLoopType LoopType, OMPScheduleType EffectiveScheduleType,
+ Value *ChunkSize) {
+
uint32_t SrcLocStrSize;
Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
@@ -2833,6 +2847,16 @@ OpenMPIRBuilder::applyWorkshareLoopTarget(DebugLoc DL, CanonicalLoopInfo *CLI,
// Instructions which need to be deleted at the end of code generation
SmallVector<Instruction *, 4> ToBeDeleted;
+ // TODO: Add support for dynamic scheduling
+ switch (EffectiveScheduleType & ~OMPScheduleType::ModifierMask) {
+ case OMPScheduleType::BaseStatic:
+ case OMPScheduleType::BaseStaticChunked:
+ break;
+ default:
+ report_fatal_error(
+ "Unknown/unimplemented schedule kind for target workshare loop", false);
+ }
+
OI.OuterAllocaBB = AllocaIP.getBlock();
// Mark the body loop as region which needs to be extracted
@@ -2906,7 +2930,7 @@ OpenMPIRBuilder::applyWorkshareLoopTarget(DebugLoc DL, CanonicalLoopInfo *CLI,
OI.PostOutlineCB = [=, ToBeDeletedVec =
std::move(ToBeDeleted)](Function &OutlinedFn) {
workshareLoopTargetCallback(this, CLI, Ident, OutlinedFn, ParallelTaskPtr,
- ToBeDeletedVec, LoopType);
+ ToBeDeletedVec, LoopType, ChunkSize);
};
addOutlineInfo(std::move(OI));
return CLI->getAfterIP();
@@ -2918,11 +2942,12 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyWorkshareLoop(
bool HasSimdModifier, bool HasMonotonicModifier,
bool HasNonmonotonicModifier, bool HasOrderedClause,
WorksharingLoopType LoopType) {
- if (Config.isTargetDevice())
- return applyWorkshareLoopTarget(DL, CLI, AllocaIP, LoopType);
OMPScheduleType EffectiveScheduleType = computeOpenMPScheduleType(
SchedKind, ChunkSize, HasSimdModifier, HasMonotonicModifier,
HasNonmonotonicModifier, HasOrderedClause);
+ if (Config.isTargetDevice())
+ return applyWorkshareLoopTarget(DL, CLI, AllocaIP, LoopType,
+ EffectiveScheduleType, ChunkSize);
bool IsOrdered = (EffectiveScheduleType & OMPScheduleType::ModifierOrdered) ==
OMPScheduleType::ModifierOrdered;
diff --git a/mlir/test/Target/LLVMIR/omptarget-wsloop.mlir b/mlir/test/Target/LLVMIR/omptarget-wsloop.mlir
index 220eb85b3483ec..a5f5d07262c8d9 100644
--- a/mlir/test/Target/LLVMIR/omptarget-wsloop.mlir
+++ b/mlir/test/Target/LLVMIR/omptarget-wsloop.mlir
@@ -25,6 +25,19 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memo
}
llvm.return
}
+
+ llvm.func @target_wsloop_schedule_static_chunked(%arg0: !llvm.ptr ){
+ %loop_ub = llvm.mlir.constant(9 : i32) : i32
+ %loop_lb = llvm.mlir.constant(0 : i32) : i32
+ %loop_step = llvm.mlir.constant(1 : i32) : i32
+ %chunk = llvm.mlir.constant(2 : i32) : i32
+ omp.wsloop schedule(static = %chunk : i32) for (%loop_cnt) : i32 = (%loop_lb) to (%loop_ub) inclusive step (%loop_step) {
+ %gep = llvm.getelementptr %arg0[0, %loop_cnt] : (!llvm.ptr, i32) -> !llvm.ptr, !llvm.array<10 x i32>
+ llvm.store %loop_cnt, %gep : i32, !llvm.ptr
+ omp.yield
+ }
+ llvm.return
+ }
}
// CHECK: define void @[[FUNC0:.*]](ptr %[[ARG0:.*]])
@@ -45,3 +58,8 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memo
// CHECK: call void @__kmpc_for_static_loop_4u(ptr addrspacecast (ptr addrspace(1) @[[GLOB2:[0-9]+]] to ptr), ptr @[[LOOP_EMPTY_BODY_FN:.*]], ptr null, i32 10, i32 %[[NUM_THREADS:.*]], i32 0)
// CHECK: define internal void @[[LOOP_EMPTY_BODY_FN]](i32 %[[LOOP_CNT:.*]])
+
+// CHECK: define void @[[FUNC_SCHEDULE_STATIC_WSLOOP:.*]](ptr %[[ARG1:.*]])
+// CHECK: call void @__kmpc_for_static_loop_4u(ptr addrspacecast (ptr addrspace(1) @[[GLOB3:[0-9]+]] to ptr), ptr @[[LOOP_BODY_SCHEDULE_STATIC_FN:.*]], ptr %[[SCHEDULE_LOOP_ARGS:.*]], i32 10, i32 %[[NUM_THREADS:.*]], i32 2)
+
+// CHECK: define internal void @[[LOOP_BODY_SCHEDULE_STATIC_FN]](i32 %[[LOOP_CNT:.*]], ptr %[[LOOP_BODY_ARG:.*]])
diff --git a/openmp/libomptarget/DeviceRTL/src/Workshare.cpp b/openmp/libomptarget/DeviceRTL/src/Workshare.cpp
index bcb7c5ad50a185..836d4f7f4934b4 100644
--- a/openmp/libomptarget/DeviceRTL/src/Workshare.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Workshare.cpp
@@ -685,17 +685,22 @@ template <typename Ty> class StaticLoopChunker {
Ty KernelIteration = NumBlocks * BlockChunk;
// Start index in the chunked space.
- Ty IV = BId * BlockChunk + TId;
+ Ty IV = BId * BlockChunk + TId * ThreadChunk;
ASSERT(IV >= 0, "Bad index");
+ // Make sure the starting index is within the kernel iteration boundaries.
+ if (IV >= KernelIteration)
+ return;
+
// Cover the entire iteration space, assumptions in the caller might allow
// to simplify this loop to a conditional.
do {
Ty BlockChunkLeft =
BlockChunk >= TId * ThreadChunk ? BlockChunk - TId * ThreadChunk : 0;
- Ty ThreadChunkLeft =
+ Ty EffectiveThreadChunk =
ThreadChunk <= BlockChunkLeft ? ThreadChunk : BlockChunkLeft;
+ Ty ThreadChunkLeft = EffectiveThreadChunk;
while (ThreadChunkLeft--) {
@@ -711,8 +716,8 @@ template <typename Ty> class StaticLoopChunker {
++IV;
}
-
- IV += KernelIteration;
+ // Start the new kernel iteration before the first thread chunk
+ IV += (KernelIteration - EffectiveThreadChunk);
} while (IV < NumIters);
}
@@ -731,8 +736,8 @@ template <typename Ty> class StaticLoopChunker {
// from the `omp` getter and not the mapping directly.
Ty TId = omp_get_thread_num();
- // There are no blocks involved here.
- Ty BlockChunk = 0;
+ // There is only one block for the whole iteration space.
+ Ty BlockChunk = NumIters;
Ty NumBlocks = 1;
Ty BId = 0;
diff --git a/openmp/libomptarget/test/offloading/fortran/target-parallel-do-schedule-static-chunk.f90 b/openmp/libomptarget/test/offloading/fortran/target-parallel-do-schedule-static-chunk.f90
new file mode 100644
index 00000000000000..f0b444f6ddc66f
--- /dev/null
+++ b/openmp/libomptarget/test/offloading/fortran/target-parallel-do-schedule-static-chunk.f90
@@ -0,0 +1,33 @@
+! Basic offloading test with a target region
+! REQUIRES: flang
+! UNSUPPORTED: nvptx64-nvidia-cuda-LTO
+! UNSUPPORTED: aarch64-unknown-linux-gnu
+! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+! UNSUPPORTED: x86_64-pc-linux-gnu
+! UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+
+! RUN: %libomptarget-compile-fortran-generic
+! RUN: env LIBOMPTARGET_INFO=16 %libomptarget-run-generic 2>&1 | %fcheck-generic
+program main
+ use omp_lib
+ integer :: x(100)
+ integer :: errors = 0
+ integer :: i
+
+ !$omp target parallel do schedule(static, 5) map(from: x)
+ do i = 1, 100
+ x(i) = i
+ end do
+ !$omp end target parallel do
+ do i = 1, 100
+ if ( x(i) .ne. i ) then
+ errors = errors + 1
+ end if
+ end do
+
+ print *,"number of errors: ", errors
+
+end program main
+
+! CHECK: "PluginInterface" device {{[0-9]+}} info: Launching kernel {{.*}}
+! CHECK: number of errors: 0
>From b7b16308d9c56766ef1735c1d7bec2575c3102bf Mon Sep 17 00:00:00 2001
From: Dominik Adamski <dominik.adamski at amd.com>
Date: Tue, 27 Feb 2024 06:50:08 -0600
Subject: [PATCH 2/2] Adjust chunking policy
---
.../libomptarget/DeviceRTL/src/Workshare.cpp | 57 +++++++++----------
...rget-parallel-do-schedule-static-chunk.f90 | 33 -----------
.../target_workshare_loop_static_chunk.f90 | 46 +++++++++++++++
3 files changed, 72 insertions(+), 64 deletions(-)
delete mode 100644 openmp/libomptarget/test/offloading/fortran/target-parallel-do-schedule-static-chunk.f90
create mode 100644 openmp/libomptarget/test/offloading/fortran/target_workshare_loop_static_chunk.f90
diff --git a/openmp/libomptarget/DeviceRTL/src/Workshare.cpp b/openmp/libomptarget/DeviceRTL/src/Workshare.cpp
index 836d4f7f4934b4..ee9ee9a14056d8 100644
--- a/openmp/libomptarget/DeviceRTL/src/Workshare.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Workshare.cpp
@@ -683,43 +683,38 @@ template <typename Ty> class StaticLoopChunker {
Ty NumIters,
bool OneIterationPerThread) {
Ty KernelIteration = NumBlocks * BlockChunk;
+ Ty BlockIV = BId * BlockChunk;
- // Start index in the chunked space.
- Ty IV = BId * BlockChunk + TId * ThreadChunk;
- ASSERT(IV >= 0, "Bad index");
-
- // Make sure the starting index is within the kernel iteration boundaries.
- if (IV >= KernelIteration)
- return;
-
+ ASSERT((BlockIV + TId * ThreadChunk) >= 0, "Bad index");
// Cover the entire iteration space, assumptions in the caller might allow
// to simplify this loop to a conditional.
do {
-
- Ty BlockChunkLeft =
- BlockChunk >= TId * ThreadChunk ? BlockChunk - TId * ThreadChunk : 0;
- Ty EffectiveThreadChunk =
- ThreadChunk <= BlockChunkLeft ? ThreadChunk : BlockChunkLeft;
- Ty ThreadChunkLeft = EffectiveThreadChunk;
-
- while (ThreadChunkLeft--) {
-
- // Given the blocking it's hard to keep track of what to execute.
- if (IV >= NumIters)
- return;
-
- // Execute the loop body.
- LoopBody(IV, Arg);
-
- if (OneIterationPerThread)
- return;
-
- ++IV;
+ Ty ThreadIV = TId * ThreadChunk;
+ // Cover the block space
+ while (ThreadIV < BlockChunk) {
+ Ty ThreadCnt = 0;
+ // Cover the thread space
+ while ((ThreadCnt < ThreadChunk) &&
+ ((ThreadIV + ThreadCnt) < BlockChunk)) {
+ // Index in the chunked space.
+ Ty IV = BlockIV + ThreadIV + ThreadCnt;
+
+ // Given the blocking it's hard to keep track of what to execute.
+ if (IV >= NumIters)
+ return;
+
+ // Execute the loop body.
+ LoopBody(IV, Arg);
+
+ if (OneIterationPerThread)
+ return;
+ ++ThreadCnt;
+ };
+ ThreadIV += (NumThreads * ThreadChunk);
}
- // Start the new kernel iteration before the first thread chunk
- IV += (KernelIteration - EffectiveThreadChunk);
- } while (IV < NumIters);
+ BlockIV += KernelIteration;
+ } while (BlockIV < NumIters);
}
public:
diff --git a/openmp/libomptarget/test/offloading/fortran/target-parallel-do-schedule-static-chunk.f90 b/openmp/libomptarget/test/offloading/fortran/target-parallel-do-schedule-static-chunk.f90
deleted file mode 100644
index f0b444f6ddc66f..00000000000000
--- a/openmp/libomptarget/test/offloading/fortran/target-parallel-do-schedule-static-chunk.f90
+++ /dev/null
@@ -1,33 +0,0 @@
-! Basic offloading test with a target region
-! REQUIRES: flang
-! UNSUPPORTED: nvptx64-nvidia-cuda-LTO
-! UNSUPPORTED: aarch64-unknown-linux-gnu
-! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
-! UNSUPPORTED: x86_64-pc-linux-gnu
-! UNSUPPORTED: x86_64-pc-linux-gnu-LTO
-
-! RUN: %libomptarget-compile-fortran-generic
-! RUN: env LIBOMPTARGET_INFO=16 %libomptarget-run-generic 2>&1 | %fcheck-generic
-program main
- use omp_lib
- integer :: x(100)
- integer :: errors = 0
- integer :: i
-
- !$omp target parallel do schedule(static, 5) map(from: x)
- do i = 1, 100
- x(i) = i
- end do
- !$omp end target parallel do
- do i = 1, 100
- if ( x(i) .ne. i ) then
- errors = errors + 1
- end if
- end do
-
- print *,"number of errors: ", errors
-
-end program main
-
-! CHECK: "PluginInterface" device {{[0-9]+}} info: Launching kernel {{.*}}
-! CHECK: number of errors: 0
diff --git a/openmp/libomptarget/test/offloading/fortran/target_workshare_loop_static_chunk.f90 b/openmp/libomptarget/test/offloading/fortran/target_workshare_loop_static_chunk.f90
new file mode 100644
index 00000000000000..6b539e7a3837be
--- /dev/null
+++ b/openmp/libomptarget/test/offloading/fortran/target_workshare_loop_static_chunk.f90
@@ -0,0 +1,46 @@
+! Offloading test with a target region and chunks
+! REQUIRES: flang
+! UNSUPPORTED: nvptx64-nvidia-cuda-LTO
+! UNSUPPORTED: aarch64-unknown-linux-gnu
+! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+! UNSUPPORTED: x86_64-pc-linux-gnu
+! UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+
+! RUN: %libomptarget-compile-fortran-generic
+! RUN: env LIBOMPTARGET_INFO=16 %libomptarget-run-generic 2>&1 | %fcheck-generic
+
+program main
+ use omp_lib
+ integer :: A(100)
+!$omp target map(from:A)
+!$omp parallel do schedule(static,2) num_threads(10)
+ do index_ = 1, 100
+ A(index_) = omp_get_team_num() * 1000 + omp_get_thread_num()
+ end do
+!$omp end target
+ write(*,"(A)"), "omp target parallel for thread chunk size 2"
+ call printArray(A)
+
+end program main
+
+subroutine printArray(Array)
+ integer :: Array(*)
+ do i = 1, 100
+ write(*, "(A, I0, A, I0, A)", advance="no") "B",Array(i)/1000,"T",modulo(Array(i),1000)," "
+ end do
+ write(*,'(/)')
+end subroutine printArray
+
+!CHECK: omp target parallel for thread chunk size 2
+
+!CHECK-NEXT: B0T0 B0T0 B0T1 B0T1 B0T2 B0T2 B0T3 B0T3 B0T4 B0T4
+!CHECK-SAME: B0T5 B0T5 B0T6 B0T6 B0T7 B0T7 B0T8 B0T8 B0T9 B0T9
+!CHECK-SAME: B0T0 B0T0 B0T1 B0T1 B0T2 B0T2 B0T3 B0T3 B0T4 B0T4
+!CHECK-SAME: B0T5 B0T5 B0T6 B0T6 B0T7 B0T7 B0T8 B0T8 B0T9 B0T9
+!CHECK-SAME: B0T0 B0T0 B0T1 B0T1 B0T2 B0T2 B0T3 B0T3 B0T4 B0T4
+!CHECK-SAME: B0T5 B0T5 B0T6 B0T6 B0T7 B0T7 B0T8 B0T8 B0T9 B0T9
+!CHECK-SAME: B0T0 B0T0 B0T1 B0T1 B0T2 B0T2 B0T3 B0T3 B0T4 B0T4
+!CHECK-SAME: B0T5 B0T5 B0T6 B0T6 B0T7 B0T7 B0T8 B0T8 B0T9 B0T9
+!CHECK-SAME: B0T0 B0T0 B0T1 B0T1 B0T2 B0T2 B0T3 B0T3 B0T4 B0T4
+!CHECK-SAME: B0T5 B0T5 B0T6 B0T6 B0T7 B0T7 B0T8 B0T8 B0T9 B0T9
+
More information about the Mlir-commits
mailing list