[llvm-branch-commits] [mlir] [MLIR][OpenMP] Host lowering of distribute-parallel-do/for (PR #127819)

Fri Feb 21 02:07:54 PST 2025

https://github.com/skatrak updated https://github.com/llvm/llvm-project/pull/127819

>From 33d5af4e9d8aaf9464aa74f5031d60001d77c610 Mon Sep 17 00:00:00 2001
From: Sergio Afonso <safonsof at amd.com>
Date: Tue, 18 Feb 2025 13:07:51 +0000
Subject: [PATCH] [MLIR][OpenMP] Host lowering of distribute-parallel-do/for

This patch adds support for translating composite `omp.parallel` +
`omp.distribute` + `omp.wsloop` loops to LLVM IR on the host. This is done by
passing an updated `WorksharingLoopType` to the call to `applyWorkshareLoop`
associated with the lowering of the `omp.wsloop` operation, so that
`__kmpc_dist_for_static_init` is called at runtime in place of
`__kmpc_for_static_init`.

Existing translation rules take care of creating a parallel region to hold the
workshared and workdistributed loop.
---
 .../OpenMP/OpenMPToLLVMIRTranslation.cpp      | 21 ++++--
 mlir/test/Target/LLVMIR/openmp-llvm.mlir      | 65 +++++++++++++++++++
 mlir/test/Target/LLVMIR/openmp-todo.mlir      | 19 ------
 3 files changed, 81 insertions(+), 24 deletions(-)

diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 987f18fc7bc47..fbea278b2511f 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -257,10 +257,6 @@ static LogicalResult checkImplementationStatus(Operation &op) {
   LogicalResult result = success();
   llvm::TypeSwitch<Operation &>(op)
       .Case([&](omp::DistributeOp op) {
-        if (op.isComposite() &&
-            isa_and_present<omp::WsloopOp>(op.getNestedWrapper()))
-          result = op.emitError() << "not yet implemented: "
-                                     "composite omp.distribute + omp.wsloop";
         checkAllocate(op, result);
         checkDistSchedule(op, result);
         checkOrder(op, result);
@@ -1990,6 +1986,14 @@ convertOmpWsloop(Operation &opInst, llvm::IRBuilderBase &builder,
   bool isSimd = wsloopOp.getScheduleSimd();
   bool loopNeedsBarrier = !wsloopOp.getNowait();
 
+  // The only legal way for the direct parent to be omp.distribute is that this
+  // represents 'distribute parallel do'. Otherwise, this is a regular
+  // worksharing loop.
+  llvm::omp::WorksharingLoopType workshareLoopType =
+      llvm::isa_and_present<omp::DistributeOp>(opInst.getParentOp())
+          ? llvm::omp::WorksharingLoopType::DistributeForStaticLoop
+          : llvm::omp::WorksharingLoopType::ForStaticLoop;
+
   llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder);
   llvm::Expected<llvm::BasicBlock *> regionBlock = convertOmpOpRegions(
       wsloopOp.getRegion(), "omp.wsloop.region", builder, moduleTranslation);
@@ -2005,7 +2009,8 @@ convertOmpWsloop(Operation &opInst, llvm::IRBuilderBase &builder,
           ompLoc.DL, loopInfo, allocaIP, loopNeedsBarrier,
           convertToScheduleKind(schedule), chunk, isSimd,
           scheduleMod == omp::ScheduleModifier::monotonic,
-          scheduleMod == omp::ScheduleModifier::nonmonotonic, isOrdered);
+          scheduleMod == omp::ScheduleModifier::nonmonotonic, isOrdered,
+          workshareLoopType);
 
   if (failed(handleError(wsloopIP, opInst)))
     return failure();
@@ -3791,6 +3796,12 @@ convertOmpDistribute(Operation &opInst, llvm::IRBuilderBase &builder,
       return regionBlock.takeError();
     builder.SetInsertPoint(*regionBlock, (*regionBlock)->begin());
 
+    // Skip applying a workshare loop below when translating 'distribute
+    // parallel do' (it's been already handled by this point while translating
+    // the nested omp.wsloop).
+    if (isa_and_present<omp::WsloopOp>(distributeOp.getNestedWrapper()))
+      return llvm::Error::success();
+
     // TODO: Add support for clauses which are valid for DISTRIBUTE constructs.
     // Static schedule is the default.
     auto schedule = omp::ClauseScheduleKind::Static;
diff --git a/mlir/test/Target/LLVMIR/openmp-llvm.mlir b/mlir/test/Target/LLVMIR/openmp-llvm.mlir
index a5a490e527d79..d85b149c66811 100644
--- a/mlir/test/Target/LLVMIR/openmp-llvm.mlir
+++ b/mlir/test/Target/LLVMIR/openmp-llvm.mlir
@@ -3307,3 +3307,68 @@ llvm.func @distribute() {
 // CHECK:         store i64 1, ptr %[[STRIDE]]
 // CHECK:         %[[TID:.*]] = call i32 @__kmpc_global_thread_num({{.*}})
 // CHECK:         call void @__kmpc_for_static_init_{{.*}}(ptr @{{.*}}, i32 %[[TID]], i32 92, ptr %[[LASTITER]], ptr %[[LB]], ptr %[[UB]], ptr %[[STRIDE]], i64 1, i64 0)
+
+// -----
+
+llvm.func @distribute_wsloop(%lb : i32, %ub : i32, %step : i32) {
+  omp.parallel {
+    omp.distribute {
+      omp.wsloop {
+        omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) {
+          omp.yield
+        }
+      } {omp.composite}
+    } {omp.composite}
+    omp.terminator
+  } {omp.composite}
+  llvm.return
+}
+
+// CHECK-LABEL: define void @distribute_wsloop
+// CHECK:         call void{{.*}}@__kmpc_fork_call({{.*}}, ptr @[[OUTLINED_PARALLEL:.*]],
+
+// CHECK:       define internal void @[[OUTLINED_PARALLEL]]({{.*}})
+// CHECK:         %[[ARGS:.*]] = alloca { i32, i32, i32, ptr, ptr, ptr, ptr }
+// CHECK:         %[[LASTITER_ALLOC:.*]] = alloca i32
+// CHECK:         %[[LB_ALLOC:.*]] = alloca i32
+// CHECK:         %[[UB_ALLOC:.*]] = alloca i32
+// CHECK:         %[[STRIDE_ALLOC:.*]] = alloca i32
+// CHECK:         %[[LB_ARG:.*]] = getelementptr {{.*}}, ptr %[[ARGS]], i32 0, i32 3
+// CHECK:         store ptr %[[LB_ALLOC]], ptr %[[LB_ARG]]
+// CHECK:         %[[UB_ARG:.*]] = getelementptr {{.*}}, ptr %[[ARGS]], i32 0, i32 4
+// CHECK:         store ptr %[[UB_ALLOC]], ptr %[[UB_ARG]]
+// CHECK:         %[[STRIDE_ARG:.*]] = getelementptr {{.*}}, ptr %[[ARGS]], i32 0, i32 5
+// CHECK:         store ptr %[[STRIDE_ALLOC]], ptr %[[STRIDE_ARG]]
+// CHECK:         %[[LASTITER_ARG:.*]] = getelementptr {{.*}}, ptr %[[ARGS]], i32 0, i32 6
+// CHECK:         store ptr %[[LASTITER_ALLOC]], ptr %[[LASTITER_ARG]]
+// CHECK:         call void @[[OUTLINED_DISTRIBUTE:.*]](ptr %[[ARGS]])
+
+// CHECK:       define internal void @[[OUTLINED_DISTRIBUTE]](ptr %[[ARGS_STRUCT:.*]])
+// CHECK:         %[[LB_PTR:.*]] = getelementptr {{.*}}, ptr %[[ARGS_STRUCT]], i32 0, i32 3
+// CHECK:         %[[LB:.*]] = load ptr, ptr %[[LB_PTR]]
+// CHECK:         %[[UB_PTR:.*]] = getelementptr {{.*}}, ptr %[[ARGS_STRUCT]], i32 0, i32 4
+// CHECK:         %[[UB:.*]] = load ptr, ptr %[[UB_PTR]]
+// CHECK:         %[[STRIDE_PTR:.*]] = getelementptr {{.*}}, ptr %[[ARGS_STRUCT]], i32 0, i32 5
+// CHECK:         %[[STRIDE:.*]] = load ptr, ptr %[[STRIDE_PTR]]
+// CHECK:         %[[LASTITER_PTR:.*]] = getelementptr {{.*}}, ptr %[[ARGS_STRUCT]], i32 0, i32 6
+// CHECK:         %[[LASTITER:.*]] = load ptr, ptr %[[LASTITER_PTR]]
+// CHECK:         br label %[[DISTRIBUTE_BODY:.*]]
+
+// CHECK:       [[DISTRIBUTE_BODY]]:
+// CHECK-NEXT:    br label %[[DISTRIBUTE_REGION:.*]]
+
+// CHECK:       [[DISTRIBUTE_REGION]]:
+// CHECK-NEXT:    br label %[[WSLOOP_REGION:.*]]
+
+// CHECK:       [[WSLOOP_REGION]]:
+// CHECK:         %omp_loop.tripcount = select {{.*}}
+// CHECK-NEXT:    br label %[[PREHEADER:.*]]
+
+// CHECK:       [[PREHEADER]]:
+// CHECK:         store i32 0, ptr %[[LB]]
+// CHECK:         %[[TRIPCOUNT:.*]] = sub i32 %omp_loop.tripcount, 1
+// CHECK:         store i32 %[[TRIPCOUNT]], ptr %[[UB]]
+// CHECK:         store i32 1, ptr %[[STRIDE]]
+// CHECK:         %[[TID:.*]] = call i32 @__kmpc_global_thread_num({{.*}})
+// CHECK:         %[[DIST_UB:.*]] = alloca i32
+// CHECK:         call void @__kmpc_dist_for_static_init_{{.*}}(ptr @{{.*}}, i32 %[[TID]], i32 34, ptr %[[LASTITER]], ptr %[[LB]], ptr %[[UB]], ptr %[[DIST_UB]], ptr %[[STRIDE]], i32 1, i32 0)
diff --git a/mlir/test/Target/LLVMIR/openmp-todo.mlir b/mlir/test/Target/LLVMIR/openmp-todo.mlir
index 71dbc061c3104..d1c745af9bff5 100644
--- a/mlir/test/Target/LLVMIR/openmp-todo.mlir
+++ b/mlir/test/Target/LLVMIR/openmp-todo.mlir
@@ -66,25 +66,6 @@ llvm.func @do_simd(%lb : i32, %ub : i32, %step : i32) {
 
 // -----
 
-llvm.func @distribute_wsloop(%lb : i32, %ub : i32, %step : i32) {
-  // expected-error at below {{LLVM Translation failed for operation: omp.parallel}}
-  omp.parallel {
-    // expected-error at below {{not yet implemented: composite omp.distribute + omp.wsloop}}
-    // expected-error at below {{LLVM Translation failed for operation: omp.distribute}}
-    omp.distribute {
-      omp.wsloop {
-        omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) {
-          omp.yield
-        }
-      } {omp.composite}
-    } {omp.composite}
-    omp.terminator
-  } {omp.composite}
-  llvm.return
-}
-
-// -----
-
 llvm.func @distribute_allocate(%lb : i32, %ub : i32, %step : i32, %x : !llvm.ptr) {
   // expected-error at below {{not yet implemented: Unhandled clause allocate in omp.distribute operation}}
   // expected-error at below {{LLVM Translation failed for operation: omp.distribute}}