[Mlir-commits] [mlir] 973cb2c - [MLIR][OMP] Ensure nested scf.parallel execute all iterations

Fri Aug 20 16:06:33 PDT 2021

Author: William S. Moses
Date: 2021-08-20T19:06:28-04:00
New Revision: 973cb2c326be9f256da0897c4d2ef117dc22761d

URL: https://github.com/llvm/llvm-project/commit/973cb2c326be9f256da0897c4d2ef117dc22761d
DIFF: https://github.com/llvm/llvm-project/commit/973cb2c326be9f256da0897c4d2ef117dc22761d.diff

LOG: [MLIR][OMP] Ensure nested scf.parallel execute all iterations

Presently, the lowering of nested scf.parallel loops to OpenMP creates one omp.parallel region, with two (nested) OpenMP worksharing loops on the inside. When lowered to LLVM and executed, this produces incorrect results. The reason for this is as follows:

An OpenMP parallel region results in the code being run with however many threads are available to OpenMP. Within a parallel region, a worksharing loop divides the total number of requested iterations among the available threads and distributes them accordingly. For a single ws loop in a parallel region, this works as intended.

Now consider nested ws loops as follows:

omp.parallel {
   A: omp.ws %i = 0...10 {
      B: omp.ws %j = 0...10 {
          code(%i, %j)
      }
   }
}

Suppose we ran this on two threads. The first workshare loop would decide to execute iterations 0, 1, 2, 3, 4 on thread 0, and iterations 5, 6, 7, 8, 9 on thread 1. The second workshare loop would decide the same for its iterations. This means thread 0 would execute i in [0, 5) and j in [0, 5). Thread 1 would execute i in [5, 10) and j in [5, 10). This means that iterations i in [5, 10), j in [0, 5) and i in [0, 5), j in [5, 10) never get executed, which is clearly wrong.

This permits two options for a remedy:
1) Change the semantics of the omp.wsloop to be distinct from that of the OpenMP runtime call or equivalently #pragma omp for. This could then allow some lowering transformation to remedy the aforementioned issue. I don't think this is desirable from an abstraction standpoint.
2) When lowering an scf.parallel, always surround the wsloop with a new parallel region (thereby causing the innermost wsloop to use only the threads available to its own parallel region).

This PR implements the latter change.

Reviewed By: jdoerfert

Differential Revision: https://reviews.llvm.org/D108426

Added: 
    

Modified: 
    mlir/lib/Conversion/SCFToOpenMP/SCFToOpenMP.cpp
    mlir/test/Conversion/SCFToOpenMP/scf-to-openmp.mlir

Removed: 
    


################################################################################
diff  --git a/mlir/lib/Conversion/SCFToOpenMP/SCFToOpenMP.cpp b/mlir/lib/Conversion/SCFToOpenMP/SCFToOpenMP.cpp
index 1dedc2c39d8f6..a7d4a99c9d5b5 100644

--- a/mlir/lib/Conversion/SCFToOpenMP/SCFToOpenMP.cpp
+++ b/mlir/lib/Conversion/SCFToOpenMP/SCFToOpenMP.cpp
@@ -44,44 +44,21 @@ struct ParallelOpLowering : public OpRewritePattern<scf::ParallelOp> {
     }
 
     // Replace the loop.
+    auto omp = rewriter.create<omp::ParallelOp>(parallelOp.getLoc());
+    Block *block = rewriter.createBlock(&omp.getRegion());
+    rewriter.setInsertionPointToStart(block);
     auto loop = rewriter.create<omp::WsLoopOp>(
         parallelOp.getLoc(), parallelOp.lowerBound(), parallelOp.upperBound(),
         parallelOp.step());
     rewriter.inlineRegionBefore(parallelOp.region(), loop.region(),
                                 loop.region().begin());
+    rewriter.create<omp::TerminatorOp>(parallelOp.getLoc());
+
     rewriter.eraseOp(parallelOp);
     return success();
   }
 };
 
-/// Inserts OpenMP "parallel" operations around top-level SCF "parallel"
-/// operations in the given function. This is implemented as a direct IR
-/// modification rather than as a conversion pattern because it does not
-/// modify the top-level operation it matches, which is a requirement for
-/// rewrite patterns.
-//
-// TODO: consider creating nested parallel operations when necessary.
-static void insertOpenMPParallel(FuncOp func) {
-  // Collect top-level SCF "parallel" ops.
-  SmallVector<scf::ParallelOp, 4> topLevelParallelOps;
-  func.walk([&topLevelParallelOps](scf::ParallelOp parallelOp) {
-    // Ignore ops that are already within OpenMP parallel construct.
-    if (!parallelOp->getParentOfType<scf::ParallelOp>())
-      topLevelParallelOps.push_back(parallelOp);
-  });
-
-  // Wrap SCF ops into OpenMP "parallel" ops.
-  for (scf::ParallelOp parallelOp : topLevelParallelOps) {
-    OpBuilder builder(parallelOp);
-    auto omp = builder.create<omp::ParallelOp>(parallelOp.getLoc());
-    Block *block = builder.createBlock(&omp.getRegion());
-    builder.create<omp::TerminatorOp>(parallelOp.getLoc());
-    block->getOperations().splice(block->begin(),
-                                  parallelOp->getBlock()->getOperations(),
-                                  parallelOp.getOperation());
-  }
-}
-
 /// Applies the conversion patterns in the given function.
 static LogicalResult applyPatterns(FuncOp func) {
   ConversionTarget target(*func.getContext());
@@ -100,7 +77,6 @@ static LogicalResult applyPatterns(FuncOp func) {
 struct SCFToOpenMPPass : public ConvertSCFToOpenMPBase<SCFToOpenMPPass> {
   /// Pass entry point.
   void runOnFunction() override {
-    insertOpenMPParallel(getFunction());
     if (failed(applyPatterns(getFunction())))
       signalPassFailure();
   }

diff  --git a/mlir/test/Conversion/SCFToOpenMP/scf-to-openmp.mlir b/mlir/test/Conversion/SCFToOpenMP/scf-to-openmp.mlir
index 60a143a85006b..44059a27b3295 100644
--- a/mlir/test/Conversion/SCFToOpenMP/scf-to-openmp.mlir
+++ b/mlir/test/Conversion/SCFToOpenMP/scf-to-openmp.mlir
@@ -21,8 +21,8 @@ func @nested_loops(%arg0: index, %arg1: index, %arg2: index,
                    %arg3: index, %arg4: index, %arg5: index) {
   // CHECK: omp.parallel {
   // CHECK: omp.wsloop (%[[LVAR_OUT1:.*]]) : index = (%arg0) to (%arg2) step (%arg4) {
-  // CHECK-NOT: omp.parallel
   scf.parallel (%i) = (%arg0) to (%arg2) step (%arg4) {
+    // CHECK: omp.parallel
     // CHECK: omp.wsloop (%[[LVAR_IN1:.*]]) : index = (%arg1) to (%arg3) step (%arg5) {
     scf.parallel (%j) = (%arg1) to (%arg3) step (%arg5) {
       // CHECK: "test.payload"(%[[LVAR_OUT1]], %[[LVAR_IN1]]) : (index, index) -> ()