[Mlir-commits] [mlir] [OpenMP][OMPIRBuilder] Add delayed privatization support for `wsloop` (PR #118463)

Kareem Ergawy llvmlistbot at llvm.org
Tue Dec 3 03:49:37 PST 2024


https://github.com/ergawy updated https://github.com/llvm/llvm-project/pull/118463

>From a3085307fcce7773baad40aaba05e0d6dfd1387e Mon Sep 17 00:00:00 2001
From: ergawy <kareem.ergawy at amd.com>
Date: Tue, 3 Dec 2024 00:53:18 -0600
Subject: [PATCH 1/2] [OpenMP][OMPIRBuilder] Refactor reduction initialization
 logic into one util

This refactors the logic needed to emit init logic for reductions
by moving some duplicated code into a shared util. The logic for doing
is quite involved and is needed for any construct that has reductions.
Moreover, when a construct has both private and reduction clauses, both
sets of clauses need to cooperate with each other when emitting the
logic needed for allocation and initialization. Therefore, this PR
clearly sets the boundaries for the logic needed to initialize
reductions.
---
 .../OpenMP/OpenMPToLLVMIRTranslation.cpp      | 228 ++++++++----------
 .../openmp-reduction-array-sections.mlir      |   2 +-
 .../LLVMIR/openmp-reduction-sections.mlir     |   2 +-
 .../openmp-wsloop-reduction-cleanup.mlir      |   4 +-
 4 files changed, 109 insertions(+), 127 deletions(-)

diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 35b0633a04a352..9b520b10fd7c69 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -1030,6 +1030,99 @@ mapInitializationArgs(T loop, LLVM::ModuleTranslation &moduleTranslation,
   }
 }
 
+template <typename OP>
+static LogicalResult
+initReductionVars(OP op, ArrayRef<BlockArgument> reductionArgs,
+                  llvm::IRBuilderBase &builder,
+                  LLVM::ModuleTranslation &moduleTranslation,
+                  llvm::BasicBlock *latestAllocaBlock,
+                  SmallVectorImpl<omp::DeclareReductionOp> &reductionDecls,
+                  SmallVectorImpl<llvm::Value *> &privateReductionVariables,
+                  DenseMap<Value, llvm::Value *> &reductionVariableMap,
+                  llvm::ArrayRef<bool> isByRef,
+                  SmallVectorImpl<DeferredStore> &deferredStores) {
+  if (op.getNumReductionVars() == 0)
+    return success();
+
+  llvm::IRBuilderBase::InsertPointGuard guard(builder);
+
+  builder.SetInsertPoint(latestAllocaBlock->getTerminator());
+  llvm::BasicBlock *initBlock = splitBB(builder, true, "omp.reduction.init");
+  auto allocaIP = llvm::IRBuilderBase::InsertPoint(
+      latestAllocaBlock, latestAllocaBlock->getTerminator()->getIterator());
+  builder.restoreIP(allocaIP);
+  SmallVector<llvm::Value *> byRefVars(op.getNumReductionVars());
+
+  for (unsigned i = 0; i < op.getNumReductionVars(); ++i) {
+    if (isByRef[i]) {
+      if (!reductionDecls[i].getAllocRegion().empty())
+        continue;
+
+      // TODO: remove after all users of by-ref are updated to use the alloc
+      // region: Allocate reduction variable (which is a pointer to the real
+      // reduciton variable allocated in the inlined region)
+      byRefVars[i] = builder.CreateAlloca(
+          moduleTranslation.convertType(reductionDecls[i].getType()));
+    }
+  }
+
+  builder.SetInsertPoint(&*initBlock->getFirstNonPHIOrDbgOrAlloca());
+
+  // store result of the alloc region to the allocated pointer to the real
+  // reduction variable
+  for (auto [data, addr] : deferredStores)
+    builder.CreateStore(data, addr);
+
+  // Before the loop, store the initial values of reductions into reduction
+  // variables. Although this could be done after allocas, we don't want to mess
+  // up with the alloca insertion point.
+  for (unsigned i = 0; i < op.getNumReductionVars(); ++i) {
+    SmallVector<llvm::Value *, 1> phis;
+
+    // map block argument to initializer region
+    mapInitializationArgs(op, moduleTranslation, reductionDecls,
+                          reductionVariableMap, i);
+
+    if (failed(inlineConvertOmpRegions(reductionDecls[i].getInitializerRegion(),
+                                       "omp.reduction.neutral", builder,
+                                       moduleTranslation, &phis)))
+      return failure();
+
+    assert(phis.size() == 1 && "expected one value to be yielded from the "
+                               "reduction neutral element declaration region");
+
+    builder.SetInsertPoint(builder.GetInsertBlock()->getTerminator());
+
+    if (isByRef[i]) {
+      if (!reductionDecls[i].getAllocRegion().empty())
+        // done in allocReductionVars
+        continue;
+
+      // TODO: this path can be removed once all users of by-ref are updated to
+      // use an alloc region
+
+      // Store the result of the inlined region to the allocated reduction var
+      // ptr
+      builder.CreateStore(phis[0], byRefVars[i]);
+
+      privateReductionVariables[i] = byRefVars[i];
+      moduleTranslation.mapValue(reductionArgs[i], phis[0]);
+      reductionVariableMap.try_emplace(op.getReductionVars()[i], phis[0]);
+    } else {
+      // for by-ref case the store is inside of the reduction region
+      builder.CreateStore(phis[0], privateReductionVariables[i]);
+      // the rest was handled in allocByValReductionVars
+    }
+
+    // forget the mapping for the initializer region because we might need a
+    // different mapping if this reduction declaration is re-used for a
+    // different variable
+    moduleTranslation.forgetMapping(reductionDecls[i].getInitializerRegion());
+  }
+
+  return success();
+}
+
 /// Collect reduction info
 template <typename T>
 static void collectReductionInfo(
@@ -1183,6 +1276,7 @@ static LogicalResult allocAndInitializeReductionVars(
   if (op.getNumReductionVars() == 0)
     return success();
 
+  llvm::IRBuilderBase::InsertPointGuard guard(builder);
   SmallVector<DeferredStore> deferredStores;
 
   if (failed(allocReductionVars(op, reductionArgs, builder, moduleTranslation,
@@ -1191,59 +1285,10 @@ static LogicalResult allocAndInitializeReductionVars(
                                 deferredStores, isByRef)))
     return failure();
 
-  // store result of the alloc region to the allocated pointer to the real
-  // reduction variable
-  for (auto [data, addr] : deferredStores)
-    builder.CreateStore(data, addr);
-
-  // Before the loop, store the initial values of reductions into reduction
-  // variables. Although this could be done after allocas, we don't want to mess
-  // up with the alloca insertion point.
-  for (unsigned i = 0; i < op.getNumReductionVars(); ++i) {
-    SmallVector<llvm::Value *, 1> phis;
-
-    // map block argument to initializer region
-    mapInitializationArgs(op, moduleTranslation, reductionDecls,
-                          reductionVariableMap, i);
-
-    if (failed(inlineConvertOmpRegions(reductionDecls[i].getInitializerRegion(),
-                                       "omp.reduction.neutral", builder,
-                                       moduleTranslation, &phis)))
-      return failure();
-    assert(phis.size() == 1 && "expected one value to be yielded from the "
-                               "reduction neutral element declaration region");
-    if (isByRef[i]) {
-      if (!reductionDecls[i].getAllocRegion().empty())
-        // done in allocReductionVars
-        continue;
-
-      // TODO: this path can be removed once all users of by-ref are updated to
-      // use an alloc region
-
-      // Allocate reduction variable (which is a pointer to the real reduction
-      // variable allocated in the inlined region)
-      llvm::Value *var = builder.CreateAlloca(
-          moduleTranslation.convertType(reductionDecls[i].getType()));
-      // Store the result of the inlined region to the allocated reduction var
-      // ptr
-      builder.CreateStore(phis[0], var);
-
-      privateReductionVariables[i] = var;
-      moduleTranslation.mapValue(reductionArgs[i], phis[0]);
-      reductionVariableMap.try_emplace(op.getReductionVars()[i], phis[0]);
-    } else {
-      // for by-ref case the store is inside of the reduction region
-      builder.CreateStore(phis[0], privateReductionVariables[i]);
-      // the rest was handled in allocByValReductionVars
-    }
-
-    // forget the mapping for the initializer region because we might need a
-    // different mapping if this reduction declaration is re-used for a
-    // different variable
-    moduleTranslation.forgetMapping(reductionDecls[i].getInitializerRegion());
-  }
-
-  return success();
+  return initReductionVars(op, reductionArgs, builder, moduleTranslation,
+                           allocaIP.getBlock(), reductionDecls,
+                           privateReductionVariables, reductionVariableMap,
+                           isByRef, deferredStores);
 }
 
 /// Allocate delayed private variables. Returns the basic block which comes
@@ -1929,6 +1974,7 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
           splitBB(builder, /*CreateBranch=*/true, "omp.private.copy");
       builder.SetInsertPoint(copyBlock->getFirstNonPHIOrDbgOrAlloca());
     }
+
     for (auto [decl, mlirVar, llvmVar] :
          llvm::zip_equal(privateDecls, mlirPrivateVars, llvmPrivateVars)) {
       if (decl.getDataSharingType() != omp::DataSharingClauseType::FirstPrivate)
@@ -1960,76 +2006,12 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
       moduleTranslation.forgetMapping(copyRegion);
     }
 
-    // Initialize reduction vars
-    builder.SetInsertPoint(builder.GetInsertBlock()->getTerminator());
-    llvm::BasicBlock *initBlock = splitBB(builder, true, "omp.reduction.init");
-    allocaIP =
-        InsertPointTy(allocaIP.getBlock(),
-                      allocaIP.getBlock()->getTerminator()->getIterator());
-
-    builder.restoreIP(allocaIP);
-    SmallVector<llvm::Value *> byRefVars(opInst.getNumReductionVars());
-    for (unsigned i = 0; i < opInst.getNumReductionVars(); ++i) {
-      if (isByRef[i]) {
-        if (!reductionDecls[i].getAllocRegion().empty())
-          continue;
-
-        // TODO: remove after all users of by-ref are updated to use the alloc
-        // region: Allocate reduction variable (which is a pointer to the real
-        // reduciton variable allocated in the inlined region)
-        byRefVars[i] = builder.CreateAlloca(
-            moduleTranslation.convertType(reductionDecls[i].getType()));
-      }
-    }
-
-    builder.SetInsertPoint(initBlock->getFirstNonPHIOrDbgOrAlloca());
-
-    // insert stores deferred until after all allocas
-    // these store the results of the alloc region into the allocation for the
-    // pointer to the reduction variable
-    for (auto [data, addr] : deferredStores)
-      builder.CreateStore(data, addr);
-
-    for (unsigned i = 0; i < opInst.getNumReductionVars(); ++i) {
-      SmallVector<llvm::Value *> phis;
-
-      // map the block argument
-      mapInitializationArgs(opInst, moduleTranslation, reductionDecls,
-                            reductionVariableMap, i);
-      if (failed(inlineConvertOmpRegions(
-              reductionDecls[i].getInitializerRegion(), "omp.reduction.neutral",
-              builder, moduleTranslation, &phis)))
-        return llvm::createStringError(
-            "failed to inline `init` region of `omp.declare_reduction`");
-      assert(phis.size() == 1 &&
-             "expected one value to be yielded from the "
-             "reduction neutral element declaration region");
-
-      builder.SetInsertPoint(builder.GetInsertBlock()->getTerminator());
-
-      if (isByRef[i]) {
-        if (!reductionDecls[i].getAllocRegion().empty())
-          continue;
-
-        // TODO: remove after all users of by-ref are updated to use the alloc
-
-        // Store the result of the inlined region to the allocated reduction var
-        // ptr
-        builder.CreateStore(phis[0], byRefVars[i]);
-
-        privateReductionVariables[i] = byRefVars[i];
-        moduleTranslation.mapValue(reductionArgs[i], phis[0]);
-        reductionVariableMap.try_emplace(opInst.getReductionVars()[i], phis[0]);
-      } else {
-        // for by-ref case the store is inside of the reduction init region
-        builder.CreateStore(phis[0], privateReductionVariables[i]);
-        // the rest is done in allocByValReductionVars
-      }
-
-      // clear block argument mapping in case it needs to be re-created with a
-      // different source for another use of the same reduction decl
-      moduleTranslation.forgetMapping(reductionDecls[i].getInitializerRegion());
-    }
+    if (failed(
+            initReductionVars(opInst, reductionArgs, builder, moduleTranslation,
+                              afterAllocas.get()->getSinglePredecessor(),
+                              reductionDecls, privateReductionVariables,
+                              reductionVariableMap, isByRef, deferredStores)))
+      return llvm::make_error<PreviouslyReportedError>();
 
     // Store the mapping between reduction variables and their private copies on
     // ModuleTranslation stack. It can be then recovered when translating
diff --git a/mlir/test/Target/LLVMIR/openmp-reduction-array-sections.mlir b/mlir/test/Target/LLVMIR/openmp-reduction-array-sections.mlir
index 5a506310653c83..fdfcc66b91012d 100644
--- a/mlir/test/Target/LLVMIR/openmp-reduction-array-sections.mlir
+++ b/mlir/test/Target/LLVMIR/openmp-reduction-array-sections.mlir
@@ -91,12 +91,12 @@ llvm.func @sectionsreduction_(%arg0: !llvm.ptr {fir.bindc_name = "x"}) attribute
 // CHECK:         %[[VAL_14:.*]] = alloca [1 x ptr], align 8
 // CHECK:         br label %[[VAL_15:.*]]
 // CHECK:       omp.reduction.init:                               ; preds = %[[VAL_16:.*]]
+// CHECK:         store ptr %[[VAL_20]], ptr %[[VAL_21]], align 8
 // CHECK:         br label %[[VAL_17:.*]]
 // CHECK:       omp.par.region:                                   ; preds = %[[VAL_15]]
 // CHECK:         br label %[[VAL_18:.*]]
 // CHECK:       omp.par.region1:                                  ; preds = %[[VAL_17]]
 // CHECK:         %[[VAL_19:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, i64 1, align 8
-// CHECK:         store ptr %[[VAL_20]], ptr %[[VAL_21]], align 8
 // CHECK:         br label %[[VAL_22:.*]]
 // CHECK:       omp_section_loop.preheader:                       ; preds = %[[VAL_18]]
 // CHECK:         store i32 0, ptr %[[VAL_7]], align 4
diff --git a/mlir/test/Target/LLVMIR/openmp-reduction-sections.mlir b/mlir/test/Target/LLVMIR/openmp-reduction-sections.mlir
index db9a314b1f5a3c..ed7e9fada5fc44 100644
--- a/mlir/test/Target/LLVMIR/openmp-reduction-sections.mlir
+++ b/mlir/test/Target/LLVMIR/openmp-reduction-sections.mlir
@@ -51,11 +51,11 @@ llvm.func @sections_(%arg0: !llvm.ptr {fir.bindc_name = "x"}) attributes {fir.in
 // CHECK:         %[[VAL_21:.*]] = alloca [1 x ptr], align 8
 // CHECK:         br label %[[VAL_22:.*]]
 // CHECK:       omp.reduction.init:                               ; preds = %[[VAL_23:.*]]
+// CHECK:         store float 0.000000e+00, ptr %[[VAL_20]], align 4
 // CHECK:         br label %[[VAL_24:.*]]
 // CHECK:       omp.par.region:                                   ; preds = %[[VAL_22]]
 // CHECK:         br label %[[VAL_25:.*]]
 // CHECK:       omp.par.region1:                                  ; preds = %[[VAL_24]]
-// CHECK:         store float 0.000000e+00, ptr %[[VAL_20]], align 4
 // CHECK:         br label %[[VAL_26:.*]]
 // CHECK:       omp_section_loop.preheader:                       ; preds = %[[VAL_25]]
 // CHECK:         store i32 0, ptr %[[VAL_13]], align 4
diff --git a/mlir/test/Target/LLVMIR/openmp-wsloop-reduction-cleanup.mlir b/mlir/test/Target/LLVMIR/openmp-wsloop-reduction-cleanup.mlir
index 1a5065fec026ea..7dde3846da6495 100644
--- a/mlir/test/Target/LLVMIR/openmp-wsloop-reduction-cleanup.mlir
+++ b/mlir/test/Target/LLVMIR/openmp-wsloop-reduction-cleanup.mlir
@@ -51,11 +51,11 @@
   llvm.func @free(%arg0 : !llvm.ptr) -> ()
 
 // Private reduction variable and its initialization.
-// CHECK: %[[MALLOC_I:.+]] = call ptr @malloc(i64 4)
 // CHECK: %[[PRIV_PTR_I:.+]] = alloca ptr
+// CHECK: %[[PRIV_PTR_J:.+]] = alloca ptr
+// CHECK: %[[MALLOC_I:.+]] = call ptr @malloc(i64 4)
 // CHECK: store ptr %[[MALLOC_I]], ptr %[[PRIV_PTR_I]]
 // CHECK: %[[MALLOC_J:.+]] = call ptr @malloc(i64 4)
-// CHECK: %[[PRIV_PTR_J:.+]] = alloca ptr
 // CHECK: store ptr %[[MALLOC_J]], ptr %[[PRIV_PTR_J]]
 
 // Call to the reduction function.

>From eac2adcb0129526ee8d2a9b0080ce266e137856d Mon Sep 17 00:00:00 2001
From: ergawy <kareem.ergawy at amd.com>
Date: Mon, 2 Dec 2024 05:17:34 -0600
Subject: [PATCH 2/2] [OpenMP][OMPIRBuilder] Add delayed privatization support
 for `wsloop`

Extend MLIR to LLVM lowering by adding support for `omp.wsloop` for
delayed privatization. This also refactors a few bit of code to isolate
the logic needed for `firstprivate` initialization in a shared util that
can be used across constructs that need it.
---
 .../OpenMP/OpenMPToLLVMIRTranslation.cpp      | 261 ++++++++++--------
 mlir/test/Target/LLVMIR/openmp-todo.mlir      |  19 --
 .../Target/LLVMIR/openmp-wsloop-private.mlir  |  88 ++++++
 3 files changed, 228 insertions(+), 140 deletions(-)
 create mode 100644 mlir/test/Target/LLVMIR/openmp-wsloop-private.mlir

diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 9b520b10fd7c69..6b42af0158c6db 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -268,7 +268,6 @@ static LogicalResult checkImplementationStatus(Operation &op) {
         checkAllocate(op, result);
         checkLinear(op, result);
         checkOrder(op, result);
-        checkPrivate(op, result);
       })
       .Case([&](omp::ParallelOp op) { checkAllocate(op, result); })
       .Case([&](omp::SimdOp op) {
@@ -1302,6 +1301,7 @@ allocatePrivateVars(llvm::IRBuilderBase &builder,
                     MutableArrayRef<mlir::Value> mlirPrivateVars,
                     llvm::SmallVectorImpl<llvm::Value *> &llvmPrivateVars,
                     const llvm::OpenMPIRBuilder::InsertPointTy &allocaIP) {
+  llvm::IRBuilderBase::InsertPointGuard guard(builder);
   // Allocate private vars
   llvm::BranchInst *allocaTerminator =
       llvm::cast<llvm::BranchInst>(allocaIP.getBlock()->getTerminator());
@@ -1363,6 +1363,84 @@ allocatePrivateVars(llvm::IRBuilderBase &builder,
   return afterAllocas;
 }
 
+static LogicalResult
+initFirstPrivateVars(llvm::IRBuilderBase &builder,
+                     LLVM::ModuleTranslation &moduleTranslation,
+                     SmallVectorImpl<mlir::Value> &mlirPrivateVars,
+                     SmallVectorImpl<llvm::Value *> &llvmPrivateVars,
+                     SmallVectorImpl<omp::PrivateClauseOp> &privateDecls,
+                     llvm::BasicBlock *afterAllocas) {
+  llvm::IRBuilderBase::InsertPointGuard guard(builder);
+  // Apply copy region for firstprivate.
+  bool needsFirstprivate =
+      llvm::any_of(privateDecls, [](omp::PrivateClauseOp &privOp) {
+        return privOp.getDataSharingType() ==
+               omp::DataSharingClauseType::FirstPrivate;
+      });
+
+  if (needsFirstprivate) {
+    // Find the end of the allocation blocks
+    builder.SetInsertPoint(
+        afterAllocas->getSinglePredecessor()->getTerminator());
+    llvm::BasicBlock *copyBlock =
+        splitBB(builder, /*CreateBranch=*/true, "omp.private.copy");
+    builder.SetInsertPoint(copyBlock->getFirstNonPHIOrDbgOrAlloca());
+  }
+
+  for (auto [decl, mlirVar, llvmVar] :
+       llvm::zip_equal(privateDecls, mlirPrivateVars, llvmPrivateVars)) {
+    if (decl.getDataSharingType() != omp::DataSharingClauseType::FirstPrivate)
+      continue;
+
+    // copyRegion implements `lhs = rhs`
+    Region &copyRegion = decl.getCopyRegion();
+
+    // map copyRegion rhs arg
+    llvm::Value *nonPrivateVar = moduleTranslation.lookupValue(mlirVar);
+    assert(nonPrivateVar);
+    moduleTranslation.mapValue(decl.getCopyMoldArg(), nonPrivateVar);
+
+    // map copyRegion lhs arg
+    moduleTranslation.mapValue(decl.getCopyPrivateArg(), llvmVar);
+
+    // in-place convert copy region
+    builder.SetInsertPoint(builder.GetInsertBlock()->getTerminator());
+    if (failed(inlineConvertOmpRegions(copyRegion, "omp.private.copy", builder,
+                                       moduleTranslation)))
+      return decl.emitError("failed to inline `copy` region of `omp.private`");
+
+    // ignore unused value yielded from copy region
+
+    // clear copy region block argument mapping in case it needs to be
+    // re-created with different sources for reuse of the same reduction
+    // decl
+    moduleTranslation.forgetMapping(copyRegion);
+  }
+
+  return success();
+}
+
+static LogicalResult
+cleanupPrivateVars(llvm::IRBuilderBase &builder,
+                   LLVM::ModuleTranslation &moduleTranslation, Location loc,
+                   SmallVectorImpl<llvm::Value *> &llvmPrivateVars,
+                   SmallVectorImpl<omp::PrivateClauseOp> &privateDecls) {
+  // private variable deallocation
+  SmallVector<Region *> privateCleanupRegions;
+  llvm::transform(privateDecls, std::back_inserter(privateCleanupRegions),
+                  [](omp::PrivateClauseOp privatizer) {
+                    return &privatizer.getDeallocRegion();
+                  });
+
+  if (failed(inlineOmpRegionCleanup(
+          privateCleanupRegions, llvmPrivateVars, moduleTranslation, builder,
+          "omp.private.dealloc", /*shouldLoadCleanupRegionArg=*/false)))
+    return mlir::emitError(loc, "failed to inline `dealloc` region of an "
+                                "`omp.private` op in an omp.task");
+
+  return success();
+}
+
 static LogicalResult
 convertOmpSections(Operation &opInst, llvm::IRBuilderBase &builder,
                    LLVM::ModuleTranslation &moduleTranslation) {
@@ -1622,50 +1700,10 @@ convertOmpTaskOp(omp::TaskOp taskOp, llvm::IRBuilderBase &builder,
     if (handleError(afterAllocas, *taskOp).failed())
       return llvm::make_error<PreviouslyReportedError>();
 
-    // Apply copy region for firstprivate
-    bool needsFirstPrivate =
-        llvm::any_of(privateDecls, [](omp::PrivateClauseOp &privOp) {
-          return privOp.getDataSharingType() ==
-                 omp::DataSharingClauseType::FirstPrivate;
-        });
-    if (needsFirstPrivate) {
-      // Find the end of the allocation blocks
-      assert(afterAllocas.get()->getSinglePredecessor());
-      builder.SetInsertPoint(
-          afterAllocas.get()->getSinglePredecessor()->getTerminator());
-      llvm::BasicBlock *copyBlock =
-          splitBB(builder, /*CreateBranch=*/true, "omp.private.copy");
-      builder.SetInsertPoint(copyBlock->getFirstNonPHIOrDbgOrAlloca());
-    }
-    for (auto [decl, mlirVar, llvmVar] :
-         llvm::zip_equal(privateDecls, mlirPrivateVars, llvmPrivateVars)) {
-      if (decl.getDataSharingType() != omp::DataSharingClauseType::FirstPrivate)
-        continue;
-
-      // copyRegion implements `lhs = rhs`
-      Region &copyRegion = decl.getCopyRegion();
-
-      // map copyRegion rhs arg
-      llvm::Value *nonPrivateVar = moduleTranslation.lookupValue(mlirVar);
-      assert(nonPrivateVar);
-      moduleTranslation.mapValue(decl.getCopyMoldArg(), nonPrivateVar);
-
-      // map copyRegion lhs arg
-      moduleTranslation.mapValue(decl.getCopyPrivateArg(), llvmVar);
-
-      // in-place convert copy region
-      builder.SetInsertPoint(builder.GetInsertBlock()->getTerminator());
-      if (failed(inlineConvertOmpRegions(copyRegion, "omp.private.copy",
-                                         builder, moduleTranslation)))
-        return llvm::createStringError(
-            "failed to inline `copy` region of an `omp.private` op in taskOp");
-
-      // ignore unused value yielded from copy region
-
-      // clear copy region block argument mapping in case it needs to be
-      // re-created with different source for reuse of the same reduction decl
-      moduleTranslation.forgetMapping(copyRegion);
-    }
+    if (failed(initFirstPrivateVars(builder, moduleTranslation, mlirPrivateVars,
+                                    llvmPrivateVars, privateDecls,
+                                    afterAllocas.get())))
+      return llvm::make_error<PreviouslyReportedError>();
 
     // translate the body of the task:
     builder.restoreIP(codegenIP);
@@ -1674,19 +1712,11 @@ convertOmpTaskOp(omp::TaskOp taskOp, llvm::IRBuilderBase &builder,
     if (failed(handleError(continuationBlockOrError, *taskOp)))
       return llvm::make_error<PreviouslyReportedError>();
 
-    // private variable deallocation
-    SmallVector<Region *> privateCleanupRegions;
-    llvm::transform(privateDecls, std::back_inserter(privateCleanupRegions),
-                    [](omp::PrivateClauseOp privatizer) {
-                      return &privatizer.getDeallocRegion();
-                    });
-
     builder.SetInsertPoint(continuationBlockOrError.get()->getTerminator());
-    if (failed(inlineOmpRegionCleanup(
-            privateCleanupRegions, llvmPrivateVars, moduleTranslation, builder,
-            "omp.private.dealloc", /*shouldLoadCleanupRegionArg=*/false)))
-      return llvm::createStringError("failed to inline `dealloc` region of an "
-                                     "`omp.private` op in an omp.task");
+
+    if (failed(cleanupPrivateVars(builder, moduleTranslation, taskOp.getLoc(),
+                                  llvmPrivateVars, privateDecls)))
+      return llvm::make_error<PreviouslyReportedError>();
 
     return llvm::Error::success();
   };
@@ -1777,6 +1807,18 @@ convertOmpWsloop(Operation &opInst, llvm::IRBuilderBase &builder,
     chunk = builder.CreateSExtOrTrunc(chunkVar, ivType);
   }
 
+  MutableArrayRef<BlockArgument> privateBlockArgs =
+      cast<omp::BlockArgOpenMPOpInterface>(*wsloopOp).getPrivateBlockArgs();
+  SmallVector<mlir::Value> mlirPrivateVars;
+  SmallVector<llvm::Value *> llvmPrivateVars;
+  SmallVector<omp::PrivateClauseOp> privateDecls;
+  mlirPrivateVars.reserve(privateBlockArgs.size());
+  llvmPrivateVars.reserve(privateBlockArgs.size());
+  collectPrivatizationDecls(wsloopOp, privateDecls);
+
+  for (mlir::Value privateVar : wsloopOp.getPrivateVars())
+    mlirPrivateVars.push_back(privateVar);
+
   SmallVector<omp::DeclareReductionOp> reductionDecls;
   collectReductionDecls(wsloopOp, reductionDecls);
   llvm::OpenMPIRBuilder::InsertPointTy allocaIP =
@@ -1784,15 +1826,37 @@ convertOmpWsloop(Operation &opInst, llvm::IRBuilderBase &builder,
 
   SmallVector<llvm::Value *> privateReductionVariables(
       wsloopOp.getNumReductionVars());
+
+  llvm::Expected<llvm::BasicBlock *> afterAllocas = allocatePrivateVars(
+      builder, moduleTranslation, privateBlockArgs, privateDecls,
+      mlirPrivateVars, llvmPrivateVars, allocaIP);
+  if (handleError(afterAllocas, opInst).failed())
+    return failure();
+
   DenseMap<Value, llvm::Value *> reductionVariableMap;
 
   MutableArrayRef<BlockArgument> reductionArgs =
       cast<omp::BlockArgOpenMPOpInterface>(opInst).getReductionBlockArgs();
 
-  if (failed(allocAndInitializeReductionVars(
-          wsloopOp, reductionArgs, builder, moduleTranslation, allocaIP,
-          reductionDecls, privateReductionVariables, reductionVariableMap,
-          isByRef)))
+  SmallVector<DeferredStore> deferredStores;
+
+  if (failed(allocReductionVars(wsloopOp, reductionArgs, builder,
+                                moduleTranslation, allocaIP, reductionDecls,
+                                privateReductionVariables, reductionVariableMap,
+                                deferredStores, isByRef)))
+    return failure();
+
+  if (failed(initFirstPrivateVars(builder, moduleTranslation, mlirPrivateVars,
+                                  llvmPrivateVars, privateDecls,
+                                  afterAllocas.get())))
+    return failure();
+
+  assert(afterAllocas.get()->getSinglePredecessor());
+  if (failed(initReductionVars(wsloopOp, reductionArgs, builder,
+                               moduleTranslation,
+                               afterAllocas.get()->getSinglePredecessor(),
+                               reductionDecls, privateReductionVariables,
+                               reductionVariableMap, isByRef, deferredStores)))
     return failure();
 
   // TODO: Replace this with proper composite translation support.
@@ -1899,9 +1963,13 @@ convertOmpWsloop(Operation &opInst, llvm::IRBuilderBase &builder,
   builder.restoreIP(afterIP);
 
   // Process the reductions if required.
-  return createReductionsAndCleanup(wsloopOp, builder, moduleTranslation,
-                                    allocaIP, reductionDecls,
-                                    privateReductionVariables, isByRef);
+  if (failed(createReductionsAndCleanup(wsloopOp, builder, moduleTranslation,
+                                        allocaIP, reductionDecls,
+                                        privateReductionVariables, isByRef)))
+    return failure();
+
+  return cleanupPrivateVars(builder, moduleTranslation, wsloopOp.getLoc(),
+                            llvmPrivateVars, privateDecls);
 }
 
 /// Converts the OpenMP parallel operation to LLVM IR.
@@ -1959,53 +2027,12 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
             deferredStores, isByRef)))
       return llvm::make_error<PreviouslyReportedError>();
 
-    // Apply copy region for firstprivate.
-    bool needsFirstprivate =
-        llvm::any_of(privateDecls, [](omp::PrivateClauseOp &privOp) {
-          return privOp.getDataSharingType() ==
-                 omp::DataSharingClauseType::FirstPrivate;
-        });
-    if (needsFirstprivate) {
-      // Find the end of the allocation blocks
-      assert(afterAllocas.get()->getSinglePredecessor());
-      builder.SetInsertPoint(
-          afterAllocas.get()->getSinglePredecessor()->getTerminator());
-      llvm::BasicBlock *copyBlock =
-          splitBB(builder, /*CreateBranch=*/true, "omp.private.copy");
-      builder.SetInsertPoint(copyBlock->getFirstNonPHIOrDbgOrAlloca());
-    }
-
-    for (auto [decl, mlirVar, llvmVar] :
-         llvm::zip_equal(privateDecls, mlirPrivateVars, llvmPrivateVars)) {
-      if (decl.getDataSharingType() != omp::DataSharingClauseType::FirstPrivate)
-        continue;
-
-      // copyRegion implements `lhs = rhs`
-      Region &copyRegion = decl.getCopyRegion();
-
-      // map copyRegion rhs arg
-      llvm::Value *nonPrivateVar = moduleTranslation.lookupValue(mlirVar);
-      assert(nonPrivateVar);
-      moduleTranslation.mapValue(decl.getCopyMoldArg(), nonPrivateVar);
-
-      // map copyRegion lhs arg
-      moduleTranslation.mapValue(decl.getCopyPrivateArg(), llvmVar);
-
-      // in-place convert copy region
-      builder.SetInsertPoint(builder.GetInsertBlock()->getTerminator());
-      if (failed(inlineConvertOmpRegions(copyRegion, "omp.private.copy",
-                                         builder, moduleTranslation)))
-        return llvm::createStringError(
-            "failed to inline `copy` region of `omp.private`");
-
-      // ignore unused value yielded from copy region
-
-      // clear copy region block argument mapping in case it needs to be
-      // re-created with different sources for reuse of the same reduction
-      // decl
-      moduleTranslation.forgetMapping(copyRegion);
-    }
+    if (failed(initFirstPrivateVars(builder, moduleTranslation, mlirPrivateVars,
+                                    llvmPrivateVars, privateDecls,
+                                    afterAllocas.get())))
+      return llvm::make_error<PreviouslyReportedError>();
 
+    assert(afterAllocas.get()->getSinglePredecessor());
     if (failed(
             initReductionVars(opInst, reductionArgs, builder, moduleTranslation,
                               afterAllocas.get()->getSinglePredecessor(),
@@ -2090,17 +2117,9 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
       return llvm::createStringError(
           "failed to inline `cleanup` region of `omp.declare_reduction`");
 
-    SmallVector<Region *> privateCleanupRegions;
-    llvm::transform(privateDecls, std::back_inserter(privateCleanupRegions),
-                    [](omp::PrivateClauseOp privatizer) {
-                      return &privatizer.getDeallocRegion();
-                    });
-
-    if (failed(inlineOmpRegionCleanup(
-            privateCleanupRegions, llvmPrivateVars, moduleTranslation, builder,
-            "omp.private.dealloc", /*shouldLoadCleanupRegionArg=*/false)))
-      return llvm::createStringError(
-          "failed to inline `dealloc` region of `omp.private`");
+    if (failed(cleanupPrivateVars(builder, moduleTranslation, opInst.getLoc(),
+                                  llvmPrivateVars, privateDecls)))
+      return llvm::make_error<PreviouslyReportedError>();
 
     builder.restoreIP(oldIP);
     return llvm::Error::success();
diff --git a/mlir/test/Target/LLVMIR/openmp-todo.mlir b/mlir/test/Target/LLVMIR/openmp-todo.mlir
index de797ea2aa3649..c1e0014d1f571f 100644
--- a/mlir/test/Target/LLVMIR/openmp-todo.mlir
+++ b/mlir/test/Target/LLVMIR/openmp-todo.mlir
@@ -635,22 +635,3 @@ llvm.func @wsloop_order(%lb : i32, %ub : i32, %step : i32) {
   }
   llvm.return
 }
-
-// -----
-
-omp.private {type = private} @x.privatizer : !llvm.ptr alloc {
-^bb0(%arg0: !llvm.ptr):
-  %0 = llvm.mlir.constant(1 : i32) : i32
-  %1 = llvm.alloca %0 x i32 : (i32) -> !llvm.ptr
-  omp.yield(%1 : !llvm.ptr)
-}
-llvm.func @wsloop_private(%lb : i32, %ub : i32, %step : i32, %x : !llvm.ptr) {
-  // expected-error at below {{not yet implemented: Unhandled clause privatization in omp.wsloop operation}}
-  // expected-error at below {{LLVM Translation failed for operation: omp.wsloop}}
-  omp.wsloop private(@x.privatizer %x -> %arg0 : !llvm.ptr) {
-    omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) {
-      omp.yield
-    }
-  }
-  llvm.return
-}
diff --git a/mlir/test/Target/LLVMIR/openmp-wsloop-private.mlir b/mlir/test/Target/LLVMIR/openmp-wsloop-private.mlir
new file mode 100644
index 00000000000000..db67bb5fae58b0
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/openmp-wsloop-private.mlir
@@ -0,0 +1,88 @@
+// RUN: mlir-translate -mlir-to-llvmir -split-input-file %s | FileCheck %s
+
+// tests a wsloop private + firstprivate + reduction to make sure block structure
+// is handled properly.
+
+omp.private {type = private} @_QFwsloop_privateEi_private_ref_i32 : !llvm.ptr alloc {
+^bb0(%arg0: !llvm.ptr):
+  %0 = llvm.mlir.constant(1 : i64) : i64
+  %1 = llvm.alloca %0 x i32 {bindc_name = "i", pinned} : (i64) -> !llvm.ptr
+  omp.yield(%1 : !llvm.ptr)
+}
+
+llvm.func @foo_free(!llvm.ptr)
+
+omp.private {type = firstprivate} @_QFwsloop_privateEc_firstprivate_ref_c8 : !llvm.ptr alloc {
+^bb0(%arg0: !llvm.ptr):
+  %0 = llvm.mlir.constant(1 : i64) : i64
+  %1 = llvm.alloca %0 x !llvm.array<1 x i8> {bindc_name = "c", pinned} : (i64) -> !llvm.ptr
+  omp.yield(%1 : !llvm.ptr)
+} copy {
+^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr):
+  %0 = llvm.load %arg0 : !llvm.ptr -> !llvm.array<1 x i8>
+  llvm.store %0, %arg1 : !llvm.array<1 x i8>, !llvm.ptr
+  omp.yield(%arg1 : !llvm.ptr)
+} dealloc {
+^bb0(%arg0: !llvm.ptr):
+  llvm.call @foo_free(%arg0) : (!llvm.ptr) -> ()
+  omp.yield
+}
+
+omp.declare_reduction @max_f32 : f32 init {
+^bb0(%arg0: f32):
+  %0 = llvm.mlir.constant(-3.40282347E+38 : f32) : f32
+  omp.yield(%0 : f32)
+} combiner {
+^bb0(%arg0: f32, %arg1: f32):
+  %0 = llvm.intr.maxnum(%arg0, %arg1) {fastmathFlags = #llvm.fastmath<contract>} : (f32, f32) -> f32
+  omp.yield(%0 : f32)
+}
+
+llvm.func @wsloop_private_(%arg0: !llvm.ptr {fir.bindc_name = "y"}) attributes {fir.internal_name = "_QPwsloop_private", frame_pointer = #llvm.framePointerKind<all>, target_cpu = "x86-64"} {
+  %0 = llvm.mlir.constant(1 : i64) : i64
+  %1 = llvm.alloca %0 x f32 {bindc_name = "x"} : (i64) -> !llvm.ptr
+  %3 = llvm.alloca %0 x i32 {bindc_name = "i"} : (i64) -> !llvm.ptr
+  %5 = llvm.alloca %0 x !llvm.array<1 x i8> {bindc_name = "c"} : (i64) -> !llvm.ptr
+  %6 = llvm.mlir.constant(1 : i32) : i32
+  %7 = llvm.mlir.constant(10 : i32) : i32
+  %8 = llvm.mlir.constant(0 : i32) : i32
+  omp.parallel {
+    omp.wsloop private(@_QFwsloop_privateEc_firstprivate_ref_c8 %5 -> %arg1, @_QFwsloop_privateEi_private_ref_i32 %3 -> %arg2 : !llvm.ptr, !llvm.ptr) reduction(@max_f32 %1 -> %arg3 : !llvm.ptr) {
+      omp.loop_nest (%arg4) : i32 = (%8) to (%7) inclusive step (%6) {
+        omp.yield
+      }
+    }
+    omp.terminator
+  }
+  llvm.return
+}
+
+// CHECK: call void {{.*}} @__kmpc_fork_call(ptr @1, i32 1, ptr @[[OUTLINED:.*]], ptr %{{.*}})
+
+// CHECK: define internal void @[[OUTLINED:.*]]{{.*}} {
+
+// First, check that all memory for privates and reductions is allocated.
+// CHECK: omp.par.entry:
+// CHECK:   %[[CHR:.*]] = alloca [1 x i8], i64 1, align 1
+// CHECK:   %[[INT:.*]] = alloca i32, i64 1, align 4
+// CHECK:   %[[FLT:.*]] = alloca float, align 4
+// CHECK:   %[[RED_ARR:.*]] = alloca [1 x ptr], align 8
+// CHECK:   br label %[[LATE_ALLOC_BB:.*]]
+
+// CHECK: [[LATE_ALLOC_BB]]:
+// CHECK:   br label %[[PRIVATE_CPY_BB:.*]]
+
+// Second, check that first private was properly copied.
+// CHECK: [[PRIVATE_CPY_BB:.*]]:
+// CHECK:   %[[CHR_VAL:.*]] = load [1 x i8], ptr %{{.*}}, align 1
+// CHECK:   store [1 x i8] %[[CHR_VAL]], ptr %[[CHR]], align 1
+// CHECK:   br label %[[RED_INIT_BB:.*]]
+
+// Third, check that reduction init took place.
+// CHECK: [[RED_INIT_BB]]:
+// CHECK:   store float 0x{{.*}}, ptr %[[FLT]], align 4
+
+// Finally, check for the private dealloc region
+// CHECK:   call void @foo_free(ptr %[[CHR]])
+
+// CHECK: }



More information about the Mlir-commits mailing list