[Mlir-commits] [llvm] [mlir] [MLIR][OpenMP] Support basic materialization for `omp.private` ops (PR #81715)

Kareem Ergawy llvmlistbot at llvm.org
Mon Feb 19 21:22:41 PST 2024


https://github.com/ergawy updated https://github.com/llvm/llvm-project/pull/81715

>From 16e86b947f7c00731035755b0c0bc0e3afe09050 Mon Sep 17 00:00:00 2001
From: ergawy <kareem.ergawy at amd.com>
Date: Tue, 13 Feb 2024 04:46:50 -0600
Subject: [PATCH 1/3] [MLIR][OpenMP] Support basic materialization for
 `omp.private` ops

Adds basic support for materializing delayed privatization. So far, the
restrictions on the implementation are:
- Only `private` clauses are supported (`firstprivate` support will be
  added in a later PR).
- Only single-block `omp.private -> alloc` regions are supported
  (multi-block ones will be supported in a later PR).
---
 .../OpenMP/OpenMPToLLVMIRTranslation.cpp      | 155 +++++++++++++++---
 mlir/test/Target/LLVMIR/openmp-private.mlir   |  91 ++++++++++
 2 files changed, 227 insertions(+), 19 deletions(-)
 create mode 100644 mlir/test/Target/LLVMIR/openmp-private.mlir

diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 78a2ad76a1e3b8..484fd7e6f6c46d 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -1000,11 +1000,39 @@ convertOmpWsLoop(Operation &opInst, llvm::IRBuilderBase &builder,
   return success();
 }
 
+/// Replace the region arguments of the parallel op (which correspond to private
+/// variables) with the actual private varibles they correspond to. This
+/// prepares the parallel op so that it matches what is expected by the
+/// OMPIRBuilder. Instead of editing the original op in-place, this function
+/// does the required changes to a cloned version which should then be erased by
+/// the caller.
+static omp::ParallelOp
+prepareOmpParallelForPrivatization(omp::ParallelOp opInst) {
+  mlir::OpBuilder cloneBuilder(opInst);
+  omp::ParallelOp opInstClone =
+      llvm::cast<omp::ParallelOp>(cloneBuilder.clone(*opInst));
+
+  Region &region = opInstClone.getRegion();
+  auto privateVars = opInstClone.getPrivateVars();
+
+  auto privateVarsIt = privateVars.begin();
+  // Reduction precede private arguments, so skip them first.
+  unsigned privateArgBeginIdx = opInstClone.getNumReductionVars();
+  unsigned privateArgEndIdx = privateArgBeginIdx + privateVars.size();
+  for (size_t argIdx = privateArgBeginIdx; argIdx < privateArgEndIdx;
+       ++argIdx, ++privateVarsIt)
+    replaceAllUsesInRegionWith(region.getArgument(argIdx), *privateVarsIt,
+                               region);
+  return opInstClone;
+}
+
 /// Converts the OpenMP parallel operation to LLVM IR.
 static LogicalResult
 convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
                    LLVM::ModuleTranslation &moduleTranslation) {
   using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy;
+  omp::ParallelOp opInstClone = prepareOmpParallelForPrivatization(opInst);
+
   // TODO: support error propagation in OpenMPIRBuilder and use it instead of
   // relying on captured variables.
   LogicalResult bodyGenStatus = success();
@@ -1013,12 +1041,12 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
   auto bodyGenCB = [&](InsertPointTy allocaIP, InsertPointTy codeGenIP) {
     // Collect reduction declarations
     SmallVector<omp::ReductionDeclareOp> reductionDecls;
-    collectReductionDecls(opInst, reductionDecls);
+    collectReductionDecls(opInstClone, reductionDecls);
 
     // Allocate reduction vars
     SmallVector<llvm::Value *> privateReductionVariables;
     DenseMap<Value, llvm::Value *> reductionVariableMap;
-    allocReductionVars(opInst, builder, moduleTranslation, allocaIP,
+    allocReductionVars(opInstClone, builder, moduleTranslation, allocaIP,
                        reductionDecls, privateReductionVariables,
                        reductionVariableMap);
 
@@ -1030,7 +1058,7 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
 
     // Initialize reduction vars
     builder.restoreIP(allocaIP);
-    for (unsigned i = 0; i < opInst.getNumReductionVars(); ++i) {
+    for (unsigned i = 0; i < opInstClone.getNumReductionVars(); ++i) {
       SmallVector<llvm::Value *> phis;
       if (failed(inlineConvertOmpRegions(
               reductionDecls[i].getInitializerRegion(), "omp.reduction.neutral",
@@ -1051,18 +1079,19 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
     // ParallelOp has only one region associated with it.
     builder.restoreIP(codeGenIP);
     auto regionBlock =
-        convertOmpOpRegions(opInst.getRegion(), "omp.par.region", builder,
+        convertOmpOpRegions(opInstClone.getRegion(), "omp.par.region", builder,
                             moduleTranslation, bodyGenStatus);
 
     // Process the reductions if required.
-    if (opInst.getNumReductionVars() > 0) {
+    if (opInstClone.getNumReductionVars() > 0) {
       // Collect reduction info
       SmallVector<OwningReductionGen> owningReductionGens;
       SmallVector<OwningAtomicReductionGen> owningAtomicReductionGens;
       SmallVector<llvm::OpenMPIRBuilder::ReductionInfo> reductionInfos;
-      collectReductionInfo(opInst, builder, moduleTranslation, reductionDecls,
-                           owningReductionGens, owningAtomicReductionGens,
-                           privateReductionVariables, reductionInfos);
+      collectReductionInfo(opInstClone, builder, moduleTranslation,
+                           reductionDecls, owningReductionGens,
+                           owningAtomicReductionGens, privateReductionVariables,
+                           reductionInfos);
 
       // Move to region cont block
       builder.SetInsertPoint(regionBlock->getTerminator());
@@ -1075,7 +1104,8 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
           ompBuilder->createReductions(builder.saveIP(), allocaIP,
                                        reductionInfos, false);
       if (!contInsertPoint.getBlock()) {
-        bodyGenStatus = opInst->emitOpError() << "failed to convert reductions";
+        bodyGenStatus = opInstClone->emitOpError()
+                        << "failed to convert reductions";
         return;
       }
 
@@ -1086,12 +1116,97 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
 
   // TODO: Perform appropriate actions according to the data-sharing
   // attribute (shared, private, firstprivate, ...) of variables.
-  // Currently defaults to shared.
+  // Currently shared and private are supported.
   auto privCB = [&](InsertPointTy allocaIP, InsertPointTy codeGenIP,
                     llvm::Value &, llvm::Value &vPtr,
                     llvm::Value *&replacementValue) -> InsertPointTy {
     replacementValue = &vPtr;
 
+    // If this is a private value, this lambda will return the corresponding
+    // mlir value and its `PrivateClauseOp`. Otherwise, empty values are
+    // returned.
+    auto [privVar, privatizerClone] =
+        [&]() -> std::pair<mlir::Value, omp::PrivateClauseOp> {
+      if (!opInstClone.getPrivateVars().empty()) {
+        auto privVars = opInstClone.getPrivateVars();
+        auto privatizers = opInstClone.getPrivatizers();
+
+        for (auto [privVar, privatizerAttr] :
+             llvm::zip_equal(privVars, *privatizers)) {
+          // Find the MLIR private variable corresponding to the LLVM value
+          // being privatized.
+          llvm::Value *llvmPrivVar = moduleTranslation.lookupValue(privVar);
+          if (llvmPrivVar != &vPtr)
+            continue;
+
+          SymbolRefAttr privSym = llvm::cast<SymbolRefAttr>(privatizerAttr);
+          omp::PrivateClauseOp privatizer =
+              SymbolTable::lookupNearestSymbolFrom<omp::PrivateClauseOp>(
+                  opInstClone, privSym);
+
+          // Clone the privatizer in case it used by more than one parallel
+          // region. The privatizer is processed in-place (see below) before it
+          // gets inlined in the parallel region and therefore processing the
+          // original op is dangerous.
+          return {privVar, privatizer.clone()};
+        }
+      }
+
+      return {mlir::Value(), omp::PrivateClauseOp()};
+    }();
+
+    if (privVar) {
+      if (privatizerClone.getDataSharingType() ==
+          omp::DataSharingClauseType::FirstPrivate) {
+        privatizerClone.emitOpError(
+            "TODO: delayed privatization is not "
+            "supported for `firstprivate` clauses yet.");
+        bodyGenStatus = failure();
+        return codeGenIP;
+      }
+
+      Region &allocRegion = privatizerClone.getAllocRegion();
+
+      if (!allocRegion.hasOneBlock()) {
+        privatizerClone.emitOpError(
+            "TODO: multi-block alloc regions are not supported yet.");
+        bodyGenStatus = failure();
+        return codeGenIP;
+      }
+
+      // Replace the privatizer block argument with mlir value being privatized.
+      // This way, the body of the privatizer will be changed from using the
+      // region/block argument to the value being privatized.
+      auto allocRegionArg = allocRegion.getArgument(0);
+      replaceAllUsesInRegionWith(allocRegionArg, privVar, allocRegion);
+
+      auto oldIP = builder.saveIP();
+      builder.restoreIP(allocaIP);
+
+      // Temporarily unlink the terminator from its parent since
+      // `inlineConvertOmpRegions` expects the insertion block to **not**
+      // contain a terminator.
+      llvm::Instruction &allocaTerminator = builder.GetInsertBlock()->back();
+      assert(allocaTerminator.isTerminator());
+      allocaTerminator.removeFromParent();
+
+      SmallVector<llvm::Value *, 1> yieldedValues;
+      if (failed(inlineConvertOmpRegions(allocRegion, "omp.privatizer", builder,
+                                         moduleTranslation, &yieldedValues))) {
+        opInstClone.emitError(
+            "failed to inline `alloc` region of an `omp.private` "
+            "op in the parallel region");
+        bodyGenStatus = failure();
+      } else {
+        assert(yieldedValues.size() == 1);
+        replacementValue = yieldedValues.front();
+      }
+
+      allocaTerminator.insertAfter(&builder.GetInsertBlock()->back());
+      privatizerClone.erase();
+      builder.restoreIP(oldIP);
+    }
+
     return codeGenIP;
   };
 
@@ -1100,13 +1215,13 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
   auto finiCB = [&](InsertPointTy codeGenIP) {};
 
   llvm::Value *ifCond = nullptr;
-  if (auto ifExprVar = opInst.getIfExprVar())
+  if (auto ifExprVar = opInstClone.getIfExprVar())
     ifCond = moduleTranslation.lookupValue(ifExprVar);
   llvm::Value *numThreads = nullptr;
-  if (auto numThreadsVar = opInst.getNumThreadsVar())
+  if (auto numThreadsVar = opInstClone.getNumThreadsVar())
     numThreads = moduleTranslation.lookupValue(numThreadsVar);
   auto pbKind = llvm::omp::OMP_PROC_BIND_default;
-  if (auto bind = opInst.getProcBindVal())
+  if (auto bind = opInstClone.getProcBindVal())
     pbKind = getProcBindKind(*bind);
   // TODO: Is the Parallel construct cancellable?
   bool isCancellable = false;
@@ -1119,6 +1234,7 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
       ompBuilder->createParallel(ompLoc, allocaIP, bodyGenCB, privCB, finiCB,
                                  ifCond, numThreads, pbKind, isCancellable));
 
+  opInstClone.erase();
   return bodyGenStatus;
 }
 
@@ -3009,12 +3125,13 @@ LogicalResult OpenMPDialectLLVMIRTranslationInterface::convertOperation(
       .Case([&](omp::TargetOp) {
         return convertOmpTarget(*op, builder, moduleTranslation);
       })
-      .Case<omp::MapInfoOp, omp::DataBoundsOp>([&](auto op) {
-        // No-op, should be handled by relevant owning operations e.g.
-        // TargetOp, EnterDataOp, ExitDataOp, DataOp etc. and then
-        // discarded
-        return success();
-      })
+      .Case<omp::MapInfoOp, omp::DataBoundsOp, omp::PrivateClauseOp>(
+          [&](auto op) {
+            // No-op, should be handled by relevant owning operations e.g.
+            // TargetOp, EnterDataOp, ExitDataOp, DataOp etc. and then
+            // discarded
+            return success();
+          })
       .Default([&](Operation *inst) {
         return inst->emitError("unsupported OpenMP operation: ")
                << inst->getName();
diff --git a/mlir/test/Target/LLVMIR/openmp-private.mlir b/mlir/test/Target/LLVMIR/openmp-private.mlir
new file mode 100644
index 00000000000000..1ac87852d5300f
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/openmp-private.mlir
@@ -0,0 +1,91 @@
+// Test code-gen for `omp.parallel` ops with delayed privatizers (i.e. using
+// `omp.private` ops).
+
+// RUN: mlir-translate -mlir-to-llvmir -split-input-file %s | FileCheck %s
+
+llvm.func @parallel_op_1_private(%arg0: !llvm.ptr) {
+  omp.parallel private(@x.privatizer %arg0 -> %arg2 : !llvm.ptr) {
+    %0 = llvm.load %arg2 : !llvm.ptr -> f32
+    omp.terminator
+  }
+  llvm.return
+}
+
+// CHECK-LABEL: @parallel_op_1_private
+// CHECK-SAME: (ptr %[[ORIG:.*]]) {
+// CHECK: %[[OMP_PAR_ARG:.*]] = alloca { ptr }, align 8
+// CHECK: %[[ORIG_GEP:.*]] = getelementptr { ptr }, ptr %[[OMP_PAR_ARG]], i32 0, i32 0
+// CHECK: store ptr %[[ORIG]], ptr %[[ORIG_GEP]], align 8
+// CHECK: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @1, i32 1, ptr @parallel_op_1_private..omp_par, ptr %[[OMP_PAR_ARG]])
+// CHECK: }
+
+// CHECK-LABEL: void @parallel_op_1_private..omp_par
+// CHECK-SAME: (ptr noalias %{{.*}}, ptr noalias %{{.*}}, ptr %[[ARG:.*]])
+// CHECK: %[[ORIG_PTR_PTR:.*]] = getelementptr { ptr }, ptr %[[ARG]], i32 0, i32 0
+// CHECK: %[[ORIG_PTR:.*]] = load ptr, ptr %[[ORIG_PTR_PTR]], align 8
+
+// Check that the privatizer alloc region was inlined properly.
+// CHECK: %[[PRIV_ALLOC:.*]] = alloca float, align 4
+// CHECK: %[[ORIG_VAL:.*]] = load float, ptr %[[ORIG_PTR]], align 4
+// CHECK: store float %[[ORIG_VAL]], ptr %[[PRIV_ALLOC]], align 4
+// CHECK-NEXT: br
+
+// Check that the privatized value is used (rather than the original one).
+// CHECK: load float, ptr %[[PRIV_ALLOC]], align 4
+// CHECK: }
+
+llvm.func @parallel_op_2_privates(%arg0: !llvm.ptr, %arg1: !llvm.ptr) {
+  omp.parallel private(@x.privatizer %arg0 -> %arg2 : !llvm.ptr, @y.privatizer %arg1 -> %arg3 : !llvm.ptr) {
+    %0 = llvm.load %arg2 : !llvm.ptr -> f32
+    %1 = llvm.load %arg3 : !llvm.ptr -> i32
+    omp.terminator
+  }
+  llvm.return
+}
+
+// CHECK-LABEL: @parallel_op_2_privates
+// CHECK-SAME: (ptr %[[ORIG1:.*]], ptr %[[ORIG2:.*]]) {
+// CHECK: %[[OMP_PAR_ARG:.*]] = alloca { ptr, ptr }, align 8
+// CHECK: %[[ORIG1_GEP:.*]] = getelementptr { ptr, ptr }, ptr %[[OMP_PAR_ARG]], i32 0, i32 0
+// CHECK: store ptr %[[ORIG1]], ptr %[[ORIG1_GEP]], align 8
+// CHECK: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @1, i32 1, ptr @parallel_op_2_privates..omp_par, ptr %[[OMP_PAR_ARG]])
+// CHECK: }
+
+// CHECK-LABEL: void @parallel_op_2_privates..omp_par
+// CHECK-SAME: (ptr noalias %{{.*}}, ptr noalias %{{.*}}, ptr %[[ARG:.*]])
+// CHECK: %[[ORIG1_PTR_PTR:.*]] = getelementptr { ptr, ptr }, ptr %[[ARG]], i32 0, i32 0
+// CHECK: %[[ORIG1_PTR:.*]] = load ptr, ptr %[[ORIG1_PTR_PTR]], align 8
+// CHECK: %[[ORIG2_PTR_PTR:.*]] = getelementptr { ptr, ptr }, ptr %[[ARG]], i32 0, i32 1
+// CHECK: %[[ORIG2_PTR:.*]] = load ptr, ptr %[[ORIG2_PTR_PTR]], align 8
+
+// Check that the privatizer alloc region was inlined properly.
+// CHECK: %[[PRIV1_ALLOC:.*]] = alloca float, align 4
+// CHECK: %[[ORIG1_VAL:.*]] = load float, ptr %[[ORIG1_PTR]], align 4
+// CHECK: store float %[[ORIG1_VAL]], ptr %[[PRIV1_ALLOC]], align 4
+// CHECK: %[[PRIV2_ALLOC:.*]] = alloca i32, align 4
+// CHECK: %[[ORIG2_VAL:.*]] = load i32, ptr %[[ORIG2_PTR]], align 4
+// CHECK: store i32 %[[ORIG2_VAL]], ptr %[[PRIV2_ALLOC]], align 4
+// CHECK-NEXT: br
+
+// Check that the privatized value is used (rather than the original one).
+// CHECK: load float, ptr %[[PRIV1_ALLOC]], align 4
+// CHECK: load i32, ptr %[[PRIV2_ALLOC]], align 4
+// CHECK: }
+
+omp.private {type = private} @x.privatizer : !llvm.ptr alloc {
+^bb0(%arg0: !llvm.ptr):
+  %c1 = llvm.mlir.constant(1 : i32) : i32
+  %0 = llvm.alloca %c1 x f32 : (i32) -> !llvm.ptr
+  %1 = llvm.load %arg0 : !llvm.ptr -> f32
+  llvm.store %1, %0 : f32, !llvm.ptr
+  omp.yield(%0 : !llvm.ptr)
+}
+
+omp.private {type = private} @y.privatizer : !llvm.ptr alloc {
+^bb0(%arg0: !llvm.ptr):
+  %c1 = llvm.mlir.constant(1 : i32) : i32
+  %0 = llvm.alloca %c1 x i32 : (i32) -> !llvm.ptr
+  %1 = llvm.load %arg0 : !llvm.ptr -> i32
+  llvm.store %1, %0 : i32, !llvm.ptr
+  omp.yield(%0 : !llvm.ptr)
+}

>From 363ae7f6afdd61f6f350eef7c38d325a0bec7170 Mon Sep 17 00:00:00 2001
From: ergawy <kareem.ergawy at amd.com>
Date: Mon, 19 Feb 2024 23:20:41 -0600
Subject: [PATCH 2/3] [to-be-reverted] Add logs to OMPIRBuilder.

---
 llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp     |  4 ++
 .../OpenMP/OpenMPToLLVMIRTranslation.cpp      |  1 +
 mlir/test/Target/LLVMIR/openmp-private.mlir   | 72 +------------------
 3 files changed, 6 insertions(+), 71 deletions(-)

diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 02b333e9ccd567..d935d5ae12f7ec 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -1501,6 +1501,10 @@ IRBuilder<>::InsertPoint OpenMPIRBuilder::createParallel(
   Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
   Extractor.findInputsOutputs(Inputs, Outputs, SinkingCands);
 
+  for (auto* Input : Inputs) {
+    llvm::errs() << ">>>> collected input: " << *Input << "\n";
+  }
+
   LLVM_DEBUG(dbgs() << "Before privatization: " << *OuterFn << "\n");
 
   FunctionCallee TIDRTLFn =
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 484fd7e6f6c46d..b807ce7a8cfd4a 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -1120,6 +1120,7 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
   auto privCB = [&](InsertPointTy allocaIP, InsertPointTy codeGenIP,
                     llvm::Value &, llvm::Value &vPtr,
                     llvm::Value *&replacementValue) -> InsertPointTy {
+    llvm::errs() << ">>>> calling privCB for: " << vPtr << "\n";
     replacementValue = &vPtr;
 
     // If this is a private value, this lambda will return the corresponding
diff --git a/mlir/test/Target/LLVMIR/openmp-private.mlir b/mlir/test/Target/LLVMIR/openmp-private.mlir
index 1ac87852d5300f..ddc369bc4c7e5c 100644
--- a/mlir/test/Target/LLVMIR/openmp-private.mlir
+++ b/mlir/test/Target/LLVMIR/openmp-private.mlir
@@ -5,73 +5,12 @@
 
 llvm.func @parallel_op_1_private(%arg0: !llvm.ptr) {
   omp.parallel private(@x.privatizer %arg0 -> %arg2 : !llvm.ptr) {
-    %0 = llvm.load %arg2 : !llvm.ptr -> f32
+//    %0 = llvm.load %arg2 : !llvm.ptr -> f32
     omp.terminator
   }
   llvm.return
 }
 
-// CHECK-LABEL: @parallel_op_1_private
-// CHECK-SAME: (ptr %[[ORIG:.*]]) {
-// CHECK: %[[OMP_PAR_ARG:.*]] = alloca { ptr }, align 8
-// CHECK: %[[ORIG_GEP:.*]] = getelementptr { ptr }, ptr %[[OMP_PAR_ARG]], i32 0, i32 0
-// CHECK: store ptr %[[ORIG]], ptr %[[ORIG_GEP]], align 8
-// CHECK: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @1, i32 1, ptr @parallel_op_1_private..omp_par, ptr %[[OMP_PAR_ARG]])
-// CHECK: }
-
-// CHECK-LABEL: void @parallel_op_1_private..omp_par
-// CHECK-SAME: (ptr noalias %{{.*}}, ptr noalias %{{.*}}, ptr %[[ARG:.*]])
-// CHECK: %[[ORIG_PTR_PTR:.*]] = getelementptr { ptr }, ptr %[[ARG]], i32 0, i32 0
-// CHECK: %[[ORIG_PTR:.*]] = load ptr, ptr %[[ORIG_PTR_PTR]], align 8
-
-// Check that the privatizer alloc region was inlined properly.
-// CHECK: %[[PRIV_ALLOC:.*]] = alloca float, align 4
-// CHECK: %[[ORIG_VAL:.*]] = load float, ptr %[[ORIG_PTR]], align 4
-// CHECK: store float %[[ORIG_VAL]], ptr %[[PRIV_ALLOC]], align 4
-// CHECK-NEXT: br
-
-// Check that the privatized value is used (rather than the original one).
-// CHECK: load float, ptr %[[PRIV_ALLOC]], align 4
-// CHECK: }
-
-llvm.func @parallel_op_2_privates(%arg0: !llvm.ptr, %arg1: !llvm.ptr) {
-  omp.parallel private(@x.privatizer %arg0 -> %arg2 : !llvm.ptr, @y.privatizer %arg1 -> %arg3 : !llvm.ptr) {
-    %0 = llvm.load %arg2 : !llvm.ptr -> f32
-    %1 = llvm.load %arg3 : !llvm.ptr -> i32
-    omp.terminator
-  }
-  llvm.return
-}
-
-// CHECK-LABEL: @parallel_op_2_privates
-// CHECK-SAME: (ptr %[[ORIG1:.*]], ptr %[[ORIG2:.*]]) {
-// CHECK: %[[OMP_PAR_ARG:.*]] = alloca { ptr, ptr }, align 8
-// CHECK: %[[ORIG1_GEP:.*]] = getelementptr { ptr, ptr }, ptr %[[OMP_PAR_ARG]], i32 0, i32 0
-// CHECK: store ptr %[[ORIG1]], ptr %[[ORIG1_GEP]], align 8
-// CHECK: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @1, i32 1, ptr @parallel_op_2_privates..omp_par, ptr %[[OMP_PAR_ARG]])
-// CHECK: }
-
-// CHECK-LABEL: void @parallel_op_2_privates..omp_par
-// CHECK-SAME: (ptr noalias %{{.*}}, ptr noalias %{{.*}}, ptr %[[ARG:.*]])
-// CHECK: %[[ORIG1_PTR_PTR:.*]] = getelementptr { ptr, ptr }, ptr %[[ARG]], i32 0, i32 0
-// CHECK: %[[ORIG1_PTR:.*]] = load ptr, ptr %[[ORIG1_PTR_PTR]], align 8
-// CHECK: %[[ORIG2_PTR_PTR:.*]] = getelementptr { ptr, ptr }, ptr %[[ARG]], i32 0, i32 1
-// CHECK: %[[ORIG2_PTR:.*]] = load ptr, ptr %[[ORIG2_PTR_PTR]], align 8
-
-// Check that the privatizer alloc region was inlined properly.
-// CHECK: %[[PRIV1_ALLOC:.*]] = alloca float, align 4
-// CHECK: %[[ORIG1_VAL:.*]] = load float, ptr %[[ORIG1_PTR]], align 4
-// CHECK: store float %[[ORIG1_VAL]], ptr %[[PRIV1_ALLOC]], align 4
-// CHECK: %[[PRIV2_ALLOC:.*]] = alloca i32, align 4
-// CHECK: %[[ORIG2_VAL:.*]] = load i32, ptr %[[ORIG2_PTR]], align 4
-// CHECK: store i32 %[[ORIG2_VAL]], ptr %[[PRIV2_ALLOC]], align 4
-// CHECK-NEXT: br
-
-// Check that the privatized value is used (rather than the original one).
-// CHECK: load float, ptr %[[PRIV1_ALLOC]], align 4
-// CHECK: load i32, ptr %[[PRIV2_ALLOC]], align 4
-// CHECK: }
-
 omp.private {type = private} @x.privatizer : !llvm.ptr alloc {
 ^bb0(%arg0: !llvm.ptr):
   %c1 = llvm.mlir.constant(1 : i32) : i32
@@ -80,12 +19,3 @@ omp.private {type = private} @x.privatizer : !llvm.ptr alloc {
   llvm.store %1, %0 : f32, !llvm.ptr
   omp.yield(%0 : !llvm.ptr)
 }
-
-omp.private {type = private} @y.privatizer : !llvm.ptr alloc {
-^bb0(%arg0: !llvm.ptr):
-  %c1 = llvm.mlir.constant(1 : i32) : i32
-  %0 = llvm.alloca %c1 x i32 : (i32) -> !llvm.ptr
-  %1 = llvm.load %arg0 : !llvm.ptr -> i32
-  llvm.store %1, %0 : i32, !llvm.ptr
-  omp.yield(%0 : !llvm.ptr)
-}

>From 3933d26d1a8a3cb645d6e82849e3d3bb5a4c2936 Mon Sep 17 00:00:00 2001
From: ergawy <kareem.ergawy at amd.com>
Date: Mon, 19 Feb 2024 23:21:54 -0600
Subject: [PATCH 3/3] [to-be-reverted] Do not re-map parallel op's arguments

---
 .../OpenMP/OpenMPToLLVMIRTranslation.cpp      | 34 +++++++++----------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index b807ce7a8cfd4a..b5551dabbef8d4 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -1031,7 +1031,7 @@ static LogicalResult
 convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
                    LLVM::ModuleTranslation &moduleTranslation) {
   using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy;
-  omp::ParallelOp opInstClone = prepareOmpParallelForPrivatization(opInst);
+  //omp::ParallelOp opInstClone = prepareOmpParallelForPrivatization(opInst);
 
   // TODO: support error propagation in OpenMPIRBuilder and use it instead of
   // relying on captured variables.
@@ -1041,12 +1041,12 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
   auto bodyGenCB = [&](InsertPointTy allocaIP, InsertPointTy codeGenIP) {
     // Collect reduction declarations
     SmallVector<omp::ReductionDeclareOp> reductionDecls;
-    collectReductionDecls(opInstClone, reductionDecls);
+    collectReductionDecls(opInst, reductionDecls);
 
     // Allocate reduction vars
     SmallVector<llvm::Value *> privateReductionVariables;
     DenseMap<Value, llvm::Value *> reductionVariableMap;
-    allocReductionVars(opInstClone, builder, moduleTranslation, allocaIP,
+    allocReductionVars(opInst, builder, moduleTranslation, allocaIP,
                        reductionDecls, privateReductionVariables,
                        reductionVariableMap);
 
@@ -1058,7 +1058,7 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
 
     // Initialize reduction vars
     builder.restoreIP(allocaIP);
-    for (unsigned i = 0; i < opInstClone.getNumReductionVars(); ++i) {
+    for (unsigned i = 0; i < opInst.getNumReductionVars(); ++i) {
       SmallVector<llvm::Value *> phis;
       if (failed(inlineConvertOmpRegions(
               reductionDecls[i].getInitializerRegion(), "omp.reduction.neutral",
@@ -1079,16 +1079,16 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
     // ParallelOp has only one region associated with it.
     builder.restoreIP(codeGenIP);
     auto regionBlock =
-        convertOmpOpRegions(opInstClone.getRegion(), "omp.par.region", builder,
+        convertOmpOpRegions(opInst.getRegion(), "omp.par.region", builder,
                             moduleTranslation, bodyGenStatus);
 
     // Process the reductions if required.
-    if (opInstClone.getNumReductionVars() > 0) {
+    if (opInst.getNumReductionVars() > 0) {
       // Collect reduction info
       SmallVector<OwningReductionGen> owningReductionGens;
       SmallVector<OwningAtomicReductionGen> owningAtomicReductionGens;
       SmallVector<llvm::OpenMPIRBuilder::ReductionInfo> reductionInfos;
-      collectReductionInfo(opInstClone, builder, moduleTranslation,
+      collectReductionInfo(opInst, builder, moduleTranslation,
                            reductionDecls, owningReductionGens,
                            owningAtomicReductionGens, privateReductionVariables,
                            reductionInfos);
@@ -1104,7 +1104,7 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
           ompBuilder->createReductions(builder.saveIP(), allocaIP,
                                        reductionInfos, false);
       if (!contInsertPoint.getBlock()) {
-        bodyGenStatus = opInstClone->emitOpError()
+        bodyGenStatus = opInst->emitOpError()
                         << "failed to convert reductions";
         return;
       }
@@ -1128,9 +1128,9 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
     // returned.
     auto [privVar, privatizerClone] =
         [&]() -> std::pair<mlir::Value, omp::PrivateClauseOp> {
-      if (!opInstClone.getPrivateVars().empty()) {
-        auto privVars = opInstClone.getPrivateVars();
-        auto privatizers = opInstClone.getPrivatizers();
+      if (!opInst.getPrivateVars().empty()) {
+        auto privVars = opInst.getPrivateVars();
+        auto privatizers = opInst.getPrivatizers();
 
         for (auto [privVar, privatizerAttr] :
              llvm::zip_equal(privVars, *privatizers)) {
@@ -1143,7 +1143,7 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
           SymbolRefAttr privSym = llvm::cast<SymbolRefAttr>(privatizerAttr);
           omp::PrivateClauseOp privatizer =
               SymbolTable::lookupNearestSymbolFrom<omp::PrivateClauseOp>(
-                  opInstClone, privSym);
+                  opInst, privSym);
 
           // Clone the privatizer in case it used by more than one parallel
           // region. The privatizer is processed in-place (see below) before it
@@ -1194,7 +1194,7 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
       SmallVector<llvm::Value *, 1> yieldedValues;
       if (failed(inlineConvertOmpRegions(allocRegion, "omp.privatizer", builder,
                                          moduleTranslation, &yieldedValues))) {
-        opInstClone.emitError(
+        opInst.emitError(
             "failed to inline `alloc` region of an `omp.private` "
             "op in the parallel region");
         bodyGenStatus = failure();
@@ -1216,13 +1216,13 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
   auto finiCB = [&](InsertPointTy codeGenIP) {};
 
   llvm::Value *ifCond = nullptr;
-  if (auto ifExprVar = opInstClone.getIfExprVar())
+  if (auto ifExprVar = opInst.getIfExprVar())
     ifCond = moduleTranslation.lookupValue(ifExprVar);
   llvm::Value *numThreads = nullptr;
-  if (auto numThreadsVar = opInstClone.getNumThreadsVar())
+  if (auto numThreadsVar = opInst.getNumThreadsVar())
     numThreads = moduleTranslation.lookupValue(numThreadsVar);
   auto pbKind = llvm::omp::OMP_PROC_BIND_default;
-  if (auto bind = opInstClone.getProcBindVal())
+  if (auto bind = opInst.getProcBindVal())
     pbKind = getProcBindKind(*bind);
   // TODO: Is the Parallel construct cancellable?
   bool isCancellable = false;
@@ -1235,7 +1235,7 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
       ompBuilder->createParallel(ompLoc, allocaIP, bodyGenCB, privCB, finiCB,
                                  ifCond, numThreads, pbKind, isCancellable));
 
-  opInstClone.erase();
+  opInst.erase();
   return bodyGenStatus;
 }
 



More information about the Mlir-commits mailing list