[flang-commits] [mlir] [flang] [WIP] Delayed privatization. (PR #79862)

Kareem Ergawy via flang-commits flang-commits at lists.llvm.org
Sun Feb 4 04:46:00 PST 2024


https://github.com/ergawy updated https://github.com/llvm/llvm-project/pull/79862

>From 59bd1925fb97ec708e925c3604bb9edb5e446f0e Mon Sep 17 00:00:00 2001
From: ergawy <kareem.ergawy at amd.com>
Date: Mon, 29 Jan 2024 04:45:18 -0600
Subject: [PATCH 1/6] [WIP] Delayed privatization.

This is a PoC for delayed privatization in OpenMP. Instead of directly
emitting privatization code in the frontend, we add a new op to outline
the privatization logic for a symbol and call-like mapping that maps
from the host symbol to an outlined function-like privatizer op.

Later, we would inline the delayed privatizer function-like op in the
OpenMP region to basically get the same code generated directly by the
fronend at the moment.
---
 flang/include/flang/Lower/AbstractConverter.h |   4 +
 flang/lib/Lower/Bridge.cpp                    |   2 +-
 flang/lib/Lower/OpenMP.cpp                    | 132 ++++++++++++++----
 .../OpenMP/FIR/delayed_privatization.f90      |  43 ++++++
 mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td |  43 +++++-
 .../Conversion/SCFToOpenMP/SCFToOpenMP.cpp    |   4 +-
 mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp  |  92 +++++++++++-
 mlir/test/Dialect/OpenMP/roundtrip.mlir       |  36 +++++
 8 files changed, 326 insertions(+), 30 deletions(-)
 create mode 100644 flang/test/Lower/OpenMP/FIR/delayed_privatization.f90
 create mode 100644 mlir/test/Dialect/OpenMP/roundtrip.mlir

diff --git a/flang/include/flang/Lower/AbstractConverter.h b/flang/include/flang/Lower/AbstractConverter.h
index 796933a4eb5f6..55bc33e76e5f6 100644
--- a/flang/include/flang/Lower/AbstractConverter.h
+++ b/flang/include/flang/Lower/AbstractConverter.h
@@ -16,6 +16,7 @@
 #include "flang/Common/Fortran.h"
 #include "flang/Lower/LoweringOptions.h"
 #include "flang/Lower/PFTDefs.h"
+#include "flang/Lower/SymbolMap.h"
 #include "flang/Optimizer/Builder/BoxValue.h"
 #include "flang/Semantics/symbol.h"
 #include "mlir/IR/Builders.h"
@@ -296,6 +297,9 @@ class AbstractConverter {
     return loweringOptions;
   }
 
+  virtual Fortran::lower::SymbolBox
+  lookupOneLevelUpSymbol(const Fortran::semantics::Symbol &sym) = 0;
+
 private:
   /// Options controlling lowering behavior.
   const Fortran::lower::LoweringOptions &loweringOptions;
diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp
index 4c8e0cb128744..f511212fd34f5 100644
--- a/flang/lib/Lower/Bridge.cpp
+++ b/flang/lib/Lower/Bridge.cpp
@@ -1070,7 +1070,7 @@ class FirConverter : public Fortran::lower::AbstractConverter {
   /// Find the symbol in one level up of symbol map such as for host-association
   /// in OpenMP code or return null.
   Fortran::lower::SymbolBox
-  lookupOneLevelUpSymbol(const Fortran::semantics::Symbol &sym) {
+  lookupOneLevelUpSymbol(const Fortran::semantics::Symbol &sym) override {
     if (Fortran::lower::SymbolBox v = localSymbols.lookupOneLevelUpSymbol(sym))
       return v;
     return {};
diff --git a/flang/lib/Lower/OpenMP.cpp b/flang/lib/Lower/OpenMP.cpp
index be2117efbabc0..4d012c45108fd 100644
--- a/flang/lib/Lower/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP.cpp
@@ -161,6 +161,11 @@ class DataSharingProcessor {
   const Fortran::parser::OmpClauseList &opClauseList;
   Fortran::lower::pft::Evaluation &eval;
 
+  bool useDelayedPrivatizationWhenPossible;
+  Fortran::lower::SymMap *symTable;
+  llvm::SetVector<mlir::SymbolRefAttr> privateInitializers;
+  llvm::SetVector<mlir::Value> privateSymHostAddrsses;
+
   bool needBarrier();
   void collectSymbols(Fortran::semantics::Symbol::Flag flag);
   void collectOmpObjectListSymbol(
@@ -182,10 +187,14 @@ class DataSharingProcessor {
 public:
   DataSharingProcessor(Fortran::lower::AbstractConverter &converter,
                        const Fortran::parser::OmpClauseList &opClauseList,
-                       Fortran::lower::pft::Evaluation &eval)
+                       Fortran::lower::pft::Evaluation &eval,
+                       bool useDelayedPrivatizationWhenPossible = false,
+                       Fortran::lower::SymMap *symTable = nullptr)
       : hasLastPrivateOp(false), converter(converter),
         firOpBuilder(converter.getFirOpBuilder()), opClauseList(opClauseList),
-        eval(eval) {}
+        eval(eval), useDelayedPrivatizationWhenPossible(
+                        useDelayedPrivatizationWhenPossible),
+        symTable(symTable) {}
   // Privatisation is split into two steps.
   // Step1 performs cloning of all privatisation clauses and copying for
   // firstprivates. Step1 is performed at the place where process/processStep1
@@ -204,6 +213,14 @@ class DataSharingProcessor {
     assert(!loopIV && "Loop iteration variable already set");
     loopIV = iv;
   }
+
+  const llvm::SetVector<mlir::SymbolRefAttr> &getPrivateInitializers() const {
+    return privateInitializers;
+  };
+
+  const llvm::SetVector<mlir::Value> &getPrivateSymHostAddrsses() const {
+    return privateSymHostAddrsses;
+  }
 };
 
 void DataSharingProcessor::processStep1() {
@@ -496,8 +513,46 @@ void DataSharingProcessor::privatize() {
         copyFirstPrivateSymbol(&*mem);
       }
     } else {
-      cloneSymbol(sym);
-      copyFirstPrivateSymbol(sym);
+      if (useDelayedPrivatizationWhenPossible) {
+        auto ip = firOpBuilder.saveInsertionPoint();
+
+        auto moduleOp = firOpBuilder.getInsertionBlock()
+                            ->getParentOp()
+                            ->getParentOfType<mlir::ModuleOp>();
+
+        firOpBuilder.setInsertionPoint(&moduleOp.getBodyRegion().front(),
+                                       moduleOp.getBodyRegion().front().end());
+
+        Fortran::lower::SymbolBox hsb = converter.lookupOneLevelUpSymbol(*sym);
+        assert(hsb && "Host symbol box not found");
+
+        auto symType = hsb.getAddr().getType();
+        auto symLoc = hsb.getAddr().getLoc();
+        auto privatizerOp = firOpBuilder.create<mlir::omp::PrivateClauseOp>(
+            symLoc, symType, sym->name().ToString());
+        firOpBuilder.setInsertionPointToEnd(&privatizerOp.getBody().front());
+
+        symTable->pushScope();
+        symTable->addSymbol(*sym, privatizerOp.getArgument(0));
+        symTable->pushScope();
+
+        cloneSymbol(sym);
+        copyFirstPrivateSymbol(sym);
+
+        firOpBuilder.create<mlir::omp::YieldOp>(
+            hsb.getAddr().getLoc(),
+            symTable->shallowLookupSymbol(*sym).getAddr());
+
+        symTable->popScope();
+        symTable->popScope();
+        firOpBuilder.restoreInsertionPoint(ip);
+
+        privateInitializers.insert(mlir::SymbolRefAttr::get(privatizerOp));
+        privateSymHostAddrsses.insert(hsb.getAddr());
+      } else {
+        cloneSymbol(sym);
+        copyFirstPrivateSymbol(sym);
+      }
     }
   }
 }
@@ -2463,12 +2518,12 @@ static OpTy genOpWithBody(Fortran::lower::AbstractConverter &converter,
                           Fortran::lower::pft::Evaluation &eval, bool genNested,
                           mlir::Location currentLocation, bool outerCombined,
                           const Fortran::parser::OmpClauseList *clauseList,
-                          Args &&...args) {
+                          DataSharingProcessor *dsp, Args &&...args) {
   auto op = converter.getFirOpBuilder().create<OpTy>(
       currentLocation, std::forward<Args>(args)...);
   createBodyOfOp<OpTy>(op, converter, currentLocation, eval, genNested,
                        clauseList,
-                       /*args=*/{}, outerCombined);
+                       /*args=*/{}, outerCombined, dsp);
   return op;
 }
 
@@ -2480,6 +2535,7 @@ genMasterOp(Fortran::lower::AbstractConverter &converter,
                                             currentLocation,
                                             /*outerCombined=*/false,
                                             /*clauseList=*/nullptr,
+                                            /*dsp=*/nullptr,
                                             /*resultTypes=*/mlir::TypeRange());
 }
 
@@ -2487,14 +2543,17 @@ static mlir::omp::OrderedRegionOp
 genOrderedRegionOp(Fortran::lower::AbstractConverter &converter,
                    Fortran::lower::pft::Evaluation &eval, bool genNested,
                    mlir::Location currentLocation) {
-  return genOpWithBody<mlir::omp::OrderedRegionOp>(
-      converter, eval, genNested, currentLocation,
-      /*outerCombined=*/false,
-      /*clauseList=*/nullptr, /*simd=*/false);
+  return genOpWithBody<mlir::omp::OrderedRegionOp>(converter, eval, genNested,
+                                                   currentLocation,
+                                                   /*outerCombined=*/false,
+                                                   /*clauseList=*/nullptr,
+                                                   /*dsp=*/nullptr,
+                                                   /*simd=*/false);
 }
 
 static mlir::omp::ParallelOp
 genParallelOp(Fortran::lower::AbstractConverter &converter,
+              Fortran::lower::SymMap &symTable,
               Fortran::lower::pft::Evaluation &eval, bool genNested,
               mlir::Location currentLocation,
               const Fortran::parser::OmpClauseList &clauseList,
@@ -2516,16 +2575,37 @@ genParallelOp(Fortran::lower::AbstractConverter &converter,
   if (!outerCombined)
     cp.processReduction(currentLocation, reductionVars, reductionDeclSymbols);
 
+  bool privatize = !outerCombined;
+  DataSharingProcessor dsp(converter, clauseList, eval,
+                           /*useDelayedPrivatizationWhenPossible=*/true,
+                           &symTable);
+
+  if (privatize) {
+    dsp.processStep1();
+  }
+
+  llvm::SmallVector<mlir::Attribute> privateInits(
+      dsp.getPrivateInitializers().begin(), dsp.getPrivateInitializers().end());
+
+  llvm::SmallVector<mlir::Value> privateSymAddresses(
+      dsp.getPrivateSymHostAddrsses().begin(),
+      dsp.getPrivateSymHostAddrsses().end());
+
   return genOpWithBody<mlir::omp::ParallelOp>(
       converter, eval, genNested, currentLocation, outerCombined, &clauseList,
+      &dsp,
       /*resultTypes=*/mlir::TypeRange(), ifClauseOperand,
       numThreadsClauseOperand, allocateOperands, allocatorOperands,
-      reductionVars,
+      reductionVars, privateSymAddresses,
       reductionDeclSymbols.empty()
           ? nullptr
           : mlir::ArrayAttr::get(converter.getFirOpBuilder().getContext(),
                                  reductionDeclSymbols),
-      procBindKindAttr);
+      procBindKindAttr,
+      privateInits.empty()
+          ? nullptr
+          : mlir::ArrayAttr::get(converter.getFirOpBuilder().getContext(),
+                                 privateInits));
 }
 
 static mlir::omp::SectionOp
@@ -2537,7 +2617,8 @@ genSectionOp(Fortran::lower::AbstractConverter &converter,
   // all privatization is done within `omp.section` operations.
   return genOpWithBody<mlir::omp::SectionOp>(
       converter, eval, genNested, currentLocation,
-      /*outerCombined=*/false, &sectionsClauseList);
+      /*outerCombined=*/false, &sectionsClauseList,
+      /*dsp=*/nullptr);
 }
 
 static mlir::omp::SingleOp
@@ -2558,8 +2639,8 @@ genSingleOp(Fortran::lower::AbstractConverter &converter,
 
   return genOpWithBody<mlir::omp::SingleOp>(
       converter, eval, genNested, currentLocation,
-      /*outerCombined=*/false, &beginClauseList, allocateOperands,
-      allocatorOperands, nowaitAttr);
+      /*outerCombined=*/false, &beginClauseList, /*dsp=*/nullptr,
+      allocateOperands, allocatorOperands, nowaitAttr);
 }
 
 static mlir::omp::TaskOp
@@ -2591,8 +2672,8 @@ genTaskOp(Fortran::lower::AbstractConverter &converter,
 
   return genOpWithBody<mlir::omp::TaskOp>(
       converter, eval, genNested, currentLocation,
-      /*outerCombined=*/false, &clauseList, ifClauseOperand, finalClauseOperand,
-      untiedAttr, mergeableAttr,
+      /*outerCombined=*/false, &clauseList, /*dsp=*/nullptr, ifClauseOperand,
+      finalClauseOperand, untiedAttr, mergeableAttr,
       /*in_reduction_vars=*/mlir::ValueRange(),
       /*in_reductions=*/nullptr, priorityClauseOperand,
       dependTypeOperands.empty()
@@ -2615,6 +2696,7 @@ genTaskGroupOp(Fortran::lower::AbstractConverter &converter,
   return genOpWithBody<mlir::omp::TaskGroupOp>(
       converter, eval, genNested, currentLocation,
       /*outerCombined=*/false, &clauseList,
+      /*dsp=*/nullptr,
       /*task_reduction_vars=*/mlir::ValueRange(),
       /*task_reductions=*/nullptr, allocateOperands, allocatorOperands);
 }
@@ -2994,6 +3076,7 @@ genTeamsOp(Fortran::lower::AbstractConverter &converter,
 
   return genOpWithBody<mlir::omp::TeamsOp>(
       converter, eval, genNested, currentLocation, outerCombined, &clauseList,
+      /*dsp=*/nullptr,
       /*num_teams_lower=*/nullptr, numTeamsClauseOperand, ifClauseOperand,
       threadLimitClauseOperand, allocateOperands, allocatorOperands,
       reductionVars,
@@ -3392,8 +3475,8 @@ static void genOMP(Fortran::lower::AbstractConverter &converter,
     if ((llvm::omp::allParallelSet & llvm::omp::loopConstructSet)
             .test(ompDirective)) {
       validDirective = true;
-      genParallelOp(converter, eval, /*genNested=*/false, currentLocation,
-                    loopOpClauseList,
+      genParallelOp(converter, symTable, eval, /*genNested=*/false,
+                    currentLocation, loopOpClauseList,
                     /*outerCombined=*/true);
     }
   }
@@ -3481,8 +3564,8 @@ genOMP(Fortran::lower::AbstractConverter &converter,
     genOrderedRegionOp(converter, eval, /*genNested=*/true, currentLocation);
     break;
   case llvm::omp::Directive::OMPD_parallel:
-    genParallelOp(converter, eval, /*genNested=*/true, currentLocation,
-                  beginClauseList);
+    genParallelOp(converter, symTable, eval, /*genNested=*/true,
+                  currentLocation, beginClauseList);
     break;
   case llvm::omp::Directive::OMPD_single:
     genSingleOp(converter, eval, /*genNested=*/true, currentLocation,
@@ -3541,8 +3624,8 @@ genOMP(Fortran::lower::AbstractConverter &converter,
           .test(directive.v)) {
     bool outerCombined =
         directive.v != llvm::omp::Directive::OMPD_target_parallel;
-    genParallelOp(converter, eval, /*genNested=*/false, currentLocation,
-                  beginClauseList, outerCombined);
+    genParallelOp(converter, symTable, eval, /*genNested=*/false,
+                  currentLocation, beginClauseList, outerCombined);
     combinedDirective = true;
   }
   if ((llvm::omp::workShareSet & llvm::omp::blockConstructSet)
@@ -3625,7 +3708,7 @@ genOMP(Fortran::lower::AbstractConverter &converter,
 
   // Parallel wrapper of PARALLEL SECTIONS construct
   if (dir == llvm::omp::Directive::OMPD_parallel_sections) {
-    genParallelOp(converter, eval,
+    genParallelOp(converter, symTable, eval,
                   /*genNested=*/false, currentLocation, sectionsClauseList,
                   /*outerCombined=*/true);
   } else {
@@ -3642,6 +3725,7 @@ genOMP(Fortran::lower::AbstractConverter &converter,
                                        /*genNested=*/false, currentLocation,
                                        /*outerCombined=*/false,
                                        /*clauseList=*/nullptr,
+                                       /*dsp=*/nullptr,
                                        /*reduction_vars=*/mlir::ValueRange(),
                                        /*reductions=*/nullptr, allocateOperands,
                                        allocatorOperands, nowaitClauseOperand);
diff --git a/flang/test/Lower/OpenMP/FIR/delayed_privatization.f90 b/flang/test/Lower/OpenMP/FIR/delayed_privatization.f90
new file mode 100644
index 0000000000000..6a668e6fb6660
--- /dev/null
+++ b/flang/test/Lower/OpenMP/FIR/delayed_privatization.f90
@@ -0,0 +1,43 @@
+subroutine delayed_privatization()
+  integer :: var1
+  integer :: var2
+
+!$OMP PARALLEL FIRSTPRIVATE(var1, var2)
+  var1 = var1 + var2 + 2
+!$OMP END PARALLEL
+
+end subroutine
+
+! This is what flang emits with the PoC:
+! --------------------------------------
+!
+!func.func @_QPdelayed_privatization() {
+!  %0 = fir.alloca i32 {bindc_name = "var1", uniq_name = "_QFdelayed_privatizationEvar1"}
+!  %1 = fir.alloca i32 {bindc_name = "var2", uniq_name = "_QFdelayed_privatizationEvar2"}
+!  omp.parallel private(@var1.privatizer %0, @var2.privatizer %1 : !fir.ref<i32>, !fir.ref<i32>) {
+!    %2 = fir.load %0 : !fir.ref<i32>
+!    %3 = fir.load %1 : !fir.ref<i32>
+!    %4 = arith.addi %2, %3 : i32
+!    %c2_i32 = arith.constant 2 : i32
+!    %5 = arith.addi %4, %c2_i32 : i32
+!    fir.store %5 to %0 : !fir.ref<i32>
+!    omp.terminator
+!  }
+!  return
+!}
+!
+!"omp.private"() <{function_type = (!fir.ref<i32>) -> !fir.ref<i32>, sym_name = "var1.privatizer"}> ({
+!^bb0(%arg0: !fir.ref<i32>):
+!  %0 = fir.alloca i32 {bindc_name = "var1", pinned, uniq_name = "_QFdelayed_privatizationEvar1"}
+!  %1 = fir.load %arg0 : !fir.ref<i32>
+!  fir.store %1 to %0 : !fir.ref<i32>
+!  omp.yield(%0 : !fir.ref<i32>)
+!}) : () -> ()
+!
+!"omp.private"() <{function_type = (!fir.ref<i32>) -> !fir.ref<i32>, sym_name = "var2.privatizer"}> ({
+!^bb0(%arg0: !fir.ref<i32>):
+!  %0 = fir.alloca i32 {bindc_name = "var2", pinned, uniq_name = "_QFdelayed_privatizationEvar2"}
+!  %1 = fir.load %arg0 : !fir.ref<i32>
+!  fir.store %1 to %0 : !fir.ref<i32>
+!  omp.yield(%0 : !fir.ref<i32>)
+!}) : () -> ()
diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
index 451828ec4ba77..06fba004a30de 100644
--- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
+++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
@@ -16,6 +16,7 @@
 
 include "mlir/IR/EnumAttr.td"
 include "mlir/IR/OpBase.td"
+include "mlir/Interfaces/FunctionInterfaces.td"
 include "mlir/Interfaces/SideEffectInterfaces.td"
 include "mlir/Interfaces/ControlFlowInterfaces.td"
 include "mlir/IR/SymbolInterfaces.td"
@@ -178,8 +179,10 @@ def ParallelOp : OpenMP_Op<"parallel", [
              Variadic<AnyType>:$allocate_vars,
              Variadic<AnyType>:$allocators_vars,
              Variadic<OpenMP_PointerLikeType>:$reduction_vars,
+             Variadic<AnyType>:$private_vars,
              OptionalAttr<SymbolRefArrayAttr>:$reductions,
-             OptionalAttr<ProcBindKindAttr>:$proc_bind_val);
+             OptionalAttr<ProcBindKindAttr>:$proc_bind_val,
+             OptionalAttr<SymbolRefArrayAttr>:$private_inits);
 
   let regions = (region AnyRegion:$region);
 
@@ -203,6 +206,10 @@ def ParallelOp : OpenMP_Op<"parallel", [
                 $allocators_vars, type($allocators_vars)
               ) `)`
           | `proc_bind` `(` custom<ClauseAttr>($proc_bind_val) `)`
+          | `private` `(`
+              custom<PrivateVarList>(
+                $private_vars, type($private_vars), $private_inits
+              ) `)`
     ) $region attr-dict
   }];
   let hasVerifier = 1;
@@ -612,7 +619,7 @@ def SimdLoopOp : OpenMP_Op<"simdloop", [AttrSizedOperandSegments,
 def YieldOp : OpenMP_Op<"yield",
     [Pure, ReturnLike, Terminator,
      ParentOneOf<["WsLoopOp", "ReductionDeclareOp",
-     "AtomicUpdateOp", "SimdLoopOp"]>]> {
+     "AtomicUpdateOp", "SimdLoopOp", "PrivateClauseOp"]>]> {
   let summary = "loop yield and termination operation";
   let description = [{
     "omp.yield" yields SSA values from the OpenMP dialect op region and
@@ -1469,6 +1476,38 @@ def Target_UpdateDataOp: OpenMP_Op<"target_update_data",
 //===----------------------------------------------------------------------===//
 // 2.14.5 target construct
 //===----------------------------------------------------------------------===//
+def PrivateClauseOp : OpenMP_Op<"private", [
+    IsolatedFromAbove, FunctionOpInterface
+  ]> {
+  let summary = "TODO";
+  let description = [{}];
+
+  let arguments = (ins SymbolNameAttr:$sym_name,
+                       TypeAttrOf<FunctionType>:$function_type);
+
+  let regions = (region AnyRegion:$body);
+
+  let builders = [OpBuilder<(ins
+    "::mlir::Type":$privateVar,
+    "::llvm::StringRef":$privateVarName
+  )>];
+
+  let extraClassDeclaration = [{
+    ::mlir::Region *getCallableRegion() {
+      return &getBody();
+    }
+
+    /// Returns the argument types of this function.
+    ArrayRef<Type> getArgumentTypes() {
+      return getFunctionType().getInputs();
+    }
+
+    /// Returns the result types of this function.
+    ArrayRef<Type> getResultTypes() {
+      return getFunctionType().getResults();
+    }
+  }];
+}
 
 def TargetOp : OpenMP_Op<"target",[IsolatedFromAbove, OutlineableOpenMPOpInterface, AttrSizedOperandSegments]> {
   let summary = "target construct";
diff --git a/mlir/lib/Conversion/SCFToOpenMP/SCFToOpenMP.cpp b/mlir/lib/Conversion/SCFToOpenMP/SCFToOpenMP.cpp
index 2f8b3f7e11de1..b381aaf20bf89 100644
--- a/mlir/lib/Conversion/SCFToOpenMP/SCFToOpenMP.cpp
+++ b/mlir/lib/Conversion/SCFToOpenMP/SCFToOpenMP.cpp
@@ -419,8 +419,10 @@ struct ParallelOpLowering : public OpRewritePattern<scf::ParallelOp> {
         /* allocate_vars = */ llvm::SmallVector<Value>{},
         /* allocators_vars = */ llvm::SmallVector<Value>{},
         /* reduction_vars = */ llvm::SmallVector<Value>{},
+        /*private_vars=*/mlir::ValueRange{},
         /* reductions = */ ArrayAttr{},
-        /* proc_bind_val = */ omp::ClauseProcBindKindAttr{});
+        /* proc_bind_val = */ omp::ClauseProcBindKindAttr{},
+        /*private_inits*/ nullptr);
     {
 
       OpBuilder::InsertionGuard guard(rewriter);
diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
index 13cc16125a273..c4ef7ef3f2fb5 100644
--- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
+++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
@@ -989,8 +989,9 @@ void ParallelOp::build(OpBuilder &builder, OperationState &state,
   ParallelOp::build(
       builder, state, /*if_expr_var=*/nullptr, /*num_threads_var=*/nullptr,
       /*allocate_vars=*/ValueRange(), /*allocators_vars=*/ValueRange(),
-      /*reduction_vars=*/ValueRange(), /*reductions=*/nullptr,
-      /*proc_bind_val=*/nullptr);
+      /*reduction_vars=*/ValueRange(), /*private_vars=*/ValueRange(),
+      /*reductions=*/nullptr,
+      /*proc_bind_val=*/nullptr, /*private_inits*/ nullptr);
   state.addAttributes(attributes);
 }
 
@@ -1594,6 +1595,93 @@ LogicalResult DataBoundsOp::verify() {
   return success();
 }
 
+void PrivateClauseOp::build(OpBuilder &odsBuilder, OperationState &odsState,
+                            Type privateVarType, StringRef privateVarName) {
+  FunctionType initializerType = FunctionType::get(
+      odsBuilder.getContext(), {privateVarType}, {privateVarType});
+  std::string privatizerName = (privateVarName + ".privatizer").str();
+
+  build(odsBuilder, odsState, privatizerName, initializerType);
+
+  mlir::Block &block = odsState.regions.front()->emplaceBlock();
+  block.addArgument(privateVarType, odsState.location);
+}
+
+static ParseResult parsePrivateVarList(
+    OpAsmParser &parser,
+    llvm::SmallVector<OpAsmParser::UnresolvedOperand, 4> &privateVarsOperands,
+    llvm::SmallVector<Type, 1> &privateVarsTypes, ArrayAttr &privateInitsAttr) {
+  SymbolRefAttr privatizerSym;
+  OpAsmParser::UnresolvedOperand arg;
+  OpAsmParser::UnresolvedOperand blockArg;
+  Type argType;
+
+  SmallVector<SymbolRefAttr> privateInitsVec;
+
+  auto parsePrivatizers = [&]() -> ParseResult {
+    if (parser.parseAttribute(privatizerSym) || parser.parseOperand(arg)) {
+      return failure();
+    }
+
+    privateInitsVec.push_back(privatizerSym);
+    privateVarsOperands.push_back(arg);
+    return success();
+  };
+
+  auto parseTypes = [&]() -> ParseResult {
+    if (parser.parseType(argType))
+      return failure();
+    privateVarsTypes.push_back(argType);
+    return success();
+  };
+
+  if (parser.parseCommaSeparatedList(parsePrivatizers))
+    return failure();
+
+  SmallVector<Attribute> privateInits(privateInitsVec.begin(),
+                                      privateInitsVec.end());
+  privateInitsAttr = ArrayAttr::get(parser.getContext(), privateInits);
+
+  if (parser.parseColon())
+    return failure();
+
+  if (parser.parseCommaSeparatedList(parseTypes))
+    return failure();
+
+  return success();
+}
+
+static void printPrivateVarList(OpAsmPrinter &printer, Operation *op,
+                                OperandRange privateVars,
+                                TypeRange privateVarTypes,
+                                std::optional<ArrayAttr> privateInitsAttr) {
+  unsigned argIndex = 0;
+  assert(privateVars.size() == privateVarTypes.size() &&
+         ((privateVars.empty()) ||
+          (*privateInitsAttr &&
+           (privateInitsAttr->size() == privateVars.size()))));
+
+  for (const auto &privateVar : privateVars) {
+    assert(privateInitsAttr);
+    const auto &privateInitSym = (*privateInitsAttr)[argIndex];
+    printer << privateInitSym << " " << privateVar;
+
+    argIndex++;
+    if (argIndex < privateVars.size())
+      printer << ", ";
+  }
+
+  printer << " : ";
+
+  argIndex = 0;
+  for (const auto &mapType : privateVarTypes) {
+    printer << mapType;
+    argIndex++;
+    if (argIndex < privateVarTypes.size())
+      printer << ", ";
+  }
+}
+
 #define GET_ATTRDEF_CLASSES
 #include "mlir/Dialect/OpenMP/OpenMPOpsAttributes.cpp.inc"
 
diff --git a/mlir/test/Dialect/OpenMP/roundtrip.mlir b/mlir/test/Dialect/OpenMP/roundtrip.mlir
new file mode 100644
index 0000000000000..c6e9fab6f7f98
--- /dev/null
+++ b/mlir/test/Dialect/OpenMP/roundtrip.mlir
@@ -0,0 +1,36 @@
+// RUN: fir-opt -verify-diagnostics %s | fir-opt | FileCheck %s
+
+// CHECK-LABEL: _QPprivate_clause
+func.func @_QPprivate_clause() {
+  %0 = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFprivate_clause_allocatableEx"}
+  %1 = fir.alloca i32 {bindc_name = "y", uniq_name = "_QFprivate_clause_allocatableEy"}
+
+  // CHECK: omp.parallel private(@x.privatizer %0, @y.privatizer %1 : !fir.ref<i32>, !fir.ref<i32>)
+  omp.parallel private(@x.privatizer %0, @y.privatizer %1: !fir.ref<i32>, !fir.ref<i32>) {
+    omp.terminator
+  }
+  return
+}
+
+// CHECK: "omp.private"() <{function_type = (!fir.ref<i32>) -> !fir.ref<i32>, sym_name = "x.privatizer"}> ({
+"omp.private"() <{function_type = (!fir.ref<i32>) -> !fir.ref<i32>, sym_name = "x.privatizer"}> ({
+// CHECK: ^bb0(%arg0: {{.*}}):
+^bb0(%arg0: !fir.ref<i32>):
+
+  // CHECK: %0 = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFprivate_clause_allocatableEx"}
+  %0 = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFprivate_clause_allocatableEx"}
+
+  // CHECK: omp.yield(%0 : !fir.ref<i32>)
+  omp.yield(%0 : !fir.ref<i32>)
+}) : () -> ()
+
+// CHECK: "omp.private"() <{function_type = (!fir.ref<i32>) -> !fir.ref<i32>, sym_name = "y.privatizer"}> ({
+"omp.private"() <{function_type = (!fir.ref<i32>) -> !fir.ref<i32>, sym_name = "y.privatizer"}> ({
+^bb0(%arg0: !fir.ref<i32>):
+
+  // CHECK: %0 = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFprivate_clause_allocatableEy"}
+  %0 = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFprivate_clause_allocatableEy"}
+
+  // CHECK: omp.yield(%0 : !fir.ref<i32>)
+  omp.yield(%0 : !fir.ref<i32>)
+}) : () -> ()

>From 02543684c9baa7cb295f1d2087dce996fd03c4a8 Mon Sep 17 00:00:00 2001
From: ergawy <kareem.ergawy at amd.com>
Date: Wed, 31 Jan 2024 08:34:54 -0600
Subject: [PATCH 2/6] Add conversion patttern for `PrivateClauseOp`.

---
 .../Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp  | 23 +++++++++++++++----
 1 file changed, 18 insertions(+), 5 deletions(-)

diff --git a/mlir/lib/Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp b/mlir/lib/Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp
index 730858ffc67a7..d4ccbdf608293 100644
--- a/mlir/lib/Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp
+++ b/mlir/lib/Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp
@@ -46,6 +46,17 @@ struct RegionOpConversion : public ConvertOpToLLVMPattern<OpType> {
                                            *this->getTypeConverter())))
       return failure();
 
+    if constexpr (std::is_same_v<OpType, mlir::omp::PrivateClauseOp>) {
+      auto llvmType = this->getTypeConverter()->convertType(
+          adaptor.getFunctionType().getInput(0));
+
+      if (!llvmType)
+        return rewriter.notifyMatchFailure(curOp,
+                                           "signature conversion failed");
+      newOp.setFunctionType(
+          FunctionType::get(rewriter.getContext(), {llvmType}, {llvmType}));
+    }
+
     rewriter.eraseOp(curOp);
     return success();
   }
@@ -231,11 +242,12 @@ void mlir::configureOpenMPToLLVMConversionLegality(
       mlir::omp::DataOp, mlir::omp::OrderedRegionOp, mlir::omp::ParallelOp,
       mlir::omp::WsLoopOp, mlir::omp::SimdLoopOp, mlir::omp::MasterOp,
       mlir::omp::SectionOp, mlir::omp::SectionsOp, mlir::omp::SingleOp,
-      mlir::omp::TaskGroupOp, mlir::omp::TaskOp>([&](Operation *op) {
-    return typeConverter.isLegal(&op->getRegion(0)) &&
-           typeConverter.isLegal(op->getOperandTypes()) &&
-           typeConverter.isLegal(op->getResultTypes());
-  });
+      mlir::omp::TaskGroupOp, mlir::omp::TaskOp, mlir::omp::PrivateClauseOp>(
+      [&](Operation *op) {
+        return typeConverter.isLegal(&op->getRegion(0)) &&
+               typeConverter.isLegal(op->getOperandTypes()) &&
+               typeConverter.isLegal(op->getResultTypes());
+      });
   target.addDynamicallyLegalOp<
       mlir::omp::AtomicReadOp, mlir::omp::AtomicWriteOp, mlir::omp::FlushOp,
       mlir::omp::ThreadprivateOp, mlir::omp::YieldOp, mlir::omp::EnterDataOp,
@@ -275,6 +287,7 @@ void mlir::populateOpenMPToLLVMConversionPatterns(LLVMTypeConverter &converter,
       RegionOpConversion<omp::SimdLoopOp>, RegionOpConversion<omp::SingleOp>,
       RegionOpConversion<omp::TaskGroupOp>, RegionOpConversion<omp::TaskOp>,
       RegionOpConversion<omp::DataOp>, RegionOpConversion<omp::TargetOp>,
+      RegionOpConversion<omp::PrivateClauseOp>,
       RegionLessOpWithVarOperandsConversion<omp::AtomicWriteOp>,
       RegionOpWithVarOperandsConversion<omp::AtomicUpdateOp>,
       RegionLessOpWithVarOperandsConversion<omp::FlushOp>,

>From 7f15878f6e5b993b9542d8e72e772fa7668d659c Mon Sep 17 00:00:00 2001
From: ergawy <kareem.ergawy at amd.com>
Date: Fri, 2 Feb 2024 01:33:01 -0600
Subject: [PATCH 3/6] Convert private clauses to LLVM.

---
 .../OpenMP/FIR/delayed_privatization.f90      | 192 +++++++++++++++---
 .../OpenMP/OpenMPToLLVMIRTranslation.cpp      |  82 +++++++-
 2 files changed, 240 insertions(+), 34 deletions(-)

diff --git a/flang/test/Lower/OpenMP/FIR/delayed_privatization.f90 b/flang/test/Lower/OpenMP/FIR/delayed_privatization.f90
index 6a668e6fb6660..c6c8523beb676 100644
--- a/flang/test/Lower/OpenMP/FIR/delayed_privatization.f90
+++ b/flang/test/Lower/OpenMP/FIR/delayed_privatization.f90
@@ -1,43 +1,179 @@
+! TODO Convert this file into a bunch of lit tests for each conversion step.
+
 subroutine delayed_privatization()
   integer :: var1
   integer :: var2
 
+  var1 = 111
+  var2 = 222
+
 !$OMP PARALLEL FIRSTPRIVATE(var1, var2)
   var1 = var1 + var2 + 2
 !$OMP END PARALLEL
 
 end subroutine
 
-! This is what flang emits with the PoC:
-! --------------------------------------
+! -----------------------------------------
+! ## This is what flang emits with the PoC:
+! -----------------------------------------
 !
-!func.func @_QPdelayed_privatization() {
-!  %0 = fir.alloca i32 {bindc_name = "var1", uniq_name = "_QFdelayed_privatizationEvar1"}
-!  %1 = fir.alloca i32 {bindc_name = "var2", uniq_name = "_QFdelayed_privatizationEvar2"}
-!  omp.parallel private(@var1.privatizer %0, @var2.privatizer %1 : !fir.ref<i32>, !fir.ref<i32>) {
-!    %2 = fir.load %0 : !fir.ref<i32>
-!    %3 = fir.load %1 : !fir.ref<i32>
-!    %4 = arith.addi %2, %3 : i32
-!    %c2_i32 = arith.constant 2 : i32
-!    %5 = arith.addi %4, %c2_i32 : i32
-!    fir.store %5 to %0 : !fir.ref<i32>
-!    omp.terminator
+! ----------------------------
+! ### Conversion to FIR + OMP:
+! ----------------------------
+!module {
+!  func.func @_QPdelayed_privatization() {
+!    %0 = fir.alloca i32 {bindc_name = "var1", uniq_name = "_QFdelayed_privatizationEvar1"}
+!    %1 = fir.alloca i32 {bindc_name = "var2", uniq_name = "_QFdelayed_privatizationEvar2"}
+!    %c111_i32 = arith.constant 111 : i32
+!    fir.store %c111_i32 to %0 : !fir.ref<i32>
+!    %c222_i32 = arith.constant 222 : i32
+!    fir.store %c222_i32 to %1 : !fir.ref<i32>
+!    omp.parallel private(@var1.privatizer %0, @var2.privatizer %1 : !fir.ref<i32>, !fir.ref<i32>) {
+!      %2 = fir.load %0 : !fir.ref<i32>
+!      %3 = fir.load %1 : !fir.ref<i32>
+!      %4 = arith.addi %2, %3 : i32
+!      %c2_i32 = arith.constant 2 : i32
+!      %5 = arith.addi %4, %c2_i32 : i32
+!      fir.store %5 to %0 : !fir.ref<i32>
+!      omp.terminator
+!    }
+!    return
 !  }
-!  return
+!  "omp.private"() <{function_type = (!fir.ref<i32>) -> !fir.ref<i32>, sym_name = "var1.privatizer"}> ({
+!  ^bb0(%arg0: !fir.ref<i32>):
+!    %0 = fir.alloca i32 {bindc_name = "var1", pinned, uniq_name = "_QFdelayed_privatizationEvar1"}
+!    %1 = fir.load %arg0 : !fir.ref<i32>
+!    fir.store %1 to %0 : !fir.ref<i32>
+!    omp.yield(%0 : !fir.ref<i32>)
+!  }) : () -> ()
+!  "omp.private"() <{function_type = (!fir.ref<i32>) -> !fir.ref<i32>, sym_name = "var2.privatizer"}> ({
+!  ^bb0(%arg0: !fir.ref<i32>):
+!    %0 = fir.alloca i32 {bindc_name = "var2", pinned, uniq_name = "_QFdelayed_privatizationEvar2"}
+!    %1 = fir.load %arg0 : !fir.ref<i32>
+!    fir.store %1 to %0 : !fir.ref<i32>
+!    omp.yield(%0 : !fir.ref<i32>)
+!  }) : () -> ()
 !}
 !
-!"omp.private"() <{function_type = (!fir.ref<i32>) -> !fir.ref<i32>, sym_name = "var1.privatizer"}> ({
-!^bb0(%arg0: !fir.ref<i32>):
-!  %0 = fir.alloca i32 {bindc_name = "var1", pinned, uniq_name = "_QFdelayed_privatizationEvar1"}
-!  %1 = fir.load %arg0 : !fir.ref<i32>
-!  fir.store %1 to %0 : !fir.ref<i32>
-!  omp.yield(%0 : !fir.ref<i32>)
-!}) : () -> ()
+! -----------------------------
+! ### Conversion to LLVM + OMP:
+! -----------------------------
+!module {
+!  llvm.func @_QPdelayed_privatization() {
+!    %0 = llvm.mlir.constant(1 : i64) : i64
+!    %1 = llvm.alloca %0 x i32 {bindc_name = "var1"} : (i64) -> !llvm.ptr
+!    %2 = llvm.mlir.constant(1 : i64) : i64
+!    %3 = llvm.alloca %2 x i32 {bindc_name = "var2"} : (i64) -> !llvm.ptr
+!    %4 = llvm.mlir.constant(111 : i32) : i32
+!    llvm.store %4, %1 : i32, !llvm.ptr
+!    %5 = llvm.mlir.constant(222 : i32) : i32
+!    llvm.store %5, %3 : i32, !llvm.ptr
+!    omp.parallel private(@var1.privatizer %1, @var2.privatizer %3 : !llvm.ptr, !llvm.ptr) {
+!      %6 = llvm.load %1 : !llvm.ptr -> i32
+!      %7 = llvm.load %3 : !llvm.ptr -> i32
+!      %8 = llvm.add %6, %7  : i32
+!      %9 = llvm.mlir.constant(2 : i32) : i32
+!      %10 = llvm.add %8, %9  : i32
+!      llvm.store %10, %1 : i32, !llvm.ptr
+!      omp.terminator
+!    }
+!    llvm.return
+!  }
+!  "omp.private"() <{function_type = (!llvm.ptr) -> !llvm.ptr, sym_name = "var1.privatizer"}> ({
+!  ^bb0(%arg0: !llvm.ptr):
+!    %0 = llvm.mlir.constant(1 : i64) : i64
+!    %1 = llvm.alloca %0 x i32 {bindc_name = "var1", pinned} : (i64) -> !llvm.ptr
+!    %2 = llvm.load %arg0 : !llvm.ptr -> i32
+!    llvm.store %2, %1 : i32, !llvm.ptr
+!    omp.yield(%1 : !llvm.ptr)
+!  }) : () -> ()
+!  "omp.private"() <{function_type = (!llvm.ptr) -> !llvm.ptr, sym_name = "var2.privatizer"}> ({
+!  ^bb0(%arg0: !llvm.ptr):
+!    %0 = llvm.mlir.constant(1 : i64) : i64
+!    %1 = llvm.alloca %0 x i32 {bindc_name = "var2", pinned} : (i64) -> !llvm.ptr
+!    %2 = llvm.load %arg0 : !llvm.ptr -> i32
+!    llvm.store %2, %1 : i32, !llvm.ptr
+!    omp.yield(%1 : !llvm.ptr)
+!  }) : () -> ()
+!}
 !
-!"omp.private"() <{function_type = (!fir.ref<i32>) -> !fir.ref<i32>, sym_name = "var2.privatizer"}> ({
-!^bb0(%arg0: !fir.ref<i32>):
-!  %0 = fir.alloca i32 {bindc_name = "var2", pinned, uniq_name = "_QFdelayed_privatizationEvar2"}
-!  %1 = fir.load %arg0 : !fir.ref<i32>
-!  fir.store %1 to %0 : !fir.ref<i32>
-!  omp.yield(%0 : !fir.ref<i32>)
-!}) : () -> ()
+! --------------------------
+! ### Conversion to LLVM IR:
+! --------------------------
+!%struct.ident_t = type { i32, i32, i32, i32, ptr }
+
+!@0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1
+!@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr @0 }, align 8
+
+!define void @_QPdelayed_privatization() {
+!  %structArg = alloca { ptr, ptr }, align 8
+!  %1 = alloca i32, i64 1, align 4
+!  %2 = alloca i32, i64 1, align 4
+!  store i32 111, ptr %1, align 4
+!  store i32 222, ptr %2, align 4
+!  br label %entry
+
+!entry:                                            ; preds = %0
+!  %omp_global_thread_num = call i32 @__kmpc_global_thread_num(ptr @1)
+!  br label %omp_parallel
+
+!omp_parallel:                                     ; preds = %entry
+!  %gep_ = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 0
+!  store ptr %1, ptr %gep_, align 8
+!  %gep_2 = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 1
+!  store ptr %2, ptr %gep_2, align 8
+!  call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @1, i32 1, ptr @_QPdelayed_privatization..omp_par, ptr %structArg)
+!  br label %omp.par.outlined.exit
+
+!omp.par.outlined.exit:                            ; preds = %omp_parallel
+!  br label %omp.par.exit.split
+
+!omp.par.exit.split:                               ; preds = %omp.par.outlined.exit
+!  ret void
+!}
+
+!; Function Attrs: nounwind
+!define internal void @_QPdelayed_privatization..omp_par(ptr noalias %tid.addr, ptr noalias %zero.addr, ptr %0) #0 {
+!omp.par.entry:
+!  %gep_ = getelementptr { ptr, ptr }, ptr %0, i32 0, i32 0
+!  %loadgep_ = load ptr, ptr %gep_, align 8
+!  %gep_1 = getelementptr { ptr, ptr }, ptr %0, i32 0, i32 1
+!  %loadgep_2 = load ptr, ptr %gep_1, align 8
+!  %tid.addr.local = alloca i32, align 4
+!  %1 = load i32, ptr %tid.addr, align 4
+!  store i32 %1, ptr %tid.addr.local, align 4
+!  %tid = load i32, ptr %tid.addr.local, align 4
+!  %2 = alloca i32, i64 1, align 4
+!  %3 = load i32, ptr %loadgep_, align 4
+!  store i32 %3, ptr %2, align 4
+!  %4 = alloca i32, i64 1, align 4
+!  %5 = load i32, ptr %loadgep_2, align 4
+!  store i32 %5, ptr %4, align 4
+!  br label %omp.par.region
+
+!omp.par.region:                                   ; preds = %omp.par.entry
+!  br label %omp.par.region1
+
+!omp.par.region1:                                  ; preds = %omp.par.region
+!  %6 = load i32, ptr %2, align 4
+!  %7 = load i32, ptr %4, align 4
+!  %8 = add i32 %6, %7
+!  %9 = add i32 %8, 2
+!  store i32 %9, ptr %2, align 4
+!  br label %omp.region.cont
+
+!omp.region.cont:                                  ; preds = %omp.par.region1
+!  br label %omp.par.pre_finalize
+
+!omp.par.pre_finalize:                             ; preds = %omp.region.cont
+!  br label %omp.par.outlined.exit.exitStub
+
+!omp.par.outlined.exit.exitStub:                   ; preds = %omp.par.pre_finalize
+!  ret void
+!}
+
+!; Function Attrs: nounwind
+!declare i32 @__kmpc_global_thread_num(ptr) #0
+
+!; Function Attrs: nounwind
+!declare !callback !2 void @__kmpc_fork_call(ptr, i32, ptr, ...) #0
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 17ce14fe642be..721568792d8f8 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -1092,6 +1092,75 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
                     llvm::Value *&replacementValue) -> InsertPointTy {
     replacementValue = &vPtr;
 
+    // If this is a private value, this lambda will return the corresponding
+    // mlir value and its `PrivateClauseOp`. Otherwise, empty values are
+    // returned.
+    auto [privVar,
+          privInit] = [&]() -> std::pair<mlir::Value, omp::PrivateClauseOp> {
+      if (!opInst.getPrivateVars().empty()) {
+        auto privVars = opInst.getPrivateVars();
+        auto privInits = opInst.getPrivateInits();
+        assert(privInits && privInits->size() == privVars.size());
+
+        const auto *privInitIt = privInits->begin();
+        for (auto privVarIt = privVars.begin(); privVarIt != privVars.end();
+             ++privVarIt, ++privInitIt) {
+          auto *llvmPrivVarOp = moduleTranslation.lookupValue(*privVarIt);
+          if (llvmPrivVarOp != &vPtr) {
+            continue;
+          }
+
+          auto privSym = llvm::cast<SymbolRefAttr>(*privInitIt);
+          auto privOp =
+              SymbolTable::lookupNearestSymbolFrom<omp::PrivateClauseOp>(
+                  opInst, privSym);
+
+          return {*privVarIt, privOp};
+        }
+      }
+
+      return {mlir::Value(), omp::PrivateClauseOp()};
+    }();
+
+    if (privVar) {
+
+      // Replace the privatizer block argument with mlir value being privatized.
+      // This way, the body of the privatizer will be changed from using the
+      // region/block argument to the value being privatized.
+      assert(privInit->getRegions().front().getNumArguments() == 1);
+
+      auto arg = privInit->getRegions().front().getArgument(0);
+      for (auto &op : privInit->getRegions().front().front()) {
+        op.replaceUsesOfWith(arg, privVar);
+      }
+
+      auto oldIP = builder.saveIP();
+      builder.restoreIP(allocaIP);
+
+      // Temporarily unlink the terminator from its parent since
+      // `inlineConvertOmpRegions` expects the insertion block to **not**
+      // contain a terminator.
+      auto &allocaTerminator = builder.GetInsertBlock()->back();
+      assert(lastInstr.isTerminator());
+      allocaTerminator.removeFromParent();
+
+      SmallVector<llvm::Value *, 1> yieldedValues;
+      if (failed(inlineConvertOmpRegions(privInit->getRegion(0),
+                                         "omp.privatizer", builder,
+                                         moduleTranslation, &yieldedValues))) {
+        // TODO proper error-handling.
+        builder.restoreIP(oldIP);
+        return codeGenIP;
+      }
+
+      allocaTerminator.insertAfter(&builder.GetInsertBlock()->back());
+
+      assert(yieldedValues.size() == 1);
+      replacementValue = yieldedValues.front();
+
+      builder.restoreIP(oldIP);
+    }
+
     return codeGenIP;
   };
 
@@ -2791,12 +2860,13 @@ LogicalResult OpenMPDialectLLVMIRTranslationInterface::convertOperation(
       .Case([&](omp::TargetOp) {
         return convertOmpTarget(*op, builder, moduleTranslation);
       })
-      .Case<omp::MapInfoOp, omp::DataBoundsOp>([&](auto op) {
-        // No-op, should be handled by relevant owning operations e.g.
-        // TargetOp, EnterDataOp, ExitDataOp, DataOp etc. and then
-        // discarded
-        return success();
-      })
+      .Case<omp::MapInfoOp, omp::DataBoundsOp, omp::PrivateClauseOp>(
+          [&](auto op) {
+            // No-op, should be handled by relevant owning operations e.g.
+            // TargetOp, EnterDataOp, ExitDataOp, DataOp etc. and then
+            // discarded
+            return success();
+          })
       .Default([&](Operation *inst) {
         return inst->emitError("unsupported OpenMP operation: ")
                << inst->getName();

>From fafb20f84c7913e02658199db391642efeaad879 Mon Sep 17 00:00:00 2001
From: ergawy <kareem.ergawy at amd.com>
Date: Fri, 2 Feb 2024 04:10:29 -0600
Subject: [PATCH 4/6] Handle some comments.

---
 flang/lib/Lower/OpenMP.cpp                    | 20 +++++------
 mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td |  6 ++--
 .../Conversion/SCFToOpenMP/SCFToOpenMP.cpp    |  4 +--
 mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp  | 34 ++++++++++---------
 .../OpenMP/OpenMPToLLVMIRTranslation.cpp      |  2 +-
 5 files changed, 34 insertions(+), 32 deletions(-)

diff --git a/flang/lib/Lower/OpenMP.cpp b/flang/lib/Lower/OpenMP.cpp
index 4d012c45108fd..9e66a52d8d958 100644
--- a/flang/lib/Lower/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP.cpp
@@ -163,7 +163,7 @@ class DataSharingProcessor {
 
   bool useDelayedPrivatizationWhenPossible;
   Fortran::lower::SymMap *symTable;
-  llvm::SetVector<mlir::SymbolRefAttr> privateInitializers;
+  llvm::SetVector<mlir::SymbolRefAttr> privatizers;
   llvm::SetVector<mlir::Value> privateSymHostAddrsses;
 
   bool needBarrier();
@@ -214,8 +214,8 @@ class DataSharingProcessor {
     loopIV = iv;
   }
 
-  const llvm::SetVector<mlir::SymbolRefAttr> &getPrivateInitializers() const {
-    return privateInitializers;
+  const llvm::SetVector<mlir::SymbolRefAttr> &getPrivatizers() const {
+    return privatizers;
   };
 
   const llvm::SetVector<mlir::Value> &getPrivateSymHostAddrsses() const {
@@ -547,7 +547,7 @@ void DataSharingProcessor::privatize() {
         symTable->popScope();
         firOpBuilder.restoreInsertionPoint(ip);
 
-        privateInitializers.insert(mlir::SymbolRefAttr::get(privatizerOp));
+        privatizers.insert(mlir::SymbolRefAttr::get(privatizerOp));
         privateSymHostAddrsses.insert(hsb.getAddr());
       } else {
         cloneSymbol(sym);
@@ -2584,8 +2584,8 @@ genParallelOp(Fortran::lower::AbstractConverter &converter,
     dsp.processStep1();
   }
 
-  llvm::SmallVector<mlir::Attribute> privateInits(
-      dsp.getPrivateInitializers().begin(), dsp.getPrivateInitializers().end());
+  llvm::SmallVector<mlir::Attribute> privatizers(dsp.getPrivatizers().begin(),
+                                                 dsp.getPrivatizers().end());
 
   llvm::SmallVector<mlir::Value> privateSymAddresses(
       dsp.getPrivateSymHostAddrsses().begin(),
@@ -2596,16 +2596,16 @@ genParallelOp(Fortran::lower::AbstractConverter &converter,
       &dsp,
       /*resultTypes=*/mlir::TypeRange(), ifClauseOperand,
       numThreadsClauseOperand, allocateOperands, allocatorOperands,
-      reductionVars, privateSymAddresses,
+      reductionVars,
       reductionDeclSymbols.empty()
           ? nullptr
           : mlir::ArrayAttr::get(converter.getFirOpBuilder().getContext(),
                                  reductionDeclSymbols),
-      procBindKindAttr,
-      privateInits.empty()
+      procBindKindAttr, privateSymAddresses,
+      privatizers.empty()
           ? nullptr
           : mlir::ArrayAttr::get(converter.getFirOpBuilder().getContext(),
-                                 privateInits));
+                                 privatizers));
 }
 
 static mlir::omp::SectionOp
diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
index 06fba004a30de..6d8010bb63695 100644
--- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
+++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
@@ -179,10 +179,10 @@ def ParallelOp : OpenMP_Op<"parallel", [
              Variadic<AnyType>:$allocate_vars,
              Variadic<AnyType>:$allocators_vars,
              Variadic<OpenMP_PointerLikeType>:$reduction_vars,
-             Variadic<AnyType>:$private_vars,
              OptionalAttr<SymbolRefArrayAttr>:$reductions,
              OptionalAttr<ProcBindKindAttr>:$proc_bind_val,
-             OptionalAttr<SymbolRefArrayAttr>:$private_inits);
+             Variadic<AnyType>:$private_vars,
+             OptionalAttr<SymbolRefArrayAttr>:$privatizers);
 
   let regions = (region AnyRegion:$region);
 
@@ -208,7 +208,7 @@ def ParallelOp : OpenMP_Op<"parallel", [
           | `proc_bind` `(` custom<ClauseAttr>($proc_bind_val) `)`
           | `private` `(`
               custom<PrivateVarList>(
-                $private_vars, type($private_vars), $private_inits
+                $private_vars, type($private_vars), $privatizers
               ) `)`
     ) $region attr-dict
   }];
diff --git a/mlir/lib/Conversion/SCFToOpenMP/SCFToOpenMP.cpp b/mlir/lib/Conversion/SCFToOpenMP/SCFToOpenMP.cpp
index b381aaf20bf89..889aa755d8ba4 100644
--- a/mlir/lib/Conversion/SCFToOpenMP/SCFToOpenMP.cpp
+++ b/mlir/lib/Conversion/SCFToOpenMP/SCFToOpenMP.cpp
@@ -419,10 +419,10 @@ struct ParallelOpLowering : public OpRewritePattern<scf::ParallelOp> {
         /* allocate_vars = */ llvm::SmallVector<Value>{},
         /* allocators_vars = */ llvm::SmallVector<Value>{},
         /* reduction_vars = */ llvm::SmallVector<Value>{},
-        /*private_vars=*/mlir::ValueRange{},
         /* reductions = */ ArrayAttr{},
         /* proc_bind_val = */ omp::ClauseProcBindKindAttr{},
-        /*private_inits*/ nullptr);
+        /*private_vars=*/mlir::ValueRange{},
+        /*privatizers=*/nullptr);
     {
 
       OpBuilder::InsertionGuard guard(rewriter);
diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
index c4ef7ef3f2fb5..a227473c9cdca 100644
--- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
+++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
@@ -989,9 +989,10 @@ void ParallelOp::build(OpBuilder &builder, OperationState &state,
   ParallelOp::build(
       builder, state, /*if_expr_var=*/nullptr, /*num_threads_var=*/nullptr,
       /*allocate_vars=*/ValueRange(), /*allocators_vars=*/ValueRange(),
-      /*reduction_vars=*/ValueRange(), /*private_vars=*/ValueRange(),
+      /*reduction_vars=*/ValueRange(),
       /*reductions=*/nullptr,
-      /*proc_bind_val=*/nullptr, /*private_inits*/ nullptr);
+      /*proc_bind_val=*/nullptr, /*private_vars=*/ValueRange(),
+      /*privatizers*/ nullptr);
   state.addAttributes(attributes);
 }
 
@@ -1610,20 +1611,20 @@ void PrivateClauseOp::build(OpBuilder &odsBuilder, OperationState &odsState,
 static ParseResult parsePrivateVarList(
     OpAsmParser &parser,
     llvm::SmallVector<OpAsmParser::UnresolvedOperand, 4> &privateVarsOperands,
-    llvm::SmallVector<Type, 1> &privateVarsTypes, ArrayAttr &privateInitsAttr) {
+    llvm::SmallVector<Type, 1> &privateVarsTypes, ArrayAttr &privatizersAttr) {
   SymbolRefAttr privatizerSym;
   OpAsmParser::UnresolvedOperand arg;
   OpAsmParser::UnresolvedOperand blockArg;
   Type argType;
 
-  SmallVector<SymbolRefAttr> privateInitsVec;
+  SmallVector<SymbolRefAttr> privatizersVec;
 
   auto parsePrivatizers = [&]() -> ParseResult {
     if (parser.parseAttribute(privatizerSym) || parser.parseOperand(arg)) {
       return failure();
     }
 
-    privateInitsVec.push_back(privatizerSym);
+    privatizersVec.push_back(privatizerSym);
     privateVarsOperands.push_back(arg);
     return success();
   };
@@ -1638,9 +1639,9 @@ static ParseResult parsePrivateVarList(
   if (parser.parseCommaSeparatedList(parsePrivatizers))
     return failure();
 
-  SmallVector<Attribute> privateInits(privateInitsVec.begin(),
-                                      privateInitsVec.end());
-  privateInitsAttr = ArrayAttr::get(parser.getContext(), privateInits);
+  SmallVector<Attribute> privatizers(privatizersVec.begin(),
+                                     privatizersVec.end());
+  privatizersAttr = ArrayAttr::get(parser.getContext(), privatizers);
 
   if (parser.parseColon())
     return failure();
@@ -1654,17 +1655,18 @@ static ParseResult parsePrivateVarList(
 static void printPrivateVarList(OpAsmPrinter &printer, Operation *op,
                                 OperandRange privateVars,
                                 TypeRange privateVarTypes,
-                                std::optional<ArrayAttr> privateInitsAttr) {
+                                std::optional<ArrayAttr> privatizersAttr) {
   unsigned argIndex = 0;
-  assert(privateVars.size() == privateVarTypes.size() &&
-         ((privateVars.empty()) ||
-          (*privateInitsAttr &&
-           (privateInitsAttr->size() == privateVars.size()))));
+  // TODO Add an op verifier instead of this assertion.
+  assert(
+      privateVars.size() == privateVarTypes.size() &&
+      ((privateVars.empty()) ||
+       (*privatizersAttr && (privatizersAttr->size() == privateVars.size()))));
 
   for (const auto &privateVar : privateVars) {
-    assert(privateInitsAttr);
-    const auto &privateInitSym = (*privateInitsAttr)[argIndex];
-    printer << privateInitSym << " " << privateVar;
+    assert(privatizersAttr);
+    const auto &privatizerSym = (*privatizersAttr)[argIndex];
+    printer << privatizerSym << " " << privateVar;
 
     argIndex++;
     if (argIndex < privateVars.size())
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 721568792d8f8..8c55d7fb75679 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -1099,7 +1099,7 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
           privInit] = [&]() -> std::pair<mlir::Value, omp::PrivateClauseOp> {
       if (!opInst.getPrivateVars().empty()) {
         auto privVars = opInst.getPrivateVars();
-        auto privInits = opInst.getPrivateInits();
+        auto privInits = opInst.getPrivatizers();
         assert(privInits && privInits->size() == privVars.size());
 
         const auto *privInitIt = privInits->begin();

>From c46026c10544edf00f701a99f702b53b599a176e Mon Sep 17 00:00:00 2001
From: ergawy <kareem.ergawy at amd.com>
Date: Fri, 2 Feb 2024 09:20:54 -0600
Subject: [PATCH 5/6] Add region args.

---
 flang/lib/Lower/OpenMP.cpp                    | 195 ++++++++++++------
 .../OpenMP/FIR/delayed_privatization.f90      |  15 +-
 .../OpenMP/OpenMPToLLVMIRTranslation.cpp      |  24 +++
 3 files changed, 164 insertions(+), 70 deletions(-)

diff --git a/flang/lib/Lower/OpenMP.cpp b/flang/lib/Lower/OpenMP.cpp
index 9e66a52d8d958..a1863229a3280 100644
--- a/flang/lib/Lower/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP.cpp
@@ -147,6 +147,14 @@ static void genNestedEvaluations(Fortran::lower::AbstractConverter &converter,
 //===----------------------------------------------------------------------===//
 
 class DataSharingProcessor {
+public:
+  struct DelayedPrivatizationInfo {
+    llvm::SetVector<mlir::SymbolRefAttr> privatizers;
+    llvm::SetVector<mlir::Value> hostAddresses;
+    llvm::SetVector<const Fortran::semantics::Symbol *> hostSymbols;
+  };
+
+private:
   bool hasLastPrivateOp;
   mlir::OpBuilder::InsertPoint lastPrivIP;
   mlir::OpBuilder::InsertPoint insPt;
@@ -163,8 +171,8 @@ class DataSharingProcessor {
 
   bool useDelayedPrivatizationWhenPossible;
   Fortran::lower::SymMap *symTable;
-  llvm::SetVector<mlir::SymbolRefAttr> privatizers;
-  llvm::SetVector<mlir::Value> privateSymHostAddrsses;
+
+  DelayedPrivatizationInfo delayedPrivatizationInfo;
 
   bool needBarrier();
   void collectSymbols(Fortran::semantics::Symbol::Flag flag);
@@ -214,12 +222,8 @@ class DataSharingProcessor {
     loopIV = iv;
   }
 
-  const llvm::SetVector<mlir::SymbolRefAttr> &getPrivatizers() const {
-    return privatizers;
-  };
-
-  const llvm::SetVector<mlir::Value> &getPrivateSymHostAddrsses() const {
-    return privateSymHostAddrsses;
+  const DelayedPrivatizationInfo &getDelayedPrivatizationInfo() const {
+    return delayedPrivatizationInfo;
   }
 };
 
@@ -547,8 +551,10 @@ void DataSharingProcessor::privatize() {
         symTable->popScope();
         firOpBuilder.restoreInsertionPoint(ip);
 
-        privatizers.insert(mlir::SymbolRefAttr::get(privatizerOp));
-        privateSymHostAddrsses.insert(hsb.getAddr());
+        delayedPrivatizationInfo.privatizers.insert(
+            mlir::SymbolRefAttr::get(privatizerOp));
+        delayedPrivatizationInfo.hostAddresses.insert(hsb.getAddr());
+        delayedPrivatizationInfo.hostSymbols.insert(sym);
       } else {
         cloneSymbol(sym);
         copyFirstPrivateSymbol(sym);
@@ -2305,7 +2311,9 @@ static void createBodyOfOp(
     Op &op, Fortran::lower::AbstractConverter &converter, mlir::Location &loc,
     Fortran::lower::pft::Evaluation &eval, bool genNested,
     const Fortran::parser::OmpClauseList *clauses = nullptr,
-    const llvm::SmallVector<const Fortran::semantics::Symbol *> &args = {},
+    std::function<llvm::SmallVector<const Fortran::semantics::Symbol *>(
+        mlir::Operation *)>
+        genRegionEntryCB = nullptr,
     bool outerCombined = false, DataSharingProcessor *dsp = nullptr) {
   fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
 
@@ -2319,27 +2327,15 @@ static void createBodyOfOp(
   // argument. Also update the symbol's address with the mlir argument value.
   // e.g. For loops the argument is the induction variable. And all further
   // uses of the induction variable should use this mlir value.
-  if (args.size()) {
-    std::size_t loopVarTypeSize = 0;
-    for (const Fortran::semantics::Symbol *arg : args)
-      loopVarTypeSize = std::max(loopVarTypeSize, arg->GetUltimate().size());
-    mlir::Type loopVarType = getLoopVarType(converter, loopVarTypeSize);
-    llvm::SmallVector<mlir::Type> tiv(args.size(), loopVarType);
-    llvm::SmallVector<mlir::Location> locs(args.size(), loc);
-    firOpBuilder.createBlock(&op.getRegion(), {}, tiv, locs);
-    // The argument is not currently in memory, so make a temporary for the
-    // argument, and store it there, then bind that location to the argument.
-    mlir::Operation *storeOp = nullptr;
-    for (auto [argIndex, argSymbol] : llvm::enumerate(args)) {
-      mlir::Value indexVal =
-          fir::getBase(op.getRegion().front().getArgument(argIndex));
-      storeOp =
-          createAndSetPrivatizedLoopVar(converter, loc, indexVal, argSymbol);
+  auto regionArgs =
+      [&]() -> llvm::SmallVector<const Fortran::semantics::Symbol *> {
+    if (genRegionEntryCB != nullptr) {
+      return genRegionEntryCB(op);
     }
-    firOpBuilder.setInsertionPointAfter(storeOp);
-  } else {
+
     firOpBuilder.createBlock(&op.getRegion());
-  }
+    return {};
+  }();
 
   // Mark the earliest insertion point.
   mlir::Operation *marker = insertMarker(firOpBuilder);
@@ -2437,8 +2433,8 @@ static void createBodyOfOp(
         assert(tempDsp.has_value());
         tempDsp->processStep2(op, isLoop);
       } else {
-        if (isLoop && args.size() > 0)
-          dsp->setLoopIV(converter.getSymbolAddress(*args[0]));
+        if (isLoop && regionArgs.size() > 0)
+          dsp->setLoopIV(converter.getSymbolAddress(*regionArgs[0]));
         dsp->processStep2(op, isLoop);
       }
     }
@@ -2514,16 +2510,19 @@ static void genBodyOfTargetDataOp(
 }
 
 template <typename OpTy, typename... Args>
-static OpTy genOpWithBody(Fortran::lower::AbstractConverter &converter,
-                          Fortran::lower::pft::Evaluation &eval, bool genNested,
-                          mlir::Location currentLocation, bool outerCombined,
-                          const Fortran::parser::OmpClauseList *clauseList,
-                          DataSharingProcessor *dsp, Args &&...args) {
+static OpTy genOpWithBody(
+    Fortran::lower::AbstractConverter &converter,
+    Fortran::lower::pft::Evaluation &eval, bool genNested,
+    mlir::Location currentLocation, bool outerCombined,
+    const Fortran::parser::OmpClauseList *clauseList,
+    std::function<llvm::SmallVector<const Fortran::semantics::Symbol *>(
+        mlir::Operation *)>
+        genRegionEntryCB,
+    DataSharingProcessor *dsp, Args &&...args) {
   auto op = converter.getFirOpBuilder().create<OpTy>(
       currentLocation, std::forward<Args>(args)...);
   createBodyOfOp<OpTy>(op, converter, currentLocation, eval, genNested,
-                       clauseList,
-                       /*args=*/{}, outerCombined, dsp);
+                       clauseList, genRegionEntryCB, outerCombined, dsp);
   return op;
 }
 
@@ -2531,24 +2530,24 @@ static mlir::omp::MasterOp
 genMasterOp(Fortran::lower::AbstractConverter &converter,
             Fortran::lower::pft::Evaluation &eval, bool genNested,
             mlir::Location currentLocation) {
-  return genOpWithBody<mlir::omp::MasterOp>(converter, eval, genNested,
-                                            currentLocation,
-                                            /*outerCombined=*/false,
-                                            /*clauseList=*/nullptr,
-                                            /*dsp=*/nullptr,
-                                            /*resultTypes=*/mlir::TypeRange());
+  return genOpWithBody<mlir::omp::MasterOp>(
+      converter, eval, genNested, currentLocation,
+      /*outerCombined=*/false,
+      /*clauseList=*/nullptr, /*genRegionEntryCB=*/nullptr,
+      /*dsp=*/nullptr,
+      /*resultTypes=*/mlir::TypeRange());
 }
 
 static mlir::omp::OrderedRegionOp
 genOrderedRegionOp(Fortran::lower::AbstractConverter &converter,
                    Fortran::lower::pft::Evaluation &eval, bool genNested,
                    mlir::Location currentLocation) {
-  return genOpWithBody<mlir::omp::OrderedRegionOp>(converter, eval, genNested,
-                                                   currentLocation,
-                                                   /*outerCombined=*/false,
-                                                   /*clauseList=*/nullptr,
-                                                   /*dsp=*/nullptr,
-                                                   /*simd=*/false);
+  return genOpWithBody<mlir::omp::OrderedRegionOp>(
+      converter, eval, genNested, currentLocation,
+      /*outerCombined=*/false,
+      /*clauseList=*/nullptr, /*genRegionEntryCB=*/nullptr,
+      /*dsp=*/nullptr,
+      /*simd=*/false);
 }
 
 static mlir::omp::ParallelOp
@@ -2584,16 +2583,44 @@ genParallelOp(Fortran::lower::AbstractConverter &converter,
     dsp.processStep1();
   }
 
-  llvm::SmallVector<mlir::Attribute> privatizers(dsp.getPrivatizers().begin(),
-                                                 dsp.getPrivatizers().end());
+  const auto &delayedPrivatizationInfo = dsp.getDelayedPrivatizationInfo();
+  llvm::SmallVector<mlir::Attribute> privatizers(
+      delayedPrivatizationInfo.privatizers.begin(),
+      delayedPrivatizationInfo.privatizers.end());
 
   llvm::SmallVector<mlir::Value> privateSymAddresses(
-      dsp.getPrivateSymHostAddrsses().begin(),
-      dsp.getPrivateSymHostAddrsses().end());
+      delayedPrivatizationInfo.hostAddresses.begin(),
+      delayedPrivatizationInfo.hostAddresses.end());
+
+  auto genRegionEntryCB = [&](mlir::Operation *op) {
+    auto parallelOp = llvm::cast<mlir::omp::ParallelOp>(op);
+    auto privateVars = parallelOp.getPrivateVars();
+    auto &region = parallelOp.getRegion();
+    llvm::SmallVector<mlir::Type> privateVarTypes;
+    llvm::SmallVector<mlir::Location> privateVarLocs;
+
+    for (auto privateVar : privateVars) {
+      privateVarTypes.push_back(privateVar.getType());
+      privateVarLocs.push_back(privateVar.getLoc());
+    }
+
+    converter.getFirOpBuilder().createBlock(&region, {}, privateVarTypes,
+                                            privateVarLocs);
+
+    int argIdx = 0;
+    for (const auto *sym : delayedPrivatizationInfo.hostSymbols) {
+      converter.bindSymbol(*sym, region.getArgument(argIdx));
+      ++argIdx;
+    }
+
+    return llvm::SmallVector<const Fortran::semantics::Symbol *>(
+        delayedPrivatizationInfo.hostSymbols.begin(),
+        delayedPrivatizationInfo.hostSymbols.end());
+  };
 
   return genOpWithBody<mlir::omp::ParallelOp>(
       converter, eval, genNested, currentLocation, outerCombined, &clauseList,
-      &dsp,
+      genRegionEntryCB, &dsp,
       /*resultTypes=*/mlir::TypeRange(), ifClauseOperand,
       numThreadsClauseOperand, allocateOperands, allocatorOperands,
       reductionVars,
@@ -2618,6 +2645,7 @@ genSectionOp(Fortran::lower::AbstractConverter &converter,
   return genOpWithBody<mlir::omp::SectionOp>(
       converter, eval, genNested, currentLocation,
       /*outerCombined=*/false, &sectionsClauseList,
+      /*genRegionEntryCB=*/nullptr,
       /*dsp=*/nullptr);
 }
 
@@ -2639,8 +2667,8 @@ genSingleOp(Fortran::lower::AbstractConverter &converter,
 
   return genOpWithBody<mlir::omp::SingleOp>(
       converter, eval, genNested, currentLocation,
-      /*outerCombined=*/false, &beginClauseList, /*dsp=*/nullptr,
-      allocateOperands, allocatorOperands, nowaitAttr);
+      /*outerCombined=*/false, &beginClauseList, /*genRegionEntryCB=*/nullptr,
+      /*dsp=*/nullptr, allocateOperands, allocatorOperands, nowaitAttr);
 }
 
 static mlir::omp::TaskOp
@@ -2672,8 +2700,9 @@ genTaskOp(Fortran::lower::AbstractConverter &converter,
 
   return genOpWithBody<mlir::omp::TaskOp>(
       converter, eval, genNested, currentLocation,
-      /*outerCombined=*/false, &clauseList, /*dsp=*/nullptr, ifClauseOperand,
-      finalClauseOperand, untiedAttr, mergeableAttr,
+      /*outerCombined=*/false, &clauseList, /*genRegionEntryCB=*/nullptr,
+      /*dsp=*/nullptr, ifClauseOperand, finalClauseOperand, untiedAttr,
+      mergeableAttr,
       /*in_reduction_vars=*/mlir::ValueRange(),
       /*in_reductions=*/nullptr, priorityClauseOperand,
       dependTypeOperands.empty()
@@ -2695,7 +2724,7 @@ genTaskGroupOp(Fortran::lower::AbstractConverter &converter,
       currentLocation, llvm::omp::Directive::OMPD_taskgroup);
   return genOpWithBody<mlir::omp::TaskGroupOp>(
       converter, eval, genNested, currentLocation,
-      /*outerCombined=*/false, &clauseList,
+      /*outerCombined=*/false, &clauseList, /*genRegionEntryCB=*/nullptr,
       /*dsp=*/nullptr,
       /*task_reduction_vars=*/mlir::ValueRange(),
       /*task_reductions=*/nullptr, allocateOperands, allocatorOperands);
@@ -3076,6 +3105,7 @@ genTeamsOp(Fortran::lower::AbstractConverter &converter,
 
   return genOpWithBody<mlir::omp::TeamsOp>(
       converter, eval, genNested, currentLocation, outerCombined, &clauseList,
+      /*genRegionEntryCB=*/nullptr,
       /*dsp=*/nullptr,
       /*num_teams_lower=*/nullptr, numTeamsClauseOperand, ifClauseOperand,
       threadLimitClauseOperand, allocateOperands, allocatorOperands,
@@ -3273,6 +3303,33 @@ static void convertLoopBounds(Fortran::lower::AbstractConverter &converter,
   }
 }
 
+static llvm::SmallVector<const Fortran::semantics::Symbol *> genCodeForIterVar(
+    mlir::Operation *op, Fortran::lower::AbstractConverter &converter,
+    mlir::Location &loc,
+    const llvm::SmallVector<const Fortran::semantics::Symbol *> &args) {
+  fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
+  auto &region = op->getRegion(0);
+
+  std::size_t loopVarTypeSize = 0;
+  for (const Fortran::semantics::Symbol *arg : args)
+    loopVarTypeSize = std::max(loopVarTypeSize, arg->GetUltimate().size());
+  mlir::Type loopVarType = getLoopVarType(converter, loopVarTypeSize);
+  llvm::SmallVector<mlir::Type> tiv(args.size(), loopVarType);
+  llvm::SmallVector<mlir::Location> locs(args.size(), loc);
+  firOpBuilder.createBlock(&region, {}, tiv, locs);
+  // The argument is not currently in memory, so make a temporary for the
+  // argument, and store it there, then bind that location to the argument.
+  mlir::Operation *storeOp = nullptr;
+  for (auto [argIndex, argSymbol] : llvm::enumerate(args)) {
+    mlir::Value indexVal = fir::getBase(region.front().getArgument(argIndex));
+    storeOp =
+        createAndSetPrivatizedLoopVar(converter, loc, indexVal, argSymbol);
+  }
+  firOpBuilder.setInsertionPointAfter(storeOp);
+
+  return args;
+}
+
 static void
 createSimdLoop(Fortran::lower::AbstractConverter &converter,
                Fortran::lower::pft::Evaluation &eval,
@@ -3320,9 +3377,14 @@ createSimdLoop(Fortran::lower::AbstractConverter &converter,
 
   auto *nestedEval = getCollapsedLoopEval(
       eval, Fortran::lower::getCollapseValue(loopOpClauseList));
+
+  auto ivCallback = [&](mlir::Operation *op) {
+    return genCodeForIterVar(op, converter, loc, iv);
+  };
+
   createBodyOfOp<mlir::omp::SimdLoopOp>(simdLoopOp, converter, loc, *nestedEval,
                                         /*genNested=*/true, &loopOpClauseList,
-                                        iv, /*outer=*/false, &dsp);
+                                        ivCallback, /*outer=*/false, &dsp);
 }
 
 static void createWsLoop(Fortran::lower::AbstractConverter &converter,
@@ -3395,8 +3457,14 @@ static void createWsLoop(Fortran::lower::AbstractConverter &converter,
 
   auto *nestedEval = getCollapsedLoopEval(
       eval, Fortran::lower::getCollapseValue(beginClauseList));
+
+  auto ivCallback = [&](mlir::Operation *op) {
+    return genCodeForIterVar(op, converter, loc, iv);
+  };
+
   createBodyOfOp<mlir::omp::WsLoopOp>(wsLoopOp, converter, loc, *nestedEval,
-                                      /*genNested=*/true, &beginClauseList, iv,
+                                      /*genNested=*/true, &beginClauseList,
+                                      ivCallback,
                                       /*outer=*/false, &dsp);
 }
 
@@ -3725,6 +3793,7 @@ genOMP(Fortran::lower::AbstractConverter &converter,
                                        /*genNested=*/false, currentLocation,
                                        /*outerCombined=*/false,
                                        /*clauseList=*/nullptr,
+                                       /*genRegionEntryCB=*/nullptr,
                                        /*dsp=*/nullptr,
                                        /*reduction_vars=*/mlir::ValueRange(),
                                        /*reductions=*/nullptr, allocateOperands,
diff --git a/flang/test/Lower/OpenMP/FIR/delayed_privatization.f90 b/flang/test/Lower/OpenMP/FIR/delayed_privatization.f90
index c6c8523beb676..ba97a86145827 100644
--- a/flang/test/Lower/OpenMP/FIR/delayed_privatization.f90
+++ b/flang/test/Lower/OpenMP/FIR/delayed_privatization.f90
@@ -29,12 +29,13 @@ subroutine delayed_privatization()
 !    %c222_i32 = arith.constant 222 : i32
 !    fir.store %c222_i32 to %1 : !fir.ref<i32>
 !    omp.parallel private(@var1.privatizer %0, @var2.privatizer %1 : !fir.ref<i32>, !fir.ref<i32>) {
-!      %2 = fir.load %0 : !fir.ref<i32>
-!      %3 = fir.load %1 : !fir.ref<i32>
+!    ^bb0(%arg0: !fir.ref<i32>, %arg1: !fir.ref<i32>):
+!      %2 = fir.load %arg0 : !fir.ref<i32>
+!      %3 = fir.load %arg1 : !fir.ref<i32>
 !      %4 = arith.addi %2, %3 : i32
 !      %c2_i32 = arith.constant 2 : i32
 !      %5 = arith.addi %4, %c2_i32 : i32
-!      fir.store %5 to %0 : !fir.ref<i32>
+!      fir.store %5 to %arg0 : !fir.ref<i32>
 !      omp.terminator
 !    }
 !    return
@@ -53,7 +54,6 @@ subroutine delayed_privatization()
 !    fir.store %1 to %0 : !fir.ref<i32>
 !    omp.yield(%0 : !fir.ref<i32>)
 !  }) : () -> ()
-!}
 !
 ! -----------------------------
 ! ### Conversion to LLVM + OMP:
@@ -69,12 +69,13 @@ subroutine delayed_privatization()
 !    %5 = llvm.mlir.constant(222 : i32) : i32
 !    llvm.store %5, %3 : i32, !llvm.ptr
 !    omp.parallel private(@var1.privatizer %1, @var2.privatizer %3 : !llvm.ptr, !llvm.ptr) {
-!      %6 = llvm.load %1 : !llvm.ptr -> i32
-!      %7 = llvm.load %3 : !llvm.ptr -> i32
+!    ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr):
+!      %6 = llvm.load %arg0 : !llvm.ptr -> i32
+!      %7 = llvm.load %arg1 : !llvm.ptr -> i32
 !      %8 = llvm.add %6, %7  : i32
 !      %9 = llvm.mlir.constant(2 : i32) : i32
 !      %10 = llvm.add %8, %9  : i32
-!      llvm.store %10, %1 : i32, !llvm.ptr
+!      llvm.store %10, %arg0 : i32, !llvm.ptr
 !      omp.terminator
 !    }
 !    llvm.return
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 8c55d7fb75679..f5e342f4a8022 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -1000,6 +1000,29 @@ convertOmpWsLoop(Operation &opInst, llvm::IRBuilderBase &builder,
   return success();
 }
 
+/// Replace the region arguments of the parallel op (which correspond to private
+/// variables) with the actual private varibles they correspond to. This
+/// prepares the parallel op so that it matches what is expected by the
+/// OMPIRBuilder.
+static void prepareOmpParallel(omp::ParallelOp opInst) {
+  auto &region = opInst.getRegion();
+  auto privateVars = opInst.getPrivateVars();
+
+  auto privateVarsIt = privateVars.begin();
+  for (size_t argIdx = 0; argIdx < region.getNumArguments();
+       ++argIdx, ++privateVarsIt) {
+    for (auto &block : region) {
+      for (auto &op : block) {
+        op.replaceUsesOfWith(region.getArgument(argIdx), *privateVarsIt);
+      }
+    }
+  }
+
+  for (size_t argIdx = 0; argIdx < region.getNumArguments(); ++argIdx) {
+    region.eraseArgument(argIdx);
+  }
+}
+
 /// Converts the OpenMP parallel operation to LLVM IR.
 static LogicalResult
 convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
@@ -1008,6 +1031,7 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
   // TODO: support error propagation in OpenMPIRBuilder and use it instead of
   // relying on captured variables.
   LogicalResult bodyGenStatus = success();
+  prepareOmpParallel(opInst);
   llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();
 
   auto bodyGenCB = [&](InsertPointTy allocaIP, InsertPointTy codeGenIP) {

>From b1b1bc1856ffee07ee1379c8b3bc88020e69b999 Mon Sep 17 00:00:00 2001
From: ergawy <kareem.ergawy at amd.com>
Date: Sun, 4 Feb 2024 06:45:36 -0600
Subject: [PATCH 6/6] Fix compliation error.

---
 .../Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp  | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index f5e342f4a8022..c6e7f3e070377 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -1165,7 +1165,7 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
       // `inlineConvertOmpRegions` expects the insertion block to **not**
       // contain a terminator.
       auto &allocaTerminator = builder.GetInsertBlock()->back();
-      assert(lastInstr.isTerminator());
+      assert(allocaTerminator.isTerminator());
       allocaTerminator.removeFromParent();
 
       SmallVector<llvm::Value *, 1> yieldedValues;



More information about the flang-commits mailing list