[flang-commits] [flang] [mlir] [WIP] Delayed privatization. (PR #79862)

Kareem Ergawy via flang-commits flang-commits at lists.llvm.org
Wed Feb 7 06:25:30 PST 2024


https://github.com/ergawy updated https://github.com/llvm/llvm-project/pull/79862

>From 7db8eb6385c2bbd0a959c272b22c67d37e545886 Mon Sep 17 00:00:00 2001
From: ergawy <kareem.ergawy at amd.com>
Date: Mon, 29 Jan 2024 04:45:18 -0600
Subject: [PATCH 1/2] [WIP] Delayed privatization.

This is a PoC for delayed privatization in OpenMP. Instead of directly
emitting privatization code in the frontend, we add a new op that outlines
the privatization logic for a symbol, together with a call-like mapping
from the host symbol to the outlined function-like privatizer op.

Later, we would inline the delayed privatizer function-like op into the
OpenMP region to get essentially the same code that the frontend
currently generates directly.
---
 flang/include/flang/Lower/AbstractConverter.h |   4 +
 flang/lib/Lower/Bridge.cpp                    |   2 +-
 flang/lib/Lower/OpenMP.cpp                    | 314 ++++++++++++++----
 .../OpenMP/FIR/delayed_privatization.f90      | 182 ++++++++++
 mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td |  43 ++-
 .../Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp  |  23 +-
 .../Conversion/SCFToOpenMP/SCFToOpenMP.cpp    |   4 +-
 mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp  |  93 +++++-
 .../OpenMP/OpenMPToLLVMIRTranslation.cpp      | 106 +++++-
 mlir/test/Dialect/OpenMP/ops.mlir             |  10 +-
 mlir/test/Dialect/OpenMP/roundtrip.mlir       |  36 ++
 11 files changed, 736 insertions(+), 81 deletions(-)
 create mode 100644 flang/test/Lower/OpenMP/FIR/delayed_privatization.f90
 create mode 100644 mlir/test/Dialect/OpenMP/roundtrip.mlir

diff --git a/flang/include/flang/Lower/AbstractConverter.h b/flang/include/flang/Lower/AbstractConverter.h
index 796933a4eb5f68..55bc33e76e5f6e 100644
--- a/flang/include/flang/Lower/AbstractConverter.h
+++ b/flang/include/flang/Lower/AbstractConverter.h
@@ -16,6 +16,7 @@
 #include "flang/Common/Fortran.h"
 #include "flang/Lower/LoweringOptions.h"
 #include "flang/Lower/PFTDefs.h"
+#include "flang/Lower/SymbolMap.h"
 #include "flang/Optimizer/Builder/BoxValue.h"
 #include "flang/Semantics/symbol.h"
 #include "mlir/IR/Builders.h"
@@ -296,6 +297,9 @@ class AbstractConverter {
     return loweringOptions;
   }
 
+  virtual Fortran::lower::SymbolBox
+  lookupOneLevelUpSymbol(const Fortran::semantics::Symbol &sym) = 0;
+
 private:
   /// Options controlling lowering behavior.
   const Fortran::lower::LoweringOptions &loweringOptions;
diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp
index 579f94ba756841..7a0804d57ff3ad 100644
--- a/flang/lib/Lower/Bridge.cpp
+++ b/flang/lib/Lower/Bridge.cpp
@@ -1070,7 +1070,7 @@ class FirConverter : public Fortran::lower::AbstractConverter {
   /// Find the symbol in one level up of symbol map such as for host-association
   /// in OpenMP code or return null.
   Fortran::lower::SymbolBox
-  lookupOneLevelUpSymbol(const Fortran::semantics::Symbol &sym) {
+  lookupOneLevelUpSymbol(const Fortran::semantics::Symbol &sym) override {
     if (Fortran::lower::SymbolBox v = localSymbols.lookupOneLevelUpSymbol(sym))
       return v;
     return {};
diff --git a/flang/lib/Lower/OpenMP.cpp b/flang/lib/Lower/OpenMP.cpp
index 0a68aba162618b..81160086b1e835 100644
--- a/flang/lib/Lower/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP.cpp
@@ -32,6 +32,7 @@
 #include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/Transforms/RegionUtils.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringSet.h"
 #include "llvm/Frontend/OpenMP/OMPConstants.h"
 #include "llvm/Support/CommandLine.h"
 
@@ -40,6 +41,12 @@ static llvm::cl::opt<bool> treatIndexAsSection(
     llvm::cl::desc("In the OpenMP data clauses treat `a(N)` as `a(N:N)`."),
     llvm::cl::init(true));
 
+static llvm::cl::opt<bool> enableDelayedPrivatization(
+    "openmp-enable-delayed-privatization",
+    llvm::cl::desc(
+        "Emit `[first]private` variables as clauses on the MLIR ops."),
+    llvm::cl::init(false));
+
 using DeclareTargetCapturePair =
     std::pair<mlir::omp::DeclareTargetCaptureClause,
               Fortran::semantics::Symbol>;
@@ -147,6 +154,14 @@ static void genNestedEvaluations(Fortran::lower::AbstractConverter &converter,
 //===----------------------------------------------------------------------===//
 
 class DataSharingProcessor {
+public:
+  struct DelayedPrivatizationInfo {
+    llvm::SetVector<mlir::SymbolRefAttr> privatizers;
+    llvm::SetVector<mlir::Value> hostAddresses;
+    llvm::SetVector<const Fortran::semantics::Symbol *> hostSymbols;
+  };
+
+private:
   bool hasLastPrivateOp;
   mlir::OpBuilder::InsertPoint lastPrivIP;
   mlir::OpBuilder::InsertPoint insPt;
@@ -161,6 +176,12 @@ class DataSharingProcessor {
   const Fortran::parser::OmpClauseList &opClauseList;
   Fortran::lower::pft::Evaluation &eval;
 
+  bool useDelayedPrivatization;
+  llvm::SetVector<mlir::StringRef> existingPrivatizerNames;
+  Fortran::lower::SymMap *symTable;
+
+  DelayedPrivatizationInfo delayedPrivatizationInfo;
+
   bool needBarrier();
   void collectSymbols(Fortran::semantics::Symbol::Flag flag);
   void collectOmpObjectListSymbol(
@@ -171,6 +192,8 @@ class DataSharingProcessor {
   void collectDefaultSymbols();
   void privatize();
   void defaultPrivatize();
+  void doPrivatize(const Fortran::semantics::Symbol *sym);
+
   void copyLastPrivatize(mlir::Operation *op);
   void insertLastPrivateCompare(mlir::Operation *op);
   void cloneSymbol(const Fortran::semantics::Symbol *sym);
@@ -182,10 +205,20 @@ class DataSharingProcessor {
 public:
   DataSharingProcessor(Fortran::lower::AbstractConverter &converter,
                        const Fortran::parser::OmpClauseList &opClauseList,
-                       Fortran::lower::pft::Evaluation &eval)
+                       Fortran::lower::pft::Evaluation &eval,
+                       bool useDelayedPrivatization = false,
+                       Fortran::lower::SymMap *symTable = nullptr)
       : hasLastPrivateOp(false), converter(converter),
         firOpBuilder(converter.getFirOpBuilder()), opClauseList(opClauseList),
-        eval(eval) {}
+        eval(eval), useDelayedPrivatization(useDelayedPrivatization),
+        symTable(symTable) {
+    for (auto privateOp : converter.getModuleOp()
+                              .getRegion()
+                              .getOps<mlir::omp::PrivateClauseOp>()) {
+      existingPrivatizerNames.insert(privateOp.getSymName());
+    }
+  }
+
   // Privatisation is split into two steps.
   // Step1 performs cloning of all privatisation clauses and copying for
   // firstprivates. Step1 is performed at the place where process/processStep1
@@ -204,6 +237,10 @@ class DataSharingProcessor {
     assert(!loopIV && "Loop iteration variable already set");
     loopIV = iv;
   }
+
+  const DelayedPrivatizationInfo &getDelayedPrivatizationInfo() const {
+    return delayedPrivatizationInfo;
+  }
 };
 
 void DataSharingProcessor::processStep1() {
@@ -488,16 +525,15 @@ void DataSharingProcessor::collectDefaultSymbols() {
 }
 
 void DataSharingProcessor::privatize() {
+
   for (const Fortran::semantics::Symbol *sym : privatizedSymbols) {
     if (const auto *commonDet =
             sym->detailsIf<Fortran::semantics::CommonBlockDetails>()) {
       for (const auto &mem : commonDet->objects()) {
-        cloneSymbol(&*mem);
-        copyFirstPrivateSymbol(&*mem);
+        doPrivatize(&*mem);
       }
     } else {
-      cloneSymbol(sym);
-      copyFirstPrivateSymbol(sym);
+      doPrivatize(sym);
     }
   }
 }
@@ -523,12 +559,66 @@ void DataSharingProcessor::defaultPrivatize() {
         !symbolsInNestedRegions.contains(sym) &&
         !symbolsInParentRegions.contains(sym) &&
         !privatizedSymbols.contains(sym)) {
-      cloneSymbol(sym);
-      copyFirstPrivateSymbol(sym);
+      doPrivatize(sym);
     }
   }
 }
 
+void DataSharingProcessor::doPrivatize(const Fortran::semantics::Symbol *sym) {
+  if (useDelayedPrivatization) {
+    auto ip = firOpBuilder.saveInsertionPoint();
+
+    auto moduleOp = firOpBuilder.getInsertionBlock()
+                        ->getParentOp()
+                        ->getParentOfType<mlir::ModuleOp>();
+
+    firOpBuilder.setInsertionPoint(&moduleOp.getBodyRegion().front(),
+                                   moduleOp.getBodyRegion().front().end());
+
+    Fortran::lower::SymbolBox hsb = converter.lookupOneLevelUpSymbol(*sym);
+    assert(hsb && "Host symbol box not found");
+
+    mlir::Type symType = hsb.getAddr().getType();
+    mlir::Location symLoc = hsb.getAddr().getLoc();
+    std::string privatizerName = sym->name().ToString() + ".privatizer";
+
+    unsigned uniquingCounter = 0;
+    auto uniquePrivatizerName = mlir::SymbolTable::generateSymbolName<64>(
+        privatizerName,
+        [&](auto &suggestedName) {
+          return existingPrivatizerNames.count(suggestedName);
+        },
+        uniquingCounter);
+
+    auto privatizerOp = firOpBuilder.create<mlir::omp::PrivateClauseOp>(
+        symLoc, symType, uniquePrivatizerName);
+    firOpBuilder.setInsertionPointToEnd(&privatizerOp.getBody().front());
+
+    symTable->pushScope();
+    symTable->addSymbol(*sym, privatizerOp.getArgument(0));
+    symTable->pushScope();
+
+    cloneSymbol(sym);
+    copyFirstPrivateSymbol(sym);
+
+    firOpBuilder.create<mlir::omp::YieldOp>(
+        hsb.getAddr().getLoc(), symTable->shallowLookupSymbol(*sym).getAddr());
+
+    symTable->popScope();
+    symTable->popScope();
+    firOpBuilder.restoreInsertionPoint(ip);
+
+    delayedPrivatizationInfo.privatizers.insert(
+        mlir::SymbolRefAttr::get(privatizerOp));
+    delayedPrivatizationInfo.hostAddresses.insert(hsb.getAddr());
+    delayedPrivatizationInfo.hostSymbols.insert(sym);
+    existingPrivatizerNames.insert(uniquePrivatizerName);
+  } else {
+    cloneSymbol(sym);
+    copyFirstPrivateSymbol(sym);
+  }
+}
+
 //===----------------------------------------------------------------------===//
 // ClauseProcessor
 //===----------------------------------------------------------------------===//
@@ -2267,7 +2357,9 @@ static void createBodyOfOp(
     Op &op, Fortran::lower::AbstractConverter &converter, mlir::Location &loc,
     Fortran::lower::pft::Evaluation &eval, bool genNested,
     const Fortran::parser::OmpClauseList *clauses = nullptr,
-    const llvm::SmallVector<const Fortran::semantics::Symbol *> &args = {},
+    std::function<llvm::SmallVector<const Fortran::semantics::Symbol *>(
+        mlir::Operation *)>
+        genRegionEntryCB = nullptr,
     bool outerCombined = false, DataSharingProcessor *dsp = nullptr) {
   fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
 
@@ -2281,27 +2373,15 @@ static void createBodyOfOp(
   // argument. Also update the symbol's address with the mlir argument value.
   // e.g. For loops the argument is the induction variable. And all further
   // uses of the induction variable should use this mlir value.
-  if (args.size()) {
-    std::size_t loopVarTypeSize = 0;
-    for (const Fortran::semantics::Symbol *arg : args)
-      loopVarTypeSize = std::max(loopVarTypeSize, arg->GetUltimate().size());
-    mlir::Type loopVarType = getLoopVarType(converter, loopVarTypeSize);
-    llvm::SmallVector<mlir::Type> tiv(args.size(), loopVarType);
-    llvm::SmallVector<mlir::Location> locs(args.size(), loc);
-    firOpBuilder.createBlock(&op.getRegion(), {}, tiv, locs);
-    // The argument is not currently in memory, so make a temporary for the
-    // argument, and store it there, then bind that location to the argument.
-    mlir::Operation *storeOp = nullptr;
-    for (auto [argIndex, argSymbol] : llvm::enumerate(args)) {
-      mlir::Value indexVal =
-          fir::getBase(op.getRegion().front().getArgument(argIndex));
-      storeOp =
-          createAndSetPrivatizedLoopVar(converter, loc, indexVal, argSymbol);
+  auto regionArgs =
+      [&]() -> llvm::SmallVector<const Fortran::semantics::Symbol *> {
+    if (genRegionEntryCB != nullptr) {
+      return genRegionEntryCB(op);
     }
-    firOpBuilder.setInsertionPointAfter(storeOp);
-  } else {
+
     firOpBuilder.createBlock(&op.getRegion());
-  }
+    return {};
+  }();
 
   // Mark the earliest insertion point.
   mlir::Operation *marker = insertMarker(firOpBuilder);
@@ -2399,8 +2479,8 @@ static void createBodyOfOp(
         assert(tempDsp.has_value());
         tempDsp->processStep2(op, isLoop);
       } else {
-        if (isLoop && args.size() > 0)
-          dsp->setLoopIV(converter.getSymbolAddress(*args[0]));
+        if (isLoop && regionArgs.size() > 0)
+          dsp->setLoopIV(converter.getSymbolAddress(*regionArgs[0]));
         dsp->processStep2(op, isLoop);
       }
     }
@@ -2476,16 +2556,19 @@ static void genBodyOfTargetDataOp(
 }
 
 template <typename OpTy, typename... Args>
-static OpTy genOpWithBody(Fortran::lower::AbstractConverter &converter,
-                          Fortran::lower::pft::Evaluation &eval, bool genNested,
-                          mlir::Location currentLocation, bool outerCombined,
-                          const Fortran::parser::OmpClauseList *clauseList,
-                          Args &&...args) {
+static OpTy genOpWithBody(
+    Fortran::lower::AbstractConverter &converter,
+    Fortran::lower::pft::Evaluation &eval, bool genNested,
+    mlir::Location currentLocation, bool outerCombined,
+    const Fortran::parser::OmpClauseList *clauseList,
+    std::function<llvm::SmallVector<const Fortran::semantics::Symbol *>(
+        mlir::Operation *)>
+        genRegionEntryCB,
+    DataSharingProcessor *dsp, Args &&...args) {
   auto op = converter.getFirOpBuilder().create<OpTy>(
       currentLocation, std::forward<Args>(args)...);
   createBodyOfOp<OpTy>(op, converter, currentLocation, eval, genNested,
-                       clauseList,
-                       /*args=*/{}, outerCombined);
+                       clauseList, genRegionEntryCB, outerCombined, dsp);
   return op;
 }
 
@@ -2493,11 +2576,12 @@ static mlir::omp::MasterOp
 genMasterOp(Fortran::lower::AbstractConverter &converter,
             Fortran::lower::pft::Evaluation &eval, bool genNested,
             mlir::Location currentLocation) {
-  return genOpWithBody<mlir::omp::MasterOp>(converter, eval, genNested,
-                                            currentLocation,
-                                            /*outerCombined=*/false,
-                                            /*clauseList=*/nullptr,
-                                            /*resultTypes=*/mlir::TypeRange());
+  return genOpWithBody<mlir::omp::MasterOp>(
+      converter, eval, genNested, currentLocation,
+      /*outerCombined=*/false,
+      /*clauseList=*/nullptr, /*genRegionEntryCB=*/nullptr,
+      /*dsp=*/nullptr,
+      /*resultTypes=*/mlir::TypeRange());
 }
 
 static mlir::omp::OrderedRegionOp
@@ -2507,11 +2591,14 @@ genOrderedRegionOp(Fortran::lower::AbstractConverter &converter,
   return genOpWithBody<mlir::omp::OrderedRegionOp>(
       converter, eval, genNested, currentLocation,
       /*outerCombined=*/false,
-      /*clauseList=*/nullptr, /*simd=*/false);
+      /*clauseList=*/nullptr, /*genRegionEntryCB=*/nullptr,
+      /*dsp=*/nullptr,
+      /*simd=*/false);
 }
 
 static mlir::omp::ParallelOp
 genParallelOp(Fortran::lower::AbstractConverter &converter,
+              Fortran::lower::SymMap &symTable,
               Fortran::lower::pft::Evaluation &eval, bool genNested,
               mlir::Location currentLocation,
               const Fortran::parser::OmpClauseList &clauseList,
@@ -2533,8 +2620,67 @@ genParallelOp(Fortran::lower::AbstractConverter &converter,
   if (!outerCombined)
     cp.processReduction(currentLocation, reductionVars, reductionDeclSymbols);
 
+  if (!enableDelayedPrivatization) {
+    return genOpWithBody<mlir::omp::ParallelOp>(
+        converter, eval, genNested, currentLocation, outerCombined, &clauseList,
+        /*genRegionEntryCB=*/nullptr, /*dsp=*/nullptr,
+        /*resultTypes=*/mlir::TypeRange(), ifClauseOperand,
+        numThreadsClauseOperand, allocateOperands, allocatorOperands,
+        reductionVars,
+        reductionDeclSymbols.empty()
+            ? nullptr
+            : mlir::ArrayAttr::get(converter.getFirOpBuilder().getContext(),
+                                   reductionDeclSymbols),
+        procBindKindAttr, /*private_vars=*/llvm::SmallVector<mlir::Value>{},
+        /*privatizers=*/nullptr);
+  }
+
+  bool privatize = !outerCombined;
+  DataSharingProcessor dsp(converter, clauseList, eval,
+                           /*useDelayedPrivatization=*/true, &symTable);
+
+  if (privatize) {
+    dsp.processStep1();
+  }
+
+  const auto &delayedPrivatizationInfo = dsp.getDelayedPrivatizationInfo();
+  llvm::SmallVector<mlir::Attribute> privatizers(
+      delayedPrivatizationInfo.privatizers.begin(),
+      delayedPrivatizationInfo.privatizers.end());
+
+  llvm::SmallVector<mlir::Value> privateSymAddresses(
+      delayedPrivatizationInfo.hostAddresses.begin(),
+      delayedPrivatizationInfo.hostAddresses.end());
+
+  auto genRegionEntryCB = [&](mlir::Operation *op) {
+    auto parallelOp = llvm::cast<mlir::omp::ParallelOp>(op);
+    auto privateVars = parallelOp.getPrivateVars();
+    auto &region = parallelOp.getRegion();
+    llvm::SmallVector<mlir::Type> privateVarTypes;
+    llvm::SmallVector<mlir::Location> privateVarLocs;
+
+    for (auto privateVar : privateVars) {
+      privateVarTypes.push_back(privateVar.getType());
+      privateVarLocs.push_back(privateVar.getLoc());
+    }
+
+    converter.getFirOpBuilder().createBlock(&region, {}, privateVarTypes,
+                                            privateVarLocs);
+
+    int argIdx = 0;
+    for (const auto *sym : delayedPrivatizationInfo.hostSymbols) {
+      converter.bindSymbol(*sym, region.getArgument(argIdx));
+      ++argIdx;
+    }
+
+    return llvm::SmallVector<const Fortran::semantics::Symbol *>(
+        delayedPrivatizationInfo.hostSymbols.begin(),
+        delayedPrivatizationInfo.hostSymbols.end());
+  };
+
   return genOpWithBody<mlir::omp::ParallelOp>(
       converter, eval, genNested, currentLocation, outerCombined, &clauseList,
+      genRegionEntryCB, &dsp,
       /*resultTypes=*/mlir::TypeRange(), ifClauseOperand,
       numThreadsClauseOperand, allocateOperands, allocatorOperands,
       reductionVars,
@@ -2542,7 +2688,11 @@ genParallelOp(Fortran::lower::AbstractConverter &converter,
           ? nullptr
           : mlir::ArrayAttr::get(converter.getFirOpBuilder().getContext(),
                                  reductionDeclSymbols),
-      procBindKindAttr);
+      procBindKindAttr, privateSymAddresses,
+      privatizers.empty()
+          ? nullptr
+          : mlir::ArrayAttr::get(converter.getFirOpBuilder().getContext(),
+                                 privatizers));
 }
 
 static mlir::omp::SectionOp
@@ -2554,7 +2704,9 @@ genSectionOp(Fortran::lower::AbstractConverter &converter,
   // all privatization is done within `omp.section` operations.
   return genOpWithBody<mlir::omp::SectionOp>(
       converter, eval, genNested, currentLocation,
-      /*outerCombined=*/false, &sectionsClauseList);
+      /*outerCombined=*/false, &sectionsClauseList,
+      /*genRegionEntryCB=*/nullptr,
+      /*dsp=*/nullptr);
 }
 
 static mlir::omp::SingleOp
@@ -2575,8 +2727,8 @@ genSingleOp(Fortran::lower::AbstractConverter &converter,
 
   return genOpWithBody<mlir::omp::SingleOp>(
       converter, eval, genNested, currentLocation,
-      /*outerCombined=*/false, &beginClauseList, allocateOperands,
-      allocatorOperands, nowaitAttr);
+      /*outerCombined=*/false, &beginClauseList, /*genRegionEntryCB=*/nullptr,
+      /*dsp=*/nullptr, allocateOperands, allocatorOperands, nowaitAttr);
 }
 
 static mlir::omp::TaskOp
@@ -2608,8 +2760,9 @@ genTaskOp(Fortran::lower::AbstractConverter &converter,
 
   return genOpWithBody<mlir::omp::TaskOp>(
       converter, eval, genNested, currentLocation,
-      /*outerCombined=*/false, &clauseList, ifClauseOperand, finalClauseOperand,
-      untiedAttr, mergeableAttr,
+      /*outerCombined=*/false, &clauseList, /*genRegionEntryCB=*/nullptr,
+      /*dsp=*/nullptr, ifClauseOperand, finalClauseOperand, untiedAttr,
+      mergeableAttr,
       /*in_reduction_vars=*/mlir::ValueRange(),
       /*in_reductions=*/nullptr, priorityClauseOperand,
       dependTypeOperands.empty()
@@ -2631,7 +2784,8 @@ genTaskGroupOp(Fortran::lower::AbstractConverter &converter,
       currentLocation, llvm::omp::Directive::OMPD_taskgroup);
   return genOpWithBody<mlir::omp::TaskGroupOp>(
       converter, eval, genNested, currentLocation,
-      /*outerCombined=*/false, &clauseList,
+      /*outerCombined=*/false, &clauseList, /*genRegionEntryCB=*/nullptr,
+      /*dsp=*/nullptr,
       /*task_reduction_vars=*/mlir::ValueRange(),
       /*task_reductions=*/nullptr, allocateOperands, allocatorOperands);
 }
@@ -3015,6 +3169,8 @@ genTeamsOp(Fortran::lower::AbstractConverter &converter,
 
   return genOpWithBody<mlir::omp::TeamsOp>(
       converter, eval, genNested, currentLocation, outerCombined, &clauseList,
+      /*genRegionEntryCB=*/nullptr,
+      /*dsp=*/nullptr,
       /*num_teams_lower=*/nullptr, numTeamsClauseOperand, ifClauseOperand,
       threadLimitClauseOperand, allocateOperands, allocatorOperands,
       reductionVars,
@@ -3211,6 +3367,33 @@ static void convertLoopBounds(Fortran::lower::AbstractConverter &converter,
   }
 }
 
+static llvm::SmallVector<const Fortran::semantics::Symbol *> genCodeForIterVar(
+    mlir::Operation *op, Fortran::lower::AbstractConverter &converter,
+    mlir::Location &loc,
+    const llvm::SmallVector<const Fortran::semantics::Symbol *> &args) {
+  fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
+  auto &region = op->getRegion(0);
+
+  std::size_t loopVarTypeSize = 0;
+  for (const Fortran::semantics::Symbol *arg : args)
+    loopVarTypeSize = std::max(loopVarTypeSize, arg->GetUltimate().size());
+  mlir::Type loopVarType = getLoopVarType(converter, loopVarTypeSize);
+  llvm::SmallVector<mlir::Type> tiv(args.size(), loopVarType);
+  llvm::SmallVector<mlir::Location> locs(args.size(), loc);
+  firOpBuilder.createBlock(&region, {}, tiv, locs);
+  // The argument is not currently in memory, so make a temporary for the
+  // argument, and store it there, then bind that location to the argument.
+  mlir::Operation *storeOp = nullptr;
+  for (auto [argIndex, argSymbol] : llvm::enumerate(args)) {
+    mlir::Value indexVal = fir::getBase(region.front().getArgument(argIndex));
+    storeOp =
+        createAndSetPrivatizedLoopVar(converter, loc, indexVal, argSymbol);
+  }
+  firOpBuilder.setInsertionPointAfter(storeOp);
+
+  return args;
+}
+
 static void
 createSimdLoop(Fortran::lower::AbstractConverter &converter,
                Fortran::lower::pft::Evaluation &eval,
@@ -3258,9 +3441,14 @@ createSimdLoop(Fortran::lower::AbstractConverter &converter,
 
   auto *nestedEval = getCollapsedLoopEval(
       eval, Fortran::lower::getCollapseValue(loopOpClauseList));
+
+  auto ivCallback = [&](mlir::Operation *op) {
+    return genCodeForIterVar(op, converter, loc, iv);
+  };
+
   createBodyOfOp<mlir::omp::SimdLoopOp>(simdLoopOp, converter, loc, *nestedEval,
                                         /*genNested=*/true, &loopOpClauseList,
-                                        iv, /*outer=*/false, &dsp);
+                                        ivCallback, /*outer=*/false, &dsp);
 }
 
 static void createWsLoop(Fortran::lower::AbstractConverter &converter,
@@ -3333,8 +3521,14 @@ static void createWsLoop(Fortran::lower::AbstractConverter &converter,
 
   auto *nestedEval = getCollapsedLoopEval(
       eval, Fortran::lower::getCollapseValue(beginClauseList));
+
+  auto ivCallback = [&](mlir::Operation *op) {
+    return genCodeForIterVar(op, converter, loc, iv);
+  };
+
   createBodyOfOp<mlir::omp::WsLoopOp>(wsLoopOp, converter, loc, *nestedEval,
-                                      /*genNested=*/true, &beginClauseList, iv,
+                                      /*genNested=*/true, &beginClauseList,
+                                      ivCallback,
                                       /*outer=*/false, &dsp);
 }
 
@@ -3413,8 +3607,8 @@ static void genOMP(Fortran::lower::AbstractConverter &converter,
     if ((llvm::omp::allParallelSet & llvm::omp::loopConstructSet)
             .test(ompDirective)) {
       validDirective = true;
-      genParallelOp(converter, eval, /*genNested=*/false, currentLocation,
-                    loopOpClauseList,
+      genParallelOp(converter, symTable, eval, /*genNested=*/false,
+                    currentLocation, loopOpClauseList,
                     /*outerCombined=*/true);
     }
   }
@@ -3502,8 +3696,8 @@ genOMP(Fortran::lower::AbstractConverter &converter,
     genOrderedRegionOp(converter, eval, /*genNested=*/true, currentLocation);
     break;
   case llvm::omp::Directive::OMPD_parallel:
-    genParallelOp(converter, eval, /*genNested=*/true, currentLocation,
-                  beginClauseList);
+    genParallelOp(converter, symTable, eval, /*genNested=*/true,
+                  currentLocation, beginClauseList);
     break;
   case llvm::omp::Directive::OMPD_single:
     genSingleOp(converter, eval, /*genNested=*/true, currentLocation,
@@ -3562,8 +3756,8 @@ genOMP(Fortran::lower::AbstractConverter &converter,
           .test(directive.v)) {
     bool outerCombined =
         directive.v != llvm::omp::Directive::OMPD_target_parallel;
-    genParallelOp(converter, eval, /*genNested=*/false, currentLocation,
-                  beginClauseList, outerCombined);
+    genParallelOp(converter, symTable, eval, /*genNested=*/false,
+                  currentLocation, beginClauseList, outerCombined);
     combinedDirective = true;
   }
   if ((llvm::omp::workShareSet & llvm::omp::blockConstructSet)
@@ -3646,7 +3840,7 @@ genOMP(Fortran::lower::AbstractConverter &converter,
 
   // Parallel wrapper of PARALLEL SECTIONS construct
   if (dir == llvm::omp::Directive::OMPD_parallel_sections) {
-    genParallelOp(converter, eval,
+    genParallelOp(converter, symTable, eval,
                   /*genNested=*/false, currentLocation, sectionsClauseList,
                   /*outerCombined=*/true);
   } else {
@@ -3663,6 +3857,8 @@ genOMP(Fortran::lower::AbstractConverter &converter,
                                        /*genNested=*/false, currentLocation,
                                        /*outerCombined=*/false,
                                        /*clauseList=*/nullptr,
+                                       /*genRegionEntryCB=*/nullptr,
+                                       /*dsp=*/nullptr,
                                        /*reduction_vars=*/mlir::ValueRange(),
                                        /*reductions=*/nullptr, allocateOperands,
                                        allocatorOperands, nowaitClauseOperand);
diff --git a/flang/test/Lower/OpenMP/FIR/delayed_privatization.f90 b/flang/test/Lower/OpenMP/FIR/delayed_privatization.f90
new file mode 100644
index 00000000000000..bb978bc1198af5
--- /dev/null
+++ b/flang/test/Lower/OpenMP/FIR/delayed_privatization.f90
@@ -0,0 +1,182 @@
+! TODO Convert this file into a bunch of lit tests for each conversion step.
+
+! RUN: bbc -fopenmp -emit-fir --openmp-enable-delayed-privatization -hlfir=false %s -o - 
+
+subroutine delayed_privatization()
+  integer :: var1
+  integer :: var2
+
+  var1 = 111
+  var2 = 222
+
+!$OMP PARALLEL FIRSTPRIVATE(var1, var2)
+  var1 = var1 + var2 + 2
+!$OMP END PARALLEL
+
+end subroutine
+
+! -----------------------------------------
+! ## This is what flang emits with the PoC:
+! -----------------------------------------
+!
+! ----------------------------
+! ### Conversion to FIR + OMP:
+! ----------------------------
+!module {
+!  func.func @_QPdelayed_privatization() {
+!    %0 = fir.alloca i32 {bindc_name = "var1", uniq_name = "_QFdelayed_privatizationEvar1"}
+!    %1 = fir.alloca i32 {bindc_name = "var2", uniq_name = "_QFdelayed_privatizationEvar2"}
+!    %c111_i32 = arith.constant 111 : i32
+!    fir.store %c111_i32 to %0 : !fir.ref<i32>
+!    %c222_i32 = arith.constant 222 : i32
+!    fir.store %c222_i32 to %1 : !fir.ref<i32>
+!    omp.parallel private(@var1.privatizer %0, @var2.privatizer %1 : !fir.ref<i32>, !fir.ref<i32>) {
+!    ^bb0(%arg0: !fir.ref<i32>, %arg1: !fir.ref<i32>):
+!      %2 = fir.load %arg0 : !fir.ref<i32>
+!      %3 = fir.load %arg1 : !fir.ref<i32>
+!      %4 = arith.addi %2, %3 : i32
+!      %c2_i32 = arith.constant 2 : i32
+!      %5 = arith.addi %4, %c2_i32 : i32
+!      fir.store %5 to %arg0 : !fir.ref<i32>
+!      omp.terminator
+!    }
+!    return
+!  }
+!  "omp.private"() <{function_type = (!fir.ref<i32>) -> !fir.ref<i32>, sym_name = "var1.privatizer"}> ({
+!  ^bb0(%arg0: !fir.ref<i32>):
+!    %0 = fir.alloca i32 {bindc_name = "var1", pinned, uniq_name = "_QFdelayed_privatizationEvar1"}
+!    %1 = fir.load %arg0 : !fir.ref<i32>
+!    fir.store %1 to %0 : !fir.ref<i32>
+!    omp.yield(%0 : !fir.ref<i32>)
+!  }) : () -> ()
+!  "omp.private"() <{function_type = (!fir.ref<i32>) -> !fir.ref<i32>, sym_name = "var2.privatizer"}> ({
+!  ^bb0(%arg0: !fir.ref<i32>):
+!    %0 = fir.alloca i32 {bindc_name = "var2", pinned, uniq_name = "_QFdelayed_privatizationEvar2"}
+!    %1 = fir.load %arg0 : !fir.ref<i32>
+!    fir.store %1 to %0 : !fir.ref<i32>
+!    omp.yield(%0 : !fir.ref<i32>)
+!  }) : () -> ()
+!
+! -----------------------------
+! ### Conversion to LLVM + OMP:
+! -----------------------------
+!module {
+!  llvm.func @_QPdelayed_privatization() {
+!    %0 = llvm.mlir.constant(1 : i64) : i64
+!    %1 = llvm.alloca %0 x i32 {bindc_name = "var1"} : (i64) -> !llvm.ptr
+!    %2 = llvm.mlir.constant(1 : i64) : i64
+!    %3 = llvm.alloca %2 x i32 {bindc_name = "var2"} : (i64) -> !llvm.ptr
+!    %4 = llvm.mlir.constant(111 : i32) : i32
+!    llvm.store %4, %1 : i32, !llvm.ptr
+!    %5 = llvm.mlir.constant(222 : i32) : i32
+!    llvm.store %5, %3 : i32, !llvm.ptr
+!    omp.parallel private(@var1.privatizer %1, @var2.privatizer %3 : !llvm.ptr, !llvm.ptr) {
+!    ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr):
+!      %6 = llvm.load %arg0 : !llvm.ptr -> i32
+!      %7 = llvm.load %arg1 : !llvm.ptr -> i32
+!      %8 = llvm.add %6, %7  : i32
+!      %9 = llvm.mlir.constant(2 : i32) : i32
+!      %10 = llvm.add %8, %9  : i32
+!      llvm.store %10, %arg0 : i32, !llvm.ptr
+!      omp.terminator
+!    }
+!    llvm.return
+!  }
+!  "omp.private"() <{function_type = (!llvm.ptr) -> !llvm.ptr, sym_name = "var1.privatizer"}> ({
+!  ^bb0(%arg0: !llvm.ptr):
+!    %0 = llvm.mlir.constant(1 : i64) : i64
+!    %1 = llvm.alloca %0 x i32 {bindc_name = "var1", pinned} : (i64) -> !llvm.ptr
+!    %2 = llvm.load %arg0 : !llvm.ptr -> i32
+!    llvm.store %2, %1 : i32, !llvm.ptr
+!    omp.yield(%1 : !llvm.ptr)
+!  }) : () -> ()
+!  "omp.private"() <{function_type = (!llvm.ptr) -> !llvm.ptr, sym_name = "var2.privatizer"}> ({
+!  ^bb0(%arg0: !llvm.ptr):
+!    %0 = llvm.mlir.constant(1 : i64) : i64
+!    %1 = llvm.alloca %0 x i32 {bindc_name = "var2", pinned} : (i64) -> !llvm.ptr
+!    %2 = llvm.load %arg0 : !llvm.ptr -> i32
+!    llvm.store %2, %1 : i32, !llvm.ptr
+!    omp.yield(%1 : !llvm.ptr)
+!  }) : () -> ()
+!}
+!
+! --------------------------
+! ### Conversion to LLVM IR:
+! --------------------------
+!%struct.ident_t = type { i32, i32, i32, i32, ptr }
+
+!@0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1
+!@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr @0 }, align 8
+
+!define void @_QPdelayed_privatization() {
+!  %structArg = alloca { ptr, ptr }, align 8
+!  %1 = alloca i32, i64 1, align 4
+!  %2 = alloca i32, i64 1, align 4
+!  store i32 111, ptr %1, align 4
+!  store i32 222, ptr %2, align 4
+!  br label %entry
+
+!entry:                                            ; preds = %0
+!  %omp_global_thread_num = call i32 @__kmpc_global_thread_num(ptr @1)
+!  br label %omp_parallel
+
+!omp_parallel:                                     ; preds = %entry
+!  %gep_ = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 0
+!  store ptr %1, ptr %gep_, align 8
+!  %gep_2 = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 1
+!  store ptr %2, ptr %gep_2, align 8
+!  call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @1, i32 1, ptr @_QPdelayed_privatization..omp_par, ptr %structArg)
+!  br label %omp.par.outlined.exit
+
+!omp.par.outlined.exit:                            ; preds = %omp_parallel
+!  br label %omp.par.exit.split
+
+!omp.par.exit.split:                               ; preds = %omp.par.outlined.exit
+!  ret void
+!}
+
+!; Function Attrs: nounwind
+!define internal void @_QPdelayed_privatization..omp_par(ptr noalias %tid.addr, ptr noalias %zero.addr, ptr %0) #0 {
+!omp.par.entry:
+!  %gep_ = getelementptr { ptr, ptr }, ptr %0, i32 0, i32 0
+!  %loadgep_ = load ptr, ptr %gep_, align 8
+!  %gep_1 = getelementptr { ptr, ptr }, ptr %0, i32 0, i32 1
+!  %loadgep_2 = load ptr, ptr %gep_1, align 8
+!  %tid.addr.local = alloca i32, align 4
+!  %1 = load i32, ptr %tid.addr, align 4
+!  store i32 %1, ptr %tid.addr.local, align 4
+!  %tid = load i32, ptr %tid.addr.local, align 4
+!  %2 = alloca i32, i64 1, align 4
+!  %3 = load i32, ptr %loadgep_, align 4
+!  store i32 %3, ptr %2, align 4
+!  %4 = alloca i32, i64 1, align 4
+!  %5 = load i32, ptr %loadgep_2, align 4
+!  store i32 %5, ptr %4, align 4
+!  br label %omp.par.region
+
+!omp.par.region:                                   ; preds = %omp.par.entry
+!  br label %omp.par.region1
+
+!omp.par.region1:                                  ; preds = %omp.par.region
+!  %6 = load i32, ptr %2, align 4
+!  %7 = load i32, ptr %4, align 4
+!  %8 = add i32 %6, %7
+!  %9 = add i32 %8, 2
+!  store i32 %9, ptr %2, align 4
+!  br label %omp.region.cont
+
+!omp.region.cont:                                  ; preds = %omp.par.region1
+!  br label %omp.par.pre_finalize
+
+!omp.par.pre_finalize:                             ; preds = %omp.region.cont
+!  br label %omp.par.outlined.exit.exitStub
+
+!omp.par.outlined.exit.exitStub:                   ; preds = %omp.par.pre_finalize
+!  ret void
+!}
+
+!; Function Attrs: nounwind
+!declare i32 @__kmpc_global_thread_num(ptr) #0
+
+!; Function Attrs: nounwind
+!declare !callback !2 void @__kmpc_fork_call(ptr, i32, ptr, ...) #0
diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
index ca363505485773..3ee3f8fe5df8a9 100644
--- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
+++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
@@ -16,6 +16,7 @@
 
 include "mlir/IR/EnumAttr.td"
 include "mlir/IR/OpBase.td"
+include "mlir/Interfaces/FunctionInterfaces.td"
 include "mlir/Interfaces/SideEffectInterfaces.td"
 include "mlir/Interfaces/ControlFlowInterfaces.td"
 include "mlir/IR/SymbolInterfaces.td"
@@ -179,7 +180,9 @@ def ParallelOp : OpenMP_Op<"parallel", [
              Variadic<AnyType>:$allocators_vars,
              Variadic<OpenMP_PointerLikeType>:$reduction_vars,
              OptionalAttr<SymbolRefArrayAttr>:$reductions,
-             OptionalAttr<ProcBindKindAttr>:$proc_bind_val);
+             OptionalAttr<ProcBindKindAttr>:$proc_bind_val,
+             Variadic<AnyType>:$private_vars,
+             OptionalAttr<SymbolRefArrayAttr>:$privatizers);
 
   let regions = (region AnyRegion:$region);
 
@@ -203,6 +206,10 @@ def ParallelOp : OpenMP_Op<"parallel", [
                 $allocators_vars, type($allocators_vars)
               ) `)`
           | `proc_bind` `(` custom<ClauseAttr>($proc_bind_val) `)`
+          | `private` `(`
+              custom<PrivateVarList>(
+                $private_vars, type($private_vars), $privatizers
+              ) `)`
     ) $region attr-dict
   }];
   let hasVerifier = 1;
@@ -612,7 +619,7 @@ def SimdLoopOp : OpenMP_Op<"simdloop", [AttrSizedOperandSegments,
 def YieldOp : OpenMP_Op<"yield",
     [Pure, ReturnLike, Terminator,
      ParentOneOf<["WsLoopOp", "ReductionDeclareOp",
-     "AtomicUpdateOp", "SimdLoopOp"]>]> {
+     "AtomicUpdateOp", "SimdLoopOp", "PrivateClauseOp"]>]> {
   let summary = "loop yield and termination operation";
   let description = [{
     "omp.yield" yields SSA values from the OpenMP dialect op region and
@@ -1479,6 +1486,38 @@ def Target_UpdateDataOp: OpenMP_Op<"target_update_data",
 //===----------------------------------------------------------------------===//
 // 2.14.5 target construct
 //===----------------------------------------------------------------------===//
+// Function-like op that outlines the privatization logic of one variable. It
+// is referenced by symbol from the `private` clause of `omp.parallel`.
+def PrivateClauseOp : OpenMP_Op<"private", [
+    IsolatedFromAbove, FunctionOpInterface
+  ]> {
+  let summary = "Outlined privatization logic for a variable";
+  let description = [{
+    Declares a function-like "privatizer": a symbol whose body region holds the
+    privatization logic for a single variable. The custom builder gives the op
+    the function type `(T) -> T`, where `T` is the type of the privatized
+    variable, and creates an entry block with one argument of type `T` that
+    stands for the original variable. The body yields the privatized value
+    with `omp.yield`.
+
+    A privatizer is referenced from the `private` clause of `omp.parallel`,
+    which pairs each privatizer symbol with the variable it privatizes:
+
+    ```mlir
+    omp.parallel private(@x.privatizer %x : !llvm.ptr) { ... }
+    ```
+  }];
+
+  let arguments = (ins SymbolNameAttr:$sym_name,
+                       TypeAttrOf<FunctionType>:$function_type);
+
+  let regions = (region AnyRegion:$body);
+
+  // Convenience builder: derives the `(T) -> T` function type from
+  // `privateVarType` and adds the entry block with its single argument.
+  let builders = [OpBuilder<(ins
+    "::mlir::Type":$privateVarType,
+    "::llvm::StringRef":$privatizerName
+  )>];
+
+  let extraClassDeclaration = [{
+    /// Returns the region holding the privatization logic.
+    ::mlir::Region *getCallableRegion() {
+      return &getBody();
+    }
+
+    /// Returns the argument types of this function.
+    ArrayRef<Type> getArgumentTypes() {
+      return getFunctionType().getInputs();
+    }
+
+    /// Returns the result types of this function.
+    ArrayRef<Type> getResultTypes() {
+      return getFunctionType().getResults();
+    }
+  }];
+}
 
 def TargetOp : OpenMP_Op<"target",[IsolatedFromAbove, MapClauseOwningOpInterface,
                                    OutlineableOpenMPOpInterface, AttrSizedOperandSegments]> {
diff --git a/mlir/lib/Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp b/mlir/lib/Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp
index 730858ffc67a71..d4ccbdf6082932 100644
--- a/mlir/lib/Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp
+++ b/mlir/lib/Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp
@@ -46,6 +46,17 @@ struct RegionOpConversion : public ConvertOpToLLVMPattern<OpType> {
                                            *this->getTypeConverter())))
       return failure();
 
+    if constexpr (std::is_same_v<OpType, mlir::omp::PrivateClauseOp>) {
+      auto llvmType = this->getTypeConverter()->convertType(
+          adaptor.getFunctionType().getInput(0));
+
+      if (!llvmType)
+        return rewriter.notifyMatchFailure(curOp,
+                                           "signature conversion failed");
+      newOp.setFunctionType(
+          FunctionType::get(rewriter.getContext(), {llvmType}, {llvmType}));
+    }
+
     rewriter.eraseOp(curOp);
     return success();
   }
@@ -231,11 +242,12 @@ void mlir::configureOpenMPToLLVMConversionLegality(
       mlir::omp::DataOp, mlir::omp::OrderedRegionOp, mlir::omp::ParallelOp,
       mlir::omp::WsLoopOp, mlir::omp::SimdLoopOp, mlir::omp::MasterOp,
       mlir::omp::SectionOp, mlir::omp::SectionsOp, mlir::omp::SingleOp,
-      mlir::omp::TaskGroupOp, mlir::omp::TaskOp>([&](Operation *op) {
-    return typeConverter.isLegal(&op->getRegion(0)) &&
-           typeConverter.isLegal(op->getOperandTypes()) &&
-           typeConverter.isLegal(op->getResultTypes());
-  });
+      mlir::omp::TaskGroupOp, mlir::omp::TaskOp, mlir::omp::PrivateClauseOp>(
+      [&](Operation *op) {
+        return typeConverter.isLegal(&op->getRegion(0)) &&
+               typeConverter.isLegal(op->getOperandTypes()) &&
+               typeConverter.isLegal(op->getResultTypes());
+      });
   target.addDynamicallyLegalOp<
       mlir::omp::AtomicReadOp, mlir::omp::AtomicWriteOp, mlir::omp::FlushOp,
       mlir::omp::ThreadprivateOp, mlir::omp::YieldOp, mlir::omp::EnterDataOp,
@@ -275,6 +287,7 @@ void mlir::populateOpenMPToLLVMConversionPatterns(LLVMTypeConverter &converter,
       RegionOpConversion<omp::SimdLoopOp>, RegionOpConversion<omp::SingleOp>,
       RegionOpConversion<omp::TaskGroupOp>, RegionOpConversion<omp::TaskOp>,
       RegionOpConversion<omp::DataOp>, RegionOpConversion<omp::TargetOp>,
+      RegionOpConversion<omp::PrivateClauseOp>,
       RegionLessOpWithVarOperandsConversion<omp::AtomicWriteOp>,
       RegionOpWithVarOperandsConversion<omp::AtomicUpdateOp>,
       RegionLessOpWithVarOperandsConversion<omp::FlushOp>,
diff --git a/mlir/lib/Conversion/SCFToOpenMP/SCFToOpenMP.cpp b/mlir/lib/Conversion/SCFToOpenMP/SCFToOpenMP.cpp
index 2f8b3f7e11de15..889aa755d8ba46 100644
--- a/mlir/lib/Conversion/SCFToOpenMP/SCFToOpenMP.cpp
+++ b/mlir/lib/Conversion/SCFToOpenMP/SCFToOpenMP.cpp
@@ -420,7 +420,9 @@ struct ParallelOpLowering : public OpRewritePattern<scf::ParallelOp> {
         /* allocators_vars = */ llvm::SmallVector<Value>{},
         /* reduction_vars = */ llvm::SmallVector<Value>{},
         /* reductions = */ ArrayAttr{},
-        /* proc_bind_val = */ omp::ClauseProcBindKindAttr{});
+        /* proc_bind_val = */ omp::ClauseProcBindKindAttr{},
+        /*private_vars=*/mlir::ValueRange{},
+        /*privatizers=*/nullptr);
     {
 
       OpBuilder::InsertionGuard guard(rewriter);
diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
index 381f17d0804191..5d4be49369ce51 100644
--- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
+++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
@@ -989,8 +989,10 @@ void ParallelOp::build(OpBuilder &builder, OperationState &state,
   ParallelOp::build(
       builder, state, /*if_expr_var=*/nullptr, /*num_threads_var=*/nullptr,
       /*allocate_vars=*/ValueRange(), /*allocators_vars=*/ValueRange(),
-      /*reduction_vars=*/ValueRange(), /*reductions=*/nullptr,
-      /*proc_bind_val=*/nullptr);
+      /*reduction_vars=*/ValueRange(),
+      /*reductions=*/nullptr,
+      /*proc_bind_val=*/nullptr, /*private_vars=*/ValueRange(),
+      /*privatizers*/ nullptr);
   state.addAttributes(attributes);
 }
 
@@ -1594,6 +1596,93 @@ LogicalResult DataBoundsOp::verify() {
   return success();
 }
 
+/// Builds a privatizer `privatizerName` of type (T) -> T, T = `privateVarType`.
+void PrivateClauseOp::build(OpBuilder &odsBuilder, OperationState &odsState,
+                            Type privateVarType, StringRef privatizerName) {
+  build(odsBuilder, odsState, privatizerName,
+        FunctionType::get(odsBuilder.getContext(), {privateVarType},
+                          {privateVarType}));
+  // The body starts as a single entry block with one `privateVarType` argument.
+  mlir::Region &body = *odsState.regions.front();
+  body.emplaceBlock().addArgument(privateVarType, odsState.location);
+}
+
+/// Parses the operand list of a `private` clause:
+///
+///   @privatizer %var (`,` @privatizer %var)* `:` type (`,` type)*
+///
+/// Appends each operand to `privateVarsOperands` and each type to
+/// `privateVarsTypes`, and sets `privatizersAttr` to the array of parsed
+/// privatizer symbols, in source order.
+static ParseResult parsePrivateVarList(
+    OpAsmParser &parser,
+    llvm::SmallVector<OpAsmParser::UnresolvedOperand, 4> &privateVarsOperands,
+    llvm::SmallVector<Type, 1> &privateVarsTypes, ArrayAttr &privatizersAttr) {
+  SmallVector<Attribute> privatizers;
+
+  auto parsePrivatizer = [&]() -> ParseResult {
+    SymbolRefAttr privatizerSym;
+    OpAsmParser::UnresolvedOperand arg;
+    if (parser.parseAttribute(privatizerSym) || parser.parseOperand(arg))
+      return failure();
+
+    privatizers.push_back(privatizerSym);
+    privateVarsOperands.push_back(arg);
+    return success();
+  };
+
+  auto parseType = [&]() -> ParseResult {
+    Type argType;
+    if (parser.parseType(argType))
+      return failure();
+    privateVarsTypes.push_back(argType);
+    return success();
+  };
+
+  if (parser.parseCommaSeparatedList(parsePrivatizer))
+    return failure();
+
+  // Materialize the collected symbols as the `privatizers` array attribute.
+  privatizersAttr = ArrayAttr::get(parser.getContext(), privatizers);
+
+  if (parser.parseColon() || parser.parseCommaSeparatedList(parseType))
+    return failure();
+
+  return success();
+}
+
+/// Prints the operand list of a `private` clause in the form
+/// `@privatizer %var, ... : type, ...`, i.e. the syntax accepted by
+/// parsePrivateVarList. `op` is not used.
+static void printPrivateVarList(OpAsmPrinter &printer, Operation *op,
+                                OperandRange privateVars,
+                                TypeRange privateVarTypes,
+                                std::optional<ArrayAttr> privatizersAttr) {
+  // TODO Add an op verifier instead of these assertions.
+  assert(privateVars.size() == privateVarTypes.size());
+  // Make sure the optional holds a value *before* dereferencing it; reading
+  // an empty optional is undefined behavior.
+  assert(privateVars.empty() ||
+         (privatizersAttr && *privatizersAttr &&
+          privatizersAttr->size() == privateVars.size()));
+
+  unsigned argIndex = 0;
+  for (const auto &privateVar : privateVars) {
+    printer << (*privatizersAttr)[argIndex] << " " << privateVar;
+    if (++argIndex < privateVars.size())
+      printer << ", ";
+  }
+
+  printer << " : ";
+
+  argIndex = 0;
+  for (const auto &privateVarType : privateVarTypes) {
+    printer << privateVarType;
+    if (++argIndex < privateVarTypes.size())
+      printer << ", ";
+  }
+}
+
 #define GET_ATTRDEF_CLASSES
 #include "mlir/Dialect/OpenMP/OpenMPOpsAttributes.cpp.inc"
 
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 79956f82ed141a..4de3f32c179087 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -1000,6 +1000,29 @@ convertOmpWsLoop(Operation &opInst, llvm::IRBuilderBase &builder,
   return success();
 }
 
+/// Replace the region arguments of the parallel op (which correspond to private
+/// variables) with the actual private variables they correspond to. This
+/// prepares the parallel op so that it matches what is expected by the
+/// OMPIRBuilder. The i-th region argument is paired with the i-th private var.
+static void prepareOmpParallel(omp::ParallelOp opInst) {
+  auto &region = opInst.getRegion();
+  auto privateVars = opInst.getPrivateVars();
+
+  // Indexing `privateVars` past its end would be undefined behavior.
+  assert(region.getNumArguments() == privateVars.size() &&
+         "expected one region argument per private variable");
+
+  // Replace all uses, including those nested in inner regions, which a scan
+  // over the operands of the top-level ops alone would miss.
+  for (unsigned argIdx = 0; argIdx < region.getNumArguments(); ++argIdx)
+    region.getArgument(argIdx).replaceAllUsesWith(privateVars[argIdx]);
+
+  // Always erase the front argument: erasing shifts the remaining arguments
+  // down, so an increasing index would skip every other argument.
+  while (region.getNumArguments() != 0)
+    region.eraseArgument(0);
+}
+
 /// Converts the OpenMP parallel operation to LLVM IR.
 static LogicalResult
 convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
@@ -1008,6 +1031,7 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
   // TODO: support error propagation in OpenMPIRBuilder and use it instead of
   // relying on captured variables.
   LogicalResult bodyGenStatus = success();
+  prepareOmpParallel(opInst);
   llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();
 
   auto bodyGenCB = [&](InsertPointTy allocaIP, InsertPointTy codeGenIP) {
@@ -1092,6 +1116,75 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
                     llvm::Value *&replacementValue) -> InsertPointTy {
     replacementValue = &vPtr;
 
+    // If this is a private value, this lambda will return the corresponding
+    // mlir value and its `PrivateClauseOp`. Otherwise, empty values are
+    // returned.
+    auto [privVar,
+          privInit] = [&]() -> std::pair<mlir::Value, omp::PrivateClauseOp> {
+      if (!opInst.getPrivateVars().empty()) {
+        auto privVars = opInst.getPrivateVars();
+        auto privInits = opInst.getPrivatizers();
+        assert(privInits && privInits->size() == privVars.size());
+
+        const auto *privInitIt = privInits->begin();
+        for (auto privVarIt = privVars.begin(); privVarIt != privVars.end();
+             ++privVarIt, ++privInitIt) {
+          auto *llvmPrivVarOp = moduleTranslation.lookupValue(*privVarIt);
+          if (llvmPrivVarOp != &vPtr) {
+            continue;
+          }
+
+          auto privSym = llvm::cast<SymbolRefAttr>(*privInitIt);
+          auto privOp =
+              SymbolTable::lookupNearestSymbolFrom<omp::PrivateClauseOp>(
+                  opInst, privSym);
+
+          return {*privVarIt, privOp};
+        }
+      }
+
+      return {mlir::Value(), omp::PrivateClauseOp()};
+    }();
+
+    if (privVar) {
+
+      // Replace the privatizer block argument with mlir value being privatized.
+      // This way, the body of the privatizer will be changed from using the
+      // region/block argument to the value being privatized.
+      assert(privInit->getRegions().front().getNumArguments() == 1);
+
+      auto arg = privInit->getRegions().front().getArgument(0);
+      for (auto &op : privInit->getRegions().front().front()) {
+        op.replaceUsesOfWith(arg, privVar);
+      }
+
+      auto oldIP = builder.saveIP();
+      builder.restoreIP(allocaIP);
+
+      // Temporarily unlink the terminator from its parent since
+      // `inlineConvertOmpRegions` expects the insertion block to **not**
+      // contain a terminator.
+      auto &allocaTerminator = builder.GetInsertBlock()->back();
+      assert(allocaTerminator.isTerminator());
+      allocaTerminator.removeFromParent();
+
+      SmallVector<llvm::Value *, 1> yieldedValues;
+      if (failed(inlineConvertOmpRegions(privInit->getRegion(0),
+                                         "omp.privatizer", builder,
+                                         moduleTranslation, &yieldedValues))) {
+        // TODO proper error-handling.
+        builder.restoreIP(oldIP);
+        return codeGenIP;
+      }
+
+      allocaTerminator.insertAfter(&builder.GetInsertBlock()->back());
+
+      assert(yieldedValues.size() == 1);
+      replacementValue = yieldedValues.front();
+
+      builder.restoreIP(oldIP);
+    }
+
     return codeGenIP;
   };
 
@@ -3009,12 +3102,13 @@ LogicalResult OpenMPDialectLLVMIRTranslationInterface::convertOperation(
       .Case([&](omp::TargetOp) {
         return convertOmpTarget(*op, builder, moduleTranslation);
       })
-      .Case<omp::MapInfoOp, omp::DataBoundsOp>([&](auto op) {
-        // No-op, should be handled by relevant owning operations e.g.
-        // TargetOp, EnterDataOp, ExitDataOp, DataOp etc. and then
-        // discarded
-        return success();
-      })
+      .Case<omp::MapInfoOp, omp::DataBoundsOp, omp::PrivateClauseOp>(
+          [&](auto op) {
+            // No-op, should be handled by relevant owning operations e.g.
+            // TargetOp, EnterDataOp, ExitDataOp, DataOp etc. and then
+            // discarded
+            return success();
+          })
       .Default([&](Operation *inst) {
         return inst->emitError("unsupported OpenMP operation: ")
                << inst->getName();
diff --git a/mlir/test/Dialect/OpenMP/ops.mlir b/mlir/test/Dialect/OpenMP/ops.mlir
index 65a704d18107b5..0335e5c951f24c 100644
--- a/mlir/test/Dialect/OpenMP/ops.mlir
+++ b/mlir/test/Dialect/OpenMP/ops.mlir
@@ -59,7 +59,7 @@ func.func @omp_parallel(%data_var : memref<i32>, %if_cond : i1, %num_threads : i
   // CHECK: omp.parallel num_threads(%{{.*}} : i32) allocate(%{{.*}} : memref<i32> -> %{{.*}} : memref<i32>)
     "omp.parallel"(%num_threads, %data_var, %data_var) ({
       omp.terminator
-    }) {operandSegmentSizes = array<i32: 0,1,1,1,0>} : (i32, memref<i32>, memref<i32>) -> ()
+    }) {operandSegmentSizes = array<i32: 0,1,1,1,0,0>} : (i32, memref<i32>, memref<i32>) -> ()
 
   // CHECK: omp.barrier
     omp.barrier
@@ -68,22 +68,22 @@ func.func @omp_parallel(%data_var : memref<i32>, %if_cond : i1, %num_threads : i
   // CHECK: omp.parallel if(%{{.*}}) allocate(%{{.*}} : memref<i32> -> %{{.*}} : memref<i32>)
     "omp.parallel"(%if_cond, %data_var, %data_var) ({
       omp.terminator
-    }) {operandSegmentSizes = array<i32: 1,0,1,1,0>} : (i1, memref<i32>, memref<i32>) -> ()
+    }) {operandSegmentSizes = array<i32: 1,0,1,1,0,0>} : (i1, memref<i32>, memref<i32>) -> ()
 
   // test without allocate
   // CHECK: omp.parallel if(%{{.*}}) num_threads(%{{.*}} : i32)
     "omp.parallel"(%if_cond, %num_threads) ({
       omp.terminator
-    }) {operandSegmentSizes = array<i32: 1,1,0,0,0>} : (i1, i32) -> ()
+    }) {operandSegmentSizes = array<i32: 1,1,0,0,0,0>} : (i1, i32) -> ()
 
     omp.terminator
-  }) {operandSegmentSizes = array<i32: 1,1,1,1,0>, proc_bind_val = #omp<procbindkind spread>} : (i1, i32, memref<i32>, memref<i32>) -> ()
+  }) {operandSegmentSizes = array<i32: 1,1,1,1,0,0>, proc_bind_val = #omp<procbindkind spread>} : (i1, i32, memref<i32>, memref<i32>) -> ()
 
   // test with multiple parameters for single variadic argument
   // CHECK: omp.parallel allocate(%{{.*}} : memref<i32> -> %{{.*}} : memref<i32>)
   "omp.parallel" (%data_var, %data_var) ({
     omp.terminator
-  }) {operandSegmentSizes = array<i32: 0,0,1,1,0>} : (memref<i32>, memref<i32>) -> ()
+  }) {operandSegmentSizes = array<i32: 0,0,1,1,0,0>} : (memref<i32>, memref<i32>) -> ()
 
   return
 }
diff --git a/mlir/test/Dialect/OpenMP/roundtrip.mlir b/mlir/test/Dialect/OpenMP/roundtrip.mlir
new file mode 100644
index 00000000000000..c6e9fab6f7f98a
--- /dev/null
+++ b/mlir/test/Dialect/OpenMP/roundtrip.mlir
@@ -0,0 +1,36 @@
+// RUN: fir-opt -verify-diagnostics %s | fir-opt | FileCheck %s
+
+// Round-trip of the `private` clause on `omp.parallel`: each private operand
+// is written next to the symbol of its privatizer, followed by the operand
+// types after the colon.
+// CHECK-LABEL: _QPprivate_clause
+func.func @_QPprivate_clause() {
+  %0 = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFprivate_clause_allocatableEx"}
+  %1 = fir.alloca i32 {bindc_name = "y", uniq_name = "_QFprivate_clause_allocatableEy"}
+
+  // CHECK: omp.parallel private(@x.privatizer %0, @y.privatizer %1 : !fir.ref<i32>, !fir.ref<i32>)
+  omp.parallel private(@x.privatizer %0, @y.privatizer %1: !fir.ref<i32>, !fir.ref<i32>) {
+    omp.terminator
+  }
+  return
+}
+
+// Privatizer referenced as @x.privatizer above: its region takes one
+// `!fir.ref<i32>` block argument and yields a newly allocated `!fir.ref<i32>`.
+// CHECK: "omp.private"() <{function_type = (!fir.ref<i32>) -> !fir.ref<i32>, sym_name = "x.privatizer"}> ({
+"omp.private"() <{function_type = (!fir.ref<i32>) -> !fir.ref<i32>, sym_name = "x.privatizer"}> ({
+// CHECK: ^bb0(%arg0: {{.*}}):
+^bb0(%arg0: !fir.ref<i32>):
+
+  // CHECK: %0 = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFprivate_clause_allocatableEx"}
+  %0 = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFprivate_clause_allocatableEx"}
+
+  // CHECK: omp.yield(%0 : !fir.ref<i32>)
+  omp.yield(%0 : !fir.ref<i32>)
+}) : () -> ()
+
+// Privatizer referenced as @y.privatizer above; same shape as the one for `x`.
+// The block header is not re-checked here.
+// CHECK: "omp.private"() <{function_type = (!fir.ref<i32>) -> !fir.ref<i32>, sym_name = "y.privatizer"}> ({
+"omp.private"() <{function_type = (!fir.ref<i32>) -> !fir.ref<i32>, sym_name = "y.privatizer"}> ({
+^bb0(%arg0: !fir.ref<i32>):
+
+  // CHECK: %0 = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFprivate_clause_allocatableEy"}
+  %0 = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFprivate_clause_allocatableEy"}
+
+  // CHECK: omp.yield(%0 : !fir.ref<i32>)
+  omp.yield(%0 : !fir.ref<i32>)
+}) : () -> ()

>From 717b57ad03127edc6031c0f6ef489a9498958241 Mon Sep 17 00:00:00 2001
From: ergawy <kareem.ergawy at amd.com>
Date: Wed, 7 Feb 2024 04:21:42 -0600
Subject: [PATCH 2/2] Experiment with supporting delayed privatization for
 hlfir simple values

---
 flang/include/flang/Lower/SymbolMap.h         |  1 +
 flang/lib/Lower/Bridge.cpp                    |  5 +-
 flang/lib/Optimizer/CodeGen/CodeGen.cpp       | 18 ++++-
 .../OpenMP/FIR/delayed_privatization.f90      |  1 +
 .../FIR/delayed_privatization_hlfir.f90       | 71 +++++++++++++++++++
 5 files changed, 93 insertions(+), 3 deletions(-)
 create mode 100644 flang/test/Lower/OpenMP/FIR/delayed_privatization_hlfir.f90

diff --git a/flang/include/flang/Lower/SymbolMap.h b/flang/include/flang/Lower/SymbolMap.h
index a55e4b133fe0a8..9f18d63ce3e7dd 100644
--- a/flang/include/flang/Lower/SymbolMap.h
+++ b/flang/include/flang/Lower/SymbolMap.h
@@ -312,6 +312,7 @@ class SymMap {
   lookupVariableDefinition(semantics::SymbolRef sym) {
     if (auto symBox = lookupSymbol(sym))
       return symBox.getIfFortranVariableOpInterface();
+
     return std::nullopt;
   }
 
diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp
index 7a0804d57ff3ad..226792c9f346e5 100644
--- a/flang/lib/Lower/Bridge.cpp
+++ b/flang/lib/Lower/Bridge.cpp
@@ -1052,7 +1052,10 @@ class FirConverter : public Fortran::lower::AbstractConverter {
       if (sym.detailsIf<Fortran::semantics::CommonBlockDetails>())
         return symMap->lookupSymbol(sym);
 
-      return {};
+      // With delayed privatization, Fortran symbols might now be mapped to
+      // simple `mlir::Value`s (arguments to the `omp.private` ops in this
+      // case). Therefore, it is possible that none of the above cases applies.
+      // return {};
     }
     if (Fortran::lower::SymbolBox v = symMap->lookupSymbol(sym))
       return v;
diff --git a/flang/lib/Optimizer/CodeGen/CodeGen.cpp b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
index f89f28c006dece..e285a9a72bd9b0 100644
--- a/flang/lib/Optimizer/CodeGen/CodeGen.cpp
+++ b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
@@ -3505,6 +3505,18 @@ struct ZeroOpConversion : public FIROpConversion<fir::ZeroOp> {
   }
 };
 
+/// `fir.declare` --> replaced by its (already converted) memref operand.
+class DeclareOpConversion : public FIROpConversion<fir::DeclareOp> {
+public:
+  using FIROpConversion::FIROpConversion;
+
+  mlir::LogicalResult
+  matchAndRewrite(fir::DeclareOp declareOp, OpAdaptor adaptor,
+                  mlir::ConversionPatternRewriter &rewriter) const override {
+    // Use the adaptor's operand: it is the value remapped by the conversion
+    // framework, whereas `declareOp.getMemref()` is the pre-conversion value.
+    rewriter.replaceOp(declareOp, adaptor.getMemref());
+    return mlir::success();
+  }
+};
+
 /// `fir.unreachable` --> `llvm.unreachable`
 struct UnreachableOpConversion : public FIROpConversion<fir::UnreachableOp> {
   using FIROpConversion::FIROpConversion;
@@ -3856,6 +3868,7 @@ class RenameMSVCLibmFuncs
     return mlir::success();
   }
 };
+
 } // namespace
 
 namespace {
@@ -3949,7 +3962,7 @@ class FIRToLLVMLowering
         UnboxCharOpConversion, UnboxProcOpConversion, UndefOpConversion,
         UnreachableOpConversion, UnrealizedConversionCastOpConversion,
         XArrayCoorOpConversion, XEmboxOpConversion, XReboxOpConversion,
-        ZeroOpConversion>(typeConverter, options);
+        ZeroOpConversion, DeclareOpConversion>(typeConverter, options);
     mlir::populateFuncToLLVMConversionPatterns(typeConverter, pattern);
     mlir::populateOpenMPToLLVMConversionPatterns(typeConverter, pattern);
     mlir::arith::populateArithToLLVMConversionPatterns(typeConverter, pattern);
@@ -4002,7 +4015,8 @@ class FIRToLLVMLowering
       signalPassFailure();
     }
 
-    // Run pass to add comdats to functions that have weak linkage on relevant platforms
+    // Run pass to add comdats to functions that have weak linkage on relevant
+    // platforms
     if (fir::getTargetTriple(mod).supportsCOMDAT()) {
       mlir::OpPassManager comdatPM("builtin.module");
       comdatPM.addPass(mlir::LLVM::createLLVMAddComdats());
diff --git a/flang/test/Lower/OpenMP/FIR/delayed_privatization.f90 b/flang/test/Lower/OpenMP/FIR/delayed_privatization.f90
index bb978bc1198af5..d17e3c6da3caf2 100644
--- a/flang/test/Lower/OpenMP/FIR/delayed_privatization.f90
+++ b/flang/test/Lower/OpenMP/FIR/delayed_privatization.f90
@@ -3,6 +3,7 @@
 ! RUN: bbc -fopenmp -emit-fir --openmp-enable-delayed-privatization -hlfir=false %s -o - 
 
 subroutine delayed_privatization()
+  implicit none
   integer :: var1
   integer :: var2
 
diff --git a/flang/test/Lower/OpenMP/FIR/delayed_privatization_hlfir.f90 b/flang/test/Lower/OpenMP/FIR/delayed_privatization_hlfir.f90
new file mode 100644
index 00000000000000..48022d95aa6732
--- /dev/null
+++ b/flang/test/Lower/OpenMP/FIR/delayed_privatization_hlfir.f90
@@ -0,0 +1,71 @@
+! TODO Convert this file into a bunch of lit tests for each conversion step.
+
+! RUN: bbc -fopenmp -emit-hlfir --openmp-enable-delayed-privatization %s -o - 
+
+! Test input: two integer locals that both appear in a FIRSTPRIVATE clause; the
+! parallel region reads both and writes `var1`.
+subroutine delayed_privatization()
+  implicit none
+  integer :: var1
+  integer :: var2
+
+  ! Both variables are assigned before the parallel region is entered.
+  var1 = 111
+  var2 = 222
+
+!$OMP PARALLEL FIRSTPRIVATE(var1, var2)
+  var1 = var1 + var2 + 2
+!$OMP END PARALLEL
+
+end subroutine
+
+
+! -----------------------------------------
+! ## This is what flang emits with the PoC:
+! -----------------------------------------
+!
+! ----------------------------
+! ### Conversion to HLFIR + OMP:
+! ----------------------------
+!module {
+!  func.func @_QPdelayed_privatization() {
+!    %0 = fir.alloca i32 {bindc_name = "var1", uniq_name = "_QFdelayed_privatizationEvar1"}
+!    %1:2 = hlfir.declare %0 {uniq_name = "_QFdelayed_privatizationEvar1"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!    %2 = fir.alloca i32 {bindc_name = "var2", uniq_name = "_QFdelayed_privatizationEvar2"}
+!    %3:2 = hlfir.declare %2 {uniq_name = "_QFdelayed_privatizationEvar2"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!    %c111_i32 = arith.constant 111 : i32
+!    hlfir.assign %c111_i32 to %1#0 : i32, !fir.ref<i32>
+!    %c222_i32 = arith.constant 222 : i32
+!    hlfir.assign %c222_i32 to %3#0 : i32, !fir.ref<i32>
+!    omp.parallel private(@var1.privatizer_0 %1#0, @var2.privatizer_0 %3#0 : !fir.ref<i32>, !fir.ref<i32>) {
+!    ^bb0(%arg0: !fir.ref<i32>, %arg1: !fir.ref<i32>):
+!      %4:2 = hlfir.declare %arg0 {uniq_name = "_QFdelayed_privatizationEvar1"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!      %5:2 = hlfir.declare %arg1 {uniq_name = "_QFdelayed_privatizationEvar2"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!      %6 = fir.load %4#0 : !fir.ref<i32>
+!      %7 = fir.load %5#0 : !fir.ref<i32>
+!      %8 = arith.addi %6, %7 : i32
+!      %c2_i32 = arith.constant 2 : i32
+!      %9 = arith.addi %8, %c2_i32 : i32
+!      hlfir.assign %9 to %4#0 : i32, !fir.ref<i32>
+!      omp.terminator
+!    }
+!    return
+!  }
+!  "omp.private"() <{function_type = (!fir.ref<i32>) -> !fir.ref<i32>, sym_name = "var1.privatizer_0"}> ({
+!  ^bb0(%arg0: !fir.ref<i32>):
+!    %0 = fir.alloca i32 {bindc_name = "var1", pinned, uniq_name = "_QFdelayed_privatizationEvar1"}
+!    %1:2 = hlfir.declare %0 {uniq_name = "_QFdelayed_privatizationEvar1"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!    %2 = fir.load %arg0 : !fir.ref<i32>
+!    hlfir.assign %2 to %1#0 temporary_lhs : i32, !fir.ref<i32>
+!    omp.yield(%1#0 : !fir.ref<i32>)
+!  }) : () -> ()
+!  "omp.private"() <{function_type = (!fir.ref<i32>) -> !fir.ref<i32>, sym_name = "var2.privatizer_0"}> ({
+!  ^bb0(%arg0: !fir.ref<i32>):
+!    %0 = fir.alloca i32 {bindc_name = "var2", pinned, uniq_name = "_QFdelayed_privatizationEvar2"}
+!    %1:2 = hlfir.declare %0 {uniq_name = "_QFdelayed_privatizationEvar2"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!    %2 = fir.load %arg0 : !fir.ref<i32>
+!    hlfir.assign %2 to %1#0 temporary_lhs : i32, !fir.ref<i32>
+!    omp.yield(%1#0 : !fir.ref<i32>)
+!  }) : () -> ()
+!}
+!
+!
+! ### After lowering `hlfir` to `fir`, conversion to LLVM + OMP -> LLVM IR produces the exact same result as for
+! `delayed_privatization.f90`.



More information about the flang-commits mailing list