[flang] [llvm] [mlir] [flang][OpenMP] Extend delayed privatization for `omp.simd` (PR #122156)

Wed Jan 8 11:06:18 PST 2025

https://github.com/ergawy created https://github.com/llvm/llvm-project/pull/122156

Adds support for delayed privatization for `simd` directives. This PR includes PFT down to LLVM IR lowering.

>From a690dc9d565c2bf77a86face6319343e306eb5b8 Mon Sep 17 00:00:00 2001
From: ergawy <kareem.ergawy at amd.com>
Date: Tue, 7 Jan 2025 04:31:41 -0600
Subject: [PATCH] [flang][OpenMP] Extend delayed privatization for `omp.simd`

Adds support for delayed privatization for `simd` directives. This PR
includes PFT down to LLVM IR lowering.
---
 flang/lib/Lower/OpenMP/OpenMP.cpp             |   8 +-
 flang/test/Lower/OpenMP/order-clause.f90      |   6 +-
 .../Lower/OpenMP/parallel-private-clause.f90  |  32 +++--
 flang/test/Lower/OpenMP/simd.f90              |  28 ++---
 llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp     |   7 +-
 mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp  |   2 +-
 .../OpenMP/OpenMPToLLVMIRTranslation.cpp      |  25 +++-
 .../Target/LLVMIR/openmp-simd-private.mlir    | 117 ++++++++++++++++++
 mlir/test/Target/LLVMIR/openmp-todo.mlir      |  19 ---
 9 files changed, 182 insertions(+), 62 deletions(-)
 create mode 100644 mlir/test/Target/LLVMIR/openmp-simd-private.mlir

diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp
index cd4b25a17722c1..c71fd598d5c8a4 100644
--- a/flang/lib/Lower/OpenMP/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP/OpenMP.cpp
@@ -2144,11 +2144,10 @@ static void genStandaloneSimd(lower::AbstractConverter &converter,
   genSimdClauses(converter, semaCtx, item->clauses, loc, simdClauseOps,
                  simdReductionSyms);
 
-  // TODO: Support delayed privatization.
   DataSharingProcessor dsp(converter, semaCtx, item->clauses, eval,
                            /*shouldCollectPreDeterminedSymbols=*/true,
-                           /*useDelayedPrivatization=*/false, symTable);
-  dsp.processStep1();
+                           enableDelayedPrivatization, symTable);
+  dsp.processStep1(&simdClauseOps);
 
   mlir::omp::LoopNestOperands loopNestClauseOps;
   llvm::SmallVector<const semantics::Symbol *> iv;
@@ -2156,7 +2155,8 @@ static void genStandaloneSimd(lower::AbstractConverter &converter,
                      loopNestClauseOps, iv);
 
   EntryBlockArgs simdArgs;
-  // TODO: Add private syms and vars.
+  simdArgs.priv.syms = dsp.getDelayedPrivSymbols();
+  simdArgs.priv.vars = simdClauseOps.privateVars;
   simdArgs.reduction.syms = simdReductionSyms;
   simdArgs.reduction.vars = simdClauseOps.reductionVars;
   auto simdOp =
diff --git a/flang/test/Lower/OpenMP/order-clause.f90 b/flang/test/Lower/OpenMP/order-clause.f90
index 717d9740c56f80..a30d82979021da 100644
--- a/flang/test/Lower/OpenMP/order-clause.f90
+++ b/flang/test/Lower/OpenMP/order-clause.f90
@@ -4,15 +4,15 @@
 
 !CHECK-LABEL:   func.func @_QPsimd_order() {
 subroutine simd_order
-   !CHECK: omp.simd order(reproducible:concurrent) {
+   !CHECK: omp.simd order(reproducible:concurrent) private({{.*}}) {
    !$omp simd order(concurrent)
    do i = 1, 10
    end do
-   !CHECK: omp.simd order(reproducible:concurrent) {
+   !CHECK: omp.simd order(reproducible:concurrent) private({{.*}}) {
    !$omp simd order(reproducible:concurrent)
    do i = 1, 10
    end do
-   !CHECK: omp.simd order(unconstrained:concurrent) {
+   !CHECK: omp.simd order(unconstrained:concurrent) private({{.*}}) {
    !$omp simd order(unconstrained:concurrent)
    do i = 1, 10
    end do
diff --git a/flang/test/Lower/OpenMP/parallel-private-clause.f90 b/flang/test/Lower/OpenMP/parallel-private-clause.f90
index 8dc843fc9d96ae..7114314df05d3a 100644
--- a/flang/test/Lower/OpenMP/parallel-private-clause.f90
+++ b/flang/test/Lower/OpenMP/parallel-private-clause.f90
@@ -5,6 +5,23 @@
 ! RUN: bbc --use-desc-for-alloc=false -fopenmp -emit-hlfir %s -o - \
 ! RUN: | FileCheck %s --check-prefix=FIRDialect
 
+! FIRDialect: omp.private {type = private} @_QFsimd_loop_1Er_private_ref_box_heap_f32 {{.*}} alloc {
+! FIRDialect:     [[R:%.*]] = fir.alloca !fir.box<!fir.heap<f32>> {bindc_name = "r", pinned, uniq_name = "{{.*}}Er"}
+! FIRDialect:     fir.store {{%.*}} to [[R]] : !fir.ref<!fir.box<!fir.heap<f32>>>
+! FIRDialect:     fir.store {{%.*}} to [[R]] : !fir.ref<!fir.box<!fir.heap<f32>>>
+! FIRDialect:     [[R_DECL:%.*]]:2 = hlfir.declare [[R]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "{{.*}}r"} : (!fir.ref<!fir.box<!fir.heap<f32>>>) -> (!fir.ref<!fir.box<!fir.heap<f32>>>, !fir.ref<!fir.box<!fir.heap<f32>>>)
+! FIRDialect:     omp.yield([[R_DECL]]#0 : !fir.ref<!fir.box<!fir.heap<f32>>>)
+! FIRDialect:   } dealloc {
+! FIRDialect:  ^bb0([[R_DECL:%.*]]: !fir.ref<!fir.box<!fir.heap<f32>>>):
+! FIRDialect:     {{%.*}} = fir.load [[R_DECL]] : !fir.ref<!fir.box<!fir.heap<f32>>>
+! FIRDialect:     fir.if {{%.*}} {
+! FIRDialect:     [[LD:%.*]] = fir.load [[R_DECL]] : !fir.ref<!fir.box<!fir.heap<f32>>>
+! FIRDialect:     [[AD:%.*]] = fir.box_addr [[LD]] : (!fir.box<!fir.heap<f32>>) -> !fir.heap<f32>
+! FIRDialect:     fir.freemem [[AD]] : !fir.heap<f32>
+! FIRDialect:     fir.store {{%.*}} to [[R_DECL]] : !fir.ref<!fir.box<!fir.heap<f32>>>
+! FIRDialect:     omp.yield
+! FIRDialect:   }
+
 !FIRDialect: omp.private {type = private} @[[DERIVED_PRIVATIZER:_QFprivate_clause_derived_typeEt_private_ref_rec__QFprivate_clause_derived_typeTmy_type]] : !fir.ref<!fir.type<_QFprivate_clause_derived_typeTmy_type{t_i:i32,t_arr:!fir.array<5xi32>}>> alloc {
 !FIRDialect:   ^bb0(%{{.*}}: !fir.ref<!fir.type<_QFprivate_clause_derived_typeTmy_type{t_i:i32,t_arr:!fir.array<5xi32>}>>):
 !FIRDialect:     %[[PRIV_ALLOC:.*]] = fir.alloca !fir.type<_QFprivate_clause_derived_typeTmy_type{t_i:i32,t_arr:!fir.array<5xi32>}> {bindc_name = "t", pinned, uniq_name = "_QFprivate_clause_derived_typeEt"}
@@ -246,7 +263,6 @@ subroutine parallel_pointer()
 !$omp end parallel
 end subroutine parallel_pointer
 
-
 !FIRDialect-LABEL: func @_QPsimple_loop_1()
 subroutine simple_loop_1
   integer :: i
@@ -354,20 +370,17 @@ subroutine simple_loop_3
   ! FIRDialect:  omp.terminator
 end subroutine
 
+
 !CHECK-LABEL: func @_QPsimd_loop_1()
 subroutine simd_loop_1
   integer :: i
   real, allocatable :: r;
-  ! FIRDialect:     [[R:%.*]] = fir.alloca !fir.box<!fir.heap<f32>> {bindc_name = "r", pinned, uniq_name = "{{.*}}Er"}
-  ! FIRDialect:     fir.store {{%.*}} to [[R]] : !fir.ref<!fir.box<!fir.heap<f32>>>
-  ! FIRDialect:     fir.store {{%.*}} to [[R]] : !fir.ref<!fir.box<!fir.heap<f32>>>
-  ! FIRDialect:     [[R_DECL:%.*]]:2 = hlfir.declare [[R]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "{{.*}}r"} : (!fir.ref<!fir.box<!fir.heap<f32>>>) -> (!fir.ref<!fir.box<!fir.heap<f32>>>, !fir.ref<!fir.box<!fir.heap<f32>>>)
 
   ! FIRDialect:     %[[LB:.*]] = arith.constant 1 : i32
   ! FIRDialect:     %[[UB:.*]] = arith.constant 9 : i32
   ! FIRDialect:     %[[STEP:.*]] = arith.constant 1 : i32
 
-  ! FIRDialect: omp.simd {
+  ! FIRDialect: omp.simd private({{.*}}) {
   ! FIRDialect-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) {
   !$OMP SIMD PRIVATE(r)
   do i=1, 9
@@ -378,10 +391,5 @@ subroutine simd_loop_1
   end do
   !$OMP END SIMD
   ! FIRDialect:     omp.yield
-  ! FIRDialect:     {{%.*}} = fir.load [[R_DECL]]#0 : !fir.ref<!fir.box<!fir.heap<f32>>>
-  ! FIRDialect:     fir.if {{%.*}} {
-  ! FIRDialect:     [[LD:%.*]] = fir.load [[R_DECL]]#0 : !fir.ref<!fir.box<!fir.heap<f32>>>
-  ! FIRDialect:     [[AD:%.*]] = fir.box_addr [[LD]] : (!fir.box<!fir.heap<f32>>) -> !fir.heap<f32>
-  ! FIRDialect:     fir.freemem [[AD]] : !fir.heap<f32>
-  ! FIRDialect:     fir.store {{%.*}} to [[R_DECL]]#0 : !fir.ref<!fir.box<!fir.heap<f32>>>
+
 end subroutine
diff --git a/flang/test/Lower/OpenMP/simd.f90 b/flang/test/Lower/OpenMP/simd.f90
index d92f06cebfdbe5..0345ace24aaa04 100644
--- a/flang/test/Lower/OpenMP/simd.f90
+++ b/flang/test/Lower/OpenMP/simd.f90
@@ -13,7 +13,7 @@ subroutine simd
   ! CHECK: %[[LB:.*]] = arith.constant 1 : i32
   ! CHECK-NEXT: %[[UB:.*]] = arith.constant 9 : i32
   ! CHECK-NEXT: %[[STEP:.*]] = arith.constant 1 : i32
-  ! CHECK-NEXT: omp.simd {
+  ! CHECK-NEXT: omp.simd private({{.*}}) {
   ! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) {
   do i=1, 9
     ! CHECK: fir.store %[[I]] to %[[LOCAL:.*]]#1 : !fir.ref<i32>
@@ -33,7 +33,7 @@ subroutine simd_with_if_clause(n, threshold)
   ! CHECK: %[[LB:.*]] = arith.constant 1 : i32
   ! CHECK: %[[UB:.*]] = fir.load %[[ARG_N]]#0
   ! CHECK: %[[STEP:.*]] = arith.constant 1 : i32
-  ! CHECK: omp.simd if(%[[COND:.*]]) {
+  ! CHECK: omp.simd if(%[[COND:.*]]) private({{.*}}) {
   ! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) {
   do i = 1, n
     ! CHECK: fir.store %[[I]] to %[[LOCAL:.*]]#1 : !fir.ref<i32>
@@ -52,7 +52,7 @@ subroutine simd_with_simdlen_clause(n, threshold)
   ! CHECK: %[[LB:.*]] = arith.constant 1 : i32
   ! CHECK: %[[UB:.*]] = fir.load %[[ARG_N]]#0
   ! CHECK: %[[STEP:.*]] = arith.constant 1 : i32
-  ! CHECK: omp.simd simdlen(2) {
+  ! CHECK: omp.simd simdlen(2) private({{.*}}) {
   ! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) {
   do i = 1, n
     ! CHECK: fir.store %[[I]] to %[[LOCAL:.*]]#1 : !fir.ref<i32>
@@ -72,7 +72,7 @@ subroutine simd_with_simdlen_clause_from_param(n, threshold)
   ! CHECK: %[[LB:.*]] = arith.constant 1 : i32
   ! CHECK: %[[UB:.*]] = fir.load %[[ARG_N]]#0
   ! CHECK: %[[STEP:.*]] = arith.constant 1 : i32
-  ! CHECK: omp.simd simdlen(2) {
+  ! CHECK: omp.simd simdlen(2) private({{.*}}) {
   ! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) {
   do i = 1, n
     ! CHECK: fir.store %[[I]] to %[[LOCAL:.*]]#1 : !fir.ref<i32>
@@ -92,7 +92,7 @@ subroutine simd_with_simdlen_clause_from_expr_from_param(n, threshold)
   ! CHECK: %[[LB:.*]] = arith.constant 1 : i32
   ! CHECK: %[[UB:.*]] = fir.load %[[ARG_N]]#0
   ! CHECK: %[[STEP:.*]] = arith.constant 1 : i32
-  ! CHECK: omp.simd simdlen(6) {
+  ! CHECK: omp.simd simdlen(6) private({{.*}}) {
   ! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) {
   do i = 1, n
     ! CHECK: fir.store %[[I]] to %[[LOCAL:.*]]#1 : !fir.ref<i32>
@@ -111,7 +111,7 @@ subroutine simd_with_safelen_clause(n, threshold)
   ! CHECK: %[[LB:.*]] = arith.constant 1 : i32
   ! CHECK: %[[UB:.*]] = fir.load %[[ARG_N]]#0
   ! CHECK: %[[STEP:.*]] = arith.constant 1 : i32
-  ! CHECK: omp.simd safelen(2) {
+  ! CHECK: omp.simd safelen(2) private({{.*}}) {
   ! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) {
   do i = 1, n
     ! CHECK: fir.store %[[I]] to %[[LOCAL:.*]]#1 : !fir.ref<i32>
@@ -131,7 +131,7 @@ subroutine simd_with_safelen_clause_from_expr_from_param(n, threshold)
   ! CHECK: %[[LB:.*]] = arith.constant 1 : i32
   ! CHECK: %[[UB:.*]] = fir.load %[[ARG_N]]#0
   ! CHECK: %[[STEP:.*]] = arith.constant 1 : i32
-  ! CHECK: omp.simd safelen(6) {
+  ! CHECK: omp.simd safelen(6) private({{.*}}) {
   ! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) {
   do i = 1, n
     ! CHECK: fir.store %[[I]] to %[[LOCAL:.*]]#1 : !fir.ref<i32>
@@ -150,7 +150,7 @@ subroutine simd_with_simdlen_safelen_clause(n, threshold)
   ! CHECK: %[[LB:.*]] = arith.constant 1 : i32
   ! CHECK: %[[UB:.*]] = fir.load %[[ARG_N]]#0
   ! CHECK: %[[STEP:.*]] = arith.constant 1 : i32
-  ! CHECK: omp.simd safelen(2) simdlen(1) {
+  ! CHECK: omp.simd safelen(2) simdlen(1) private({{.*}}) {
   ! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) {
   do i = 1, n
     ! CHECK: fir.store %[[I]] to %[[LOCAL:.*]]#1 : !fir.ref<i32>
@@ -171,7 +171,7 @@ subroutine simd_with_collapse_clause(n)
   ! CHECK: %[[LOWER_J:.*]] = arith.constant 1 : i32
   ! CHECK: %[[UPPER_J:.*]] = fir.load %[[PARAM_ARG:.*]] : !fir.ref<i32>
   ! CHECK: %[[STEP_J:.*]] = arith.constant 1 : i32
-  ! CHECK: omp.simd {
+  ! CHECK: omp.simd private({{.*}}) {
   ! CHECK-NEXT: omp.loop_nest (%[[ARG_0:.*]], %[[ARG_1:.*]]) : i32 = (
   ! CHECK-SAME:                %[[LOWER_I]], %[[LOWER_J]]) to (
   ! CHECK-SAME:                %[[UPPER_I]], %[[UPPER_J]]) inclusive step (
@@ -235,7 +235,7 @@ subroutine simd_with_nontemporal_clause(n)
   !CHECK: %[[LB:.*]] = arith.constant 1 : i32
   !CHECK: %[[UB:.*]] = fir.load %{{.*}}#0 : !fir.ref<i32>
   !CHECK: %[[STEP:.*]] = arith.constant 1 : i32
-  !CHECK: omp.simd nontemporal(%[[A_DECL]]#1, %[[C_DECL]]#1 : !fir.ref<i32>, !fir.ref<i32>) {
+  !CHECK: omp.simd nontemporal(%[[A_DECL]]#1, %[[C_DECL]]#1 : !fir.ref<i32>, !fir.ref<i32>) private({{.*}}) {
   !CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) {
   !$OMP SIMD NONTEMPORAL(A, C)
   do i = 1, n
@@ -249,16 +249,14 @@ subroutine lastprivate_with_simd
 
 !CHECK: %[[VAR_SUM:.*]] = fir.alloca f32 {bindc_name = "sum", uniq_name = "_QFlastprivate_with_simdEsum"}
 !CHECK: %[[VAR_SUM_DECLARE:.*]]:2 = hlfir.declare %[[VAR_SUM]] {{.*}}
-!CHECK: %[[VAR_SUM_PINNED:.*]] = fir.alloca f32 {bindc_name = "sum", pinned, uniq_name = "_QFlastprivate_with_simdEsum"}
-!CHECK: %[[VAR_SUM_PINNED_DECLARE:.*]]:2 = hlfir.declare %[[VAR_SUM_PINNED]] {{.*}}
-
   implicit none
   integer :: i
   real :: sum
 
   
-!CHECK: omp.simd {
+!CHECK: omp.simd private(@_QFlastprivate_with_simdEsum_private_ref_f32 %[[VAR_SUM_DECLARE]]#0 -> %[[VAR_SUM_PINNED:.*]], @{{.*}}) {
 !CHECK: omp.loop_nest (%[[ARG:.*]]) : i32 = ({{.*}} to ({{.*}}) inclusive step ({{.*}}) {
+!CHECK: %[[VAR_SUM_PINNED_DECLARE:.*]]:2 = hlfir.declare %[[VAR_SUM_PINNED]] {{.*}}
 !CHECK: %[[ADD_RESULT:.*]] = arith.addi {{.*}}
 !CHECK: %[[ADD_RESULT_CONVERT:.*]] = fir.convert %[[ADD_RESULT]] : (i32) -> f32
 !CHECK: hlfir.assign %[[ADD_RESULT_CONVERT]] to %[[VAR_SUM_PINNED_DECLARE]]#0 : f32, !fir.ref<f32>
@@ -283,7 +281,7 @@ subroutine simd_with_reduction_clause
   ! CHECK: %[[LB:.*]] = arith.constant 1 : i32
   ! CHECK-NEXT: %[[UB:.*]] = arith.constant 9 : i32
   ! CHECK-NEXT: %[[STEP:.*]] = arith.constant 1 : i32
-  ! CHECK-NEXT: omp.simd reduction(@[[REDUCER]] %[[X:.*]]#0 -> %[[X_RED:.*]] : !fir.ref<i32>) {
+  ! CHECK-NEXT: omp.simd private({{.*}}) reduction(@[[REDUCER]] %[[X:.*]]#0 -> %[[X_RED:.*]] : !fir.ref<i32>) {
   ! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) {
   !$omp simd reduction(+:x)
   do i=1, 9
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 8dbf2aa7e0a243..12b032574a07a3 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -5207,12 +5207,7 @@ void OpenMPIRBuilder::createIfVersion(CanonicalLoopInfo *CanonicalLoop,
   Function *F = CanonicalLoop->getFunction();
 
   // Define where if branch should be inserted
-  Instruction *SplitBefore;
-  if (Instruction::classof(IfCond)) {
-    SplitBefore = dyn_cast<Instruction>(IfCond);
-  } else {
-    SplitBefore = CanonicalLoop->getPreheader()->getTerminator();
-  }
+  Instruction *SplitBefore = CanonicalLoop->getPreheader()->getTerminator();
 
   // TODO: We should not rely on pass manager. Currently we use pass manager
   // only for getting llvm::Loop which corresponds to given CanonicalLoopInfo
diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
index 3d62b3218869ea..ca7e08e9f18b5f 100644
--- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
+++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
@@ -2116,7 +2116,7 @@ void SimdOp::build(OpBuilder &builder, OperationState &state,
                 makeArrayAttr(ctx, clauses.alignments), clauses.ifExpr,
                 /*linear_vars=*/{}, /*linear_step_vars=*/{},
                 clauses.nontemporalVars, clauses.order, clauses.orderMod,
-                /*private_vars=*/{}, /*private_syms=*/nullptr,
+                clauses.privateVars, makeArrayAttr(ctx, clauses.privateSyms),
                 clauses.reductionVars,
                 makeDenseBoolArrayAttr(ctx, clauses.reductionByref),
                 makeArrayAttr(ctx, clauses.reductionSyms), clauses.safelen,
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 87cb7f03fec6aa..c055f84092685c 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -273,7 +273,6 @@ static LogicalResult checkImplementationStatus(Operation &op) {
       .Case([&](omp::SimdOp op) {
         checkLinear(op, result);
         checkNontemporal(op, result);
-        checkPrivate(op, result);
         checkReduction(op, result);
       })
       .Case<omp::AtomicReadOp, omp::AtomicWriteOp, omp::AtomicUpdateOp,
@@ -2221,8 +2220,28 @@ convertOmpSimd(Operation &opInst, llvm::IRBuilderBase &builder,
   if (failed(checkImplementationStatus(opInst)))
     return failure();
 
+  MutableArrayRef<BlockArgument> privateBlockArgs =
+      cast<omp::BlockArgOpenMPOpInterface>(*simdOp).getPrivateBlockArgs();
+  SmallVector<mlir::Value> mlirPrivateVars;
+  SmallVector<llvm::Value *> llvmPrivateVars;
+  SmallVector<omp::PrivateClauseOp> privateDecls;
+  mlirPrivateVars.reserve(privateBlockArgs.size());
+  llvmPrivateVars.reserve(privateBlockArgs.size());
+  collectPrivatizationDecls(simdOp, privateDecls);
+
+  for (mlir::Value privateVar : simdOp.getPrivateVars())
+    mlirPrivateVars.push_back(privateVar);
+
+  llvm::OpenMPIRBuilder::InsertPointTy allocaIP =
+      findAllocaInsertPoint(builder, moduleTranslation);
   llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder);
 
+  llvm::Expected<llvm::BasicBlock *> afterAllocas = allocatePrivateVars(
+      builder, moduleTranslation, privateBlockArgs, privateDecls,
+      mlirPrivateVars, llvmPrivateVars, allocaIP);
+  if (handleError(afterAllocas, opInst).failed())
+    return failure();
+
   // Generator of the canonical loop body.
   SmallVector<llvm::CanonicalLoopInfo *> loopInfos;
   SmallVector<llvm::OpenMPIRBuilder::InsertPointTy> bodyInsertPoints;
@@ -2322,7 +2341,9 @@ convertOmpSimd(Operation &opInst, llvm::IRBuilderBase &builder,
                         order, simdlen, safelen);
 
   builder.restoreIP(afterIP);
-  return success();
+
+  return cleanupPrivateVars(builder, moduleTranslation, simdOp.getLoc(),
+                            llvmPrivateVars, privateDecls);
 }
 
 /// Convert an Atomic Ordering attribute to llvm::AtomicOrdering.
diff --git a/mlir/test/Target/LLVMIR/openmp-simd-private.mlir b/mlir/test/Target/LLVMIR/openmp-simd-private.mlir
new file mode 100644
index 00000000000000..09d76f8edd0074
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/openmp-simd-private.mlir
@@ -0,0 +1,117 @@
+// RUN: mlir-translate -mlir-to-llvmir -split-input-file %s | FileCheck %s
+
+omp.private {type = private} @i_privatizer : !llvm.ptr alloc {
+^bb0(%arg0: !llvm.ptr):
+  %0 = llvm.mlir.constant(1 : i64) : i64
+  %1 = llvm.alloca %0 x i32 {bindc_name = "i", pinned} : (i64) -> !llvm.ptr
+  omp.yield(%1 : !llvm.ptr)
+}
+
+// CHECK-LABEL: test_loop_var_privatization()
+//                Original (non-privatized) allocation for `i`.
+// CHECK:         %{{.*}} = alloca i32, i64 1, align 4
+// CHECK:         %[[DUMMY:.*]] = alloca float, i64 1, align 4
+// CHECK:         %[[PRIV_I:.*]] = alloca i32, i64 1, align 4
+// CHECK:         br label %[[AFTER_ALLOC:.*]]
+
+// CHECK:       [[AFTER_ALLOC]]:
+// CHECK:         br label %[[ENTRY:.*]]
+
+// CHECK:       [[ENTRY]]:
+// CHECK:         br label %[[OMP_LOOP_PREHEADER:.*]]
+
+// CHECK:       [[OMP_LOOP_PREHEADER]]:
+// CHECK:         br label %[[OMP_LOOP_HEADER:.*]]
+
+// CHECK:       [[OMP_LOOP_HEADER]]:
+// CHECK:         %[[OMP_LOOP_IV:.*]] = phi i32 [ 0, %[[OMP_LOOP_PREHEADER]] ], [ %[[OMP_LOOP_NEXT:.*]], %[[OMP_LOOP_INC:.*]] ]
+// CHECK:         br label %[[OMP_LOOP_COND:.*]]
+
+// CHECK:       [[OMP_LOOP_COND]]:
+// CHECK:         %[[OMP_LOOP_CMP:.*]] = icmp ult i32 %[[OMP_LOOP_IV]], 10
+// CHECK:         br i1 %[[OMP_LOOP_CMP]], label %[[OMP_LOOP_BODY:.*]], label %[[OMP_LOOP_EXIT:.*]]
+
+// CHECK:       [[OMP_LOOP_BODY]]:
+// CHECK:         %[[IV_UPDATE:.*]] = mul i32 %[[OMP_LOOP_IV]], 1
+// CHECK:         %[[IV_UPDATE_2:.*]] = add i32 %[[IV_UPDATE]], 1
+// CHECK:         br label %[[OMP_SIMD_REGION:.*]]
+
+// CHECK:       [[OMP_SIMD_REGION]]:
+// CHECK:         store i32 %[[IV_UPDATE_2]], ptr %[[PRIV_I]], align 4
+// CHECK:         %[[DUMMY_VAL:.*]] = load float, ptr %[[DUMMY]], align 4
+// CHECK:         %[[PRIV_I_VAL:.*]] = load i32, ptr %[[PRIV_I]], align 4
+// CHECK:         %[[PRIV_I_VAL_FLT:.*]] = sitofp i32 %[[PRIV_I_VAL]] to float
+// CHECK:         %[[DUMMY_VAL_UPDATE:.*]] = fadd {{.*}} float %[[DUMMY_VAL]], %[[PRIV_I_VAL_FLT]]
+// CHECK:         store float %[[DUMMY_VAL_UPDATE]], ptr %[[DUMMY]], align 4, !llvm.access.group !1
+// CHECK:        br label %[[OMP_REGION_CONT:.*]]
+
+// CHECK:      [[OMP_REGION_CONT]]:
+// CHECK:        br label %[[OMP_LOOP_INC:.*]]
+
+// CHECK:      [[OMP_LOOP_INC]]:
+// CHECK:        %[[OMP_LOOP_NEXT:.*]] = add nuw i32 %[[OMP_LOOP_IV]], 1
+// CHECK:        br label %[[OMP_LOOP_HEADER]]
+
+// CHECK:      [[OMP_LOOP_EXIT]]:
+
+
+llvm.func @test_loop_var_privatization() attributes {fir.internal_name = "_QPtest_private_clause"} {
+  %0 = llvm.mlir.constant(1 : i64) : i64
+  %1 = llvm.alloca %0 x i32 {bindc_name = "i"} : (i64) -> !llvm.ptr
+  %3 = llvm.alloca %0 x f32 {bindc_name = "dummy"} : (i64) -> !llvm.ptr
+  %4 = llvm.mlir.constant(10 : i32) : i32
+  %5 = llvm.mlir.constant(1 : i32) : i32
+  omp.simd private(@i_privatizer %1 -> %arg0 : !llvm.ptr) {
+    omp.loop_nest (%arg1) : i32 = (%5) to (%4) inclusive step (%5) {
+      llvm.store %arg1, %arg0 : i32, !llvm.ptr
+      %8 = llvm.load %3 : !llvm.ptr -> f32
+      %9 = llvm.load %arg0 : !llvm.ptr -> i32
+      %10 = llvm.sitofp %9 : i32 to f32
+      %11 = llvm.fadd %8, %10 {fastmathFlags = #llvm.fastmath<contract>} : f32
+      llvm.store %11, %3 : f32, !llvm.ptr
+      omp.yield
+    }
+  }
+  llvm.return
+}
+
+omp.private {type = private} @dummy_privatizer : !llvm.ptr alloc {
+^bb0(%arg0: !llvm.ptr):
+  %0 = llvm.mlir.constant(1 : i64) : i64
+  %1 = llvm.alloca %0 x f32 {bindc_name = "dummy", pinned} : (i64) -> !llvm.ptr
+  omp.yield(%1 : !llvm.ptr)
+}
+
+// CHECK-LABEL: test_private_clause()
+//                Original (non-privatized) allocation for `i`.
+// CHECK:         %{{.*}} = alloca i32, i64 1, align 4
+//                Original (non-privatized) allocation for `dummy`.
+// CHECK:         %{{.*}} = alloca float, i64 1, align 4
+// CHECK:         %[[PRIV_DUMMY:.*]] = alloca float, i64 1, align 4
+// CHECK:         %[[PRIV_I:.*]] = alloca i32, i64 1, align 4
+
+// CHECK:       omp.simd.region:
+// CHECK-NOT:     br label
+// CHECK:         store i32 %{{.*}}, ptr %[[PRIV_I]], align 4
+// CHECK:        %{{.*}} = load float, ptr %[[PRIV_DUMMY]], align 4
+// CHECK:        store float %{{.*}}, ptr %[[PRIV_DUMMY]], align 4
+
+llvm.func @test_private_clause() attributes {fir.internal_name = "_QPtest_private_clause"} {
+  %0 = llvm.mlir.constant(1 : i64) : i64
+  %1 = llvm.alloca %0 x i32 {bindc_name = "i"} : (i64) -> !llvm.ptr
+  %3 = llvm.alloca %0 x f32 {bindc_name = "dummy"} : (i64) -> !llvm.ptr
+  %4 = llvm.mlir.constant(10 : i32) : i32
+  %5 = llvm.mlir.constant(1 : i32) : i32
+  omp.simd private(@dummy_privatizer %3 -> %arg0, @i_privatizer %1 -> %arg1 : !llvm.ptr, !llvm.ptr) {
+    omp.loop_nest (%arg2) : i32 = (%5) to (%4) inclusive step (%5) {
+      llvm.store %arg2, %arg1 : i32, !llvm.ptr
+      %8 = llvm.load %arg0 : !llvm.ptr -> f32
+      %9 = llvm.load %arg1 : !llvm.ptr -> i32
+      %10 = llvm.sitofp %9 : i32 to f32
+      %11 = llvm.fadd %8, %10 {fastmathFlags = #llvm.fastmath<contract>} : f32
+      llvm.store %11, %arg0 : f32, !llvm.ptr
+      omp.yield
+    }
+  }
+  llvm.return
+}
diff --git a/mlir/test/Target/LLVMIR/openmp-todo.mlir b/mlir/test/Target/LLVMIR/openmp-todo.mlir
index 83a0990d631620..f8c9d911e3034c 100644
--- a/mlir/test/Target/LLVMIR/openmp-todo.mlir
+++ b/mlir/test/Target/LLVMIR/openmp-todo.mlir
@@ -156,25 +156,6 @@ llvm.func @simd_nontemporal(%lb : i32, %ub : i32, %step : i32, %x : !llvm.ptr) {
 
 // -----
 
-omp.private {type = private} @x.privatizer : !llvm.ptr alloc {
-^bb0(%arg0: !llvm.ptr):
-  %0 = llvm.mlir.constant(1 : i32) : i32
-  %1 = llvm.alloca %0 x i32 : (i32) -> !llvm.ptr
-  omp.yield(%1 : !llvm.ptr)
-}
-llvm.func @simd_private(%lb : i32, %ub : i32, %step : i32, %x : !llvm.ptr) {
-  // expected-error at below {{not yet implemented: Unhandled clause privatization in omp.simd operation}}
-  // expected-error at below {{LLVM Translation failed for operation: omp.simd}}
-  omp.simd private(@x.privatizer %x -> %arg0 : !llvm.ptr) {
-    omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) {
-      omp.yield
-    }
-  }
-  llvm.return
-}
-
-// -----
-
 omp.declare_reduction @add_f32 : f32
 init {
 ^bb0(%arg: f32):