[llvm] [mlir] [mlir][OpenMP] - Implement lowering from MLIR to LLVMIR for `private` clause on `target` constructs (PR #105471)

Pranav Bhandarkar via llvm-commits llvm-commits at lists.llvm.org
Tue Aug 20 21:49:29 PDT 2024


https://github.com/bhandarkar-pranav created https://github.com/llvm/llvm-project/pull/105471

This patch contains support to lower the `private` clause when used while offloading. We implement this by inlining the `alloc` region of `omp.private` at the start of the `target` region and adjusting the uses so that the values yielded from the `alloc` region are used in the `target` region.

At this point, we do not handle the following (the first of the two below is being worked on)
- [ ] allocatables because we do not handle `dealloc` regions in this patch.
- [ ] firstprivate because there isn't support to lower firstprivate to mlir yet.

>From 9f28204b6ab27f32105ed79a4300a116966557dc Mon Sep 17 00:00:00 2001
From: Pranav Bhandarkar <pranav.bhandarkar at amd.com>
Date: Wed, 14 Aug 2024 11:25:31 -0500
Subject: [PATCH] [mlir][OpenMP] - Implement lowering from MLIR to LLVMIR for
 `private` clause on `target` constructs

This patch contains support to lower the `private` clause when used while offloading.
We implement this by inlining the `alloc` region of `omp.private` at the start
of the `target` region and adjusting the uses so that the values yielded from the `alloc`
region are used in the `target` region.

At this point, we do not handle the following (the first of the two below is being worked on)
- allocatables because we do not handle `dealloc` regions in this patch.
- firstprivate because there isn't support to lower firstprivate to mlir yet.
---
 mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp  |   7 +-
 .../OpenMP/OpenMPToLLVMIRTranslation.cpp      | 132 +++++++++++++++++-
 mlir/test/Dialect/OpenMP/invalid.mlir         |   2 +-
 .../Target/LLVMIR/openmp-target-private.mlir  |  73 ++++++++++
 .../offloading/fortran/target-private-map.f90 |  41 ++++++
 5 files changed, 252 insertions(+), 3 deletions(-)
 create mode 100644 mlir/test/Target/LLVMIR/openmp-target-private.mlir
 create mode 100644 offload/test/offloading/fortran/target-private-map.f90

diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
index 11780f84697b15..31d97313aedb7a 100644
--- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
+++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
@@ -2494,7 +2494,12 @@ LogicalResult PrivateClauseOp::verify() {
              << "Did not expect any values to be yielded.";
     }
 
-    if (yieldedTypes.size() == 1 && yieldedTypes.front() == symType)
+    if (yieldedTypes.size() != 1)
+      return mlir::emitError(terminator->getLoc())
+             << "Expected exactly 1 yielded value, but got "
+             << yieldedTypes.size();
+
+    if (yieldedTypes.front() == symType)
       return success();
 
     auto error = mlir::emitError(yieldOp.getLoc())
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 458d05d5059db7..af48b5379140c4 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -13,6 +13,7 @@
 #include "mlir/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.h"
 #include "mlir/Analysis/TopologicalSortUtils.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/Dialect/OpenMP/OpenMPClauseOperands.h"
 #include "mlir/Dialect/OpenMP/OpenMPDialect.h"
 #include "mlir/Dialect/OpenMP/OpenMPInterfaces.h"
 #include "mlir/IR/IRMapping.h"
@@ -1480,6 +1481,7 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
               counter);
 
           clone.setSymName(cloneName);
+
           return {privVar, clone};
         }
       }
@@ -3241,12 +3243,140 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder,
   DataLayout dl = DataLayout(opInst.getParentOfType<ModuleOp>());
   SmallVector<Value> mapVars = targetOp.getMapVars();
   llvm::Function *llvmOutlinedFn = nullptr;
-
   // TODO: It can also be false if a compile-time constant `false` IF clause is
   // specified.
   bool isOffloadEntry =
       isTargetDevice || !ompBuilder->Config.TargetTriples.empty();
 
+  if (!targetOp.getPrivateVars().empty()) {
+    auto privateVars = targetOp.getPrivateVars();
+    auto privateSyms = targetOp.getPrivateSyms();
+    auto &firstTargetRegion = opInst.getRegion(0);
+    auto &firstTargetBlock = firstTargetRegion.front();
+    auto *regionArgsStart = firstTargetBlock.getArguments().begin();
+    auto *privArgsStart = regionArgsStart + targetOp.getMapVars().size();
+    auto *privArgsEnd = privArgsStart + targetOp.getPrivateVars().size();
+    BitVector blockArgsBV(firstTargetBlock.getNumArguments(), false);
+    struct omp::PrivateClauseOps newPrivateClauses;
+    MutableArrayRef argSubRangePrivates(privArgsStart, privArgsEnd);
+    for (auto [privVar, privatizerNameAttr, blockArg] :
+         llvm::zip_equal(privateVars, *privateSyms, argSubRangePrivates)) {
+
+      SymbolRefAttr privSym = llvm::cast<SymbolRefAttr>(privatizerNameAttr);
+
+      // 1. Clone the privatizer so that we can make changes to it freely
+      MLIRContext &context = moduleTranslation.getContext();
+      mlir::IRRewriter rewriter(&context);
+      omp::PrivateClauseOp privatizer =
+          SymbolTable::lookupNearestSymbolFrom<omp::PrivateClauseOp>(&opInst,
+                                                                     privSym);
+      // Handle only private for now. Also, not handling allocatables yet.
+      if (privatizer.getDataSharingType() !=
+              omp::DataSharingClauseType::Private ||
+          !privatizer.getDeallocRegion().empty()) {
+        newPrivateClauses.privateSyms.push_back(privSym);
+        newPrivateClauses.privateVars.push_back(privVar);
+        continue;
+      }
+
+      auto &allocRegion = privatizer.getAllocRegion();
+      // `alloc` region should have only 1 argument
+      assert(allocRegion.getNumArguments() == 1 &&
+             "alloc region of omp::PrivateClauseOp should have one argument");
+      auto allocRegionArg = allocRegion.getArgument(0);
+      // Assuming we have the following targetOp and Privatizer
+      //
+      // omp.private {type = private} @x.privatizer : !llvm.ptr alloc {
+      //   %1 = alloca...
+      //   omp.yield(%1)
+      // }
+      // omp.target map(..) map(..) private(privVar) {
+      //  ^bb0(map_arg0, ..., map_argn, priv_arg0):
+      //    ...
+      //    ...
+      //   store %v, %priv_arg0
+      //   omp.terminator
+      // }
+      // Roughly, we do the following -
+      // Split the first block (bb0) of the first region of the target into
+      // two block. Then clone the alloc region of the privatizer between the
+      // two new blocks. When cloning replace the alloc argument with privVar.
+      // We'll then have
+      //
+      // omp.target map(..) map(..) private(privVar) {
+      //  ^bb0(map_arg0, ..., map_argn, priv_arg0, ..., priv_argn):
+      //  ^bb1:  (cloned region)  // no predecessor
+      //   %1 = alloca...
+      //   omp.yield(%1)
+      //  ^bb2:                   // no predecessor
+      //    ...
+      //    ...
+      //   store %v, %priv_arg0
+      //   omp.terminator
+      // }
+      // Next, add an unconditional branch from ^bb0 to ^bb1 and change the
+      // yield in the last block of the cloned alloc region to an unconditional
+      // branch before replacing all uses of 'priv_arg0' with the yielded value
+      // to finally get the following
+      //
+      // omp.target map(..) map(..) private(privVar) {
+      //  ^bb0(map_arg0, ..., map_argn, priv_arg0, ..., priv_argn):
+      //   llvm.br ^bb1
+      //  ^bb1:  (cloned region) // pred: ^bb0
+      //   %1 = alloca...
+      //   llvm.br ^bb2
+      //  ^bb2:                  // pred: ^bb1
+      //    ...
+      //    ...
+      //   store %v, %1          // %priv_arg0 replaced with the yield value
+      //   omp.terminator
+      // }
+      Block &firstBlock = firstTargetRegion.front();
+      Block *newBlock = rewriter.splitBlock(&firstBlock, firstBlock.begin());
+      rewriter.setInsertionPointToStart(&firstBlock);
+      Location loc = targetOp.getLoc();
+      rewriter.create<LLVM::BrOp>(loc, ValueRange(), newBlock);
+
+      IRMapping cloneMap;
+      cloneMap.map(allocRegionArg, privVar);
+      auto secondBlockIter = std::next(firstTargetRegion.begin(), 1);
+      allocRegion.cloneInto(&firstTargetRegion, secondBlockIter, cloneMap);
+
+      unsigned allocRegNumBlocks = allocRegion.getBlocks().size();
+      secondBlockIter = std::next(firstTargetRegion.begin(), 1);
+      auto clonedAllocRegionEndIter =
+          std::next(secondBlockIter, (allocRegNumBlocks - 1));
+      Block &clonedAllocRegEndBlock = *clonedAllocRegionEndIter;
+
+      Operation *br = firstBlock.getTerminator();
+      LLVM::BrOp brOp = dyn_cast<LLVM::BrOp>(br);
+      Operation *yield = clonedAllocRegEndBlock.getTerminator();
+      omp::YieldOp yieldOp = dyn_cast<omp::YieldOp>(yield);
+      auto newValue = yieldOp.getResults().front();
+      rewriter.setInsertionPointAfter(yield);
+      auto *oldSucc = brOp.getSuccessor();
+      brOp.setSuccessor(&*secondBlockIter);
+      // TODO:Consider cloning brOp and adding it to clonedAllocRegEngBlock
+      // TODO: Can we not simply merge clonedAllocRegEngBlock and oldSucc?
+      rewriter.create<LLVM::BrOp>(loc, ValueRange(), oldSucc);
+      blockArgsBV.set(blockArg.getArgNumber());
+      rewriter.replaceAllUsesWith(blockArg, newValue);
+      rewriter.eraseOp(yield);
+    }
+    // Do some fix ups now.
+    // First, remove the blockArguments that we just privatized
+    firstTargetBlock.eraseArguments(blockArgsBV);
+    // Then remove the private vars and privatizers that we have
+    // processed i.e privatized just now.
+    if (newPrivateClauses.privateSyms.empty()) {
+      targetOp.getPrivateVarsMutable().clear();
+      targetOp.removePrivateSymsAttr();
+    } else {
+      targetOp.setPrivateSymsAttr(mlir::ArrayAttr::get(
+          targetOp.getContext(), newPrivateClauses.privateSyms));
+      targetOp.getPrivateVarsMutable().assign(newPrivateClauses.privateVars);
+    }
+  }
   LogicalResult bodyGenStatus = success();
   using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy;
   auto bodyCB = [&](InsertPointTy allocaIP,
diff --git a/mlir/test/Dialect/OpenMP/invalid.mlir b/mlir/test/Dialect/OpenMP/invalid.mlir
index 1d1d93f0977588..e3267c520490a2 100644
--- a/mlir/test/Dialect/OpenMP/invalid.mlir
+++ b/mlir/test/Dialect/OpenMP/invalid.mlir
@@ -2219,7 +2219,7 @@ omp.private {type = private} @x.privatizer : i32 alloc {
 
 omp.private {type = private} @x.privatizer : i32 alloc {
 ^bb0(%arg0: i32):
-  // expected-error @below {{Invalid yielded value. Expected type: 'i32', got: None}}
+  // expected-error @below {{Expected exactly 1 yielded value, but got 0}}
   omp.yield
 }
 
diff --git a/mlir/test/Target/LLVMIR/openmp-target-private.mlir b/mlir/test/Target/LLVMIR/openmp-target-private.mlir
new file mode 100644
index 00000000000000..38c93b3f0568f8
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/openmp-target-private.mlir
@@ -0,0 +1,73 @@
+// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s
+
+omp.private {type = private} @simple_var.privatizer : !llvm.ptr alloc {
+^bb0(%arg0: !llvm.ptr):
+  %0 = llvm.mlir.constant(1 : i64) : i64
+  %1 = llvm.alloca %0 x i32 {bindc_name = "simple_var", pinned} : (i64) -> !llvm.ptr
+  omp.yield(%1 : !llvm.ptr)
+}
+llvm.func @target_map_single_private() attributes {fir.internal_name = "_QPtarget_map_single_private"} {
+  %0 = llvm.mlir.constant(1 : i64) : i64
+  %1 = llvm.alloca %0 x i32 {bindc_name = "simple_var"} : (i64) -> !llvm.ptr
+  %2 = llvm.mlir.constant(1 : i64) : i64
+  %3 = llvm.alloca %2 x i32 {bindc_name = "a"} : (i64) -> !llvm.ptr
+  %4 = llvm.mlir.constant(2 : i32) : i32
+  llvm.store %4, %3 : i32, !llvm.ptr
+  %5 = omp.map.info var_ptr(%3 : !llvm.ptr, i32) map_clauses(to) capture(ByRef) -> !llvm.ptr {name = "a"}
+  omp.target map_entries(%5 -> %arg0 : !llvm.ptr) private(@simple_var.privatizer %1 -> %arg1 : !llvm.ptr) {
+  ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr):
+    %6 = llvm.mlir.constant(10 : i32) : i32
+    %7 = llvm.load %arg0 : !llvm.ptr -> i32
+    %8 = llvm.add %7, %6 : i32
+    llvm.store %8, %arg1 : i32, !llvm.ptr
+    omp.terminator
+  }
+  llvm.return
+}
+// CHECK: define internal void @__omp_offloading_fd00
+// CHECK-DAG: %[[PRIV_ALLOC:.*]] = alloca i32, i64 1, align 4
+// CHECK-DAG: %[[ADD:.*]] = add i32 {{.*}}, 10
+// CHECK-DAG: store i32 %[[ADD]], ptr %[[PRIV_ALLOC]], align 4
+
+omp.private {type = private} @n.privatizer : !llvm.ptr alloc {
+^bb0(%arg0: !llvm.ptr):
+  %0 = llvm.mlir.constant(1 : i64) : i64
+  %1 = llvm.alloca %0 x f32 {bindc_name = "n", pinned} : (i64) -> !llvm.ptr
+  omp.yield(%1 : !llvm.ptr)
+}
+llvm.func @target_map_2_privates() attributes {fir.internal_name = "_QPtarget_map_2_privates"} {
+  %0 = llvm.mlir.constant(1 : i64) : i64
+  %1 = llvm.alloca %0 x i32 {bindc_name = "simple_var"} : (i64) -> !llvm.ptr
+  %2 = llvm.mlir.constant(1 : i64) : i64
+  %3 = llvm.alloca %2 x f32 {bindc_name = "n"} : (i64) -> !llvm.ptr
+  %4 = llvm.mlir.constant(1 : i64) : i64
+  %5 = llvm.alloca %4 x i32 {bindc_name = "a"} : (i64) -> !llvm.ptr
+  %6 = llvm.mlir.constant(2 : i32) : i32
+  llvm.store %6, %5 : i32, !llvm.ptr
+  %7 = omp.map.info var_ptr(%5 : !llvm.ptr, i32) map_clauses(to) capture(ByRef) -> !llvm.ptr {name = "a"}
+  omp.target map_entries(%7 -> %arg0 : !llvm.ptr) private(@simple_var.privatizer %1 -> %arg1 : !llvm.ptr, @n.privatizer %3 -> %arg2 : !llvm.ptr) {
+  ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: !llvm.ptr):
+    %8 = llvm.mlir.constant(1.100000e+01 : f32) : f32
+    %9 = llvm.mlir.constant(10 : i32) : i32
+    %10 = llvm.load %arg0 : !llvm.ptr -> i32
+    %11 = llvm.add %10, %9 : i32
+    llvm.store %11, %arg1 : i32, !llvm.ptr
+    %12 = llvm.load %arg1 : !llvm.ptr -> i32
+    %13 = llvm.sitofp %12 : i32 to f32
+    %14 = llvm.fadd %13, %8  {fastmathFlags = #llvm.fastmath<contract>} : f32
+    llvm.store %14, %arg2 : f32, !llvm.ptr
+    omp.terminator
+  }
+  llvm.return
+}
+
+// CHECK: define internal void @__omp_offloading_fd00
+// CHECK-DAG: %[[PRIV_I32_ALLOC:.*]] = alloca i32, i64 1, align 4
+// CHECK-DAG: %[[PRIV_FLOAT_ALLOC:.*]] = alloca float, i64 1, align 4
+// CHECK-DAG: %[[ADD_I32:.*]] = add i32 {{.*}}, 10
+// CHECK-DAG: store i32 %[[ADD_I32]], ptr %[[PRIV_I32_ALLOC]], align 4
+// CHECK-DAG: %[[LOAD_I32_AGAIN:.*]] = load i32, ptr %6, align 4
+// CHECK-DAG: %[[CAST_TO_FLOAT:.*]] = sitofp i32 %[[LOAD_I32_AGAIN]] to float
+// CHECK-DAG: %[[ADD_FLOAT:.*]] = fadd contract float %[[CAST_TO_FLOAT]], 1.100000e+01
+// CHECK-DAG: store float %[[ADD_FLOAT]], ptr %[[PRIV_FLOAT_ALLOC]], align 4
+      
diff --git a/offload/test/offloading/fortran/target-private-map.f90 b/offload/test/offloading/fortran/target-private-map.f90
new file mode 100644
index 00000000000000..861a1886c03844
--- /dev/null
+++ b/offload/test/offloading/fortran/target-private-map.f90
@@ -0,0 +1,41 @@
+! Test target private
+! REQUIRES: flang, amdgcn-amd-amdhsa
+! UNSUPPORTED: nvptx64-nvidia-cuda
+! UNSUPPORTED: nvptx64-nvidia-cuda-LTO
+! UNSUPPORTED: aarch64-unknown-linux-gnu
+! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+! UNSUPPORTED: x86_64-pc-linux-gnu
+! UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+
+! RUN: %libomptarget-compile-fortran-run-and-check-generic
+program main
+  integer :: a = 0
+  call target_private(a)
+  print*, "======= FORTRAN Test passed! ======="
+  print*, "foo(a) should not return 20, got " , a
+  if (a /= 20) then
+     stop 0
+  else
+     stop 1
+  end if
+  
+  !       stop 0
+end program main
+subroutine target_private(r)
+  implicit none
+  integer, dimension(2) :: simple_vars
+  integer :: a, r
+  ! set a to 10
+  a = 5
+  simple_vars(1) = a
+  simple_vars(2) = a
+  !$omp target map(tofrom: simple_vars) private(a)
+  ! Without private(a), a would be firstprivate, meaning it's value would be 5
+  ! with private(a), it's value would be uninitialized, which means it'd have
+  ! a very small chance of being 5.
+  ! So, simple_vars(1|2) should be 5 + a_private
+  simple_vars(1) = simple_vars(1) + a
+  simple_vars(2) = simple_vars(2) + a
+  !$omp end target
+  r = simple_vars(1) + simple_vars(2)
+end subroutine target_private



More information about the llvm-commits mailing list