[Mlir-commits] [mlir] [OpenMP][LLVM] Fix access to reduction args of `omp.parallel`. (PR #96426)

Sun Jun 23 03:22:07 PDT 2024

https://github.com/ergawy created https://github.com/llvm/llvm-project/pull/96426

Fix for Fujitsu test suite test: 0275_0032.f90. The MLIR to LLVM translation logic assumed that reduction arguments to an `omp.parallel` op are always the last set of arguments to the op. However, this is a wrong assumption since private args come afterward.

>From e2ee8b88490c1424ed533e9ff412140069421908 Mon Sep 17 00:00:00 2001
From: ergawy <kareem.ergawy at amd.com>
Date: Wed, 19 Jun 2024 04:16:31 -0500
Subject: [PATCH] [OpenMP][LLVM] Fix access to reduction args of
 `omp.parallel`.

Fix for Fujitsu test suite test: 0275_0032.f90. The MLIR to LLVM
translation logic assumed that reduction arguments to an `omp.parallel`
op are always the last set of arguments to the op. However, this is a
wrong assumption since private args come afterward.
---
 mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td |  4 ++
 .../OpenMP/OpenMPToLLVMIRTranslation.cpp      | 31 ++++++------
 mlir/test/Target/LLVMIR/openmp-private.mlir   | 50 +++++++++++++++++++
 3 files changed, 71 insertions(+), 14 deletions(-)

diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
index f28911adccd02..633a69c5893f9 100644
--- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
+++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
@@ -179,6 +179,10 @@ def ParallelOp : OpenMP_Op<"parallel", [
     OpBuilder<(ins CArg<"const ParallelClauseOps &">:$clauses)>
   ];
   let extraClassDeclaration = [{
+    unsigned getNumAllocateVars() { return getAllocateVars().size(); }
+
+    unsigned getNumAllocatorsVars() { return getAllocatorsVars().size(); }
+
     /// Returns the number of reduction variables.
     unsigned getNumReductionVars() { return getReductionVars().size(); }
   }];
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 52c780f655834..4d6c900f0fe3a 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -773,8 +773,8 @@ convertOmpTaskgroupOp(omp::TaskgroupOp tgOp, llvm::IRBuilderBase &builder,
 /// Allocate space for privatized reduction variables.
 template <typename T>
 static void allocByValReductionVars(
-    T loop, llvm::IRBuilderBase &builder,
-    LLVM::ModuleTranslation &moduleTranslation,
+    T loop, MutableArrayRef<BlockArgument> reductionArgs,
+    llvm::IRBuilderBase &builder, LLVM::ModuleTranslation &moduleTranslation,
     llvm::OpenMPIRBuilder::InsertPointTy &allocaIP,
     SmallVectorImpl<omp::DeclareReductionOp> &reductionDecls,
     SmallVectorImpl<llvm::Value *> &privateReductionVariables,
@@ -782,15 +782,13 @@ static void allocByValReductionVars(
     llvm::ArrayRef<bool> isByRefs) {
   llvm::IRBuilderBase::InsertPointGuard guard(builder);
   builder.SetInsertPoint(allocaIP.getBlock()->getTerminator());
-  auto args =
-      loop.getRegion().getArguments().take_back(loop.getNumReductionVars());
 
   for (std::size_t i = 0; i < loop.getNumReductionVars(); ++i) {
     if (isByRefs[i])
       continue;
     llvm::Value *var = builder.CreateAlloca(
         moduleTranslation.convertType(reductionDecls[i].getType()));
-    moduleTranslation.mapValue(args[i], var);
+    moduleTranslation.mapValue(reductionArgs[i], var);
     privateReductionVariables[i] = var;
     reductionVariableMap.try_emplace(loop.getReductionVars()[i], var);
   }
@@ -925,14 +923,17 @@ convertOmpWsloop(Operation &opInst, llvm::IRBuilderBase &builder,
   SmallVector<llvm::Value *> privateReductionVariables(
       wsloopOp.getNumReductionVars());
   DenseMap<Value, llvm::Value *> reductionVariableMap;
-  allocByValReductionVars(wsloopOp, builder, moduleTranslation, allocaIP,
-                          reductionDecls, privateReductionVariables,
+
+  MutableArrayRef<BlockArgument> reductionArgs =
+      wsloopOp.getRegion().getArguments();
+
+  allocByValReductionVars(wsloopOp, reductionArgs, builder, moduleTranslation,
+                          allocaIP, reductionDecls, privateReductionVariables,
                           reductionVariableMap, isByRef);
 
   // Before the loop, store the initial values of reductions into reduction
   // variables. Although this could be done after allocas, we don't want to mess
   // up with the alloca insertion point.
-  ArrayRef<BlockArgument> reductionArgs = wsloopOp.getRegion().getArguments();
   for (unsigned i = 0; i < wsloopOp.getNumReductionVars(); ++i) {
     SmallVector<llvm::Value *> phis;
 
@@ -1158,16 +1159,18 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
   auto bodyGenCB = [&](InsertPointTy allocaIP, InsertPointTy codeGenIP) {
     // Allocate reduction vars
     DenseMap<Value, llvm::Value *> reductionVariableMap;
-    allocByValReductionVars(opInst, builder, moduleTranslation, allocaIP,
-                            reductionDecls, privateReductionVariables,
-                            reductionVariableMap, isByRef);
 
-    // Initialize reduction vars
-    builder.restoreIP(allocaIP);
     MutableArrayRef<BlockArgument> reductionArgs =
-        opInst.getRegion().getArguments().take_back(
+        opInst.getRegion().getArguments().slice(
+            opInst.getNumAllocateVars() + opInst.getNumAllocatorsVars(),
             opInst.getNumReductionVars());
 
+    allocByValReductionVars(opInst, reductionArgs, builder, moduleTranslation,
+                            allocaIP, reductionDecls, privateReductionVariables,
+                            reductionVariableMap, isByRef);
+
+    // Initialize reduction vars
+    builder.restoreIP(allocaIP);
     llvm::BasicBlock *initBlock = splitBB(builder, true, "omp.reduction.init");
     allocaIP =
         InsertPointTy(allocaIP.getBlock(),
diff --git a/mlir/test/Target/LLVMIR/openmp-private.mlir b/mlir/test/Target/LLVMIR/openmp-private.mlir
index 141724eaf56f9..e8bfacdd74c32 100644
--- a/mlir/test/Target/LLVMIR/openmp-private.mlir
+++ b/mlir/test/Target/LLVMIR/openmp-private.mlir
@@ -184,3 +184,53 @@ llvm.mlir.global linkonce constant @_QQfoo() {addr_space = 0 : i32} : !llvm.arra
 }
 
 llvm.func @bar(!llvm.ptr)
+
+
+// Tests fix for Fujitsu test suite test: 0275_0032.f90. The MLIR to LLVM
+// translation logic assumed that reduction arguments to an `omp.parallel`
+// op are always the last set of arguments to the op. However, this is a
+// wrong assumption since private args come afterward. This tests the fix
+// that we access the different sets of args properly.
+
+// CHECK-LABEL: define internal void @private_and_reduction_..omp_par
+// CHECK:         %[[RED_ALLOC:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, i64 1, align 8
+// CHECK:         %[[PRV_ALLOC:.*]] = alloca float, i64 1, align 4
+
+// CHECK:         omp.par.region:
+// CHECK:           br label %[[PAR_REG_BEG:.*]]
+// CHECK:         [[PAR_REG_BEG]]:
+// CHECK-NEXT:      %{{.*}} = load { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %[[RED_ALLOC]], align 8
+// CHECK-NEXT:      store float 8.000000e+00, ptr %[[PRV_ALLOC]], align 4
+
+llvm.func @private_and_reduction_() attributes {fir.internal_name = "_QPprivate_and_reduction", frame_pointer = #llvm.framePointerKind<all>, target_cpu = "x86-64"} {
+  %0 = llvm.mlir.constant(1 : i64) : i64
+  %1 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> : (i64) -> !llvm.ptr
+  %2 = llvm.alloca %0 x f32 {bindc_name = "to_priv"} : (i64) -> !llvm.ptr
+  omp.parallel reduction(byref @reducer.part %1 -> %arg0 : !llvm.ptr) private(@privatizer.part %2 -> %arg1 : !llvm.ptr) {
+    %3 = llvm.load %arg0 : !llvm.ptr -> !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+    %4 = llvm.mlir.constant(8.000000e+00 : f32) : f32
+    llvm.store %4, %arg1 : f32, !llvm.ptr
+    omp.terminator
+  }
+  llvm.return
+}
+
+omp.private {type = private} @privatizer.part : !llvm.ptr alloc {
+^bb0(%arg0: !llvm.ptr):
+  %0 = llvm.mlir.constant(1 : i64) : i64
+  %1 = llvm.alloca %0 x f32 {bindc_name = "to_priv", pinned} : (i64) -> !llvm.ptr
+  omp.yield(%1 : !llvm.ptr)
+}
+
+omp.declare_reduction @reducer.part : !llvm.ptr init {
+^bb0(%arg0: !llvm.ptr):
+  %0 = llvm.mlir.constant(1 : i64) : i64
+  %1 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> : (i64) -> !llvm.ptr
+  omp.yield(%1 : !llvm.ptr)
+} combiner {
+^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr):
+  omp.yield(%arg0 : !llvm.ptr)
+}  cleanup {
+^bb0(%arg0: !llvm.ptr):
+  omp.yield
+}