[llvm-branch-commits] [mlir] [mlir][OpenMP] Convert reduction alloc region to LLVMIR (PR #102524)

via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Thu Aug 8 12:14:53 PDT 2024


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-flang-openmp

Author: Tom Eccles (tblah)

<details>
<summary>Changes</summary>

The intention of this change is to ensure that allocas end up in the entry block not spread out amongst complex reduction variable initialization code.

The tests we have are quite minimized for readability and maintainability, making the benefits less obvious. The use case for this is when there are multiple reduction variables each will multiple blocks inside of the init region for that reduction.

2/3
Part 1: https://github.com/llvm/llvm-project/pull/102522

---
Full diff: https://github.com/llvm/llvm-project/pull/102524.diff


4 Files Affected:

- (modified) mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp (+99-35) 
- (modified) mlir/test/Target/LLVMIR/openmp-private.mlir (+4-2) 
- (modified) mlir/test/Target/LLVMIR/openmp-reduction-array-sections.mlir (+9-5) 
- (modified) mlir/test/Target/LLVMIR/openmp-reduction-byref.mlir (+7-5) 


``````````diff
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 458d05d5059db7..e3caf90696d699 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -594,45 +594,85 @@ convertOmpOrderedRegion(Operation &opInst, llvm::IRBuilderBase &builder,
 
 /// Allocate space for privatized reduction variables.
 template <typename T>
-static void allocByValReductionVars(
-    T loop, ArrayRef<BlockArgument> reductionArgs, llvm::IRBuilderBase &builder,
-    LLVM::ModuleTranslation &moduleTranslation,
-    llvm::OpenMPIRBuilder::InsertPointTy &allocaIP,
-    SmallVectorImpl<omp::DeclareReductionOp> &reductionDecls,
-    SmallVectorImpl<llvm::Value *> &privateReductionVariables,
-    DenseMap<Value, llvm::Value *> &reductionVariableMap,
-    llvm::ArrayRef<bool> isByRefs) {
+static LogicalResult
+allocReductionVars(T loop, ArrayRef<BlockArgument> reductionArgs,
+                   llvm::IRBuilderBase &builder,
+                   LLVM::ModuleTranslation &moduleTranslation,
+                   llvm::OpenMPIRBuilder::InsertPointTy &allocaIP,
+                   SmallVectorImpl<omp::DeclareReductionOp> &reductionDecls,
+                   SmallVectorImpl<llvm::Value *> &privateReductionVariables,
+                   DenseMap<Value, llvm::Value *> &reductionVariableMap,
+                   llvm::ArrayRef<bool> isByRefs) {
   llvm::IRBuilderBase::InsertPointGuard guard(builder);
   builder.SetInsertPoint(allocaIP.getBlock()->getTerminator());
 
+  // delay creating stores until after all allocas
+  SmallVector<std::pair<llvm::Value *, llvm::Value *>> storesToCreate;
+  storesToCreate.reserve(loop.getNumReductionVars());
+
   for (std::size_t i = 0; i < loop.getNumReductionVars(); ++i) {
-    if (isByRefs[i])
-      continue;
-    llvm::Value *var = builder.CreateAlloca(
-        moduleTranslation.convertType(reductionDecls[i].getType()));
-    moduleTranslation.mapValue(reductionArgs[i], var);
-    privateReductionVariables[i] = var;
-    reductionVariableMap.try_emplace(loop.getReductionVars()[i], var);
+    Region &allocRegion = reductionDecls[i].getAllocRegion();
+    if (isByRefs[i]) {
+      if (allocRegion.empty())
+        continue;
+
+      SmallVector<llvm::Value *, 1> phis;
+      if (failed(inlineConvertOmpRegions(allocRegion, "omp.reduction.alloc",
+                                         builder, moduleTranslation, &phis)))
+        return failure();
+      assert(phis.size() == 1 && "expected one allocation to be yielded");
+
+      builder.SetInsertPoint(allocaIP.getBlock()->getTerminator());
+
+      // Allocate reduction variable (which is a pointer to the real reduction
+      // variable allocated in the inlined region)
+      llvm::Value *var = builder.CreateAlloca(
+          moduleTranslation.convertType(reductionDecls[i].getType()));
+      storesToCreate.emplace_back(phis[0], var);
+
+      privateReductionVariables[i] = var;
+      moduleTranslation.mapValue(reductionArgs[i], phis[0]);
+      reductionVariableMap.try_emplace(loop.getReductionVars()[i], phis[0]);
+    } else {
+      assert(allocRegion.empty() &&
+             "allocaction is implicit for by-val reduction");
+      llvm::Value *var = builder.CreateAlloca(
+          moduleTranslation.convertType(reductionDecls[i].getType()));
+      moduleTranslation.mapValue(reductionArgs[i], var);
+      privateReductionVariables[i] = var;
+      reductionVariableMap.try_emplace(loop.getReductionVars()[i], var);
+    }
   }
+
+  // TODO: further delay this so it doesn't come in the entry block at all
+  for (auto [data, addr] : storesToCreate)
+    builder.CreateStore(data, addr);
+
+  return success();
 }
 
-/// Map input argument to all reduction initialization regions
+/// Map input arguments to reduction initialization region
 template <typename T>
 static void
-mapInitializationArg(T loop, LLVM::ModuleTranslation &moduleTranslation,
-                     SmallVectorImpl<omp::DeclareReductionOp> &reductionDecls,
-                     unsigned i) {
+mapInitializationArgs(T loop, LLVM::ModuleTranslation &moduleTranslation,
+                      SmallVectorImpl<omp::DeclareReductionOp> &reductionDecls,
+                      DenseMap<Value, llvm::Value *> &reductionVariableMap,
+                      unsigned i) {
   // map input argument to the initialization region
   mlir::omp::DeclareReductionOp &reduction = reductionDecls[i];
   Region &initializerRegion = reduction.getInitializerRegion();
   Block &entry = initializerRegion.front();
-  assert(entry.getNumArguments() == 1 &&
-         "the initialization region has one argument");
 
   mlir::Value mlirSource = loop.getReductionVars()[i];
   llvm::Value *llvmSource = moduleTranslation.lookupValue(mlirSource);
   assert(llvmSource && "lookup reduction var");
-  moduleTranslation.mapValue(entry.getArgument(0), llvmSource);
+  moduleTranslation.mapValue(reduction.getInitializerMoldArg(), llvmSource);
+
+  if (entry.getNumArguments() > 1) {
+    llvm::Value *allocation =
+        reductionVariableMap.lookup(loop.getReductionVars()[i]);
+    moduleTranslation.mapValue(reduction.getInitializerAllocArg(), allocation);
+  }
 }
 
 /// Collect reduction info
@@ -779,18 +819,21 @@ static LogicalResult allocAndInitializeReductionVars(
   if (op.getNumReductionVars() == 0)
     return success();
 
-  allocByValReductionVars(op, reductionArgs, builder, moduleTranslation,
-                          allocaIP, reductionDecls, privateReductionVariables,
-                          reductionVariableMap, isByRef);
+  if (failed(allocReductionVars(op, reductionArgs, builder, moduleTranslation,
+                                allocaIP, reductionDecls,
+                                privateReductionVariables, reductionVariableMap,
+                                isByRef)))
+    return failure();
 
   // Before the loop, store the initial values of reductions into reduction
   // variables. Although this could be done after allocas, we don't want to mess
   // up with the alloca insertion point.
   for (unsigned i = 0; i < op.getNumReductionVars(); ++i) {
-    SmallVector<llvm::Value *> phis;
+    SmallVector<llvm::Value *, 1> phis;
 
     // map block argument to initializer region
-    mapInitializationArg(op, moduleTranslation, reductionDecls, i);
+    mapInitializationArgs(op, moduleTranslation, reductionDecls,
+                          reductionVariableMap, i);
 
     if (failed(inlineConvertOmpRegions(reductionDecls[i].getInitializerRegion(),
                                        "omp.reduction.neutral", builder,
@@ -799,6 +842,13 @@ static LogicalResult allocAndInitializeReductionVars(
     assert(phis.size() == 1 && "expected one value to be yielded from the "
                                "reduction neutral element declaration region");
     if (isByRef[i]) {
+      if (!reductionDecls[i].getAllocRegion().empty())
+        // done in allocReductionVars
+        continue;
+
+      // TODO: this path can be removed once all users of by-ref are updated to
+      // use an alloc region
+
       // Allocate reduction variable (which is a pointer to the real reduction
       // variable allocated in the inlined region)
       llvm::Value *var = builder.CreateAlloca(
@@ -1319,9 +1369,15 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
             opInst.getNumAllocateVars() + opInst.getNumAllocatorsVars(),
             opInst.getNumReductionVars());
 
-    allocByValReductionVars(opInst, reductionArgs, builder, moduleTranslation,
-                            allocaIP, reductionDecls, privateReductionVariables,
-                            reductionVariableMap, isByRef);
+    allocaIP =
+        InsertPointTy(allocaIP.getBlock(),
+                      allocaIP.getBlock()->getTerminator()->getIterator());
+
+    if (failed(allocReductionVars(opInst, reductionArgs, builder,
+                                  moduleTranslation, allocaIP, reductionDecls,
+                                  privateReductionVariables,
+                                  reductionVariableMap, isByRef)))
+      bodyGenStatus = failure();
 
     // Initialize reduction vars
     builder.restoreIP(allocaIP);
@@ -1332,8 +1388,12 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
     SmallVector<llvm::Value *> byRefVars(opInst.getNumReductionVars());
     for (unsigned i = 0; i < opInst.getNumReductionVars(); ++i) {
       if (isByRef[i]) {
-        // Allocate reduction variable (which is a pointer to the real reduciton
-        // variable allocated in the inlined region)
+        if (!reductionDecls[i].getAllocRegion().empty())
+          continue;
+
+        // TODO: remove after all users of by-ref are updated to use the alloc
+        // region: Allocate reduction variable (which is a pointer to the real
+        // reduciton variable allocated in the inlined region)
         byRefVars[i] = builder.CreateAlloca(
             moduleTranslation.convertType(reductionDecls[i].getType()));
       }
@@ -1345,7 +1405,8 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
       SmallVector<llvm::Value *> phis;
 
       // map the block argument
-      mapInitializationArg(opInst, moduleTranslation, reductionDecls, i);
+      mapInitializationArgs(opInst, moduleTranslation, reductionDecls,
+                            reductionVariableMap, i);
       if (failed(inlineConvertOmpRegions(
               reductionDecls[i].getInitializerRegion(), "omp.reduction.neutral",
               builder, moduleTranslation, &phis)))
@@ -1354,11 +1415,14 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
              "expected one value to be yielded from the "
              "reduction neutral element declaration region");
 
-      // mapInitializationArg finishes its block with a terminator. We need to
-      // insert before that terminator.
       builder.SetInsertPoint(builder.GetInsertBlock()->getTerminator());
 
       if (isByRef[i]) {
+        if (!reductionDecls[i].getAllocRegion().empty())
+          continue;
+
+        // TODO: remove after all users of by-ref are updated to use the alloc
+
         // Store the result of the inlined region to the allocated reduction var
         // ptr
         builder.CreateStore(phis[0], byRefVars[i]);
diff --git a/mlir/test/Target/LLVMIR/openmp-private.mlir b/mlir/test/Target/LLVMIR/openmp-private.mlir
index e76f7e4d40f7af..21167668bbee16 100644
--- a/mlir/test/Target/LLVMIR/openmp-private.mlir
+++ b/mlir/test/Target/LLVMIR/openmp-private.mlir
@@ -222,11 +222,13 @@ omp.private {type = private} @privatizer.part : !llvm.ptr alloc {
   omp.yield(%1 : !llvm.ptr)
 }
 
-omp.declare_reduction @reducer.part : !llvm.ptr init {
-^bb0(%arg0: !llvm.ptr):
+omp.declare_reduction @reducer.part : !llvm.ptr alloc {
   %0 = llvm.mlir.constant(1 : i64) : i64
   %1 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> : (i64) -> !llvm.ptr
   omp.yield(%1 : !llvm.ptr)
+} init {
+^bb0(%mold: !llvm.ptr, %alloc: !llvm.ptr):
+  omp.yield(%alloc : !llvm.ptr)
 } combiner {
 ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr):
   omp.yield(%arg0 : !llvm.ptr)
diff --git a/mlir/test/Target/LLVMIR/openmp-reduction-array-sections.mlir b/mlir/test/Target/LLVMIR/openmp-reduction-array-sections.mlir
index 5682e7e96ab186..da6f9430046123 100644
--- a/mlir/test/Target/LLVMIR/openmp-reduction-array-sections.mlir
+++ b/mlir/test/Target/LLVMIR/openmp-reduction-array-sections.mlir
@@ -4,11 +4,15 @@
 // for array reductions. The important thing here is that we are testing a byref
 // reduction with a cleanup region, and the various regions contain multiple
 // blocks
-omp.declare_reduction @add_reduction_byref_box_Uxf32 : !llvm.ptr init {
-^bb0(%arg0: !llvm.ptr):
+omp.declare_reduction @add_reduction_byref_box_Uxf32 : !llvm.ptr alloc {
   %0 = llvm.mlir.constant(1 : i64) : i64
   %1 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> : (i64) -> !llvm.ptr
   omp.yield(%1 : !llvm.ptr)
+} init {
+^bb0(%arg0: !llvm.ptr, %alloc: !llvm.ptr):
+  %0 = llvm.mlir.constant(1 : i64) : i64
+  llvm.store %0, %alloc : i64, !llvm.ptr
+  omp.yield(%alloc : !llvm.ptr)
 } combiner {
 ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr):
   %0 = llvm.mlir.constant(0 : i64) : i64
@@ -83,6 +87,9 @@ llvm.func @sectionsreduction_(%arg0: !llvm.ptr {fir.bindc_name = "x"}) attribute
 // CHECK:         %[[VAL_11:.*]] = load i32, ptr %[[VAL_12:.*]], align 4
 // CHECK:         store i32 %[[VAL_11]], ptr %[[VAL_10]], align 4
 // CHECK:         %[[VAL_13:.*]] = load i32, ptr %[[VAL_10]], align 4
+// CHECK:         %[[VAL_20:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, i64 1, align 8
+// CHECK:         %[[VAL_21:.*]] = alloca ptr, align 8
+// CHECK:         store ptr %[[VAL_20]], ptr %[[VAL_21]], align 8
 // CHECK:         %[[VAL_14:.*]] = alloca [1 x ptr], align 8
 // CHECK:         br label %[[VAL_15:.*]]
 // CHECK:       omp.reduction.init:                               ; preds = %[[VAL_16:.*]]
@@ -91,9 +98,6 @@ llvm.func @sectionsreduction_(%arg0: !llvm.ptr {fir.bindc_name = "x"}) attribute
 // CHECK:         br label %[[VAL_18:.*]]
 // CHECK:       omp.par.region1:                                  ; preds = %[[VAL_17]]
 // CHECK:         %[[VAL_19:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, i64 1, align 8
-// CHECK:         %[[VAL_20:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, i64 1, align 8
-// CHECK:         %[[VAL_21:.*]] = alloca ptr, align 8
-// CHECK:         store ptr %[[VAL_20]], ptr %[[VAL_21]], align 8
 // CHECK:         br label %[[VAL_22:.*]]
 // CHECK:       omp_section_loop.preheader:                       ; preds = %[[VAL_18]]
 // CHECK:         store i32 0, ptr %[[VAL_7]], align 4
diff --git a/mlir/test/Target/LLVMIR/openmp-reduction-byref.mlir b/mlir/test/Target/LLVMIR/openmp-reduction-byref.mlir
index ef1284547a88a7..a0ca31b7d811e3 100644
--- a/mlir/test/Target/LLVMIR/openmp-reduction-byref.mlir
+++ b/mlir/test/Target/LLVMIR/openmp-reduction-byref.mlir
@@ -1,12 +1,14 @@
 // RUN: mlir-translate -mlir-to-llvmir -split-input-file %s | FileCheck %s
 
-  omp.declare_reduction @add_reduction_i_32 : !llvm.ptr init {
-  ^bb0(%arg0: !llvm.ptr):
-    %0 = llvm.mlir.constant(0 : i32) : i32
+  omp.declare_reduction @add_reduction_i_32 : !llvm.ptr alloc {
     %1 = llvm.mlir.constant(1 : i64) : i64
     %2 = llvm.alloca %1 x i32 : (i64) -> !llvm.ptr
-    llvm.store %0, %2 : i32, !llvm.ptr
     omp.yield(%2 : !llvm.ptr)
+  } init {
+  ^bb0(%arg0: !llvm.ptr, %alloc: !llvm.ptr):
+    %0 = llvm.mlir.constant(0 : i32) : i32
+    llvm.store %0, %alloc : i32, !llvm.ptr
+    omp.yield(%alloc : !llvm.ptr)
   } combiner {
   ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr):
     %0 = llvm.load %arg0 : !llvm.ptr -> i32
@@ -42,8 +44,8 @@
 // Private reduction variable and its initialization.
 // CHECK: %tid.addr.local = alloca i32
 // CHECK: %[[PRIVATE:.+]] = alloca i32
-// CHECK: store i32 0, ptr %[[PRIVATE]]
 // CHECK: store ptr %[[PRIVATE]], ptr %[[PRIV_PTR:.+]],
+// CHECK: store i32 0, ptr %[[PRIVATE]]
 
 // Call to the reduction function.
 // CHECK: call i32 @__kmpc_reduce

``````````

</details>


https://github.com/llvm/llvm-project/pull/102524


More information about the llvm-branch-commits mailing list