[flang-commits] [flang] [flang] Support `do concurrent ... reduce` for associating names (PR #148597)

Mon Jul 14 02:27:36 PDT 2025

llvmbot wrote:




@llvm/pr-subscribers-flang-fir-hlfir

Author: Kareem Ergawy (ergawy)

<details>
<summary>Changes</summary>

Extends reduction support for `do concurrent`, in particular, for associating names. Consider the following input:
```fortran
subroutine dc_associate_reduce
  integer :: i
  real, allocatable, dimension(:) :: x

  associate(x_associate => x)
  do concurrent (i = 1:10) reduce(+: x_associate)
  end do
  end associate
end subroutine
```

The declaration of `x_associate` is emitted as follows:
```mlir
%13:2 = hlfir.declare %10(%12) {uniq_name = "...."} : (!fir.heap<!fir.array<?xf32>>, !fir.shapeshift<1>) -> (!fir.box<!fir.array<?xf32>>, !fir.heap<!fir.array<?xf32>>)
```
where the HLFIR base type is an array descriptor (i.e. the allocatable/heap attribute is dropped as stipulated by the spec; section 11.1.3.3).

The problem here is that `declare_reduction` ops accept only reference types. This restriction is already partially handled for `fir::BaseBoxType`s by allocating a stack slot for the descriptor and storing the box in that stack allocation. We have to modify this a little bit for `associate` since the HLFIR and FIR base types are different (unlike most scenarios).

---
Full diff: https://github.com/llvm/llvm-project/pull/148597.diff


7 Files Affected:

- (modified) flang/lib/Lower/Support/ReductionProcessor.cpp (+28-6) 
- (modified) flang/test/Lower/OpenMP/parallel-reduction-array-lb.f90 (+1-1) 
- (modified) flang/test/Lower/OpenMP/reduction-array-intrinsic.f90 (+2-2) 
- (modified) flang/test/Lower/OpenMP/sections-array-reduction.f90 (+1-1) 
- (modified) flang/test/Lower/OpenMP/taskgroup-task-array-reduction.f90 (+1-1) 
- (modified) flang/test/Lower/OpenMP/wsloop-reduction-array-assumed-shape.f90 (+1-1) 
- (added) flang/test/Lower/do_concurrent_reduce_associate.f90 (+20) 


``````````diff

diff --git a/flang/lib/Lower/Support/ReductionProcessor.cpp b/flang/lib/Lower/Support/ReductionProcessor.cpp
index 14b2c9836748f..ddcecd8c1bb02 100644
--- a/flang/lib/Lower/Support/ReductionProcessor.cpp
+++ b/flang/lib/Lower/Support/ReductionProcessor.cpp
@@ -633,13 +633,25 @@ void ReductionProcessor::processReductionArguments(
     }
   }
 
-  fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
-
   // Reduction variable processing common to both intrinsic operators and
   // procedure designators
   fir::FirOpBuilder &builder = converter.getFirOpBuilder();
+  mlir::OpBuilder::InsertPoint dcIP;
+  constexpr bool isDoConcurrent =
+      std::is_same_v<OpType, fir::DeclareReductionOp>;
+
+  if (isDoConcurrent) {
+    dcIP = builder.saveInsertionPoint();
+    builder.setInsertionPoint(
+        builder.getRegion().getParentOfType<fir::DoConcurrentOp>());
+  }
+
   for (const semantics::Symbol *symbol : reductionSymbols) {
     mlir::Value symVal = converter.getSymbolAddress(*symbol);
+
+    if (auto declOp = symVal.getDefiningOp<hlfir::DeclareOp>())
+      symVal = declOp.getBase();
+
     mlir::Type eleType;
     auto refType = mlir::dyn_cast_or_null<fir::ReferenceType>(symVal.getType());
     if (refType)
@@ -667,13 +679,20 @@ void ReductionProcessor::processReductionArguments(
       // boxed arrays are passed as values not by reference. Unfortunately,
       // we can't pass a box by value to omp.redution_declare, so turn it
       // into a reference
+      auto oldIP = builder.saveInsertionPoint();
+      builder.setInsertionPointToStart(builder.getAllocaBlock());
+      auto alloca =
+          builder.create<fir::AllocaOp>(currentLocation, symVal.getType());
+      builder.restoreInsertionPoint(oldIP);
+      builder.create<fir::StoreOp>(currentLocation, symVal, alloca);
+      symVal = alloca;
+    }
 
+    if (mlir::isa<fir::BaseBoxType>(symVal.getType())) {
       auto alloca =
           builder.create<fir::AllocaOp>(currentLocation, symVal.getType());
       builder.create<fir::StoreOp>(currentLocation, symVal, alloca);
       symVal = alloca;
-    } else if (auto declOp = symVal.getDefiningOp<hlfir::DeclareOp>()) {
-      symVal = declOp.getBase();
     }
 
     // this isn't the same as the by-val and by-ref passing later in the
@@ -693,7 +712,7 @@ void ReductionProcessor::processReductionArguments(
   unsigned idx = 0;
   for (auto [symVal, isByRef] : llvm::zip(reductionVars, reduceVarByRef)) {
     auto redType = mlir::cast<fir::ReferenceType>(symVal.getType());
-    const auto &kindMap = firOpBuilder.getKindMap();
+    const auto &kindMap = builder.getKindMap();
     std::string reductionName;
     ReductionIdentifier redId;
 
@@ -745,9 +764,12 @@ void ReductionProcessor::processReductionArguments(
     OpType decl = createDeclareReduction<OpType>(
         converter, reductionName, redId, redType, currentLocation, isByRef);
     reductionDeclSymbols.push_back(
-        mlir::SymbolRefAttr::get(firOpBuilder.getContext(), decl.getSymName()));
+        mlir::SymbolRefAttr::get(builder.getContext(), decl.getSymName()));
     ++idx;
   }
+
+  if (isDoConcurrent)
+    builder.restoreInsertionPoint(dcIP);
 }
 
 const semantics::SourceName
diff --git a/flang/test/Lower/OpenMP/parallel-reduction-array-lb.f90 b/flang/test/Lower/OpenMP/parallel-reduction-array-lb.f90
index a5710fcf5352b..ec54294c7104f 100644
--- a/flang/test/Lower/OpenMP/parallel-reduction-array-lb.f90
+++ b/flang/test/Lower/OpenMP/parallel-reduction-array-lb.f90
@@ -69,6 +69,7 @@ program reduce
 ! CHECK:         }
 
 ! CHECK-LABEL:   func.func @_QQmain() attributes {fir.bindc_name = "reduce"} {
+! CHECK:           %[[VAL_7:.*]] = fir.alloca !fir.box<!fir.array<3x2xi32>>
 ! CHECK:           %[[VAL_0:.*]] = fir.address_of(@_QFEi) : !fir.ref<!fir.array<3x2xi32>>
 ! CHECK:           %[[VAL_1:.*]] = arith.constant 2 : index
 ! CHECK:           %[[VAL_2:.*]] = arith.constant 3 : index
@@ -76,7 +77,6 @@ program reduce
 ! CHECK:           %[[VAL_4:.*]] = arith.constant 2 : index
 ! CHECK:           %[[VAL_5:.*]] = fir.shape_shift %[[VAL_1]], %[[VAL_2]], %[[VAL_3]], %[[VAL_4]] : (index, index, index, index) -> !fir.shapeshift<2>
 ! CHECK:           %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_5]]) {uniq_name = "_QFEi"} : (!fir.ref<!fir.array<3x2xi32>>, !fir.shapeshift<2>) -> (!fir.box<!fir.array<3x2xi32>>, !fir.ref<!fir.array<3x2xi32>>)
-! CHECK:           %[[VAL_7:.*]] = fir.alloca !fir.box<!fir.array<3x2xi32>>
 ! CHECK:           fir.store %[[VAL_6]]#0 to %[[VAL_7]] : !fir.ref<!fir.box<!fir.array<3x2xi32>>>
 ! CHECK:           omp.parallel reduction(byref @add_reduction_byref_box_3x2xi32 %[[VAL_7]] -> %[[VAL_8:.*]] : !fir.ref<!fir.box<!fir.array<3x2xi32>>>) {
 ! CHECK:             %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_8]] {uniq_name = "_QFEi"} : (!fir.ref<!fir.box<!fir.array<3x2xi32>>>) -> (!fir.ref<!fir.box<!fir.array<3x2xi32>>>, !fir.ref<!fir.box<!fir.array<3x2xi32>>>)
diff --git a/flang/test/Lower/OpenMP/reduction-array-intrinsic.f90 b/flang/test/Lower/OpenMP/reduction-array-intrinsic.f90
index 0cf88cf889868..104904497745d 100644
--- a/flang/test/Lower/OpenMP/reduction-array-intrinsic.f90
+++ b/flang/test/Lower/OpenMP/reduction-array-intrinsic.f90
@@ -64,11 +64,11 @@ subroutine max_array_reduction(l, r)
 ! CHECK-LABEL:   func.func @_QPmax_array_reduction(
 ! CHECK-SAME:                                      %[[VAL_0:.*]]: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "l"},
 ! CHECK-SAME:                                      %[[VAL_1:.*]]: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "r"}) {
+! CHECK:           %[[VAL_5:.*]] = fir.alloca !fir.box<!fir.array<?xi32>>
 ! CHECK:           %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope
 ! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] {uniq_name = "_QFmax_array_reductionEl"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
 ! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] {uniq_name = "_QFmax_array_reductionEr"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
-! CHECK:           %[[VAL_5:.*]] = fir.alloca !fir.box<!fir.array<?xi32>>
-! CHECK:           fir.store %[[VAL_3]]#1 to %[[VAL_5]] : !fir.ref<!fir.box<!fir.array<?xi32>>>
+! CHECK:           fir.store %[[VAL_3]]#0 to %[[VAL_5]] : !fir.ref<!fir.box<!fir.array<?xi32>>>
 ! CHECK:           omp.parallel reduction(byref @max_byref_box_Uxi32 %[[VAL_5]] -> %[[VAL_6:.*]] : !fir.ref<!fir.box<!fir.array<?xi32>>>) {
 ! CHECK:             %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]] {uniq_name = "_QFmax_array_reductionEl"} : (!fir.ref<!fir.box<!fir.array<?xi32>>>) -> (!fir.ref<!fir.box<!fir.array<?xi32>>>, !fir.ref<!fir.box<!fir.array<?xi32>>>)
 ! CHECK:             %[[VAL_8:.*]] = fir.load %[[VAL_7]]#0 : !fir.ref<!fir.box<!fir.array<?xi32>>>
diff --git a/flang/test/Lower/OpenMP/sections-array-reduction.f90 b/flang/test/Lower/OpenMP/sections-array-reduction.f90
index 91e0680692637..2f2808cebfc0c 100644
--- a/flang/test/Lower/OpenMP/sections-array-reduction.f90
+++ b/flang/test/Lower/OpenMP/sections-array-reduction.f90
@@ -34,7 +34,7 @@ subroutine sectionsReduction(x)
 ! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {uniq_name = "_QFsectionsreductionEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
 ! CHECK:           omp.parallel {
 ! CHECK:             %[[VAL_3:.*]] = fir.alloca !fir.box<!fir.array<?xf32>>
-! CHECK:             fir.store %[[VAL_2]]#1 to %[[VAL_3]] : !fir.ref<!fir.box<!fir.array<?xf32>>>
+! CHECK:             fir.store %[[VAL_2]]#0 to %[[VAL_3]] : !fir.ref<!fir.box<!fir.array<?xf32>>>
 ! CHECK:             omp.sections reduction(byref @add_reduction_byref_box_Uxf32 %[[VAL_3]] -> %[[VAL_4:.*]] : !fir.ref<!fir.box<!fir.array<?xf32>>>) {
 ! CHECK:               omp.section {
 ! CHECK:               ^bb0(%[[VAL_5:.*]]: !fir.ref<!fir.box<!fir.array<?xf32>>>):
diff --git a/flang/test/Lower/OpenMP/taskgroup-task-array-reduction.f90 b/flang/test/Lower/OpenMP/taskgroup-task-array-reduction.f90
index 18d45217272fc..18a4f75b86309 100644
--- a/flang/test/Lower/OpenMP/taskgroup-task-array-reduction.f90
+++ b/flang/test/Lower/OpenMP/taskgroup-task-array-reduction.f90
@@ -22,7 +22,7 @@
 ! CHECK-SAME:      {uniq_name = "_QFtask_reductionEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
 ! CHECK:          omp.parallel {
 ! CHECK:            %[[VAL_3:.*]] = fir.alloca !fir.box<!fir.array<?xf32>>
-! CHECK:            fir.store %[[VAL_2]]#1 to %[[VAL_3]] : !fir.ref<!fir.box<!fir.array<?xf32>>>
+! CHECK:            fir.store %[[VAL_2]]#0 to %[[VAL_3]] : !fir.ref<!fir.box<!fir.array<?xf32>>>
 ! CHECK:            omp.taskgroup task_reduction(byref @add_reduction_byref_box_Uxf32 %[[VAL_3]] -> %[[VAL_4:.*]]: !fir.ref<!fir.box<!fir.array<?xf32>>>) {
 ! CHECK:              %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_4]] 
 ! CHECK-SAME:         {uniq_name = "_QFtask_reductionEx"} : (!fir.ref<!fir.box<!fir.array<?xf32>>>) -> (!fir.ref<!fir.box<!fir.array<?xf32>>>, !fir.ref<!fir.box<!fir.array<?xf32>>>)
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-array-assumed-shape.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-array-assumed-shape.f90
index 290f9e1981361..d8c5706b912a5 100644
--- a/flang/test/Lower/OpenMP/wsloop-reduction-array-assumed-shape.f90
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-array-assumed-shape.f90
@@ -81,7 +81,7 @@ subroutine reduce(r)
 ! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = {{.*}}, uniq_name = "_QFFreduceEr"} : (!fir.box<!fir.array<?xf64>>, !fir.dscope) -> (!fir.box<!fir.array<?xf64>>, !fir.box<!fir.array<?xf64>>)
 ! CHECK:           omp.parallel {
 ! CHECK:             %[[VAL_4:.*]] = fir.alloca !fir.box<!fir.array<?xf64>>
-! CHECK:             fir.store %[[VAL_3]]#1 to %[[VAL_4]] : !fir.ref<!fir.box<!fir.array<?xf64>>>
+! CHECK:             fir.store %[[VAL_3]]#0 to %[[VAL_4]] : !fir.ref<!fir.box<!fir.array<?xf64>>>
 ! CHECK:             %[[VAL_7:.*]] = arith.constant 0 : i32
 ! CHECK:             %[[VAL_8:.*]] = arith.constant 10 : i32
 ! CHECK:             %[[VAL_9:.*]] = arith.constant 1 : i32
diff --git a/flang/test/Lower/do_concurrent_reduce_associate.f90 b/flang/test/Lower/do_concurrent_reduce_associate.f90
new file mode 100644
index 0000000000000..828bd89e75c7b
--- /dev/null
+++ b/flang/test/Lower/do_concurrent_reduce_associate.f90
@@ -0,0 +1,20 @@
+! RUN: %flang_fc1 -emit-hlfir -o - %s | FileCheck %s
+
+subroutine dc_associate_reduce
+  integer :: i
+  real, allocatable, dimension(:) :: x
+
+  associate(x_associate => x)
+  do concurrent (i = 1:10) reduce(+: x_associate)
+  end do
+  end associate
+end subroutine
+
+! CHECK-LABEL: func.func @_QPdc_associate_reduce() {
+! CHECK:         %[[BOX_ALLOC:.*]] = fir.alloca !fir.box<!fir.array<?xf32>>
+! CHECK:         %[[ASSOC_DECL:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {uniq_name = "{{.*}}x_associate"}
+! CHECK:         fir.store %[[ASSOC_DECL]]#0 to %[[BOX_ALLOC]]
+! CHECK-NEXT:    fir.do_concurrent {
+! CHECK:           fir.do_concurrent.loop {{.*}} reduce(byref @{{.*}} #fir.reduce_attr<add> %[[BOX_ALLOC]] -> %{{.*}} : !{{.*}}) {
+! CHECK:         }
+! CHECK:       }

``````````

</details>


https://github.com/llvm/llvm-project/pull/148597