[llvm-branch-commits] [mlir] [flang] [mlir][flang][openmp] Rework wsloop reduction operations (PR #80019)

Mon Feb 5 03:26:18 PST 2024

llvmbot wrote:




@llvm/pr-subscribers-mlir

Author: David Truby (DavidTruby)

<details>
<summary>Changes</summary>

This patch reworks the way that wsloop reduction operations function to better match the expected semantics from the OpenMP specification, following the rework of parallel reductions.

The new semantics create a private reduction variable as a block argument which should be used normally for all operations on that variable in the region; this private variable is then combined with the others into the shared variable. This way no special omp.reduction operations are needed inside the region. These block arguments follow the loop control block arguments.

---

Patch is 361.55 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/80019.diff


37 Files Affected:

- (modified) flang/lib/Lower/OpenMP.cpp (+37-19) 
- (modified) flang/test/Fir/convert-to-llvm-openmp-and-fir.fir (+16-4) 
- (modified) flang/test/Lower/OpenMP/FIR/wsloop-reduction-add.f90 (+266-166) 
- (modified) flang/test/Lower/OpenMP/FIR/wsloop-reduction-iand.f90 (+5-3) 
- (modified) flang/test/Lower/OpenMP/FIR/wsloop-reduction-ieor.f90 (+5-3) 
- (modified) flang/test/Lower/OpenMP/FIR/wsloop-reduction-ior.f90 (+4-2) 
- (removed) flang/test/Lower/OpenMP/FIR/wsloop-reduction-logical-and.f90 (-137) 
- (modified) flang/test/Lower/OpenMP/FIR/wsloop-reduction-logical-eqv.f90 (+138-93) 
- (modified) flang/test/Lower/OpenMP/FIR/wsloop-reduction-logical-neqv.f90 (+140-93) 
- (removed) flang/test/Lower/OpenMP/FIR/wsloop-reduction-logical-or.f90 (-137) 
- (modified) flang/test/Lower/OpenMP/FIR/wsloop-reduction-max.f90 (+8-5) 
- (modified) flang/test/Lower/OpenMP/FIR/wsloop-reduction-min.f90 (+8-4) 
- (removed) flang/test/Lower/OpenMP/FIR/wsloop-reduction-mul.f90 (-274) 
- (modified) flang/test/Lower/OpenMP/default-clause.f90 (+1-1) 
- (modified) flang/test/Lower/OpenMP/wsloop-reduction-add-hlfir.f90 (+37-28) 
- (modified) flang/test/Lower/OpenMP/wsloop-reduction-add.f90 (+312-187) 
- (modified) flang/test/Lower/OpenMP/wsloop-reduction-iand.f90 (+40-24) 
- (modified) flang/test/Lower/OpenMP/wsloop-reduction-ieor.f90 (+6-3) 
- (modified) flang/test/Lower/OpenMP/wsloop-reduction-ior.f90 (+41-24) 
- (modified) flang/test/Lower/OpenMP/wsloop-reduction-logical-and.f90 (+158-97) 
- (modified) flang/test/Lower/OpenMP/wsloop-reduction-logical-eqv.f90 (+153-96) 
- (modified) flang/test/Lower/OpenMP/wsloop-reduction-logical-neqv.f90 (+158-96) 
- (modified) flang/test/Lower/OpenMP/wsloop-reduction-logical-or.f90 (+155-96) 
- (modified) flang/test/Lower/OpenMP/wsloop-reduction-max-2.f90 (+2-1) 
- (modified) flang/test/Lower/OpenMP/wsloop-reduction-max-hlfir.f90 (+41-19) 
- (modified) flang/test/Lower/OpenMP/wsloop-reduction-max.f90 (+105-48) 
- (modified) flang/test/Lower/OpenMP/wsloop-reduction-min.f90 (+107-48) 
- (modified) flang/test/Lower/OpenMP/wsloop-reduction-mul.f90 (+282-186) 
- (modified) mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td (+3-6) 
- (modified) mlir/lib/Conversion/SCFToOpenMP/SCFToOpenMP.cpp (+31-1) 
- (modified) mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp (+90-11) 
- (modified) mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp (+14-24) 
- (modified) mlir/test/Conversion/OpenMPToLLVM/convert-to-llvmir.mlir (+10-4) 
- (modified) mlir/test/Conversion/SCFToOpenMP/reductions.mlir (+22-8) 
- (modified) mlir/test/Dialect/OpenMP/invalid.mlir (+3-32) 
- (modified) mlir/test/Dialect/OpenMP/ops.mlir (+17-9) 
- (modified) mlir/test/Target/LLVMIR/openmp-reduction.mlir (+30-14) 


``````````diff

diff --git a/flang/lib/Lower/OpenMP.cpp b/flang/lib/Lower/OpenMP.cpp
index fcf10b26c135b..74cd6c27b3440 100644
--- a/flang/lib/Lower/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP.cpp
@@ -2274,6 +2274,12 @@ static void createBodyOfOp(
     return undef.getDefiningOp();
   };
 
+  llvm::SmallVector<mlir::Type> blockArgTypes;
+  llvm::SmallVector<mlir::Location> blockArgLocs;
+  blockArgTypes.reserve(loopArgs.size() + reductionArgs.size());
+  blockArgLocs.reserve(blockArgTypes.size());
+  mlir::Block *entryBlock;
+
   // If an argument for the region is provided then create the block with that
   // argument. Also update the symbol's address with the mlir argument value.
   // e.g. For loops the argument is the induction variable. And all further
@@ -2283,11 +2289,21 @@ static void createBodyOfOp(
     for (const Fortran::semantics::Symbol *arg : loopArgs)
       loopVarTypeSize = std::max(loopVarTypeSize, arg->GetUltimate().size());
     mlir::Type loopVarType = getLoopVarType(converter, loopVarTypeSize);
-    llvm::SmallVector<mlir::Type> tiv(loopArgs.size(), loopVarType);
-    llvm::SmallVector<mlir::Location> locs(loopArgs.size(), loc);
-    firOpBuilder.createBlock(&op.getRegion(), {}, tiv, locs);
-    // The argument is not currently in memory, so make a temporary for the
-    // argument, and store it there, then bind that location to the argument.
+    std::fill_n(std::back_inserter(blockArgTypes), loopArgs.size(),
+                loopVarType);
+    std::fill_n(std::back_inserter(blockArgLocs), loopArgs.size(), loc);
+  }
+  if (reductionArgs.size()) {
+    llvm::copy(reductionTypes, std::back_inserter(blockArgTypes));
+    std::fill_n(std::back_inserter(blockArgLocs), reductionArgs.size(), loc);
+  }
+
+  entryBlock = firOpBuilder.createBlock(&op.getRegion(), {}, blockArgTypes,
+                                        blockArgLocs);
+
+  // The argument is not currently in memory, so make a temporary for the
+  // argument, and store it there, then bind that location to the argument.
+  if (loopArgs.size()) {
     mlir::Operation *storeOp = nullptr;
     for (auto [argIndex, argSymbol] : llvm::enumerate(loopArgs)) {
       mlir::Value indexVal =
@@ -2296,16 +2312,12 @@ static void createBodyOfOp(
           createAndSetPrivatizedLoopVar(converter, loc, indexVal, argSymbol);
     }
     firOpBuilder.setInsertionPointAfter(storeOp);
-  } else if (reductionArgs.size()) {
-    llvm::SmallVector<mlir::Location> locs(reductionArgs.size(), loc);
-    auto block =
-        firOpBuilder.createBlock(&op.getRegion(), {}, reductionTypes, locs);
-    for (auto [arg, prv] :
-         llvm::zip_equal(reductionArgs, block->getArguments())) {
-      converter.bindSymbol(*arg, prv);
-    }
-  } else {
-    firOpBuilder.createBlock(&op.getRegion());
+  }
+  // Bind the reduction arguments to their block arguments
+  for (auto [arg, prv] : llvm::zip_equal(
+           reductionArgs,
+           llvm::drop_begin(entryBlock->getArguments(), loopArgs.size()))) {
+    converter.bindSymbol(*arg, prv);
   }
 
   // Mark the earliest insertion point.
@@ -3293,6 +3305,7 @@ static void createWsLoop(Fortran::lower::AbstractConverter &converter,
   llvm::SmallVector<mlir::Value> linearVars, linearStepVars;
   llvm::SmallVector<const Fortran::semantics::Symbol *> iv;
   llvm::SmallVector<mlir::Attribute> reductionDeclSymbols;
+  llvm::SmallVector<const Fortran::semantics::Symbol *> reductionSymbols;
   mlir::omp::ClauseOrderKindAttr orderClauseOperand;
   mlir::omp::ClauseScheduleKindAttr scheduleValClauseOperand;
   mlir::UnitAttr nowaitClauseOperand, scheduleSimdClauseOperand;
@@ -3304,7 +3317,8 @@ static void createWsLoop(Fortran::lower::AbstractConverter &converter,
   cp.processCollapse(loc, eval, lowerBound, upperBound, step, iv,
                      loopVarTypeSize);
   cp.processScheduleChunk(stmtCtx, scheduleChunkClauseOperand);
-  cp.processReduction(loc, reductionVars, reductionDeclSymbols);
+  cp.processReduction(loc, reductionVars, reductionDeclSymbols,
+                      &reductionSymbols);
   cp.processTODO<Fortran::parser::OmpClause::Linear,
                  Fortran::parser::OmpClause::Order>(loc, ompDirective);
 
@@ -3347,9 +3361,14 @@ static void createWsLoop(Fortran::lower::AbstractConverter &converter,
 
   auto *nestedEval = getCollapsedLoopEval(
       eval, Fortran::lower::getCollapseValue(beginClauseList));
+  llvm::SmallVector<mlir::Type> reductionTypes;
+  reductionTypes.reserve(reductionVars.size());
+  llvm::transform(reductionVars, std::back_inserter(reductionTypes),
+                  [](mlir::Value v) { return v.getType(); });
   createBodyOfOp<mlir::omp::WsLoopOp>(wsLoopOp, converter, loc, *nestedEval,
                                       /*genNested=*/true, &beginClauseList, iv,
-                                      /*outer=*/false, &dsp);
+                                      /*outer=*/false, &dsp, reductionSymbols,
+                                      reductionTypes);
 }
 
 static void createSimdWsLoop(
@@ -3450,12 +3469,11 @@ static void genOMP(Fortran::lower::AbstractConverter &converter,
     // 2.9.3.1 SIMD construct
     createSimdLoop(converter, eval, ompDirective, loopOpClauseList,
                    currentLocation);
+    genOpenMPReduction(converter, loopOpClauseList);
   } else {
     createWsLoop(converter, eval, ompDirective, loopOpClauseList, endClauseList,
                  currentLocation);
   }
-
-  genOpenMPReduction(converter, loopOpClauseList);
 }
 
 static void
diff --git a/flang/test/Fir/convert-to-llvm-openmp-and-fir.fir b/flang/test/Fir/convert-to-llvm-openmp-and-fir.fir
index 6efa4d0a09586..d6efdcc98b273 100644
--- a/flang/test/Fir/convert-to-llvm-openmp-and-fir.fir
+++ b/flang/test/Fir/convert-to-llvm-openmp-and-fir.fir
@@ -701,10 +701,17 @@ func.func @_QPsb() {
 // CHECK-SAME: %[[ARRAY_REF:.*]]: !llvm.ptr
 // CHECK:    %[[RED_ACCUMULATOR:.*]] = llvm.alloca %2 x i32 {bindc_name = "x"} : (i64) -> !llvm.ptr
 // CHECK:    omp.parallel   {
-// CHECK:      omp.wsloop   reduction(@[[EQV_REDUCTION]] -> %[[RED_ACCUMULATOR]] : !llvm.ptr) for
+// CHECK:      omp.wsloop   reduction(@[[EQV_REDUCTION]] %[[RED_ACCUMULATOR]] -> %[[PRV:.+]] : !llvm.ptr) for
 // CHECK:        %[[ARRAY_ELEM_REF:.*]] = llvm.getelementptr %[[ARRAY_REF]][0, %{{.*}}] : (!llvm.ptr, i64) -> !llvm.ptr
 // CHECK:        %[[ARRAY_ELEM:.*]] = llvm.load %[[ARRAY_ELEM_REF]] : !llvm.ptr -> i32
-// CHECK:        omp.reduction %[[ARRAY_ELEM]], %[[RED_ACCUMULATOR]] : i32, !llvm.ptr
+// CHECK:        %[[LPRV:.+]] = llvm.load %[[PRV]] : !llvm.ptr -> i32
+// CHECK:        %[[ZERO_1:.*]] = llvm.mlir.constant(0 : i64) : i32
+// CHECK:        %[[ARGVAL_1:.*]] = llvm.icmp "ne" %[[LPRV]], %[[ZERO_1]] : i32
+// CHECK:        %[[ZERO_2:.*]] = llvm.mlir.constant(0 : i64) : i32
+// CHECK:        %[[ARGVAL_2:.*]] = llvm.icmp "ne" %[[ARRAY_ELEM]], %[[ZERO_2]] : i32
+// CHECK:        %[[RES:.*]] = llvm.icmp "eq" %[[ARGVAL_2]], %[[ARGVAL_1]] : i1
+// CHECK:        %[[RES_EXT:.*]] = llvm.zext %[[RES]] : i1 to i32
+// CHECK:        llvm.store %[[RES_EXT]], %[[PRV]] : i32, !llvm.ptr
 // CHECK:        omp.yield
 // CHECK:      omp.terminator
 // CHECK:    llvm.return
@@ -733,7 +740,7 @@ func.func @_QPsimple_reduction(%arg0: !fir.ref<!fir.array<100x!fir.logical<4>>>
     %c1_i32 = arith.constant 1 : i32
     %c100_i32 = arith.constant 100 : i32
     %c1_i32_0 = arith.constant 1 : i32
-    omp.wsloop   reduction(@eqv_reduction -> %1 : !fir.ref<!fir.logical<4>>) for  (%arg1) : i32 = (%c1_i32) to (%c100_i32) inclusive step (%c1_i32_0) {
+    omp.wsloop   reduction(@eqv_reduction %1 -> %prv : !fir.ref<!fir.logical<4>>) for  (%arg1) : i32 = (%c1_i32) to (%c100_i32) inclusive step (%c1_i32_0) {
       fir.store %arg1 to %3 : !fir.ref<i32>
       %4 = fir.load %3 : !fir.ref<i32>
       %5 = fir.convert %4 : (i32) -> i64
@@ -741,7 +748,12 @@ func.func @_QPsimple_reduction(%arg0: !fir.ref<!fir.array<100x!fir.logical<4>>>
       %6 = arith.subi %5, %c1_i64 : i64
       %7 = fir.coordinate_of %arg0, %6 : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
       %8 = fir.load %7 : !fir.ref<!fir.logical<4>>
-      omp.reduction %8, %1 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+      %lprv = fir.load %prv : !fir.ref<!fir.logical<4>>
+      %lprv1 = fir.convert %lprv : (!fir.logical<4>) -> i1
+      %9 = fir.convert %8 : (!fir.logical<4>) -> i1
+      %10 = arith.cmpi eq, %9, %lprv1 : i1
+      %11 = fir.convert %10 : (i1) -> !fir.logical<4>
+      fir.store %11 to %prv : !fir.ref<!fir.logical<4>>
       omp.yield
     }
     omp.terminator
diff --git a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-add.f90 b/flang/test/Lower/OpenMP/FIR/wsloop-reduction-add.f90
index 62d9af31588e9..5664529416fe8 100644
--- a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-add.f90
+++ b/flang/test/Lower/OpenMP/FIR/wsloop-reduction-add.f90
@@ -1,66 +1,79 @@
 ! RUN: bbc -emit-fir -hlfir=false -fopenmp %s -o - | FileCheck %s
 ! RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp %s -o - | FileCheck %s
+! NOTE: Assertions have been autogenerated by utils/generate-test-checks.py
 
-!CHECK-LABEL: omp.reduction.declare
-!CHECK-SAME: @[[RED_F64_NAME:.*]] : f64 init {
-!CHECK: ^bb0(%{{.*}}: f64):
-!CHECK:  %[[C0_1:.*]] = arith.constant 0.000000e+00 : f64
-!CHECK:  omp.yield(%[[C0_1]] : f64)
-!CHECK: } combiner {
-!CHECK: ^bb0(%[[ARG0:.*]]: f64, %[[ARG1:.*]]: f64):
-!CHECK:  %[[RES:.*]] = arith.addf %[[ARG0]], %[[ARG1]] {{.*}}: f64
-!CHECK:  omp.yield(%[[RES]] : f64)
-!CHECK: }
+! The script is designed to make adding checks to
+! a test case fast, it is *not* designed to be authoritative
+! about what constitutes a good test! The CHECK should be
+! minimized and named to reflect the test intent.
 
-!CHECK-LABEL: omp.reduction.declare
-!CHECK-SAME: @[[RED_I64_NAME:.*]] : i64 init {
-!CHECK: ^bb0(%{{.*}}: i64):
-!CHECK:  %[[C0_1:.*]] = arith.constant 0 : i64
-!CHECK:  omp.yield(%[[C0_1]] : i64)
-!CHECK: } combiner {
-!CHECK: ^bb0(%[[ARG0:.*]]: i64, %[[ARG1:.*]]: i64):
-!CHECK:  %[[RES:.*]] = arith.addi %[[ARG0]], %[[ARG1]] : i64
-!CHECK:  omp.yield(%[[RES]] : i64)
-!CHECK: }
+! CHECK-LABEL:   omp.reduction.declare @add_reduction_f_64 : f64 init {
+! CHECK:         ^bb0(%[[VAL_0:.*]]: f64):
+! CHECK:           %[[VAL_1:.*]] = arith.constant 0.000000e+00 : f64
+! CHECK:           omp.yield(%[[VAL_1]] : f64)
 
-!CHECK-LABEL: omp.reduction.declare
-!CHECK-SAME: @[[RED_F32_NAME:.*]] : f32 init {
-!CHECK: ^bb0(%{{.*}}: f32):
-!CHECK:  %[[C0_1:.*]] = arith.constant 0.000000e+00 : f32
-!CHECK:  omp.yield(%[[C0_1]] : f32)
-!CHECK: } combiner {
-!CHECK: ^bb0(%[[ARG0:.*]]: f32, %[[ARG1:.*]]: f32):
-!CHECK:  %[[RES:.*]] = arith.addf %[[ARG0]], %[[ARG1]] {{.*}}: f32
-!CHECK:  omp.yield(%[[RES]] : f32)
-!CHECK: }
+! CHECK-LABEL:   } combiner {
+! CHECK:         ^bb0(%[[VAL_0:.*]]: f64, %[[VAL_1:.*]]: f64):
+! CHECK:           %[[VAL_2:.*]] = arith.addf %[[VAL_0]], %[[VAL_1]] fastmath<contract> : f64
+! CHECK:           omp.yield(%[[VAL_2]] : f64)
+! CHECK:         }
 
-!CHECK-LABEL: omp.reduction.declare
-!CHECK-SAME: @[[RED_I32_NAME:.*]] : i32 init {
-!CHECK: ^bb0(%{{.*}}: i32):
-!CHECK:  %[[C0_1:.*]] = arith.constant 0 : i32
-!CHECK:  omp.yield(%[[C0_1]] : i32)
-!CHECK: } combiner {
-!CHECK: ^bb0(%[[ARG0:.*]]: i32, %[[ARG1:.*]]: i32):
-!CHECK:  %[[RES:.*]] = arith.addi %[[ARG0]], %[[ARG1]] : i32
-!CHECK:  omp.yield(%[[RES]] : i32)
-!CHECK: }
+! CHECK-LABEL:   omp.reduction.declare @add_reduction_i_64 : i64 init {
+! CHECK:         ^bb0(%[[VAL_0:.*]]: i64):
+! CHECK:           %[[VAL_1:.*]] = arith.constant 0 : i64
+! CHECK:           omp.yield(%[[VAL_1]] : i64)
+
+! CHECK-LABEL:   } combiner {
+! CHECK:         ^bb0(%[[VAL_0:.*]]: i64, %[[VAL_1:.*]]: i64):
+! CHECK:           %[[VAL_2:.*]] = arith.addi %[[VAL_0]], %[[VAL_1]] : i64
+! CHECK:           omp.yield(%[[VAL_2]] : i64)
+! CHECK:         }
+
+! CHECK-LABEL:   omp.reduction.declare @add_reduction_f_32 : f32 init {
+! CHECK:         ^bb0(%[[VAL_0:.*]]: f32):
+! CHECK:           %[[VAL_1:.*]] = arith.constant 0.000000e+00 : f32
+! CHECK:           omp.yield(%[[VAL_1]] : f32)
+
+! CHECK-LABEL:   } combiner {
+! CHECK:         ^bb0(%[[VAL_0:.*]]: f32, %[[VAL_1:.*]]: f32):
+! CHECK:           %[[VAL_2:.*]] = arith.addf %[[VAL_0]], %[[VAL_1]] fastmath<contract> : f32
+! CHECK:           omp.yield(%[[VAL_2]] : f32)
+! CHECK:         }
+
+! CHECK-LABEL:   omp.reduction.declare @add_reduction_i_32 : i32 init {
+! CHECK:         ^bb0(%[[VAL_0:.*]]: i32):
+! CHECK:           %[[VAL_1:.*]] = arith.constant 0 : i32
+! CHECK:           omp.yield(%[[VAL_1]] : i32)
+
+! CHECK-LABEL:   } combiner {
+! CHECK:         ^bb0(%[[VAL_0:.*]]: i32, %[[VAL_1:.*]]: i32):
+! CHECK:           %[[VAL_2:.*]] = arith.addi %[[VAL_0]], %[[VAL_1]] : i32
+! CHECK:           omp.yield(%[[VAL_2]] : i32)
+! CHECK:         }
+
+! CHECK-LABEL:   func.func @_QPsimple_int_reduction() {
+! CHECK:           %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_int_reductionEi"}
+! CHECK:           %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFsimple_int_reductionEx"}
+! CHECK:           %[[VAL_2:.*]] = arith.constant 0 : i32
+! CHECK:           fir.store %[[VAL_2]] to %[[VAL_1]] : !fir.ref<i32>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_3:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_4:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_5:.*]] = arith.constant 100 : i32
+! CHECK:             %[[VAL_6:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop reduction(@add_reduction_i_32 %[[VAL_1]] -> %[[VAL_7:.*]] : !fir.ref<i32>)  for  (%[[VAL_8:.*]]) : i32 = (%[[VAL_4]]) to (%[[VAL_5]]) inclusive step (%[[VAL_6]]) {
+! CHECK:               fir.store %[[VAL_8]] to %[[VAL_3]] : !fir.ref<i32>
+! CHECK:               %[[VAL_9:.*]] = fir.load %[[VAL_7]] : !fir.ref<i32>
+! CHECK:               %[[VAL_10:.*]] = fir.load %[[VAL_3]] : !fir.ref<i32>
+! CHECK:               %[[VAL_11:.*]] = arith.addi %[[VAL_9]], %[[VAL_10]] : i32
+! CHECK:               fir.store %[[VAL_11]] to %[[VAL_7]] : !fir.ref<i32>
+! CHECK:               omp.yield
+! CHECK:             }
+! CHECK:             omp.terminator
+! CHECK:           }
+! CHECK:           return
+! CHECK:         }
 
-!CHECK-LABEL: func.func @_QPsimple_int_reduction
-!CHECK:  %[[XREF:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFsimple_int_reductionEx"}
-!CHECK:  %[[C0_2:.*]] = arith.constant 0 : i32
-!CHECK:  fir.store %[[C0_2]] to %[[XREF]] : !fir.ref<i32>
-!CHECK:  omp.parallel
-!CHECK:    %[[I_PVT_REF:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
-!CHECK:    %[[C1_1:.*]] = arith.constant 1 : i32
-!CHECK:    %[[C100:.*]] = arith.constant 100 : i32
-!CHECK:    %[[C1_2:.*]] = arith.constant 1 : i32
-!CHECK:    omp.wsloop   reduction(@[[RED_I32_NAME]] -> %[[XREF]] : !fir.ref<i32>) for  (%[[IVAL:.*]]) : i32 = (%[[C1_1]]) to (%[[C100]]) inclusive step (%[[C1_2]])
-!CHECK:      fir.store %[[IVAL]] to %[[I_PVT_REF]] : !fir.ref<i32>
-!CHECK:      %[[I_PVT_VAL:.*]] = fir.load %[[I_PVT_REF]] : !fir.ref<i32>
-!CHECK:      omp.reduction %[[I_PVT_VAL]], %[[XREF]] : i32, !fir.ref<i32>
-!CHECK:      omp.yield
-!CHECK:    omp.terminator
-!CHECK:  return
 subroutine simple_int_reduction
   integer :: x
   x = 0
@@ -73,23 +86,31 @@ subroutine simple_int_reduction
   !$omp end parallel
 end subroutine
 
-!CHECK-LABEL: func.func @_QPsimple_real_reduction
-!CHECK:  %[[XREF:.*]] = fir.alloca f32 {bindc_name = "x", uniq_name = "_QFsimple_real_reductionEx"}
-!CHECK:  %[[C0_2:.*]] = arith.constant 0.000000e+00 : f32
-!CHECK:  fir.store %[[C0_2]] to %[[XREF]] : !fir.ref<f32>
-!CHECK:  omp.parallel
-!CHECK:    %[[I_PVT_REF:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
-!CHECK:    %[[C1_1:.*]] = arith.constant 1 : i32
-!CHECK:    %[[C100:.*]] = arith.constant 100 : i32
-!CHECK:    %[[C1_2:.*]] = arith.constant 1 : i32
-!CHECK:    omp.wsloop   reduction(@[[RED_F32_NAME]] -> %[[XREF]] : !fir.ref<f32>) for  (%[[IVAL:.*]]) : i32 = (%[[C1_1]]) to (%[[C100]]) inclusive step (%[[C1_2]])
-!CHECK:      fir.store %[[IVAL]] to %[[I_PVT_REF]] : !fir.ref<i32>
-!CHECK:      %[[I_PVT_VAL_i32:.*]] = fir.load %[[I_PVT_REF]] : !fir.ref<i32>
-!CHECK:      %[[I_PVT_VAL_f32:.*]] = fir.convert %[[I_PVT_VAL_i32]] : (i32) -> f32
-!CHECK:      omp.reduction %[[I_PVT_VAL_f32]], %[[XREF]] : f32, !fir.ref<f32>
-!CHECK:      omp.yield
-!CHECK:    omp.terminator
-!CHECK:  return
+
+! CHECK-LABEL:   func.func @_QPsimple_real_reduction() {
+! CHECK:           %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_real_reductionEi"}
+! CHECK:           %[[VAL_1:.*]] = fir.alloca f32 {bindc_name = "x", uniq_name = "_QFsimple_real_reductionEx"}
+! CHECK:           %[[VAL_2:.*]] = arith.constant 0.000000e+00 : f32
+! CHECK:           fir.store %[[VAL_2]] to %[[VAL_1]] : !fir.ref<f32>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_3:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_4:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_5:.*]] = arith.constant 100 : i32
+! CHECK:             %[[VAL_6:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop reduction(@add_reduction_f_32 %[[VAL_1]] -> %[[VAL_7:.*]] : !fir.ref<f32>)  for  (%[[VAL_8:.*]]) : i32 = (%[[VAL_4]]) to (%[[VAL_5]]) inclusive step (%[[VAL_6]]) {
+! CHECK:               fir.store %[[VAL_8]] to %[[VAL_3]] : !fir.ref<i32>
+! CHECK:               %[[VAL_9:.*]] = fir.load %[[VAL_7]] : !fir.ref<f32>
+! CHECK:               %[[VAL_10:.*]] = fir.load %[[VAL_3]] : !fir.ref<i32>
+! CHECK:               %[[VAL_11:.*]] = fir.convert %[[VAL_10]] : (i32) -> f32
+! CHECK:               %[[VAL_12:.*]] = arith.addf %[[VAL_9]], %[[VAL_11]] fastmath<contract> : f32
+! CHECK:               fir.store %[[VAL_12]] to %[[VAL_7]] : !fir.ref<f32>
+! CHECK:               omp.yield
+! CHECK:             }
+! CHECK:             omp.terminator
+! CHECK:           }
+! CHECK:           return
+! CHECK:         }
+
 subroutine simple_real_reduction
   real :: x
   x = 0.0
@@ -102,22 +123,29 @@ subroutine simple_real_reduction
   !$omp end parallel
 end subroutine
 
-!CHECK-LABEL: func.func @_QPsimple_int_reduction_switch_order
-!CHECK:  %[[XREF:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFsimple_int_reduction_switch_orderEx"}
-!CHECK:  %[[C0_2:.*]] = arith.constant 0 : i32
-!CHECK:  fir.store %[[C0_2]] to %[[XREF]] : !fir.ref<i32>
-!CHECK:  omp.parallel
-!CHECK:    %[[I_PVT_REF:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
-!CHECK:    %[[C1_1:.*]] = arith.constant 1 : i32
-!CHECK:    %[[C100:.*]] = arith.constant 100 : i32
-!CHECK:    %[[C1_2:.*]] = arith.constant 1 : i32
-!CHECK:    omp.wsloop   reduction(@[[RED_I32_NAME]] -> %[[XREF]] : !fir.ref<i32>) for  (%[[IVAL:.*]]) : i32 = (%[[C1_1]]) to (%[[C100]]) inclusive step (%[[C1_2]])
-!CHECK:      fir.store %[[IVAL]] to %[[I_PVT_REF]] : !fir.ref<i32>
-!CHECK:      %[[I_PVT_VAL:.*]] = fir.load %[[I_PVT_REF]] : !fir.ref<i32>
-!CHECK:      omp.reduction %[[I_PVT_VAL]], %[[XREF]] : i32, !fir.ref<i32>
-!CHECK:      omp.yield
-!CHECK:    omp.terminator
-!CHECK:  return
+! CHECK-LABEL:   func.func @_QPsimple_int_reduction_switch_order() {
+! CHECK:           %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_int_reduction_switch_orderEi"}
+! CHECK:           %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFsimple_int_reduction_switch_orderEx"}
+! CHECK:           %[[VAL_2:.*]] = arith.constant 0 : i32
+! CHECK:           fir.store %[[VAL_2]] to %[[VAL_1]] : !fir.ref<i32>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_3:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_4:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_5:.*]] = arith.constant 100 : i32
+! CHECK:             %[[VAL_6:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop reduction(@add_reduction_i_32 %[[VAL_1]] -> %[[VAL_7:.*]] : !fir.ref<i32>)  for  (%[[VAL_8:.*]]) : i32 = (%[[VAL_4]]) to (%[[VAL_5]]) inclusive step (%[[VAL_6]]) {
+! CHECK:               fir.store %[[VAL_8]] to %[[VAL_3]] : !fir.ref<i32>
+! CHECK:               %[[VAL_9:.*]] = fir.load %[[VAL_3]] : !fir.ref<i32>
+! CHECK:               %[[VAL_10:.*]] = fir.load %[[VAL_7]] : !fir.ref<i32>
+! CHECK:               %[[VAL_11:.*]] = arith.addi %[[VAL_9]], %[[VAL_10]] : i32
+! CHECK:               fir.store %[[VAL_11]] to ...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/80019