[llvm-branch-commits] [flang] [Flang][OpenMP] Move loop privatization out of dispatch (PR #106066)

Sergio Afonso via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Thu Aug 29 02:45:28 PDT 2024


https://github.com/skatrak updated https://github.com/llvm/llvm-project/pull/106066

From c28e6de6fca8fe9142045dc81d5aa48e821524df Mon Sep 17 00:00:00 2001
From: Sergio Afonso <safonsof at amd.com>
Date: Mon, 26 Aug 2024 13:29:52 +0100
Subject: [PATCH] [Flang][OpenMP] Move loop privatization out of dispatch

This patch moves the creation of `DataSharingProcessor` instances for loop
constructs out of `genOMPDispatch()` and into their corresponding codegen
functions. This is a necessary first step toward properly handling
privatization of composite constructs.

Some tests are updated because the order of clause processing and
privatization has changed: privatization now runs after clause processing, so
the privatized allocas appear later in the lowered FIR.
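
For reference, each loop construct's codegen function now creates and runs its
own processor right before generating the loop nest, following the pattern
added by this patch to genStandaloneDistribute(), genStandaloneDo(),
genStandaloneSimd(), genCompositeDistributeSimd() and genCompositeDoSimd()
(shown here as it appears in genStandaloneDo() in the diff below; the
composite variants pass the innermost leaf's clauses, e.g. simdItem->clauses):

  // TODO: Support delayed privatization.
  DataSharingProcessor dsp(converter, semaCtx, item->clauses, eval,
                           /*shouldCollectPreDeterminedSymbols=*/true,
                           /*useDelayedPrivatization=*/false, &symTable);
  dsp.processStep1();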
---
 flang/lib/Lower/OpenMP/OpenMP.cpp             | 142 +++++++++++-------
 .../test/Lower/OpenMP/parallel-reduction3.f90 |  14 +-
 .../wsloop-reduction-array-assumed-shape.f90  |  14 +-
 .../Lower/OpenMP/wsloop-reduction-array.f90   |  18 +--
 .../Lower/OpenMP/wsloop-reduction-array2.f90  |  18 +--
 .../wsloop-reduction-multiple-clauses.f90     |  22 +--
 6 files changed, 127 insertions(+), 101 deletions(-)

diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp
index 0f7eafdc9a4736..3a48b370e6e150 100644
--- a/flang/lib/Lower/OpenMP/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP/OpenMP.cpp
@@ -1044,7 +1044,6 @@ static void genDistributeClauses(lower::AbstractConverter &converter,
   cp.processAllocate(clauseOps);
   cp.processDistSchedule(stmtCtx, clauseOps);
   cp.processOrder(clauseOps);
-  // TODO Support delayed privatization.
 }
 
 static void genFlushClauses(lower::AbstractConverter &converter,
@@ -1128,7 +1127,6 @@ static void genSimdClauses(lower::AbstractConverter &converter,
   cp.processSafelen(clauseOps);
   cp.processSimdlen(clauseOps);
 
-  // TODO Support delayed privatization.
   cp.processTODO<clause::Linear, clause::Nontemporal>(
       loc, llvm::omp::Directive::OMPD_simd);
 }
@@ -1299,7 +1297,6 @@ static void genWsloopClauses(
   cp.processOrdered(clauseOps);
   cp.processReduction(loc, clauseOps, &reductionTypes, &reductionSyms);
   cp.processSchedule(stmtCtx, clauseOps);
-  // TODO Support delayed privatization.
 
   cp.processTODO<clause::Allocate, clause::Linear>(
       loc, llvm::omp::Directive::OMPD_do);
@@ -1924,17 +1921,25 @@ genTeamsOp(lower::AbstractConverter &converter, lower::SymMap &symTable,
 // also be a leaf of a composite construct
 //===----------------------------------------------------------------------===//
 
-static void genStandaloneDistribute(
-    lower::AbstractConverter &converter, lower::SymMap &symTable,
-    semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval,
-    mlir::Location loc, const ConstructQueue &queue,
-    ConstructQueue::const_iterator item, DataSharingProcessor &dsp) {
+static void genStandaloneDistribute(lower::AbstractConverter &converter,
+                                    lower::SymMap &symTable,
+                                    semantics::SemanticsContext &semaCtx,
+                                    lower::pft::Evaluation &eval,
+                                    mlir::Location loc,
+                                    const ConstructQueue &queue,
+                                    ConstructQueue::const_iterator item) {
   lower::StatementContext stmtCtx;
 
   mlir::omp::DistributeOperands distributeClauseOps;
   genDistributeClauses(converter, semaCtx, stmtCtx, item->clauses, loc,
                        distributeClauseOps);
 
+  // TODO: Support delayed privatization.
+  DataSharingProcessor dsp(converter, semaCtx, item->clauses, eval,
+                           /*shouldCollectPreDeterminedSymbols=*/true,
+                           /*useDelayedPrivatization=*/false, &symTable);
+  dsp.processStep1();
+
   mlir::omp::LoopNestOperands loopNestClauseOps;
   llvm::SmallVector<const semantics::Symbol *> iv;
   genLoopNestClauses(converter, semaCtx, eval, item->clauses, loc,
@@ -1955,8 +1960,7 @@ static void genStandaloneDo(lower::AbstractConverter &converter,
                             semantics::SemanticsContext &semaCtx,
                             lower::pft::Evaluation &eval, mlir::Location loc,
                             const ConstructQueue &queue,
-                            ConstructQueue::const_iterator item,
-                            DataSharingProcessor &dsp) {
+                            ConstructQueue::const_iterator item) {
   lower::StatementContext stmtCtx;
 
   mlir::omp::WsloopOperands wsloopClauseOps;
@@ -1965,6 +1969,12 @@ static void genStandaloneDo(lower::AbstractConverter &converter,
   genWsloopClauses(converter, semaCtx, stmtCtx, item->clauses, loc,
                    wsloopClauseOps, reductionTypes, reductionSyms);
 
+  // TODO: Support delayed privatization.
+  DataSharingProcessor dsp(converter, semaCtx, item->clauses, eval,
+                           /*shouldCollectPreDeterminedSymbols=*/true,
+                           /*useDelayedPrivatization=*/false, &symTable);
+  dsp.processStep1();
+
   mlir::omp::LoopNestOperands loopNestClauseOps;
   llvm::SmallVector<const semantics::Symbol *> iv;
   genLoopNestClauses(converter, semaCtx, eval, item->clauses, loc,
@@ -2004,11 +2014,16 @@ static void genStandaloneSimd(lower::AbstractConverter &converter,
                               semantics::SemanticsContext &semaCtx,
                               lower::pft::Evaluation &eval, mlir::Location loc,
                               const ConstructQueue &queue,
-                              ConstructQueue::const_iterator item,
-                              DataSharingProcessor &dsp) {
+                              ConstructQueue::const_iterator item) {
   mlir::omp::SimdOperands simdClauseOps;
   genSimdClauses(converter, semaCtx, item->clauses, loc, simdClauseOps);
 
+  // TODO: Support delayed privatization.
+  DataSharingProcessor dsp(converter, semaCtx, item->clauses, eval,
+                           /*shouldCollectPreDeterminedSymbols=*/true,
+                           /*useDelayedPrivatization=*/false, &symTable);
+  dsp.processStep1();
+
   mlir::omp::LoopNestOperands loopNestClauseOps;
   llvm::SmallVector<const semantics::Symbol *> iv;
   genLoopNestClauses(converter, semaCtx, eval, item->clauses, loc,
@@ -2024,11 +2039,13 @@ static void genStandaloneSimd(lower::AbstractConverter &converter,
                 llvm::omp::Directive::OMPD_simd, dsp);
 }
 
-static void genStandaloneTaskloop(
-    lower::AbstractConverter &converter, lower::SymMap &symTable,
-    semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval,
-    mlir::Location loc, const ConstructQueue &queue,
-    ConstructQueue::const_iterator item, DataSharingProcessor &dsp) {
+static void genStandaloneTaskloop(lower::AbstractConverter &converter,
+                                  lower::SymMap &symTable,
+                                  semantics::SemanticsContext &semaCtx,
+                                  lower::pft::Evaluation &eval,
+                                  mlir::Location loc,
+                                  const ConstructQueue &queue,
+                                  ConstructQueue::const_iterator item) {
   TODO(loc, "Taskloop construct");
 }
 
@@ -2040,7 +2057,7 @@ static void genCompositeDistributeParallelDo(
     lower::AbstractConverter &converter, lower::SymMap &symTable,
     semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval,
     mlir::Location loc, const ConstructQueue &queue,
-    ConstructQueue::const_iterator item, DataSharingProcessor &dsp) {
+    ConstructQueue::const_iterator item) {
   assert(std::distance(item, queue.end()) == 3 && "Invalid leaf constructs");
   TODO(loc, "Composite DISTRIBUTE PARALLEL DO");
 }
@@ -2049,16 +2066,18 @@ static void genCompositeDistributeParallelDoSimd(
     lower::AbstractConverter &converter, lower::SymMap &symTable,
     semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval,
     mlir::Location loc, const ConstructQueue &queue,
-    ConstructQueue::const_iterator item, DataSharingProcessor &dsp) {
+    ConstructQueue::const_iterator item) {
   assert(std::distance(item, queue.end()) == 4 && "Invalid leaf constructs");
   TODO(loc, "Composite DISTRIBUTE PARALLEL DO SIMD");
 }
 
-static void genCompositeDistributeSimd(
-    lower::AbstractConverter &converter, lower::SymMap &symTable,
-    semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval,
-    mlir::Location loc, const ConstructQueue &queue,
-    ConstructQueue::const_iterator item, DataSharingProcessor &dsp) {
+static void genCompositeDistributeSimd(lower::AbstractConverter &converter,
+                                       lower::SymMap &symTable,
+                                       semantics::SemanticsContext &semaCtx,
+                                       lower::pft::Evaluation &eval,
+                                       mlir::Location loc,
+                                       const ConstructQueue &queue,
+                                       ConstructQueue::const_iterator item) {
   lower::StatementContext stmtCtx;
 
   assert(std::distance(item, queue.end()) == 2 && "Invalid leaf constructs");
@@ -2073,6 +2092,12 @@ static void genCompositeDistributeSimd(
   mlir::omp::SimdOperands simdClauseOps;
   genSimdClauses(converter, semaCtx, simdItem->clauses, loc, simdClauseOps);
 
+  // TODO: Support delayed privatization.
+  DataSharingProcessor dsp(converter, semaCtx, simdItem->clauses, eval,
+                           /*shouldCollectPreDeterminedSymbols=*/true,
+                           /*useDelayedPrivatization=*/false, &symTable);
+  dsp.processStep1();
+
   // Pass the innermost leaf construct's clauses because that's where COLLAPSE
   // is placed by construct decomposition.
   mlir::omp::LoopNestOperands loopNestClauseOps;
@@ -2109,8 +2134,7 @@ static void genCompositeDoSimd(lower::AbstractConverter &converter,
                                semantics::SemanticsContext &semaCtx,
                                lower::pft::Evaluation &eval, mlir::Location loc,
                                const ConstructQueue &queue,
-                               ConstructQueue::const_iterator item,
-                               DataSharingProcessor &dsp) {
+                               ConstructQueue::const_iterator item) {
   lower::StatementContext stmtCtx;
 
   assert(std::distance(item, queue.end()) == 2 && "Invalid leaf constructs");
@@ -2127,6 +2151,12 @@ static void genCompositeDoSimd(lower::AbstractConverter &converter,
   mlir::omp::SimdOperands simdClauseOps;
   genSimdClauses(converter, semaCtx, simdItem->clauses, loc, simdClauseOps);
 
+  // TODO: Support delayed privatization.
+  DataSharingProcessor dsp(converter, semaCtx, simdItem->clauses, eval,
+                           /*shouldCollectPreDeterminedSymbols=*/true,
+                           /*useDelayedPrivatization=*/false, &symTable);
+  dsp.processStep1();
+
   // Pass the innermost leaf construct's clauses because that's where COLLAPSE
   // is placed by construct decomposition.
   mlir::omp::LoopNestOperands loopNestClauseOps;
@@ -2157,11 +2187,13 @@ static void genCompositeDoSimd(lower::AbstractConverter &converter,
                 llvm::omp::Directive::OMPD_do_simd, dsp);
 }
 
-static void genCompositeTaskloopSimd(
-    lower::AbstractConverter &converter, lower::SymMap &symTable,
-    semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval,
-    mlir::Location loc, const ConstructQueue &queue,
-    ConstructQueue::const_iterator item, DataSharingProcessor &dsp) {
+static void genCompositeTaskloopSimd(lower::AbstractConverter &converter,
+                                     lower::SymMap &symTable,
+                                     semantics::SemanticsContext &semaCtx,
+                                     lower::pft::Evaluation &eval,
+                                     mlir::Location loc,
+                                     const ConstructQueue &queue,
+                                     ConstructQueue::const_iterator item) {
   assert(std::distance(item, queue.end()) == 2 && "Invalid leaf constructs");
   TODO(loc, "Composite TASKLOOP SIMD");
 }
@@ -2170,30 +2202,35 @@ static void genCompositeTaskloopSimd(
 // Dispatch
 //===----------------------------------------------------------------------===//
 
-static bool genOMPCompositeDispatch(
-    lower::AbstractConverter &converter, lower::SymMap &symTable,
-    semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval,
-    mlir::Location loc, const ConstructQueue &queue,
-    ConstructQueue::const_iterator item, DataSharingProcessor &dsp) {
+static bool genOMPCompositeDispatch(lower::AbstractConverter &converter,
+                                    lower::SymMap &symTable,
+                                    semantics::SemanticsContext &semaCtx,
+                                    lower::pft::Evaluation &eval,
+                                    mlir::Location loc,
+                                    const ConstructQueue &queue,
+                                    ConstructQueue::const_iterator item) {
   using llvm::omp::Directive;
   using lower::omp::matchLeafSequence;
 
+  // TODO: Privatization for composite constructs is currently only done based
+  // on the clauses for their last leaf construct, which may not always be
+  // correct. Consider per-leaf privatization of composite constructs once
+  // delayed privatization is supported by all participating ops.
   if (matchLeafSequence(item, queue, Directive::OMPD_distribute_parallel_do))
     genCompositeDistributeParallelDo(converter, symTable, semaCtx, eval, loc,
-                                     queue, item, dsp);
+                                     queue, item);
   else if (matchLeafSequence(item, queue,
                              Directive::OMPD_distribute_parallel_do_simd))
     genCompositeDistributeParallelDoSimd(converter, symTable, semaCtx, eval,
-                                         loc, queue, item, dsp);
+                                         loc, queue, item);
   else if (matchLeafSequence(item, queue, Directive::OMPD_distribute_simd))
     genCompositeDistributeSimd(converter, symTable, semaCtx, eval, loc, queue,
-                               item, dsp);
+                               item);
   else if (matchLeafSequence(item, queue, Directive::OMPD_do_simd))
-    genCompositeDoSimd(converter, symTable, semaCtx, eval, loc, queue, item,
-                       dsp);
+    genCompositeDoSimd(converter, symTable, semaCtx, eval, loc, queue, item);
   else if (matchLeafSequence(item, queue, Directive::OMPD_taskloop_simd))
     genCompositeTaskloopSimd(converter, symTable, semaCtx, eval, loc, queue,
-                             item, dsp);
+                             item);
   else
     return false;
 
@@ -2208,20 +2245,12 @@ static void genOMPDispatch(lower::AbstractConverter &converter,
                            ConstructQueue::const_iterator item) {
   assert(item != queue.end());
 
-  std::optional<DataSharingProcessor> loopDsp;
   bool loopLeaf = llvm::omp::getDirectiveAssociation(item->id) ==
                   llvm::omp::Association::Loop;
   if (loopLeaf) {
     symTable.pushScope();
-    // TODO: Use one DataSharingProcessor for each leaf of a composite
-    // construct.
-    loopDsp.emplace(converter, semaCtx, item->clauses, eval,
-                    /*shouldCollectPreDeterminedSymbols=*/true,
-                    /*useDelayedPrivatization=*/false, &symTable);
-    loopDsp->processStep1();
-
     if (genOMPCompositeDispatch(converter, symTable, semaCtx, eval, loc, queue,
-                                item, *loopDsp)) {
+                                item)) {
       symTable.popScope();
       return;
     }
@@ -2233,11 +2262,10 @@ static void genOMPDispatch(lower::AbstractConverter &converter,
     break;
   case llvm::omp::Directive::OMPD_distribute:
     genStandaloneDistribute(converter, symTable, semaCtx, eval, loc, queue,
-                            item, *loopDsp);
+                            item);
     break;
   case llvm::omp::Directive::OMPD_do:
-    genStandaloneDo(converter, symTable, semaCtx, eval, loc, queue, item,
-                    *loopDsp);
+    genStandaloneDo(converter, symTable, semaCtx, eval, loc, queue, item);
     break;
   case llvm::omp::Directive::OMPD_loop:
     TODO(loc, "Unhandled directive " + llvm::omp::getOpenMPDirectiveName(dir));
@@ -2266,8 +2294,7 @@ static void genOMPDispatch(lower::AbstractConverter &converter,
     // in genBodyOfOp
     break;
   case llvm::omp::Directive::OMPD_simd:
-    genStandaloneSimd(converter, symTable, semaCtx, eval, loc, queue, item,
-                      *loopDsp);
+    genStandaloneSimd(converter, symTable, semaCtx, eval, loc, queue, item);
     break;
   case llvm::omp::Directive::OMPD_single:
     genSingleOp(converter, symTable, semaCtx, eval, loc, queue, item);
@@ -2297,8 +2324,7 @@ static void genOMPDispatch(lower::AbstractConverter &converter,
     genTaskgroupOp(converter, symTable, semaCtx, eval, loc, queue, item);
     break;
   case llvm::omp::Directive::OMPD_taskloop:
-    genStandaloneTaskloop(converter, symTable, semaCtx, eval, loc, queue, item,
-                          *loopDsp);
+    genStandaloneTaskloop(converter, symTable, semaCtx, eval, loc, queue, item);
     break;
   case llvm::omp::Directive::OMPD_taskwait:
     genTaskwaitOp(converter, symTable, semaCtx, eval, loc, queue, item);
diff --git a/flang/test/Lower/OpenMP/parallel-reduction3.f90 b/flang/test/Lower/OpenMP/parallel-reduction3.f90
index 441dff34553d4f..591f41cb946602 100644
--- a/flang/test/Lower/OpenMP/parallel-reduction3.f90
+++ b/flang/test/Lower/OpenMP/parallel-reduction3.f90
@@ -69,19 +69,19 @@
 ! CHECK:           %[[VAL_13:.*]] = arith.constant 0 : i32
 ! CHECK:           hlfir.assign %[[VAL_13]] to %[[VAL_12]]#0 : i32, !fir.box<!fir.array<?xi32>>
 ! CHECK:           omp.parallel {
-! CHECK:             %[[VAL_14:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}}
-! CHECK:             %[[VAL_15:.*]]:2 = hlfir.declare %[[VAL_14]] {uniq_name = "_QFsEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK:             %[[VAL_16:.*]] = fir.alloca !fir.box<!fir.array<?xi32>>
-! CHECK:             fir.store %[[VAL_12]]#0 to %[[VAL_16]] : !fir.ref<!fir.box<!fir.array<?xi32>>>
+! CHECK:             %[[VAL_14:.*]] = fir.alloca !fir.box<!fir.array<?xi32>>
+! CHECK:             fir.store %[[VAL_12]]#0 to %[[VAL_14]] : !fir.ref<!fir.box<!fir.array<?xi32>>>
+! CHECK:             %[[VAL_15:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}}
+! CHECK:             %[[VAL_16:.*]]:2 = hlfir.declare %[[VAL_15]] {uniq_name = "_QFsEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK:             %[[VAL_17:.*]] = arith.constant 1 : i32
 ! CHECK:             %[[VAL_18:.*]] = arith.constant 100 : i32
 ! CHECK:             %[[VAL_19:.*]] = arith.constant 1 : i32
-! CHECK:             omp.wsloop reduction(byref @add_reduction_byref_box_Uxi32 %[[VAL_16]] -> %[[VAL_20:.*]] : !fir.ref<!fir.box<!fir.array<?xi32>>>) {
+! CHECK:             omp.wsloop reduction(byref @add_reduction_byref_box_Uxi32 %[[VAL_14]] -> %[[VAL_20:.*]] : !fir.ref<!fir.box<!fir.array<?xi32>>>) {
 ! CHECK-NEXT:          omp.loop_nest (%[[VAL_21:.*]]) : i32 = (%[[VAL_17]]) to (%[[VAL_18]]) inclusive step (%[[VAL_19]]) {
 ! CHECK:                 %[[VAL_22:.*]]:2 = hlfir.declare %[[VAL_20]] {uniq_name = "_QFsEc"} : (!fir.ref<!fir.box<!fir.array<?xi32>>>) -> (!fir.ref<!fir.box<!fir.array<?xi32>>>, !fir.ref<!fir.box<!fir.array<?xi32>>>)
-! CHECK:                 fir.store %[[VAL_21]] to %[[VAL_15]]#1 : !fir.ref<i32>
+! CHECK:                 fir.store %[[VAL_21]] to %[[VAL_16]]#1 : !fir.ref<i32>
 ! CHECK:                 %[[VAL_23:.*]] = fir.load %[[VAL_22]]#0 : !fir.ref<!fir.box<!fir.array<?xi32>>>
-! CHECK:                 %[[VAL_24:.*]] = fir.load %[[VAL_15]]#0 : !fir.ref<i32>
+! CHECK:                 %[[VAL_24:.*]] = fir.load %[[VAL_16]]#0 : !fir.ref<i32>
 ! CHECK:                 %[[VAL_25:.*]] = arith.constant 0 : index
 ! CHECK:                 %[[VAL_26:.*]]:3 = fir.box_dims %[[VAL_23]], %[[VAL_25]] : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
 ! CHECK:                 %[[VAL_27:.*]] = fir.shape %[[VAL_26]]#1 : (index) -> !fir.shape<1>
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-array-assumed-shape.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-array-assumed-shape.f90
index c984ab61bedb3b..d881ff8c1a026a 100644
--- a/flang/test/Lower/OpenMP/wsloop-reduction-array-assumed-shape.f90
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-array-assumed-shape.f90
@@ -79,18 +79,18 @@ subroutine reduce(r)
 ! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFFreduceEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = {{.*}}, uniq_name = "_QFFreduceEr"} : (!fir.box<!fir.array<?xf64>>, !fir.dscope) -> (!fir.box<!fir.array<?xf64>>, !fir.box<!fir.array<?xf64>>)
 ! CHECK:           omp.parallel {
-! CHECK:             %[[VAL_4:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}}
-! CHECK:             %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_4]] {uniq_name = "_QFFreduceEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK:             %[[VAL_6:.*]] = fir.alloca !fir.box<!fir.array<?xf64>>
-! CHECK:             fir.store %[[VAL_3]]#1 to %[[VAL_6]] : !fir.ref<!fir.box<!fir.array<?xf64>>>
+! CHECK:             %[[VAL_4:.*]] = fir.alloca !fir.box<!fir.array<?xf64>>
+! CHECK:             fir.store %[[VAL_3]]#1 to %[[VAL_4]] : !fir.ref<!fir.box<!fir.array<?xf64>>>
+! CHECK:             %[[VAL_5:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}}
+! CHECK:             %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]] {uniq_name = "_QFFreduceEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK:             %[[VAL_7:.*]] = arith.constant 0 : i32
 ! CHECK:             %[[VAL_8:.*]] = arith.constant 10 : i32
 ! CHECK:             %[[VAL_9:.*]] = arith.constant 1 : i32
-! CHECK:             omp.wsloop reduction(byref @add_reduction_byref_box_Uxf64 %[[VAL_6]] -> %[[VAL_10:.*]] : !fir.ref<!fir.box<!fir.array<?xf64>>>) {
+! CHECK:             omp.wsloop reduction(byref @add_reduction_byref_box_Uxf64 %[[VAL_4]] -> %[[VAL_10:.*]] : !fir.ref<!fir.box<!fir.array<?xf64>>>) {
 ! CHECK-NEXT:          omp.loop_nest (%[[VAL_11:.*]]) : i32 = (%[[VAL_7]]) to (%[[VAL_8]]) inclusive step (%[[VAL_9]]) {
 ! CHECK:                 %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_10]] {fortran_attrs = {{.*}}, uniq_name = "_QFFreduceEr"} : (!fir.ref<!fir.box<!fir.array<?xf64>>>) -> (!fir.ref<!fir.box<!fir.array<?xf64>>>, !fir.ref<!fir.box<!fir.array<?xf64>>>)
-! CHECK:                 fir.store %[[VAL_11]] to %[[VAL_5]]#1 : !fir.ref<i32>
-! CHECK:                 %[[VAL_13:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref<i32>
+! CHECK:                 fir.store %[[VAL_11]] to %[[VAL_6]]#1 : !fir.ref<i32>
+! CHECK:                 %[[VAL_13:.*]] = fir.load %[[VAL_6]]#0 : !fir.ref<i32>
 ! CHECK:                 %[[VAL_14:.*]] = fir.convert %[[VAL_13]] : (i32) -> f64
 ! CHECK:                 %[[VAL_15:.*]] = fir.load %[[VAL_12]]#0 : !fir.ref<!fir.box<!fir.array<?xf64>>>
 ! CHECK:                 %[[VAL_16:.*]] = arith.constant 1 : index
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-array.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-array.f90
index 43e4c86b6bade2..afdd486c7be2a4 100644
--- a/flang/test/Lower/OpenMP/wsloop-reduction-array.f90
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-array.f90
@@ -73,24 +73,24 @@ program reduce
 ! CHECK:           %[[VAL_4:.*]] = fir.shape %[[VAL_3]] : (index) -> !fir.shape<1>
 ! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_2]](%[[VAL_4]]) {uniq_name = "_QFEr"} : (!fir.ref<!fir.array<2xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<2xi32>>, !fir.ref<!fir.array<2xi32>>)
 ! CHECK:           omp.parallel {
-! CHECK:             %[[VAL_6:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}}
-! CHECK:             %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]] {uniq_name = "_QFEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK:             %[[VAL_8:.*]] = fir.embox %[[VAL_5]]#0(%[[VAL_4]]) : (!fir.ref<!fir.array<2xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<2xi32>>
-! CHECK:             %[[VAL_9:.*]] = fir.alloca !fir.box<!fir.array<2xi32>>
-! CHECK:             fir.store %[[VAL_8]] to %[[VAL_9]] : !fir.ref<!fir.box<!fir.array<2xi32>>>
+! CHECK:             %[[VAL_6:.*]] = fir.embox %[[VAL_5]]#0(%[[VAL_4]]) : (!fir.ref<!fir.array<2xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<2xi32>>
+! CHECK:             %[[VAL_7:.*]] = fir.alloca !fir.box<!fir.array<2xi32>>
+! CHECK:             fir.store %[[VAL_6]] to %[[VAL_7]] : !fir.ref<!fir.box<!fir.array<2xi32>>>
+! CHECK:             %[[VAL_8:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}}
+! CHECK:             %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_8]] {uniq_name = "_QFEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK:             %[[VAL_10:.*]] = arith.constant 0 : i32
 ! CHECK:             %[[VAL_11:.*]] = arith.constant 10 : i32
 ! CHECK:             %[[VAL_12:.*]] = arith.constant 1 : i32
-! CHECK:             omp.wsloop reduction(byref @add_reduction_byref_box_2xi32 %[[VAL_9]] -> %[[VAL_13:.*]] : !fir.ref<!fir.box<!fir.array<2xi32>>>) {
+! CHECK:             omp.wsloop reduction(byref @add_reduction_byref_box_2xi32 %[[VAL_7]] -> %[[VAL_13:.*]] : !fir.ref<!fir.box<!fir.array<2xi32>>>) {
 ! CHECK-NEXT:          omp.loop_nest (%[[VAL_14:.*]]) : i32 = (%[[VAL_10]]) to (%[[VAL_11]]) inclusive step (%[[VAL_12]]) {
 ! CHECK:                 %[[VAL_15:.*]]:2 = hlfir.declare %[[VAL_13]] {uniq_name = "_QFEr"} : (!fir.ref<!fir.box<!fir.array<2xi32>>>) -> (!fir.ref<!fir.box<!fir.array<2xi32>>>, !fir.ref<!fir.box<!fir.array<2xi32>>>)
-! CHECK:                 fir.store %[[VAL_14]] to %[[VAL_7]]#1 : !fir.ref<i32>
-! CHECK:                 %[[VAL_16:.*]] = fir.load %[[VAL_7]]#0 : !fir.ref<i32>
+! CHECK:                 fir.store %[[VAL_14]] to %[[VAL_9]]#1 : !fir.ref<i32>
+! CHECK:                 %[[VAL_16:.*]] = fir.load %[[VAL_9]]#0 : !fir.ref<i32>
 ! CHECK:                 %[[VAL_17:.*]] = fir.load %[[VAL_15]]#0 : !fir.ref<!fir.box<!fir.array<2xi32>>>
 ! CHECK:                 %[[VAL_18:.*]] = arith.constant 1 : index
 ! CHECK:                 %[[VAL_19:.*]] = hlfir.designate %[[VAL_17]] (%[[VAL_18]])  : (!fir.box<!fir.array<2xi32>>, index) -> !fir.ref<i32>
 ! CHECK:                 hlfir.assign %[[VAL_16]] to %[[VAL_19]] : i32, !fir.ref<i32>
-! CHECK:                 %[[VAL_20:.*]] = fir.load %[[VAL_7]]#0 : !fir.ref<i32>
+! CHECK:                 %[[VAL_20:.*]] = fir.load %[[VAL_9]]#0 : !fir.ref<i32>
 ! CHECK:                 %[[VAL_21:.*]] = arith.constant 0 : i32
 ! CHECK:                 %[[VAL_22:.*]] = arith.subi %[[VAL_21]], %[[VAL_20]] : i32
 ! CHECK:                 %[[VAL_23:.*]] = fir.load %[[VAL_15]]#0 : !fir.ref<!fir.box<!fir.array<2xi32>>>
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-array2.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-array2.f90
index be5273ea36c99f..ee77332e9b412a 100644
--- a/flang/test/Lower/OpenMP/wsloop-reduction-array2.f90
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-array2.f90
@@ -73,23 +73,23 @@ program reduce
 ! CHECK:           %[[VAL_4:.*]] = fir.shape %[[VAL_3]] : (index) -> !fir.shape<1>
 ! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_2]](%[[VAL_4]]) {uniq_name = "_QFEr"} : (!fir.ref<!fir.array<2xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<2xi32>>, !fir.ref<!fir.array<2xi32>>)
 ! CHECK:           omp.parallel {
-! CHECK:             %[[VAL_6:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}}
-! CHECK:             %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]] {uniq_name = "_QFEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK:             %[[VAL_8:.*]] = fir.embox %[[VAL_5]]#0(%[[VAL_4]]) : (!fir.ref<!fir.array<2xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<2xi32>>
-! CHECK:             %[[VAL_9:.*]] = fir.alloca !fir.box<!fir.array<2xi32>>
-! CHECK:             fir.store %[[VAL_8]] to %[[VAL_9]] : !fir.ref<!fir.box<!fir.array<2xi32>>>
+! CHECK:             %[[VAL_6:.*]] = fir.embox %[[VAL_5]]#0(%[[VAL_4]]) : (!fir.ref<!fir.array<2xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<2xi32>>
+! CHECK:             %[[VAL_7:.*]] = fir.alloca !fir.box<!fir.array<2xi32>>
+! CHECK:             fir.store %[[VAL_6]] to %[[VAL_7]] : !fir.ref<!fir.box<!fir.array<2xi32>>>
+! CHECK:             %[[VAL_8:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}}
+! CHECK:             %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_8]] {uniq_name = "_QFEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK:             %[[VAL_10:.*]] = arith.constant 0 : i32
 ! CHECK:             %[[VAL_11:.*]] = arith.constant 10 : i32
 ! CHECK:             %[[VAL_12:.*]] = arith.constant 1 : i32
-! CHECK:             omp.wsloop reduction(byref @add_reduction_byref_box_2xi32 %[[VAL_9]] -> %[[VAL_13:.*]] : !fir.ref<!fir.box<!fir.array<2xi32>>>) {
+! CHECK:             omp.wsloop reduction(byref @add_reduction_byref_box_2xi32 %[[VAL_7]] -> %[[VAL_13:.*]] : !fir.ref<!fir.box<!fir.array<2xi32>>>) {
 ! CHECK-NEXT:          omp.loop_nest (%[[VAL_14:.*]]) : i32 = (%[[VAL_10]]) to (%[[VAL_11]]) inclusive step (%[[VAL_12]]) {
 ! CHECK:                 %[[VAL_15:.*]]:2 = hlfir.declare %[[VAL_13]] {uniq_name = "_QFEr"} : (!fir.ref<!fir.box<!fir.array<2xi32>>>) -> (!fir.ref<!fir.box<!fir.array<2xi32>>>, !fir.ref<!fir.box<!fir.array<2xi32>>>)
-! CHECK:                 fir.store %[[VAL_14]] to %[[VAL_7]]#1 : !fir.ref<i32>
+! CHECK:                 fir.store %[[VAL_14]] to %[[VAL_9]]#1 : !fir.ref<i32>
 ! CHECK:                 %[[VAL_16:.*]] = fir.load %[[VAL_15]]#0 : !fir.ref<!fir.box<!fir.array<2xi32>>>
 ! CHECK:                 %[[VAL_17:.*]] = arith.constant 1 : index
 ! CHECK:                 %[[VAL_18:.*]] = hlfir.designate %[[VAL_16]] (%[[VAL_17]])  : (!fir.box<!fir.array<2xi32>>, index) -> !fir.ref<i32>
 ! CHECK:                 %[[VAL_19:.*]] = fir.load %[[VAL_18]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_20:.*]] = fir.load %[[VAL_7]]#0 : !fir.ref<i32>
+! CHECK:                 %[[VAL_20:.*]] = fir.load %[[VAL_9]]#0 : !fir.ref<i32>
 ! CHECK:                 %[[VAL_21:.*]] = arith.addi %[[VAL_19]], %[[VAL_20]] : i32
 ! CHECK:                 %[[VAL_22:.*]] = fir.load %[[VAL_15]]#0 : !fir.ref<!fir.box<!fir.array<2xi32>>>
 ! CHECK:                 %[[VAL_23:.*]] = arith.constant 1 : index
@@ -99,7 +99,7 @@ program reduce
 ! CHECK:                 %[[VAL_26:.*]] = arith.constant 2 : index
 ! CHECK:                 %[[VAL_27:.*]] = hlfir.designate %[[VAL_25]] (%[[VAL_26]])  : (!fir.box<!fir.array<2xi32>>, index) -> !fir.ref<i32>
 ! CHECK:                 %[[VAL_28:.*]] = fir.load %[[VAL_27]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_29:.*]] = fir.load %[[VAL_7]]#0 : !fir.ref<i32>
+! CHECK:                 %[[VAL_29:.*]] = fir.load %[[VAL_9]]#0 : !fir.ref<i32>
 ! CHECK:                 %[[VAL_30:.*]] = arith.subi %[[VAL_28]], %[[VAL_29]] : i32
 ! CHECK:                 %[[VAL_31:.*]] = fir.load %[[VAL_15]]#0 : !fir.ref<!fir.box<!fir.array<2xi32>>>
 ! CHECK:                 %[[VAL_32:.*]] = arith.constant 2 : index
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-multiple-clauses.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-multiple-clauses.f90
index db4b4d33da7579..3e93e915bcd89c 100644
--- a/flang/test/Lower/OpenMP/wsloop-reduction-multiple-clauses.f90
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-multiple-clauses.f90
@@ -109,23 +109,23 @@ program main
 ! CHECK:           %[[VAL_10:.*]] = arith.constant 0.000000e+00 : f64
 ! CHECK:           hlfir.assign %[[VAL_10]] to %[[VAL_4]]#0 : f64, !fir.ref<!fir.array<3x3xf64>>
 ! CHECK:           omp.parallel {
-! CHECK:             %[[VAL_11:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}}
-! CHECK:             %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_11]] {uniq_name = "_QFEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK:             %[[VAL_13:.*]] = fir.embox %[[VAL_4]]#0(%[[VAL_3]]) : (!fir.ref<!fir.array<3x3xf64>>, !fir.shape<2>) -> !fir.box<!fir.array<3x3xf64>>
-! CHECK:             %[[VAL_14:.*]] = fir.alloca !fir.box<!fir.array<3x3xf64>>
-! CHECK:             fir.store %[[VAL_13]] to %[[VAL_14]] : !fir.ref<!fir.box<!fir.array<3x3xf64>>>
+! CHECK:             %[[VAL_11:.*]] = fir.embox %[[VAL_4]]#0(%[[VAL_3]]) : (!fir.ref<!fir.array<3x3xf64>>, !fir.shape<2>) -> !fir.box<!fir.array<3x3xf64>>
+! CHECK:             %[[VAL_12:.*]] = fir.alloca !fir.box<!fir.array<3x3xf64>>
+! CHECK:             fir.store %[[VAL_11]] to %[[VAL_12]] : !fir.ref<!fir.box<!fir.array<3x3xf64>>>
+! CHECK:             %[[VAL_13:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}}
+! CHECK:             %[[VAL_14:.*]]:2 = hlfir.declare %[[VAL_13]] {uniq_name = "_QFEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK:             %[[VAL_15:.*]] = arith.constant 1 : i32
 ! CHECK:             %[[VAL_16:.*]] = arith.constant 10 : i32
 ! CHECK:             %[[VAL_17:.*]] = arith.constant 1 : i32
-! CHECK:             omp.wsloop reduction(@add_reduction_f64 %[[VAL_8]]#0 -> %[[VAL_18:.*]] : !fir.ref<f64>, byref @add_reduction_byref_box_3x3xf64 %[[VAL_14]] -> %[[VAL_19:.*]] : !fir.ref<!fir.box<!fir.array<3x3xf64>>>) {
+! CHECK:             omp.wsloop reduction(@add_reduction_f64 %[[VAL_8]]#0 -> %[[VAL_18:.*]] : !fir.ref<f64>, byref @add_reduction_byref_box_3x3xf64 %[[VAL_12]] -> %[[VAL_19:.*]] : !fir.ref<!fir.box<!fir.array<3x3xf64>>>) {
 ! CHECK:               omp.loop_nest (%[[VAL_20:.*]]) : i32 = (%[[VAL_15]]) to (%[[VAL_16]]) inclusive step (%[[VAL_17]]) {
 ! CHECK:                 %[[VAL_21:.*]]:2 = hlfir.declare %[[VAL_18]] {uniq_name = "_QFEscalar"} : (!fir.ref<f64>) -> (!fir.ref<f64>, !fir.ref<f64>)
 ! CHECK:                 %[[VAL_22:.*]]:2 = hlfir.declare %[[VAL_19]] {uniq_name = "_QFEarray"} : (!fir.ref<!fir.box<!fir.array<3x3xf64>>>) -> (!fir.ref<!fir.box<!fir.array<3x3xf64>>>, !fir.ref<!fir.box<!fir.array<3x3xf64>>>)
-! CHECK:                 fir.store %[[VAL_20]] to %[[VAL_12]]#1 : !fir.ref<i32>
-! CHECK:                 %[[VAL_23:.*]] = fir.load %[[VAL_12]]#0 : !fir.ref<i32>
+! CHECK:                 fir.store %[[VAL_20]] to %[[VAL_14]]#1 : !fir.ref<i32>
+! CHECK:                 %[[VAL_23:.*]] = fir.load %[[VAL_14]]#0 : !fir.ref<i32>
 ! CHECK:                 %[[VAL_24:.*]] = fir.convert %[[VAL_23]] : (i32) -> f64
 ! CHECK:                 hlfir.assign %[[VAL_24]] to %[[VAL_21]]#0 : f64, !fir.ref<f64>
-! CHECK:                 %[[VAL_25:.*]] = fir.load %[[VAL_12]]#0 : !fir.ref<i32>
+! CHECK:                 %[[VAL_25:.*]] = fir.load %[[VAL_14]]#0 : !fir.ref<i32>
 ! CHECK:                 %[[VAL_26:.*]] = arith.constant 1 : i32
 ! CHECK:                 %[[VAL_27:.*]] = arith.addi %[[VAL_25]], %[[VAL_26]] : i32
 ! CHECK:                 %[[VAL_28:.*]] = fir.convert %[[VAL_27]] : (i32) -> f64
@@ -134,7 +134,7 @@ program main
 ! CHECK:                 %[[VAL_31:.*]] = arith.constant 1 : index
 ! CHECK:                 %[[VAL_32:.*]] = hlfir.designate %[[VAL_29]] (%[[VAL_30]], %[[VAL_31]])  : (!fir.box<!fir.array<3x3xf64>>, index, index) -> !fir.ref<f64>
 ! CHECK:                 hlfir.assign %[[VAL_28]] to %[[VAL_32]] : f64, !fir.ref<f64>
-! CHECK:                 %[[VAL_33:.*]] = fir.load %[[VAL_12]]#0 : !fir.ref<i32>
+! CHECK:                 %[[VAL_33:.*]] = fir.load %[[VAL_14]]#0 : !fir.ref<i32>
 ! CHECK:                 %[[VAL_34:.*]] = arith.constant 2 : i32
 ! CHECK:                 %[[VAL_35:.*]] = arith.addi %[[VAL_33]], %[[VAL_34]] : i32
 ! CHECK:                 %[[VAL_36:.*]] = fir.convert %[[VAL_35]] : (i32) -> f64
@@ -143,7 +143,7 @@ program main
 ! CHECK:                 %[[VAL_39:.*]] = arith.constant 2 : index
 ! CHECK:                 %[[VAL_40:.*]] = hlfir.designate %[[VAL_37]] (%[[VAL_38]], %[[VAL_39]])  : (!fir.box<!fir.array<3x3xf64>>, index, index) -> !fir.ref<f64>
 ! CHECK:                 hlfir.assign %[[VAL_36]] to %[[VAL_40]] : f64, !fir.ref<f64>
-! CHECK:                 %[[VAL_41:.*]] = fir.load %[[VAL_12]]#0 : !fir.ref<i32>
+! CHECK:                 %[[VAL_41:.*]] = fir.load %[[VAL_14]]#0 : !fir.ref<i32>
 ! CHECK:                 %[[VAL_42:.*]] = arith.constant 3 : i32
 ! CHECK:                 %[[VAL_43:.*]] = arith.addi %[[VAL_41]], %[[VAL_42]] : i32
 ! CHECK:                 %[[VAL_44:.*]] = fir.convert %[[VAL_43]] : (i32) -> f64
