[flang-commits] [flang] da66e6d - [flang][openmp] Fix incorrect reduction for array section in OpenMP DO SIMD (#192394)

Thu Apr 30 08:36:33 PDT 2026

Author: SunilKuravinakop
Date: 2026-04-30T21:06:28+05:30
New Revision: da66e6d0d5a9e26f689ba797f28345996a032e44

URL: https://github.com/llvm/llvm-project/commit/da66e6d0d5a9e26f689ba797f28345996a032e44
DIFF: https://github.com/llvm/llvm-project/commit/da66e6d0d5a9e26f689ba797f28345996a032e44.diff

LOG: [flang][openmp] Fix incorrect reduction for array section in OpenMP DO SIMD (#192394)

for "!omp do parallel simd reduction" ensuring that reduction for array
section is done properly by :
1) per-SIMD-lane reduction results are combined into the wsloop's
   thread-local copies.
2) wsloop thread-local copies are combined across threads by the wsloop
   reduction.
   
Issue is in [192077](https://github.com/llvm/llvm-project/issues/192077)

---------

Co-authored-by: Sunil Kuravinakop <kuravina at pe31.hpc.amslabs.hpecorp.net>

Added: 
    

Modified: 
    flang/include/flang/Lower/Support/ReductionProcessor.h
    flang/lib/Lower/OpenMP/ClauseProcessor.cpp
    flang/lib/Lower/OpenMP/ClauseProcessor.h
    flang/lib/Lower/OpenMP/OpenMP.cpp
    flang/lib/Lower/Support/ReductionProcessor.cpp
    flang/test/Lower/OpenMP/wsloop-simd.f90

Removed: 
    


################################################################################
diff  --git a/flang/include/flang/Lower/Support/ReductionProcessor.h b/flang/include/flang/Lower/Support/ReductionProcessor.h
index bbc4879bbe352..0b4a692827a79 100644

--- a/flang/include/flang/Lower/Support/ReductionProcessor.h
+++ b/flang/include/flang/Lower/Support/ReductionProcessor.h
@@ -144,6 +144,12 @@ class ReductionProcessor {
 
   /// Creates a reduction declaration and associates it with an OpenMP block
   /// directive.
+  /// \param [in,out] reductionVarCache - optional cache mapping reduction
+  ///   symbols to their SSA values. When provided, array/box reduction
+  ///   variables that have already been allocated will be reused instead of
+  ///   creating new allocas. This ensures that nested composite wrappers
+  ///   (e.g. wsloop and simd in DO SIMD) share the same SSA values, allowing
+  ///   the genLoopVars() mapper to correctly remap inner wrapper operands.
   template <typename OpType, typename RedOperatorListTy>
   static bool processReductionArguments(
       mlir::Location currentLocation, lower::AbstractConverter &converter,
@@ -151,7 +157,9 @@ class ReductionProcessor {
       llvm::SmallVectorImpl<mlir::Value> &reductionVars,
       llvm::SmallVectorImpl<bool> &reduceVarByRef,
       llvm::SmallVectorImpl<mlir::Attribute> &reductionDeclSymbols,
-      const llvm::SmallVectorImpl<const semantics::Symbol *> &reductionSymbols);
+      const llvm::SmallVectorImpl<const semantics::Symbol *> &reductionSymbols,
+      llvm::DenseMap<const semantics::Symbol *, mlir::Value>
+          *reductionVarCache = nullptr);
 };
 
 template <typename FloatOp, typename IntegerOp>

diff  --git a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
index 796dda34e3821..1c39e90a922cf 100644
--- a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
+++ b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
@@ -2023,7 +2023,9 @@ bool ClauseProcessor::processNontemporal(
 
 bool ClauseProcessor::processReduction(
     mlir::Location currentLocation, mlir::omp::ReductionClauseOps &result,
-    llvm::SmallVectorImpl<const semantics::Symbol *> &outReductionSyms) const {
+    llvm::SmallVectorImpl<const semantics::Symbol *> &outReductionSyms,
+    llvm::DenseMap<const semantics::Symbol *, mlir::Value> *reductionVarCache)
+    const {
   return findRepeatableClause<omp::clause::Reduction>(
       [&](const omp::clause::Reduction &clause, const parser::CharBlock &) {
         llvm::SmallVector<mlir::Value> reductionVars;
@@ -2047,7 +2049,7 @@ bool ClauseProcessor::processReduction(
                 currentLocation, converter,
                 std::get<typename omp::clause::ReductionOperatorList>(clause.t),
                 reductionVars, reduceVarByRef, reductionDeclSymbols,
-                reductionSyms))
+                reductionSyms, reductionVarCache))
           TODO(currentLocation, "Lowering unrecognised reduction type");
         // Copy local lists into the output.
         llvm::copy(reductionVars, std::back_inserter(result.reductionVars));

diff  --git a/flang/lib/Lower/OpenMP/ClauseProcessor.h b/flang/lib/Lower/OpenMP/ClauseProcessor.h
index 29b5c29b8e33a..acf1068efb987 100644
--- a/flang/lib/Lower/OpenMP/ClauseProcessor.h
+++ b/flang/lib/Lower/OpenMP/ClauseProcessor.h
@@ -166,7 +166,9 @@ class ClauseProcessor {
   bool processNontemporal(mlir::omp::NontemporalClauseOps &result) const;
   bool processReduction(
       mlir::Location currentLocation, mlir::omp::ReductionClauseOps &result,
-      llvm::SmallVectorImpl<const semantics::Symbol *> &reductionSyms) const;
+      llvm::SmallVectorImpl<const semantics::Symbol *> &reductionSyms,
+      llvm::DenseMap<const semantics::Symbol *, mlir::Value>
+          *reductionVarCache = nullptr) const;
   bool processTaskReduction(
       mlir::Location currentLocation, mlir::omp::TaskReductionClauseOps &result,
       llvm::SmallVectorImpl<const semantics::Symbol *> &outReductionSyms) const;

diff  --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp
index 6859f5b291342..88d28cf94b045 100644
--- a/flang/lib/Lower/OpenMP/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP/OpenMP.cpp
@@ -1667,13 +1667,15 @@ static void genSimdClauses(
     lower::AbstractConverter &converter, semantics::SemanticsContext &semaCtx,
     const List<Clause> &clauses, mlir::Location loc,
     mlir::omp::SimdOperands &clauseOps,
-    llvm::SmallVectorImpl<const semantics::Symbol *> &reductionSyms) {
+    llvm::SmallVectorImpl<const semantics::Symbol *> &reductionSyms,
+    llvm::DenseMap<const semantics::Symbol *, mlir::Value> *reductionVarCache =
+        nullptr) {
   ClauseProcessor cp(converter, semaCtx, clauses);
   cp.processAligned(clauseOps);
   cp.processIf(llvm::omp::Directive::OMPD_simd, clauseOps);
   cp.processNontemporal(clauseOps);
   cp.processOrder(clauseOps);
-  cp.processReduction(loc, clauseOps, reductionSyms);
+  cp.processReduction(loc, clauseOps, reductionSyms, reductionVarCache);
   cp.processSafelen(clauseOps);
   cp.processSimdlen(clauseOps);
   cp.processLinear(clauseOps);
@@ -1943,13 +1945,15 @@ static void genWsloopClauses(
     lower::AbstractConverter &converter, semantics::SemanticsContext &semaCtx,
     lower::StatementContext &stmtCtx, const List<Clause> &clauses,
     mlir::Location loc, mlir::omp::WsloopOperands &clauseOps,
-    llvm::SmallVectorImpl<const semantics::Symbol *> &reductionSyms) {
+    llvm::SmallVectorImpl<const semantics::Symbol *> &reductionSyms,
+    llvm::DenseMap<const semantics::Symbol *, mlir::Value> *reductionVarCache =
+        nullptr) {
   ClauseProcessor cp(converter, semaCtx, clauses);
   cp.processAllocate(clauseOps);
   cp.processNowait(clauseOps);
   cp.processOrder(clauseOps);
   cp.processOrdered(clauseOps);
-  cp.processReduction(loc, clauseOps, reductionSyms);
+  cp.processReduction(loc, clauseOps, reductionSyms, reductionVarCache);
   cp.processSchedule(stmtCtx, clauseOps);
   cp.processLinear(clauseOps);
 }
@@ -3505,6 +3509,10 @@ static mlir::omp::DistributeOp genCompositeDistributeParallelDoSimd(
                 /*isComposite=*/true);
 
   // Clause processing.
+  // Use a shared cache so that both wsloop and simd produce the same SSA
+  // values for array/box reduction variables. See genCompositeDoSimd.
+  llvm::DenseMap<const semantics::Symbol *, mlir::Value> reductionVarCache;
+
   mlir::omp::DistributeOperands distributeClauseOps;
   genDistributeClauses(converter, semaCtx, stmtCtx, distributeItem->clauses,
                        loc, distributeClauseOps);
@@ -3512,12 +3520,12 @@ static mlir::omp::DistributeOp genCompositeDistributeParallelDoSimd(
   mlir::omp::WsloopOperands wsloopClauseOps;
   llvm::SmallVector<const semantics::Symbol *> wsloopReductionSyms;
   genWsloopClauses(converter, semaCtx, stmtCtx, doItem->clauses, loc,
-                   wsloopClauseOps, wsloopReductionSyms);
+                   wsloopClauseOps, wsloopReductionSyms, &reductionVarCache);
 
   mlir::omp::SimdOperands simdClauseOps;
   llvm::SmallVector<const semantics::Symbol *> simdReductionSyms;
   genSimdClauses(converter, semaCtx, simdItem->clauses, loc, simdClauseOps,
-                 simdReductionSyms);
+                 simdReductionSyms, &reductionVarCache);
 
   DataSharingProcessor simdItemDSP(converter, semaCtx, simdItem->clauses, eval,
                                    /*shouldCollectPreDeterminedSymbols=*/true,
@@ -3638,15 +3646,21 @@ static mlir::omp::WsloopOp genCompositeDoSimd(
   ConstructQueue::const_iterator simdItem = std::next(doItem);
 
   // Clause processing.
+  // Use a shared cache so that both wsloop and simd produce the same SSA
+  // values for array/box reduction variables, enabling genLoopVars()'s
+  // IRMapping to correctly chain the inner wrapper's operands to the outer
+  // wrapper's block arguments.
+  llvm::DenseMap<const semantics::Symbol *, mlir::Value> reductionVarCache;
+
   mlir::omp::WsloopOperands wsloopClauseOps;
   llvm::SmallVector<const semantics::Symbol *> wsloopReductionSyms;
   genWsloopClauses(converter, semaCtx, stmtCtx, doItem->clauses, loc,
-                   wsloopClauseOps, wsloopReductionSyms);
+                   wsloopClauseOps, wsloopReductionSyms, &reductionVarCache);
 
   mlir::omp::SimdOperands simdClauseOps;
   llvm::SmallVector<const semantics::Symbol *> simdReductionSyms;
   genSimdClauses(converter, semaCtx, simdItem->clauses, loc, simdClauseOps,
-                 simdReductionSyms);
+                 simdReductionSyms, &reductionVarCache);
 
   DataSharingProcessor wsloopItemDSP(
       converter, semaCtx, doItem->clauses, eval,

diff  --git a/flang/lib/Lower/Support/ReductionProcessor.cpp b/flang/lib/Lower/Support/ReductionProcessor.cpp
index 2aa040e4b0ca8..d5387f7a59118 100644
--- a/flang/lib/Lower/Support/ReductionProcessor.cpp
+++ b/flang/lib/Lower/Support/ReductionProcessor.cpp
@@ -46,7 +46,8 @@ template bool ReductionProcessor::processReductionArguments<
     llvm::SmallVectorImpl<mlir::Value> &reductionVars,
     llvm::SmallVectorImpl<bool> &reduceVarByRef,
     llvm::SmallVectorImpl<mlir::Attribute> &reductionDeclSymbols,
-    const llvm::SmallVectorImpl<const semantics::Symbol *> &reductionSymbols);
+    const llvm::SmallVectorImpl<const semantics::Symbol *> &reductionSymbols,
+    llvm::DenseMap<const semantics::Symbol *, mlir::Value> *reductionVarCache);
 
 template bool ReductionProcessor::processReductionArguments<
     fir::DeclareReductionOp, llvm::SmallVector<fir::ReduceOperationEnum>>(
@@ -55,7 +56,8 @@ template bool ReductionProcessor::processReductionArguments<
     llvm::SmallVectorImpl<mlir::Value> &reductionVars,
     llvm::SmallVectorImpl<bool> &reduceVarByRef,
     llvm::SmallVectorImpl<mlir::Attribute> &reductionDeclSymbols,
-    const llvm::SmallVectorImpl<const semantics::Symbol *> &reductionSymbols);
+    const llvm::SmallVectorImpl<const semantics::Symbol *> &reductionSymbols,
+    llvm::DenseMap<const semantics::Symbol *, mlir::Value> *reductionVarCache);
 
 template mlir::omp::DeclareReductionOp
 ReductionProcessor::createDeclareReduction<mlir::omp::DeclareReductionOp>(
@@ -658,7 +660,8 @@ bool ReductionProcessor::processReductionArguments(
     llvm::SmallVectorImpl<mlir::Value> &reductionVars,
     llvm::SmallVectorImpl<bool> &reduceVarByRef,
     llvm::SmallVectorImpl<mlir::Attribute> &reductionDeclSymbols,
-    const llvm::SmallVectorImpl<const semantics::Symbol *> &reductionSymbols) {
+    const llvm::SmallVectorImpl<const semantics::Symbol *> &reductionSymbols,
+    llvm::DenseMap<const semantics::Symbol *, mlir::Value> *reductionVarCache) {
   fir::FirOpBuilder &builder = converter.getFirOpBuilder();
 
   if constexpr (std::is_same_v<RedOperatorListTy,
@@ -701,6 +704,21 @@ bool ReductionProcessor::processReductionArguments(
   }
 
   for (const semantics::Symbol *symbol : reductionSymbols) {
+    // If a cached reduction variable exists for this symbol, reuse it.
+    // This ensures that composite constructs (e.g. DO SIMD) where both
+    // the outer wrapper (wsloop) and inner wrapper (simd) process the same
+    // reduction clause share the same SSA value, enabling genLoopVars()'s
+    // IRMapping to correctly remap inner wrapper operands to outer wrapper
+    // block arguments.
+    if (reductionVarCache) {
+      auto it = reductionVarCache->find(symbol);
+      if (it != reductionVarCache->end()) {
+        reductionVars.push_back(it->second);
+        reduceVarByRef.push_back(doReductionByRef(it->second));
+        continue;
+      }
+    }
+
     mlir::Value symVal = converter.getSymbolAddress(*symbol);
 
     if (auto declOp = symVal.getDefiningOp<hlfir::DeclareOp>())
@@ -753,7 +771,12 @@ bool ReductionProcessor::processReductionArguments(
 
     reductionVars.push_back(
         builder.createConvert(currentLocation, refTy, symVal));
-    reduceVarByRef.push_back(doReductionByRef(symVal));
+    reduceVarByRef.push_back(doReductionByRef(reductionVars.back()));
+
+    // Cache the final SSA value for this symbol so that subsequent calls
+    // (e.g. for the inner wrapper in a composite construct) reuse it.
+    if (reductionVarCache)
+      reductionVarCache->try_emplace(symbol, reductionVars.back());
   }
 
   unsigned idx = 0;

diff  --git a/flang/test/Lower/OpenMP/wsloop-simd.f90 b/flang/test/Lower/OpenMP/wsloop-simd.f90
index 03e35de04cace..987bc804a9b44 100644
--- a/flang/test/Lower/OpenMP/wsloop-simd.f90
+++ b/flang/test/Lower/OpenMP/wsloop-simd.f90
@@ -67,6 +67,25 @@ subroutine do_simd_reduction()
   !$omp end do simd
 end subroutine do_simd_reduction
 
+! Verify that the simd reduction var references the wsloop's reduction
+! block arg (not the original array), ensuring proper chaining of
+! per-SIMD-lane results into the wsloop's thread-private reduction copy.
+! CHECK-LABEL: {{.*}}do_simd_array_reduction{{.*}}
+subroutine do_simd_array_reduction()
+  integer :: a(100)
+  a = 0
+  ! CHECK:      omp.wsloop
+  ! CHECK-SAME: reduction(byref @[[ADD_RED_SYM:.*]] %{{.*}} -> %[[ADD_RED_OUTER:.*]] : !fir.ref<!fir.box<!fir.array<100xi32>>>)
+  ! CHECK-NEXT: omp.simd
+  ! CHECK-SAME: reduction(byref @[[ADD_RED_SYM]] %[[ADD_RED_OUTER]] -> %[[ADD_RED_INNER:.*]] : !fir.ref<!fir.box<!fir.array<100xi32>>>)
+  ! CHECK-NEXT: omp.loop_nest
+  !$omp do simd reduction(+:a)
+    do index_ = 1, 10
+      a = a + index_
+    end do
+  !$omp end do simd
+end subroutine do_simd_array_reduction
+
 ! CHECK-LABEL: func.func @_QPdo_simd_private(
 subroutine do_simd_private()
   integer, allocatable :: tmp