[flang-commits] [flang] [flang][openmp] Fix incorrect reduction for array section in OpenMP DO SIMD (PR #192394)
via flang-commits
flang-commits at lists.llvm.org
Fri Apr 24 03:08:36 PDT 2026
https://github.com/SunilKuravinakop updated https://github.com/llvm/llvm-project/pull/192394
>From c332054af4e8e9bf47e68432d9303eb7e0993860 Mon Sep 17 00:00:00 2001
From: Sunil Kuravinakop <kuravina at pe31.hpc.amslabs.hpecorp.net>
Date: Wed, 15 Apr 2026 14:52:30 -0500
Subject: [PATCH 1/2] for "!omp do parallel simd reduction" ensuring that
reduction is done properly by : 1) per-SIMD-lane reduction results are
combined into the wsloop's thread-local copies. 2) wsloop thread-local
copies are combined across threads by the wsloop reduction.
---
flang/lib/Lower/OpenMP/OpenMP.cpp | 16 ++++++++++++++++
flang/test/Lower/OpenMP/wsloop-simd.f90 | 19 +++++++++++++++++++
2 files changed, 35 insertions(+)
diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp
index 33de565eda275..bc6208624db1a 100644
--- a/flang/lib/Lower/OpenMP/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP/OpenMP.cpp
@@ -3601,6 +3601,22 @@ static mlir::omp::WsloopOp genCompositeDoSimd(
converter, loc, wsloopClauseOps, wsloopArgs);
wsloopOp.setComposite(/*val=*/true);
+ // For composite DO SIMD, the simd reduction vars must reference the
+ // wsloop's reduction block args (thread-private copies) rather than the
+ // original variables. This ensures that
+ // 1) per-SIMD-lane reduction results are combined into the wsloop's
+ // thread-local copies
+ // 2) wsloop thread-local copies are combined across
+ // threads by the wsloop reduction.
+ auto wsloopBlockArgIface =
+ llvm::cast<mlir::omp::BlockArgOpenMPOpInterface>(*wsloopOp);
+ for (unsigned i = 0; i < simdClauseOps.reductionVars.size() &&
+ i < wsloopBlockArgIface.numReductionBlockArgs();
+ ++i) {
+ simdClauseOps.reductionVars[i] =
+ wsloopBlockArgIface.getReductionBlockArgs()[i];
+ }
+
EntryBlockArgs simdArgs;
simdArgs.priv.syms = simdItemDSP.getDelayedPrivSymbols();
simdArgs.priv.vars = simdClauseOps.privateVars;
diff --git a/flang/test/Lower/OpenMP/wsloop-simd.f90 b/flang/test/Lower/OpenMP/wsloop-simd.f90
index 03e35de04cace..987bc804a9b44 100644
--- a/flang/test/Lower/OpenMP/wsloop-simd.f90
+++ b/flang/test/Lower/OpenMP/wsloop-simd.f90
@@ -67,6 +67,25 @@ subroutine do_simd_reduction()
!$omp end do simd
end subroutine do_simd_reduction
+! Verify that the simd reduction var references the wsloop's reduction
+! block arg (not the original array), ensuring proper chaining of
+! per-SIMD-lane results into the wsloop's thread-private reduction copy.
+! CHECK-LABEL: {{.*}}do_simd_array_reduction{{.*}}
+subroutine do_simd_array_reduction()
+ integer :: a(100)
+ a = 0
+ ! CHECK: omp.wsloop
+ ! CHECK-SAME: reduction(byref @[[ADD_RED_SYM:.*]] %{{.*}} -> %[[ADD_RED_OUTER:.*]] : !fir.ref<!fir.box<!fir.array<100xi32>>>)
+ ! CHECK-NEXT: omp.simd
+ ! CHECK-SAME: reduction(byref @[[ADD_RED_SYM]] %[[ADD_RED_OUTER]] -> %[[ADD_RED_INNER:.*]] : !fir.ref<!fir.box<!fir.array<100xi32>>>)
+ ! CHECK-NEXT: omp.loop_nest
+ !$omp do simd reduction(+:a)
+ do index_ = 1, 10
+ a = a + index_
+ end do
+ !$omp end do simd
+end subroutine do_simd_array_reduction
+
! CHECK-LABEL: func.func @_QPdo_simd_private(
subroutine do_simd_private()
integer, allocatable :: tmp
>From ae5df1a10a7dafb75daa43740750308d7de8538f Mon Sep 17 00:00:00 2001
From: Sunil Kuravinakop <kuravina at pe31.hpc.amslabs.hpecorp.net>
Date: Fri, 24 Apr 2026 04:34:19 -0500
Subject: [PATCH 2/2] Moving the fix into: genWsloopClauses() &
genSimdClauses() for Composite Simd within do loop. The fix is when arrays
are used not for scalar.
---
.../flang/Lower/Support/ReductionProcessor.h | 10 +++-
flang/lib/Lower/OpenMP/ClauseProcessor.cpp | 6 ++-
flang/lib/Lower/OpenMP/ClauseProcessor.h | 4 +-
flang/lib/Lower/OpenMP/OpenMP.cpp | 46 +++++++++----------
.../lib/Lower/Support/ReductionProcessor.cpp | 31 +++++++++++--
5 files changed, 65 insertions(+), 32 deletions(-)
diff --git a/flang/include/flang/Lower/Support/ReductionProcessor.h b/flang/include/flang/Lower/Support/ReductionProcessor.h
index bd0447360f089..2ab6c97cde73b 100644
--- a/flang/include/flang/Lower/Support/ReductionProcessor.h
+++ b/flang/include/flang/Lower/Support/ReductionProcessor.h
@@ -141,6 +141,12 @@ class ReductionProcessor {
/// Creates a reduction declaration and associates it with an OpenMP block
/// directive.
+ /// \param [in,out] reductionVarCache - optional cache mapping reduction
+ /// symbols to their SSA values. When provided, array/box reduction
+ /// variables that have already been allocated will be reused instead of
+ /// creating new allocas. This ensures that nested composite wrappers
+ /// (e.g. wsloop and simd in DO SIMD) share the same SSA values, allowing
+ /// the genLoopVars() mapper to correctly remap inner wrapper operands.
template <typename OpType, typename RedOperatorListTy>
static bool processReductionArguments(
mlir::Location currentLocation, lower::AbstractConverter &converter,
@@ -148,7 +154,9 @@ class ReductionProcessor {
llvm::SmallVectorImpl<mlir::Value> &reductionVars,
llvm::SmallVectorImpl<bool> &reduceVarByRef,
llvm::SmallVectorImpl<mlir::Attribute> &reductionDeclSymbols,
- const llvm::SmallVectorImpl<const semantics::Symbol *> &reductionSymbols);
+ const llvm::SmallVectorImpl<const semantics::Symbol *> &reductionSymbols,
+ llvm::DenseMap<const semantics::Symbol *, mlir::Value>
+ *reductionVarCache = nullptr);
};
template <typename FloatOp, typename IntegerOp>
diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
index 45b11c818245e..a29915d6bcb34 100644
--- a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
+++ b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
@@ -2002,7 +2002,9 @@ bool ClauseProcessor::processNontemporal(
bool ClauseProcessor::processReduction(
mlir::Location currentLocation, mlir::omp::ReductionClauseOps &result,
- llvm::SmallVectorImpl<const semantics::Symbol *> &outReductionSyms) const {
+ llvm::SmallVectorImpl<const semantics::Symbol *> &outReductionSyms,
+ llvm::DenseMap<const semantics::Symbol *, mlir::Value> *reductionVarCache)
+ const {
return findRepeatableClause<omp::clause::Reduction>(
[&](const omp::clause::Reduction &clause, const parser::CharBlock &) {
llvm::SmallVector<mlir::Value> reductionVars;
@@ -2026,7 +2028,7 @@ bool ClauseProcessor::processReduction(
currentLocation, converter,
std::get<typename omp::clause::ReductionOperatorList>(clause.t),
reductionVars, reduceVarByRef, reductionDeclSymbols,
- reductionSyms))
+ reductionSyms, reductionVarCache))
TODO(currentLocation, "Lowering unrecognised reduction type");
// Copy local lists into the output.
llvm::copy(reductionVars, std::back_inserter(result.reductionVars));
diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.h b/flang/lib/Lower/OpenMP/ClauseProcessor.h
index f343ee8ff4332..f6ee9139381e5 100644
--- a/flang/lib/Lower/OpenMP/ClauseProcessor.h
+++ b/flang/lib/Lower/OpenMP/ClauseProcessor.h
@@ -163,7 +163,9 @@ class ClauseProcessor {
bool processNontemporal(mlir::omp::NontemporalClauseOps &result) const;
bool processReduction(
mlir::Location currentLocation, mlir::omp::ReductionClauseOps &result,
- llvm::SmallVectorImpl<const semantics::Symbol *> &reductionSyms) const;
+ llvm::SmallVectorImpl<const semantics::Symbol *> &reductionSyms,
+ llvm::DenseMap<const semantics::Symbol *, mlir::Value>
+ *reductionVarCache = nullptr) const;
bool processTaskReduction(
mlir::Location currentLocation, mlir::omp::TaskReductionClauseOps &result,
llvm::SmallVectorImpl<const semantics::Symbol *> &outReductionSyms) const;
diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp
index bc6208624db1a..28d5264d73627 100644
--- a/flang/lib/Lower/OpenMP/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP/OpenMP.cpp
@@ -1648,13 +1648,15 @@ static void genSimdClauses(
lower::AbstractConverter &converter, semantics::SemanticsContext &semaCtx,
const List<Clause> &clauses, mlir::Location loc,
mlir::omp::SimdOperands &clauseOps,
- llvm::SmallVectorImpl<const semantics::Symbol *> &reductionSyms) {
+ llvm::SmallVectorImpl<const semantics::Symbol *> &reductionSyms,
+ llvm::DenseMap<const semantics::Symbol *, mlir::Value> *reductionVarCache =
+ nullptr) {
ClauseProcessor cp(converter, semaCtx, clauses);
cp.processAligned(clauseOps);
cp.processIf(llvm::omp::Directive::OMPD_simd, clauseOps);
cp.processNontemporal(clauseOps);
cp.processOrder(clauseOps);
- cp.processReduction(loc, clauseOps, reductionSyms);
+ cp.processReduction(loc, clauseOps, reductionSyms, reductionVarCache);
cp.processSafelen(clauseOps);
cp.processSimdlen(clauseOps);
cp.processLinear(clauseOps);
@@ -1913,13 +1915,15 @@ static void genWsloopClauses(
lower::AbstractConverter &converter, semantics::SemanticsContext &semaCtx,
lower::StatementContext &stmtCtx, const List<Clause> &clauses,
mlir::Location loc, mlir::omp::WsloopOperands &clauseOps,
- llvm::SmallVectorImpl<const semantics::Symbol *> &reductionSyms) {
+ llvm::SmallVectorImpl<const semantics::Symbol *> &reductionSyms,
+ llvm::DenseMap<const semantics::Symbol *, mlir::Value> *reductionVarCache =
+ nullptr) {
ClauseProcessor cp(converter, semaCtx, clauses);
cp.processAllocate(clauseOps);
cp.processNowait(clauseOps);
cp.processOrder(clauseOps);
cp.processOrdered(clauseOps);
- cp.processReduction(loc, clauseOps, reductionSyms);
+ cp.processReduction(loc, clauseOps, reductionSyms, reductionVarCache);
cp.processSchedule(stmtCtx, clauseOps);
cp.processLinear(clauseOps);
}
@@ -3428,6 +3432,10 @@ static mlir::omp::DistributeOp genCompositeDistributeParallelDoSimd(
/*isComposite=*/true);
// Clause processing.
+ // Use a shared cache so that both wsloop and simd produce the same SSA
+ // values for array/box reduction variables. See genCompositeDoSimd.
+ llvm::DenseMap<const semantics::Symbol *, mlir::Value> reductionVarCache;
+
mlir::omp::DistributeOperands distributeClauseOps;
genDistributeClauses(converter, semaCtx, stmtCtx, distributeItem->clauses,
loc, distributeClauseOps);
@@ -3435,12 +3443,12 @@ static mlir::omp::DistributeOp genCompositeDistributeParallelDoSimd(
mlir::omp::WsloopOperands wsloopClauseOps;
llvm::SmallVector<const semantics::Symbol *> wsloopReductionSyms;
genWsloopClauses(converter, semaCtx, stmtCtx, doItem->clauses, loc,
- wsloopClauseOps, wsloopReductionSyms);
+ wsloopClauseOps, wsloopReductionSyms, &reductionVarCache);
mlir::omp::SimdOperands simdClauseOps;
llvm::SmallVector<const semantics::Symbol *> simdReductionSyms;
genSimdClauses(converter, semaCtx, simdItem->clauses, loc, simdClauseOps,
- simdReductionSyms);
+ simdReductionSyms, &reductionVarCache);
DataSharingProcessor simdItemDSP(converter, semaCtx, simdItem->clauses, eval,
/*shouldCollectPreDeterminedSymbols=*/true,
@@ -3561,15 +3569,21 @@ static mlir::omp::WsloopOp genCompositeDoSimd(
ConstructQueue::const_iterator simdItem = std::next(doItem);
// Clause processing.
+ // Use a shared cache so that both wsloop and simd produce the same SSA
+ // values for array/box reduction variables, enabling genLoopVars()'s
+ // IRMapping to correctly chain the inner wrapper's operands to the outer
+ // wrapper's block arguments.
+ llvm::DenseMap<const semantics::Symbol *, mlir::Value> reductionVarCache;
+
mlir::omp::WsloopOperands wsloopClauseOps;
llvm::SmallVector<const semantics::Symbol *> wsloopReductionSyms;
genWsloopClauses(converter, semaCtx, stmtCtx, doItem->clauses, loc,
- wsloopClauseOps, wsloopReductionSyms);
+ wsloopClauseOps, wsloopReductionSyms, &reductionVarCache);
mlir::omp::SimdOperands simdClauseOps;
llvm::SmallVector<const semantics::Symbol *> simdReductionSyms;
genSimdClauses(converter, semaCtx, simdItem->clauses, loc, simdClauseOps,
- simdReductionSyms);
+ simdReductionSyms, &reductionVarCache);
DataSharingProcessor wsloopItemDSP(
converter, semaCtx, doItem->clauses, eval,
@@ -3601,22 +3615,6 @@ static mlir::omp::WsloopOp genCompositeDoSimd(
converter, loc, wsloopClauseOps, wsloopArgs);
wsloopOp.setComposite(/*val=*/true);
- // For composite DO SIMD, the simd reduction vars must reference the
- // wsloop's reduction block args (thread-private copies) rather than the
- // original variables. This ensures that
- // 1) per-SIMD-lane reduction results are combined into the wsloop's
- // thread-local copies
- // 2) wsloop thread-local copies are combined across
- // threads by the wsloop reduction.
- auto wsloopBlockArgIface =
- llvm::cast<mlir::omp::BlockArgOpenMPOpInterface>(*wsloopOp);
- for (unsigned i = 0; i < simdClauseOps.reductionVars.size() &&
- i < wsloopBlockArgIface.numReductionBlockArgs();
- ++i) {
- simdClauseOps.reductionVars[i] =
- wsloopBlockArgIface.getReductionBlockArgs()[i];
- }
-
EntryBlockArgs simdArgs;
simdArgs.priv.syms = simdItemDSP.getDelayedPrivSymbols();
simdArgs.priv.vars = simdClauseOps.privateVars;
diff --git a/flang/lib/Lower/Support/ReductionProcessor.cpp b/flang/lib/Lower/Support/ReductionProcessor.cpp
index 04efba463de49..d77f4358a2383 100644
--- a/flang/lib/Lower/Support/ReductionProcessor.cpp
+++ b/flang/lib/Lower/Support/ReductionProcessor.cpp
@@ -46,7 +46,8 @@ template bool ReductionProcessor::processReductionArguments<
llvm::SmallVectorImpl<mlir::Value> &reductionVars,
llvm::SmallVectorImpl<bool> &reduceVarByRef,
llvm::SmallVectorImpl<mlir::Attribute> &reductionDeclSymbols,
- const llvm::SmallVectorImpl<const semantics::Symbol *> &reductionSymbols);
+ const llvm::SmallVectorImpl<const semantics::Symbol *> &reductionSymbols,
+ llvm::DenseMap<const semantics::Symbol *, mlir::Value> *reductionVarCache);
template bool ReductionProcessor::processReductionArguments<
fir::DeclareReductionOp, llvm::SmallVector<fir::ReduceOperationEnum>>(
@@ -55,7 +56,8 @@ template bool ReductionProcessor::processReductionArguments<
llvm::SmallVectorImpl<mlir::Value> &reductionVars,
llvm::SmallVectorImpl<bool> &reduceVarByRef,
llvm::SmallVectorImpl<mlir::Attribute> &reductionDeclSymbols,
- const llvm::SmallVectorImpl<const semantics::Symbol *> &reductionSymbols);
+ const llvm::SmallVectorImpl<const semantics::Symbol *> &reductionSymbols,
+ llvm::DenseMap<const semantics::Symbol *, mlir::Value> *reductionVarCache);
template mlir::omp::DeclareReductionOp
ReductionProcessor::createDeclareReduction<mlir::omp::DeclareReductionOp>(
@@ -653,7 +655,8 @@ bool ReductionProcessor::processReductionArguments(
llvm::SmallVectorImpl<mlir::Value> &reductionVars,
llvm::SmallVectorImpl<bool> &reduceVarByRef,
llvm::SmallVectorImpl<mlir::Attribute> &reductionDeclSymbols,
- const llvm::SmallVectorImpl<const semantics::Symbol *> &reductionSymbols) {
+ const llvm::SmallVectorImpl<const semantics::Symbol *> &reductionSymbols,
+ llvm::DenseMap<const semantics::Symbol *, mlir::Value> *reductionVarCache) {
fir::FirOpBuilder &builder = converter.getFirOpBuilder();
if constexpr (std::is_same_v<RedOperatorListTy,
@@ -696,6 +699,21 @@ bool ReductionProcessor::processReductionArguments(
}
for (const semantics::Symbol *symbol : reductionSymbols) {
+ // If a cached reduction variable exists for this symbol, reuse it.
+ // This ensures that composite constructs (e.g. DO SIMD) where both
+ // the outer wrapper (wsloop) and inner wrapper (simd) process the same
+ // reduction clause share the same SSA value, enabling genLoopVars()'s
+ // IRMapping to correctly remap inner wrapper operands to outer wrapper
+ // block arguments.
+ if (reductionVarCache) {
+ auto it = reductionVarCache->find(symbol);
+ if (it != reductionVarCache->end()) {
+ reductionVars.push_back(it->second);
+ reduceVarByRef.push_back(doReductionByRef(it->second));
+ continue;
+ }
+ }
+
mlir::Value symVal = converter.getSymbolAddress(*symbol);
if (auto declOp = symVal.getDefiningOp<hlfir::DeclareOp>())
@@ -748,7 +766,12 @@ bool ReductionProcessor::processReductionArguments(
reductionVars.push_back(
builder.createConvert(currentLocation, refTy, symVal));
- reduceVarByRef.push_back(doReductionByRef(symVal));
+ reduceVarByRef.push_back(doReductionByRef(reductionVars.back()));
+
+ // Cache the final SSA value for this symbol so that subsequent calls
+ // (e.g. for the inner wrapper in a composite construct) reuse it.
+ if (reductionVarCache)
+ reductionVarCache->try_emplace(symbol, reductionVars.back());
}
unsigned idx = 0;
More information about the flang-commits
mailing list