[flang-commits] [flang] da66e6d - [flang][openmp] Fix incorrect reduction for array section in OpenMP DO SIMD (#192394)
via flang-commits
flang-commits at lists.llvm.org
Thu Apr 30 08:36:33 PDT 2026
Author: SunilKuravinakop
Date: 2026-04-30T21:06:28+05:30
New Revision: da66e6d0d5a9e26f689ba797f28345996a032e44
URL: https://github.com/llvm/llvm-project/commit/da66e6d0d5a9e26f689ba797f28345996a032e44
DIFF: https://github.com/llvm/llvm-project/commit/da66e6d0d5a9e26f689ba797f28345996a032e44.diff
LOG: [flang][openmp] Fix incorrect reduction for array section in OpenMP DO SIMD (#192394)
for "!omp do parallel simd reduction" ensuring that reduction for array
section is done properly by :
1) per-SIMD-lane reduction results are combined into the wsloop's
thread-local copies.
2) wsloop thread-local copies are combined across threads by the wsloop
reduction.
Issue is in [192077](https://github.com/llvm/llvm-project/issues/192077)
---------
Co-authored-by: Sunil Kuravinakop <kuravina at pe31.hpc.amslabs.hpecorp.net>
Added:
Modified:
flang/include/flang/Lower/Support/ReductionProcessor.h
flang/lib/Lower/OpenMP/ClauseProcessor.cpp
flang/lib/Lower/OpenMP/ClauseProcessor.h
flang/lib/Lower/OpenMP/OpenMP.cpp
flang/lib/Lower/Support/ReductionProcessor.cpp
flang/test/Lower/OpenMP/wsloop-simd.f90
Removed:
################################################################################
diff --git a/flang/include/flang/Lower/Support/ReductionProcessor.h b/flang/include/flang/Lower/Support/ReductionProcessor.h
index bbc4879bbe352..0b4a692827a79 100644
--- a/flang/include/flang/Lower/Support/ReductionProcessor.h
+++ b/flang/include/flang/Lower/Support/ReductionProcessor.h
@@ -144,6 +144,12 @@ class ReductionProcessor {
/// Creates a reduction declaration and associates it with an OpenMP block
/// directive.
+ /// \param [in,out] reductionVarCache - optional cache mapping reduction
+ /// symbols to their SSA values. When provided, array/box reduction
+ /// variables that have already been allocated will be reused instead of
+ /// creating new allocas. This ensures that nested composite wrappers
+ /// (e.g. wsloop and simd in DO SIMD) share the same SSA values, allowing
+ /// the genLoopVars() mapper to correctly remap inner wrapper operands.
template <typename OpType, typename RedOperatorListTy>
static bool processReductionArguments(
mlir::Location currentLocation, lower::AbstractConverter &converter,
@@ -151,7 +157,9 @@ class ReductionProcessor {
llvm::SmallVectorImpl<mlir::Value> &reductionVars,
llvm::SmallVectorImpl<bool> &reduceVarByRef,
llvm::SmallVectorImpl<mlir::Attribute> &reductionDeclSymbols,
- const llvm::SmallVectorImpl<const semantics::Symbol *> &reductionSymbols);
+ const llvm::SmallVectorImpl<const semantics::Symbol *> &reductionSymbols,
+ llvm::DenseMap<const semantics::Symbol *, mlir::Value>
+ *reductionVarCache = nullptr);
};
template <typename FloatOp, typename IntegerOp>
diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
index 796dda34e3821..1c39e90a922cf 100644
--- a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
+++ b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
@@ -2023,7 +2023,9 @@ bool ClauseProcessor::processNontemporal(
bool ClauseProcessor::processReduction(
mlir::Location currentLocation, mlir::omp::ReductionClauseOps &result,
- llvm::SmallVectorImpl<const semantics::Symbol *> &outReductionSyms) const {
+ llvm::SmallVectorImpl<const semantics::Symbol *> &outReductionSyms,
+ llvm::DenseMap<const semantics::Symbol *, mlir::Value> *reductionVarCache)
+ const {
return findRepeatableClause<omp::clause::Reduction>(
[&](const omp::clause::Reduction &clause, const parser::CharBlock &) {
llvm::SmallVector<mlir::Value> reductionVars;
@@ -2047,7 +2049,7 @@ bool ClauseProcessor::processReduction(
currentLocation, converter,
std::get<typename omp::clause::ReductionOperatorList>(clause.t),
reductionVars, reduceVarByRef, reductionDeclSymbols,
- reductionSyms))
+ reductionSyms, reductionVarCache))
TODO(currentLocation, "Lowering unrecognised reduction type");
// Copy local lists into the output.
llvm::copy(reductionVars, std::back_inserter(result.reductionVars));
diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.h b/flang/lib/Lower/OpenMP/ClauseProcessor.h
index 29b5c29b8e33a..acf1068efb987 100644
--- a/flang/lib/Lower/OpenMP/ClauseProcessor.h
+++ b/flang/lib/Lower/OpenMP/ClauseProcessor.h
@@ -166,7 +166,9 @@ class ClauseProcessor {
bool processNontemporal(mlir::omp::NontemporalClauseOps &result) const;
bool processReduction(
mlir::Location currentLocation, mlir::omp::ReductionClauseOps &result,
- llvm::SmallVectorImpl<const semantics::Symbol *> &reductionSyms) const;
+ llvm::SmallVectorImpl<const semantics::Symbol *> &reductionSyms,
+ llvm::DenseMap<const semantics::Symbol *, mlir::Value>
+ *reductionVarCache = nullptr) const;
bool processTaskReduction(
mlir::Location currentLocation, mlir::omp::TaskReductionClauseOps &result,
llvm::SmallVectorImpl<const semantics::Symbol *> &outReductionSyms) const;
diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp
index 6859f5b291342..88d28cf94b045 100644
--- a/flang/lib/Lower/OpenMP/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP/OpenMP.cpp
@@ -1667,13 +1667,15 @@ static void genSimdClauses(
lower::AbstractConverter &converter, semantics::SemanticsContext &semaCtx,
const List<Clause> &clauses, mlir::Location loc,
mlir::omp::SimdOperands &clauseOps,
- llvm::SmallVectorImpl<const semantics::Symbol *> &reductionSyms) {
+ llvm::SmallVectorImpl<const semantics::Symbol *> &reductionSyms,
+ llvm::DenseMap<const semantics::Symbol *, mlir::Value> *reductionVarCache =
+ nullptr) {
ClauseProcessor cp(converter, semaCtx, clauses);
cp.processAligned(clauseOps);
cp.processIf(llvm::omp::Directive::OMPD_simd, clauseOps);
cp.processNontemporal(clauseOps);
cp.processOrder(clauseOps);
- cp.processReduction(loc, clauseOps, reductionSyms);
+ cp.processReduction(loc, clauseOps, reductionSyms, reductionVarCache);
cp.processSafelen(clauseOps);
cp.processSimdlen(clauseOps);
cp.processLinear(clauseOps);
@@ -1943,13 +1945,15 @@ static void genWsloopClauses(
lower::AbstractConverter &converter, semantics::SemanticsContext &semaCtx,
lower::StatementContext &stmtCtx, const List<Clause> &clauses,
mlir::Location loc, mlir::omp::WsloopOperands &clauseOps,
- llvm::SmallVectorImpl<const semantics::Symbol *> &reductionSyms) {
+ llvm::SmallVectorImpl<const semantics::Symbol *> &reductionSyms,
+ llvm::DenseMap<const semantics::Symbol *, mlir::Value> *reductionVarCache =
+ nullptr) {
ClauseProcessor cp(converter, semaCtx, clauses);
cp.processAllocate(clauseOps);
cp.processNowait(clauseOps);
cp.processOrder(clauseOps);
cp.processOrdered(clauseOps);
- cp.processReduction(loc, clauseOps, reductionSyms);
+ cp.processReduction(loc, clauseOps, reductionSyms, reductionVarCache);
cp.processSchedule(stmtCtx, clauseOps);
cp.processLinear(clauseOps);
}
@@ -3505,6 +3509,10 @@ static mlir::omp::DistributeOp genCompositeDistributeParallelDoSimd(
/*isComposite=*/true);
// Clause processing.
+ // Use a shared cache so that both wsloop and simd produce the same SSA
+ // values for array/box reduction variables. See genCompositeDoSimd.
+ llvm::DenseMap<const semantics::Symbol *, mlir::Value> reductionVarCache;
+
mlir::omp::DistributeOperands distributeClauseOps;
genDistributeClauses(converter, semaCtx, stmtCtx, distributeItem->clauses,
loc, distributeClauseOps);
@@ -3512,12 +3520,12 @@ static mlir::omp::DistributeOp genCompositeDistributeParallelDoSimd(
mlir::omp::WsloopOperands wsloopClauseOps;
llvm::SmallVector<const semantics::Symbol *> wsloopReductionSyms;
genWsloopClauses(converter, semaCtx, stmtCtx, doItem->clauses, loc,
- wsloopClauseOps, wsloopReductionSyms);
+ wsloopClauseOps, wsloopReductionSyms, &reductionVarCache);
mlir::omp::SimdOperands simdClauseOps;
llvm::SmallVector<const semantics::Symbol *> simdReductionSyms;
genSimdClauses(converter, semaCtx, simdItem->clauses, loc, simdClauseOps,
- simdReductionSyms);
+ simdReductionSyms, &reductionVarCache);
DataSharingProcessor simdItemDSP(converter, semaCtx, simdItem->clauses, eval,
/*shouldCollectPreDeterminedSymbols=*/true,
@@ -3638,15 +3646,21 @@ static mlir::omp::WsloopOp genCompositeDoSimd(
ConstructQueue::const_iterator simdItem = std::next(doItem);
// Clause processing.
+ // Use a shared cache so that both wsloop and simd produce the same SSA
+ // values for array/box reduction variables, enabling genLoopVars()'s
+ // IRMapping to correctly chain the inner wrapper's operands to the outer
+ // wrapper's block arguments.
+ llvm::DenseMap<const semantics::Symbol *, mlir::Value> reductionVarCache;
+
mlir::omp::WsloopOperands wsloopClauseOps;
llvm::SmallVector<const semantics::Symbol *> wsloopReductionSyms;
genWsloopClauses(converter, semaCtx, stmtCtx, doItem->clauses, loc,
- wsloopClauseOps, wsloopReductionSyms);
+ wsloopClauseOps, wsloopReductionSyms, &reductionVarCache);
mlir::omp::SimdOperands simdClauseOps;
llvm::SmallVector<const semantics::Symbol *> simdReductionSyms;
genSimdClauses(converter, semaCtx, simdItem->clauses, loc, simdClauseOps,
- simdReductionSyms);
+ simdReductionSyms, &reductionVarCache);
DataSharingProcessor wsloopItemDSP(
converter, semaCtx, doItem->clauses, eval,
diff --git a/flang/lib/Lower/Support/ReductionProcessor.cpp b/flang/lib/Lower/Support/ReductionProcessor.cpp
index 2aa040e4b0ca8..d5387f7a59118 100644
--- a/flang/lib/Lower/Support/ReductionProcessor.cpp
+++ b/flang/lib/Lower/Support/ReductionProcessor.cpp
@@ -46,7 +46,8 @@ template bool ReductionProcessor::processReductionArguments<
llvm::SmallVectorImpl<mlir::Value> &reductionVars,
llvm::SmallVectorImpl<bool> &reduceVarByRef,
llvm::SmallVectorImpl<mlir::Attribute> &reductionDeclSymbols,
- const llvm::SmallVectorImpl<const semantics::Symbol *> &reductionSymbols);
+ const llvm::SmallVectorImpl<const semantics::Symbol *> &reductionSymbols,
+ llvm::DenseMap<const semantics::Symbol *, mlir::Value> *reductionVarCache);
template bool ReductionProcessor::processReductionArguments<
fir::DeclareReductionOp, llvm::SmallVector<fir::ReduceOperationEnum>>(
@@ -55,7 +56,8 @@ template bool ReductionProcessor::processReductionArguments<
llvm::SmallVectorImpl<mlir::Value> &reductionVars,
llvm::SmallVectorImpl<bool> &reduceVarByRef,
llvm::SmallVectorImpl<mlir::Attribute> &reductionDeclSymbols,
- const llvm::SmallVectorImpl<const semantics::Symbol *> &reductionSymbols);
+ const llvm::SmallVectorImpl<const semantics::Symbol *> &reductionSymbols,
+ llvm::DenseMap<const semantics::Symbol *, mlir::Value> *reductionVarCache);
template mlir::omp::DeclareReductionOp
ReductionProcessor::createDeclareReduction<mlir::omp::DeclareReductionOp>(
@@ -658,7 +660,8 @@ bool ReductionProcessor::processReductionArguments(
llvm::SmallVectorImpl<mlir::Value> &reductionVars,
llvm::SmallVectorImpl<bool> &reduceVarByRef,
llvm::SmallVectorImpl<mlir::Attribute> &reductionDeclSymbols,
- const llvm::SmallVectorImpl<const semantics::Symbol *> &reductionSymbols) {
+ const llvm::SmallVectorImpl<const semantics::Symbol *> &reductionSymbols,
+ llvm::DenseMap<const semantics::Symbol *, mlir::Value> *reductionVarCache) {
fir::FirOpBuilder &builder = converter.getFirOpBuilder();
if constexpr (std::is_same_v<RedOperatorListTy,
@@ -701,6 +704,21 @@ bool ReductionProcessor::processReductionArguments(
}
for (const semantics::Symbol *symbol : reductionSymbols) {
+ // If a cached reduction variable exists for this symbol, reuse it.
+ // This ensures that composite constructs (e.g. DO SIMD) where both
+ // the outer wrapper (wsloop) and inner wrapper (simd) process the same
+ // reduction clause share the same SSA value, enabling genLoopVars()'s
+ // IRMapping to correctly remap inner wrapper operands to outer wrapper
+ // block arguments.
+ if (reductionVarCache) {
+ auto it = reductionVarCache->find(symbol);
+ if (it != reductionVarCache->end()) {
+ reductionVars.push_back(it->second);
+ reduceVarByRef.push_back(doReductionByRef(it->second));
+ continue;
+ }
+ }
+
mlir::Value symVal = converter.getSymbolAddress(*symbol);
if (auto declOp = symVal.getDefiningOp<hlfir::DeclareOp>())
@@ -753,7 +771,12 @@ bool ReductionProcessor::processReductionArguments(
reductionVars.push_back(
builder.createConvert(currentLocation, refTy, symVal));
- reduceVarByRef.push_back(doReductionByRef(symVal));
+ reduceVarByRef.push_back(doReductionByRef(reductionVars.back()));
+
+ // Cache the final SSA value for this symbol so that subsequent calls
+ // (e.g. for the inner wrapper in a composite construct) reuse it.
+ if (reductionVarCache)
+ reductionVarCache->try_emplace(symbol, reductionVars.back());
}
unsigned idx = 0;
diff --git a/flang/test/Lower/OpenMP/wsloop-simd.f90 b/flang/test/Lower/OpenMP/wsloop-simd.f90
index 03e35de04cace..987bc804a9b44 100644
--- a/flang/test/Lower/OpenMP/wsloop-simd.f90
+++ b/flang/test/Lower/OpenMP/wsloop-simd.f90
@@ -67,6 +67,25 @@ subroutine do_simd_reduction()
!$omp end do simd
end subroutine do_simd_reduction
+! Verify that the simd reduction var references the wsloop's reduction
+! block arg (not the original array), ensuring proper chaining of
+! per-SIMD-lane results into the wsloop's thread-private reduction copy.
+! CHECK-LABEL: {{.*}}do_simd_array_reduction{{.*}}
+subroutine do_simd_array_reduction()
+ integer :: a(100)
+ a = 0
+ ! CHECK: omp.wsloop
+ ! CHECK-SAME: reduction(byref @[[ADD_RED_SYM:.*]] %{{.*}} -> %[[ADD_RED_OUTER:.*]] : !fir.ref<!fir.box<!fir.array<100xi32>>>)
+ ! CHECK-NEXT: omp.simd
+ ! CHECK-SAME: reduction(byref @[[ADD_RED_SYM]] %[[ADD_RED_OUTER]] -> %[[ADD_RED_INNER:.*]] : !fir.ref<!fir.box<!fir.array<100xi32>>>)
+ ! CHECK-NEXT: omp.loop_nest
+ !$omp do simd reduction(+:a)
+ do index_ = 1, 10
+ a = a + index_
+ end do
+ !$omp end do simd
+end subroutine do_simd_array_reduction
+
! CHECK-LABEL: func.func @_QPdo_simd_private(
subroutine do_simd_private()
integer, allocatable :: tmp
More information about the flang-commits
mailing list