[flang-commits] [flang] [flang][OpenMP] `do concurrent`: support `local` on device (PR #157638)
Kareem Ergawy via flang-commits
flang-commits at lists.llvm.org
Sat Sep 20 23:31:57 PDT 2025
https://github.com/ergawy updated https://github.com/llvm/llvm-project/pull/157638
>From ec26feacd05ee50507afd0cf5e14be1092c7cc53 Mon Sep 17 00:00:00 2001
From: ergawy <kareem.ergawy at amd.com>
Date: Tue, 2 Sep 2025 05:54:00 -0500
Subject: [PATCH 1/2] [flang][OpenMP] `do concurrent`: support `local` on
device
Extends support for mapping `do concurrent` to the device by adding
support for `local` specifiers. The changes in this PR map the local
variable to the `omp.target` op and use the mapped value as the
`private` clause operand in the nested `omp.parallel` op.
---
.../include/flang/Optimizer/Dialect/FIROps.td | 12 ++
.../OpenMP/DoConcurrentConversion.cpp | 192 +++++++++++-------
.../Transforms/DoConcurrent/local_device.mlir | 49 +++++
3 files changed, 175 insertions(+), 78 deletions(-)
create mode 100644 flang/test/Transforms/DoConcurrent/local_device.mlir
diff --git a/flang/include/flang/Optimizer/Dialect/FIROps.td b/flang/include/flang/Optimizer/Dialect/FIROps.td
index bc971e8fd6600..fc6eedc6ed4c6 100644
--- a/flang/include/flang/Optimizer/Dialect/FIROps.td
+++ b/flang/include/flang/Optimizer/Dialect/FIROps.td
@@ -3894,6 +3894,18 @@ def fir_DoConcurrentLoopOp : fir_Op<"do_concurrent.loop",
return getReduceVars().size();
}
+ unsigned getInductionVarsStart() {
+ return 0;
+ }
+
+ unsigned getLocalOperandsStart() {
+ return getNumInductionVars();
+ }
+
+ unsigned getReduceOperandsStart() {
+ return getLocalOperandsStart() + getNumLocalOperands();
+ }
+
mlir::Block::BlockArgListType getInductionVars() {
return getBody()->getArguments().slice(0, getNumInductionVars());
}
diff --git a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
index 6c71924000842..d00a4fdd2cf2e 100644
--- a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
+++ b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
@@ -138,6 +138,9 @@ void collectLoopLiveIns(fir::DoConcurrentLoopOp loop,
liveIns.push_back(operand->get());
});
+
+ for (mlir::Value local : loop.getLocalVars())
+ liveIns.push_back(local);
}
/// Collects values that are local to a loop: "loop-local values". A loop-local
@@ -298,8 +301,7 @@ class DoConcurrentConversion
.getIsTargetDevice();
mlir::omp::TargetOperands targetClauseOps;
- genLoopNestClauseOps(doLoop.getLoc(), rewriter, loop, mapper,
- loopNestClauseOps,
+ genLoopNestClauseOps(doLoop.getLoc(), rewriter, loop, loopNestClauseOps,
isTargetDevice ? nullptr : &targetClauseOps);
LiveInShapeInfoMap liveInShapeInfoMap;
@@ -321,14 +323,13 @@ class DoConcurrentConversion
}
mlir::omp::ParallelOp parallelOp =
- genParallelOp(doLoop.getLoc(), rewriter, ivInfos, mapper);
+ genParallelOp(rewriter, loop, ivInfos, mapper);
// Only set as composite when part of `distribute parallel do`.
parallelOp.setComposite(mapToDevice);
if (!mapToDevice)
- genLoopNestClauseOps(doLoop.getLoc(), rewriter, loop, mapper,
- loopNestClauseOps);
+ genLoopNestClauseOps(doLoop.getLoc(), rewriter, loop, loopNestClauseOps);
for (mlir::Value local : locals)
looputils::localizeLoopLocalValue(local, parallelOp.getRegion(),
@@ -337,10 +338,38 @@ class DoConcurrentConversion
if (mapToDevice)
genDistributeOp(doLoop.getLoc(), rewriter).setComposite(/*val=*/true);
- mlir::omp::LoopNestOp ompLoopNest =
+ auto [loopNestOp, wsLoopOp] =
genWsLoopOp(rewriter, loop, mapper, loopNestClauseOps,
/*isComposite=*/mapToDevice);
+ // `local` region arguments are transferred/cloned from the `do concurrent`
+ // loop to the loopnest op when the region is cloned above. Instead, these
+ // region arguments should be on the workshare loop's region.
+ if (mapToDevice) {
+ for (auto [parallelArg, loopNestArg] : llvm::zip_equal(
+ parallelOp.getRegion().getArguments(),
+ loopNestOp.getRegion().getArguments().slice(
+ loop.getLocalOperandsStart(), loop.getNumLocalOperands())))
+ rewriter.replaceAllUsesWith(loopNestArg, parallelArg);
+
+ for (auto [wsloopArg, loopNestArg] : llvm::zip_equal(
+ wsLoopOp.getRegion().getArguments(),
+ loopNestOp.getRegion().getArguments().slice(
+ loop.getReduceOperandsStart(), loop.getNumReduceOperands())))
+ rewriter.replaceAllUsesWith(loopNestArg, wsloopArg);
+ } else {
+ for (auto [wsloopArg, loopNestArg] :
+ llvm::zip_equal(wsLoopOp.getRegion().getArguments(),
+ loopNestOp.getRegion().getArguments().drop_front(
+ loopNestClauseOps.loopLowerBounds.size())))
+ rewriter.replaceAllUsesWith(loopNestArg, wsloopArg);
+ }
+
+ for (unsigned i = 0;
+ i < loop.getLocalVars().size() + loop.getReduceVars().size(); ++i)
+ loopNestOp.getRegion().eraseArgument(
+ loopNestClauseOps.loopLowerBounds.size());
+
rewriter.setInsertionPoint(doLoop);
fir::FirOpBuilder builder(
rewriter,
@@ -361,7 +390,7 @@ class DoConcurrentConversion
// Mark `unordered` loops that are not perfectly nested to be skipped from
// the legality check of the `ConversionTarget` since we are not interested
// in mapping them to OpenMP.
- ompLoopNest->walk([&](fir::DoConcurrentOp doLoop) {
+ loopNestOp->walk([&](fir::DoConcurrentOp doLoop) {
concurrentLoopsToSkip.insert(doLoop);
});
@@ -372,11 +401,21 @@ class DoConcurrentConversion
private:
mlir::omp::ParallelOp
- genParallelOp(mlir::Location loc, mlir::ConversionPatternRewriter &rewriter,
+ genParallelOp(mlir::ConversionPatternRewriter &rewriter,
+ fir::DoConcurrentLoopOp loop,
looputils::InductionVariableInfos &ivInfos,
mlir::IRMapping &mapper) const {
- auto parallelOp = mlir::omp::ParallelOp::create(rewriter, loc);
- rewriter.createBlock(¶llelOp.getRegion());
+ mlir::omp::ParallelOperands parallelOps;
+
+ if (mapToDevice)
+ genPrivatizers(rewriter, mapper, loop, parallelOps);
+
+ mlir::Location loc = loop.getLoc();
+ auto parallelOp = mlir::omp::ParallelOp::create(rewriter, loc, parallelOps);
+ Fortran::common::openmp::EntryBlockArgs parallelArgs;
+ parallelArgs.priv.vars = parallelOps.privateVars;
+ Fortran::common::openmp::genEntryBlock(rewriter, parallelArgs,
+ parallelOp.getRegion());
rewriter.setInsertionPoint(mlir::omp::TerminatorOp::create(rewriter, loc));
genLoopNestIndVarAllocs(rewriter, ivInfos, mapper);
@@ -413,7 +452,7 @@ class DoConcurrentConversion
void genLoopNestClauseOps(
mlir::Location loc, mlir::ConversionPatternRewriter &rewriter,
- fir::DoConcurrentLoopOp loop, mlir::IRMapping &mapper,
+ fir::DoConcurrentLoopOp loop,
mlir::omp::LoopNestOperands &loopNestClauseOps,
mlir::omp::TargetOperands *targetClauseOps = nullptr) const {
assert(loopNestClauseOps.loopLowerBounds.empty() &&
@@ -444,59 +483,14 @@ class DoConcurrentConversion
loopNestClauseOps.loopInclusive = rewriter.getUnitAttr();
}
- mlir::omp::LoopNestOp
+ std::pair<mlir::omp::LoopNestOp, mlir::omp::WsloopOp>
genWsLoopOp(mlir::ConversionPatternRewriter &rewriter,
fir::DoConcurrentLoopOp loop, mlir::IRMapping &mapper,
const mlir::omp::LoopNestOperands &clauseOps,
bool isComposite) const {
mlir::omp::WsloopOperands wsloopClauseOps;
-
- auto cloneFIRRegionToOMP = [&rewriter](mlir::Region &firRegion,
- mlir::Region &ompRegion) {
- if (!firRegion.empty()) {
- rewriter.cloneRegionBefore(firRegion, ompRegion, ompRegion.begin());
- auto firYield =
- mlir::cast<fir::YieldOp>(ompRegion.back().getTerminator());
- rewriter.setInsertionPoint(firYield);
- mlir::omp::YieldOp::create(rewriter, firYield.getLoc(),
- firYield.getOperands());
- rewriter.eraseOp(firYield);
- }
- };
-
- // For `local` (and `local_init`) opernads, emit corresponding `private`
- // clauses and attach these clauses to the workshare loop.
- if (!loop.getLocalVars().empty())
- for (auto [op, sym, arg] : llvm::zip_equal(
- loop.getLocalVars(),
- loop.getLocalSymsAttr().getAsRange<mlir::SymbolRefAttr>(),
- loop.getRegionLocalArgs())) {
- auto localizer = moduleSymbolTable.lookup<fir::LocalitySpecifierOp>(
- sym.getLeafReference());
- if (localizer.getLocalitySpecifierType() ==
- fir::LocalitySpecifierType::LocalInit)
- TODO(localizer.getLoc(),
- "local_init conversion is not supported yet");
-
- mlir::OpBuilder::InsertionGuard guard(rewriter);
- rewriter.setInsertionPointAfter(localizer);
-
- auto privatizer = mlir::omp::PrivateClauseOp::create(
- rewriter, localizer.getLoc(), sym.getLeafReference().str() + ".omp",
- localizer.getTypeAttr().getValue(),
- mlir::omp::DataSharingClauseType::Private);
-
- cloneFIRRegionToOMP(localizer.getInitRegion(),
- privatizer.getInitRegion());
- cloneFIRRegionToOMP(localizer.getDeallocRegion(),
- privatizer.getDeallocRegion());
-
- moduleSymbolTable.insert(privatizer);
-
- wsloopClauseOps.privateVars.push_back(op);
- wsloopClauseOps.privateSyms.push_back(
- mlir::SymbolRefAttr::get(privatizer));
- }
+ if (!mapToDevice)
+ genPrivatizers(rewriter, mapper, loop, wsloopClauseOps);
if (!loop.getReduceVars().empty()) {
for (auto [op, byRef, sym, arg] : llvm::zip_equal(
@@ -519,15 +513,15 @@ class DoConcurrentConversion
rewriter, firReducer.getLoc(), ompReducerName,
firReducer.getTypeAttr().getValue());
- cloneFIRRegionToOMP(firReducer.getAllocRegion(),
+ cloneFIRRegionToOMP(rewriter, firReducer.getAllocRegion(),
ompReducer.getAllocRegion());
- cloneFIRRegionToOMP(firReducer.getInitializerRegion(),
+ cloneFIRRegionToOMP(rewriter, firReducer.getInitializerRegion(),
ompReducer.getInitializerRegion());
- cloneFIRRegionToOMP(firReducer.getReductionRegion(),
+ cloneFIRRegionToOMP(rewriter, firReducer.getReductionRegion(),
ompReducer.getReductionRegion());
- cloneFIRRegionToOMP(firReducer.getAtomicReductionRegion(),
+ cloneFIRRegionToOMP(rewriter, firReducer.getAtomicReductionRegion(),
ompReducer.getAtomicReductionRegion());
- cloneFIRRegionToOMP(firReducer.getCleanupRegion(),
+ cloneFIRRegionToOMP(rewriter, firReducer.getCleanupRegion(),
ompReducer.getCleanupRegion());
moduleSymbolTable.insert(ompReducer);
}
@@ -559,21 +553,10 @@ class DoConcurrentConversion
rewriter.setInsertionPointToEnd(&loopNestOp.getRegion().back());
mlir::omp::YieldOp::create(rewriter, loop->getLoc());
+ loop->getParentOfType<mlir::ModuleOp>().print(
+ llvm::errs(), mlir::OpPrintingFlags().assumeVerified());
- // `local` region arguments are transferred/cloned from the `do concurrent`
- // loop to the loopnest op when the region is cloned above. Instead, these
- // region arguments should be on the workshare loop's region.
- for (auto [wsloopArg, loopNestArg] :
- llvm::zip_equal(wsloopOp.getRegion().getArguments(),
- loopNestOp.getRegion().getArguments().drop_front(
- clauseOps.loopLowerBounds.size())))
- rewriter.replaceAllUsesWith(loopNestArg, wsloopArg);
-
- for (unsigned i = 0;
- i < loop.getLocalVars().size() + loop.getReduceVars().size(); ++i)
- loopNestOp.getRegion().eraseArgument(clauseOps.loopLowerBounds.size());
-
- return loopNestOp;
+ return {loopNestOp, wsloopOp};
}
void genBoundsOps(fir::FirOpBuilder &builder, mlir::Value liveIn,
@@ -817,6 +800,59 @@ class DoConcurrentConversion
return distOp;
}
+ void cloneFIRRegionToOMP(mlir::ConversionPatternRewriter &rewriter,
+ mlir::Region &firRegion,
+ mlir::Region &ompRegion) const {
+ if (!firRegion.empty()) {
+ rewriter.cloneRegionBefore(firRegion, ompRegion, ompRegion.begin());
+ auto firYield =
+ mlir::cast<fir::YieldOp>(ompRegion.back().getTerminator());
+ rewriter.setInsertionPoint(firYield);
+ mlir::omp::YieldOp::create(rewriter, firYield.getLoc(),
+ firYield.getOperands());
+ rewriter.eraseOp(firYield);
+ }
+ }
+
+ void genPrivatizers(mlir::ConversionPatternRewriter &rewriter,
+ mlir::IRMapping &mapper, fir::DoConcurrentLoopOp loop,
+ mlir::omp::PrivateClauseOps &privateClauseOps) const {
+ // For `local` (and `local_init`) operands, emit corresponding `private`
+ // clauses and record their vars and symbols in `privateClauseOps`.
+ if (!loop.getLocalVars().empty())
+ for (auto [var, sym, arg] : llvm::zip_equal(
+ loop.getLocalVars(),
+ loop.getLocalSymsAttr().getAsRange<mlir::SymbolRefAttr>(),
+ loop.getRegionLocalArgs())) {
+ auto localizer = moduleSymbolTable.lookup<fir::LocalitySpecifierOp>(
+ sym.getLeafReference());
+ if (localizer.getLocalitySpecifierType() ==
+ fir::LocalitySpecifierType::LocalInit)
+ TODO(localizer.getLoc(),
+ "local_init conversion is not supported yet");
+
+ mlir::OpBuilder::InsertionGuard guard(rewriter);
+ rewriter.setInsertionPointAfter(localizer);
+
+ auto privatizer = mlir::omp::PrivateClauseOp::create(
+ rewriter, localizer.getLoc(), sym.getLeafReference().str() + ".omp",
+ localizer.getTypeAttr().getValue(),
+ mlir::omp::DataSharingClauseType::Private);
+
+ cloneFIRRegionToOMP(rewriter, localizer.getInitRegion(),
+ privatizer.getInitRegion());
+ cloneFIRRegionToOMP(rewriter, localizer.getDeallocRegion(),
+ privatizer.getDeallocRegion());
+
+ moduleSymbolTable.insert(privatizer);
+
+ privateClauseOps.privateVars.push_back(mapToDevice ? mapper.lookup(var)
+ : var);
+ privateClauseOps.privateSyms.push_back(
+ mlir::SymbolRefAttr::get(privatizer));
+ }
+ }
+
bool mapToDevice;
llvm::DenseSet<fir::DoConcurrentOp> &concurrentLoopsToSkip;
mlir::SymbolTable &moduleSymbolTable;
diff --git a/flang/test/Transforms/DoConcurrent/local_device.mlir b/flang/test/Transforms/DoConcurrent/local_device.mlir
new file mode 100644
index 0000000000000..e54bb1aeb414e
--- /dev/null
+++ b/flang/test/Transforms/DoConcurrent/local_device.mlir
@@ -0,0 +1,49 @@
+// RUN: fir-opt --omp-do-concurrent-conversion="map-to=device" %s -o - | FileCheck %s
+
+fir.local {type = local} @_QFfooEmy_local_private_f32 : f32
+
+func.func @_QPfoo() {
+ %0 = fir.dummy_scope : !fir.dscope
+ %3 = fir.alloca f32 {bindc_name = "my_local", uniq_name = "_QFfooEmy_local"}
+ %4:2 = hlfir.declare %3 {uniq_name = "_QFfooEmy_local"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+
+ %c1 = arith.constant 1 : index
+ %c10 = arith.constant 10 : index
+
+ fir.do_concurrent {
+ %7 = fir.alloca i32 {bindc_name = "i"}
+ %8:2 = hlfir.declare %7 {uniq_name = "_QFfooEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+
+ fir.do_concurrent.loop (%arg0) = (%c1) to (%c10) step (%c1) local(@_QFfooEmy_local_private_f32 %4#0 -> %arg1 : !fir.ref<f32>) {
+ %9 = fir.convert %arg0 : (index) -> i32
+ fir.store %9 to %8#0 : !fir.ref<i32>
+ %10:2 = hlfir.declare %arg1 {uniq_name = "_QFfooEmy_local"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+ %cst = arith.constant 4.200000e+01 : f32
+ hlfir.assign %cst to %10#0 : f32, !fir.ref<f32>
+ }
+ }
+ return
+}
+
+// CHECK: omp.private {type = private} @[[OMP_PRIVATIZER:.*.omp]] : f32
+
+// CHECK: %[[LOCAL_DECL:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "{{.*}}my_local"}
+// CHECK: %[[LOCAL_MAP:.*]] = omp.map.info var_ptr(%[[LOCAL_DECL]]#1 : {{.*}})
+
+// CHECK: omp.target host_eval({{.*}}) map_entries({{.*}}, %[[LOCAL_MAP]] -> %[[LOCAL_MAP_ARG:.*]] : {{.*}}) {
+// CHECK: %[[LOCAL_DEV_DECL:.*]]:2 = hlfir.declare %[[LOCAL_MAP_ARG]] {uniq_name = "_QFfooEmy_local"}
+
+// CHECK: omp.teams {
+// CHECK: omp.parallel private(@[[OMP_PRIVATIZER]] %[[LOCAL_DEV_DECL]]#0 -> %[[LOCAL_PRIV_ARG:.*]] : {{.*}}) {
+// CHECK: omp.distribute {
+// CHECK: omp.wsloop {
+// CHECK: omp.loop_nest {{.*}} {
+// CHECK: %[[LOCAL_LOOP_DECL:.*]]:2 = hlfir.declare %[[LOCAL_PRIV_ARG]] {uniq_name = "_QFfooEmy_local"}
+// CHECK: hlfir.assign %{{.*}} to %[[LOCAL_LOOP_DECL]]#0
+// CHECK: omp.yield
+// CHECK: }
+// CHECK: }
+// CHECK: }
+// CHECK: }
+// CHECK: }
+// CHECK: }
>From c59436f4343f992f34939b7b91e3fb3c01313d35 Mon Sep 17 00:00:00 2001
From: ergawy <kareem.ergawy at amd.com>
Date: Sun, 21 Sep 2025 01:31:36 -0500
Subject: [PATCH 2/2] add docs
---
flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp | 8 ++++++++
1 file changed, 8 insertions(+)
diff --git a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
index d00a4fdd2cf2e..fb99623128621 100644
--- a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
+++ b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
@@ -814,6 +814,14 @@ class DoConcurrentConversion
}
}
+ /// Generate bodies of OpenMP privatizers by cloning the bodies of FIR
+ /// privatizers.
+ ///
+ /// \param [in] rewriter - used to drive IR generation for privatizers.
+ /// \param [in] mapper - value mapping from FIR to OpenMP constructs.
+ /// \param [in] loop - FIR loop whose localizers are converted.
+ ///
+ /// \param [out] privateClauseOps - filled with private vars and symbols.
void genPrivatizers(mlir::ConversionPatternRewriter &rewriter,
mlir::IRMapping &mapper, fir::DoConcurrentLoopOp loop,
mlir::omp::PrivateClauseOps &privateClauseOps) const {
More information about the flang-commits
mailing list