[llvm-branch-commits] [flang] [flang][OpenMP] Update `do concurrent` mapping pass to use `fir.do_concurrent` op (PR #138489)
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Mon May 5 00:25:59 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-flang-fir-hlfir
Author: Kareem Ergawy (ergawy)
<details>
<summary>Changes</summary>
This PR updates the `do concurrent` to OpenMP mapping pass to use the newly added `fir.do_concurrent` ops that were recently added upstream instead of handling nests of `fir.do_loop ... unordered` ops.
Parent PR: https://github.com/llvm/llvm-project/pull/137928.
---
Patch is 34.21 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/138489.diff
9 Files Affected:
- (modified) flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp (+84-278)
- (modified) flang/test/Transforms/DoConcurrent/basic_device.mlir (+13-11)
- (modified) flang/test/Transforms/DoConcurrent/basic_host.f90 (-3)
- (modified) flang/test/Transforms/DoConcurrent/basic_host.mlir (+14-12)
- (modified) flang/test/Transforms/DoConcurrent/locally_destroyed_temp.f90 (-3)
- (removed) flang/test/Transforms/DoConcurrent/loop_nest_test.f90 (-92)
- (modified) flang/test/Transforms/DoConcurrent/multiple_iteration_ranges.f90 (-3)
- (modified) flang/test/Transforms/DoConcurrent/non_const_bounds.f90 (-3)
- (modified) flang/test/Transforms/DoConcurrent/not_perfectly_nested.f90 (+11-13)
``````````diff
diff --git a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
index 2c069860ffdca..0fdb302fe10ca 100644
--- a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
+++ b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
@@ -6,6 +6,7 @@
//
//===----------------------------------------------------------------------===//
+#include "flang/Optimizer/Builder/FIRBuilder.h"
#include "flang/Optimizer/Dialect/FIROps.h"
#include "flang/Optimizer/OpenMP/Passes.h"
#include "flang/Optimizer/OpenMP/Utils.h"
@@ -28,8 +29,10 @@ namespace looputils {
/// Stores info needed about the induction/iteration variable for each `do
/// concurrent` in a loop nest.
struct InductionVariableInfo {
- InductionVariableInfo(fir::DoLoopOp doLoop) { populateInfo(doLoop); }
-
+ InductionVariableInfo(fir::DoConcurrentLoopOp loop,
+ mlir::Value inductionVar) {
+ populateInfo(loop, inductionVar);
+ }
/// The operation allocating memory for iteration variable.
mlir::Operation *iterVarMemDef;
/// the operation(s) updating the iteration variable with the current
@@ -45,7 +48,7 @@ struct InductionVariableInfo {
/// ...
/// %i:2 = hlfir.declare %0 {uniq_name = "_QFEi"} : ...
/// ...
- /// fir.do_loop %ind_var = %lb to %ub step %s unordered {
+ /// fir.do_concurrent.loop (%ind_var) = (%lb) to (%ub) step (%s) {
/// %ind_var_conv = fir.convert %ind_var : (index) -> i32
/// fir.store %ind_var_conv to %i#1 : !fir.ref<i32>
/// ...
@@ -62,14 +65,14 @@ struct InductionVariableInfo {
/// Note: The current implementation is dependent on how flang emits loop
/// bodies; which is sufficient for the current simple test/use cases. If this
/// proves to be insufficient, this should be made more generic.
- void populateInfo(fir::DoLoopOp doLoop) {
+ void populateInfo(fir::DoConcurrentLoopOp loop, mlir::Value inductionVar) {
mlir::Value result = nullptr;
// Checks if a StoreOp is updating the memref of the loop's iteration
// variable.
auto isStoringIV = [&](fir::StoreOp storeOp) {
// Direct store into the IV memref.
- if (storeOp.getValue() == doLoop.getInductionVar()) {
+ if (storeOp.getValue() == inductionVar) {
indVarUpdateOps.push_back(storeOp);
return true;
}
@@ -77,7 +80,7 @@ struct InductionVariableInfo {
// Indirect store into the IV memref.
if (auto convertOp = mlir::dyn_cast<fir::ConvertOp>(
storeOp.getValue().getDefiningOp())) {
- if (convertOp.getOperand() == doLoop.getInductionVar()) {
+ if (convertOp.getOperand() == inductionVar) {
indVarUpdateOps.push_back(convertOp);
indVarUpdateOps.push_back(storeOp);
return true;
@@ -87,7 +90,7 @@ struct InductionVariableInfo {
return false;
};
- for (mlir::Operation &op : doLoop) {
+ for (mlir::Operation &op : loop) {
if (auto storeOp = mlir::dyn_cast<fir::StoreOp>(op))
if (isStoringIV(storeOp)) {
result = storeOp.getMemref();
@@ -100,219 +103,7 @@ struct InductionVariableInfo {
}
};
-using LoopNestToIndVarMap =
- llvm::MapVector<fir::DoLoopOp, InductionVariableInfo>;
-
-/// Loop \p innerLoop is considered perfectly-nested inside \p outerLoop iff
-/// there are no operations in \p outerloop's body other than:
-///
-/// 1. the operations needed to assign/update \p outerLoop's induction variable.
-/// 2. \p innerLoop itself.
-///
-/// \p return true if \p innerLoop is perfectly nested inside \p outerLoop
-/// according to the above definition.
-bool isPerfectlyNested(fir::DoLoopOp outerLoop, fir::DoLoopOp innerLoop) {
- mlir::ForwardSliceOptions forwardSliceOptions;
- forwardSliceOptions.inclusive = true;
- // The following will be used as an example to clarify the internals of this
- // function:
- // ```
- // 1. fir.do_loop %i_idx = %34 to %36 step %c1 unordered {
- // 2. %i_idx_2 = fir.convert %i_idx : (index) -> i32
- // 3. fir.store %i_idx_2 to %i_iv#1 : !fir.ref<i32>
- //
- // 4. fir.do_loop %j_idx = %37 to %39 step %c1_3 unordered {
- // 5. %j_idx_2 = fir.convert %j_idx : (index) -> i32
- // 6. fir.store %j_idx_2 to %j_iv#1 : !fir.ref<i32>
- // ... loop nest body, possible uses %i_idx ...
- // }
- // }
- // ```
- // In this example, the `j` loop is perfectly nested inside the `i` loop and
- // below is how we find that.
-
- // We don't care about the outer-loop's induction variable's uses within the
- // inner-loop, so we filter out these uses.
- //
- // This filter tells `getForwardSlice` (below) to only collect operations
- // which produce results defined above (i.e. outside) the inner-loop's body.
- //
- // Since `outerLoop.getInductionVar()` is a block argument (to the
- // outer-loop's body), the filter effectively collects uses of
- // `outerLoop.getInductionVar()` inside the outer-loop but outside the
- // inner-loop.
- forwardSliceOptions.filter = [&](mlir::Operation *op) {
- return mlir::areValuesDefinedAbove(op->getResults(), innerLoop.getRegion());
- };
-
- llvm::SetVector<mlir::Operation *> indVarSlice;
- // The forward slice of the `i` loop's IV will be the 2 ops in line 1 & 2
- // above. Uses of `%i_idx` inside the `j` loop are not collected because of
- // the filter.
- mlir::getForwardSlice(outerLoop.getInductionVar(), &indVarSlice,
- forwardSliceOptions);
- llvm::DenseSet<mlir::Operation *> indVarSet(indVarSlice.begin(),
- indVarSlice.end());
-
- llvm::DenseSet<mlir::Operation *> outerLoopBodySet;
- // The following walk collects ops inside `outerLoop` that are **not**:
- // * the outer-loop itself,
- // * or the inner-loop,
- // * or the `fir.result` op (the outer-loop's terminator).
- //
- // For the above example, this will also populate `outerLoopBodySet` with ops
- // in line 1 & 2 since we skip the `i` loop, the `j` loop, and the terminator.
- outerLoop.walk<mlir::WalkOrder::PreOrder>([&](mlir::Operation *op) {
- if (op == outerLoop)
- return mlir::WalkResult::advance();
-
- if (op == innerLoop)
- return mlir::WalkResult::skip();
-
- if (mlir::isa<fir::ResultOp>(op))
- return mlir::WalkResult::advance();
-
- outerLoopBodySet.insert(op);
- return mlir::WalkResult::advance();
- });
-
- // If `outerLoopBodySet` ends up having the same ops as `indVarSet`, then
- // `outerLoop` only contains ops that setup its induction variable +
- // `innerLoop` + the `fir.result` terminator. In other words, `innerLoop` is
- // perfectly nested inside `outerLoop`.
- bool result = (outerLoopBodySet == indVarSet);
- LLVM_DEBUG(DBGS() << "Loop pair starting at location " << outerLoop.getLoc()
- << " is" << (result ? "" : " not")
- << " perfectly nested\n");
-
- return result;
-}
-
-/// Starting with `currentLoop` collect a perfectly nested loop nest, if any.
-/// This function collects as much as possible loops in the nest; it case it
-/// fails to recognize a certain nested loop as part of the nest it just returns
-/// the parent loops it discovered before.
-mlir::LogicalResult collectLoopNest(fir::DoLoopOp currentLoop,
- LoopNestToIndVarMap &loopNest) {
- assert(currentLoop.getUnordered());
-
- while (true) {
- loopNest.insert({currentLoop, InductionVariableInfo(currentLoop)});
- llvm::SmallVector<fir::DoLoopOp> unorderedLoops;
-
- for (auto nestedLoop : currentLoop.getRegion().getOps<fir::DoLoopOp>())
- if (nestedLoop.getUnordered())
- unorderedLoops.push_back(nestedLoop);
-
- if (unorderedLoops.empty())
- break;
-
- // Having more than one unordered loop means that we are not dealing with a
- // perfect loop nest (i.e. a mulit-range `do concurrent` loop); which is the
- // case we are after here.
- if (unorderedLoops.size() > 1)
- return mlir::failure();
-
- fir::DoLoopOp nestedUnorderedLoop = unorderedLoops.front();
-
- if (!isPerfectlyNested(currentLoop, nestedUnorderedLoop))
- return mlir::failure();
-
- currentLoop = nestedUnorderedLoop;
- }
-
- return mlir::success();
-}
-
-/// Prepares the `fir.do_loop` nest to be easily mapped to OpenMP. In
-/// particular, this function would take this input IR:
-/// ```
-/// fir.do_loop %i_iv = %i_lb to %i_ub step %i_step unordered {
-/// fir.store %i_iv to %i#1 : !fir.ref<i32>
-/// %j_lb = arith.constant 1 : i32
-/// %j_ub = arith.constant 10 : i32
-/// %j_step = arith.constant 1 : index
-///
-/// fir.do_loop %j_iv = %j_lb to %j_ub step %j_step unordered {
-/// fir.store %j_iv to %j#1 : !fir.ref<i32>
-/// ...
-/// }
-/// }
-/// ```
-///
-/// into the following form (using generic op form since the result is
-/// technically an invalid `fir.do_loop` op:
-///
-/// ```
-/// "fir.do_loop"(%i_lb, %i_ub, %i_step) <{unordered}> ({
-/// ^bb0(%i_iv: index):
-/// %j_lb = "arith.constant"() <{value = 1 : i32}> : () -> i32
-/// %j_ub = "arith.constant"() <{value = 10 : i32}> : () -> i32
-/// %j_step = "arith.constant"() <{value = 1 : index}> : () -> index
-///
-/// "fir.do_loop"(%j_lb, %j_ub, %j_step) <{unordered}> ({
-/// ^bb0(%new_i_iv: index, %new_j_iv: index):
-/// "fir.store"(%new_i_iv, %i#1) : (i32, !fir.ref<i32>) -> ()
-/// "fir.store"(%new_j_iv, %j#1) : (i32, !fir.ref<i32>) -> ()
-/// ...
-/// })
-/// ```
-///
-/// What happened to the loop nest is the following:
-///
-/// * the innermost loop's entry block was updated from having one operand to
-/// having `n` operands where `n` is the number of loops in the nest,
-///
-/// * the outer loop(s)' ops that update the IVs were sank inside the innermost
-/// loop (see the `"fir.store"(%new_i_iv, %i#1)` op above),
-///
-/// * the innermost loop's entry block's arguments were mapped in order from the
-/// outermost to the innermost IV.
-///
-/// With this IR change, we can directly inline the innermost loop's region into
-/// the newly generated `omp.loop_nest` op.
-///
-/// Note that this function has a pre-condition that \p loopNest consists of
-/// perfectly nested loops; i.e. there are no in-between ops between 2 nested
-/// loops except for the ops to setup the inner loop's LB, UB, and step. These
-/// ops are handled/cloned by `genLoopNestClauseOps(..)`.
-void sinkLoopIVArgs(mlir::ConversionPatternRewriter &rewriter,
- looputils::LoopNestToIndVarMap &loopNest) {
- if (loopNest.size() <= 1)
- return;
-
- fir::DoLoopOp innermostLoop = loopNest.back().first;
- mlir::Operation &innermostFirstOp = innermostLoop.getRegion().front().front();
-
- llvm::SmallVector<mlir::Type> argTypes;
- llvm::SmallVector<mlir::Location> argLocs;
-
- for (auto &[doLoop, indVarInfo] : llvm::drop_end(loopNest)) {
- // Sink the IV update ops to the innermost loop. We need to do for all loops
- // except for the innermost one, hence the `drop_end` usage above.
- for (mlir::Operation *op : indVarInfo.indVarUpdateOps)
- op->moveBefore(&innermostFirstOp);
-
- argTypes.push_back(doLoop.getInductionVar().getType());
- argLocs.push_back(doLoop.getInductionVar().getLoc());
- }
-
- mlir::Region &innermmostRegion = innermostLoop.getRegion();
- // Extend the innermost entry block with arguments to represent the outer IVs.
- innermmostRegion.addArguments(argTypes, argLocs);
-
- unsigned idx = 1;
- // In reverse, remap the IVs of the loop nest from the old values to the new
- // ones. We do that in reverse since the first argument before this loop is
- // the old IV for the innermost loop. Therefore, we want to replace it first
- // before the old value (1st argument in the block) is remapped to be the IV
- // of the outermost loop in the nest.
- for (auto &[doLoop, _] : llvm::reverse(loopNest)) {
- doLoop.getInductionVar().replaceAllUsesWith(
- innermmostRegion.getArgument(innermmostRegion.getNumArguments() - idx));
- ++idx;
- }
-}
+using InductionVariableInfos = llvm::SmallVector<InductionVariableInfo>;
/// Collects values that are local to a loop: "loop-local values". A loop-local
/// value is one that is used exclusively inside the loop but allocated outside
@@ -326,9 +117,9 @@ void sinkLoopIVArgs(mlir::ConversionPatternRewriter &rewriter,
/// used exclusively inside.
///
/// \param [out] locals - the list of loop-local values detected for \p doLoop.
-void collectLoopLocalValues(fir::DoLoopOp doLoop,
+void collectLoopLocalValues(fir::DoConcurrentLoopOp loop,
llvm::SetVector<mlir::Value> &locals) {
- doLoop.walk([&](mlir::Operation *op) {
+ loop.walk([&](mlir::Operation *op) {
for (mlir::Value operand : op->getOperands()) {
if (locals.contains(operand))
continue;
@@ -340,11 +131,11 @@ void collectLoopLocalValues(fir::DoLoopOp doLoop,
// Values defined inside the loop are not interesting since they do not
// need to be localized.
- if (doLoop->isAncestor(operand.getDefiningOp()))
+ if (loop->isAncestor(operand.getDefiningOp()))
continue;
for (auto *user : operand.getUsers()) {
- if (!doLoop->isAncestor(user)) {
+ if (!loop->isAncestor(user)) {
isLocal = false;
break;
}
@@ -373,39 +164,42 @@ static void localizeLoopLocalValue(mlir::Value local, mlir::Region &allocRegion,
}
} // namespace looputils
-class DoConcurrentConversion : public mlir::OpConversionPattern<fir::DoLoopOp> {
+class DoConcurrentConversion
+ : public mlir::OpConversionPattern<fir::DoConcurrentOp> {
public:
- using mlir::OpConversionPattern<fir::DoLoopOp>::OpConversionPattern;
+ using mlir::OpConversionPattern<fir::DoConcurrentOp>::OpConversionPattern;
- DoConcurrentConversion(mlir::MLIRContext *context, bool mapToDevice,
- llvm::DenseSet<fir::DoLoopOp> &concurrentLoopsToSkip)
+ DoConcurrentConversion(
+ mlir::MLIRContext *context, bool mapToDevice,
+ llvm::DenseSet<fir::DoConcurrentOp> &concurrentLoopsToSkip)
: OpConversionPattern(context), mapToDevice(mapToDevice),
concurrentLoopsToSkip(concurrentLoopsToSkip) {}
mlir::LogicalResult
- matchAndRewrite(fir::DoLoopOp doLoop, OpAdaptor adaptor,
+ matchAndRewrite(fir::DoConcurrentOp doLoop, OpAdaptor adaptor,
mlir::ConversionPatternRewriter &rewriter) const override {
if (mapToDevice)
return doLoop.emitError(
"not yet implemented: Mapping `do concurrent` loops to device");
- looputils::LoopNestToIndVarMap loopNest;
- bool hasRemainingNestedLoops =
- failed(looputils::collectLoopNest(doLoop, loopNest));
- if (hasRemainingNestedLoops)
- mlir::emitWarning(doLoop.getLoc(),
- "Some `do concurent` loops are not perfectly-nested. "
- "These will be serialized.");
+ looputils::InductionVariableInfos ivInfos;
+ auto loop = mlir::cast<fir::DoConcurrentLoopOp>(
+ doLoop.getRegion().back().getTerminator());
+
+ auto indVars = loop.getLoopInductionVars();
+ assert(indVars.has_value());
+
+ for (mlir::Value indVar : *indVars)
+ ivInfos.emplace_back(loop, indVar);
llvm::SetVector<mlir::Value> locals;
- looputils::collectLoopLocalValues(loopNest.back().first, locals);
- looputils::sinkLoopIVArgs(rewriter, loopNest);
+ looputils::collectLoopLocalValues(loop, locals);
mlir::IRMapping mapper;
mlir::omp::ParallelOp parallelOp =
- genParallelOp(doLoop.getLoc(), rewriter, loopNest, mapper);
+ genParallelOp(doLoop.getLoc(), rewriter, ivInfos, mapper);
mlir::omp::LoopNestOperands loopNestClauseOps;
- genLoopNestClauseOps(doLoop.getLoc(), rewriter, loopNest, mapper,
+ genLoopNestClauseOps(doLoop.getLoc(), rewriter, loop, mapper,
loopNestClauseOps);
for (mlir::Value local : locals)
@@ -413,41 +207,56 @@ class DoConcurrentConversion : public mlir::OpConversionPattern<fir::DoLoopOp> {
rewriter);
mlir::omp::LoopNestOp ompLoopNest =
- genWsLoopOp(rewriter, loopNest.back().first, mapper, loopNestClauseOps,
+ genWsLoopOp(rewriter, loop, mapper, loopNestClauseOps,
/*isComposite=*/mapToDevice);
- rewriter.eraseOp(doLoop);
+ rewriter.setInsertionPoint(doLoop);
+ fir::FirOpBuilder builder(
+ rewriter,
+ fir::getKindMapping(doLoop->getParentOfType<mlir::ModuleOp>()));
+
+ // Collect iteration variable(s) allocations so that we can move them
+ // outside the `fir.do_concurrent` wrapper (before erasing it).
+ llvm::SmallVector<mlir::Operation *> opsToMove;
+ for (mlir::Operation &op : llvm::drop_end(doLoop))
+ opsToMove.push_back(&op);
+
+ mlir::Block *allocBlock = builder.getAllocaBlock();
+
+ for (mlir::Operation *op : llvm::reverse(opsToMove)) {
+ rewriter.moveOpBefore(op, allocBlock, allocBlock->begin());
+ }
// Mark `unordered` loops that are not perfectly nested to be skipped from
// the legality check of the `ConversionTarget` since we are not interested
// in mapping them to OpenMP.
- ompLoopNest->walk([&](fir::DoLoopOp doLoop) {
- if (doLoop.getUnordered()) {
- concurrentLoopsToSkip.insert(doLoop);
- }
+ ompLoopNest->walk([&](fir::DoConcurrentOp doLoop) {
+ concurrentLoopsToSkip.insert(doLoop);
});
+ rewriter.eraseOp(doLoop);
+
return mlir::success();
}
private:
- mlir::omp::ParallelOp genParallelOp(mlir::Location loc,
- mlir::ConversionPatternRewriter &rewriter,
- looputils::LoopNestToIndVarMap &loopNest,
- mlir::IRMapping &mapper) const {
+ mlir::omp::ParallelOp
+ genParallelOp(mlir::Location loc, mlir::ConversionPatternRewriter &rewriter,
+ looputils::InductionVariableInfos &ivInfos,
+ mlir::IRMapping &mapper) const {
auto parallelOp = rewriter.create<mlir::omp::ParallelOp>(loc);
rewriter.createBlock(¶llelOp.getRegion());
rewriter.setInsertionPoint(rewriter.create<mlir::omp::TerminatorOp>(loc));
- genLoopNestIndVarAllocs(rewriter, loopNest, mapper);
+ genLoopNestIndVarAllocs(rewriter, ivInfos, mapper);
return parallelOp;
}
void genLoopNestIndVarAllocs(mlir::ConversionPatternRewriter &rewriter,
- looputils::LoopNestToIndVarMap &loopNest,
+ looputils::InductionVariableInfos &ivInfos,
mlir::IRMapping &mapper) const {
- for (auto &[_, indVarInfo] : loopNest)
+ for (auto &indVarInfo : ivInfos)
genInductionVariableAlloc(rewriter, indVarInfo.iterVarMemDef, mapper);
}
@@ -471,10 +280,11 @@ class DoConcurrentConversion : public mlir::OpConversionPattern<fir::DoLoopOp> {
return result;
}
- void genLoopNestClauseOps(
- mlir::Location loc, mlir::ConversionPatternRewriter &rewriter,
- looputils::LoopNestToIndVarMap &loopNest, mlir::IRMapping &mapper,
- mlir::omp::LoopNestOperands &loopNestClauseOps) const {
+ void
+ genLoopNestClauseOps(mlir::Location loc,
+ mlir::ConversionPatternRewriter &rewriter,
+ fir::DoConcurrentLoopOp loop, mlir::IRMapping &mapper,
+ mlir::omp::LoopNestOperands &loopNestClauseOps) const {
assert(loopNestClauseOps.loopLowerBounds.empty() &&
"Loop nest bounds were already emitted!");
@@ -483,43 +293,42 @@ class DoConcurrentConversion : public mlir::OpConversionPattern<fir::DoLoopOp> {
bounds.push_back(var.getDefiningOp()->getResult(0));
};
- for (auto &[doLoop, _] : loopNest) {
- populateBounds(doLoop.getLowerBound(), loopNestClauseOps.loopLowerBounds);
- populateBounds(doLoop.getUpperBound(), loopNestClauseOps.loopUpperBounds);
- populateBounds(doLoop.getStep(), loopNestClauseOps.loopSteps);
+ for (auto [lb, ub, st] : llvm::zip_equal(
+ loop.getLowerBound(), loop.getUpperBound(), loop.getStep())) {
+ populateBounds(lb, loopNestClauseOps.loopLowerBounds);
+ populateBounds(ub, loopNestClauseOps.loopUpperBounds);
+ populateBounds(st, loopNestClauseOp...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/138489
More information about the llvm-branch-commits
mailing list