[llvm-branch-commits] [flang] [flang][OpenMP] Update `do concurrent` mapping pass to use `fir.do_concurrent` op (PR #138489)
Kareem Ergawy via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Mon May 5 01:30:24 PDT 2025
https://github.com/ergawy updated https://github.com/llvm/llvm-project/pull/138489
From ac0cde7576c49a00e3591254d6891879b52bee81 Mon Sep 17 00:00:00 2001
From: ergawy <kareem.ergawy at amd.com>
Date: Mon, 5 May 2025 02:23:04 -0500
Subject: [PATCH] [flang][OpenMP] Update `do concurrent` mapping pass to use
`fir.do_concurrent` op
This PR updates the `do concurrent` to OpenMP mapping pass to use the recently added upstream `fir.do_concurrent` ops instead of handling nests of `fir.do_loop ... unordered` ops.
Parent PR: https://github.com/llvm/llvm-project/pull/137928.
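For illustration, below is a rough sketch of the rewrite the pass now performs on the host path, condensed from the `basic_host.mlir` test updated in this patch. The SSA names and the exact printed form of `omp.loop_nest` are illustrative only, not verbatim pass output:

```mlir
// Input: a single-range `do concurrent` loop using the new ops. The induction
// variable's alloca/declare now live inside the `fir.do_concurrent` wrapper.
fir.do_concurrent {
  %i = fir.alloca i32 {bindc_name = "i"}
  %i_decl:2 = hlfir.declare %i {uniq_name = "_QFEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
  fir.do_concurrent.loop (%iv) = (%lb) to (%ub) step (%step) {
    %iv_i32 = fir.convert %iv : (index) -> i32
    fir.store %iv_i32 to %i_decl#1 : !fir.ref<i32>
    // ... loop body ...
  }
}

// Approximate result: the wrapper is erased, the induction-variable allocation
// is cloned into the parallel region, and the loop region is cloned into the
// OpenMP loop nest.
omp.parallel {
  %priv_i = fir.alloca i32 {bindc_name = "i"}
  %priv_i_decl:2 = hlfir.declare %priv_i {uniq_name = "_QFEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
  omp.wsloop {
    omp.loop_nest (%iv) : index = (%lb) to (%ub) inclusive step (%step) {
      %iv_i32 = fir.convert %iv : (index) -> i32
      fir.store %iv_i32 to %priv_i_decl#1 : !fir.ref<i32>
      // ... loop body ...
      omp.yield
    }
  }
  omp.terminator
}
```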
---
.../OpenMP/DoConcurrentConversion.cpp | 362 ++++--------------
.../Transforms/DoConcurrent/basic_device.mlir | 24 +-
.../Transforms/DoConcurrent/basic_host.f90 | 3 -
.../Transforms/DoConcurrent/basic_host.mlir | 26 +-
.../DoConcurrent/locally_destroyed_temp.f90 | 3 -
.../DoConcurrent/loop_nest_test.f90 | 92 -----
.../multiple_iteration_ranges.f90 | 3 -
.../DoConcurrent/non_const_bounds.f90 | 3 -
.../DoConcurrent/not_perfectly_nested.f90 | 24 +-
9 files changed, 122 insertions(+), 418 deletions(-)
delete mode 100644 flang/test/Transforms/DoConcurrent/loop_nest_test.f90
diff --git a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
index 2c069860ffdca..0fdb302fe10ca 100644
--- a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
+++ b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
@@ -6,6 +6,7 @@
//
//===----------------------------------------------------------------------===//
+#include "flang/Optimizer/Builder/FIRBuilder.h"
#include "flang/Optimizer/Dialect/FIROps.h"
#include "flang/Optimizer/OpenMP/Passes.h"
#include "flang/Optimizer/OpenMP/Utils.h"
@@ -28,8 +29,10 @@ namespace looputils {
/// Stores info needed about the induction/iteration variable for each `do
/// concurrent` in a loop nest.
struct InductionVariableInfo {
- InductionVariableInfo(fir::DoLoopOp doLoop) { populateInfo(doLoop); }
-
+ InductionVariableInfo(fir::DoConcurrentLoopOp loop,
+ mlir::Value inductionVar) {
+ populateInfo(loop, inductionVar);
+ }
/// The operation allocating memory for iteration variable.
mlir::Operation *iterVarMemDef;
/// the operation(s) updating the iteration variable with the current
@@ -45,7 +48,7 @@ struct InductionVariableInfo {
/// ...
/// %i:2 = hlfir.declare %0 {uniq_name = "_QFEi"} : ...
/// ...
- /// fir.do_loop %ind_var = %lb to %ub step %s unordered {
+ /// fir.do_concurrent.loop (%ind_var) = (%lb) to (%ub) step (%s) {
/// %ind_var_conv = fir.convert %ind_var : (index) -> i32
/// fir.store %ind_var_conv to %i#1 : !fir.ref<i32>
/// ...
@@ -62,14 +65,14 @@ struct InductionVariableInfo {
/// Note: The current implementation is dependent on how flang emits loop
/// bodies; which is sufficient for the current simple test/use cases. If this
/// proves to be insufficient, this should be made more generic.
- void populateInfo(fir::DoLoopOp doLoop) {
+ void populateInfo(fir::DoConcurrentLoopOp loop, mlir::Value inductionVar) {
mlir::Value result = nullptr;
// Checks if a StoreOp is updating the memref of the loop's iteration
// variable.
auto isStoringIV = [&](fir::StoreOp storeOp) {
// Direct store into the IV memref.
- if (storeOp.getValue() == doLoop.getInductionVar()) {
+ if (storeOp.getValue() == inductionVar) {
indVarUpdateOps.push_back(storeOp);
return true;
}
@@ -77,7 +80,7 @@ struct InductionVariableInfo {
// Indirect store into the IV memref.
if (auto convertOp = mlir::dyn_cast<fir::ConvertOp>(
storeOp.getValue().getDefiningOp())) {
- if (convertOp.getOperand() == doLoop.getInductionVar()) {
+ if (convertOp.getOperand() == inductionVar) {
indVarUpdateOps.push_back(convertOp);
indVarUpdateOps.push_back(storeOp);
return true;
@@ -87,7 +90,7 @@ struct InductionVariableInfo {
return false;
};
- for (mlir::Operation &op : doLoop) {
+ for (mlir::Operation &op : loop) {
if (auto storeOp = mlir::dyn_cast<fir::StoreOp>(op))
if (isStoringIV(storeOp)) {
result = storeOp.getMemref();
@@ -100,219 +103,7 @@ struct InductionVariableInfo {
}
};
-using LoopNestToIndVarMap =
- llvm::MapVector<fir::DoLoopOp, InductionVariableInfo>;
-
-/// Loop \p innerLoop is considered perfectly-nested inside \p outerLoop iff
-/// there are no operations in \p outerloop's body other than:
-///
-/// 1. the operations needed to assign/update \p outerLoop's induction variable.
-/// 2. \p innerLoop itself.
-///
-/// \p return true if \p innerLoop is perfectly nested inside \p outerLoop
-/// according to the above definition.
-bool isPerfectlyNested(fir::DoLoopOp outerLoop, fir::DoLoopOp innerLoop) {
- mlir::ForwardSliceOptions forwardSliceOptions;
- forwardSliceOptions.inclusive = true;
- // The following will be used as an example to clarify the internals of this
- // function:
- // ```
- // 1. fir.do_loop %i_idx = %34 to %36 step %c1 unordered {
- // 2. %i_idx_2 = fir.convert %i_idx : (index) -> i32
- // 3. fir.store %i_idx_2 to %i_iv#1 : !fir.ref<i32>
- //
- // 4. fir.do_loop %j_idx = %37 to %39 step %c1_3 unordered {
- // 5. %j_idx_2 = fir.convert %j_idx : (index) -> i32
- // 6. fir.store %j_idx_2 to %j_iv#1 : !fir.ref<i32>
- // ... loop nest body, possible uses %i_idx ...
- // }
- // }
- // ```
- // In this example, the `j` loop is perfectly nested inside the `i` loop and
- // below is how we find that.
-
- // We don't care about the outer-loop's induction variable's uses within the
- // inner-loop, so we filter out these uses.
- //
- // This filter tells `getForwardSlice` (below) to only collect operations
- // which produce results defined above (i.e. outside) the inner-loop's body.
- //
- // Since `outerLoop.getInductionVar()` is a block argument (to the
- // outer-loop's body), the filter effectively collects uses of
- // `outerLoop.getInductionVar()` inside the outer-loop but outside the
- // inner-loop.
- forwardSliceOptions.filter = [&](mlir::Operation *op) {
- return mlir::areValuesDefinedAbove(op->getResults(), innerLoop.getRegion());
- };
-
- llvm::SetVector<mlir::Operation *> indVarSlice;
- // The forward slice of the `i` loop's IV will be the 2 ops in line 1 & 2
- // above. Uses of `%i_idx` inside the `j` loop are not collected because of
- // the filter.
- mlir::getForwardSlice(outerLoop.getInductionVar(), &indVarSlice,
- forwardSliceOptions);
- llvm::DenseSet<mlir::Operation *> indVarSet(indVarSlice.begin(),
- indVarSlice.end());
-
- llvm::DenseSet<mlir::Operation *> outerLoopBodySet;
- // The following walk collects ops inside `outerLoop` that are **not**:
- // * the outer-loop itself,
- // * or the inner-loop,
- // * or the `fir.result` op (the outer-loop's terminator).
- //
- // For the above example, this will also populate `outerLoopBodySet` with ops
- // in line 1 & 2 since we skip the `i` loop, the `j` loop, and the terminator.
- outerLoop.walk<mlir::WalkOrder::PreOrder>([&](mlir::Operation *op) {
- if (op == outerLoop)
- return mlir::WalkResult::advance();
-
- if (op == innerLoop)
- return mlir::WalkResult::skip();
-
- if (mlir::isa<fir::ResultOp>(op))
- return mlir::WalkResult::advance();
-
- outerLoopBodySet.insert(op);
- return mlir::WalkResult::advance();
- });
-
- // If `outerLoopBodySet` ends up having the same ops as `indVarSet`, then
- // `outerLoop` only contains ops that setup its induction variable +
- // `innerLoop` + the `fir.result` terminator. In other words, `innerLoop` is
- // perfectly nested inside `outerLoop`.
- bool result = (outerLoopBodySet == indVarSet);
- LLVM_DEBUG(DBGS() << "Loop pair starting at location " << outerLoop.getLoc()
- << " is" << (result ? "" : " not")
- << " perfectly nested\n");
-
- return result;
-}
-
-/// Starting with `currentLoop` collect a perfectly nested loop nest, if any.
-/// This function collects as much as possible loops in the nest; it case it
-/// fails to recognize a certain nested loop as part of the nest it just returns
-/// the parent loops it discovered before.
-mlir::LogicalResult collectLoopNest(fir::DoLoopOp currentLoop,
- LoopNestToIndVarMap &loopNest) {
- assert(currentLoop.getUnordered());
-
- while (true) {
- loopNest.insert({currentLoop, InductionVariableInfo(currentLoop)});
- llvm::SmallVector<fir::DoLoopOp> unorderedLoops;
-
- for (auto nestedLoop : currentLoop.getRegion().getOps<fir::DoLoopOp>())
- if (nestedLoop.getUnordered())
- unorderedLoops.push_back(nestedLoop);
-
- if (unorderedLoops.empty())
- break;
-
- // Having more than one unordered loop means that we are not dealing with a
- // perfect loop nest (i.e. a mulit-range `do concurrent` loop); which is the
- // case we are after here.
- if (unorderedLoops.size() > 1)
- return mlir::failure();
-
- fir::DoLoopOp nestedUnorderedLoop = unorderedLoops.front();
-
- if (!isPerfectlyNested(currentLoop, nestedUnorderedLoop))
- return mlir::failure();
-
- currentLoop = nestedUnorderedLoop;
- }
-
- return mlir::success();
-}
-
-/// Prepares the `fir.do_loop` nest to be easily mapped to OpenMP. In
-/// particular, this function would take this input IR:
-/// ```
-/// fir.do_loop %i_iv = %i_lb to %i_ub step %i_step unordered {
-/// fir.store %i_iv to %i#1 : !fir.ref<i32>
-/// %j_lb = arith.constant 1 : i32
-/// %j_ub = arith.constant 10 : i32
-/// %j_step = arith.constant 1 : index
-///
-/// fir.do_loop %j_iv = %j_lb to %j_ub step %j_step unordered {
-/// fir.store %j_iv to %j#1 : !fir.ref<i32>
-/// ...
-/// }
-/// }
-/// ```
-///
-/// into the following form (using generic op form since the result is
-/// technically an invalid `fir.do_loop` op:
-///
-/// ```
-/// "fir.do_loop"(%i_lb, %i_ub, %i_step) <{unordered}> ({
-/// ^bb0(%i_iv: index):
-/// %j_lb = "arith.constant"() <{value = 1 : i32}> : () -> i32
-/// %j_ub = "arith.constant"() <{value = 10 : i32}> : () -> i32
-/// %j_step = "arith.constant"() <{value = 1 : index}> : () -> index
-///
-/// "fir.do_loop"(%j_lb, %j_ub, %j_step) <{unordered}> ({
-/// ^bb0(%new_i_iv: index, %new_j_iv: index):
-/// "fir.store"(%new_i_iv, %i#1) : (i32, !fir.ref<i32>) -> ()
-/// "fir.store"(%new_j_iv, %j#1) : (i32, !fir.ref<i32>) -> ()
-/// ...
-/// })
-/// ```
-///
-/// What happened to the loop nest is the following:
-///
-/// * the innermost loop's entry block was updated from having one operand to
-/// having `n` operands where `n` is the number of loops in the nest,
-///
-/// * the outer loop(s)' ops that update the IVs were sank inside the innermost
-/// loop (see the `"fir.store"(%new_i_iv, %i#1)` op above),
-///
-/// * the innermost loop's entry block's arguments were mapped in order from the
-/// outermost to the innermost IV.
-///
-/// With this IR change, we can directly inline the innermost loop's region into
-/// the newly generated `omp.loop_nest` op.
-///
-/// Note that this function has a pre-condition that \p loopNest consists of
-/// perfectly nested loops; i.e. there are no in-between ops between 2 nested
-/// loops except for the ops to setup the inner loop's LB, UB, and step. These
-/// ops are handled/cloned by `genLoopNestClauseOps(..)`.
-void sinkLoopIVArgs(mlir::ConversionPatternRewriter &rewriter,
- looputils::LoopNestToIndVarMap &loopNest) {
- if (loopNest.size() <= 1)
- return;
-
- fir::DoLoopOp innermostLoop = loopNest.back().first;
- mlir::Operation &innermostFirstOp = innermostLoop.getRegion().front().front();
-
- llvm::SmallVector<mlir::Type> argTypes;
- llvm::SmallVector<mlir::Location> argLocs;
-
- for (auto &[doLoop, indVarInfo] : llvm::drop_end(loopNest)) {
- // Sink the IV update ops to the innermost loop. We need to do for all loops
- // except for the innermost one, hence the `drop_end` usage above.
- for (mlir::Operation *op : indVarInfo.indVarUpdateOps)
- op->moveBefore(&innermostFirstOp);
-
- argTypes.push_back(doLoop.getInductionVar().getType());
- argLocs.push_back(doLoop.getInductionVar().getLoc());
- }
-
- mlir::Region &innermmostRegion = innermostLoop.getRegion();
- // Extend the innermost entry block with arguments to represent the outer IVs.
- innermmostRegion.addArguments(argTypes, argLocs);
-
- unsigned idx = 1;
- // In reverse, remap the IVs of the loop nest from the old values to the new
- // ones. We do that in reverse since the first argument before this loop is
- // the old IV for the innermost loop. Therefore, we want to replace it first
- // before the old value (1st argument in the block) is remapped to be the IV
- // of the outermost loop in the nest.
- for (auto &[doLoop, _] : llvm::reverse(loopNest)) {
- doLoop.getInductionVar().replaceAllUsesWith(
- innermmostRegion.getArgument(innermmostRegion.getNumArguments() - idx));
- ++idx;
- }
-}
+using InductionVariableInfos = llvm::SmallVector<InductionVariableInfo>;
/// Collects values that are local to a loop: "loop-local values". A loop-local
/// value is one that is used exclusively inside the loop but allocated outside
@@ -326,9 +117,9 @@ void sinkLoopIVArgs(mlir::ConversionPatternRewriter &rewriter,
/// used exclusively inside.
///
/// \param [out] locals - the list of loop-local values detected for \p doLoop.
-void collectLoopLocalValues(fir::DoLoopOp doLoop,
+void collectLoopLocalValues(fir::DoConcurrentLoopOp loop,
llvm::SetVector<mlir::Value> &locals) {
- doLoop.walk([&](mlir::Operation *op) {
+ loop.walk([&](mlir::Operation *op) {
for (mlir::Value operand : op->getOperands()) {
if (locals.contains(operand))
continue;
@@ -340,11 +131,11 @@ void collectLoopLocalValues(fir::DoLoopOp doLoop,
// Values defined inside the loop are not interesting since they do not
// need to be localized.
- if (doLoop->isAncestor(operand.getDefiningOp()))
+ if (loop->isAncestor(operand.getDefiningOp()))
continue;
for (auto *user : operand.getUsers()) {
- if (!doLoop->isAncestor(user)) {
+ if (!loop->isAncestor(user)) {
isLocal = false;
break;
}
@@ -373,39 +164,42 @@ static void localizeLoopLocalValue(mlir::Value local, mlir::Region &allocRegion,
}
} // namespace looputils
-class DoConcurrentConversion : public mlir::OpConversionPattern<fir::DoLoopOp> {
+class DoConcurrentConversion
+ : public mlir::OpConversionPattern<fir::DoConcurrentOp> {
public:
- using mlir::OpConversionPattern<fir::DoLoopOp>::OpConversionPattern;
+ using mlir::OpConversionPattern<fir::DoConcurrentOp>::OpConversionPattern;
- DoConcurrentConversion(mlir::MLIRContext *context, bool mapToDevice,
- llvm::DenseSet<fir::DoLoopOp> &concurrentLoopsToSkip)
+ DoConcurrentConversion(
+ mlir::MLIRContext *context, bool mapToDevice,
+ llvm::DenseSet<fir::DoConcurrentOp> &concurrentLoopsToSkip)
: OpConversionPattern(context), mapToDevice(mapToDevice),
concurrentLoopsToSkip(concurrentLoopsToSkip) {}
mlir::LogicalResult
- matchAndRewrite(fir::DoLoopOp doLoop, OpAdaptor adaptor,
+ matchAndRewrite(fir::DoConcurrentOp doLoop, OpAdaptor adaptor,
mlir::ConversionPatternRewriter &rewriter) const override {
if (mapToDevice)
return doLoop.emitError(
"not yet implemented: Mapping `do concurrent` loops to device");
- looputils::LoopNestToIndVarMap loopNest;
- bool hasRemainingNestedLoops =
- failed(looputils::collectLoopNest(doLoop, loopNest));
- if (hasRemainingNestedLoops)
- mlir::emitWarning(doLoop.getLoc(),
- "Some `do concurent` loops are not perfectly-nested. "
- "These will be serialized.");
+ looputils::InductionVariableInfos ivInfos;
+ auto loop = mlir::cast<fir::DoConcurrentLoopOp>(
+ doLoop.getRegion().back().getTerminator());
+
+ auto indVars = loop.getLoopInductionVars();
+ assert(indVars.has_value());
+
+ for (mlir::Value indVar : *indVars)
+ ivInfos.emplace_back(loop, indVar);
llvm::SetVector<mlir::Value> locals;
- looputils::collectLoopLocalValues(loopNest.back().first, locals);
- looputils::sinkLoopIVArgs(rewriter, loopNest);
+ looputils::collectLoopLocalValues(loop, locals);
mlir::IRMapping mapper;
mlir::omp::ParallelOp parallelOp =
- genParallelOp(doLoop.getLoc(), rewriter, loopNest, mapper);
+ genParallelOp(doLoop.getLoc(), rewriter, ivInfos, mapper);
mlir::omp::LoopNestOperands loopNestClauseOps;
- genLoopNestClauseOps(doLoop.getLoc(), rewriter, loopNest, mapper,
+ genLoopNestClauseOps(doLoop.getLoc(), rewriter, loop, mapper,
loopNestClauseOps);
for (mlir::Value local : locals)
@@ -413,41 +207,56 @@ class DoConcurrentConversion : public mlir::OpConversionPattern<fir::DoLoopOp> {
rewriter);
mlir::omp::LoopNestOp ompLoopNest =
- genWsLoopOp(rewriter, loopNest.back().first, mapper, loopNestClauseOps,
+ genWsLoopOp(rewriter, loop, mapper, loopNestClauseOps,
/*isComposite=*/mapToDevice);
- rewriter.eraseOp(doLoop);
+ rewriter.setInsertionPoint(doLoop);
+ fir::FirOpBuilder builder(
+ rewriter,
+ fir::getKindMapping(doLoop->getParentOfType<mlir::ModuleOp>()));
+
+ // Collect iteration variable(s) allocations so that we can move them
+ // outside the `fir.do_concurrent` wrapper (before erasing it).
+ llvm::SmallVector<mlir::Operation *> opsToMove;
+ for (mlir::Operation &op : llvm::drop_end(doLoop))
+ opsToMove.push_back(&op);
+
+ mlir::Block *allocBlock = builder.getAllocaBlock();
+
+ for (mlir::Operation *op : llvm::reverse(opsToMove)) {
+ rewriter.moveOpBefore(op, allocBlock, allocBlock->begin());
+ }
// Mark `unordered` loops that are not perfectly nested to be skipped from
// the legality check of the `ConversionTarget` since we are not interested
// in mapping them to OpenMP.
- ompLoopNest->walk([&](fir::DoLoopOp doLoop) {
- if (doLoop.getUnordered()) {
- concurrentLoopsToSkip.insert(doLoop);
- }
+ ompLoopNest->walk([&](fir::DoConcurrentOp doLoop) {
+ concurrentLoopsToSkip.insert(doLoop);
});
+ rewriter.eraseOp(doLoop);
+
return mlir::success();
}
private:
- mlir::omp::ParallelOp genParallelOp(mlir::Location loc,
- mlir::ConversionPatternRewriter &rewriter,
- looputils::LoopNestToIndVarMap &loopNest,
- mlir::IRMapping &mapper) const {
+ mlir::omp::ParallelOp
+ genParallelOp(mlir::Location loc, mlir::ConversionPatternRewriter &rewriter,
+ looputils::InductionVariableInfos &ivInfos,
+ mlir::IRMapping &mapper) const {
auto parallelOp = rewriter.create<mlir::omp::ParallelOp>(loc);
rewriter.createBlock(&parallelOp.getRegion());
rewriter.setInsertionPoint(rewriter.create<mlir::omp::TerminatorOp>(loc));
- genLoopNestIndVarAllocs(rewriter, loopNest, mapper);
+ genLoopNestIndVarAllocs(rewriter, ivInfos, mapper);
return parallelOp;
}
void genLoopNestIndVarAllocs(mlir::ConversionPatternRewriter &rewriter,
- looputils::LoopNestToIndVarMap &loopNest,
+ looputils::InductionVariableInfos &ivInfos,
mlir::IRMapping &mapper) const {
- for (auto &[_, indVarInfo] : loopNest)
+ for (auto &indVarInfo : ivInfos)
genInductionVariableAlloc(rewriter, indVarInfo.iterVarMemDef, mapper);
}
@@ -471,10 +280,11 @@ class DoConcurrentConversion : public mlir::OpConversionPattern<fir::DoLoopOp> {
return result;
}
- void genLoopNestClauseOps(
- mlir::Location loc, mlir::ConversionPatternRewriter &rewriter,
- looputils::LoopNestToIndVarMap &loopNest, mlir::IRMapping &mapper,
- mlir::omp::LoopNestOperands &loopNestClauseOps) const {
+ void
+ genLoopNestClauseOps(mlir::Location loc,
+ mlir::ConversionPatternRewriter &rewriter,
+ fir::DoConcurrentLoopOp loop, mlir::IRMapping &mapper,
+ mlir::omp::LoopNestOperands &loopNestClauseOps) const {
assert(loopNestClauseOps.loopLowerBounds.empty() &&
"Loop nest bounds were already emitted!");
@@ -483,43 +293,42 @@ class DoConcurrentConversion : public mlir::OpConversionPattern<fir::DoLoopOp> {
bounds.push_back(var.getDefiningOp()->getResult(0));
};
- for (auto &[doLoop, _] : loopNest) {
- populateBounds(doLoop.getLowerBound(), loopNestClauseOps.loopLowerBounds);
- populateBounds(doLoop.getUpperBound(), loopNestClauseOps.loopUpperBounds);
- populateBounds(doLoop.getStep(), loopNestClauseOps.loopSteps);
+ for (auto [lb, ub, st] : llvm::zip_equal(
+ loop.getLowerBound(), loop.getUpperBound(), loop.getStep())) {
+ populateBounds(lb, loopNestClauseOps.loopLowerBounds);
+ populateBounds(ub, loopNestClauseOps.loopUpperBounds);
+ populateBounds(st, loopNestClauseOps.loopSteps);
}
loopNestClauseOps.loopInclusive = rewriter.getUnitAttr();
}
mlir::omp::LoopNestOp
- genWsLoopOp(mlir::ConversionPatternRewriter &rewriter, fir::DoLoopOp doLoop,
- mlir::IRMapping &mapper,
+ genWsLoopOp(mlir::ConversionPatternRewriter &rewriter,
+ fir::DoConcurrentLoopOp loop, mlir::IRMapping &mapper,
const mlir::omp::LoopNestOperands &clauseOps,
bool isComposite) const {
- auto wsloopOp = rewriter.create<mlir::omp::WsloopOp>(doLoop.getLoc());
+ auto wsloopOp = rewriter.create<mlir::omp::WsloopOp>(loop.getLoc());
wsloopOp.setComposite(isComposite);
rewriter.createBlock(&wsloopOp.getRegion());
auto loopNestOp =
- rewriter.create<mlir::omp::LoopNestOp>(doLoop.getLoc(), clauseOps);
+ rewriter.create<mlir::omp::LoopNestOp>(loop.getLoc(), clauseOps);
// Clone the loop's body inside the loop nest construct using the
// mapped values.
- rewriter.cloneRegionBefore(doLoop.getRegion(), loopNestOp.getRegion(),
+ rewriter.cloneRegionBefore(loop.getRegion(), loopNestOp.getRegion(),
loopNestOp.getRegion().begin(), mapper);
- mlir::Operation *terminator = loopNestOp.getRegion().back().getTerminator();
rewriter.setInsertionPointToEnd(&loopNestOp.getRegion().back());
- rewriter.create<mlir::omp::YieldOp>(terminator->getLoc());
- rewriter.eraseOp(terminator);
+ rewriter.create<mlir::omp::YieldOp>(loop->getLoc());
return loopNestOp;
}
bool mapToDevice;
- llvm::DenseSet<fir::DoLoopOp> &concurrentLoopsToSkip;
+ llvm::DenseSet<fir::DoConcurrentOp> &concurrentLoopsToSkip;
};
class DoConcurrentConversionPass
@@ -548,19 +357,16 @@ class DoConcurrentConversionPass
return;
}
- llvm::DenseSet<fir::DoLoopOp> concurrentLoopsToSkip;
+ llvm::DenseSet<fir::DoConcurrentOp> concurrentLoopsToSkip;
mlir::RewritePatternSet patterns(context);
patterns.insert<DoConcurrentConversion>(
context, mapTo == flangomp::DoConcurrentMappingKind::DCMK_Device,
concurrentLoopsToSkip);
mlir::ConversionTarget target(*context);
- target.addDynamicallyLegalOp<fir::DoLoopOp>([&](fir::DoLoopOp op) {
- // The goal is to handle constructs that eventually get lowered to
- // `fir.do_loop` with the `unordered` attribute (e.g. array expressions).
- // Currently, this is only enabled for the `do concurrent` construct since
- // the pass runs early in the pipeline.
- return !op.getUnordered() || concurrentLoopsToSkip.contains(op);
- });
+ target.addDynamicallyLegalOp<fir::DoConcurrentOp>(
+ [&](fir::DoConcurrentOp op) {
+ return concurrentLoopsToSkip.contains(op);
+ });
target.markUnknownOpDynamicallyLegal(
[](mlir::Operation *) { return true; });
diff --git a/flang/test/Transforms/DoConcurrent/basic_device.mlir b/flang/test/Transforms/DoConcurrent/basic_device.mlir
index d7fcc40e4a7f9..0ca48943864c8 100644
--- a/flang/test/Transforms/DoConcurrent/basic_device.mlir
+++ b/flang/test/Transforms/DoConcurrent/basic_device.mlir
@@ -1,8 +1,6 @@
// RUN: fir-opt --omp-do-concurrent-conversion="map-to=device" -verify-diagnostics %s
func.func @do_concurrent_basic() attributes {fir.bindc_name = "do_concurrent_basic"} {
- %0 = fir.alloca i32 {bindc_name = "i"}
- %1:2 = hlfir.declare %0 {uniq_name = "_QFEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
%2 = fir.address_of(@_QFEa) : !fir.ref<!fir.array<10xi32>>
%c10 = arith.constant 10 : index
%3 = fir.shape %c10 : (index) -> !fir.shape<1>
@@ -14,15 +12,19 @@ func.func @do_concurrent_basic() attributes {fir.bindc_name = "do_concurrent_bas
%c1 = arith.constant 1 : index
// expected-error @+2 {{not yet implemented: Mapping `do concurrent` loops to device}}
- // expected-error @below {{failed to legalize operation 'fir.do_loop'}}
- fir.do_loop %arg0 = %7 to %8 step %c1 unordered {
- %13 = fir.convert %arg0 : (index) -> i32
- fir.store %13 to %1#1 : !fir.ref<i32>
- %14 = fir.load %1#0 : !fir.ref<i32>
- %15 = fir.load %1#0 : !fir.ref<i32>
- %16 = fir.convert %15 : (i32) -> i64
- %17 = hlfir.designate %4#0 (%16) : (!fir.ref<!fir.array<10xi32>>, i64) -> !fir.ref<i32>
- hlfir.assign %14 to %17 : i32, !fir.ref<i32>
+ // expected-error @below {{failed to legalize operation 'fir.do_concurrent'}}
+ fir.do_concurrent {
+ %0 = fir.alloca i32 {bindc_name = "i"}
+ %1:2 = hlfir.declare %0 {uniq_name = "_QFEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+ fir.do_concurrent.loop (%arg0) = (%7) to (%8) step (%c1) {
+ %13 = fir.convert %arg0 : (index) -> i32
+ fir.store %13 to %1#1 : !fir.ref<i32>
+ %14 = fir.load %1#0 : !fir.ref<i32>
+ %15 = fir.load %1#0 : !fir.ref<i32>
+ %16 = fir.convert %15 : (i32) -> i64
+ %17 = hlfir.designate %4#0 (%16) : (!fir.ref<!fir.array<10xi32>>, i64) -> !fir.ref<i32>
+ hlfir.assign %14 to %17 : i32, !fir.ref<i32>
+ }
}
return
diff --git a/flang/test/Transforms/DoConcurrent/basic_host.f90 b/flang/test/Transforms/DoConcurrent/basic_host.f90
index b84d4481ac766..12f63031cbaee 100644
--- a/flang/test/Transforms/DoConcurrent/basic_host.f90
+++ b/flang/test/Transforms/DoConcurrent/basic_host.f90
@@ -1,6 +1,3 @@
-! Fails until we update the pass to use the `fir.do_concurrent` op.
-! XFAIL: *
-
! Tests mapping of a basic `do concurrent` loop to `!$omp parallel do`.
! RUN: %flang_fc1 -emit-hlfir -fopenmp -fdo-concurrent-to-openmp=host %s -o - \
diff --git a/flang/test/Transforms/DoConcurrent/basic_host.mlir b/flang/test/Transforms/DoConcurrent/basic_host.mlir
index b53ecd687c039..5425829404d7b 100644
--- a/flang/test/Transforms/DoConcurrent/basic_host.mlir
+++ b/flang/test/Transforms/DoConcurrent/basic_host.mlir
@@ -6,8 +6,6 @@
func.func @do_concurrent_basic() attributes {fir.bindc_name = "do_concurrent_basic"} {
// CHECK: %[[ARR:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {uniq_name = "_QFEa"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
- %0 = fir.alloca i32 {bindc_name = "i"}
- %1:2 = hlfir.declare %0 {uniq_name = "_QFEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
%2 = fir.address_of(@_QFEa) : !fir.ref<!fir.array<10xi32>>
%c10 = arith.constant 10 : index
%3 = fir.shape %c10 : (index) -> !fir.shape<1>
@@ -18,7 +16,7 @@ func.func @do_concurrent_basic() attributes {fir.bindc_name = "do_concurrent_bas
%8 = fir.convert %c10_i32 : (i32) -> index
%c1 = arith.constant 1 : index
- // CHECK-NOT: fir.do_loop
+ // CHECK-NOT: fir.do_concurrent
// CHECK: %[[C1:.*]] = arith.constant 1 : i32
// CHECK: %[[LB:.*]] = fir.convert %[[C1]] : (i32) -> index
@@ -46,17 +44,21 @@ func.func @do_concurrent_basic() attributes {fir.bindc_name = "do_concurrent_bas
// CHECK-NEXT: omp.terminator
// CHECK-NEXT: }
- fir.do_loop %arg0 = %7 to %8 step %c1 unordered {
- %13 = fir.convert %arg0 : (index) -> i32
- fir.store %13 to %1#1 : !fir.ref<i32>
- %14 = fir.load %1#0 : !fir.ref<i32>
- %15 = fir.load %1#0 : !fir.ref<i32>
- %16 = fir.convert %15 : (i32) -> i64
- %17 = hlfir.designate %4#0 (%16) : (!fir.ref<!fir.array<10xi32>>, i64) -> !fir.ref<i32>
- hlfir.assign %14 to %17 : i32, !fir.ref<i32>
+ fir.do_concurrent {
+ %0 = fir.alloca i32 {bindc_name = "i"}
+ %1:2 = hlfir.declare %0 {uniq_name = "_QFEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+ fir.do_concurrent.loop (%arg0) = (%7) to (%8) step (%c1) {
+ %13 = fir.convert %arg0 : (index) -> i32
+ fir.store %13 to %1#1 : !fir.ref<i32>
+ %14 = fir.load %1#0 : !fir.ref<i32>
+ %15 = fir.load %1#0 : !fir.ref<i32>
+ %16 = fir.convert %15 : (i32) -> i64
+ %17 = hlfir.designate %4#0 (%16) : (!fir.ref<!fir.array<10xi32>>, i64) -> !fir.ref<i32>
+ hlfir.assign %14 to %17 : i32, !fir.ref<i32>
+ }
}
- // CHECK-NOT: fir.do_loop
+ // CHECK-NOT: fir.do_concurrent
return
}
diff --git a/flang/test/Transforms/DoConcurrent/locally_destroyed_temp.f90 b/flang/test/Transforms/DoConcurrent/locally_destroyed_temp.f90
index 4e13c0919589a..f82696669eca6 100644
--- a/flang/test/Transforms/DoConcurrent/locally_destroyed_temp.f90
+++ b/flang/test/Transforms/DoConcurrent/locally_destroyed_temp.f90
@@ -1,6 +1,3 @@
-! Fails until we update the pass to use the `fir.do_concurrent` op.
-! XFAIL: *
-
! Tests that "loop-local values" are properly handled by localizing them to the
! body of the loop nest. See `collectLoopLocalValues` and `localizeLoopLocalValue`
! for a definition of "loop-local values" and how they are handled.
diff --git a/flang/test/Transforms/DoConcurrent/loop_nest_test.f90 b/flang/test/Transforms/DoConcurrent/loop_nest_test.f90
deleted file mode 100644
index adc4a488d1ec9..0000000000000
--- a/flang/test/Transforms/DoConcurrent/loop_nest_test.f90
+++ /dev/null
@@ -1,92 +0,0 @@
-! Fails until we update the pass to use the `fir.do_concurrent` op.
-! XFAIL: *
-
-! Tests loop-nest detection algorithm for do-concurrent mapping.
-
-! REQUIRES: asserts
-
-! RUN: %flang_fc1 -emit-hlfir -fopenmp -fdo-concurrent-to-openmp=host \
-! RUN: -mmlir -debug -mmlir -mlir-disable-threading %s -o - 2> %t.log || true
-
-! RUN: FileCheck %s < %t.log
-
-program main
- implicit none
-
-contains
-
-subroutine foo(n)
- implicit none
- integer :: n, m
- integer :: i, j, k
- integer :: x
- integer, dimension(n) :: a
- integer, dimension(n, n, n) :: b
-
- ! CHECK: Loop pair starting at location
- ! CHECK: loc("{{.*}}":[[# @LINE + 1]]:{{.*}}) is perfectly nested
- do concurrent(i=1:n, j=1:bar(n*m, n/m))
- a(i) = n
- end do
-
- ! CHECK: Loop pair starting at location
- ! CHECK: loc("{{.*}}":[[# @LINE + 1]]:{{.*}}) is perfectly nested
- do concurrent(i=bar(n, x):n, j=1:bar(n*m, n/m))
- a(i) = n
- end do
-
- ! CHECK: Loop pair starting at location
- ! CHECK: loc("{{.*}}":[[# @LINE + 1]]:{{.*}}) is not perfectly nested
- do concurrent(i=bar(n, x):n)
- do concurrent(j=1:bar(n*m, n/m))
- a(i) = n
- end do
- end do
-
- ! CHECK: Loop pair starting at location
- ! CHECK: loc("{{.*}}":[[# @LINE + 1]]:{{.*}}) is not perfectly nested
- do concurrent(i=1:n)
- x = 10
- do concurrent(j=1:m)
- b(i,j,k) = i * j + k
- end do
- end do
-
- ! CHECK: Loop pair starting at location
- ! CHECK: loc("{{.*}}":[[# @LINE + 1]]:{{.*}}) is not perfectly nested
- do concurrent(i=1:n)
- do concurrent(j=1:m)
- b(i,j,k) = i * j + k
- end do
- x = 10
- end do
-
- ! CHECK: Loop pair starting at location
- ! CHECK: loc("{{.*}}":[[# @LINE + 1]]:{{.*}}) is not perfectly nested
- do concurrent(i=1:n)
- do concurrent(j=1:m)
- b(i,j,k) = i * j + k
- x = 10
- end do
- end do
-
- ! Verify the (i,j) and (j,k) pairs of loops are detected as perfectly nested.
- !
- ! CHECK: Loop pair starting at location
- ! CHECK: loc("{{.*}}":[[# @LINE + 3]]:{{.*}}) is perfectly nested
- ! CHECK: Loop pair starting at location
- ! CHECK: loc("{{.*}}":[[# @LINE + 1]]:{{.*}}) is perfectly nested
- do concurrent(i=bar(n, x):n, j=1:bar(n*m, n/m), k=1:bar(n*m, bar(n*m, n/m)))
- a(i) = n
- end do
-end subroutine
-
-pure function bar(n, m)
- implicit none
- integer, intent(in) :: n, m
- integer :: bar
-
- bar = n + m
-end function
-
-end program main
diff --git a/flang/test/Transforms/DoConcurrent/multiple_iteration_ranges.f90 b/flang/test/Transforms/DoConcurrent/multiple_iteration_ranges.f90
index 26800678d381c..d0210726de83e 100644
--- a/flang/test/Transforms/DoConcurrent/multiple_iteration_ranges.f90
+++ b/flang/test/Transforms/DoConcurrent/multiple_iteration_ranges.f90
@@ -1,6 +1,3 @@
-! Fails until we update the pass to use the `fir.do_concurrent` op.
-! XFAIL: *
-
! Tests mapping of a `do concurrent` loop with multiple iteration ranges.
! RUN: split-file %s %t
diff --git a/flang/test/Transforms/DoConcurrent/non_const_bounds.f90 b/flang/test/Transforms/DoConcurrent/non_const_bounds.f90
index 23a3aae976c07..cd1bd4f98a3f5 100644
--- a/flang/test/Transforms/DoConcurrent/non_const_bounds.f90
+++ b/flang/test/Transforms/DoConcurrent/non_const_bounds.f90
@@ -1,6 +1,3 @@
-! Fails until we update the pass to use the `fir.do_concurrent` op.
-! XFAIL: *
-
! RUN: %flang_fc1 -emit-hlfir -fopenmp -fdo-concurrent-to-openmp=host %s -o - \
! RUN: | FileCheck %s
diff --git a/flang/test/Transforms/DoConcurrent/not_perfectly_nested.f90 b/flang/test/Transforms/DoConcurrent/not_perfectly_nested.f90
index d1c02101318ab..74799359e0476 100644
--- a/flang/test/Transforms/DoConcurrent/not_perfectly_nested.f90
+++ b/flang/test/Transforms/DoConcurrent/not_perfectly_nested.f90
@@ -1,6 +1,3 @@
-! Fails until we update the pass to use the `fir.do_concurrent` op.
-! XFAIL: *
-
! Tests that if `do concurrent` is not perfectly nested in its parent loop, that
! we skip converting the not-perfectly nested `do concurrent` loop.
@@ -22,23 +19,24 @@ program main
end do
end
-! CHECK: %[[ORIG_K_ALLOC:.*]] = fir.alloca i32 {bindc_name = "k"}
-! CHECK: %[[ORIG_K_DECL:.*]]:2 = hlfir.declare %[[ORIG_K_ALLOC]]
-
-! CHECK: %[[ORIG_J_ALLOC:.*]] = fir.alloca i32 {bindc_name = "j"}
-! CHECK: %[[ORIG_J_DECL:.*]]:2 = hlfir.declare %[[ORIG_J_ALLOC]]
-
! CHECK: omp.parallel {
! CHECK: omp.wsloop {
! CHECK: omp.loop_nest ({{[^[:space:]]+}}) {{.*}} {
-! CHECK: fir.do_loop %[[J_IV:.*]] = {{.*}} {
-! CHECK: %[[J_IV_CONV:.*]] = fir.convert %[[J_IV]] : (index) -> i32
+! CHECK: fir.do_concurrent {
+
+! CHECK: %[[ORIG_J_ALLOC:.*]] = fir.alloca i32 {bindc_name = "j"}
+! CHECK: %[[ORIG_J_DECL:.*]]:2 = hlfir.declare %[[ORIG_J_ALLOC]]
+
+! CHECK: %[[ORIG_K_ALLOC:.*]] = fir.alloca i32 {bindc_name = "k"}
+! CHECK: %[[ORIG_K_DECL:.*]]:2 = hlfir.declare %[[ORIG_K_ALLOC]]
+
+! CHECK: fir.do_concurrent.loop (%[[J_IV:.*]], %[[K_IV:.*]]) = {{.*}} {
+! CHECK: %[[J_IV_CONV:.*]] = fir.convert %[[J_IV]] : (index) -> i32
! CHECK: fir.store %[[J_IV_CONV]] to %[[ORIG_J_DECL]]#0
-! CHECK: fir.do_loop %[[K_IV:.*]] = {{.*}} {
! CHECK: %[[K_IV_CONV:.*]] = fir.convert %[[K_IV]] : (index) -> i32
-! CHECK: fir.store %[[K_IV_CONV]] to %[[ORIG_K_DECL]]#0
+! CHECK: fir.store %[[K_IV_CONV]] to %[[ORIG_K_DECL]]#0
! CHECK: }
! CHECK: }
! CHECK: omp.yield