[flang-commits] [flang] [flang][OpenMP] Map simple `do concurrent` loops to OpenMP host constructs (PR #127633)
Kareem Ergawy via flang-commits
flang-commits at lists.llvm.org
Wed Apr 2 01:14:25 PDT 2025
https://github.com/ergawy updated https://github.com/llvm/llvm-project/pull/127633
>From 20619fdbef2d6b1a607059d8e9d2017eea9e6ad6 Mon Sep 17 00:00:00 2001
From: ergawy <kareem.ergawy at amd.com>
Date: Tue, 18 Feb 2025 02:50:46 -0600
Subject: [PATCH 1/3] [flang][OpenMP] Map simple `do concurrent` loops to
OpenMP host constructs
Upstreams one more part of the ROCm `do concurrent` to OpenMP mapping
pass. This PR add support for converting simple loops to the equivalent
OpenMP constructs on the host: `omp parallel do`. Towards that end, we
have to collect more information about loop nests for which we add new
utils in the `looputils` name space.
---
flang/docs/DoConcurrentConversionToOpenMP.md | 47 ++++
.../OpenMP/DoConcurrentConversion.cpp | 211 +++++++++++++++++-
.../Transforms/DoConcurrent/basic_host.f90 | 14 +-
.../Transforms/DoConcurrent/basic_host.mlir | 62 +++++
.../DoConcurrent/non_const_bounds.f90 | 45 ++++
.../DoConcurrent/not_perfectly_nested.f90 | 45 ++++
6 files changed, 405 insertions(+), 19 deletions(-)
create mode 100644 flang/test/Transforms/DoConcurrent/basic_host.mlir
create mode 100644 flang/test/Transforms/DoConcurrent/non_const_bounds.f90
create mode 100644 flang/test/Transforms/DoConcurrent/not_perfectly_nested.f90
diff --git a/flang/docs/DoConcurrentConversionToOpenMP.md b/flang/docs/DoConcurrentConversionToOpenMP.md
index 7b49af742f242..19611615ee9d6 100644
--- a/flang/docs/DoConcurrentConversionToOpenMP.md
+++ b/flang/docs/DoConcurrentConversionToOpenMP.md
@@ -126,6 +126,53 @@ see the "Data environment" section below.
See `flang/test/Transforms/DoConcurrent/loop_nest_test.f90` for more examples
of what is and is not detected as a perfect loop nest.
+### Single-range loops
+
+Given the following loop:
+```fortran
+ do concurrent(i=1:n)
+ a(i) = i * i
+ end do
+```
+
+#### Mapping to `host`
+
+Mapping this loop to the `host`, generates MLIR operations of the following
+structure:
+
+```
+%4 = fir.address_of(@_QFEa) ...
+%6:2 = hlfir.declare %4 ...
+
+omp.parallel {
+ // Allocate private copy for `i`.
+ // TODO Use delayed privatization.
+ %19 = fir.alloca i32 {bindc_name = "i"}
+ %20:2 = hlfir.declare %19 {uniq_name = "_QFEi"} ...
+
+ omp.wsloop {
+ omp.loop_nest (%arg0) : index = (%21) to (%22) inclusive step (%c1_2) {
+ %23 = fir.convert %arg0 : (index) -> i32
+ // Use the privatized version of `i`.
+ fir.store %23 to %20#1 : !fir.ref<i32>
+ ...
+
+ // Use "shared" SSA value of `a`.
+ %42 = hlfir.designate %6#0
+ hlfir.assign %35 to %42
+ ...
+ omp.yield
+ }
+ omp.terminator
+ }
+ omp.terminator
+}
+```
+
+#### Mapping to `device`
+
+<!-- TODO -->
+
<!--
More details about current status will be added along with relevant parts of the
implementation in later upstreaming patches.
diff --git a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
index ad88b42ac6d7a..4b9024e148621 100644
--- a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
+++ b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
@@ -11,6 +11,7 @@
#include "flang/Optimizer/OpenMP/Utils.h"
#include "mlir/Analysis/SliceAnalysis.h"
#include "mlir/Dialect/OpenMP/OpenMPDialect.h"
+#include "mlir/IR/IRMapping.h"
#include "mlir/Transforms/DialectConversion.h"
#include "mlir/Transforms/RegionUtils.h"
@@ -24,7 +25,82 @@ namespace flangomp {
namespace {
namespace looputils {
-using LoopNest = llvm::SetVector<fir::DoLoopOp>;
+/// Stores info needed about the induction/iteration variable for each `do
+/// concurrent` in a loop nest.
+struct InductionVariableInfo {
+ /// the operation allocating memory for iteration variable,
+ mlir::Operation *iterVarMemDef;
+};
+
+using LoopNestToIndVarMap =
+ llvm::MapVector<fir::DoLoopOp, InductionVariableInfo>;
+
+/// Given an operation `op`, this returns true if one of `op`'s operands is
+/// "ultimately" the loop's induction variable. This helps in cases where the
+/// induction variable's use is "hidden" behind a convert/cast.
+///
+/// For example, give the following loop:
+/// ```
+/// fir.do_loop %ind_var = %lb to %ub step %s unordered {
+/// %ind_var_conv = fir.convert %ind_var : (index) -> i32
+/// fir.store %ind_var_conv to %i#1 : !fir.ref<i32>
+/// ...
+/// }
+/// ```
+///
+/// If \p op is the `fir.store` operation, then this function will return true
+/// since the IV is the "ultimate" operand to the `fir.store` op through the
+/// `%ind_var_conv` -> `%ind_var` conversion sequence.
+///
+/// For why this is useful, see its use in `findLoopIndVarMemDecl`.
+bool isIndVarUltimateOperand(mlir::Operation *op, fir::DoLoopOp doLoop) {
+ while (op != nullptr && op->getNumOperands() > 0) {
+ auto ivIt = llvm::find_if(op->getOperands(), [&](mlir::Value operand) {
+ return operand == doLoop.getInductionVar();
+ });
+
+ if (ivIt != op->getOperands().end())
+ return true;
+
+ op = op->getOperand(0).getDefiningOp();
+ }
+
+ return false;
+}
+
+/// For the \p doLoop parameter, find the operation that declares its iteration
+/// variable or allocates memory for it.
+///
+/// For example, give the following loop:
+/// ```
+/// ...
+/// %i:2 = hlfir.declare %0 {uniq_name = "_QFEi"} : ...
+/// ...
+/// fir.do_loop %ind_var = %lb to %ub step %s unordered {
+/// %ind_var_conv = fir.convert %ind_var : (index) -> i32
+/// fir.store %ind_var_conv to %i#1 : !fir.ref<i32>
+/// ...
+/// }
+/// ```
+///
+/// This function returns the `hlfir.declare` op for `%i`.
+mlir::Operation *findLoopIterationVarMemDecl(fir::DoLoopOp doLoop) {
+ mlir::Value result = nullptr;
+ mlir::visitUsedValuesDefinedAbove(
+ doLoop.getRegion(), [&](mlir::OpOperand *operand) {
+ if (result)
+ return;
+
+ if (isIndVarUltimateOperand(operand->getOwner(), doLoop)) {
+ assert(result == nullptr &&
+ "loop can have only one induction variable");
+ result = operand->get();
+ }
+ });
+
+ assert(result != nullptr && result.getDefiningOp() != nullptr);
+ return result.getDefiningOp();
+}
/// Loop \p innerLoop is considered perfectly-nested inside \p outerLoop iff
/// there are no operations in \p outerloop's body other than:
@@ -116,11 +192,14 @@ bool isPerfectlyNested(fir::DoLoopOp outerLoop, fir::DoLoopOp innerLoop) {
/// fails to recognize a certain nested loop as part of the nest it just returns
/// the parent loops it discovered before.
mlir::LogicalResult collectLoopNest(fir::DoLoopOp currentLoop,
- LoopNest &loopNest) {
+ LoopNestToIndVarMap &loopNest) {
assert(currentLoop.getUnordered());
while (true) {
- loopNest.insert(currentLoop);
+ loopNest.insert(
+ {currentLoop,
+ InductionVariableInfo{findLoopIterationVarMemDecl(currentLoop)}});
+
llvm::SmallVector<fir::DoLoopOp> unorderedLoops;
for (auto nestedLoop : currentLoop.getRegion().getOps<fir::DoLoopOp>())
@@ -152,13 +231,15 @@ class DoConcurrentConversion : public mlir::OpConversionPattern<fir::DoLoopOp> {
public:
using mlir::OpConversionPattern<fir::DoLoopOp>::OpConversionPattern;
- DoConcurrentConversion(mlir::MLIRContext *context, bool mapToDevice)
- : OpConversionPattern(context), mapToDevice(mapToDevice) {}
+ DoConcurrentConversion(mlir::MLIRContext *context, bool mapToDevice,
+ llvm::DenseSet<fir::DoLoopOp> &concurrentLoopsToSkip)
+ : OpConversionPattern(context), mapToDevice(mapToDevice),
+ concurrentLoopsToSkip(concurrentLoopsToSkip) {}
mlir::LogicalResult
matchAndRewrite(fir::DoLoopOp doLoop, OpAdaptor adaptor,
mlir::ConversionPatternRewriter &rewriter) const override {
- looputils::LoopNest loopNest;
+ looputils::LoopNestToIndVarMap loopNest;
bool hasRemainingNestedLoops =
failed(looputils::collectLoopNest(doLoop, loopNest));
if (hasRemainingNestedLoops)
@@ -166,12 +247,120 @@ class DoConcurrentConversion : public mlir::OpConversionPattern<fir::DoLoopOp> {
"Some `do concurent` loops are not perfectly-nested. "
"These will be serialized.");
- // TODO This will be filled in with the next PRs that upstreams the rest of
- // the ROCm implementaion.
+ mlir::IRMapping mapper;
+ genParallelOp(doLoop.getLoc(), rewriter, loopNest, mapper);
+ mlir::omp::LoopNestOperands loopNestClauseOps;
+ genLoopNestClauseOps(doLoop.getLoc(), rewriter, loopNest, mapper,
+ loopNestClauseOps);
+
+ mlir::omp::LoopNestOp ompLoopNest =
+ genWsLoopOp(rewriter, loopNest.back().first, mapper, loopNestClauseOps,
+ /*isComposite=*/mapToDevice);
+
+ rewriter.eraseOp(doLoop);
+
+ // Mark `unordered` loops that are not perfectly nested to be skipped from
+ // the legality check of the `ConversionTarget` since we are not interested
+ // in mapping them to OpenMP.
+ ompLoopNest->walk([&](fir::DoLoopOp doLoop) {
+ if (doLoop.getUnordered()) {
+ concurrentLoopsToSkip.insert(doLoop);
+ }
+ });
+
return mlir::success();
}
+private:
+ mlir::omp::ParallelOp genParallelOp(mlir::Location loc,
+ mlir::ConversionPatternRewriter &rewriter,
+ looputils::LoopNestToIndVarMap &loopNest,
+ mlir::IRMapping &mapper) const {
+ auto parallelOp = rewriter.create<mlir::omp::ParallelOp>(loc);
+ rewriter.createBlock(¶llelOp.getRegion());
+ rewriter.setInsertionPoint(rewriter.create<mlir::omp::TerminatorOp>(loc));
+
+ genLoopNestIndVarAllocs(rewriter, loopNest, mapper);
+ return parallelOp;
+ }
+
+ void genLoopNestIndVarAllocs(mlir::ConversionPatternRewriter &rewriter,
+ looputils::LoopNestToIndVarMap &loopNest,
+ mlir::IRMapping &mapper) const {
+
+ for (auto &[_, indVarInfo] : loopNest)
+ genInductionVariableAlloc(rewriter, indVarInfo.iterVarMemDef, mapper);
+ }
+
+ mlir::Operation *
+ genInductionVariableAlloc(mlir::ConversionPatternRewriter &rewriter,
+ mlir::Operation *indVarMemDef,
+ mlir::IRMapping &mapper) const {
+ assert(
+ indVarMemDef != nullptr &&
+ "Induction variable memdef is expected to have a defining operation.");
+
+ llvm::SmallSetVector<mlir::Operation *, 2> indVarDeclareAndAlloc;
+ for (auto operand : indVarMemDef->getOperands())
+ indVarDeclareAndAlloc.insert(operand.getDefiningOp());
+ indVarDeclareAndAlloc.insert(indVarMemDef);
+
+ mlir::Operation *result;
+ for (mlir::Operation *opToClone : indVarDeclareAndAlloc)
+ result = rewriter.clone(*opToClone, mapper);
+
+ return result;
+ }
+
+ void genLoopNestClauseOps(
+ mlir::Location loc, mlir::ConversionPatternRewriter &rewriter,
+ looputils::LoopNestToIndVarMap &loopNest, mlir::IRMapping &mapper,
+ mlir::omp::LoopNestOperands &loopNestClauseOps) const {
+ assert(loopNestClauseOps.loopLowerBounds.empty() &&
+ "Loop nest bounds were already emitted!");
+
+ auto populateBounds = [&](mlir::Value var,
+ llvm::SmallVectorImpl<mlir::Value> &bounds) {
+ bounds.push_back(var.getDefiningOp()->getResult(0));
+ };
+
+ for (auto &[doLoop, _] : loopNest) {
+ populateBounds(doLoop.getLowerBound(), loopNestClauseOps.loopLowerBounds);
+ populateBounds(doLoop.getUpperBound(), loopNestClauseOps.loopUpperBounds);
+ populateBounds(doLoop.getStep(), loopNestClauseOps.loopSteps);
+ }
+
+ loopNestClauseOps.loopInclusive = rewriter.getUnitAttr();
+ }
+
+ mlir::omp::LoopNestOp
+ genWsLoopOp(mlir::ConversionPatternRewriter &rewriter, fir::DoLoopOp doLoop,
+ mlir::IRMapping &mapper,
+ const mlir::omp::LoopNestOperands &clauseOps,
+ bool isComposite) const {
+
+ auto wsloopOp = rewriter.create<mlir::omp::WsloopOp>(doLoop.getLoc());
+ wsloopOp.setComposite(isComposite);
+ rewriter.createBlock(&wsloopOp.getRegion());
+
+ auto loopNestOp =
+ rewriter.create<mlir::omp::LoopNestOp>(doLoop.getLoc(), clauseOps);
+
+ // Clone the loop's body inside the loop nest construct using the
+ // mapped values.
+ rewriter.cloneRegionBefore(doLoop.getRegion(), loopNestOp.getRegion(),
+ loopNestOp.getRegion().begin(), mapper);
+
+ mlir::Operation *terminator = loopNestOp.getRegion().back().getTerminator();
+ rewriter.setInsertionPointToEnd(&loopNestOp.getRegion().back());
+ rewriter.create<mlir::omp::YieldOp>(terminator->getLoc());
+ rewriter.eraseOp(terminator);
+
+ return loopNestOp;
+ }
+
bool mapToDevice;
+ llvm::DenseSet<fir::DoLoopOp> &concurrentLoopsToSkip;
};
class DoConcurrentConversionPass
@@ -200,16 +389,18 @@ class DoConcurrentConversionPass
return;
}
+ llvm::DenseSet<fir::DoLoopOp> concurrentLoopsToSkip;
mlir::RewritePatternSet patterns(context);
patterns.insert<DoConcurrentConversion>(
- context, mapTo == flangomp::DoConcurrentMappingKind::DCMK_Device);
+ context, mapTo == flangomp::DoConcurrentMappingKind::DCMK_Device,
+ concurrentLoopsToSkip);
mlir::ConversionTarget target(*context);
target.addDynamicallyLegalOp<fir::DoLoopOp>([&](fir::DoLoopOp op) {
// The goal is to handle constructs that eventually get lowered to
// `fir.do_loop` with the `unordered` attribute (e.g. array expressions).
// Currently, this is only enabled for the `do concurrent` construct since
// the pass runs early in the pipeline.
- return !op.getUnordered();
+ return !op.getUnordered() || concurrentLoopsToSkip.contains(op);
});
target.markUnknownOpDynamicallyLegal(
[](mlir::Operation *) { return true; });
diff --git a/flang/test/Transforms/DoConcurrent/basic_host.f90 b/flang/test/Transforms/DoConcurrent/basic_host.f90
index b569668ab0f0e..b80bc2f2dd038 100644
--- a/flang/test/Transforms/DoConcurrent/basic_host.f90
+++ b/flang/test/Transforms/DoConcurrent/basic_host.f90
@@ -1,7 +1,3 @@
-! Mark as xfail for now until we upstream the relevant part. This is just for
-! demo purposes at this point. Upstreaming this is the next step.
-! XFAIL: *
-
! Tests mapping of a basic `do concurrent` loop to `!$omp parallel do`.
! RUN: %flang_fc1 -emit-hlfir -fopenmp -fdo-concurrent-to-openmp=host %s -o - \
@@ -19,17 +15,17 @@ program do_concurrent_basic
! CHECK-NOT: fir.do_loop
- ! CHECK: omp.parallel {
-
- ! CHECK-NEXT: %[[ITER_VAR:.*]] = fir.alloca i32 {bindc_name = "i"}
- ! CHECK-NEXT: %[[BINDING:.*]]:2 = hlfir.declare %[[ITER_VAR]] {uniq_name = "_QFEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-
! CHECK: %[[C1:.*]] = arith.constant 1 : i32
! CHECK: %[[LB:.*]] = fir.convert %[[C1]] : (i32) -> index
! CHECK: %[[C10:.*]] = arith.constant 10 : i32
! CHECK: %[[UB:.*]] = fir.convert %[[C10]] : (i32) -> index
! CHECK: %[[STEP:.*]] = arith.constant 1 : index
+ ! CHECK: omp.parallel {
+
+ ! CHECK-NEXT: %[[ITER_VAR:.*]] = fir.alloca i32 {bindc_name = "i"}
+ ! CHECK-NEXT: %[[BINDING:.*]]:2 = hlfir.declare %[[ITER_VAR]] {uniq_name = "_QFEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+
! CHECK: omp.wsloop {
! CHECK-NEXT: omp.loop_nest (%[[ARG0:.*]]) : index = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) {
! CHECK-NEXT: %[[IV_IDX:.*]] = fir.convert %[[ARG0]] : (index) -> i32
diff --git a/flang/test/Transforms/DoConcurrent/basic_host.mlir b/flang/test/Transforms/DoConcurrent/basic_host.mlir
new file mode 100644
index 0000000000000..b53ecd687c039
--- /dev/null
+++ b/flang/test/Transforms/DoConcurrent/basic_host.mlir
@@ -0,0 +1,62 @@
+// Tests mapping of a basic `do concurrent` loop to `!$omp parallel do`.
+
+// RUN: fir-opt --omp-do-concurrent-conversion="map-to=host" %s | FileCheck %s
+
+// CHECK-LABEL: func.func @do_concurrent_basic
+func.func @do_concurrent_basic() attributes {fir.bindc_name = "do_concurrent_basic"} {
+ // CHECK: %[[ARR:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {uniq_name = "_QFEa"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+
+ %0 = fir.alloca i32 {bindc_name = "i"}
+ %1:2 = hlfir.declare %0 {uniq_name = "_QFEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+ %2 = fir.address_of(@_QFEa) : !fir.ref<!fir.array<10xi32>>
+ %c10 = arith.constant 10 : index
+ %3 = fir.shape %c10 : (index) -> !fir.shape<1>
+ %4:2 = hlfir.declare %2(%3) {uniq_name = "_QFEa"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+ %c1_i32 = arith.constant 1 : i32
+ %7 = fir.convert %c1_i32 : (i32) -> index
+ %c10_i32 = arith.constant 10 : i32
+ %8 = fir.convert %c10_i32 : (i32) -> index
+ %c1 = arith.constant 1 : index
+
+ // CHECK-NOT: fir.do_loop
+
+ // CHECK: %[[C1:.*]] = arith.constant 1 : i32
+ // CHECK: %[[LB:.*]] = fir.convert %[[C1]] : (i32) -> index
+ // CHECK: %[[C10:.*]] = arith.constant 10 : i32
+ // CHECK: %[[UB:.*]] = fir.convert %[[C10]] : (i32) -> index
+ // CHECK: %[[STEP:.*]] = arith.constant 1 : index
+
+ // CHECK: omp.parallel {
+
+ // CHECK-NEXT: %[[ITER_VAR:.*]] = fir.alloca i32 {bindc_name = "i"}
+ // CHECK-NEXT: %[[BINDING:.*]]:2 = hlfir.declare %[[ITER_VAR]] {uniq_name = "_QFEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+
+ // CHECK: omp.wsloop {
+ // CHECK-NEXT: omp.loop_nest (%[[ARG0:.*]]) : index = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) {
+ // CHECK-NEXT: %[[IV_IDX:.*]] = fir.convert %[[ARG0]] : (index) -> i32
+ // CHECK-NEXT: fir.store %[[IV_IDX]] to %[[BINDING]]#1 : !fir.ref<i32>
+ // CHECK-NEXT: %[[IV_VAL1:.*]] = fir.load %[[BINDING]]#0 : !fir.ref<i32>
+ // CHECK-NEXT: %[[IV_VAL2:.*]] = fir.load %[[BINDING]]#0 : !fir.ref<i32>
+ // CHECK-NEXT: %[[IV_VAL_I64:.*]] = fir.convert %[[IV_VAL2]] : (i32) -> i64
+ // CHECK-NEXT: %[[ARR_ACCESS:.*]] = hlfir.designate %[[ARR]]#0 (%[[IV_VAL_I64]]) : (!fir.ref<!fir.array<10xi32>>, i64) -> !fir.ref<i32>
+ // CHECK-NEXT: hlfir.assign %[[IV_VAL1]] to %[[ARR_ACCESS]] : i32, !fir.ref<i32>
+ // CHECK-NEXT: omp.yield
+ // CHECK-NEXT: }
+ // CHECK-NEXT: }
+
+ // CHECK-NEXT: omp.terminator
+ // CHECK-NEXT: }
+ fir.do_loop %arg0 = %7 to %8 step %c1 unordered {
+ %13 = fir.convert %arg0 : (index) -> i32
+ fir.store %13 to %1#1 : !fir.ref<i32>
+ %14 = fir.load %1#0 : !fir.ref<i32>
+ %15 = fir.load %1#0 : !fir.ref<i32>
+ %16 = fir.convert %15 : (i32) -> i64
+ %17 = hlfir.designate %4#0 (%16) : (!fir.ref<!fir.array<10xi32>>, i64) -> !fir.ref<i32>
+ hlfir.assign %14 to %17 : i32, !fir.ref<i32>
+ }
+
+ // CHECK-NOT: fir.do_loop
+
+ return
+ }
diff --git a/flang/test/Transforms/DoConcurrent/non_const_bounds.f90 b/flang/test/Transforms/DoConcurrent/non_const_bounds.f90
new file mode 100644
index 0000000000000..48e0afe6752b6
--- /dev/null
+++ b/flang/test/Transforms/DoConcurrent/non_const_bounds.f90
@@ -0,0 +1,45 @@
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -fdo-concurrent-to-openmp=host %s -o - \
+! RUN: | FileCheck %s
+
+program main
+ implicit none
+
+ call foo(10)
+
+ contains
+ subroutine foo(n)
+ implicit none
+ integer :: n
+ integer :: i
+ integer, dimension(n) :: a
+
+ do concurrent(i=1:n)
+ a(i) = i
+ end do
+ end subroutine
+
+end program main
+
+! CHECK: %[[N_DECL:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{.*}} {uniq_name = "_QFFfooEn"}
+
+! CHECK: fir.load
+
+! CHECK: %[[LB:.*]] = fir.convert %{{c1_.*}} : (i32) -> index
+! CHECK: %[[N_VAL:.*]] = fir.load %[[N_DECL]]#0 : !fir.ref<i32>
+! CHECK: %[[UB:.*]] = fir.convert %[[N_VAL]] : (i32) -> index
+! CHECK: %[[C1:.*]] = arith.constant 1 : index
+
+! CHECK: omp.parallel {
+
+
+! Verify that we restort to using the outside value for the upper bound since it
+! is not originally a constant.
+
+! CHECK: omp.wsloop {
+! CHECK: omp.loop_nest (%{{.*}}) : index = (%[[LB]]) to (%[[UB]]) inclusive step (%{{.*}}) {
+! CHECK: omp.yield
+! CHECK: }
+! CHECK: }
+! CHECK: omp.terminator
+! CHECK: }
+
diff --git a/flang/test/Transforms/DoConcurrent/not_perfectly_nested.f90 b/flang/test/Transforms/DoConcurrent/not_perfectly_nested.f90
new file mode 100644
index 0000000000000..e330714d2db8e
--- /dev/null
+++ b/flang/test/Transforms/DoConcurrent/not_perfectly_nested.f90
@@ -0,0 +1,45 @@
+! Tests that if `do concurrent` is not perfectly nested in its parent loop, that
+! we skip converting the not-perfectly nested `do concurrent` loop.
+
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -fdo-concurrent-to-openmp=host %s -o - \
+! RUN: | FileCheck %s
+
+program main
+ integer, parameter :: n = 10
+ integer, parameter :: m = 20
+ integer, parameter :: l = 30
+ integer x;
+ integer :: a(n, m, l)
+
+ do concurrent(i=1:n)
+ x = 10
+ do concurrent(j=1:m, k=1:l)
+ a(i,j,k) = i * j + k
+ end do
+ end do
+end
+
+! CHECK: %[[ORIG_K_ALLOC:.*]] = fir.alloca i32 {bindc_name = "k"}
+! CHECK: %[[ORIG_K_DECL:.*]]:2 = hlfir.declare %[[ORIG_K_ALLOC]]
+
+! CHECK: %[[ORIG_J_ALLOC:.*]] = fir.alloca i32 {bindc_name = "j"}
+! CHECK: %[[ORIG_J_DECL:.*]]:2 = hlfir.declare %[[ORIG_J_ALLOC]]
+
+! CHECK: omp.parallel {
+
+! CHECK: omp.wsloop {
+! CHECK: omp.loop_nest ({{[^[:space:]]+}}) {{.*}} {
+! CHECK: fir.do_loop %[[J_IV:.*]] = {{.*}} {
+! CHECK: %[[J_IV_CONV:.*]] = fir.convert %[[J_IV]] : (index) -> i32
+! CHECK: fir.store %[[J_IV_CONV]] to %[[ORIG_J_DECL]]#1
+
+! CHECK: fir.do_loop %[[K_IV:.*]] = {{.*}} {
+! CHECK: %[[K_IV_CONV:.*]] = fir.convert %[[K_IV]] : (index) -> i32
+! CHECK: fir.store %[[K_IV_CONV]] to %[[ORIG_K_DECL]]#1
+! CHECK: }
+! CHECK: }
+! CHECK: omp.yield
+! CHECK: }
+! CHECK: }
+! CHECK: omp.terminator
+! CHECK: }
>From 2720590e255be27ab58a3e52f69fc5f1fba8d048 Mon Sep 17 00:00:00 2001
From: ergawy <kareem.ergawy at amd.com>
Date: Tue, 4 Mar 2025 02:18:27 -0600
Subject: [PATCH 2/3] Handle review comments
---
.../OpenMP/DoConcurrentConversion.cpp | 62 +++++--------------
.../Transforms/DoConcurrent/basic_device.mlir | 29 +++++++++
2 files changed, 45 insertions(+), 46 deletions(-)
create mode 100644 flang/test/Transforms/DoConcurrent/basic_device.mlir
diff --git a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
index 4b9024e148621..7ad2229df31ce 100644
--- a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
+++ b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
@@ -35,39 +35,6 @@ struct InductionVariableInfo {
using LoopNestToIndVarMap =
llvm::MapVector<fir::DoLoopOp, InductionVariableInfo>;
-/// Given an operation `op`, this returns true if one of `op`'s operands is
-/// "ultimately" the loop's induction variable. This helps in cases where the
-/// induction variable's use is "hidden" behind a convert/cast.
-///
-/// For example, give the following loop:
-/// ```
-/// fir.do_loop %ind_var = %lb to %ub step %s unordered {
-/// %ind_var_conv = fir.convert %ind_var : (index) -> i32
-/// fir.store %ind_var_conv to %i#1 : !fir.ref<i32>
-/// ...
-/// }
-/// ```
-///
-/// If \p op is the `fir.store` operation, then this function will return true
-/// since the IV is the "ultimate" operand to the `fir.store` op through the
-/// `%ind_var_conv` -> `%ind_var` conversion sequence.
-///
-/// For why this is useful, see its use in `findLoopIndVarMemDecl`.
-bool isIndVarUltimateOperand(mlir::Operation *op, fir::DoLoopOp doLoop) {
- while (op != nullptr && op->getNumOperands() > 0) {
- auto ivIt = llvm::find_if(op->getOperands(), [&](mlir::Value operand) {
- return operand == doLoop.getInductionVar();
- });
-
- if (ivIt != op->getOperands().end())
- return true;
-
- op = op->getOperand(0).getDefiningOp();
- }
-
- return false;
-}
-
/// For the \p doLoop parameter, find the operation that declares its iteration
/// variable or allocates memory for it.
///
@@ -84,19 +51,20 @@ bool isIndVarUltimateOperand(mlir::Operation *op, fir::DoLoopOp doLoop) {
/// ```
///
/// This function returns the `hlfir.declare` op for `%i`.
+///
+/// Note: The current implementation is dependent on how flang emits loop
+/// bodies; which is sufficient for the current simple test/use cases. If this
+/// proves to be insufficient, this should be made more generic.
mlir::Operation *findLoopIterationVarMemDecl(fir::DoLoopOp doLoop) {
mlir::Value result = nullptr;
- mlir::visitUsedValuesDefinedAbove(
- doLoop.getRegion(), [&](mlir::OpOperand *operand) {
- if (result)
- return;
-
- if (isIndVarUltimateOperand(operand->getOwner(), doLoop)) {
- assert(result == nullptr &&
- "loop can have only one induction variable");
- result = operand->get();
- }
- });
+ for (mlir::Operation &op : doLoop) {
+ // The first `fir.store` op we come across should be the op that updates the
+ // loop's iteration variable.
+ if (auto storeOp = mlir::dyn_cast<fir::StoreOp>(op)) {
+ result = storeOp.getMemref();
+ break;
+ }
+ }
assert(result != nullptr && result.getDefiningOp() != nullptr);
return result.getDefiningOp();
@@ -239,6 +207,10 @@ class DoConcurrentConversion : public mlir::OpConversionPattern<fir::DoLoopOp> {
mlir::LogicalResult
matchAndRewrite(fir::DoLoopOp doLoop, OpAdaptor adaptor,
mlir::ConversionPatternRewriter &rewriter) const override {
+ if (mapToDevice)
+ return doLoop.emitError(
+ "not yet implemented: Mapping `do concurrent` loops to device");
+
looputils::LoopNestToIndVarMap loopNest;
bool hasRemainingNestedLoops =
failed(looputils::collectLoopNest(doLoop, loopNest));
@@ -407,8 +379,6 @@ class DoConcurrentConversionPass
if (mlir::failed(mlir::applyFullConversion(getOperation(), target,
std::move(patterns)))) {
- mlir::emitError(mlir::UnknownLoc::get(context),
- "error in converting do-concurrent op");
signalPassFailure();
}
}
diff --git a/flang/test/Transforms/DoConcurrent/basic_device.mlir b/flang/test/Transforms/DoConcurrent/basic_device.mlir
new file mode 100644
index 0000000000000..d7fcc40e4a7f9
--- /dev/null
+++ b/flang/test/Transforms/DoConcurrent/basic_device.mlir
@@ -0,0 +1,29 @@
+// RUN: fir-opt --omp-do-concurrent-conversion="map-to=device" -verify-diagnostics %s
+
+func.func @do_concurrent_basic() attributes {fir.bindc_name = "do_concurrent_basic"} {
+ %0 = fir.alloca i32 {bindc_name = "i"}
+ %1:2 = hlfir.declare %0 {uniq_name = "_QFEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+ %2 = fir.address_of(@_QFEa) : !fir.ref<!fir.array<10xi32>>
+ %c10 = arith.constant 10 : index
+ %3 = fir.shape %c10 : (index) -> !fir.shape<1>
+ %4:2 = hlfir.declare %2(%3) {uniq_name = "_QFEa"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+ %c1_i32 = arith.constant 1 : i32
+ %7 = fir.convert %c1_i32 : (i32) -> index
+ %c10_i32 = arith.constant 10 : i32
+ %8 = fir.convert %c10_i32 : (i32) -> index
+ %c1 = arith.constant 1 : index
+
+ // expected-error at +2 {{not yet implemented: Mapping `do concurrent` loops to device}}
+ // expected-error at below {{failed to legalize operation 'fir.do_loop'}}
+ fir.do_loop %arg0 = %7 to %8 step %c1 unordered {
+ %13 = fir.convert %arg0 : (index) -> i32
+ fir.store %13 to %1#1 : !fir.ref<i32>
+ %14 = fir.load %1#0 : !fir.ref<i32>
+ %15 = fir.load %1#0 : !fir.ref<i32>
+ %16 = fir.convert %15 : (i32) -> i64
+ %17 = hlfir.designate %4#0 (%16) : (!fir.ref<!fir.array<10xi32>>, i64) -> !fir.ref<i32>
+ hlfir.assign %14 to %17 : i32, !fir.ref<i32>
+ }
+
+ return
+ }
>From 0243c4f32d3a863d1989c2c84f17b086ef562fcf Mon Sep 17 00:00:00 2001
From: ergawy <kareem.ergawy at amd.com>
Date: Tue, 11 Mar 2025 00:05:55 -0500
Subject: [PATCH 3/3] handle review comments
---
.../OpenMP/DoConcurrentConversion.cpp | 35 ++++++++++++++-----
.../DoConcurrent/non_const_bounds.f90 | 2 +-
2 files changed, 27 insertions(+), 10 deletions(-)
diff --git a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
index 7ad2229df31ce..bb535cf8aba03 100644
--- a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
+++ b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
@@ -28,7 +28,7 @@ namespace looputils {
/// Stores info needed about the induction/iteration variable for each `do
/// concurrent` in a loop nest.
struct InductionVariableInfo {
- /// the operation allocating memory for iteration variable,
+ /// The operation allocating memory for iteration variable.
mlir::Operation *iterVarMemDef;
};
@@ -57,13 +57,30 @@ using LoopNestToIndVarMap =
/// proves to be insufficient, this should be made more generic.
mlir::Operation *findLoopIterationVarMemDecl(fir::DoLoopOp doLoop) {
mlir::Value result = nullptr;
- for (mlir::Operation &op : doLoop) {
- // The first `fir.store` op we come across should be the op that updates the
- // loop's iteration variable.
- if (auto storeOp = mlir::dyn_cast<fir::StoreOp>(op)) {
- result = storeOp.getMemref();
- break;
+
+ // Checks if a StoreOp is updating the memref of the loop's iteration
+ // variable.
+ auto isStoringIV = [&](fir::StoreOp storeOp) {
+ // Direct store into the IV memref.
+ if (storeOp.getValue() == doLoop.getInductionVar())
+ return true;
+
+ // Indirect store into the IV memref.
+ if (auto convertOp = mlir::dyn_cast<fir::ConvertOp>(
+ storeOp.getValue().getDefiningOp())) {
+ if (convertOp.getOperand() == doLoop.getInductionVar())
+ return true;
}
+
+ return false;
+ };
+
+ for (mlir::Operation &op : doLoop) {
+ if (auto storeOp = mlir::dyn_cast<fir::StoreOp>(op))
+ if (isStoringIV(storeOp)) {
+ result = storeOp.getMemref();
+ break;
+ }
}
assert(result != nullptr && result.getDefiningOp() != nullptr);
@@ -291,8 +308,8 @@ class DoConcurrentConversion : public mlir::OpConversionPattern<fir::DoLoopOp> {
assert(loopNestClauseOps.loopLowerBounds.empty() &&
"Loop nest bounds were already emitted!");
- auto populateBounds = [&](mlir::Value var,
- llvm::SmallVectorImpl<mlir::Value> &bounds) {
+ auto populateBounds = [](mlir::Value var,
+ llvm::SmallVectorImpl<mlir::Value> &bounds) {
bounds.push_back(var.getDefiningOp()->getResult(0));
};
diff --git a/flang/test/Transforms/DoConcurrent/non_const_bounds.f90 b/flang/test/Transforms/DoConcurrent/non_const_bounds.f90
index 48e0afe6752b6..cd1bd4f98a3f5 100644
--- a/flang/test/Transforms/DoConcurrent/non_const_bounds.f90
+++ b/flang/test/Transforms/DoConcurrent/non_const_bounds.f90
@@ -32,7 +32,7 @@ end program main
! CHECK: omp.parallel {
-! Verify that we restort to using the outside value for the upper bound since it
+! Verify that we resort to using the outside value for the upper bound since it
! is not originally a constant.
! CHECK: omp.wsloop {
More information about the flang-commits
mailing list