[flang-commits] [flang] [flang][OpenMP] Map simple `do concurrent` loops to OpenMP host constructs (PR #127633)

Wed Apr 2 01:14:25 PDT 2025

https://github.com/ergawy updated https://github.com/llvm/llvm-project/pull/127633

>From 20619fdbef2d6b1a607059d8e9d2017eea9e6ad6 Mon Sep 17 00:00:00 2001
From: ergawy <kareem.ergawy at amd.com>
Date: Tue, 18 Feb 2025 02:50:46 -0600
Subject: [PATCH 1/3] [flang][OpenMP] Map simple `do concurrent` loops to
 OpenMP host constructs

Upstreams one more part of the ROCm `do concurrent` to OpenMP mapping
pass. This PR add support for converting simple loops to the equivalent
OpenMP constructs on the host: `omp parallel do`. Towards that end, we
have to collect more information about loop nests for which we add new
utils in the `looputils` name space.
---
 flang/docs/DoConcurrentConversionToOpenMP.md  |  47 ++++
 .../OpenMP/DoConcurrentConversion.cpp         | 211 +++++++++++++++++-
 .../Transforms/DoConcurrent/basic_host.f90    |  14 +-
 .../Transforms/DoConcurrent/basic_host.mlir   |  62 +++++
 .../DoConcurrent/non_const_bounds.f90         |  45 ++++
 .../DoConcurrent/not_perfectly_nested.f90     |  45 ++++
 6 files changed, 405 insertions(+), 19 deletions(-)
 create mode 100644 flang/test/Transforms/DoConcurrent/basic_host.mlir
 create mode 100644 flang/test/Transforms/DoConcurrent/non_const_bounds.f90
 create mode 100644 flang/test/Transforms/DoConcurrent/not_perfectly_nested.f90

diff --git a/flang/docs/DoConcurrentConversionToOpenMP.md b/flang/docs/DoConcurrentConversionToOpenMP.md
index 7b49af742f242..19611615ee9d6 100644
--- a/flang/docs/DoConcurrentConversionToOpenMP.md
+++ b/flang/docs/DoConcurrentConversionToOpenMP.md
@@ -126,6 +126,53 @@ see the "Data environment" section below.
 See `flang/test/Transforms/DoConcurrent/loop_nest_test.f90` for more examples
 of what is and is not detected as a perfect loop nest.
 
+### Single-range loops
+
+Given the following loop:
+```fortran
+  do concurrent(i=1:n)
+    a(i) = i * i
+  end do
+```
+
+#### Mapping to `host`
+
+Mapping this loop to the `host`, generates MLIR operations of the following
+structure:
+
+```
+%4 = fir.address_of(@_QFEa) ...
+%6:2 = hlfir.declare %4 ...
+
+omp.parallel {
+  // Allocate private copy for `i`.
+  // TODO Use delayed privatization.
+  %19 = fir.alloca i32 {bindc_name = "i"}
+  %20:2 = hlfir.declare %19 {uniq_name = "_QFEi"} ...
+
+  omp.wsloop {
+    omp.loop_nest (%arg0) : index = (%21) to (%22) inclusive step (%c1_2) {
+      %23 = fir.convert %arg0 : (index) -> i32
+      // Use the privatized version of `i`.
+      fir.store %23 to %20#1 : !fir.ref<i32>
+      ...
+
+      // Use "shared" SSA value of `a`.
+      %42 = hlfir.designate %6#0
+      hlfir.assign %35 to %42
+      ...
+      omp.yield
+    }
+    omp.terminator
+  }
+  omp.terminator
+}
+```
+
+#### Mapping to `device`
+
+<!-- TODO -->
+
 <!--
 More details about current status will be added along with relevant parts of the
 implementation in later upstreaming patches.
diff --git a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
index ad88b42ac6d7a..4b9024e148621 100644
--- a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
+++ b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
@@ -11,6 +11,7 @@
 #include "flang/Optimizer/OpenMP/Utils.h"
 #include "mlir/Analysis/SliceAnalysis.h"
 #include "mlir/Dialect/OpenMP/OpenMPDialect.h"
+#include "mlir/IR/IRMapping.h"
 #include "mlir/Transforms/DialectConversion.h"
 #include "mlir/Transforms/RegionUtils.h"
 
@@ -24,7 +25,82 @@ namespace flangomp {
 
 namespace {
 namespace looputils {
-using LoopNest = llvm::SetVector<fir::DoLoopOp>;
+/// Stores info needed about the induction/iteration variable for each `do
+/// concurrent` in a loop nest.
+struct InductionVariableInfo {
+  /// the operation allocating memory for iteration variable,
+  mlir::Operation *iterVarMemDef;
+};
+
+using LoopNestToIndVarMap =
+    llvm::MapVector<fir::DoLoopOp, InductionVariableInfo>;
+
+/// Given an operation `op`, this returns true if one of `op`'s operands is
+/// "ultimately" the loop's induction variable. This helps in cases where the
+/// induction variable's use is "hidden" behind a convert/cast.
+///
+/// For example, give the following loop:
+/// ```
+///   fir.do_loop %ind_var = %lb to %ub step %s unordered {
+///     %ind_var_conv = fir.convert %ind_var : (index) -> i32
+///     fir.store %ind_var_conv to %i#1 : !fir.ref<i32>
+///     ...
+///   }
+/// ```
+///
+/// If \p op is the `fir.store` operation, then this function will return true
+/// since the IV is the "ultimate" operand to the `fir.store` op through the
+/// `%ind_var_conv` -> `%ind_var` conversion sequence.
+///
+/// For why this is useful, see its use in `findLoopIndVarMemDecl`.
+bool isIndVarUltimateOperand(mlir::Operation *op, fir::DoLoopOp doLoop) {
+  while (op != nullptr && op->getNumOperands() > 0) {
+    auto ivIt = llvm::find_if(op->getOperands(), [&](mlir::Value operand) {
+      return operand == doLoop.getInductionVar();
+    });
+
+    if (ivIt != op->getOperands().end())
+      return true;
+
+    op = op->getOperand(0).getDefiningOp();
+  }
+
+  return false;
+}
+
+/// For the \p doLoop parameter, find the operation that declares its iteration
+/// variable or allocates memory for it.
+///
+/// For example, give the following loop:
+/// ```
+///   ...
+///   %i:2 = hlfir.declare %0 {uniq_name = "_QFEi"} : ...
+///   ...
+///   fir.do_loop %ind_var = %lb to %ub step %s unordered {
+///     %ind_var_conv = fir.convert %ind_var : (index) -> i32
+///     fir.store %ind_var_conv to %i#1 : !fir.ref<i32>
+///     ...
+///   }
+/// ```
+///
+/// This function returns the `hlfir.declare` op for `%i`.
+mlir::Operation *findLoopIterationVarMemDecl(fir::DoLoopOp doLoop) {
+  mlir::Value result = nullptr;
+  mlir::visitUsedValuesDefinedAbove(
+      doLoop.getRegion(), [&](mlir::OpOperand *operand) {
+        if (result)
+          return;
+
+        if (isIndVarUltimateOperand(operand->getOwner(), doLoop)) {
+          assert(result == nullptr &&
+                 "loop can have only one induction variable");
+          result = operand->get();
+        }
+      });
+
+  assert(result != nullptr && result.getDefiningOp() != nullptr);
+  return result.getDefiningOp();
+}
 
 /// Loop \p innerLoop is considered perfectly-nested inside \p outerLoop iff
 /// there are no operations in \p outerloop's body other than:
@@ -116,11 +192,14 @@ bool isPerfectlyNested(fir::DoLoopOp outerLoop, fir::DoLoopOp innerLoop) {
 /// fails to recognize a certain nested loop as part of the nest it just returns
 /// the parent loops it discovered before.
 mlir::LogicalResult collectLoopNest(fir::DoLoopOp currentLoop,
-                                    LoopNest &loopNest) {
+                                    LoopNestToIndVarMap &loopNest) {
   assert(currentLoop.getUnordered());
 
   while (true) {
-    loopNest.insert(currentLoop);
+    loopNest.insert(
+        {currentLoop,
+         InductionVariableInfo{findLoopIterationVarMemDecl(currentLoop)}});
+
     llvm::SmallVector<fir::DoLoopOp> unorderedLoops;
 
     for (auto nestedLoop : currentLoop.getRegion().getOps<fir::DoLoopOp>())
@@ -152,13 +231,15 @@ class DoConcurrentConversion : public mlir::OpConversionPattern<fir::DoLoopOp> {
 public:
   using mlir::OpConversionPattern<fir::DoLoopOp>::OpConversionPattern;
 
-  DoConcurrentConversion(mlir::MLIRContext *context, bool mapToDevice)
-      : OpConversionPattern(context), mapToDevice(mapToDevice) {}
+  DoConcurrentConversion(mlir::MLIRContext *context, bool mapToDevice,
+                         llvm::DenseSet<fir::DoLoopOp> &concurrentLoopsToSkip)
+      : OpConversionPattern(context), mapToDevice(mapToDevice),
+        concurrentLoopsToSkip(concurrentLoopsToSkip) {}
 
   mlir::LogicalResult
   matchAndRewrite(fir::DoLoopOp doLoop, OpAdaptor adaptor,
                   mlir::ConversionPatternRewriter &rewriter) const override {
-    looputils::LoopNest loopNest;
+    looputils::LoopNestToIndVarMap loopNest;
     bool hasRemainingNestedLoops =
         failed(looputils::collectLoopNest(doLoop, loopNest));
     if (hasRemainingNestedLoops)
@@ -166,12 +247,120 @@ class DoConcurrentConversion : public mlir::OpConversionPattern<fir::DoLoopOp> {
                         "Some `do concurent` loops are not perfectly-nested. "
                         "These will be serialized.");
 
-    // TODO This will be filled in with the next PRs that upstreams the rest of
-    // the ROCm implementaion.
+    mlir::IRMapping mapper;
+    genParallelOp(doLoop.getLoc(), rewriter, loopNest, mapper);
+    mlir::omp::LoopNestOperands loopNestClauseOps;
+    genLoopNestClauseOps(doLoop.getLoc(), rewriter, loopNest, mapper,
+                         loopNestClauseOps);
+
+    mlir::omp::LoopNestOp ompLoopNest =
+        genWsLoopOp(rewriter, loopNest.back().first, mapper, loopNestClauseOps,
+                    /*isComposite=*/mapToDevice);
+
+    rewriter.eraseOp(doLoop);
+
+    // Mark `unordered` loops that are not perfectly nested to be skipped from
+    // the legality check of the `ConversionTarget` since we are not interested
+    // in mapping them to OpenMP.
+    ompLoopNest->walk([&](fir::DoLoopOp doLoop) {
+      if (doLoop.getUnordered()) {
+        concurrentLoopsToSkip.insert(doLoop);
+      }
+    });
+
     return mlir::success();
   }
 
+private:
+  mlir::omp::ParallelOp genParallelOp(mlir::Location loc,
+                                      mlir::ConversionPatternRewriter &rewriter,
+                                      looputils::LoopNestToIndVarMap &loopNest,
+                                      mlir::IRMapping &mapper) const {
+    auto parallelOp = rewriter.create<mlir::omp::ParallelOp>(loc);
+    rewriter.createBlock(&parallelOp.getRegion());
+    rewriter.setInsertionPoint(rewriter.create<mlir::omp::TerminatorOp>(loc));
+
+    genLoopNestIndVarAllocs(rewriter, loopNest, mapper);
+    return parallelOp;
+  }
+
+  void genLoopNestIndVarAllocs(mlir::ConversionPatternRewriter &rewriter,
+                               looputils::LoopNestToIndVarMap &loopNest,
+                               mlir::IRMapping &mapper) const {
+
+    for (auto &[_, indVarInfo] : loopNest)
+      genInductionVariableAlloc(rewriter, indVarInfo.iterVarMemDef, mapper);
+  }
+
+  mlir::Operation *
+  genInductionVariableAlloc(mlir::ConversionPatternRewriter &rewriter,
+                            mlir::Operation *indVarMemDef,
+                            mlir::IRMapping &mapper) const {
+    assert(
+        indVarMemDef != nullptr &&
+        "Induction variable memdef is expected to have a defining operation.");
+
+    llvm::SmallSetVector<mlir::Operation *, 2> indVarDeclareAndAlloc;
+    for (auto operand : indVarMemDef->getOperands())
+      indVarDeclareAndAlloc.insert(operand.getDefiningOp());
+    indVarDeclareAndAlloc.insert(indVarMemDef);
+
+    mlir::Operation *result;
+    for (mlir::Operation *opToClone : indVarDeclareAndAlloc)
+      result = rewriter.clone(*opToClone, mapper);
+
+    return result;
+  }
+
+  void genLoopNestClauseOps(
+      mlir::Location loc, mlir::ConversionPatternRewriter &rewriter,
+      looputils::LoopNestToIndVarMap &loopNest, mlir::IRMapping &mapper,
+      mlir::omp::LoopNestOperands &loopNestClauseOps) const {
+    assert(loopNestClauseOps.loopLowerBounds.empty() &&
+           "Loop nest bounds were already emitted!");
+
+    auto populateBounds = [&](mlir::Value var,
+                              llvm::SmallVectorImpl<mlir::Value> &bounds) {
+      bounds.push_back(var.getDefiningOp()->getResult(0));
+    };
+
+    for (auto &[doLoop, _] : loopNest) {
+      populateBounds(doLoop.getLowerBound(), loopNestClauseOps.loopLowerBounds);
+      populateBounds(doLoop.getUpperBound(), loopNestClauseOps.loopUpperBounds);
+      populateBounds(doLoop.getStep(), loopNestClauseOps.loopSteps);
+    }
+
+    loopNestClauseOps.loopInclusive = rewriter.getUnitAttr();
+  }
+
+  mlir::omp::LoopNestOp
+  genWsLoopOp(mlir::ConversionPatternRewriter &rewriter, fir::DoLoopOp doLoop,
+              mlir::IRMapping &mapper,
+              const mlir::omp::LoopNestOperands &clauseOps,
+              bool isComposite) const {
+
+    auto wsloopOp = rewriter.create<mlir::omp::WsloopOp>(doLoop.getLoc());
+    wsloopOp.setComposite(isComposite);
+    rewriter.createBlock(&wsloopOp.getRegion());
+
+    auto loopNestOp =
+        rewriter.create<mlir::omp::LoopNestOp>(doLoop.getLoc(), clauseOps);
+
+    // Clone the loop's body inside the loop nest construct using the
+    // mapped values.
+    rewriter.cloneRegionBefore(doLoop.getRegion(), loopNestOp.getRegion(),
+                               loopNestOp.getRegion().begin(), mapper);
+
+    mlir::Operation *terminator = loopNestOp.getRegion().back().getTerminator();
+    rewriter.setInsertionPointToEnd(&loopNestOp.getRegion().back());
+    rewriter.create<mlir::omp::YieldOp>(terminator->getLoc());
+    rewriter.eraseOp(terminator);
+
+    return loopNestOp;
+  }
+
   bool mapToDevice;
+  llvm::DenseSet<fir::DoLoopOp> &concurrentLoopsToSkip;
 };
 
 class DoConcurrentConversionPass
@@ -200,16 +389,18 @@ class DoConcurrentConversionPass
       return;
     }
 
+    llvm::DenseSet<fir::DoLoopOp> concurrentLoopsToSkip;
     mlir::RewritePatternSet patterns(context);
     patterns.insert<DoConcurrentConversion>(
-        context, mapTo == flangomp::DoConcurrentMappingKind::DCMK_Device);
+        context, mapTo == flangomp::DoConcurrentMappingKind::DCMK_Device,
+        concurrentLoopsToSkip);
     mlir::ConversionTarget target(*context);
     target.addDynamicallyLegalOp<fir::DoLoopOp>([&](fir::DoLoopOp op) {
       // The goal is to handle constructs that eventually get lowered to
       // `fir.do_loop` with the `unordered` attribute (e.g. array expressions).
       // Currently, this is only enabled for the `do concurrent` construct since
       // the pass runs early in the pipeline.
-      return !op.getUnordered();
+      return !op.getUnordered() || concurrentLoopsToSkip.contains(op);
     });
     target.markUnknownOpDynamicallyLegal(
         [](mlir::Operation *) { return true; });
diff --git a/flang/test/Transforms/DoConcurrent/basic_host.f90 b/flang/test/Transforms/DoConcurrent/basic_host.f90
index b569668ab0f0e..b80bc2f2dd038 100644
--- a/flang/test/Transforms/DoConcurrent/basic_host.f90
+++ b/flang/test/Transforms/DoConcurrent/basic_host.f90
@@ -1,7 +1,3 @@
-! Mark as xfail for now until we upstream the relevant part. This is just for
-! demo purposes at this point. Upstreaming this is the next step.
-! XFAIL: *
-
 ! Tests mapping of a basic `do concurrent` loop to `!$omp parallel do`.
 
 ! RUN: %flang_fc1 -emit-hlfir -fopenmp -fdo-concurrent-to-openmp=host %s -o - \
@@ -19,17 +15,17 @@ program do_concurrent_basic
 
     ! CHECK-NOT: fir.do_loop
 
-    ! CHECK: omp.parallel {
-
-    ! CHECK-NEXT: %[[ITER_VAR:.*]] = fir.alloca i32 {bindc_name = "i"}
-    ! CHECK-NEXT: %[[BINDING:.*]]:2 = hlfir.declare %[[ITER_VAR]] {uniq_name = "_QFEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-
     ! CHECK: %[[C1:.*]] = arith.constant 1 : i32
     ! CHECK: %[[LB:.*]] = fir.convert %[[C1]] : (i32) -> index
     ! CHECK: %[[C10:.*]] = arith.constant 10 : i32
     ! CHECK: %[[UB:.*]] = fir.convert %[[C10]] : (i32) -> index
     ! CHECK: %[[STEP:.*]] = arith.constant 1 : index
 
+    ! CHECK: omp.parallel {
+
+    ! CHECK-NEXT: %[[ITER_VAR:.*]] = fir.alloca i32 {bindc_name = "i"}
+    ! CHECK-NEXT: %[[BINDING:.*]]:2 = hlfir.declare %[[ITER_VAR]] {uniq_name = "_QFEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+
     ! CHECK: omp.wsloop {
     ! CHECK-NEXT: omp.loop_nest (%[[ARG0:.*]]) : index = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) {
     ! CHECK-NEXT: %[[IV_IDX:.*]] = fir.convert %[[ARG0]] : (index) -> i32
diff --git a/flang/test/Transforms/DoConcurrent/basic_host.mlir b/flang/test/Transforms/DoConcurrent/basic_host.mlir
new file mode 100644
index 0000000000000..b53ecd687c039
--- /dev/null
+++ b/flang/test/Transforms/DoConcurrent/basic_host.mlir
@@ -0,0 +1,62 @@
+// Tests mapping of a basic `do concurrent` loop to `!$omp parallel do`.
+
+// RUN: fir-opt --omp-do-concurrent-conversion="map-to=host" %s | FileCheck %s
+
+// CHECK-LABEL: func.func @do_concurrent_basic
+func.func @do_concurrent_basic() attributes {fir.bindc_name = "do_concurrent_basic"} {
+    // CHECK: %[[ARR:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {uniq_name = "_QFEa"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+
+    %0 = fir.alloca i32 {bindc_name = "i"}
+    %1:2 = hlfir.declare %0 {uniq_name = "_QFEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+    %2 = fir.address_of(@_QFEa) : !fir.ref<!fir.array<10xi32>>
+    %c10 = arith.constant 10 : index
+    %3 = fir.shape %c10 : (index) -> !fir.shape<1>
+    %4:2 = hlfir.declare %2(%3) {uniq_name = "_QFEa"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+    %c1_i32 = arith.constant 1 : i32
+    %7 = fir.convert %c1_i32 : (i32) -> index
+    %c10_i32 = arith.constant 10 : i32
+    %8 = fir.convert %c10_i32 : (i32) -> index
+    %c1 = arith.constant 1 : index
+
+    // CHECK-NOT: fir.do_loop
+
+    // CHECK: %[[C1:.*]] = arith.constant 1 : i32
+    // CHECK: %[[LB:.*]] = fir.convert %[[C1]] : (i32) -> index
+    // CHECK: %[[C10:.*]] = arith.constant 10 : i32
+    // CHECK: %[[UB:.*]] = fir.convert %[[C10]] : (i32) -> index
+    // CHECK: %[[STEP:.*]] = arith.constant 1 : index
+
+    // CHECK: omp.parallel {
+
+    // CHECK-NEXT: %[[ITER_VAR:.*]] = fir.alloca i32 {bindc_name = "i"}
+    // CHECK-NEXT: %[[BINDING:.*]]:2 = hlfir.declare %[[ITER_VAR]] {uniq_name = "_QFEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+
+    // CHECK: omp.wsloop {
+    // CHECK-NEXT: omp.loop_nest (%[[ARG0:.*]]) : index = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) {
+    // CHECK-NEXT: %[[IV_IDX:.*]] = fir.convert %[[ARG0]] : (index) -> i32
+    // CHECK-NEXT: fir.store %[[IV_IDX]] to %[[BINDING]]#1 : !fir.ref<i32>
+    // CHECK-NEXT: %[[IV_VAL1:.*]] = fir.load %[[BINDING]]#0 : !fir.ref<i32>
+    // CHECK-NEXT: %[[IV_VAL2:.*]] = fir.load %[[BINDING]]#0 : !fir.ref<i32>
+    // CHECK-NEXT: %[[IV_VAL_I64:.*]] = fir.convert %[[IV_VAL2]] : (i32) -> i64
+    // CHECK-NEXT: %[[ARR_ACCESS:.*]] = hlfir.designate %[[ARR]]#0 (%[[IV_VAL_I64]])  : (!fir.ref<!fir.array<10xi32>>, i64) -> !fir.ref<i32>
+    // CHECK-NEXT: hlfir.assign %[[IV_VAL1]] to %[[ARR_ACCESS]] : i32, !fir.ref<i32>
+    // CHECK-NEXT: omp.yield
+    // CHECK-NEXT: }
+    // CHECK-NEXT: }
+
+    // CHECK-NEXT: omp.terminator
+    // CHECK-NEXT: }
+    fir.do_loop %arg0 = %7 to %8 step %c1 unordered {
+      %13 = fir.convert %arg0 : (index) -> i32
+      fir.store %13 to %1#1 : !fir.ref<i32>
+      %14 = fir.load %1#0 : !fir.ref<i32>
+      %15 = fir.load %1#0 : !fir.ref<i32>
+      %16 = fir.convert %15 : (i32) -> i64
+      %17 = hlfir.designate %4#0 (%16)  : (!fir.ref<!fir.array<10xi32>>, i64) -> !fir.ref<i32>
+      hlfir.assign %14 to %17 : i32, !fir.ref<i32>
+    }
+
+    // CHECK-NOT: fir.do_loop
+
+    return
+  }
diff --git a/flang/test/Transforms/DoConcurrent/non_const_bounds.f90 b/flang/test/Transforms/DoConcurrent/non_const_bounds.f90
new file mode 100644
index 0000000000000..48e0afe6752b6
--- /dev/null
+++ b/flang/test/Transforms/DoConcurrent/non_const_bounds.f90
@@ -0,0 +1,45 @@
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -fdo-concurrent-to-openmp=host %s -o - \
+! RUN:   | FileCheck %s
+
+program main
+    implicit none
+
+    call foo(10)
+
+    contains
+        subroutine foo(n)
+            implicit none
+            integer :: n
+            integer :: i
+            integer, dimension(n) :: a
+
+            do concurrent(i=1:n)
+                a(i) = i
+            end do
+        end subroutine
+
+end program main
+
+! CHECK: %[[N_DECL:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{.*}} {uniq_name = "_QFFfooEn"}
+
+! CHECK: fir.load
+
+! CHECK: %[[LB:.*]] = fir.convert %{{c1_.*}} : (i32) -> index
+! CHECK: %[[N_VAL:.*]] = fir.load %[[N_DECL]]#0 : !fir.ref<i32>
+! CHECK: %[[UB:.*]] = fir.convert %[[N_VAL]] : (i32) -> index
+! CHECK: %[[C1:.*]] = arith.constant 1 : index
+
+! CHECK: omp.parallel {
+
+
+! Verify that we restort to using the outside value for the upper bound since it
+! is not originally a constant.
+
+! CHECK:   omp.wsloop {
+! CHECK:     omp.loop_nest (%{{.*}}) : index = (%[[LB]]) to (%[[UB]]) inclusive step (%{{.*}}) {
+! CHECK:       omp.yield
+! CHECK:     }
+! CHECK:   }
+! CHECK:   omp.terminator
+! CHECK: }
+
diff --git a/flang/test/Transforms/DoConcurrent/not_perfectly_nested.f90 b/flang/test/Transforms/DoConcurrent/not_perfectly_nested.f90
new file mode 100644
index 0000000000000..e330714d2db8e
--- /dev/null
+++ b/flang/test/Transforms/DoConcurrent/not_perfectly_nested.f90
@@ -0,0 +1,45 @@
+! Tests that if `do concurrent` is not perfectly nested in its parent loop, that
+! we skip converting the not-perfectly nested `do concurrent` loop.
+
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -fdo-concurrent-to-openmp=host %s -o - \
+! RUN:   | FileCheck %s
+
+program main
+   integer, parameter :: n = 10
+   integer, parameter :: m = 20
+   integer, parameter :: l = 30
+   integer x;
+   integer :: a(n, m, l)
+
+   do concurrent(i=1:n)
+     x = 10
+     do concurrent(j=1:m, k=1:l)
+       a(i,j,k) = i * j + k
+     end do
+   end do
+end
+
+! CHECK: %[[ORIG_K_ALLOC:.*]] = fir.alloca i32 {bindc_name = "k"}
+! CHECK: %[[ORIG_K_DECL:.*]]:2 = hlfir.declare %[[ORIG_K_ALLOC]]
+
+! CHECK: %[[ORIG_J_ALLOC:.*]] = fir.alloca i32 {bindc_name = "j"}
+! CHECK: %[[ORIG_J_DECL:.*]]:2 = hlfir.declare %[[ORIG_J_ALLOC]]
+
+! CHECK: omp.parallel {
+
+! CHECK: omp.wsloop {
+! CHECK: omp.loop_nest ({{[^[:space:]]+}}) {{.*}} {
+! CHECK:   fir.do_loop %[[J_IV:.*]] = {{.*}} {
+! CHECK:     %[[J_IV_CONV:.*]] = fir.convert %[[J_IV]] : (index) -> i32
+! CHECK:       fir.store %[[J_IV_CONV]] to %[[ORIG_J_DECL]]#1
+
+! CHECK:     fir.do_loop %[[K_IV:.*]] = {{.*}} {
+! CHECK:       %[[K_IV_CONV:.*]] = fir.convert %[[K_IV]] : (index) -> i32
+! CHECK:         fir.store %[[K_IV_CONV]] to %[[ORIG_K_DECL]]#1
+! CHECK:     }
+! CHECK:   }
+! CHECK: omp.yield
+! CHECK: }
+! CHECK: }
+! CHECK: omp.terminator
+! CHECK: }

>From 2720590e255be27ab58a3e52f69fc5f1fba8d048 Mon Sep 17 00:00:00 2001
From: ergawy <kareem.ergawy at amd.com>
Date: Tue, 4 Mar 2025 02:18:27 -0600
Subject: [PATCH 2/3] Handle review comments

---
 .../OpenMP/DoConcurrentConversion.cpp         | 62 +++++--------------
 .../Transforms/DoConcurrent/basic_device.mlir | 29 +++++++++
 2 files changed, 45 insertions(+), 46 deletions(-)
 create mode 100644 flang/test/Transforms/DoConcurrent/basic_device.mlir

diff --git a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
index 4b9024e148621..7ad2229df31ce 100644
--- a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
+++ b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
@@ -35,39 +35,6 @@ struct InductionVariableInfo {
 using LoopNestToIndVarMap =
     llvm::MapVector<fir::DoLoopOp, InductionVariableInfo>;
 
-/// Given an operation `op`, this returns true if one of `op`'s operands is
-/// "ultimately" the loop's induction variable. This helps in cases where the
-/// induction variable's use is "hidden" behind a convert/cast.
-///
-/// For example, give the following loop:
-/// ```
-///   fir.do_loop %ind_var = %lb to %ub step %s unordered {
-///     %ind_var_conv = fir.convert %ind_var : (index) -> i32
-///     fir.store %ind_var_conv to %i#1 : !fir.ref<i32>
-///     ...
-///   }
-/// ```
-///
-/// If \p op is the `fir.store` operation, then this function will return true
-/// since the IV is the "ultimate" operand to the `fir.store` op through the
-/// `%ind_var_conv` -> `%ind_var` conversion sequence.
-///
-/// For why this is useful, see its use in `findLoopIndVarMemDecl`.
-bool isIndVarUltimateOperand(mlir::Operation *op, fir::DoLoopOp doLoop) {
-  while (op != nullptr && op->getNumOperands() > 0) {
-    auto ivIt = llvm::find_if(op->getOperands(), [&](mlir::Value operand) {
-      return operand == doLoop.getInductionVar();
-    });
-
-    if (ivIt != op->getOperands().end())
-      return true;
-
-    op = op->getOperand(0).getDefiningOp();
-  }
-
-  return false;
-}
-
 /// For the \p doLoop parameter, find the operation that declares its iteration
 /// variable or allocates memory for it.
 ///
@@ -84,19 +51,20 @@ bool isIndVarUltimateOperand(mlir::Operation *op, fir::DoLoopOp doLoop) {
 /// ```
 ///
 /// This function returns the `hlfir.declare` op for `%i`.
+///
+/// Note: The current implementation is dependent on how flang emits loop
+/// bodies; which is sufficient for the current simple test/use cases. If this
+/// proves to be insufficient, this should be made more generic.
 mlir::Operation *findLoopIterationVarMemDecl(fir::DoLoopOp doLoop) {
   mlir::Value result = nullptr;
-  mlir::visitUsedValuesDefinedAbove(
-      doLoop.getRegion(), [&](mlir::OpOperand *operand) {
-        if (result)
-          return;
-
-        if (isIndVarUltimateOperand(operand->getOwner(), doLoop)) {
-          assert(result == nullptr &&
-                 "loop can have only one induction variable");
-          result = operand->get();
-        }
-      });
+  for (mlir::Operation &op : doLoop) {
+    // The first `fir.store` op we come across should be the op that updates the
+    // loop's iteration variable.
+    if (auto storeOp = mlir::dyn_cast<fir::StoreOp>(op)) {
+      result = storeOp.getMemref();
+      break;
+    }
+  }
 
   assert(result != nullptr && result.getDefiningOp() != nullptr);
   return result.getDefiningOp();
@@ -239,6 +207,10 @@ class DoConcurrentConversion : public mlir::OpConversionPattern<fir::DoLoopOp> {
   mlir::LogicalResult
   matchAndRewrite(fir::DoLoopOp doLoop, OpAdaptor adaptor,
                   mlir::ConversionPatternRewriter &rewriter) const override {
+    if (mapToDevice)
+      return doLoop.emitError(
+          "not yet implemented: Mapping `do concurrent` loops to device");
+
     looputils::LoopNestToIndVarMap loopNest;
     bool hasRemainingNestedLoops =
         failed(looputils::collectLoopNest(doLoop, loopNest));
@@ -407,8 +379,6 @@ class DoConcurrentConversionPass
 
     if (mlir::failed(mlir::applyFullConversion(getOperation(), target,
                                                std::move(patterns)))) {
-      mlir::emitError(mlir::UnknownLoc::get(context),
-                      "error in converting do-concurrent op");
       signalPassFailure();
     }
   }
diff --git a/flang/test/Transforms/DoConcurrent/basic_device.mlir b/flang/test/Transforms/DoConcurrent/basic_device.mlir
new file mode 100644
index 0000000000000..d7fcc40e4a7f9
--- /dev/null
+++ b/flang/test/Transforms/DoConcurrent/basic_device.mlir
@@ -0,0 +1,29 @@
+// RUN: fir-opt --omp-do-concurrent-conversion="map-to=device" -verify-diagnostics %s
+
+func.func @do_concurrent_basic() attributes {fir.bindc_name = "do_concurrent_basic"} {
+    %0 = fir.alloca i32 {bindc_name = "i"}
+    %1:2 = hlfir.declare %0 {uniq_name = "_QFEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+    %2 = fir.address_of(@_QFEa) : !fir.ref<!fir.array<10xi32>>
+    %c10 = arith.constant 10 : index
+    %3 = fir.shape %c10 : (index) -> !fir.shape<1>
+    %4:2 = hlfir.declare %2(%3) {uniq_name = "_QFEa"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+    %c1_i32 = arith.constant 1 : i32
+    %7 = fir.convert %c1_i32 : (i32) -> index
+    %c10_i32 = arith.constant 10 : i32
+    %8 = fir.convert %c10_i32 : (i32) -> index
+    %c1 = arith.constant 1 : index
+
+    // expected-error at +2 {{not yet implemented: Mapping `do concurrent` loops to device}}
+    // expected-error at below {{failed to legalize operation 'fir.do_loop'}}
+    fir.do_loop %arg0 = %7 to %8 step %c1 unordered {
+      %13 = fir.convert %arg0 : (index) -> i32
+      fir.store %13 to %1#1 : !fir.ref<i32>
+      %14 = fir.load %1#0 : !fir.ref<i32>
+      %15 = fir.load %1#0 : !fir.ref<i32>
+      %16 = fir.convert %15 : (i32) -> i64
+      %17 = hlfir.designate %4#0 (%16)  : (!fir.ref<!fir.array<10xi32>>, i64) -> !fir.ref<i32>
+      hlfir.assign %14 to %17 : i32, !fir.ref<i32>
+    }
+
+    return
+  }

>From 0243c4f32d3a863d1989c2c84f17b086ef562fcf Mon Sep 17 00:00:00 2001
From: ergawy <kareem.ergawy at amd.com>
Date: Tue, 11 Mar 2025 00:05:55 -0500
Subject: [PATCH 3/3] handle review comments

---
 .../OpenMP/DoConcurrentConversion.cpp         | 35 ++++++++++++++-----
 .../DoConcurrent/non_const_bounds.f90         |  2 +-
 2 files changed, 27 insertions(+), 10 deletions(-)

diff --git a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
index 7ad2229df31ce..bb535cf8aba03 100644
--- a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
+++ b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
@@ -28,7 +28,7 @@ namespace looputils {
 /// Stores info needed about the induction/iteration variable for each `do
 /// concurrent` in a loop nest.
 struct InductionVariableInfo {
-  /// the operation allocating memory for iteration variable,
+  /// The operation allocating memory for iteration variable.
   mlir::Operation *iterVarMemDef;
 };
 
@@ -57,13 +57,30 @@ using LoopNestToIndVarMap =
 /// proves to be insufficient, this should be made more generic.
 mlir::Operation *findLoopIterationVarMemDecl(fir::DoLoopOp doLoop) {
   mlir::Value result = nullptr;
-  for (mlir::Operation &op : doLoop) {
-    // The first `fir.store` op we come across should be the op that updates the
-    // loop's iteration variable.
-    if (auto storeOp = mlir::dyn_cast<fir::StoreOp>(op)) {
-      result = storeOp.getMemref();
-      break;
+
+  // Checks if a StoreOp is updating the memref of the loop's iteration
+  // variable.
+  auto isStoringIV = [&](fir::StoreOp storeOp) {
+    // Direct store into the IV memref.
+    if (storeOp.getValue() == doLoop.getInductionVar())
+      return true;
+
+    // Indirect store into the IV memref.
+    if (auto convertOp = mlir::dyn_cast<fir::ConvertOp>(
+            storeOp.getValue().getDefiningOp())) {
+      if (convertOp.getOperand() == doLoop.getInductionVar())
+        return true;
     }
+
+    return false;
+  };
+
+  for (mlir::Operation &op : doLoop) {
+    if (auto storeOp = mlir::dyn_cast<fir::StoreOp>(op))
+      if (isStoringIV(storeOp)) {
+        result = storeOp.getMemref();
+        break;
+      }
   }
 
   assert(result != nullptr && result.getDefiningOp() != nullptr);
@@ -291,8 +308,8 @@ class DoConcurrentConversion : public mlir::OpConversionPattern<fir::DoLoopOp> {
     assert(loopNestClauseOps.loopLowerBounds.empty() &&
            "Loop nest bounds were already emitted!");
 
-    auto populateBounds = [&](mlir::Value var,
-                              llvm::SmallVectorImpl<mlir::Value> &bounds) {
+    auto populateBounds = [](mlir::Value var,
+                             llvm::SmallVectorImpl<mlir::Value> &bounds) {
       bounds.push_back(var.getDefiningOp()->getResult(0));
     };
 
diff --git a/flang/test/Transforms/DoConcurrent/non_const_bounds.f90 b/flang/test/Transforms/DoConcurrent/non_const_bounds.f90
index 48e0afe6752b6..cd1bd4f98a3f5 100644
--- a/flang/test/Transforms/DoConcurrent/non_const_bounds.f90
+++ b/flang/test/Transforms/DoConcurrent/non_const_bounds.f90
@@ -32,7 +32,7 @@ end program main
 ! CHECK: omp.parallel {
 
 
-! Verify that we restort to using the outside value for the upper bound since it
+! Verify that we resort to using the outside value for the upper bound since it
 ! is not originally a constant.
 
 ! CHECK:   omp.wsloop {