[llvm-branch-commits] [flang] [flang][OpenMP] Update `do concurrent` mapping pass to use `fir.do_concurrent` op (PR #138489)

Mon May 5 01:30:24 PDT 2025

https://github.com/ergawy updated https://github.com/llvm/llvm-project/pull/138489

>From ac0cde7576c49a00e3591254d6891879b52bee81 Mon Sep 17 00:00:00 2001
From: ergawy <kareem.ergawy at amd.com>
Date: Mon, 5 May 2025 02:23:04 -0500
Subject: [PATCH] [flang][OpenMP] Update `do concurrent` mapping pass to use
 `fir.do_concurrent` op

This PR updates the `do concurrent` to OpenMP mapping pass to use the newly added `fir.do_concurrent` ops that were recently added upstream instead of handling nests of `fir.do_loop ... unordered` ops.

Parent PR: https://github.com/llvm/llvm-project/pull/137928.
---
 .../OpenMP/DoConcurrentConversion.cpp         | 362 ++++--------------
 .../Transforms/DoConcurrent/basic_device.mlir |  24 +-
 .../Transforms/DoConcurrent/basic_host.f90    |   3 -
 .../Transforms/DoConcurrent/basic_host.mlir   |  26 +-
 .../DoConcurrent/locally_destroyed_temp.f90   |   3 -
 .../DoConcurrent/loop_nest_test.f90           |  92 -----
 .../multiple_iteration_ranges.f90             |   3 -
 .../DoConcurrent/non_const_bounds.f90         |   3 -
 .../DoConcurrent/not_perfectly_nested.f90     |  24 +-
 9 files changed, 122 insertions(+), 418 deletions(-)
 delete mode 100644 flang/test/Transforms/DoConcurrent/loop_nest_test.f90

diff --git a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
index 2c069860ffdca..0fdb302fe10ca 100644
--- a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
+++ b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "flang/Optimizer/Builder/FIRBuilder.h"
 #include "flang/Optimizer/Dialect/FIROps.h"
 #include "flang/Optimizer/OpenMP/Passes.h"
 #include "flang/Optimizer/OpenMP/Utils.h"
@@ -28,8 +29,10 @@ namespace looputils {
 /// Stores info needed about the induction/iteration variable for each `do
 /// concurrent` in a loop nest.
 struct InductionVariableInfo {
-  InductionVariableInfo(fir::DoLoopOp doLoop) { populateInfo(doLoop); }
-
+  InductionVariableInfo(fir::DoConcurrentLoopOp loop,
+                        mlir::Value inductionVar) {
+    populateInfo(loop, inductionVar);
+  }
   /// The operation allocating memory for iteration variable.
   mlir::Operation *iterVarMemDef;
   /// the operation(s) updating the iteration variable with the current
@@ -45,7 +48,7 @@ struct InductionVariableInfo {
   ///   ...
   ///   %i:2 = hlfir.declare %0 {uniq_name = "_QFEi"} : ...
   ///   ...
-  ///   fir.do_loop %ind_var = %lb to %ub step %s unordered {
+  ///   fir.do_concurrent.loop (%ind_var) = (%lb) to (%ub) step (%s) {
   ///     %ind_var_conv = fir.convert %ind_var : (index) -> i32
   ///     fir.store %ind_var_conv to %i#1 : !fir.ref<i32>
   ///     ...
@@ -62,14 +65,14 @@ struct InductionVariableInfo {
   /// Note: The current implementation is dependent on how flang emits loop
   /// bodies; which is sufficient for the current simple test/use cases. If this
   /// proves to be insufficient, this should be made more generic.
-  void populateInfo(fir::DoLoopOp doLoop) {
+  void populateInfo(fir::DoConcurrentLoopOp loop, mlir::Value inductionVar) {
     mlir::Value result = nullptr;
 
     // Checks if a StoreOp is updating the memref of the loop's iteration
     // variable.
     auto isStoringIV = [&](fir::StoreOp storeOp) {
       // Direct store into the IV memref.
-      if (storeOp.getValue() == doLoop.getInductionVar()) {
+      if (storeOp.getValue() == inductionVar) {
         indVarUpdateOps.push_back(storeOp);
         return true;
       }
@@ -77,7 +80,7 @@ struct InductionVariableInfo {
       // Indirect store into the IV memref.
       if (auto convertOp = mlir::dyn_cast<fir::ConvertOp>(
               storeOp.getValue().getDefiningOp())) {
-        if (convertOp.getOperand() == doLoop.getInductionVar()) {
+        if (convertOp.getOperand() == inductionVar) {
           indVarUpdateOps.push_back(convertOp);
           indVarUpdateOps.push_back(storeOp);
           return true;
@@ -87,7 +90,7 @@ struct InductionVariableInfo {
       return false;
     };
 
-    for (mlir::Operation &op : doLoop) {
+    for (mlir::Operation &op : loop) {
       if (auto storeOp = mlir::dyn_cast<fir::StoreOp>(op))
         if (isStoringIV(storeOp)) {
           result = storeOp.getMemref();
@@ -100,219 +103,7 @@ struct InductionVariableInfo {
   }
 };
 
-using LoopNestToIndVarMap =
-    llvm::MapVector<fir::DoLoopOp, InductionVariableInfo>;
-
-/// Loop \p innerLoop is considered perfectly-nested inside \p outerLoop iff
-/// there are no operations in \p outerloop's body other than:
-///
-/// 1. the operations needed to assign/update \p outerLoop's induction variable.
-/// 2. \p innerLoop itself.
-///
-/// \p return true if \p innerLoop is perfectly nested inside \p outerLoop
-/// according to the above definition.
-bool isPerfectlyNested(fir::DoLoopOp outerLoop, fir::DoLoopOp innerLoop) {
-  mlir::ForwardSliceOptions forwardSliceOptions;
-  forwardSliceOptions.inclusive = true;
-  // The following will be used as an example to clarify the internals of this
-  // function:
-  // ```
-  // 1. fir.do_loop %i_idx = %34 to %36 step %c1 unordered {
-  // 2.   %i_idx_2 = fir.convert %i_idx : (index) -> i32
-  // 3.   fir.store %i_idx_2 to %i_iv#1 : !fir.ref<i32>
-  //
-  // 4.   fir.do_loop %j_idx = %37 to %39 step %c1_3 unordered {
-  // 5.     %j_idx_2 = fir.convert %j_idx : (index) -> i32
-  // 6.     fir.store %j_idx_2 to %j_iv#1 : !fir.ref<i32>
-  //        ... loop nest body, possible uses %i_idx ...
-  //      }
-  //    }
-  // ```
-  // In this example, the `j` loop is perfectly nested inside the `i` loop and
-  // below is how we find that.
-
-  // We don't care about the outer-loop's induction variable's uses within the
-  // inner-loop, so we filter out these uses.
-  //
-  // This filter tells `getForwardSlice` (below) to only collect operations
-  // which produce results defined above (i.e. outside) the inner-loop's body.
-  //
-  // Since `outerLoop.getInductionVar()` is a block argument (to the
-  // outer-loop's body), the filter effectively collects uses of
-  // `outerLoop.getInductionVar()` inside the outer-loop but outside the
-  // inner-loop.
-  forwardSliceOptions.filter = [&](mlir::Operation *op) {
-    return mlir::areValuesDefinedAbove(op->getResults(), innerLoop.getRegion());
-  };
-
-  llvm::SetVector<mlir::Operation *> indVarSlice;
-  // The forward slice of the `i` loop's IV will be the 2 ops in line 1 & 2
-  // above. Uses of `%i_idx` inside the `j` loop are not collected because of
-  // the filter.
-  mlir::getForwardSlice(outerLoop.getInductionVar(), &indVarSlice,
-                        forwardSliceOptions);
-  llvm::DenseSet<mlir::Operation *> indVarSet(indVarSlice.begin(),
-                                              indVarSlice.end());
-
-  llvm::DenseSet<mlir::Operation *> outerLoopBodySet;
-  // The following walk collects ops inside `outerLoop` that are **not**:
-  // * the outer-loop itself,
-  // * or the inner-loop,
-  // * or the `fir.result` op (the outer-loop's terminator).
-  //
-  // For the above example, this will also populate `outerLoopBodySet` with ops
-  // in line 1 & 2 since we skip the `i` loop, the `j` loop, and the terminator.
-  outerLoop.walk<mlir::WalkOrder::PreOrder>([&](mlir::Operation *op) {
-    if (op == outerLoop)
-      return mlir::WalkResult::advance();
-
-    if (op == innerLoop)
-      return mlir::WalkResult::skip();
-
-    if (mlir::isa<fir::ResultOp>(op))
-      return mlir::WalkResult::advance();
-
-    outerLoopBodySet.insert(op);
-    return mlir::WalkResult::advance();
-  });
-
-  // If `outerLoopBodySet` ends up having the same ops as `indVarSet`, then
-  // `outerLoop` only contains ops that setup its induction variable +
-  // `innerLoop` + the `fir.result` terminator. In other words, `innerLoop` is
-  // perfectly nested inside `outerLoop`.
-  bool result = (outerLoopBodySet == indVarSet);
-  LLVM_DEBUG(DBGS() << "Loop pair starting at location " << outerLoop.getLoc()
-                    << " is" << (result ? "" : " not")
-                    << " perfectly nested\n");
-
-  return result;
-}
-
-/// Starting with `currentLoop` collect a perfectly nested loop nest, if any.
-/// This function collects as much as possible loops in the nest; it case it
-/// fails to recognize a certain nested loop as part of the nest it just returns
-/// the parent loops it discovered before.
-mlir::LogicalResult collectLoopNest(fir::DoLoopOp currentLoop,
-                                    LoopNestToIndVarMap &loopNest) {
-  assert(currentLoop.getUnordered());
-
-  while (true) {
-    loopNest.insert({currentLoop, InductionVariableInfo(currentLoop)});
-    llvm::SmallVector<fir::DoLoopOp> unorderedLoops;
-
-    for (auto nestedLoop : currentLoop.getRegion().getOps<fir::DoLoopOp>())
-      if (nestedLoop.getUnordered())
-        unorderedLoops.push_back(nestedLoop);
-
-    if (unorderedLoops.empty())
-      break;
-
-    // Having more than one unordered loop means that we are not dealing with a
-    // perfect loop nest (i.e. a mulit-range `do concurrent` loop); which is the
-    // case we are after here.
-    if (unorderedLoops.size() > 1)
-      return mlir::failure();
-
-    fir::DoLoopOp nestedUnorderedLoop = unorderedLoops.front();
-
-    if (!isPerfectlyNested(currentLoop, nestedUnorderedLoop))
-      return mlir::failure();
-
-    currentLoop = nestedUnorderedLoop;
-  }
-
-  return mlir::success();
-}
-
-/// Prepares the `fir.do_loop` nest to be easily mapped to OpenMP. In
-/// particular, this function would take this input IR:
-/// ```
-/// fir.do_loop %i_iv = %i_lb to %i_ub step %i_step unordered {
-///   fir.store %i_iv to %i#1 : !fir.ref<i32>
-///   %j_lb = arith.constant 1 : i32
-///   %j_ub = arith.constant 10 : i32
-///   %j_step = arith.constant 1 : index
-///
-///   fir.do_loop %j_iv = %j_lb to %j_ub step %j_step unordered {
-///     fir.store %j_iv to %j#1 : !fir.ref<i32>
-///     ...
-///   }
-/// }
-/// ```
-///
-/// into the following form (using generic op form since the result is
-/// technically an invalid `fir.do_loop` op:
-///
-/// ```
-/// "fir.do_loop"(%i_lb, %i_ub, %i_step) <{unordered}> ({
-/// ^bb0(%i_iv: index):
-///   %j_lb = "arith.constant"() <{value = 1 : i32}> : () -> i32
-///   %j_ub = "arith.constant"() <{value = 10 : i32}> : () -> i32
-///   %j_step = "arith.constant"() <{value = 1 : index}> : () -> index
-///
-///   "fir.do_loop"(%j_lb, %j_ub, %j_step) <{unordered}> ({
-///   ^bb0(%new_i_iv: index, %new_j_iv: index):
-///     "fir.store"(%new_i_iv, %i#1) : (i32, !fir.ref<i32>) -> ()
-///     "fir.store"(%new_j_iv, %j#1) : (i32, !fir.ref<i32>) -> ()
-///     ...
-///   })
-/// ```
-///
-/// What happened to the loop nest is the following:
-///
-/// * the innermost loop's entry block was updated from having one operand to
-///   having `n` operands where `n` is the number of loops in the nest,
-///
-/// * the outer loop(s)' ops that update the IVs were sank inside the innermost
-///   loop (see the `"fir.store"(%new_i_iv, %i#1)` op above),
-///
-/// * the innermost loop's entry block's arguments were mapped in order from the
-///   outermost to the innermost IV.
-///
-/// With this IR change, we can directly inline the innermost loop's region into
-/// the newly generated `omp.loop_nest` op.
-///
-/// Note that this function has a pre-condition that \p loopNest consists of
-/// perfectly nested loops; i.e. there are no in-between ops between 2 nested
-/// loops except for the ops to setup the inner loop's LB, UB, and step. These
-/// ops are handled/cloned by `genLoopNestClauseOps(..)`.
-void sinkLoopIVArgs(mlir::ConversionPatternRewriter &rewriter,
-                    looputils::LoopNestToIndVarMap &loopNest) {
-  if (loopNest.size() <= 1)
-    return;
-
-  fir::DoLoopOp innermostLoop = loopNest.back().first;
-  mlir::Operation &innermostFirstOp = innermostLoop.getRegion().front().front();
-
-  llvm::SmallVector<mlir::Type> argTypes;
-  llvm::SmallVector<mlir::Location> argLocs;
-
-  for (auto &[doLoop, indVarInfo] : llvm::drop_end(loopNest)) {
-    // Sink the IV update ops to the innermost loop. We need to do for all loops
-    // except for the innermost one, hence the `drop_end` usage above.
-    for (mlir::Operation *op : indVarInfo.indVarUpdateOps)
-      op->moveBefore(&innermostFirstOp);
-
-    argTypes.push_back(doLoop.getInductionVar().getType());
-    argLocs.push_back(doLoop.getInductionVar().getLoc());
-  }
-
-  mlir::Region &innermmostRegion = innermostLoop.getRegion();
-  // Extend the innermost entry block with arguments to represent the outer IVs.
-  innermmostRegion.addArguments(argTypes, argLocs);
-
-  unsigned idx = 1;
-  // In reverse, remap the IVs of the loop nest from the old values to the new
-  // ones. We do that in reverse since the first argument before this loop is
-  // the old IV for the innermost loop. Therefore, we want to replace it first
-  // before the old value (1st argument in the block) is remapped to be the IV
-  // of the outermost loop in the nest.
-  for (auto &[doLoop, _] : llvm::reverse(loopNest)) {
-    doLoop.getInductionVar().replaceAllUsesWith(
-        innermmostRegion.getArgument(innermmostRegion.getNumArguments() - idx));
-    ++idx;
-  }
-}
+using InductionVariableInfos = llvm::SmallVector<InductionVariableInfo>;
 
 /// Collects values that are local to a loop: "loop-local values". A loop-local
 /// value is one that is used exclusively inside the loop but allocated outside
@@ -326,9 +117,9 @@ void sinkLoopIVArgs(mlir::ConversionPatternRewriter &rewriter,
 /// used exclusively inside.
 ///
 /// \param [out] locals - the list of loop-local values detected for \p doLoop.
-void collectLoopLocalValues(fir::DoLoopOp doLoop,
+void collectLoopLocalValues(fir::DoConcurrentLoopOp loop,
                             llvm::SetVector<mlir::Value> &locals) {
-  doLoop.walk([&](mlir::Operation *op) {
+  loop.walk([&](mlir::Operation *op) {
     for (mlir::Value operand : op->getOperands()) {
       if (locals.contains(operand))
         continue;
@@ -340,11 +131,11 @@ void collectLoopLocalValues(fir::DoLoopOp doLoop,
 
       // Values defined inside the loop are not interesting since they do not
       // need to be localized.
-      if (doLoop->isAncestor(operand.getDefiningOp()))
+      if (loop->isAncestor(operand.getDefiningOp()))
         continue;
 
       for (auto *user : operand.getUsers()) {
-        if (!doLoop->isAncestor(user)) {
+        if (!loop->isAncestor(user)) {
           isLocal = false;
           break;
         }
@@ -373,39 +164,42 @@ static void localizeLoopLocalValue(mlir::Value local, mlir::Region &allocRegion,
 }
 } // namespace looputils
 
-class DoConcurrentConversion : public mlir::OpConversionPattern<fir::DoLoopOp> {
+class DoConcurrentConversion
+    : public mlir::OpConversionPattern<fir::DoConcurrentOp> {
 public:
-  using mlir::OpConversionPattern<fir::DoLoopOp>::OpConversionPattern;
+  using mlir::OpConversionPattern<fir::DoConcurrentOp>::OpConversionPattern;
 
-  DoConcurrentConversion(mlir::MLIRContext *context, bool mapToDevice,
-                         llvm::DenseSet<fir::DoLoopOp> &concurrentLoopsToSkip)
+  DoConcurrentConversion(
+      mlir::MLIRContext *context, bool mapToDevice,
+      llvm::DenseSet<fir::DoConcurrentOp> &concurrentLoopsToSkip)
       : OpConversionPattern(context), mapToDevice(mapToDevice),
         concurrentLoopsToSkip(concurrentLoopsToSkip) {}
 
   mlir::LogicalResult
-  matchAndRewrite(fir::DoLoopOp doLoop, OpAdaptor adaptor,
+  matchAndRewrite(fir::DoConcurrentOp doLoop, OpAdaptor adaptor,
                   mlir::ConversionPatternRewriter &rewriter) const override {
     if (mapToDevice)
       return doLoop.emitError(
           "not yet implemented: Mapping `do concurrent` loops to device");
 
-    looputils::LoopNestToIndVarMap loopNest;
-    bool hasRemainingNestedLoops =
-        failed(looputils::collectLoopNest(doLoop, loopNest));
-    if (hasRemainingNestedLoops)
-      mlir::emitWarning(doLoop.getLoc(),
-                        "Some `do concurent` loops are not perfectly-nested. "
-                        "These will be serialized.");
+    looputils::InductionVariableInfos ivInfos;
+    auto loop = mlir::cast<fir::DoConcurrentLoopOp>(
+        doLoop.getRegion().back().getTerminator());
+
+    auto indVars = loop.getLoopInductionVars();
+    assert(indVars.has_value());
+
+    for (mlir::Value indVar : *indVars)
+      ivInfos.emplace_back(loop, indVar);
 
     llvm::SetVector<mlir::Value> locals;
-    looputils::collectLoopLocalValues(loopNest.back().first, locals);
-    looputils::sinkLoopIVArgs(rewriter, loopNest);
+    looputils::collectLoopLocalValues(loop, locals);
 
     mlir::IRMapping mapper;
     mlir::omp::ParallelOp parallelOp =
-        genParallelOp(doLoop.getLoc(), rewriter, loopNest, mapper);
+        genParallelOp(doLoop.getLoc(), rewriter, ivInfos, mapper);
     mlir::omp::LoopNestOperands loopNestClauseOps;
-    genLoopNestClauseOps(doLoop.getLoc(), rewriter, loopNest, mapper,
+    genLoopNestClauseOps(doLoop.getLoc(), rewriter, loop, mapper,
                          loopNestClauseOps);
 
     for (mlir::Value local : locals)
@@ -413,41 +207,56 @@ class DoConcurrentConversion : public mlir::OpConversionPattern<fir::DoLoopOp> {
                                         rewriter);
 
     mlir::omp::LoopNestOp ompLoopNest =
-        genWsLoopOp(rewriter, loopNest.back().first, mapper, loopNestClauseOps,
+        genWsLoopOp(rewriter, loop, mapper, loopNestClauseOps,
                     /*isComposite=*/mapToDevice);
 
-    rewriter.eraseOp(doLoop);
+    rewriter.setInsertionPoint(doLoop);
+    fir::FirOpBuilder builder(
+        rewriter,
+        fir::getKindMapping(doLoop->getParentOfType<mlir::ModuleOp>()));
+
+    // Collect iteration variable(s) allocations so that we can move them
+    // outside the `fir.do_concurrent` wrapper (before erasing it).
+    llvm::SmallVector<mlir::Operation *> opsToMove;
+    for (mlir::Operation &op : llvm::drop_end(doLoop))
+      opsToMove.push_back(&op);
+
+    mlir::Block *allocBlock = builder.getAllocaBlock();
+
+    for (mlir::Operation *op : llvm::reverse(opsToMove)) {
+      rewriter.moveOpBefore(op, allocBlock, allocBlock->begin());
+    }
 
     // Mark `unordered` loops that are not perfectly nested to be skipped from
     // the legality check of the `ConversionTarget` since we are not interested
     // in mapping them to OpenMP.
-    ompLoopNest->walk([&](fir::DoLoopOp doLoop) {
-      if (doLoop.getUnordered()) {
-        concurrentLoopsToSkip.insert(doLoop);
-      }
+    ompLoopNest->walk([&](fir::DoConcurrentOp doLoop) {
+      concurrentLoopsToSkip.insert(doLoop);
     });
 
+    rewriter.eraseOp(doLoop);
+
     return mlir::success();
   }
 
 private:
-  mlir::omp::ParallelOp genParallelOp(mlir::Location loc,
-                                      mlir::ConversionPatternRewriter &rewriter,
-                                      looputils::LoopNestToIndVarMap &loopNest,
-                                      mlir::IRMapping &mapper) const {
+  mlir::omp::ParallelOp
+  genParallelOp(mlir::Location loc, mlir::ConversionPatternRewriter &rewriter,
+                looputils::InductionVariableInfos &ivInfos,
+                mlir::IRMapping &mapper) const {
     auto parallelOp = rewriter.create<mlir::omp::ParallelOp>(loc);
     rewriter.createBlock(&parallelOp.getRegion());
     rewriter.setInsertionPoint(rewriter.create<mlir::omp::TerminatorOp>(loc));
 
-    genLoopNestIndVarAllocs(rewriter, loopNest, mapper);
+    genLoopNestIndVarAllocs(rewriter, ivInfos, mapper);
     return parallelOp;
   }
 
   void genLoopNestIndVarAllocs(mlir::ConversionPatternRewriter &rewriter,
-                               looputils::LoopNestToIndVarMap &loopNest,
+                               looputils::InductionVariableInfos &ivInfos,
                                mlir::IRMapping &mapper) const {
 
-    for (auto &[_, indVarInfo] : loopNest)
+    for (auto &indVarInfo : ivInfos)
       genInductionVariableAlloc(rewriter, indVarInfo.iterVarMemDef, mapper);
   }
 
@@ -471,10 +280,11 @@ class DoConcurrentConversion : public mlir::OpConversionPattern<fir::DoLoopOp> {
     return result;
   }
 
-  void genLoopNestClauseOps(
-      mlir::Location loc, mlir::ConversionPatternRewriter &rewriter,
-      looputils::LoopNestToIndVarMap &loopNest, mlir::IRMapping &mapper,
-      mlir::omp::LoopNestOperands &loopNestClauseOps) const {
+  void
+  genLoopNestClauseOps(mlir::Location loc,
+                       mlir::ConversionPatternRewriter &rewriter,
+                       fir::DoConcurrentLoopOp loop, mlir::IRMapping &mapper,
+                       mlir::omp::LoopNestOperands &loopNestClauseOps) const {
     assert(loopNestClauseOps.loopLowerBounds.empty() &&
            "Loop nest bounds were already emitted!");
 
@@ -483,43 +293,42 @@ class DoConcurrentConversion : public mlir::OpConversionPattern<fir::DoLoopOp> {
       bounds.push_back(var.getDefiningOp()->getResult(0));
     };
 
-    for (auto &[doLoop, _] : loopNest) {
-      populateBounds(doLoop.getLowerBound(), loopNestClauseOps.loopLowerBounds);
-      populateBounds(doLoop.getUpperBound(), loopNestClauseOps.loopUpperBounds);
-      populateBounds(doLoop.getStep(), loopNestClauseOps.loopSteps);
+    for (auto [lb, ub, st] : llvm::zip_equal(
+             loop.getLowerBound(), loop.getUpperBound(), loop.getStep())) {
+      populateBounds(lb, loopNestClauseOps.loopLowerBounds);
+      populateBounds(ub, loopNestClauseOps.loopUpperBounds);
+      populateBounds(st, loopNestClauseOps.loopSteps);
     }
 
     loopNestClauseOps.loopInclusive = rewriter.getUnitAttr();
   }
 
   mlir::omp::LoopNestOp
-  genWsLoopOp(mlir::ConversionPatternRewriter &rewriter, fir::DoLoopOp doLoop,
-              mlir::IRMapping &mapper,
+  genWsLoopOp(mlir::ConversionPatternRewriter &rewriter,
+              fir::DoConcurrentLoopOp loop, mlir::IRMapping &mapper,
               const mlir::omp::LoopNestOperands &clauseOps,
               bool isComposite) const {
 
-    auto wsloopOp = rewriter.create<mlir::omp::WsloopOp>(doLoop.getLoc());
+    auto wsloopOp = rewriter.create<mlir::omp::WsloopOp>(loop.getLoc());
     wsloopOp.setComposite(isComposite);
     rewriter.createBlock(&wsloopOp.getRegion());
 
     auto loopNestOp =
-        rewriter.create<mlir::omp::LoopNestOp>(doLoop.getLoc(), clauseOps);
+        rewriter.create<mlir::omp::LoopNestOp>(loop.getLoc(), clauseOps);
 
     // Clone the loop's body inside the loop nest construct using the
     // mapped values.
-    rewriter.cloneRegionBefore(doLoop.getRegion(), loopNestOp.getRegion(),
+    rewriter.cloneRegionBefore(loop.getRegion(), loopNestOp.getRegion(),
                                loopNestOp.getRegion().begin(), mapper);
 
-    mlir::Operation *terminator = loopNestOp.getRegion().back().getTerminator();
     rewriter.setInsertionPointToEnd(&loopNestOp.getRegion().back());
-    rewriter.create<mlir::omp::YieldOp>(terminator->getLoc());
-    rewriter.eraseOp(terminator);
+    rewriter.create<mlir::omp::YieldOp>(loop->getLoc());
 
     return loopNestOp;
   }
 
   bool mapToDevice;
-  llvm::DenseSet<fir::DoLoopOp> &concurrentLoopsToSkip;
+  llvm::DenseSet<fir::DoConcurrentOp> &concurrentLoopsToSkip;
 };
 
 class DoConcurrentConversionPass
@@ -548,19 +357,16 @@ class DoConcurrentConversionPass
       return;
     }
 
-    llvm::DenseSet<fir::DoLoopOp> concurrentLoopsToSkip;
+    llvm::DenseSet<fir::DoConcurrentOp> concurrentLoopsToSkip;
     mlir::RewritePatternSet patterns(context);
     patterns.insert<DoConcurrentConversion>(
         context, mapTo == flangomp::DoConcurrentMappingKind::DCMK_Device,
         concurrentLoopsToSkip);
     mlir::ConversionTarget target(*context);
-    target.addDynamicallyLegalOp<fir::DoLoopOp>([&](fir::DoLoopOp op) {
-      // The goal is to handle constructs that eventually get lowered to
-      // `fir.do_loop` with the `unordered` attribute (e.g. array expressions).
-      // Currently, this is only enabled for the `do concurrent` construct since
-      // the pass runs early in the pipeline.
-      return !op.getUnordered() || concurrentLoopsToSkip.contains(op);
-    });
+    target.addDynamicallyLegalOp<fir::DoConcurrentOp>(
+        [&](fir::DoConcurrentOp op) {
+          return concurrentLoopsToSkip.contains(op);
+        });
     target.markUnknownOpDynamicallyLegal(
         [](mlir::Operation *) { return true; });
 
diff --git a/flang/test/Transforms/DoConcurrent/basic_device.mlir b/flang/test/Transforms/DoConcurrent/basic_device.mlir
index d7fcc40e4a7f9..0ca48943864c8 100644
--- a/flang/test/Transforms/DoConcurrent/basic_device.mlir
+++ b/flang/test/Transforms/DoConcurrent/basic_device.mlir
@@ -1,8 +1,6 @@
 // RUN: fir-opt --omp-do-concurrent-conversion="map-to=device" -verify-diagnostics %s
 
 func.func @do_concurrent_basic() attributes {fir.bindc_name = "do_concurrent_basic"} {
-    %0 = fir.alloca i32 {bindc_name = "i"}
-    %1:2 = hlfir.declare %0 {uniq_name = "_QFEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
     %2 = fir.address_of(@_QFEa) : !fir.ref<!fir.array<10xi32>>
     %c10 = arith.constant 10 : index
     %3 = fir.shape %c10 : (index) -> !fir.shape<1>
@@ -14,15 +12,19 @@ func.func @do_concurrent_basic() attributes {fir.bindc_name = "do_concurrent_bas
     %c1 = arith.constant 1 : index
 
     // expected-error at +2 {{not yet implemented: Mapping `do concurrent` loops to device}}
-    // expected-error at below {{failed to legalize operation 'fir.do_loop'}}
-    fir.do_loop %arg0 = %7 to %8 step %c1 unordered {
-      %13 = fir.convert %arg0 : (index) -> i32
-      fir.store %13 to %1#1 : !fir.ref<i32>
-      %14 = fir.load %1#0 : !fir.ref<i32>
-      %15 = fir.load %1#0 : !fir.ref<i32>
-      %16 = fir.convert %15 : (i32) -> i64
-      %17 = hlfir.designate %4#0 (%16)  : (!fir.ref<!fir.array<10xi32>>, i64) -> !fir.ref<i32>
-      hlfir.assign %14 to %17 : i32, !fir.ref<i32>
+    // expected-error at below {{failed to legalize operation 'fir.do_concurrent'}}
+    fir.do_concurrent {
+      %0 = fir.alloca i32 {bindc_name = "i"}
+      %1:2 = hlfir.declare %0 {uniq_name = "_QFEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+      fir.do_concurrent.loop (%arg0) = (%7) to (%8) step (%c1) {
+        %13 = fir.convert %arg0 : (index) -> i32
+        fir.store %13 to %1#1 : !fir.ref<i32>
+        %14 = fir.load %1#0 : !fir.ref<i32>
+        %15 = fir.load %1#0 : !fir.ref<i32>
+        %16 = fir.convert %15 : (i32) -> i64
+        %17 = hlfir.designate %4#0 (%16)  : (!fir.ref<!fir.array<10xi32>>, i64) -> !fir.ref<i32>
+        hlfir.assign %14 to %17 : i32, !fir.ref<i32>
+      }
     }
 
     return
diff --git a/flang/test/Transforms/DoConcurrent/basic_host.f90 b/flang/test/Transforms/DoConcurrent/basic_host.f90
index b84d4481ac766..12f63031cbaee 100644
--- a/flang/test/Transforms/DoConcurrent/basic_host.f90
+++ b/flang/test/Transforms/DoConcurrent/basic_host.f90
@@ -1,6 +1,3 @@
-! Fails until we update the pass to use the `fir.do_concurrent` op.
-! XFAIL: *
-
 ! Tests mapping of a basic `do concurrent` loop to `!$omp parallel do`.
 
 ! RUN: %flang_fc1 -emit-hlfir -fopenmp -fdo-concurrent-to-openmp=host %s -o - \
diff --git a/flang/test/Transforms/DoConcurrent/basic_host.mlir b/flang/test/Transforms/DoConcurrent/basic_host.mlir
index b53ecd687c039..5425829404d7b 100644
--- a/flang/test/Transforms/DoConcurrent/basic_host.mlir
+++ b/flang/test/Transforms/DoConcurrent/basic_host.mlir
@@ -6,8 +6,6 @@
 func.func @do_concurrent_basic() attributes {fir.bindc_name = "do_concurrent_basic"} {
     // CHECK: %[[ARR:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {uniq_name = "_QFEa"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
 
-    %0 = fir.alloca i32 {bindc_name = "i"}
-    %1:2 = hlfir.declare %0 {uniq_name = "_QFEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
     %2 = fir.address_of(@_QFEa) : !fir.ref<!fir.array<10xi32>>
     %c10 = arith.constant 10 : index
     %3 = fir.shape %c10 : (index) -> !fir.shape<1>
@@ -18,7 +16,7 @@ func.func @do_concurrent_basic() attributes {fir.bindc_name = "do_concurrent_bas
     %8 = fir.convert %c10_i32 : (i32) -> index
     %c1 = arith.constant 1 : index
 
-    // CHECK-NOT: fir.do_loop
+    // CHECK-NOT: fir.do_concurrent
 
     // CHECK: %[[C1:.*]] = arith.constant 1 : i32
     // CHECK: %[[LB:.*]] = fir.convert %[[C1]] : (i32) -> index
@@ -46,17 +44,21 @@ func.func @do_concurrent_basic() attributes {fir.bindc_name = "do_concurrent_bas
 
     // CHECK-NEXT: omp.terminator
     // CHECK-NEXT: }
-    fir.do_loop %arg0 = %7 to %8 step %c1 unordered {
-      %13 = fir.convert %arg0 : (index) -> i32
-      fir.store %13 to %1#1 : !fir.ref<i32>
-      %14 = fir.load %1#0 : !fir.ref<i32>
-      %15 = fir.load %1#0 : !fir.ref<i32>
-      %16 = fir.convert %15 : (i32) -> i64
-      %17 = hlfir.designate %4#0 (%16)  : (!fir.ref<!fir.array<10xi32>>, i64) -> !fir.ref<i32>
-      hlfir.assign %14 to %17 : i32, !fir.ref<i32>
+    fir.do_concurrent {
+      %0 = fir.alloca i32 {bindc_name = "i"}
+      %1:2 = hlfir.declare %0 {uniq_name = "_QFEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+      fir.do_concurrent.loop (%arg0) = (%7) to (%8) step (%c1) {
+        %13 = fir.convert %arg0 : (index) -> i32
+        fir.store %13 to %1#1 : !fir.ref<i32>
+        %14 = fir.load %1#0 : !fir.ref<i32>
+        %15 = fir.load %1#0 : !fir.ref<i32>
+        %16 = fir.convert %15 : (i32) -> i64
+        %17 = hlfir.designate %4#0 (%16)  : (!fir.ref<!fir.array<10xi32>>, i64) -> !fir.ref<i32>
+        hlfir.assign %14 to %17 : i32, !fir.ref<i32>
+      }
     }
 
-    // CHECK-NOT: fir.do_loop
+    // CHECK-NOT: fir.do_concurrent
 
     return
   }
diff --git a/flang/test/Transforms/DoConcurrent/locally_destroyed_temp.f90 b/flang/test/Transforms/DoConcurrent/locally_destroyed_temp.f90
index 4e13c0919589a..f82696669eca6 100644
--- a/flang/test/Transforms/DoConcurrent/locally_destroyed_temp.f90
+++ b/flang/test/Transforms/DoConcurrent/locally_destroyed_temp.f90
@@ -1,6 +1,3 @@
-! Fails until we update the pass to use the `fir.do_concurrent` op.
-! XFAIL: *
-
 ! Tests that "loop-local values" are properly handled by localizing them to the
 ! body of the loop nest. See `collectLoopLocalValues` and `localizeLoopLocalValue`
 ! for a definition of "loop-local values" and how they are handled.
diff --git a/flang/test/Transforms/DoConcurrent/loop_nest_test.f90 b/flang/test/Transforms/DoConcurrent/loop_nest_test.f90
deleted file mode 100644
index adc4a488d1ec9..0000000000000
--- a/flang/test/Transforms/DoConcurrent/loop_nest_test.f90
+++ /dev/null
@@ -1,92 +0,0 @@
-! Fails until we update the pass to use the `fir.do_concurrent` op.
-! XFAIL: *
-
-! Tests loop-nest detection algorithm for do-concurrent mapping.
-
-! REQUIRES: asserts
-
-! RUN: %flang_fc1 -emit-hlfir  -fopenmp -fdo-concurrent-to-openmp=host \
-! RUN:   -mmlir -debug -mmlir -mlir-disable-threading %s -o - 2> %t.log || true
-
-! RUN: FileCheck %s < %t.log
-
-program main
-  implicit none
-
-contains
-
-subroutine foo(n)
-  implicit none
-  integer :: n, m
-  integer :: i, j, k
-  integer :: x
-  integer, dimension(n) :: a
-  integer, dimension(n, n, n) :: b
-
-  ! CHECK: Loop pair starting at location
-  ! CHECK: loc("{{.*}}":[[# @LINE + 1]]:{{.*}}) is perfectly nested
-  do concurrent(i=1:n, j=1:bar(n*m, n/m))
-    a(i) = n
-  end do
-
-  ! CHECK: Loop pair starting at location
-  ! CHECK: loc("{{.*}}":[[# @LINE + 1]]:{{.*}}) is perfectly nested
-  do concurrent(i=bar(n, x):n, j=1:bar(n*m, n/m))
-    a(i) = n
-  end do
-
-  ! CHECK: Loop pair starting at location
-  ! CHECK: loc("{{.*}}":[[# @LINE + 1]]:{{.*}}) is not perfectly nested
-  do concurrent(i=bar(n, x):n)
-    do concurrent(j=1:bar(n*m, n/m))
-      a(i) = n
-    end do
-  end do
-
-  ! CHECK: Loop pair starting at location
-  ! CHECK: loc("{{.*}}":[[# @LINE + 1]]:{{.*}}) is not perfectly nested
-  do concurrent(i=1:n)
-    x = 10
-    do concurrent(j=1:m)
-      b(i,j,k) = i * j + k
-    end do
-  end do
-
-  ! CHECK: Loop pair starting at location
-  ! CHECK: loc("{{.*}}":[[# @LINE + 1]]:{{.*}}) is not perfectly nested
-  do concurrent(i=1:n)
-    do concurrent(j=1:m)
-      b(i,j,k) = i * j + k
-    end do
-    x = 10
-  end do
-
-  ! CHECK: Loop pair starting at location
-  ! CHECK: loc("{{.*}}":[[# @LINE + 1]]:{{.*}}) is not perfectly nested
-  do concurrent(i=1:n)
-    do concurrent(j=1:m)
-      b(i,j,k) = i * j + k
-      x = 10
-    end do
-  end do
-
-  ! Verify the (i,j) and (j,k) pairs of loops are detected as perfectly nested.
-  !
-  ! CHECK: Loop pair starting at location
-  ! CHECK: loc("{{.*}}":[[# @LINE + 3]]:{{.*}}) is perfectly nested
-  ! CHECK: Loop pair starting at location
-  ! CHECK: loc("{{.*}}":[[# @LINE + 1]]:{{.*}}) is perfectly nested
-  do concurrent(i=bar(n, x):n, j=1:bar(n*m, n/m), k=1:bar(n*m, bar(n*m, n/m)))
-    a(i) = n
-  end do
-end subroutine
-
-pure function bar(n, m)
-    implicit none
-    integer, intent(in) :: n, m
-    integer :: bar
-
-    bar = n + m
-end function
-
-end program main
diff --git a/flang/test/Transforms/DoConcurrent/multiple_iteration_ranges.f90 b/flang/test/Transforms/DoConcurrent/multiple_iteration_ranges.f90
index 26800678d381c..d0210726de83e 100644
--- a/flang/test/Transforms/DoConcurrent/multiple_iteration_ranges.f90
+++ b/flang/test/Transforms/DoConcurrent/multiple_iteration_ranges.f90
@@ -1,6 +1,3 @@
-! Fails until we update the pass to use the `fir.do_concurrent` op.
-! XFAIL: *
-
 ! Tests mapping of a `do concurrent` loop with multiple iteration ranges.
 
 ! RUN: split-file %s %t
diff --git a/flang/test/Transforms/DoConcurrent/non_const_bounds.f90 b/flang/test/Transforms/DoConcurrent/non_const_bounds.f90
index 23a3aae976c07..cd1bd4f98a3f5 100644
--- a/flang/test/Transforms/DoConcurrent/non_const_bounds.f90
+++ b/flang/test/Transforms/DoConcurrent/non_const_bounds.f90
@@ -1,6 +1,3 @@
-! Fails until we update the pass to use the `fir.do_concurrent` op.
-! XFAIL: *
-
 ! RUN: %flang_fc1 -emit-hlfir -fopenmp -fdo-concurrent-to-openmp=host %s -o - \
 ! RUN:   | FileCheck %s
 
diff --git a/flang/test/Transforms/DoConcurrent/not_perfectly_nested.f90 b/flang/test/Transforms/DoConcurrent/not_perfectly_nested.f90
index d1c02101318ab..74799359e0476 100644
--- a/flang/test/Transforms/DoConcurrent/not_perfectly_nested.f90
+++ b/flang/test/Transforms/DoConcurrent/not_perfectly_nested.f90
@@ -1,6 +1,3 @@
-! Fails until we update the pass to use the `fir.do_concurrent` op.
-! XFAIL: *
-
 ! Tests that if `do concurrent` is not perfectly nested in its parent loop, that
 ! we skip converting the not-perfectly nested `do concurrent` loop.
 
@@ -22,23 +19,24 @@ program main
    end do
 end
 
-! CHECK: %[[ORIG_K_ALLOC:.*]] = fir.alloca i32 {bindc_name = "k"}
-! CHECK: %[[ORIG_K_DECL:.*]]:2 = hlfir.declare %[[ORIG_K_ALLOC]]
-
-! CHECK: %[[ORIG_J_ALLOC:.*]] = fir.alloca i32 {bindc_name = "j"}
-! CHECK: %[[ORIG_J_DECL:.*]]:2 = hlfir.declare %[[ORIG_J_ALLOC]]
-
 ! CHECK: omp.parallel {
 
 ! CHECK: omp.wsloop {
 ! CHECK: omp.loop_nest ({{[^[:space:]]+}}) {{.*}} {
-! CHECK:   fir.do_loop %[[J_IV:.*]] = {{.*}} {
-! CHECK:     %[[J_IV_CONV:.*]] = fir.convert %[[J_IV]] : (index) -> i32
+! CHECK:   fir.do_concurrent {
+
+! CHECK:     %[[ORIG_J_ALLOC:.*]] = fir.alloca i32 {bindc_name = "j"}
+! CHECK:     %[[ORIG_J_DECL:.*]]:2 = hlfir.declare %[[ORIG_J_ALLOC]]
+
+! CHECK:     %[[ORIG_K_ALLOC:.*]] = fir.alloca i32 {bindc_name = "k"}
+! CHECK:     %[[ORIG_K_DECL:.*]]:2 = hlfir.declare %[[ORIG_K_ALLOC]]
+
+! CHECK:     fir.do_concurrent.loop (%[[J_IV:.*]], %[[K_IV:.*]]) = {{.*}} {
+! CHECK:       %[[J_IV_CONV:.*]] = fir.convert %[[J_IV]] : (index) -> i32
 ! CHECK:       fir.store %[[J_IV_CONV]] to %[[ORIG_J_DECL]]#0
 
-! CHECK:     fir.do_loop %[[K_IV:.*]] = {{.*}} {
 ! CHECK:       %[[K_IV_CONV:.*]] = fir.convert %[[K_IV]] : (index) -> i32
-! CHECK:         fir.store %[[K_IV_CONV]] to %[[ORIG_K_DECL]]#0
+! CHECK:       fir.store %[[K_IV_CONV]] to %[[ORIG_K_DECL]]#0
 ! CHECK:     }
 ! CHECK:   }
 ! CHECK: omp.yield