[flang-commits] [flang] 30990c0 - Revert "[flang][fir] Lower `do concurrent` loop nests to `fir.do_concurrent` (#132904)" (#135904)

Wed Apr 16 05:20:31 PDT 2025

Author: Kareem Ergawy
Date: 2025-04-16T07:20:27-05:00
New Revision: 30990c09c99bdcbfa7084d32b2b9851e19b6fb2a

URL: https://github.com/llvm/llvm-project/commit/30990c09c99bdcbfa7084d32b2b9851e19b6fb2a
DIFF: https://github.com/llvm/llvm-project/commit/30990c09c99bdcbfa7084d32b2b9851e19b6fb2a.diff

LOG: Revert "[flang][fir] Lower `do concurrent` loop nests to `fir.do_concurrent` (#132904)" (#135904)

This reverts commit 04b87e15e40f8857e29ade8321b8b67691545a50.

The reasons for reverting is that the following:
1. I still need need to upstream some part of the do concurrent to
OpenMP pass from our downstream implementation and taking this in
downstream will make things more difficult.
2. I still need to work on a solution for modeling locality specifiers
on `hlfir.do_concurrent` ops. I would prefer to do that and merge the
entire stack together instead of having a partial solution.

After merging the revert I will reopen the origianl PR and keep it
updated against main until I finish the above.

Added: 
    

Modified: 
    flang/lib/Lower/Bridge.cpp
    flang/lib/Optimizer/Builder/FIRBuilder.cpp
    flang/test/Lower/do_concurrent.f90
    flang/test/Lower/do_concurrent_local_default_init.f90
    flang/test/Lower/loops.f90
    flang/test/Lower/loops3.f90
    flang/test/Lower/nsw.f90
    flang/test/Transforms/DoConcurrent/basic_host.f90
    flang/test/Transforms/DoConcurrent/locally_destroyed_temp.f90
    flang/test/Transforms/DoConcurrent/loop_nest_test.f90
    flang/test/Transforms/DoConcurrent/multiple_iteration_ranges.f90
    flang/test/Transforms/DoConcurrent/non_const_bounds.f90
    flang/test/Transforms/DoConcurrent/not_perfectly_nested.f90

Removed: 
    


################################################################################
diff  --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp
index 625dd116fe726..b4d1197822a43 100644

--- a/flang/lib/Lower/Bridge.cpp
+++ b/flang/lib/Lower/Bridge.cpp
@@ -94,11 +94,10 @@ struct IncrementLoopInfo {
   template <typename T>
   explicit IncrementLoopInfo(Fortran::semantics::Symbol &sym, const T &lower,
                              const T &upper, const std::optional<T> &step,
-                             bool isConcurrent = false)
+                             bool isUnordered = false)
       : loopVariableSym{&sym}, lowerExpr{Fortran::semantics::GetExpr(lower)},
         upperExpr{Fortran::semantics::GetExpr(upper)},
-        stepExpr{Fortran::semantics::GetExpr(step)},
-        isConcurrent{isConcurrent} {}
+        stepExpr{Fortran::semantics::GetExpr(step)}, isUnordered{isUnordered} {}
 
   IncrementLoopInfo(IncrementLoopInfo &&) = default;
   IncrementLoopInfo &operator=(IncrementLoopInfo &&x) = default;
@@ -121,7 +120,7 @@ struct IncrementLoopInfo {
   const Fortran::lower::SomeExpr *upperExpr;
   const Fortran::lower::SomeExpr *stepExpr;
   const Fortran::lower::SomeExpr *maskExpr = nullptr;
-  bool isConcurrent;
+  bool isUnordered; // do concurrent, forall
   llvm::SmallVector<const Fortran::semantics::Symbol *> localSymList;
   llvm::SmallVector<const Fortran::semantics::Symbol *> localInitSymList;
   llvm::SmallVector<
@@ -131,7 +130,7 @@ struct IncrementLoopInfo {
   mlir::Value loopVariable = nullptr;
 
   // Data members for structured loops.
-  mlir::Operation *loopOp = nullptr;
+  fir::DoLoopOp doLoop = nullptr;
 
   // Data members for unstructured loops.
   bool hasRealControl = false;
@@ -1981,7 +1980,7 @@ class FirConverter : public Fortran::lower::AbstractConverter {
     llvm_unreachable("illegal reduction operator");
   }
 
-  /// Collect DO CONCURRENT loop control information.
+  /// Collect DO CONCURRENT or FORALL loop control information.
   IncrementLoopNestInfo getConcurrentControl(
       const Fortran::parser::ConcurrentHeader &header,
       const std::list<Fortran::parser::LocalitySpec> &localityList = {}) {
@@ -2292,14 +2291,8 @@ class FirConverter : public Fortran::lower::AbstractConverter {
     mlir::LLVM::LoopAnnotationAttr la = mlir::LLVM::LoopAnnotationAttr::get(
         builder->getContext(), {}, /*vectorize=*/va, {}, /*unroll*/ ua,
         /*unroll_and_jam*/ uja, {}, {}, {}, {}, {}, {}, {}, {}, {}, {});
-    if (has_attrs) {
-      if (auto loopOp = mlir::dyn_cast<fir::DoLoopOp>(info.loopOp))
-        loopOp.setLoopAnnotationAttr(la);
-
-      if (auto doConcurrentOp =
-              mlir::dyn_cast<fir::DoConcurrentLoopOp>(info.loopOp))
-        doConcurrentOp.setLoopAnnotationAttr(la);
-    }
+    if (has_attrs)
+      info.doLoop.setLoopAnnotationAttr(la);
   }
 
   /// Generate FIR to begin a structured or unstructured increment loop nest.
@@ -2308,77 +2301,96 @@ class FirConverter : public Fortran::lower::AbstractConverter {
       llvm::SmallVectorImpl<const Fortran::parser::CompilerDirective *> &dirs) {
     assert(!incrementLoopNestInfo.empty() && "empty loop nest");
     mlir::Location loc = toLocation();
+    mlir::Operation *boundsAndStepIP = nullptr;
     mlir::arith::IntegerOverflowFlags iofBackup{};
 
-    llvm::SmallVector<mlir::Value> nestLBs;
-    llvm::SmallVector<mlir::Value> nestUBs;
-    llvm::SmallVector<mlir::Value> nestSts;
-    llvm::SmallVector<mlir::Value> nestReduceOperands;
-    llvm::SmallVector<mlir::Attribute> nestReduceAttrs;
-    bool genDoConcurrent = false;
-
     for (IncrementLoopInfo &info : incrementLoopNestInfo) {
-      genDoConcurrent = info.isStructured() && info.isConcurrent;
-
-      if (!genDoConcurrent)
-        info.loopVariable = genLoopVariableAddress(loc, *info.loopVariableSym,
-                                                   info.isConcurrent);
-
-      if (!getLoweringOptions().getIntegerWrapAround()) {
-        iofBackup = builder->getIntegerOverflowFlags();
-        builder->setIntegerOverflowFlags(
-            mlir::arith::IntegerOverflowFlags::nsw);
-      }
+      mlir::Value lowerValue;
+      mlir::Value upperValue;
+      mlir::Value stepValue;
 
-      nestLBs.push_back(genControlValue(info.lowerExpr, info));
-      nestUBs.push_back(genControlValue(info.upperExpr, info));
-      bool isConst = true;
-      nestSts.push_back(genControlValue(
-          info.stepExpr, info, info.isStructured() ? nullptr : &isConst));
+      {
+        mlir::OpBuilder::InsertionGuard guard(*builder);
 
-      if (!getLoweringOptions().getIntegerWrapAround())
-        builder->setIntegerOverflowFlags(iofBackup);
+        // Set the IP before the first loop in the nest so that all nest bounds
+        // and step values are created outside the nest.
+        if (boundsAndStepIP)
+          builder->setInsertionPointAfter(boundsAndStepIP);
 
-      // Use a temp variable for unstructured loops with non-const step.
-      if (!isConst) {
-        mlir::Value stepValue = nestSts.back();
-        info.stepVariable = builder->createTemporary(loc, stepValue.getType());
-        builder->create<fir::StoreOp>(loc, stepValue, info.stepVariable);
-      }
-
-      if (genDoConcurrent && nestReduceOperands.empty()) {
-        // Create DO CONCURRENT reduce operands and attributes
-        for (const auto &reduceSym : info.reduceSymList) {
-          const fir::ReduceOperationEnum reduceOperation = reduceSym.first;
-          const Fortran::semantics::Symbol *sym = reduceSym.second;
-          fir::ExtendedValue exv = getSymbolExtendedValue(*sym, nullptr);
-          nestReduceOperands.push_back(fir::getBase(exv));
-          auto reduceAttr =
-              fir::ReduceAttr::get(builder->getContext(), reduceOperation);
-          nestReduceAttrs.push_back(reduceAttr);
+        info.loopVariable = genLoopVariableAddress(loc, *info.loopVariableSym,
+                                                   info.isUnordered);
+        if (!getLoweringOptions().getIntegerWrapAround()) {
+          iofBackup = builder->getIntegerOverflowFlags();
+          builder->setIntegerOverflowFlags(
+              mlir::arith::IntegerOverflowFlags::nsw);
+        }
+        lowerValue = genControlValue(info.lowerExpr, info);
+        upperValue = genControlValue(info.upperExpr, info);
+        bool isConst = true;
+        stepValue = genControlValue(info.stepExpr, info,
+                                    info.isStructured() ? nullptr : &isConst);
+        if (!getLoweringOptions().getIntegerWrapAround())
+          builder->setIntegerOverflowFlags(iofBackup);
+        boundsAndStepIP = stepValue.getDefiningOp();
+
+        // Use a temp variable for unstructured loops with non-const step.
+        if (!isConst) {
+          info.stepVariable =
+              builder->createTemporary(loc, stepValue.getType());
+          boundsAndStepIP =
+              builder->create<fir::StoreOp>(loc, stepValue, info.stepVariable);
         }
       }
-    }
 
-    for (auto [info, lowerValue, upperValue, stepValue] :
-         llvm::zip_equal(incrementLoopNestInfo, nestLBs, nestUBs, nestSts)) {
       // Structured loop - generate fir.do_loop.
       if (info.isStructured()) {
-        if (genDoConcurrent)
-          continue;
-
-        // The loop variable is a doLoop op argument.
         mlir::Type loopVarType = info.getLoopVariableType();
-        auto loopOp = builder->create<fir::DoLoopOp>(
-            loc, lowerValue, upperValue, stepValue, /*unordered=*/false,
-            /*finalCountValue=*/true,
-            builder->createConvert(loc, loopVarType, lowerValue));
-        info.loopOp = loopOp;
-        builder->setInsertionPointToStart(loopOp.getBody());
-        mlir::Value loopValue = loopOp.getRegionIterArgs()[0];
-
+        mlir::Value loopValue;
+        if (info.isUnordered) {
+          llvm::SmallVector<mlir::Value> reduceOperands;
+          llvm::SmallVector<mlir::Attribute> reduceAttrs;
+          // Create DO CONCURRENT reduce operands and attributes
+          for (const auto &reduceSym : info.reduceSymList) {
+            const fir::ReduceOperationEnum reduce_operation = reduceSym.first;
+            const Fortran::semantics::Symbol *sym = reduceSym.second;
+            fir::ExtendedValue exv = getSymbolExtendedValue(*sym, nullptr);
+            reduceOperands.push_back(fir::getBase(exv));
+            auto reduce_attr =
+                fir::ReduceAttr::get(builder->getContext(), reduce_operation);
+            reduceAttrs.push_back(reduce_attr);
+          }
+          // The loop variable value is explicitly updated.
+          info.doLoop = builder->create<fir::DoLoopOp>(
+              loc, lowerValue, upperValue, stepValue, /*unordered=*/true,
+              /*finalCountValue=*/false, /*iterArgs=*/std::nullopt,
+              llvm::ArrayRef<mlir::Value>(reduceOperands), reduceAttrs);
+          builder->setInsertionPointToStart(info.doLoop.getBody());
+          loopValue = builder->createConvert(loc, loopVarType,
+                                             info.doLoop.getInductionVar());
+        } else {
+          // The loop variable is a doLoop op argument.
+          info.doLoop = builder->create<fir::DoLoopOp>(
+              loc, lowerValue, upperValue, stepValue, /*unordered=*/false,
+              /*finalCountValue=*/true,
+              builder->createConvert(loc, loopVarType, lowerValue));
+          builder->setInsertionPointToStart(info.doLoop.getBody());
+          loopValue = info.doLoop.getRegionIterArgs()[0];
+        }
         // Update the loop variable value in case it has non-index references.
         builder->create<fir::StoreOp>(loc, loopValue, info.loopVariable);
+        if (info.maskExpr) {
+          Fortran::lower::StatementContext stmtCtx;
+          mlir::Value maskCond = createFIRExpr(loc, info.maskExpr, stmtCtx);
+          stmtCtx.finalizeAndReset();
+          mlir::Value maskCondCast =
+              builder->createConvert(loc, builder->getI1Type(), maskCond);
+          auto ifOp = builder->create<fir::IfOp>(loc, maskCondCast,
+                                                 /*withElseRegion=*/false);
+          builder->setInsertionPointToStart(&ifOp.getThenRegion().front());
+        }
+        if (info.hasLocalitySpecs())
+          handleLocalitySpecs(info);
+
         addLoopAnnotationAttr(info, dirs);
         continue;
       }
@@ -2442,60 +2454,6 @@ class FirConverter : public Fortran::lower::AbstractConverter {
         builder->restoreInsertionPoint(insertPt);
       }
     }
-
-    if (genDoConcurrent) {
-      auto loopWrapperOp = builder->create<fir::DoConcurrentOp>(loc);
-      builder->setInsertionPointToStart(
-          builder->createBlock(&loopWrapperOp.getRegion()));
-
-      for (IncrementLoopInfo &info : llvm::reverse(incrementLoopNestInfo)) {
-        info.loopVariable = genLoopVariableAddress(loc, *info.loopVariableSym,
-                                                   info.isConcurrent);
-      }
-
-      builder->setInsertionPointToEnd(loopWrapperOp.getBody());
-      auto loopOp = builder->create<fir::DoConcurrentLoopOp>(
-          loc, nestLBs, nestUBs, nestSts, nestReduceOperands,
-          nestReduceAttrs.empty()
-              ? nullptr
-              : mlir::ArrayAttr::get(builder->getContext(), nestReduceAttrs),
-          nullptr);
-
-      llvm::SmallVector<mlir::Type> loopBlockArgTypes(
-          incrementLoopNestInfo.size(), builder->getIndexType());
-      llvm::SmallVector<mlir::Location> loopBlockArgLocs(
-          incrementLoopNestInfo.size(), loc);
-      mlir::Region &loopRegion = loopOp.getRegion();
-      mlir::Block *loopBlock = builder->createBlock(
-          &loopRegion, loopRegion.begin(), loopBlockArgTypes, loopBlockArgLocs);
-      builder->setInsertionPointToStart(loopBlock);
-
-      for (auto [info, blockArg] :
-           llvm::zip_equal(incrementLoopNestInfo, loopBlock->getArguments())) {
-        info.loopOp = loopOp;
-        mlir::Value loopValue =
-            builder->createConvert(loc, info.getLoopVariableType(), blockArg);
-        builder->create<fir::StoreOp>(loc, loopValue, info.loopVariable);
-
-        if (info.maskExpr) {
-          Fortran::lower::StatementContext stmtCtx;
-          mlir::Value maskCond = createFIRExpr(loc, info.maskExpr, stmtCtx);
-          stmtCtx.finalizeAndReset();
-          mlir::Value maskCondCast =
-              builder->createConvert(loc, builder->getI1Type(), maskCond);
-          auto ifOp = builder->create<fir::IfOp>(loc, maskCondCast,
-                                                 /*withElseRegion=*/false);
-          builder->setInsertionPointToStart(&ifOp.getThenRegion().front());
-        }
-      }
-
-      IncrementLoopInfo &innermostInfo = incrementLoopNestInfo.back();
-
-      if (innermostInfo.hasLocalitySpecs())
-        handleLocalitySpecs(innermostInfo);
-
-      addLoopAnnotationAttr(innermostInfo, dirs);
-    }
   }
 
   /// Generate FIR to end a structured or unstructured increment loop nest.
@@ -2512,31 +2470,29 @@ class FirConverter : public Fortran::lower::AbstractConverter {
          it != rend; ++it) {
       IncrementLoopInfo &info = *it;
       if (info.isStructured()) {
-        // End fir.do_concurent.loop.
-        if (info.isConcurrent) {
-          builder->setInsertionPointAfter(info.loopOp->getParentOp());
+        // End fir.do_loop.
+        if (info.isUnordered) {
+          builder->setInsertionPointAfter(info.doLoop);
           continue;
         }
-
-        // End fir.do_loop.
         // Decrement tripVariable.
-        auto doLoopOp = mlir::cast<fir::DoLoopOp>(info.loopOp);
-        builder->setInsertionPointToEnd(doLoopOp.getBody());
+        builder->setInsertionPointToEnd(info.doLoop.getBody());
         llvm::SmallVector<mlir::Value, 2> results;
         results.push_back(builder->create<mlir::arith::AddIOp>(
-            loc, doLoopOp.getInductionVar(), doLoopOp.getStep(), iofAttr));
+            loc, info.doLoop.getInductionVar(), info.doLoop.getStep(),
+            iofAttr));
         // Step loopVariable to help optimizations such as vectorization.
         // Induction variable elimination will clean up as necessary.
         mlir::Value step = builder->createConvert(
-            loc, info.getLoopVariableType(), doLoopOp.getStep());
+            loc, info.getLoopVariableType(), info.doLoop.getStep());
         mlir::Value loopVar =
             builder->create<fir::LoadOp>(loc, info.loopVariable);
         results.push_back(
             builder->create<mlir::arith::AddIOp>(loc, loopVar, step, iofAttr));
         builder->create<fir::ResultOp>(loc, results);
-        builder->setInsertionPointAfter(doLoopOp);
+        builder->setInsertionPointAfter(info.doLoop);
         // The loop control variable may be used after the loop.
-        builder->create<fir::StoreOp>(loc, doLoopOp.getResult(1),
+        builder->create<fir::StoreOp>(loc, info.doLoop.getResult(1),
                                       info.loopVariable);
         continue;
       }

diff  --git a/flang/lib/Optimizer/Builder/FIRBuilder.cpp b/flang/lib/Optimizer/Builder/FIRBuilder.cpp
index d35367d7657cf..3cf9b5ae72d9e 100644
--- a/flang/lib/Optimizer/Builder/FIRBuilder.cpp
+++ b/flang/lib/Optimizer/Builder/FIRBuilder.cpp
@@ -280,9 +280,6 @@ mlir::Block *fir::FirOpBuilder::getAllocaBlock() {
   if (auto cufKernelOp = getRegion().getParentOfType<cuf::KernelOp>())
     return &cufKernelOp.getRegion().front();
 
-  if (auto doConcurentOp = getRegion().getParentOfType<fir::DoConcurrentOp>())
-    return doConcurentOp.getBody();
-
   return getEntryBlock();
 }
 

diff  --git a/flang/test/Lower/do_concurrent.f90 b/flang/test/Lower/do_concurrent.f90
index cc113f59c35e3..ef93d2d6b035b 100644
--- a/flang/test/Lower/do_concurrent.f90
+++ b/flang/test/Lower/do_concurrent.f90
@@ -14,9 +14,6 @@ subroutine sub1(n)
    implicit none
    integer :: n, m, i, j, k
    integer, dimension(n) :: a
-!CHECK: %[[N_DECL:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{.*}} {uniq_name = "_QFsub1En"}
-!CHECK: %[[A_DECL:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {uniq_name = "_QFsub1Ea"}
-
 !CHECK: %[[LB1:.*]] = arith.constant 1 : i32
 !CHECK: %[[LB1_CVT:.*]] = fir.convert %[[LB1]] : (i32) -> index
 !CHECK: %[[UB1:.*]] = fir.load %{{.*}}#0 : !fir.ref<i32>
@@ -32,30 +29,10 @@ subroutine sub1(n)
 !CHECK: %[[UB3:.*]] = arith.constant 10 : i32
 !CHECK: %[[UB3_CVT:.*]] = fir.convert %[[UB3]] : (i32) -> index
 
-!CHECK: fir.do_concurrent
-!CHECK:   %[[I:.*]] = fir.alloca i32 {bindc_name = "i"}
-!CHECK:   %[[I_DECL:.*]]:2 = hlfir.declare %[[I]]
-!CHECK:   %[[J:.*]] = fir.alloca i32 {bindc_name = "j"}
-!CHECK:   %[[J_DECL:.*]]:2 = hlfir.declare %[[J]]
-!CHECK:   %[[K:.*]] = fir.alloca i32 {bindc_name = "k"}
-!CHECK:   %[[K_DECL:.*]]:2 = hlfir.declare %[[K]]
-
-!CHECK:   fir.do_concurrent.loop (%[[I_IV:.*]], %[[J_IV:.*]], %[[K_IV:.*]]) =
-!CHECK-SAME:                     (%[[LB1_CVT]], %[[LB2_CVT]], %[[LB3_CVT]]) to
-!CHECK-SAME:                     (%[[UB1_CVT]], %[[UB2_CVT]], %[[UB3_CVT]]) step
-!CHECK-SAME:                     (%{{.*}}, %{{.*}}, %{{.*}}) {
-!CHECK:       %[[I_IV_CVT:.*]] = fir.convert %[[I_IV]] : (index) -> i32
-!CHECK:       fir.store %[[I_IV_CVT]] to %[[I_DECL]]#0 : !fir.ref<i32>
-!CHECK:       %[[J_IV_CVT:.*]] = fir.convert %[[J_IV]] : (index) -> i32
-!CHECK:       fir.store %[[J_IV_CVT]] to %[[J_DECL]]#0 : !fir.ref<i32>
-!CHECK:       %[[K_IV_CVT:.*]] = fir.convert %[[K_IV]] : (index) -> i32
-!CHECK:       fir.store %[[K_IV_CVT]] to %[[K_DECL]]#0 : !fir.ref<i32>
+!CHECK: fir.do_loop %{{.*}} = %[[LB1_CVT]] to %[[UB1_CVT]] step %{{.*}} unordered
+!CHECK: fir.do_loop %{{.*}} = %[[LB2_CVT]] to %[[UB2_CVT]] step %{{.*}} unordered
+!CHECK: fir.do_loop %{{.*}} = %[[LB3_CVT]] to %[[UB3_CVT]] step %{{.*}} unordered
 
-!CHECK:       %[[N_VAL:.*]] = fir.load %[[N_DECL]]#0 : !fir.ref<i32>
-!CHECK:       %[[I_VAL:.*]] = fir.load %[[I_DECL]]#0 : !fir.ref<i32>
-!CHECK:       %[[I_VAL_CVT:.*]] = fir.convert %[[I_VAL]] : (i32) -> i64
-!CHECK:       %[[A_ELEM:.*]] = hlfir.designate %[[A_DECL]]#0 (%[[I_VAL_CVT]])
-!CHECK:       hlfir.assign %[[N_VAL]] to %[[A_ELEM]] : i32, !fir.ref<i32>
    do concurrent(i=1:n, j=1:bar(n*m, n/m), k=5:10)
       a(i) = n
    end do
@@ -68,17 +45,14 @@ subroutine sub2(n)
    integer, dimension(n) :: a
 !CHECK: %[[LB1:.*]] = arith.constant 1 : i32
 !CHECK: %[[LB1_CVT:.*]] = fir.convert %[[LB1]] : (i32) -> index
-!CHECK: %[[UB1:.*]] = fir.load %{{.*}}#0 : !fir.ref<i32>
+!CHECK: %[[UB1:.*]] = fir.load %5#0 : !fir.ref<i32>
 !CHECK: %[[UB1_CVT:.*]] = fir.convert %[[UB1]] : (i32) -> index
-!CHECK: fir.do_concurrent
-!CHECK:   fir.do_concurrent.loop (%{{.*}}) = (%[[LB1_CVT]]) to (%[[UB1_CVT]]) step (%{{.*}})
-
+!CHECK: fir.do_loop %{{.*}} = %[[LB1_CVT]] to %[[UB1_CVT]] step %{{.*}} unordered
 !CHECK: %[[LB2:.*]] = arith.constant 1 : i32
 !CHECK: %[[LB2_CVT:.*]] = fir.convert %[[LB2]] : (i32) -> index
 !CHECK: %[[UB2:.*]] = fir.call @_QPbar(%{{.*}}, %{{.*}}) proc_attrs<pure> fastmath<contract> : (!fir.ref<i32>, !fir.ref<i32>) -> i32
 !CHECK: %[[UB2_CVT:.*]] = fir.convert %[[UB2]] : (i32) -> index
-!CHECK: fir.do_concurrent
-!CHECK:   fir.do_concurrent.loop (%{{.*}}) = (%[[LB2_CVT]]) to (%[[UB2_CVT]]) step (%{{.*}})
+!CHECK: fir.do_loop %{{.*}} = %[[LB2_CVT]] to %[[UB2_CVT]] step %{{.*}} unordered
    do concurrent(i=1:n)
       do concurrent(j=1:bar(n*m, n/m))
          a(i) = n
@@ -86,6 +60,7 @@ subroutine sub2(n)
    end do
 end subroutine
 
+
 !CHECK-LABEL: unstructured
 subroutine unstructured(inner_step)
   integer(4) :: i, j, inner_step

diff  --git a/flang/test/Lower/do_concurrent_local_default_init.f90 b/flang/test/Lower/do_concurrent_local_default_init.f90
index 207704ac1a990..7652e4fcd0402 100644
--- a/flang/test/Lower/do_concurrent_local_default_init.f90
+++ b/flang/test/Lower/do_concurrent_local_default_init.f90
@@ -29,7 +29,7 @@ subroutine test_default_init()
 ! CHECK-SAME:                           %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.char<1,?>>>>> {fir.bindc_name = "p"}) {
 ! CHECK:           %[[VAL_6:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.char<1,?>>>>>
 ! CHECK:           %[[VAL_7:.*]] = fir.box_elesize %[[VAL_6]] : (!fir.box<!fir.ptr<!fir.array<?x!fir.char<1,?>>>>) -> index
-! CHECK:           fir.do_concurrent.loop
+! CHECK:           fir.do_loop
 ! CHECK:             %[[VAL_16:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.array<?x!fir.char<1,?>>>> {bindc_name = "p", pinned, uniq_name = "_QFtest_ptrEp"}
 ! CHECK:             %[[VAL_17:.*]] = fir.zero_bits !fir.ptr<!fir.array<?x!fir.char<1,?>>>
 ! CHECK:             %[[VAL_18:.*]] = arith.constant 0 : index
@@ -43,7 +43,7 @@ subroutine test_default_init()
 ! CHECK:         }
 
 ! CHECK-LABEL:   func.func @_QPtest_default_init(
-! CHECK:           fir.do_concurrent.loop
+! CHECK:           fir.do_loop
 ! CHECK:             %[[VAL_26:.*]] = fir.alloca !fir.type<_QFtest_default_initTt{i:i32}> {bindc_name = "a", pinned, uniq_name = "_QFtest_default_initEa"}
 ! CHECK:             %[[VAL_27:.*]] = fir.embox %[[VAL_26]] : (!fir.ref<!fir.type<_QFtest_default_initTt{i:i32}>>) -> !fir.box<!fir.type<_QFtest_default_initTt{i:i32}>>
 ! CHECK:             %[[VAL_30:.*]] = fir.convert %[[VAL_27]] : (!fir.box<!fir.type<_QFtest_default_initTt{i:i32}>>) -> !fir.box<none>

diff  --git a/flang/test/Lower/loops.f90 b/flang/test/Lower/loops.f90
index 60df27a591dc3..ea65ba3e4d66d 100644
--- a/flang/test/Lower/loops.f90
+++ b/flang/test/Lower/loops.f90
@@ -2,6 +2,15 @@
 
 ! CHECK-LABEL: loop_test
 subroutine loop_test
+  ! CHECK: %[[VAL_2:.*]] = fir.alloca i16 {bindc_name = "i"}
+  ! CHECK: %[[VAL_3:.*]] = fir.alloca i16 {bindc_name = "i"}
+  ! CHECK: %[[VAL_4:.*]] = fir.alloca i16 {bindc_name = "i"}
+  ! CHECK: %[[VAL_5:.*]] = fir.alloca i8 {bindc_name = "k"}
+  ! CHECK: %[[VAL_6:.*]] = fir.alloca i8 {bindc_name = "j"}
+  ! CHECK: %[[VAL_7:.*]] = fir.alloca i8 {bindc_name = "i"}
+  ! CHECK: %[[VAL_8:.*]] = fir.alloca i32 {bindc_name = "k"}
+  ! CHECK: %[[VAL_9:.*]] = fir.alloca i32 {bindc_name = "j"}
+  ! CHECK: %[[VAL_10:.*]] = fir.alloca i32 {bindc_name = "i"}
   ! CHECK: %[[VAL_11:.*]] = fir.alloca !fir.array<5x5x5xi32> {bindc_name = "a", uniq_name = "_QFloop_testEa"}
   ! CHECK: %[[VAL_12:.*]] = fir.alloca i32 {bindc_name = "asum", uniq_name = "_QFloop_testEasum"}
   ! CHECK: %[[VAL_13:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFloop_testEi"}
@@ -16,7 +25,7 @@ subroutine loop_test
   j = 200
   k = 300
 
-  ! CHECK: fir.do_concurrent.loop (%{{.*}}, %{{.*}}, %{{.*}}) = {{.*}}
+  ! CHECK-COUNT-3: fir.do_loop {{.*}} unordered
   do concurrent (i=1:5, j=1:5, k=1:5) ! shared(a)
     ! CHECK: fir.coordinate_of
     a(i,j,k) = 0
@@ -24,7 +33,7 @@ subroutine loop_test
   ! CHECK: fir.call @_FortranAioBeginExternalListOutput
   print*, 'A:', i, j, k
 
-  ! CHECK: fir.do_concurrent.loop (%{{.*}}, %{{.*}}, %{{.*}}) = {{.*}}
+  ! CHECK-COUNT-3: fir.do_loop {{.*}} unordered
   ! CHECK: fir.if
   do concurrent (integer(1)::i=1:5, j=1:5, k=1:5, i.ne.j .and. k.ne.3) shared(a)
     ! CHECK-COUNT-2: fir.coordinate_of
@@ -44,7 +53,7 @@ subroutine loop_test
   ! CHECK: fir.call @_FortranAioBeginExternalListOutput
   print*, 'B:', i, j, k, '-', asum
 
-  ! CHECK: fir.do_concurrent.loop (%{{.*}}) = {{.*}}
+  ! CHECK: fir.do_loop {{.*}} unordered
   ! CHECK-COUNT-2: fir.if
   do concurrent (integer(2)::i=1:5, i.ne.3)
     if (i.eq.2 .or. i.eq.4) goto 5 ! fir.if
@@ -53,7 +62,7 @@ subroutine loop_test
   5 continue
   enddo
 
-  ! CHECK: fir.do_concurrent.loop (%{{.*}}) = {{.*}}
+  ! CHECK: fir.do_loop {{.*}} unordered
   ! CHECK-COUNT-2: fir.if
   do concurrent (integer(2)::i=1:5, i.ne.3)
     if (i.eq.2 .or. i.eq.4) then ! fir.if
@@ -84,6 +93,10 @@ end subroutine loop_test
 
 ! CHECK-LABEL: c.func @_QPlis
 subroutine lis(n)
+  ! CHECK-DAG: fir.alloca i32 {bindc_name = "m"}
+  ! CHECK-DAG: fir.alloca i32 {bindc_name = "j"}
+  ! CHECK-DAG: fir.alloca i32 {bindc_name = "i"}
+  ! CHECK-DAG: fir.alloca i8 {bindc_name = "i"}
   ! CHECK-DAG: fir.alloca i32 {bindc_name = "j", uniq_name = "_QFlisEj"}
   ! CHECK-DAG: fir.alloca i32 {bindc_name = "k", uniq_name = "_QFlisEk"}
   ! CHECK-DAG: fir.alloca !fir.box<!fir.ptr<!fir.array<?x?x?xi32>>> {bindc_name = "p", uniq_name = "_QFlisEp"}
@@ -104,8 +117,8 @@ subroutine lis(n)
   ! CHECK:     }
   r = 0
 
-  ! CHECK:     fir.do_concurrent {
-  ! CHECK:       fir.do_concurrent.loop (%{{.*}}) = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) {
+  ! CHECK:     fir.do_loop %arg1 = %{{.*}} to %{{.*}} step %{{.*}} unordered {
+  ! CHECK:       fir.do_loop %arg2 = %{{.*}} to %{{.*}} step %c1{{.*}} iter_args(%arg3 = %{{.*}}) -> (index, i32) {
   ! CHECK:       }
   ! CHECK:     }
   do concurrent (integer(kind=1)::i=n:1:-1)
@@ -115,18 +128,16 @@ subroutine lis(n)
     enddo
   enddo
 
-  ! CHECK:       fir.do_concurrent.loop (%{{.*}}, %{{.*}}) = (%{{.*}}, %{{.*}}) to (%{{.*}}, %{{.*}}) step (%{{.*}}, %{{.*}}) {
+  ! CHECK:     fir.do_loop %arg1 = %{{.*}} to %{{.*}} step %c1{{.*}} unordered {
+  ! CHECK:       fir.do_loop %arg2 = %{{.*}} to %{{.*}} step %c1{{.*}} unordered {
   ! CHECK:         fir.if %{{.*}} {
   ! CHECK:           %[[V_95:[0-9]+]] = fir.alloca !fir.array<?x?xi32>, %{{.*}}, %{{.*}} {bindc_name = "t", pinned, uniq_name = "_QFlisEt"}
   ! CHECK:           %[[V_96:[0-9]+]] = fir.alloca !fir.box<!fir.ptr<!fir.array<?x?x?xi32>>> {bindc_name = "p", pinned, uniq_name = "_QFlisEp"}
   ! CHECK:           fir.store %{{.*}} to %[[V_96]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?x?x?xi32>>>>
   ! CHECK:           fir.do_loop %arg3 = %{{.*}} to %{{.*}} step %c1{{.*}} iter_args(%arg4 = %{{.*}}) -> (index, i32) {
-  ! CHECK:             fir.do_concurrent {
-  ! CHECK:               fir.alloca i32 {bindc_name = "m"}
-  ! CHECK:               fir.do_concurrent.loop (%{{.*}}) = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) {
-  ! CHECK:                 fir.load %[[V_96]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?x?x?xi32>>>>
-  ! CHECK:                 fir.convert %[[V_95]] : (!fir.ref<!fir.array<?x?xi32>>) -> !fir.ref<!fir.array<?xi32>>
-  ! CHECK:               }
+  ! CHECK:             fir.do_loop %arg5 = %{{.*}} to %{{.*}} step %c1{{.*}} unordered {
+  ! CHECK:               fir.load %[[V_96]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?x?x?xi32>>>>
+  ! CHECK:               fir.convert %[[V_95]] : (!fir.ref<!fir.array<?x?xi32>>) -> !fir.ref<!fir.array<?xi32>>
   ! CHECK:             }
   ! CHECK:           }
   ! CHECK:           fir.convert %[[V_95]] : (!fir.ref<!fir.array<?x?xi32>>) -> !fir.ref<!fir.array<?xi32>>

diff  --git a/flang/test/Lower/loops3.f90 b/flang/test/Lower/loops3.f90
index 84db1972cca16..78f39e1013082 100644
--- a/flang/test/Lower/loops3.f90
+++ b/flang/test/Lower/loops3.f90
@@ -12,7 +12,9 @@ subroutine loop_test
 
   ! CHECK: %[[VAL_0:.*]] = fir.alloca f32 {bindc_name = "m", uniq_name = "_QFloop_testEm"}
   ! CHECK: %[[VAL_1:.*]] = fir.address_of(@_QFloop_testEsum) : !fir.ref<i32>
-  ! CHECK: fir.do_concurrent.loop ({{.*}}) = ({{.*}}) to ({{.*}}) step ({{.*}}) reduce(#fir.reduce_attr<add> -> %[[VAL_1:.*]] : !fir.ref<i32>, #fir.reduce_attr<max> -> %[[VAL_0:.*]] : !fir.ref<f32>) {
+  ! CHECK: fir.do_loop %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} unordered reduce(#fir.reduce_attr<add> -> %[[VAL_1:.*]] : !fir.ref<i32>, #fir.reduce_attr<max> -> %[[VAL_0:.*]] : !fir.ref<f32>) {
+  ! CHECK: fir.do_loop %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} unordered reduce(#fir.reduce_attr<add> -> %[[VAL_1:.*]] : !fir.ref<i32>, #fir.reduce_attr<max> -> %[[VAL_0:.*]] : !fir.ref<f32>) {
+  ! CHECK: fir.do_loop %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} unordered reduce(#fir.reduce_attr<add> -> %[[VAL_1:.*]] : !fir.ref<i32>, #fir.reduce_attr<max> -> %[[VAL_0:.*]] : !fir.ref<f32>) {
   do concurrent (i=1:5, j=1:5, k=1:5) local(tmp) reduce(+:sum) reduce(max:m)
     tmp = i + j + k
     sum = tmp + sum

diff  --git a/flang/test/Lower/nsw.f90 b/flang/test/Lower/nsw.f90
index 2ec1efb2af42a..4ee9e5da829e6 100644
--- a/flang/test/Lower/nsw.f90
+++ b/flang/test/Lower/nsw.f90
@@ -139,6 +139,7 @@ subroutine loop_params3(a,lb,ub,st)
 ! CHECK-LABEL:   func.func @_QPloop_params3(
 ! CHECK:           %[[VAL_4:.*]] = arith.constant 2 : i32
 ! CHECK:           %[[VAL_5:.*]] = arith.constant 1 : i32
+! CHECK:           %[[VAL_9:.*]] = fir.declare %{{.*}}i"} : (!fir.ref<i32>) -> !fir.ref<i32>
 ! CHECK:           %[[VAL_11:.*]] = fir.declare %{{.*}}lb"} : (!fir.ref<i32>, !fir.dscope) -> !fir.ref<i32>
 ! CHECK:           %[[VAL_12:.*]] = fir.declare %{{.*}}ub"} : (!fir.ref<i32>, !fir.dscope) -> !fir.ref<i32>
 ! CHECK:           %[[VAL_14:.*]] = fir.declare %{{.*}}i"} : (!fir.ref<i32>) -> !fir.ref<i32>
@@ -152,6 +153,4 @@ subroutine loop_params3(a,lb,ub,st)
 ! CHECK:           %[[VAL_31:.*]] = fir.load %[[VAL_15]] : !fir.ref<i32>
 ! CHECK:           %[[VAL_32:.*]] = arith.muli %[[VAL_31]], %[[VAL_4]] overflow<nsw> : i32
 ! CHECK:           %[[VAL_33:.*]] = fir.convert %[[VAL_32]] : (i32) -> index
-! CHECK:           fir.do_concurrent {
-! CHECK:             %[[VAL_9:.*]] = fir.declare %{{.*}}i"} : (!fir.ref<i32>) -> !fir.ref<i32>
-! CHECK:             fir.do_concurrent.loop (%[[VAL_34:.*]]) = (%[[VAL_28]]) to (%[[VAL_30]]) step (%[[VAL_33]]) {
+! CHECK:           fir.do_loop %[[VAL_34:.*]] = %[[VAL_28]] to %[[VAL_30]] step %[[VAL_33]] unordered {

diff  --git a/flang/test/Transforms/DoConcurrent/basic_host.f90 b/flang/test/Transforms/DoConcurrent/basic_host.f90
index b84d4481ac766..12f63031cbaee 100644
--- a/flang/test/Transforms/DoConcurrent/basic_host.f90
+++ b/flang/test/Transforms/DoConcurrent/basic_host.f90
@@ -1,6 +1,3 @@
-! Fails until we update the pass to use the `fir.do_concurrent` op.
-! XFAIL: *
-
 ! Tests mapping of a basic `do concurrent` loop to `!$omp parallel do`.
 
 ! RUN: %flang_fc1 -emit-hlfir -fopenmp -fdo-concurrent-to-openmp=host %s -o - \

diff  --git a/flang/test/Transforms/DoConcurrent/locally_destroyed_temp.f90 b/flang/test/Transforms/DoConcurrent/locally_destroyed_temp.f90
index 4e13c0919589a..f82696669eca6 100644
--- a/flang/test/Transforms/DoConcurrent/locally_destroyed_temp.f90
+++ b/flang/test/Transforms/DoConcurrent/locally_destroyed_temp.f90
@@ -1,6 +1,3 @@
-! Fails until we update the pass to use the `fir.do_concurrent` op.
-! XFAIL: *
-
 ! Tests that "loop-local values" are properly handled by localizing them to the
 ! body of the loop nest. See `collectLoopLocalValues` and `localizeLoopLocalValue`
 ! for a definition of "loop-local values" and how they are handled.

diff  --git a/flang/test/Transforms/DoConcurrent/loop_nest_test.f90 b/flang/test/Transforms/DoConcurrent/loop_nest_test.f90
index adc4a488d1ec9..32bed61fe69e4 100644
--- a/flang/test/Transforms/DoConcurrent/loop_nest_test.f90
+++ b/flang/test/Transforms/DoConcurrent/loop_nest_test.f90
@@ -1,6 +1,3 @@
-! Fails until we update the pass to use the `fir.do_concurrent` op.
-! XFAIL: *
-
 ! Tests loop-nest detection algorithm for do-concurrent mapping.
 
 ! REQUIRES: asserts

diff  --git a/flang/test/Transforms/DoConcurrent/multiple_iteration_ranges.f90 b/flang/test/Transforms/DoConcurrent/multiple_iteration_ranges.f90
index 26800678d381c..d0210726de83e 100644
--- a/flang/test/Transforms/DoConcurrent/multiple_iteration_ranges.f90
+++ b/flang/test/Transforms/DoConcurrent/multiple_iteration_ranges.f90
@@ -1,6 +1,3 @@
-! Fails until we update the pass to use the `fir.do_concurrent` op.
-! XFAIL: *
-
 ! Tests mapping of a `do concurrent` loop with multiple iteration ranges.
 
 ! RUN: split-file %s %t

diff  --git a/flang/test/Transforms/DoConcurrent/non_const_bounds.f90 b/flang/test/Transforms/DoConcurrent/non_const_bounds.f90
index 23a3aae976c07..cd1bd4f98a3f5 100644
--- a/flang/test/Transforms/DoConcurrent/non_const_bounds.f90
+++ b/flang/test/Transforms/DoConcurrent/non_const_bounds.f90
@@ -1,6 +1,3 @@
-! Fails until we update the pass to use the `fir.do_concurrent` op.
-! XFAIL: *
-
 ! RUN: %flang_fc1 -emit-hlfir -fopenmp -fdo-concurrent-to-openmp=host %s -o - \
 ! RUN:   | FileCheck %s
 

diff  --git a/flang/test/Transforms/DoConcurrent/not_perfectly_nested.f90 b/flang/test/Transforms/DoConcurrent/not_perfectly_nested.f90
index d1c02101318ab..184fdfe00d397 100644
--- a/flang/test/Transforms/DoConcurrent/not_perfectly_nested.f90
+++ b/flang/test/Transforms/DoConcurrent/not_perfectly_nested.f90
@@ -1,6 +1,3 @@
-! Fails until we update the pass to use the `fir.do_concurrent` op.
-! XFAIL: *
-
 ! Tests that if `do concurrent` is not perfectly nested in its parent loop, that
 ! we skip converting the not-perfectly nested `do concurrent` loop.