[flang-commits] [flang] [flang][hlfir] Add MLIR op for `do concurrent` (PR #130893)

Sun Mar 16 23:16:24 PDT 2025

https://github.com/ergawy updated https://github.com/llvm/llvm-project/pull/130893

>From 99c76085fe77cbb286237e980ab7a642ca31a7c9 Mon Sep 17 00:00:00 2001
From: ergawy <kareem.ergawy at amd.com>
Date: Mon, 10 Mar 2025 23:47:04 -0500
Subject: [PATCH 1/4] [flang][hlfir] Add MLIR op for `do concurrent`

Adds new MLIR ops to model `do concurrent`. In order to make `do
concurrent` representation self-contained, a loop is modeled using 2
ops, one wrapper and one that contains the actual body of the loop. For
example, a 2D `do concurrent` loop is modeled as follows:

```mlir
  hlfir.do_concurrent {
    %i = fir.alloca i32
    %j = fir.alloca i32
    hlfir.do_concurrent.loop
      (%i_iv, %j_iv) = (%i_lb, %j_lb) to (%i_ub, %j_ub) step (%i_st, %j_st) {
      %0 = fir.convert %i_iv : (index) -> i32
      fir.store %0 to %i : !fir.ref<i32>

      %1 = fir.convert %j_iv : (index) -> i32
      fir.store %1 to %j : !fir.ref<i32>
    }
  }
```

The `hlfir.do_concurrent` wrapper op encapsulates both the actual loop
and the allocations required for the iteration variables. The
`hlfir.do_concurrent.loop` op is a multi-dimensional op that contains
the loop control and body. See the ops' docs for more info.
---
 .../include/flang/Optimizer/HLFIR/HLFIROps.td | 116 +++++++++++++
 flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp     | 163 ++++++++++++++++++
 flang/test/HLFIR/do_concurrent.fir            |  92 ++++++++++
 flang/test/HLFIR/invalid.fir                  |  95 ++++++++++
 4 files changed, 466 insertions(+)
 create mode 100644 flang/test/HLFIR/do_concurrent.fir

diff --git a/flang/include/flang/Optimizer/HLFIR/HLFIROps.td b/flang/include/flang/Optimizer/HLFIR/HLFIROps.td
index f69930d5b53b3..089c67af5d313 100644
--- a/flang/include/flang/Optimizer/HLFIR/HLFIROps.td
+++ b/flang/include/flang/Optimizer/HLFIR/HLFIROps.td
@@ -21,6 +21,7 @@ include "flang/Optimizer/Dialect/FIRAttr.td"
 include "flang/Optimizer/Dialect/FortranVariableInterface.td"
 include "mlir/Dialect/Arith/IR/ArithBase.td"
 include "mlir/Dialect/Arith/IR/ArithOpsInterfaces.td"
+include "mlir/Dialect/LLVMIR/LLVMAttrDefs.td"
 include "mlir/IR/BuiltinAttributes.td"
 
 // Base class for FIR operations.
@@ -1863,5 +1864,120 @@ def hlfir_EvaluateInMemoryOp : hlfir_Op<"eval_in_mem", [AttrSizedOperandSegments
   let hasVerifier = 1;
 }
 
+def hlfir_DoConcurrentOp : hlfir_Op<"do_concurrent", [SingleBlock]> {
+  let summary = "do concurrent loop wrapper";
+
+  let description = [{
+    A wrapper operation for the actual op modeling `do concurrent` loops:
+    `hlfir.do_concurrent.loop` (see op declaration below for more info about it).
+
+    The `hlfir.do_concurrent` wrapper op consists of one single-block region with
+    the following properties:
+    - The first ops in the region are responsible for allocating storage for the
+      loop's iteration variables. This is property is **not** enforced by the op
+      verifier, but expected to be respected when building the op.
+    - The terminator of the region is an instance of `hlfir.do_concurrent.loop`.
+
+    For example, a 2D loop nest would be represented as follows:
+    ```
+    hlfir.do_concurrent {
+      %i = fir.alloca i32
+      %j = fir.alloca i32
+      hlfir.do_concurrent.loop ...
+    }
+    ```
+  }];
+
+  let regions = (region SizedRegion<1>:$region);
+
+  let assemblyFormat = "$region attr-dict";
+  let hasVerifier = 1;
+}
+
+def hlfir_DoConcurrentLoopOp : hlfir_Op<"do_concurrent.loop",
+    [AttrSizedOperandSegments, DeclareOpInterfaceMethods<LoopLikeOpInterface>,
+     Terminator, NoTerminator, SingleBlock, ParentOneOf<["DoConcurrentOp"]>]> {
+  let summary = "do concurrent loop";
+
+  let description = [{
+    An operation that models a Fortran `do concurrent` loop's header and block.
+    This is a single-region single-block terminator op that is expected to
+    terminate the region of a `omp.do_concurrent` wrapper op.
+
+    This op borrows from both `scf.parallel` and `fir.do_loop` ops. Similar to
+    `scf.parallel`, a loop nest takes 3 groups of SSA values as operands that
+    represent the lower bounds, upper bounds, and steps. Similar to `fir.do_loop`
+    the op takes one additional group of SSA values to represent reductions.
+
+    The body region **does not** have a terminator.
+
+    For example, a 2D loop nest with 2 reductions (sum and max) would be
+    represented as follows:
+    ```
+    // The wrapper of the loop
+    hlfir.do_concurrent {
+      %i = fir.alloca i32
+      %j = fir.alloca i32
+
+      // The actual `do concurrent` loop
+      hlfir.do_concurrent.loop
+        (%i_iv, %j_iv) = (%i_lb, %j_lb) to (%i_ub, %j_ub) step (%i_st, %j_st)
+        reduce(#fir.reduce_attr<add> -> %sum : !fir.ref<i32>,
+               #fir.reduce_attr<max> -> %max : !fir.ref<f32>) {
+
+        %0 = fir.convert %i_iv : (index) -> i32
+        fir.store %0 to %i : !fir.ref<i32>
+
+        %1 = fir.convert %j_iv : (index) -> i32
+        fir.store %1 to %j : !fir.ref<i32>
+
+        // ... loop body goes here ...
+      }
+    }
+    ```
+
+    Description of arguments:
+    - `lowerBound`: The group of SSA values for the nest's lower bounds.
+    - `upperBound`: The group of SSA values for the nest's upper bounds.
+    - `step`: The group of SSA values for the nest's steps.
+    - `reduceOperands`: The reduction SSA values, if any.
+    - `reduceAttrs`: Attributes to store reduction operations, if any.
+    - `loopAnnotation`: Loop metadata to be passed down the compiler pipeline to
+      LLVM.
+  }];
+
+  let arguments = (ins
+    Variadic<Index>:$lowerBound,
+    Variadic<Index>:$upperBound,
+    Variadic<Index>:$step,
+    Variadic<AnyType>:$reduceOperands,
+    OptionalAttr<ArrayAttr>:$reduceAttrs,
+    OptionalAttr<LoopAnnotationAttr>:$loopAnnotation
+  );
+
+  let regions = (region SizedRegion<1>:$region);
+
+  let hasCustomAssemblyFormat = 1;
+  let hasVerifier = 1;
+
+  let extraClassDeclaration = [{
+    /// Get Number of variadic operands
+    unsigned getNumOperands(unsigned segmentIdx) {
+      auto segments = (*this)->getAttrOfType<mlir::DenseI32ArrayAttr>(
+        getOperandSegmentSizeAttr());
+      return static_cast<unsigned>(segments[segmentIdx]);
+    }
+
+    // Get Number of reduction operands
+    unsigned getNumReduceOperands() {
+      return getNumOperands(3);
+    }
+
+    /// Does the operation hold operands for reduction variables
+    bool hasReduceOperands() {
+      return getNumReduceOperands() > 0;
+    }
+  }];
+}
 
 #endif // FORTRAN_DIALECT_HLFIR_OPS
diff --git a/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp b/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp
index 8851a3a7187b9..c4e62df655c31 100644
--- a/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp
+++ b/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp
@@ -12,6 +12,7 @@
 
 #include "flang/Optimizer/HLFIR/HLFIROps.h"
 
+#include "flang/Optimizer/Dialect/FIRAttr.h"
 #include "flang/Optimizer/Dialect/FIROpsSupport.h"
 #include "flang/Optimizer/Dialect/FIRType.h"
 #include "flang/Optimizer/Dialect/Support/FIRContext.h"
@@ -2246,6 +2247,168 @@ llvm::LogicalResult hlfir::EvaluateInMemoryOp::verify() {
   return mlir::success();
 }
 
+//===----------------------------------------------------------------------===//
+// DoConcurrentOp
+//===----------------------------------------------------------------------===//
+
+llvm::LogicalResult hlfir::DoConcurrentOp::verify() {
+  mlir::Block *body = getBody();
+
+  if (body->empty())
+    return emitOpError("body cannot be empty");
+
+  if (!body->mightHaveTerminator() ||
+      !mlir::isa<hlfir::DoConcurrentLoopOp>(body->getTerminator()))
+    return emitOpError("must be terminated by 'hlfir.do_concurrent.loop'");
+
+  return mlir::success();
+}
+
+//===----------------------------------------------------------------------===//
+// DoConcurrentLoopOp
+//===----------------------------------------------------------------------===//
+
+mlir::ParseResult
+hlfir::DoConcurrentLoopOp::parse(mlir::OpAsmParser &parser,
+                                 mlir::OperationState &result) {
+  auto &builder = parser.getBuilder();
+  // Parse an opening `(` followed by induction variables followed by `)`
+  llvm::SmallVector<mlir::OpAsmParser::Argument, 4> ivs;
+  if (parser.parseArgumentList(ivs, mlir::OpAsmParser::Delimiter::Paren))
+    return mlir::failure();
+
+  // Parse loop bounds.
+  llvm::SmallVector<mlir::OpAsmParser::UnresolvedOperand, 4> lower;
+  if (parser.parseEqual() ||
+      parser.parseOperandList(lower, ivs.size(),
+                              mlir::OpAsmParser::Delimiter::Paren) ||
+      parser.resolveOperands(lower, builder.getIndexType(), result.operands))
+    return mlir::failure();
+
+  llvm::SmallVector<mlir::OpAsmParser::UnresolvedOperand, 4> upper;
+  if (parser.parseKeyword("to") ||
+      parser.parseOperandList(upper, ivs.size(),
+                              mlir::OpAsmParser::Delimiter::Paren) ||
+      parser.resolveOperands(upper, builder.getIndexType(), result.operands))
+    return mlir::failure();
+
+  // Parse step values.
+  llvm::SmallVector<mlir::OpAsmParser::UnresolvedOperand, 4> steps;
+  if (parser.parseKeyword("step") ||
+      parser.parseOperandList(steps, ivs.size(),
+                              mlir::OpAsmParser::Delimiter::Paren) ||
+      parser.resolveOperands(steps, builder.getIndexType(), result.operands))
+    return mlir::failure();
+
+  llvm::SmallVector<mlir::OpAsmParser::UnresolvedOperand> reduceOperands;
+  llvm::SmallVector<mlir::Type> reduceArgTypes;
+  if (succeeded(parser.parseOptionalKeyword("reduce"))) {
+    // Parse reduction attributes and variables.
+    llvm::SmallVector<fir::ReduceAttr> attributes;
+    if (failed(parser.parseCommaSeparatedList(
+            mlir::AsmParser::Delimiter::Paren, [&]() {
+              if (parser.parseAttribute(attributes.emplace_back()) ||
+                  parser.parseArrow() ||
+                  parser.parseOperand(reduceOperands.emplace_back()) ||
+                  parser.parseColonType(reduceArgTypes.emplace_back()))
+                return mlir::failure();
+              return mlir::success();
+            })))
+      return mlir::failure();
+    // Resolve input operands.
+    for (auto operand_type : llvm::zip(reduceOperands, reduceArgTypes))
+      if (parser.resolveOperand(std::get<0>(operand_type),
+                                std::get<1>(operand_type), result.operands))
+        return mlir::failure();
+    llvm::SmallVector<mlir::Attribute> arrayAttr(attributes.begin(),
+                                                 attributes.end());
+    result.addAttribute(getReduceAttrsAttrName(result.name),
+                        builder.getArrayAttr(arrayAttr));
+  }
+
+  // Now parse the body.
+  mlir::Region *body = result.addRegion();
+  for (auto &iv : ivs)
+    iv.type = builder.getIndexType();
+  if (parser.parseRegion(*body, ivs))
+    return mlir::failure();
+
+  // Set `operandSegmentSizes` attribute.
+  result.addAttribute(DoConcurrentLoopOp::getOperandSegmentSizeAttr(),
+                      builder.getDenseI32ArrayAttr(
+                          {static_cast<int32_t>(lower.size()),
+                           static_cast<int32_t>(upper.size()),
+                           static_cast<int32_t>(steps.size()),
+                           static_cast<int32_t>(reduceOperands.size())}));
+
+  // Parse attributes.
+  if (parser.parseOptionalAttrDict(result.attributes))
+    return mlir::failure();
+
+  return mlir::success();
+}
+
+void hlfir::DoConcurrentLoopOp::print(mlir::OpAsmPrinter &p) {
+  p << " (" << getBody()->getArguments() << ") = (" << getLowerBound()
+    << ") to (" << getUpperBound() << ") step (" << getStep() << ")";
+
+  if (hasReduceOperands()) {
+    p << " reduce(";
+    auto attrs = getReduceAttrsAttr();
+    auto operands = getReduceOperands();
+    llvm::interleaveComma(llvm::zip(attrs, operands), p, [&](auto it) {
+      p << std::get<0>(it) << " -> " << std::get<1>(it) << " : "
+        << std::get<1>(it).getType();
+    });
+    p << ')';
+  }
+
+  p << ' ';
+  p.printRegion(getRegion(), /*printEntryBlockArgs=*/false);
+  p.printOptionalAttrDict(
+      (*this)->getAttrs(),
+      /*elidedAttrs=*/{DoConcurrentLoopOp::getOperandSegmentSizeAttr(),
+                       DoConcurrentLoopOp::getReduceAttrsAttrName()});
+}
+
+llvm::SmallVector<mlir::Region *> hlfir::DoConcurrentLoopOp::getLoopRegions() {
+  return {&getRegion()};
+}
+
+llvm::LogicalResult hlfir::DoConcurrentLoopOp::verify() {
+  mlir::Operation::operand_range lbValues = getLowerBound();
+  mlir::Operation::operand_range ubValues = getUpperBound();
+  mlir::Operation::operand_range stepValues = getStep();
+
+  if (lbValues.empty())
+    return emitOpError(
+        "needs at least one tuple element for lowerBound, upperBound and step");
+
+  if (lbValues.size() != ubValues.size() ||
+      ubValues.size() != stepValues.size())
+    return emitOpError("different number of tuple elements for lowerBound, "
+                       "upperBound or step");
+
+  // Check that the body defines the same number of block arguments as the
+  // number of tuple elements in step.
+  mlir::Block *body = getBody();
+  if (body->getNumArguments() != stepValues.size())
+    return emitOpError() << "expects the same number of induction variables: "
+                         << body->getNumArguments()
+                         << " as bound and step values: " << stepValues.size();
+  for (auto arg : body->getArguments())
+    if (!arg.getType().isIndex())
+      return emitOpError(
+          "expects arguments for the induction variable to be of index type");
+
+  auto reduceAttrs = getReduceAttrsAttr();
+  if (getNumReduceOperands() != (reduceAttrs ? reduceAttrs.size() : 0))
+    return emitOpError(
+        "mismatch in number of reduction variables and reduction attributes");
+
+  return mlir::success();
+}
+
 #include "flang/Optimizer/HLFIR/HLFIROpInterfaces.cpp.inc"
 #define GET_OP_CLASSES
 #include "flang/Optimizer/HLFIR/HLFIREnums.cpp.inc"
diff --git a/flang/test/HLFIR/do_concurrent.fir b/flang/test/HLFIR/do_concurrent.fir
new file mode 100644
index 0000000000000..aef9db2236a57
--- /dev/null
+++ b/flang/test/HLFIR/do_concurrent.fir
@@ -0,0 +1,92 @@
+// Test hlfir.do_concurrent operation parse, verify (no errors), and unparse
+
+// RUN: fir-opt %s | fir-opt | FileCheck %s
+
+func.func @dc_1d(%i_lb: index, %i_ub: index, %i_st: index) {
+  hlfir.do_concurrent {
+    %i = fir.alloca i32
+    hlfir.do_concurrent.loop (%i_iv) = (%i_lb) to (%i_ub) step (%i_st) {
+      %0 = fir.convert %i_iv : (index) -> i32
+      fir.store %0 to %i : !fir.ref<i32>
+    }
+  }
+  return
+}
+
+// CHECK-LABEL: func.func @dc_1d
+// CHECK-SAME:    (%[[I_LB:.*]]: index, %[[I_UB:.*]]: index, %[[I_ST:.*]]: index)
+// CHECK:         hlfir.do_concurrent {
+// CHECK:           %[[I:.*]] = fir.alloca i32
+// CHECK:           hlfir.do_concurrent.loop (%[[I_IV:.*]]) = (%[[I_LB]]) to (%[[I_UB]]) step (%[[I_ST]]) {
+// CHECK:             %[[I_IV_CVT:.*]] = fir.convert %[[I_IV]] : (index) -> i32
+// CHECK:             fir.store %[[I_IV_CVT]] to %[[I]] : !fir.ref<i32>
+// CHECK:           }
+// CHECK:         }
+
+func.func @dc_2d(%i_lb: index, %i_ub: index, %i_st: index,
+                 %j_lb: index, %j_ub: index, %j_st: index) {
+  hlfir.do_concurrent {
+    %i = fir.alloca i32
+    %j = fir.alloca i32
+    hlfir.do_concurrent.loop
+      (%i_iv, %j_iv) = (%i_lb, %j_lb) to (%i_ub, %j_ub) step (%i_st, %j_st) {
+      %0 = fir.convert %i_iv : (index) -> i32
+      fir.store %0 to %i : !fir.ref<i32>
+
+      %1 = fir.convert %j_iv : (index) -> i32
+      fir.store %1 to %j : !fir.ref<i32>
+    }
+  }
+  return
+}
+
+// CHECK-LABEL: func.func @dc_2d
+// CHECK-SAME:    (%[[I_LB:.*]]: index, %[[I_UB:.*]]: index, %[[I_ST:.*]]: index, %[[J_LB:.*]]: index, %[[J_UB:.*]]: index, %[[J_ST:.*]]: index)
+// CHECK:         hlfir.do_concurrent {
+// CHECK:           %[[I:.*]] = fir.alloca i32
+// CHECK:           %[[J:.*]] = fir.alloca i32
+// CHECK:           hlfir.do_concurrent.loop
+// CHECK-SAME:        (%[[I_IV:.*]], %[[J_IV:.*]]) = (%[[I_LB]], %[[J_LB]]) to (%[[I_UB]], %[[J_UB]]) step (%[[I_ST]], %[[J_ST]]) {
+// CHECK:             %[[I_IV_CVT:.*]] = fir.convert %[[I_IV]] : (index) -> i32
+// CHECK:             fir.store %[[I_IV_CVT]] to %[[I]] : !fir.ref<i32>
+// CHECK:             %[[J_IV_CVT:.*]] = fir.convert %[[J_IV]] : (index) -> i32
+// CHECK:             fir.store %[[J_IV_CVT]] to %[[J]] : !fir.ref<i32>
+// CHECK:           }
+// CHECK:         }
+
+func.func @dc_2d_reduction(%i_lb: index, %i_ub: index, %i_st: index,
+                 %j_lb: index, %j_ub: index, %j_st: index) {
+  %sum = fir.alloca i32
+
+  hlfir.do_concurrent {
+    %i = fir.alloca i32
+    %j = fir.alloca i32
+    hlfir.do_concurrent.loop
+      (%i_iv, %j_iv) = (%i_lb, %j_lb) to (%i_ub, %j_ub) step (%i_st, %j_st) 
+      reduce(#fir.reduce_attr<add> -> %sum : !fir.ref<i32>) {
+      %0 = fir.convert %i_iv : (index) -> i32
+      fir.store %0 to %i : !fir.ref<i32>
+
+      %1 = fir.convert %j_iv : (index) -> i32
+      fir.store %1 to %j : !fir.ref<i32>
+    }
+  }
+  return
+}
+
+// CHECK-LABEL: func.func @dc_2d_reduction
+// CHECK-SAME:    (%[[I_LB:.*]]: index, %[[I_UB:.*]]: index, %[[I_ST:.*]]: index, %[[J_LB:.*]]: index, %[[J_UB:.*]]: index, %[[J_ST:.*]]: index)
+
+// CHECK:         %[[SUM:.*]] = fir.alloca i32
+
+// CHECK:         hlfir.do_concurrent {
+// CHECK:           %[[I:.*]] = fir.alloca i32
+// CHECK:           %[[J:.*]] = fir.alloca i32
+// CHECK:           hlfir.do_concurrent.loop
+// CHECK-SAME:        (%[[I_IV:.*]], %[[J_IV:.*]]) = (%[[I_LB]], %[[J_LB]]) to (%[[I_UB]], %[[J_UB]]) step (%[[I_ST]], %[[J_ST]]) reduce(#fir.reduce_attr<add> -> %[[SUM]] : !fir.ref<i32>) {
+// CHECK:             %[[I_IV_CVT:.*]] = fir.convert %[[I_IV]] : (index) -> i32
+// CHECK:             fir.store %[[I_IV_CVT]] to %[[I]] : !fir.ref<i32>
+// CHECK:             %[[J_IV_CVT:.*]] = fir.convert %[[J_IV]] : (index) -> i32
+// CHECK:             fir.store %[[J_IV_CVT]] to %[[J]] : !fir.ref<i32>
+// CHECK:           }
+// CHECK:         }
diff --git a/flang/test/HLFIR/invalid.fir b/flang/test/HLFIR/invalid.fir
index d61efe0062e69..e14284f916bd9 100644
--- a/flang/test/HLFIR/invalid.fir
+++ b/flang/test/HLFIR/invalid.fir
@@ -1555,3 +1555,98 @@ func.func @bad_reshape(%arg0: !hlfir.expr<1x!fir.char<1,2>>, %arg1: !hlfir.expr<
   %0 = hlfir.reshape %arg0 %arg1 pad %arg2 : (!hlfir.expr<1x!fir.char<1,2>>, !hlfir.expr<1xi32>, !hlfir.expr<1x!fir.char<2,?>>) -> !hlfir.expr<?x!fir.char<1,?>>
   return
 }
+
+// -----
+
+func.func @empty_dc_wrapper_body() {
+  // expected-error at +1 {{'hlfir.do_concurrent' op expects a non-empty block}}
+  hlfir.do_concurrent {
+  }
+  return
+}
+
+// -----
+
+func.func @dc_wrong_terminator() {
+  // expected-error at +1 {{'hlfir.do_concurrent' op must be terminated by 'hlfir.do_concurrent.loop'}}
+  hlfir.do_concurrent {
+    llvm.return
+  }
+  return
+}
+
+// -----
+
+func.func @dc_0d() {
+   // expected-error at +2 {{'hlfir.do_concurrent.loop' op needs at least one tuple element for lowerBound, upperBound and step}}
+  hlfir.do_concurrent {
+    hlfir.do_concurrent.loop () = () to () step () {
+      %tmp = fir.alloca i32
+    }
+  }
+  return
+}
+
+// -----
+
+func.func @dc_invalid_parent(%arg0: index, %arg1: index) {
+  // expected-error at +1 {{'hlfir.do_concurrent.loop' op expects parent op 'hlfir.do_concurrent'}}
+  "hlfir.do_concurrent.loop"(%arg0, %arg1) <{operandSegmentSizes = array<i32: 1, 1, 0, 0>}> ({
+  ^bb0(%arg2: index):
+     %tmp = "fir.alloca"() <{in_type = i32, operandSegmentSizes = array<i32: 0, 0>}> : () -> !fir.ref<i32>
+  }) : (index, index) -> ()
+  return
+}
+
+// -----
+
+func.func @dc_invalid_control(%arg0: index, %arg1: index) {
+  // expected-error at +2 {{'hlfir.do_concurrent.loop' op different number of tuple elements for lowerBound, upperBound or step}}
+  hlfir.do_concurrent {
+    "hlfir.do_concurrent.loop"(%arg0, %arg1) <{operandSegmentSizes = array<i32: 1, 1, 0, 0>}> ({
+    ^bb0(%arg2: index):
+      %tmp = "fir.alloca"() <{in_type = i32, operandSegmentSizes = array<i32: 0, 0>}> : () -> !fir.ref<i32>
+    }) : (index, index) -> ()
+  }
+  return
+}
+
+// -----
+
+func.func @dc_invalid_ind_var(%arg0: index, %arg1: index) {
+  // expected-error at +2 {{'hlfir.do_concurrent.loop' op expects the same number of induction variables: 2 as bound and step values: 1}}
+  hlfir.do_concurrent {
+    "hlfir.do_concurrent.loop"(%arg0, %arg1, %arg0) <{operandSegmentSizes = array<i32: 1, 1, 1, 0>}> ({
+    ^bb0(%arg3: index, %arg4: index):
+      %tmp = "fir.alloca"() <{in_type = i32, operandSegmentSizes = array<i32: 0, 0>}> : () -> !fir.ref<i32>
+    }) : (index, index, index) -> ()
+  }
+  return
+}
+
+// -----
+
+func.func @dc_invalid_ind_var_type(%arg0: index, %arg1: index) {
+  // expected-error at +2 {{'hlfir.do_concurrent.loop' op expects arguments for the induction variable to be of index type}}
+  hlfir.do_concurrent {
+    "hlfir.do_concurrent.loop"(%arg0, %arg1, %arg0) <{operandSegmentSizes = array<i32: 1, 1, 1, 0>}> ({
+    ^bb0(%arg3: i32):
+      %tmp = "fir.alloca"() <{in_type = i32, operandSegmentSizes = array<i32: 0, 0>}> : () -> !fir.ref<i32>
+    }) : (index, index, index) -> ()
+  }
+  return
+}
+
+// -----
+
+func.func @dc_invalid_reduction(%arg0: index, %arg1: index) {
+  %sum = fir.alloca i32
+  // expected-error at +2 {{'hlfir.do_concurrent.loop' op mismatch in number of reduction variables and reduction attributes}}
+  hlfir.do_concurrent {
+    "hlfir.do_concurrent.loop"(%arg0, %arg1, %arg0, %sum) <{operandSegmentSizes = array<i32: 1, 1, 1, 1>}> ({
+    ^bb0(%arg3: index):
+      %tmp = "fir.alloca"() <{in_type = i32, operandSegmentSizes = array<i32: 0, 0>}> : () -> !fir.ref<i32>
+    }) : (index, index, index, !fir.ref<i32>) -> ()
+  }
+  return
+}

>From e3db9f02fe47e7c96313636f2ea25e0d2bfdaec9 Mon Sep 17 00:00:00 2001
From: ergawy <kareem.ergawy at amd.com>
Date: Wed, 12 Mar 2025 05:49:36 -0500
Subject: [PATCH 2/4] handle review comments

---
 flang/include/flang/Optimizer/HLFIR/HLFIROps.td | 14 +-------------
 flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp       |  2 +-
 2 files changed, 2 insertions(+), 14 deletions(-)

diff --git a/flang/include/flang/Optimizer/HLFIR/HLFIROps.td b/flang/include/flang/Optimizer/HLFIR/HLFIROps.td
index 089c67af5d313..50a054a765faf 100644
--- a/flang/include/flang/Optimizer/HLFIR/HLFIROps.td
+++ b/flang/include/flang/Optimizer/HLFIR/HLFIROps.td
@@ -1961,21 +1961,9 @@ def hlfir_DoConcurrentLoopOp : hlfir_Op<"do_concurrent.loop",
   let hasVerifier = 1;
 
   let extraClassDeclaration = [{
-    /// Get Number of variadic operands
-    unsigned getNumOperands(unsigned segmentIdx) {
-      auto segments = (*this)->getAttrOfType<mlir::DenseI32ArrayAttr>(
-        getOperandSegmentSizeAttr());
-      return static_cast<unsigned>(segments[segmentIdx]);
-    }
-
     // Get Number of reduction operands
     unsigned getNumReduceOperands() {
-      return getNumOperands(3);
-    }
-
-    /// Does the operation hold operands for reduction variables
-    bool hasReduceOperands() {
-      return getNumReduceOperands() > 0;
+      return getReduceOperands().size();
     }
   }];
 }
diff --git a/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp b/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp
index c4e62df655c31..4c19ea414aabb 100644
--- a/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp
+++ b/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp
@@ -2352,7 +2352,7 @@ void hlfir::DoConcurrentLoopOp::print(mlir::OpAsmPrinter &p) {
   p << " (" << getBody()->getArguments() << ") = (" << getLowerBound()
     << ") to (" << getUpperBound() << ") step (" << getStep() << ")";
 
-  if (hasReduceOperands()) {
+  if (!getReduceOperands().empty()) {
     p << " reduce(";
     auto attrs = getReduceAttrsAttr();
     auto operands = getReduceOperands();

>From 1c7a3e953b3b752b03179c5e9f095f52b9e8b3b5 Mon Sep 17 00:00:00 2001
From: ergawy <kareem.ergawy at amd.com>
Date: Thu, 13 Mar 2025 00:00:50 -0500
Subject: [PATCH 3/4] handle review comments

---
 flang/include/flang/Optimizer/HLFIR/HLFIROps.td | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/flang/include/flang/Optimizer/HLFIR/HLFIROps.td b/flang/include/flang/Optimizer/HLFIR/HLFIROps.td
index 50a054a765faf..8897961316abc 100644
--- a/flang/include/flang/Optimizer/HLFIR/HLFIROps.td
+++ b/flang/include/flang/Optimizer/HLFIR/HLFIROps.td
@@ -1864,7 +1864,8 @@ def hlfir_EvaluateInMemoryOp : hlfir_Op<"eval_in_mem", [AttrSizedOperandSegments
   let hasVerifier = 1;
 }
 
-def hlfir_DoConcurrentOp : hlfir_Op<"do_concurrent", [SingleBlock]> {
+def hlfir_DoConcurrentOp : hlfir_Op<"do_concurrent", 
+    [SingleBlock, AutomaticAllocationScope]> {
   let summary = "do concurrent loop wrapper";
 
   let description = [{

>From 1dc58b5bf5437f5856b1bb6512cfd878d90b2f01 Mon Sep 17 00:00:00 2001
From: ergawy <kareem.ergawy at amd.com>
Date: Mon, 17 Mar 2025 01:16:06 -0500
Subject: [PATCH 4/4] Move ops to fir

---
 .../include/flang/Optimizer/Dialect/FIROps.td | 105 +++++++++++
 .../include/flang/Optimizer/HLFIR/HLFIROps.td | 105 -----------
 flang/lib/Optimizer/Dialect/FIROps.cpp        | 161 +++++++++++++++++
 flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp     | 163 ------------------
 flang/test/{HLFIR => Fir}/do_concurrent.fir   |  26 +--
 flang/test/Fir/invalid.fir                    |  95 ++++++++++
 flang/test/HLFIR/invalid.fir                  |  95 ----------
 7 files changed, 374 insertions(+), 376 deletions(-)
 rename flang/test/{HLFIR => Fir}/do_concurrent.fir (84%)

diff --git a/flang/include/flang/Optimizer/Dialect/FIROps.td b/flang/include/flang/Optimizer/Dialect/FIROps.td
index ee9b959ba570f..c8d8ab41552c2 100644
--- a/flang/include/flang/Optimizer/Dialect/FIROps.td
+++ b/flang/include/flang/Optimizer/Dialect/FIROps.td
@@ -3446,4 +3446,109 @@ def fir_BoxTotalElementsOp
   let hasCanonicalizer = 1;
 }
 
+def fir_DoConcurrentOp : fir_Op<"do_concurrent",
+    [SingleBlock, AutomaticAllocationScope]> {
+  let summary = "do concurrent loop wrapper";
+
+  let description = [{
+    A wrapper operation for the actual op modeling `do concurrent` loops:
+    `fir.do_concurrent.loop` (see op declaration below for more info about it).
+
+    The `fir.do_concurrent` wrapper op consists of one single-block region with
+    the following properties:
+    - The first ops in the region are responsible for allocating storage for the
+      loop's iteration variables. This property is **not** enforced by the op
+      verifier, but expected to be respected when building the op.
+    - The terminator of the region is an instance of `fir.do_concurrent.loop`.
+
+    For example, a 2D loop nest would be represented as follows:
+    ```
+    fir.do_concurrent {
+      %i = fir.alloca i32
+      %j = fir.alloca i32
+      fir.do_concurrent.loop ...
+    }
+    ```
+  }];
+
+  let regions = (region SizedRegion<1>:$region);
+
+  let assemblyFormat = "$region attr-dict";
+  let hasVerifier = 1;
+}
+
+def fir_DoConcurrentLoopOp : fir_Op<"do_concurrent.loop",
+    [AttrSizedOperandSegments, DeclareOpInterfaceMethods<LoopLikeOpInterface>,
+     Terminator, NoTerminator, SingleBlock, ParentOneOf<["DoConcurrentOp"]>]> {
+  let summary = "do concurrent loop";
+
+  let description = [{
+    An operation that models a Fortran `do concurrent` loop's header and block.
+    This is a single-region single-block terminator op that is expected to
+    terminate the region of a `fir.do_concurrent` wrapper op.
+
+    This op borrows from both `scf.parallel` and `fir.do_loop` ops. Similar to
+    `scf.parallel`, a loop nest takes 3 groups of SSA values as operands that
+    represent the lower bounds, upper bounds, and steps. Similar to `fir.do_loop`
+    the op takes one additional group of SSA values to represent reductions.
+
+    The body region **does not** have a terminator.
+
+    For example, a 2D loop nest with 2 reductions (sum and max) would be
+    represented as follows:
+    ```
+    // The wrapper of the loop
+    fir.do_concurrent {
+      %i = fir.alloca i32
+      %j = fir.alloca i32
+
+      // The actual `do concurrent` loop
+      fir.do_concurrent.loop
+        (%i_iv, %j_iv) = (%i_lb, %j_lb) to (%i_ub, %j_ub) step (%i_st, %j_st)
+        reduce(#fir.reduce_attr<add> -> %sum : !fir.ref<i32>,
+               #fir.reduce_attr<max> -> %max : !fir.ref<f32>) {
+
+        %0 = fir.convert %i_iv : (index) -> i32
+        fir.store %0 to %i : !fir.ref<i32>
+
+        %1 = fir.convert %j_iv : (index) -> i32
+        fir.store %1 to %j : !fir.ref<i32>
+
+        // ... loop body goes here ...
+      }
+    }
+    ```
+
+    Description of arguments:
+    - `lowerBound`: The group of SSA values for the nest's lower bounds.
+    - `upperBound`: The group of SSA values for the nest's upper bounds.
+    - `step`: The group of SSA values for the nest's steps.
+    - `reduceOperands`: The reduction SSA values, if any.
+    - `reduceAttrs`: Attributes to store reduction operations, if any.
+    - `loopAnnotation`: Loop metadata to be passed down the compiler pipeline to
+      LLVM.
+  }];
+
+  let arguments = (ins
+    Variadic<Index>:$lowerBound,
+    Variadic<Index>:$upperBound,
+    Variadic<Index>:$step,
+    Variadic<AnyType>:$reduceOperands,
+    OptionalAttr<ArrayAttr>:$reduceAttrs,
+    OptionalAttr<LoopAnnotationAttr>:$loopAnnotation
+  );
+
+  let regions = (region SizedRegion<1>:$region);
+
+  let hasCustomAssemblyFormat = 1;
+  let hasVerifier = 1;
+
+  let extraClassDeclaration = [{
+    // Get the number of reduction operands.
+    unsigned getNumReduceOperands() {
+      return getReduceOperands().size();
+    }
+  }];
+}
+
 #endif
diff --git a/flang/include/flang/Optimizer/HLFIR/HLFIROps.td b/flang/include/flang/Optimizer/HLFIR/HLFIROps.td
index 8897961316abc..f69930d5b53b3 100644
--- a/flang/include/flang/Optimizer/HLFIR/HLFIROps.td
+++ b/flang/include/flang/Optimizer/HLFIR/HLFIROps.td
@@ -21,7 +21,6 @@ include "flang/Optimizer/Dialect/FIRAttr.td"
 include "flang/Optimizer/Dialect/FortranVariableInterface.td"
 include "mlir/Dialect/Arith/IR/ArithBase.td"
 include "mlir/Dialect/Arith/IR/ArithOpsInterfaces.td"
-include "mlir/Dialect/LLVMIR/LLVMAttrDefs.td"
 include "mlir/IR/BuiltinAttributes.td"
 
 // Base class for FIR operations.
@@ -1864,109 +1863,5 @@ def hlfir_EvaluateInMemoryOp : hlfir_Op<"eval_in_mem", [AttrSizedOperandSegments
   let hasVerifier = 1;
 }
 
-def hlfir_DoConcurrentOp : hlfir_Op<"do_concurrent", 
-    [SingleBlock, AutomaticAllocationScope]> {
-  let summary = "do concurrent loop wrapper";
-
-  let description = [{
-    A wrapper operation for the actual op modeling `do concurrent` loops:
-    `hlfir.do_concurrent.loop` (see op declaration below for more info about it).
-
-    The `hlfir.do_concurrent` wrapper op consists of one single-block region with
-    the following properties:
-    - The first ops in the region are responsible for allocating storage for the
-      loop's iteration variables. This is property is **not** enforced by the op
-      verifier, but expected to be respected when building the op.
-    - The terminator of the region is an instance of `hlfir.do_concurrent.loop`.
-
-    For example, a 2D loop nest would be represented as follows:
-    ```
-    hlfir.do_concurrent {
-      %i = fir.alloca i32
-      %j = fir.alloca i32
-      hlfir.do_concurrent.loop ...
-    }
-    ```
-  }];
-
-  let regions = (region SizedRegion<1>:$region);
-
-  let assemblyFormat = "$region attr-dict";
-  let hasVerifier = 1;
-}
-
-def hlfir_DoConcurrentLoopOp : hlfir_Op<"do_concurrent.loop",
-    [AttrSizedOperandSegments, DeclareOpInterfaceMethods<LoopLikeOpInterface>,
-     Terminator, NoTerminator, SingleBlock, ParentOneOf<["DoConcurrentOp"]>]> {
-  let summary = "do concurrent loop";
-
-  let description = [{
-    An operation that models a Fortran `do concurrent` loop's header and block.
-    This is a single-region single-block terminator op that is expected to
-    terminate the region of a `omp.do_concurrent` wrapper op.
-
-    This op borrows from both `scf.parallel` and `fir.do_loop` ops. Similar to
-    `scf.parallel`, a loop nest takes 3 groups of SSA values as operands that
-    represent the lower bounds, upper bounds, and steps. Similar to `fir.do_loop`
-    the op takes one additional group of SSA values to represent reductions.
-
-    The body region **does not** have a terminator.
-
-    For example, a 2D loop nest with 2 reductions (sum and max) would be
-    represented as follows:
-    ```
-    // The wrapper of the loop
-    hlfir.do_concurrent {
-      %i = fir.alloca i32
-      %j = fir.alloca i32
-
-      // The actual `do concurrent` loop
-      hlfir.do_concurrent.loop
-        (%i_iv, %j_iv) = (%i_lb, %j_lb) to (%i_ub, %j_ub) step (%i_st, %j_st)
-        reduce(#fir.reduce_attr<add> -> %sum : !fir.ref<i32>,
-               #fir.reduce_attr<max> -> %max : !fir.ref<f32>) {
-
-        %0 = fir.convert %i_iv : (index) -> i32
-        fir.store %0 to %i : !fir.ref<i32>
-
-        %1 = fir.convert %j_iv : (index) -> i32
-        fir.store %1 to %j : !fir.ref<i32>
-
-        // ... loop body goes here ...
-      }
-    }
-    ```
-
-    Description of arguments:
-    - `lowerBound`: The group of SSA values for the nest's lower bounds.
-    - `upperBound`: The group of SSA values for the nest's upper bounds.
-    - `step`: The group of SSA values for the nest's steps.
-    - `reduceOperands`: The reduction SSA values, if any.
-    - `reduceAttrs`: Attributes to store reduction operations, if any.
-    - `loopAnnotation`: Loop metadata to be passed down the compiler pipeline to
-      LLVM.
-  }];
-
-  let arguments = (ins
-    Variadic<Index>:$lowerBound,
-    Variadic<Index>:$upperBound,
-    Variadic<Index>:$step,
-    Variadic<AnyType>:$reduceOperands,
-    OptionalAttr<ArrayAttr>:$reduceAttrs,
-    OptionalAttr<LoopAnnotationAttr>:$loopAnnotation
-  );
-
-  let regions = (region SizedRegion<1>:$region);
-
-  let hasCustomAssemblyFormat = 1;
-  let hasVerifier = 1;
-
-  let extraClassDeclaration = [{
-    // Get Number of reduction operands
-    unsigned getNumReduceOperands() {
-      return getReduceOperands().size();
-    }
-  }];
-}
 
 #endif // FORTRAN_DIALECT_HLFIR_OPS
diff --git a/flang/lib/Optimizer/Dialect/FIROps.cpp b/flang/lib/Optimizer/Dialect/FIROps.cpp
index 90202f3cee588..474577b986372 100644
--- a/flang/lib/Optimizer/Dialect/FIROps.cpp
+++ b/flang/lib/Optimizer/Dialect/FIROps.cpp
@@ -4748,6 +4748,167 @@ void fir::BoxTotalElementsOp::getCanonicalizationPatterns(
   patterns.add<SimplifyBoxTotalElementsOp>(context);
 }
 
+//===----------------------------------------------------------------------===//
+// DoConcurrentOp
+//===----------------------------------------------------------------------===//
+
+llvm::LogicalResult fir::DoConcurrentOp::verify() {
+  mlir::Block *body = getBody();
+
+  if (body->empty())
+    return emitOpError("body cannot be empty");
+
+  if (!body->mightHaveTerminator() ||
+      !mlir::isa<fir::DoConcurrentLoopOp>(body->getTerminator()))
+    return emitOpError("must be terminated by 'fir.do_concurrent.loop'");
+
+  return mlir::success();
+}
+
+//===----------------------------------------------------------------------===//
+// DoConcurrentLoopOp
+//===----------------------------------------------------------------------===//
+
+mlir::ParseResult fir::DoConcurrentLoopOp::parse(mlir::OpAsmParser &parser,
+                                                 mlir::OperationState &result) {
+  auto &builder = parser.getBuilder();
+  // Parse an opening `(` followed by induction variables followed by `)`
+  llvm::SmallVector<mlir::OpAsmParser::Argument, 4> ivs;
+  if (parser.parseArgumentList(ivs, mlir::OpAsmParser::Delimiter::Paren))
+    return mlir::failure();
+
+  // Parse loop bounds.
+  llvm::SmallVector<mlir::OpAsmParser::UnresolvedOperand, 4> lower;
+  if (parser.parseEqual() ||
+      parser.parseOperandList(lower, ivs.size(),
+                              mlir::OpAsmParser::Delimiter::Paren) ||
+      parser.resolveOperands(lower, builder.getIndexType(), result.operands))
+    return mlir::failure();
+
+  llvm::SmallVector<mlir::OpAsmParser::UnresolvedOperand, 4> upper;
+  if (parser.parseKeyword("to") ||
+      parser.parseOperandList(upper, ivs.size(),
+                              mlir::OpAsmParser::Delimiter::Paren) ||
+      parser.resolveOperands(upper, builder.getIndexType(), result.operands))
+    return mlir::failure();
+
+  // Parse step values.
+  llvm::SmallVector<mlir::OpAsmParser::UnresolvedOperand, 4> steps;
+  if (parser.parseKeyword("step") ||
+      parser.parseOperandList(steps, ivs.size(),
+                              mlir::OpAsmParser::Delimiter::Paren) ||
+      parser.resolveOperands(steps, builder.getIndexType(), result.operands))
+    return mlir::failure();
+
+  llvm::SmallVector<mlir::OpAsmParser::UnresolvedOperand> reduceOperands;
+  llvm::SmallVector<mlir::Type> reduceArgTypes;
+  if (succeeded(parser.parseOptionalKeyword("reduce"))) {
+    // Parse reduction attributes and variables.
+    llvm::SmallVector<fir::ReduceAttr> attributes;
+    if (failed(parser.parseCommaSeparatedList(
+            mlir::AsmParser::Delimiter::Paren, [&]() {
+              if (parser.parseAttribute(attributes.emplace_back()) ||
+                  parser.parseArrow() ||
+                  parser.parseOperand(reduceOperands.emplace_back()) ||
+                  parser.parseColonType(reduceArgTypes.emplace_back()))
+                return mlir::failure();
+              return mlir::success();
+            })))
+      return mlir::failure();
+    // Resolve input operands.
+    for (auto operand_type : llvm::zip(reduceOperands, reduceArgTypes))
+      if (parser.resolveOperand(std::get<0>(operand_type),
+                                std::get<1>(operand_type), result.operands))
+        return mlir::failure();
+    llvm::SmallVector<mlir::Attribute> arrayAttr(attributes.begin(),
+                                                 attributes.end());
+    result.addAttribute(getReduceAttrsAttrName(result.name),
+                        builder.getArrayAttr(arrayAttr));
+  }
+
+  // Now parse the body.
+  mlir::Region *body = result.addRegion();
+  for (auto &iv : ivs)
+    iv.type = builder.getIndexType();
+  if (parser.parseRegion(*body, ivs))
+    return mlir::failure();
+
+  // Set `operandSegmentSizes` attribute.
+  result.addAttribute(DoConcurrentLoopOp::getOperandSegmentSizeAttr(),
+                      builder.getDenseI32ArrayAttr(
+                          {static_cast<int32_t>(lower.size()),
+                           static_cast<int32_t>(upper.size()),
+                           static_cast<int32_t>(steps.size()),
+                           static_cast<int32_t>(reduceOperands.size())}));
+
+  // Parse attributes.
+  if (parser.parseOptionalAttrDict(result.attributes))
+    return mlir::failure();
+
+  return mlir::success();
+}
+
+void fir::DoConcurrentLoopOp::print(mlir::OpAsmPrinter &p) {
+  p << " (" << getBody()->getArguments() << ") = (" << getLowerBound()
+    << ") to (" << getUpperBound() << ") step (" << getStep() << ")";
+
+  if (!getReduceOperands().empty()) {
+    p << " reduce(";
+    auto attrs = getReduceAttrsAttr();
+    auto operands = getReduceOperands();
+    llvm::interleaveComma(llvm::zip(attrs, operands), p, [&](auto it) {
+      p << std::get<0>(it) << " -> " << std::get<1>(it) << " : "
+        << std::get<1>(it).getType();
+    });
+    p << ')';
+  }
+
+  p << ' ';
+  p.printRegion(getRegion(), /*printEntryBlockArgs=*/false);
+  p.printOptionalAttrDict(
+      (*this)->getAttrs(),
+      /*elidedAttrs=*/{DoConcurrentLoopOp::getOperandSegmentSizeAttr(),
+                       DoConcurrentLoopOp::getReduceAttrsAttrName()});
+}
+
+llvm::SmallVector<mlir::Region *> fir::DoConcurrentLoopOp::getLoopRegions() {
+  return {&getRegion()};
+}
+
+llvm::LogicalResult fir::DoConcurrentLoopOp::verify() {
+  mlir::Operation::operand_range lbValues = getLowerBound();
+  mlir::Operation::operand_range ubValues = getUpperBound();
+  mlir::Operation::operand_range stepValues = getStep();
+
+  if (lbValues.empty())
+    return emitOpError(
+        "needs at least one tuple element for lowerBound, upperBound and step");
+
+  if (lbValues.size() != ubValues.size() ||
+      ubValues.size() != stepValues.size())
+    return emitOpError("different number of tuple elements for lowerBound, "
+                       "upperBound or step");
+
+  // Check that the body defines the same number of block arguments as the
+  // number of tuple elements in step.
+  mlir::Block *body = getBody();
+  if (body->getNumArguments() != stepValues.size())
+    return emitOpError() << "expects the same number of induction variables: "
+                         << body->getNumArguments()
+                         << " as bound and step values: " << stepValues.size();
+  for (auto arg : body->getArguments())
+    if (!arg.getType().isIndex())
+      return emitOpError(
+          "expects arguments for the induction variable to be of index type");
+
+  auto reduceAttrs = getReduceAttrsAttr();
+  if (getNumReduceOperands() != (reduceAttrs ? reduceAttrs.size() : 0))
+    return emitOpError(
+        "mismatch in number of reduction variables and reduction attributes");
+
+  return mlir::success();
+}
+
 //===----------------------------------------------------------------------===//
 // FIROpsDialect
 //===----------------------------------------------------------------------===//
diff --git a/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp b/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp
index 4c19ea414aabb..8851a3a7187b9 100644
--- a/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp
+++ b/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp
@@ -12,7 +12,6 @@
 
 #include "flang/Optimizer/HLFIR/HLFIROps.h"
 
-#include "flang/Optimizer/Dialect/FIRAttr.h"
 #include "flang/Optimizer/Dialect/FIROpsSupport.h"
 #include "flang/Optimizer/Dialect/FIRType.h"
 #include "flang/Optimizer/Dialect/Support/FIRContext.h"
@@ -2247,168 +2246,6 @@ llvm::LogicalResult hlfir::EvaluateInMemoryOp::verify() {
   return mlir::success();
 }
 
-//===----------------------------------------------------------------------===//
-// DoConcurrentOp
-//===----------------------------------------------------------------------===//
-
-llvm::LogicalResult hlfir::DoConcurrentOp::verify() {
-  mlir::Block *body = getBody();
-
-  if (body->empty())
-    return emitOpError("body cannot be empty");
-
-  if (!body->mightHaveTerminator() ||
-      !mlir::isa<hlfir::DoConcurrentLoopOp>(body->getTerminator()))
-    return emitOpError("must be terminated by 'hlfir.do_concurrent.loop'");
-
-  return mlir::success();
-}
-
-//===----------------------------------------------------------------------===//
-// DoConcurrentLoopOp
-//===----------------------------------------------------------------------===//
-
-mlir::ParseResult
-hlfir::DoConcurrentLoopOp::parse(mlir::OpAsmParser &parser,
-                                 mlir::OperationState &result) {
-  auto &builder = parser.getBuilder();
-  // Parse an opening `(` followed by induction variables followed by `)`
-  llvm::SmallVector<mlir::OpAsmParser::Argument, 4> ivs;
-  if (parser.parseArgumentList(ivs, mlir::OpAsmParser::Delimiter::Paren))
-    return mlir::failure();
-
-  // Parse loop bounds.
-  llvm::SmallVector<mlir::OpAsmParser::UnresolvedOperand, 4> lower;
-  if (parser.parseEqual() ||
-      parser.parseOperandList(lower, ivs.size(),
-                              mlir::OpAsmParser::Delimiter::Paren) ||
-      parser.resolveOperands(lower, builder.getIndexType(), result.operands))
-    return mlir::failure();
-
-  llvm::SmallVector<mlir::OpAsmParser::UnresolvedOperand, 4> upper;
-  if (parser.parseKeyword("to") ||
-      parser.parseOperandList(upper, ivs.size(),
-                              mlir::OpAsmParser::Delimiter::Paren) ||
-      parser.resolveOperands(upper, builder.getIndexType(), result.operands))
-    return mlir::failure();
-
-  // Parse step values.
-  llvm::SmallVector<mlir::OpAsmParser::UnresolvedOperand, 4> steps;
-  if (parser.parseKeyword("step") ||
-      parser.parseOperandList(steps, ivs.size(),
-                              mlir::OpAsmParser::Delimiter::Paren) ||
-      parser.resolveOperands(steps, builder.getIndexType(), result.operands))
-    return mlir::failure();
-
-  llvm::SmallVector<mlir::OpAsmParser::UnresolvedOperand> reduceOperands;
-  llvm::SmallVector<mlir::Type> reduceArgTypes;
-  if (succeeded(parser.parseOptionalKeyword("reduce"))) {
-    // Parse reduction attributes and variables.
-    llvm::SmallVector<fir::ReduceAttr> attributes;
-    if (failed(parser.parseCommaSeparatedList(
-            mlir::AsmParser::Delimiter::Paren, [&]() {
-              if (parser.parseAttribute(attributes.emplace_back()) ||
-                  parser.parseArrow() ||
-                  parser.parseOperand(reduceOperands.emplace_back()) ||
-                  parser.parseColonType(reduceArgTypes.emplace_back()))
-                return mlir::failure();
-              return mlir::success();
-            })))
-      return mlir::failure();
-    // Resolve input operands.
-    for (auto operand_type : llvm::zip(reduceOperands, reduceArgTypes))
-      if (parser.resolveOperand(std::get<0>(operand_type),
-                                std::get<1>(operand_type), result.operands))
-        return mlir::failure();
-    llvm::SmallVector<mlir::Attribute> arrayAttr(attributes.begin(),
-                                                 attributes.end());
-    result.addAttribute(getReduceAttrsAttrName(result.name),
-                        builder.getArrayAttr(arrayAttr));
-  }
-
-  // Now parse the body.
-  mlir::Region *body = result.addRegion();
-  for (auto &iv : ivs)
-    iv.type = builder.getIndexType();
-  if (parser.parseRegion(*body, ivs))
-    return mlir::failure();
-
-  // Set `operandSegmentSizes` attribute.
-  result.addAttribute(DoConcurrentLoopOp::getOperandSegmentSizeAttr(),
-                      builder.getDenseI32ArrayAttr(
-                          {static_cast<int32_t>(lower.size()),
-                           static_cast<int32_t>(upper.size()),
-                           static_cast<int32_t>(steps.size()),
-                           static_cast<int32_t>(reduceOperands.size())}));
-
-  // Parse attributes.
-  if (parser.parseOptionalAttrDict(result.attributes))
-    return mlir::failure();
-
-  return mlir::success();
-}
-
-void hlfir::DoConcurrentLoopOp::print(mlir::OpAsmPrinter &p) {
-  p << " (" << getBody()->getArguments() << ") = (" << getLowerBound()
-    << ") to (" << getUpperBound() << ") step (" << getStep() << ")";
-
-  if (!getReduceOperands().empty()) {
-    p << " reduce(";
-    auto attrs = getReduceAttrsAttr();
-    auto operands = getReduceOperands();
-    llvm::interleaveComma(llvm::zip(attrs, operands), p, [&](auto it) {
-      p << std::get<0>(it) << " -> " << std::get<1>(it) << " : "
-        << std::get<1>(it).getType();
-    });
-    p << ')';
-  }
-
-  p << ' ';
-  p.printRegion(getRegion(), /*printEntryBlockArgs=*/false);
-  p.printOptionalAttrDict(
-      (*this)->getAttrs(),
-      /*elidedAttrs=*/{DoConcurrentLoopOp::getOperandSegmentSizeAttr(),
-                       DoConcurrentLoopOp::getReduceAttrsAttrName()});
-}
-
-llvm::SmallVector<mlir::Region *> hlfir::DoConcurrentLoopOp::getLoopRegions() {
-  return {&getRegion()};
-}
-
-llvm::LogicalResult hlfir::DoConcurrentLoopOp::verify() {
-  mlir::Operation::operand_range lbValues = getLowerBound();
-  mlir::Operation::operand_range ubValues = getUpperBound();
-  mlir::Operation::operand_range stepValues = getStep();
-
-  if (lbValues.empty())
-    return emitOpError(
-        "needs at least one tuple element for lowerBound, upperBound and step");
-
-  if (lbValues.size() != ubValues.size() ||
-      ubValues.size() != stepValues.size())
-    return emitOpError("different number of tuple elements for lowerBound, "
-                       "upperBound or step");
-
-  // Check that the body defines the same number of block arguments as the
-  // number of tuple elements in step.
-  mlir::Block *body = getBody();
-  if (body->getNumArguments() != stepValues.size())
-    return emitOpError() << "expects the same number of induction variables: "
-                         << body->getNumArguments()
-                         << " as bound and step values: " << stepValues.size();
-  for (auto arg : body->getArguments())
-    if (!arg.getType().isIndex())
-      return emitOpError(
-          "expects arguments for the induction variable to be of index type");
-
-  auto reduceAttrs = getReduceAttrsAttr();
-  if (getNumReduceOperands() != (reduceAttrs ? reduceAttrs.size() : 0))
-    return emitOpError(
-        "mismatch in number of reduction variables and reduction attributes");
-
-  return mlir::success();
-}
-
 #include "flang/Optimizer/HLFIR/HLFIROpInterfaces.cpp.inc"
 #define GET_OP_CLASSES
 #include "flang/Optimizer/HLFIR/HLFIREnums.cpp.inc"
diff --git a/flang/test/HLFIR/do_concurrent.fir b/flang/test/Fir/do_concurrent.fir
similarity index 84%
rename from flang/test/HLFIR/do_concurrent.fir
rename to flang/test/Fir/do_concurrent.fir
index aef9db2236a57..8e80ffb9c7b0b 100644
--- a/flang/test/HLFIR/do_concurrent.fir
+++ b/flang/test/Fir/do_concurrent.fir
@@ -1,11 +1,11 @@
-// Test hlfir.do_concurrent operation parse, verify (no errors), and unparse
+// Test fir.do_concurrent operation parse, verify (no errors), and unparse
 
 // RUN: fir-opt %s | fir-opt | FileCheck %s
 
 func.func @dc_1d(%i_lb: index, %i_ub: index, %i_st: index) {
-  hlfir.do_concurrent {
+  fir.do_concurrent {
     %i = fir.alloca i32
-    hlfir.do_concurrent.loop (%i_iv) = (%i_lb) to (%i_ub) step (%i_st) {
+    fir.do_concurrent.loop (%i_iv) = (%i_lb) to (%i_ub) step (%i_st) {
       %0 = fir.convert %i_iv : (index) -> i32
       fir.store %0 to %i : !fir.ref<i32>
     }
@@ -15,9 +15,9 @@ func.func @dc_1d(%i_lb: index, %i_ub: index, %i_st: index) {
 
 // CHECK-LABEL: func.func @dc_1d
 // CHECK-SAME:    (%[[I_LB:.*]]: index, %[[I_UB:.*]]: index, %[[I_ST:.*]]: index)
-// CHECK:         hlfir.do_concurrent {
+// CHECK:         fir.do_concurrent {
 // CHECK:           %[[I:.*]] = fir.alloca i32
-// CHECK:           hlfir.do_concurrent.loop (%[[I_IV:.*]]) = (%[[I_LB]]) to (%[[I_UB]]) step (%[[I_ST]]) {
+// CHECK:           fir.do_concurrent.loop (%[[I_IV:.*]]) = (%[[I_LB]]) to (%[[I_UB]]) step (%[[I_ST]]) {
 // CHECK:             %[[I_IV_CVT:.*]] = fir.convert %[[I_IV]] : (index) -> i32
 // CHECK:             fir.store %[[I_IV_CVT]] to %[[I]] : !fir.ref<i32>
 // CHECK:           }
@@ -25,10 +25,10 @@ func.func @dc_1d(%i_lb: index, %i_ub: index, %i_st: index) {
 
 func.func @dc_2d(%i_lb: index, %i_ub: index, %i_st: index,
                  %j_lb: index, %j_ub: index, %j_st: index) {
-  hlfir.do_concurrent {
+  fir.do_concurrent {
     %i = fir.alloca i32
     %j = fir.alloca i32
-    hlfir.do_concurrent.loop
+    fir.do_concurrent.loop
       (%i_iv, %j_iv) = (%i_lb, %j_lb) to (%i_ub, %j_ub) step (%i_st, %j_st) {
       %0 = fir.convert %i_iv : (index) -> i32
       fir.store %0 to %i : !fir.ref<i32>
@@ -42,10 +42,10 @@ func.func @dc_2d(%i_lb: index, %i_ub: index, %i_st: index,
 
 // CHECK-LABEL: func.func @dc_2d
 // CHECK-SAME:    (%[[I_LB:.*]]: index, %[[I_UB:.*]]: index, %[[I_ST:.*]]: index, %[[J_LB:.*]]: index, %[[J_UB:.*]]: index, %[[J_ST:.*]]: index)
-// CHECK:         hlfir.do_concurrent {
+// CHECK:         fir.do_concurrent {
 // CHECK:           %[[I:.*]] = fir.alloca i32
 // CHECK:           %[[J:.*]] = fir.alloca i32
-// CHECK:           hlfir.do_concurrent.loop
+// CHECK:           fir.do_concurrent.loop
 // CHECK-SAME:        (%[[I_IV:.*]], %[[J_IV:.*]]) = (%[[I_LB]], %[[J_LB]]) to (%[[I_UB]], %[[J_UB]]) step (%[[I_ST]], %[[J_ST]]) {
 // CHECK:             %[[I_IV_CVT:.*]] = fir.convert %[[I_IV]] : (index) -> i32
 // CHECK:             fir.store %[[I_IV_CVT]] to %[[I]] : !fir.ref<i32>
@@ -58,10 +58,10 @@ func.func @dc_2d_reduction(%i_lb: index, %i_ub: index, %i_st: index,
                  %j_lb: index, %j_ub: index, %j_st: index) {
   %sum = fir.alloca i32
 
-  hlfir.do_concurrent {
+  fir.do_concurrent {
     %i = fir.alloca i32
     %j = fir.alloca i32
-    hlfir.do_concurrent.loop
+    fir.do_concurrent.loop
       (%i_iv, %j_iv) = (%i_lb, %j_lb) to (%i_ub, %j_ub) step (%i_st, %j_st) 
       reduce(#fir.reduce_attr<add> -> %sum : !fir.ref<i32>) {
       %0 = fir.convert %i_iv : (index) -> i32
@@ -79,10 +79,10 @@ func.func @dc_2d_reduction(%i_lb: index, %i_ub: index, %i_st: index,
 
 // CHECK:         %[[SUM:.*]] = fir.alloca i32
 
-// CHECK:         hlfir.do_concurrent {
+// CHECK:         fir.do_concurrent {
 // CHECK:           %[[I:.*]] = fir.alloca i32
 // CHECK:           %[[J:.*]] = fir.alloca i32
-// CHECK:           hlfir.do_concurrent.loop
+// CHECK:           fir.do_concurrent.loop
 // CHECK-SAME:        (%[[I_IV:.*]], %[[J_IV:.*]]) = (%[[I_LB]], %[[J_LB]]) to (%[[I_UB]], %[[J_UB]]) step (%[[I_ST]], %[[J_ST]]) reduce(#fir.reduce_attr<add> -> %[[SUM]] : !fir.ref<i32>) {
 // CHECK:             %[[I_IV_CVT:.*]] = fir.convert %[[I_IV]] : (index) -> i32
 // CHECK:             fir.store %[[I_IV_CVT]] to %[[I]] : !fir.ref<i32>
diff --git a/flang/test/Fir/invalid.fir b/flang/test/Fir/invalid.fir
index d5db644eeddb2..88906890a9237 100644
--- a/flang/test/Fir/invalid.fir
+++ b/flang/test/Fir/invalid.fir
@@ -1162,3 +1162,98 @@ func.func @bad_box_total_elements(%arg0: !fir.ref<!fir.box<!fir.array<?xi32>>>)
   %0 = fir.box_total_elements %arg0 : (!fir.ref<!fir.box<!fir.array<?xi32>>>) -> i32
   return %0 : i32
 }
+
+// -----
+
+func.func @empty_dc_wrapper_body() {
+  // expected-error@+1 {{'fir.do_concurrent' op expects a non-empty block}}
+  fir.do_concurrent {
+  }
+  return
+}
+
+// -----
+
+func.func @dc_wrong_terminator() {
+  // expected-error@+1 {{'fir.do_concurrent' op must be terminated by 'fir.do_concurrent.loop'}}
+  fir.do_concurrent {
+    llvm.return
+  }
+  return
+}
+
+// -----
+
+func.func @dc_0d() {
+   // expected-error@+2 {{'fir.do_concurrent.loop' op needs at least one tuple element for lowerBound, upperBound and step}}
+  fir.do_concurrent {
+    fir.do_concurrent.loop () = () to () step () {
+      %tmp = fir.alloca i32
+    }
+  }
+  return
+}
+
+// -----
+
+func.func @dc_invalid_parent(%arg0: index, %arg1: index) {
+  // expected-error@+1 {{'fir.do_concurrent.loop' op expects parent op 'fir.do_concurrent'}}
+  "fir.do_concurrent.loop"(%arg0, %arg1) <{operandSegmentSizes = array<i32: 1, 1, 0, 0>}> ({
+  ^bb0(%arg2: index):
+     %tmp = "fir.alloca"() <{in_type = i32, operandSegmentSizes = array<i32: 0, 0>}> : () -> !fir.ref<i32>
+  }) : (index, index) -> ()
+  return
+}
+
+// -----
+
+func.func @dc_invalid_control(%arg0: index, %arg1: index) {
+  // expected-error@+2 {{'fir.do_concurrent.loop' op different number of tuple elements for lowerBound, upperBound or step}}
+  fir.do_concurrent {
+    "fir.do_concurrent.loop"(%arg0, %arg1) <{operandSegmentSizes = array<i32: 1, 1, 0, 0>}> ({
+    ^bb0(%arg2: index):
+      %tmp = "fir.alloca"() <{in_type = i32, operandSegmentSizes = array<i32: 0, 0>}> : () -> !fir.ref<i32>
+    }) : (index, index) -> ()
+  }
+  return
+}
+
+// -----
+
+func.func @dc_invalid_ind_var(%arg0: index, %arg1: index) {
+  // expected-error@+2 {{'fir.do_concurrent.loop' op expects the same number of induction variables: 2 as bound and step values: 1}}
+  fir.do_concurrent {
+    "fir.do_concurrent.loop"(%arg0, %arg1, %arg0) <{operandSegmentSizes = array<i32: 1, 1, 1, 0>}> ({
+    ^bb0(%arg3: index, %arg4: index):
+      %tmp = "fir.alloca"() <{in_type = i32, operandSegmentSizes = array<i32: 0, 0>}> : () -> !fir.ref<i32>
+    }) : (index, index, index) -> ()
+  }
+  return
+}
+
+// -----
+
+func.func @dc_invalid_ind_var_type(%arg0: index, %arg1: index) {
+  // expected-error@+2 {{'fir.do_concurrent.loop' op expects arguments for the induction variable to be of index type}}
+  fir.do_concurrent {
+    "fir.do_concurrent.loop"(%arg0, %arg1, %arg0) <{operandSegmentSizes = array<i32: 1, 1, 1, 0>}> ({
+    ^bb0(%arg3: i32):
+      %tmp = "fir.alloca"() <{in_type = i32, operandSegmentSizes = array<i32: 0, 0>}> : () -> !fir.ref<i32>
+    }) : (index, index, index) -> ()
+  }
+  return
+}
+
+// -----
+
+func.func @dc_invalid_reduction(%arg0: index, %arg1: index) {
+  %sum = fir.alloca i32
+  // expected-error@+2 {{'fir.do_concurrent.loop' op mismatch in number of reduction variables and reduction attributes}}
+  fir.do_concurrent {
+    "fir.do_concurrent.loop"(%arg0, %arg1, %arg0, %sum) <{operandSegmentSizes = array<i32: 1, 1, 1, 1>}> ({
+    ^bb0(%arg3: index):
+      %tmp = "fir.alloca"() <{in_type = i32, operandSegmentSizes = array<i32: 0, 0>}> : () -> !fir.ref<i32>
+    }) : (index, index, index, !fir.ref<i32>) -> ()
+  }
+  return
+}
diff --git a/flang/test/HLFIR/invalid.fir b/flang/test/HLFIR/invalid.fir
index e14284f916bd9..d61efe0062e69 100644
--- a/flang/test/HLFIR/invalid.fir
+++ b/flang/test/HLFIR/invalid.fir
@@ -1555,98 +1555,3 @@ func.func @bad_reshape(%arg0: !hlfir.expr<1x!fir.char<1,2>>, %arg1: !hlfir.expr<
   %0 = hlfir.reshape %arg0 %arg1 pad %arg2 : (!hlfir.expr<1x!fir.char<1,2>>, !hlfir.expr<1xi32>, !hlfir.expr<1x!fir.char<2,?>>) -> !hlfir.expr<?x!fir.char<1,?>>
   return
 }
-
-// -----
-
-func.func @empty_dc_wrapper_body() {
-  // expected-error@+1 {{'hlfir.do_concurrent' op expects a non-empty block}}
-  hlfir.do_concurrent {
-  }
-  return
-}
-
-// -----
-
-func.func @dc_wrong_terminator() {
-  // expected-error@+1 {{'hlfir.do_concurrent' op must be terminated by 'hlfir.do_concurrent.loop'}}
-  hlfir.do_concurrent {
-    llvm.return
-  }
-  return
-}
-
-// -----
-
-func.func @dc_0d() {
-   // expected-error@+2 {{'hlfir.do_concurrent.loop' op needs at least one tuple element for lowerBound, upperBound and step}}
-  hlfir.do_concurrent {
-    hlfir.do_concurrent.loop () = () to () step () {
-      %tmp = fir.alloca i32
-    }
-  }
-  return
-}
-
-// -----
-
-func.func @dc_invalid_parent(%arg0: index, %arg1: index) {
-  // expected-error@+1 {{'hlfir.do_concurrent.loop' op expects parent op 'hlfir.do_concurrent'}}
-  "hlfir.do_concurrent.loop"(%arg0, %arg1) <{operandSegmentSizes = array<i32: 1, 1, 0, 0>}> ({
-  ^bb0(%arg2: index):
-     %tmp = "fir.alloca"() <{in_type = i32, operandSegmentSizes = array<i32: 0, 0>}> : () -> !fir.ref<i32>
-  }) : (index, index) -> ()
-  return
-}
-
-// -----
-
-func.func @dc_invalid_control(%arg0: index, %arg1: index) {
-  // expected-error@+2 {{'hlfir.do_concurrent.loop' op different number of tuple elements for lowerBound, upperBound or step}}
-  hlfir.do_concurrent {
-    "hlfir.do_concurrent.loop"(%arg0, %arg1) <{operandSegmentSizes = array<i32: 1, 1, 0, 0>}> ({
-    ^bb0(%arg2: index):
-      %tmp = "fir.alloca"() <{in_type = i32, operandSegmentSizes = array<i32: 0, 0>}> : () -> !fir.ref<i32>
-    }) : (index, index) -> ()
-  }
-  return
-}
-
-// -----
-
-func.func @dc_invalid_ind_var(%arg0: index, %arg1: index) {
-  // expected-error@+2 {{'hlfir.do_concurrent.loop' op expects the same number of induction variables: 2 as bound and step values: 1}}
-  hlfir.do_concurrent {
-    "hlfir.do_concurrent.loop"(%arg0, %arg1, %arg0) <{operandSegmentSizes = array<i32: 1, 1, 1, 0>}> ({
-    ^bb0(%arg3: index, %arg4: index):
-      %tmp = "fir.alloca"() <{in_type = i32, operandSegmentSizes = array<i32: 0, 0>}> : () -> !fir.ref<i32>
-    }) : (index, index, index) -> ()
-  }
-  return
-}
-
-// -----
-
-func.func @dc_invalid_ind_var_type(%arg0: index, %arg1: index) {
-  // expected-error@+2 {{'hlfir.do_concurrent.loop' op expects arguments for the induction variable to be of index type}}
-  hlfir.do_concurrent {
-    "hlfir.do_concurrent.loop"(%arg0, %arg1, %arg0) <{operandSegmentSizes = array<i32: 1, 1, 1, 0>}> ({
-    ^bb0(%arg3: i32):
-      %tmp = "fir.alloca"() <{in_type = i32, operandSegmentSizes = array<i32: 0, 0>}> : () -> !fir.ref<i32>
-    }) : (index, index, index) -> ()
-  }
-  return
-}
-
-// -----
-
-func.func @dc_invalid_reduction(%arg0: index, %arg1: index) {
-  %sum = fir.alloca i32
-  // expected-error@+2 {{'hlfir.do_concurrent.loop' op mismatch in number of reduction variables and reduction attributes}}
-  hlfir.do_concurrent {
-    "hlfir.do_concurrent.loop"(%arg0, %arg1, %arg0, %sum) <{operandSegmentSizes = array<i32: 1, 1, 1, 1>}> ({
-    ^bb0(%arg3: index):
-      %tmp = "fir.alloca"() <{in_type = i32, operandSegmentSizes = array<i32: 0, 0>}> : () -> !fir.ref<i32>
-    }) : (index, index, index, !fir.ref<i32>) -> ()
-  }
-  return
-}