[flang-commits] [flang] 1094ffc - [flang][fir] Add MLIR op for `do concurrent` (#130893)
via flang-commits
flang-commits at lists.llvm.org
Tue Mar 18 02:53:47 PDT 2025
Author: Kareem Ergawy
Date: 2025-03-18T10:53:44+01:00
New Revision: 1094ffcafbc909cf4caecaf331e97ef16250e42d
URL: https://github.com/llvm/llvm-project/commit/1094ffcafbc909cf4caecaf331e97ef16250e42d
DIFF: https://github.com/llvm/llvm-project/commit/1094ffcafbc909cf4caecaf331e97ef16250e42d.diff
LOG: [flang][fir] Add MLIR op for `do concurrent` (#130893)
Adds new MLIR ops to model `do concurrent`. In order to make `do
concurrent` representation self-contained, a loop is modeled using 2
ops, one wrapper and one that contains the actual body of the loop. For
example, a 2D `do concurrent` loop is modeled as follows:
```mlir
fir.do_concurrent {
%i = fir.alloca i32
%j = fir.alloca i32
fir.do_concurrent.loop
(%i_iv, %j_iv) = (%i_lb, %j_lb) to (%i_ub, %j_ub) step (%i_st, %j_st) {
%0 = fir.convert %i_iv : (index) -> i32
fir.store %0 to %i : !fir.ref<i32>
%1 = fir.convert %j_iv : (index) -> i32
fir.store %1 to %j : !fir.ref<i32>
}
}
```
The `fir.do_concurrent` wrapper op encapsulates both the actual loop and
the allocations required for the iteration variables. The
`fir.do_concurrent.loop` op is a multi-dimensional op that contains the
loop control and body. See the ops' docs for more info.
Added:
flang/test/Fir/do_concurrent.fir
Modified:
flang/include/flang/Optimizer/Dialect/FIROps.td
flang/lib/Optimizer/Dialect/FIROps.cpp
flang/test/Fir/invalid.fir
Removed:
################################################################################
diff --git a/flang/include/flang/Optimizer/Dialect/FIROps.td b/flang/include/flang/Optimizer/Dialect/FIROps.td
index ee9b959ba570f..c8d8ab41552c2 100644
--- a/flang/include/flang/Optimizer/Dialect/FIROps.td
+++ b/flang/include/flang/Optimizer/Dialect/FIROps.td
@@ -3446,4 +3446,109 @@ def fir_BoxTotalElementsOp
let hasCanonicalizer = 1;
}
+def fir_DoConcurrentOp : fir_Op<"do_concurrent",
+ [SingleBlock, AutomaticAllocationScope]> {
+ let summary = "do concurrent loop wrapper";
+
+ let description = [{
+ A wrapper operation for the actual op modeling `do concurrent` loops:
+ `fir.do_concurrent.loop` (see op declaration below for more info about it).
+
+ The `fir.do_concurrent` wrapper op consists of one single-block region with
+ the following properties:
+ - The first ops in the region are responsible for allocating storage for the
+ loop's iteration variables. This property is **not** enforced by the op
+ verifier, but expected to be respected when building the op.
+ - The terminator of the region is an instance of `fir.do_concurrent.loop`.
+
+ For example, a 2D loop nest would be represented as follows:
+ ```
+ fir.do_concurrent {
+ %i = fir.alloca i32
+ %j = fir.alloca i32
+ fir.do_concurrent.loop ...
+ }
+ ```
+ }];
+
+ let regions = (region SizedRegion<1>:$region);
+
+ let assemblyFormat = "$region attr-dict";
+ let hasVerifier = 1;
+}
+
+def fir_DoConcurrentLoopOp : fir_Op<"do_concurrent.loop",
+ [AttrSizedOperandSegments, DeclareOpInterfaceMethods<LoopLikeOpInterface>,
+ Terminator, NoTerminator, SingleBlock, ParentOneOf<["DoConcurrentOp"]>]> {
+ let summary = "do concurrent loop";
+
+ let description = [{
+ An operation that models a Fortran `do concurrent` loop's header and block.
+ This is a single-region single-block terminator op that is expected to
+ terminate the region of a `fir.do_concurrent` wrapper op.
+
+ This op borrows from both `scf.parallel` and `fir.do_loop` ops. Similar to
+ `scf.parallel`, a loop nest takes 3 groups of SSA values as operands that
+ represent the lower bounds, upper bounds, and steps. Similar to `fir.do_loop`
+ the op takes one additional group of SSA values to represent reductions.
+
+ The body region **does not** have a terminator.
+
+ For example, a 2D loop nest with 2 reductions (sum and max) would be
+ represented as follows:
+ ```
+ // The wrapper of the loop
+ fir.do_concurrent {
+ %i = fir.alloca i32
+ %j = fir.alloca i32
+
+ // The actual `do concurrent` loop
+ fir.do_concurrent.loop
+ (%i_iv, %j_iv) = (%i_lb, %j_lb) to (%i_ub, %j_ub) step (%i_st, %j_st)
+ reduce(#fir.reduce_attr<add> -> %sum : !fir.ref<i32>,
+ #fir.reduce_attr<max> -> %max : !fir.ref<f32>) {
+
+ %0 = fir.convert %i_iv : (index) -> i32
+ fir.store %0 to %i : !fir.ref<i32>
+
+ %1 = fir.convert %j_iv : (index) -> i32
+ fir.store %1 to %j : !fir.ref<i32>
+
+ // ... loop body goes here ...
+ }
+ }
+ ```
+
+ Description of arguments:
+ - `lowerBound`: The group of SSA values for the nest's lower bounds.
+ - `upperBound`: The group of SSA values for the nest's upper bounds.
+ - `step`: The group of SSA values for the nest's steps.
+ - `reduceOperands`: The reduction SSA values, if any.
+ - `reduceAttrs`: Attributes to store reduction operations, if any.
+ - `loopAnnotation`: Loop metadata to be passed down the compiler pipeline to
+ LLVM.
+ }];
+
+ let arguments = (ins
+ Variadic<Index>:$lowerBound,
+ Variadic<Index>:$upperBound,
+ Variadic<Index>:$step,
+ Variadic<AnyType>:$reduceOperands,
+ OptionalAttr<ArrayAttr>:$reduceAttrs,
+ OptionalAttr<LoopAnnotationAttr>:$loopAnnotation
+ );
+
+ let regions = (region SizedRegion<1>:$region);
+
+ let hasCustomAssemblyFormat = 1;
+ let hasVerifier = 1;
+
+ let extraClassDeclaration = [{
+ // Get Number of reduction operands
+ unsigned getNumReduceOperands() {
+ return getReduceOperands().size();
+ }
+ }];
+}
+
#endif
diff --git a/flang/lib/Optimizer/Dialect/FIROps.cpp b/flang/lib/Optimizer/Dialect/FIROps.cpp
index 90202f3cee588..474577b986372 100644
--- a/flang/lib/Optimizer/Dialect/FIROps.cpp
+++ b/flang/lib/Optimizer/Dialect/FIROps.cpp
@@ -4748,6 +4748,167 @@ void fir::BoxTotalElementsOp::getCanonicalizationPatterns(
patterns.add<SimplifyBoxTotalElementsOp>(context);
}
+//===----------------------------------------------------------------------===//
+// DoConcurrentOp
+//===----------------------------------------------------------------------===//
+
+llvm::LogicalResult fir::DoConcurrentOp::verify() {
+ mlir::Block *body = getBody();
+
+ if (body->empty())
+ return emitOpError("body cannot be empty");
+
+ if (!body->mightHaveTerminator() ||
+ !mlir::isa<fir::DoConcurrentLoopOp>(body->getTerminator()))
+ return emitOpError("must be terminated by 'fir.do_concurrent.loop'");
+
+ return mlir::success();
+}
+
+//===----------------------------------------------------------------------===//
+// DoConcurrentLoopOp
+//===----------------------------------------------------------------------===//
+
+mlir::ParseResult fir::DoConcurrentLoopOp::parse(mlir::OpAsmParser &parser,
+ mlir::OperationState &result) {
+ auto &builder = parser.getBuilder();
+ // Parse an opening `(` followed by induction variables followed by `)`
+ llvm::SmallVector<mlir::OpAsmParser::Argument, 4> ivs;
+ if (parser.parseArgumentList(ivs, mlir::OpAsmParser::Delimiter::Paren))
+ return mlir::failure();
+
+ // Parse loop bounds.
+ llvm::SmallVector<mlir::OpAsmParser::UnresolvedOperand, 4> lower;
+ if (parser.parseEqual() ||
+ parser.parseOperandList(lower, ivs.size(),
+ mlir::OpAsmParser::Delimiter::Paren) ||
+ parser.resolveOperands(lower, builder.getIndexType(), result.operands))
+ return mlir::failure();
+
+ llvm::SmallVector<mlir::OpAsmParser::UnresolvedOperand, 4> upper;
+ if (parser.parseKeyword("to") ||
+ parser.parseOperandList(upper, ivs.size(),
+ mlir::OpAsmParser::Delimiter::Paren) ||
+ parser.resolveOperands(upper, builder.getIndexType(), result.operands))
+ return mlir::failure();
+
+ // Parse step values.
+ llvm::SmallVector<mlir::OpAsmParser::UnresolvedOperand, 4> steps;
+ if (parser.parseKeyword("step") ||
+ parser.parseOperandList(steps, ivs.size(),
+ mlir::OpAsmParser::Delimiter::Paren) ||
+ parser.resolveOperands(steps, builder.getIndexType(), result.operands))
+ return mlir::failure();
+
+ llvm::SmallVector<mlir::OpAsmParser::UnresolvedOperand> reduceOperands;
+ llvm::SmallVector<mlir::Type> reduceArgTypes;
+ if (succeeded(parser.parseOptionalKeyword("reduce"))) {
+ // Parse reduction attributes and variables.
+ llvm::SmallVector<fir::ReduceAttr> attributes;
+ if (failed(parser.parseCommaSeparatedList(
+ mlir::AsmParser::Delimiter::Paren, [&]() {
+ if (parser.parseAttribute(attributes.emplace_back()) ||
+ parser.parseArrow() ||
+ parser.parseOperand(reduceOperands.emplace_back()) ||
+ parser.parseColonType(reduceArgTypes.emplace_back()))
+ return mlir::failure();
+ return mlir::success();
+ })))
+ return mlir::failure();
+ // Resolve input operands.
+ for (auto operand_type : llvm::zip(reduceOperands, reduceArgTypes))
+ if (parser.resolveOperand(std::get<0>(operand_type),
+ std::get<1>(operand_type), result.operands))
+ return mlir::failure();
+ llvm::SmallVector<mlir::Attribute> arrayAttr(attributes.begin(),
+ attributes.end());
+ result.addAttribute(getReduceAttrsAttrName(result.name),
+ builder.getArrayAttr(arrayAttr));
+ }
+
+ // Now parse the body.
+ mlir::Region *body = result.addRegion();
+ for (auto &iv : ivs)
+ iv.type = builder.getIndexType();
+ if (parser.parseRegion(*body, ivs))
+ return mlir::failure();
+
+ // Set `operandSegmentSizes` attribute.
+ result.addAttribute(DoConcurrentLoopOp::getOperandSegmentSizeAttr(),
+ builder.getDenseI32ArrayAttr(
+ {static_cast<int32_t>(lower.size()),
+ static_cast<int32_t>(upper.size()),
+ static_cast<int32_t>(steps.size()),
+ static_cast<int32_t>(reduceOperands.size())}));
+
+ // Parse attributes.
+ if (parser.parseOptionalAttrDict(result.attributes))
+ return mlir::failure();
+
+ return mlir::success();
+}
+
+void fir::DoConcurrentLoopOp::print(mlir::OpAsmPrinter &p) {
+ p << " (" << getBody()->getArguments() << ") = (" << getLowerBound()
+ << ") to (" << getUpperBound() << ") step (" << getStep() << ")";
+
+ if (!getReduceOperands().empty()) {
+ p << " reduce(";
+ auto attrs = getReduceAttrsAttr();
+ auto operands = getReduceOperands();
+ llvm::interleaveComma(llvm::zip(attrs, operands), p, [&](auto it) {
+ p << std::get<0>(it) << " -> " << std::get<1>(it) << " : "
+ << std::get<1>(it).getType();
+ });
+ p << ')';
+ }
+
+ p << ' ';
+ p.printRegion(getRegion(), /*printEntryBlockArgs=*/false);
+ p.printOptionalAttrDict(
+ (*this)->getAttrs(),
+ /*elidedAttrs=*/{DoConcurrentLoopOp::getOperandSegmentSizeAttr(),
+ DoConcurrentLoopOp::getReduceAttrsAttrName()});
+}
+
+llvm::SmallVector<mlir::Region *> fir::DoConcurrentLoopOp::getLoopRegions() {
+ return {&getRegion()};
+}
+
+llvm::LogicalResult fir::DoConcurrentLoopOp::verify() {
+ mlir::Operation::operand_range lbValues = getLowerBound();
+ mlir::Operation::operand_range ubValues = getUpperBound();
+ mlir::Operation::operand_range stepValues = getStep();
+
+ if (lbValues.empty())
+ return emitOpError(
+ "needs at least one tuple element for lowerBound, upperBound and step");
+
+ if (lbValues.size() != ubValues.size() ||
+ ubValues.size() != stepValues.size())
+ return emitOpError("different number of tuple elements for lowerBound, "
+ "upperBound or step");
+
+ // Check that the body defines the same number of block arguments as the
+ // number of tuple elements in step.
+ mlir::Block *body = getBody();
+ if (body->getNumArguments() != stepValues.size())
+ return emitOpError() << "expects the same number of induction variables: "
+ << body->getNumArguments()
+ << " as bound and step values: " << stepValues.size();
+ for (auto arg : body->getArguments())
+ if (!arg.getType().isIndex())
+ return emitOpError(
+ "expects arguments for the induction variable to be of index type");
+
+ auto reduceAttrs = getReduceAttrsAttr();
+ if (getNumReduceOperands() != (reduceAttrs ? reduceAttrs.size() : 0))
+ return emitOpError(
+ "mismatch in number of reduction variables and reduction attributes");
+
+ return mlir::success();
+}
+
//===----------------------------------------------------------------------===//
// FIROpsDialect
//===----------------------------------------------------------------------===//
diff --git a/flang/test/Fir/do_concurrent.fir b/flang/test/Fir/do_concurrent.fir
new file mode 100644
index 0000000000000..8e80ffb9c7b0b
--- /dev/null
+++ b/flang/test/Fir/do_concurrent.fir
@@ -0,0 +1,92 @@
+// Test fir.do_concurrent operation parse, verify (no errors), and unparse
+
+// RUN: fir-opt %s | fir-opt | FileCheck %s
+
+func.func @dc_1d(%i_lb: index, %i_ub: index, %i_st: index) {
+ fir.do_concurrent {
+ %i = fir.alloca i32
+ fir.do_concurrent.loop (%i_iv) = (%i_lb) to (%i_ub) step (%i_st) {
+ %0 = fir.convert %i_iv : (index) -> i32
+ fir.store %0 to %i : !fir.ref<i32>
+ }
+ }
+ return
+}
+
+// CHECK-LABEL: func.func @dc_1d
+// CHECK-SAME: (%[[I_LB:.*]]: index, %[[I_UB:.*]]: index, %[[I_ST:.*]]: index)
+// CHECK: fir.do_concurrent {
+// CHECK: %[[I:.*]] = fir.alloca i32
+// CHECK: fir.do_concurrent.loop (%[[I_IV:.*]]) = (%[[I_LB]]) to (%[[I_UB]]) step (%[[I_ST]]) {
+// CHECK: %[[I_IV_CVT:.*]] = fir.convert %[[I_IV]] : (index) -> i32
+// CHECK: fir.store %[[I_IV_CVT]] to %[[I]] : !fir.ref<i32>
+// CHECK: }
+// CHECK: }
+
+func.func @dc_2d(%i_lb: index, %i_ub: index, %i_st: index,
+ %j_lb: index, %j_ub: index, %j_st: index) {
+ fir.do_concurrent {
+ %i = fir.alloca i32
+ %j = fir.alloca i32
+ fir.do_concurrent.loop
+ (%i_iv, %j_iv) = (%i_lb, %j_lb) to (%i_ub, %j_ub) step (%i_st, %j_st) {
+ %0 = fir.convert %i_iv : (index) -> i32
+ fir.store %0 to %i : !fir.ref<i32>
+
+ %1 = fir.convert %j_iv : (index) -> i32
+ fir.store %1 to %j : !fir.ref<i32>
+ }
+ }
+ return
+}
+
+// CHECK-LABEL: func.func @dc_2d
+// CHECK-SAME: (%[[I_LB:.*]]: index, %[[I_UB:.*]]: index, %[[I_ST:.*]]: index, %[[J_LB:.*]]: index, %[[J_UB:.*]]: index, %[[J_ST:.*]]: index)
+// CHECK: fir.do_concurrent {
+// CHECK: %[[I:.*]] = fir.alloca i32
+// CHECK: %[[J:.*]] = fir.alloca i32
+// CHECK: fir.do_concurrent.loop
+// CHECK-SAME: (%[[I_IV:.*]], %[[J_IV:.*]]) = (%[[I_LB]], %[[J_LB]]) to (%[[I_UB]], %[[J_UB]]) step (%[[I_ST]], %[[J_ST]]) {
+// CHECK: %[[I_IV_CVT:.*]] = fir.convert %[[I_IV]] : (index) -> i32
+// CHECK: fir.store %[[I_IV_CVT]] to %[[I]] : !fir.ref<i32>
+// CHECK: %[[J_IV_CVT:.*]] = fir.convert %[[J_IV]] : (index) -> i32
+// CHECK: fir.store %[[J_IV_CVT]] to %[[J]] : !fir.ref<i32>
+// CHECK: }
+// CHECK: }
+
+func.func @dc_2d_reduction(%i_lb: index, %i_ub: index, %i_st: index,
+ %j_lb: index, %j_ub: index, %j_st: index) {
+ %sum = fir.alloca i32
+
+ fir.do_concurrent {
+ %i = fir.alloca i32
+ %j = fir.alloca i32
+ fir.do_concurrent.loop
+ (%i_iv, %j_iv) = (%i_lb, %j_lb) to (%i_ub, %j_ub) step (%i_st, %j_st)
+ reduce(#fir.reduce_attr<add> -> %sum : !fir.ref<i32>) {
+ %0 = fir.convert %i_iv : (index) -> i32
+ fir.store %0 to %i : !fir.ref<i32>
+
+ %1 = fir.convert %j_iv : (index) -> i32
+ fir.store %1 to %j : !fir.ref<i32>
+ }
+ }
+ return
+}
+
+// CHECK-LABEL: func.func @dc_2d_reduction
+// CHECK-SAME: (%[[I_LB:.*]]: index, %[[I_UB:.*]]: index, %[[I_ST:.*]]: index, %[[J_LB:.*]]: index, %[[J_UB:.*]]: index, %[[J_ST:.*]]: index)
+
+// CHECK: %[[SUM:.*]] = fir.alloca i32
+
+// CHECK: fir.do_concurrent {
+// CHECK: %[[I:.*]] = fir.alloca i32
+// CHECK: %[[J:.*]] = fir.alloca i32
+// CHECK: fir.do_concurrent.loop
+// CHECK-SAME: (%[[I_IV:.*]], %[[J_IV:.*]]) = (%[[I_LB]], %[[J_LB]]) to (%[[I_UB]], %[[J_UB]]) step (%[[I_ST]], %[[J_ST]]) reduce(#fir.reduce_attr<add> -> %[[SUM]] : !fir.ref<i32>) {
+// CHECK: %[[I_IV_CVT:.*]] = fir.convert %[[I_IV]] : (index) -> i32
+// CHECK: fir.store %[[I_IV_CVT]] to %[[I]] : !fir.ref<i32>
+// CHECK: %[[J_IV_CVT:.*]] = fir.convert %[[J_IV]] : (index) -> i32
+// CHECK: fir.store %[[J_IV_CVT]] to %[[J]] : !fir.ref<i32>
+// CHECK: }
+// CHECK: }
diff --git a/flang/test/Fir/invalid.fir b/flang/test/Fir/invalid.fir
index d5db644eeddb2..88906890a9237 100644
--- a/flang/test/Fir/invalid.fir
+++ b/flang/test/Fir/invalid.fir
@@ -1162,3 +1162,98 @@ func.func @bad_box_total_elements(%arg0: !fir.ref<!fir.box<!fir.array<?xi32>>>)
%0 = fir.box_total_elements %arg0 : (!fir.ref<!fir.box<!fir.array<?xi32>>>) -> i32
return %0 : i32
}
+
+// -----
+
+func.func @empty_dc_wrapper_body() {
+ // expected-error@+1 {{'fir.do_concurrent' op expects a non-empty block}}
+ fir.do_concurrent {
+ }
+ return
+}
+
+// -----
+
+func.func @dc_wrong_terminator() {
+ // expected-error@+1 {{'fir.do_concurrent' op must be terminated by 'fir.do_concurrent.loop'}}
+ fir.do_concurrent {
+ llvm.return
+ }
+ return
+}
+
+// -----
+
+func.func @dc_0d() {
+ // expected-error@+2 {{'fir.do_concurrent.loop' op needs at least one tuple element for lowerBound, upperBound and step}}
+ fir.do_concurrent {
+ fir.do_concurrent.loop () = () to () step () {
+ %tmp = fir.alloca i32
+ }
+ }
+ return
+}
+
+// -----
+
+func.func @dc_invalid_parent(%arg0: index, %arg1: index) {
+ // expected-error@+1 {{'fir.do_concurrent.loop' op expects parent op 'fir.do_concurrent'}}
+ "fir.do_concurrent.loop"(%arg0, %arg1) <{operandSegmentSizes = array<i32: 1, 1, 0, 0>}> ({
+ ^bb0(%arg2: index):
+ %tmp = "fir.alloca"() <{in_type = i32, operandSegmentSizes = array<i32: 0, 0>}> : () -> !fir.ref<i32>
+ }) : (index, index) -> ()
+ return
+}
+
+// -----
+
+func.func @dc_invalid_control(%arg0: index, %arg1: index) {
+ // expected-error@+2 {{'fir.do_concurrent.loop' op different number of tuple elements for lowerBound, upperBound or step}}
+ fir.do_concurrent {
+ "fir.do_concurrent.loop"(%arg0, %arg1) <{operandSegmentSizes = array<i32: 1, 1, 0, 0>}> ({
+ ^bb0(%arg2: index):
+ %tmp = "fir.alloca"() <{in_type = i32, operandSegmentSizes = array<i32: 0, 0>}> : () -> !fir.ref<i32>
+ }) : (index, index) -> ()
+ }
+ return
+}
+
+// -----
+
+func.func @dc_invalid_ind_var(%arg0: index, %arg1: index) {
+ // expected-error@+2 {{'fir.do_concurrent.loop' op expects the same number of induction variables: 2 as bound and step values: 1}}
+ fir.do_concurrent {
+ "fir.do_concurrent.loop"(%arg0, %arg1, %arg0) <{operandSegmentSizes = array<i32: 1, 1, 1, 0>}> ({
+ ^bb0(%arg3: index, %arg4: index):
+ %tmp = "fir.alloca"() <{in_type = i32, operandSegmentSizes = array<i32: 0, 0>}> : () -> !fir.ref<i32>
+ }) : (index, index, index) -> ()
+ }
+ return
+}
+
+// -----
+
+func.func @dc_invalid_ind_var_type(%arg0: index, %arg1: index) {
+ // expected-error@+2 {{'fir.do_concurrent.loop' op expects arguments for the induction variable to be of index type}}
+ fir.do_concurrent {
+ "fir.do_concurrent.loop"(%arg0, %arg1, %arg0) <{operandSegmentSizes = array<i32: 1, 1, 1, 0>}> ({
+ ^bb0(%arg3: i32):
+ %tmp = "fir.alloca"() <{in_type = i32, operandSegmentSizes = array<i32: 0, 0>}> : () -> !fir.ref<i32>
+ }) : (index, index, index) -> ()
+ }
+ return
+}
+
+// -----
+
+func.func @dc_invalid_reduction(%arg0: index, %arg1: index) {
+ %sum = fir.alloca i32
+ // expected-error@+2 {{'fir.do_concurrent.loop' op mismatch in number of reduction variables and reduction attributes}}
+ fir.do_concurrent {
+ "fir.do_concurrent.loop"(%arg0, %arg1, %arg0, %sum) <{operandSegmentSizes = array<i32: 1, 1, 1, 1>}> ({
+ ^bb0(%arg3: index):
+ %tmp = "fir.alloca"() <{in_type = i32, operandSegmentSizes = array<i32: 0, 0>}> : () -> !fir.ref<i32>
+ }) : (index, index, index, !fir.ref<i32>) -> ()
+ }
+ return
+}
More information about the flang-commits
mailing list