[flang-commits] [flang] [flang] Add reductions for CUF Kernels: Lowering (PR #95184)
Iman Hosseini via flang-commits
flang-commits at lists.llvm.org
Tue Jun 11 17:09:12 PDT 2024
https://github.com/ImanHosseini created https://github.com/llvm/llvm-project/pull/95184
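* Add `reduceOperands` and `reduceAttrs` to the CUF dialect's KernelOp (`cuf.kernel`).
* Parsing already works and the parse tree already carries the reduction information; this patch makes the Bridge emit the updated KernelOp with that reduction information attached.
* Check in the verifier that the number of reduce attributes equals the number of reduce operands (|reduceAttrs| = |reduceOperands|).
* Add a lowering test (flang/test/Lower/cuf_kernel_do_reduction.f90).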
@clementval @vzakhari
>From d19e4cbb86e1f979de67a644e0de0850b5c02fce Mon Sep 17 00:00:00 2001
From: Iman Hosseini <imanh at nvidia.com>
Date: Tue, 11 Jun 2024 17:05:40 -0700
Subject: [PATCH] Support reductions in CUF Kernels: Lowering
---
.../flang/Optimizer/Dialect/CUF/CUFOps.td | 23 +++++++++++-
flang/lib/Lower/Bridge.cpp | 34 ++++++++++++++++-
flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp | 4 +-
flang/test/Lower/cuf_kernel_do_reduction.f90 | 37 +++++++++++++++++++
4 files changed, 94 insertions(+), 4 deletions(-)
create mode 100644 flang/test/Lower/cuf_kernel_do_reduction.f90
diff --git a/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td b/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td
index 37b8da0181955..5c27b2e7f2938 100644
--- a/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td
+++ b/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td
@@ -17,6 +17,7 @@
include "flang/Optimizer/Dialect/CUF/CUFDialect.td"
include "flang/Optimizer/Dialect/CUF/Attributes/CUFAttr.td"
include "flang/Optimizer/Dialect/FIRTypes.td"
+include "flang/Optimizer/Dialect/FIRAttr.td"
include "mlir/Interfaces/LoopLikeInterface.td"
include "mlir/IR/BuiltinAttributes.td"
@@ -249,7 +250,9 @@ def cuf_KernelOp : cuf_Op<"kernel", [AttrSizedOperandSegments,
Variadic<Index>:$lowerbound,
Variadic<Index>:$upperbound,
Variadic<Index>:$step,
- OptionalAttr<I64Attr>:$n
+ OptionalAttr<I64Attr>:$n,
+ Variadic<AnyType>:$reduceOperands,
+ OptionalAttr<ArrayAttr>:$reduceAttrs
);
let regions = (region AnyRegion:$region);
@@ -260,9 +263,27 @@ def cuf_KernelOp : cuf_Op<"kernel", [AttrSizedOperandSegments,
( `,` `stream` `=` $stream^ )? `>` `>` `>`
custom<CUFKernelLoopControl>($region, $lowerbound, type($lowerbound),
$upperbound, type($upperbound), $step, type($step))
+ `reduce_oprnds` $reduceOperands `:` type($reduceOperands)
attr-dict
}];
+  let extraClassDeclaration = [{
+    /// Size of operand segment `idx`, read from the operand segment sizes attribute.
+    unsigned getNumOperands(unsigned idx) {
+      auto segments = (*this)->getAttrOfType<mlir::DenseI32ArrayAttr>(
+        getOperandSegmentSizeAttr());
+      return static_cast<unsigned>(segments[idx]);
+    }
+    /// Number of reduction operands; uses the named accessor, not a hard-coded segment index.
+    unsigned getNumReduceOperands() {
+      return static_cast<unsigned>(getReduceOperands().size());
+    }
+    /// Does the operation hold operands for reduction variables
+    bool hasReduceOperands() {
+      return getNumReduceOperands() > 0;
+    }
+  }];
+
let hasVerifier = 1;
}
diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp
index 202efa57d4a36..27c80bf3788b3 100644
--- a/flang/lib/Lower/Bridge.cpp
+++ b/flang/lib/Lower/Bridge.cpp
@@ -2669,6 +2669,35 @@ class FirConverter : public Fortran::lower::AbstractConverter {
std::get<2>(dir.t);
const std::optional<Fortran::parser::ScalarIntExpr> &stream =
std::get<3>(dir.t);
+ const std::list<Fortran::parser::CUFReduction> &cufreds =
+ std::get<4>(dir.t);
+
+ llvm::SmallVector<mlir::Value> reduceOperands;
+ llvm::SmallVector<mlir::Attribute> reduceAttrs;
+
+ for (const Fortran::parser::CUFReduction &cufred : cufreds) {
+ fir::ReduceOperationEnum redOpEnum = getReduceOperationEnum(
+ std::get<Fortran::parser::ReductionOperator>(cufred.t));
+ const std::list<Fortran::parser::Scalar<Fortran::parser::Variable>>
+ &scalarvars = std::get<1>(cufred.t);
+ for (const Fortran::parser::Scalar<Fortran::parser::Variable> &scalarvar :
+ scalarvars) {
+ auto reduce_attr =
+ fir::ReduceAttr::get(builder->getContext(), redOpEnum);
+ reduceAttrs.push_back(reduce_attr);
+ const Fortran::parser::Variable &var = scalarvar.thing;
+ if (const auto *iDesignator = std::get_if<
+ Fortran::common::Indirection<Fortran::parser::Designator>>(
+ &var.u)) {
+ const Fortran::parser::Designator &designator = iDesignator->value();
+ if (const auto *name =
+ Fortran::semantics::getDesignatorNameIfDataRef(designator)) {
+ auto val = getSymbolAddress(*name->symbol);
+ reduceOperands.push_back(val);
+ }
+ }
+ }
+ }
auto isOnlyStars =
[&](const std::list<Fortran::parser::CUFKernelDoConstruct::StarOrExpr>
@@ -2771,8 +2800,9 @@ class FirConverter : public Fortran::lower::AbstractConverter {
loopEval = &*std::next(loopEval->getNestedEvaluations().begin());
}
- auto op = builder->create<cuf::KernelOp>(loc, gridValues, blockValues,
- streamValue, lbs, ubs, steps, n);
+ auto op = builder->create<cuf::KernelOp>(
+ loc, gridValues, blockValues, streamValue, lbs, ubs, steps, n,
+ mlir::ValueRange(reduceOperands), builder->getArrayAttr(reduceAttrs));
builder->createBlock(&op.getRegion(), op.getRegion().end(), ivTypes,
ivLocs);
mlir::Block &b = op.getRegion().back();
diff --git a/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp b/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp
index 2c0c4c2cfae34..a807e21def27a 100644
--- a/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp
+++ b/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp
@@ -227,7 +227,9 @@ mlir::LogicalResult cuf::KernelOp::verify() {
getLowerbound().size() != getStep().size())
return emitOpError(
"expect same number of values in lowerbound, upperbound and step");
-
+ if (getReduceOperands().size() != getReduceAttrs()->size())
+ return emitOpError("expect same number of values in reduce operands and "
+ "reduce attributes");
return mlir::success();
}
diff --git a/flang/test/Lower/cuf_kernel_do_reduction.f90 b/flang/test/Lower/cuf_kernel_do_reduction.f90
new file mode 100644
index 0000000000000..7088c1df6cae3
--- /dev/null
+++ b/flang/test/Lower/cuf_kernel_do_reduction.f90
@@ -0,0 +1,37 @@
+! Test lowering of reduce clauses on a CUDA Fortran kernel do directive.
+! RUN: bbc -emit-fir -fcuda -o - %s | FileCheck %s
+
+module mod1
+contains
+  subroutine host_sub()
+    integer, parameter :: asize = 4
+    integer, device :: adev(asize)
+    integer :: ahost(asize)
+    integer :: q
+    integer, device :: add_reduce_var
+    integer, device :: mul_reduce_var
+    ! CHECK: %[[VAL_0:.*]] = fir.declare %{{.*}} {data_attr = #cuf.cuda<device>, uniq_name = "_QMmod1Fhost_subEadd_reduce_var"} : (!fir.ref<i32>) -> !fir.ref<i32>
+    ! CHECK: %[[VAL_1:.*]] = fir.declare %{{.*}} {data_attr = #cuf.cuda<device>, uniq_name = "_QMmod1Fhost_subEmul_reduce_var"} : (!fir.ref<i32>) -> !fir.ref<i32>
+    do i = 1, asize
+      ahost(i) = i
+    enddo
+    adev = ahost
+    add_reduce_var = 0  ! integer identity for the + reduction
+    mul_reduce_var = 1  ! integer identity for the * reduction
+    ! CHECK: } reduce_oprnds %[[VAL_0]], %[[VAL_1]] : !fir.ref<i32>, !fir.ref<i32> {reduceAttrs = [#fir.reduce_attr<add>, #fir.reduce_attr<multiply>]}
+    !$cuf kernel do <<< *, * >>> reduce(+:add_reduce_var) reduce(*:mul_reduce_var)
+    do i = 1, asize
+      add_reduce_var = add_reduce_var + adev(i)
+      mul_reduce_var = mul_reduce_var * adev(i)
+    end do
+    q = add_reduce_var  ! read back the sum reduction result
+    ahost = adev
+    print *, q
+  end
+end
+
+program test
+  use mod1
+  implicit none
+  call host_sub()
+end program test
\ No newline at end of file
More information about the flang-commits
mailing list