[flang-commits] [flang] [flang][cuda] Lower ALLOCATE for device variable (PR #88980)
via flang-commits
flang-commits at lists.llvm.org
Tue Apr 16 13:16:18 PDT 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-flang-fir-hlfir
Author: Valentin Clement (バレンタイン クレメン) (clementval)
<details>
<summary>Changes</summary>
Replace the runtime call to `AllocatableAllocate` for CUDA device variables with the newly added `fir.cuda_allocate` operation.
---
Full diff: https://github.com/llvm/llvm-project/pull/88980.diff
2 Files Affected:
- (modified) flang/lib/Lower/Allocatable.cpp (+47-10)
- (added) flang/test/Lower/CUDA/cuda-allocatable.cuf (+107)
``````````diff
diff --git a/flang/lib/Lower/Allocatable.cpp b/flang/lib/Lower/Allocatable.cpp
index 42e78fc96e4445..1d434d512d0c5c 100644
--- a/flang/lib/Lower/Allocatable.cpp
+++ b/flang/lib/Lower/Allocatable.cpp
@@ -14,6 +14,7 @@
#include "flang/Evaluate/tools.h"
#include "flang/Lower/AbstractConverter.h"
#include "flang/Lower/ConvertType.h"
+#include "flang/Lower/ConvertVariable.h"
#include "flang/Lower/IterationSpace.h"
#include "flang/Lower/Mangler.h"
#include "flang/Lower/OpenACC.h"
@@ -368,20 +369,17 @@ class AllocateStmtHelper {
[&](const Fortran::parser::AllocOpt::Mold &mold) {
moldExpr = Fortran::semantics::GetExpr(mold.v.value());
},
- [&](const Fortran::parser::AllocOpt::Stream &) {
- TODO(loc, "CUDA ALLOCATE(STREAM=)");
+ [&](const Fortran::parser::AllocOpt::Stream &stream) {
+ streamExpr = Fortran::semantics::GetExpr(stream.v.value());
},
- [&](const Fortran::parser::AllocOpt::Pinned &) {
- TODO(loc, "CUDA ALLOCATE(PINNED=)");
+ [&](const Fortran::parser::AllocOpt::Pinned &pinned) {
+ pinnedExpr = Fortran::semantics::GetExpr(pinned.v.value());
},
},
allocOption.u);
}
void lowerAllocation(const Allocation &alloc) {
- if (Fortran::semantics::HasCUDAAttr(alloc.getSymbol()))
- TODO(loc, "Allocation of variable with CUDA attributes");
-
fir::MutableBoxValue boxAddr =
genMutableBoxValue(converter, loc, alloc.getAllocObj());
@@ -456,7 +454,8 @@ class AllocateStmtHelper {
const fir::MutableBoxValue &box) {
if (!box.isDerived() && !errorManager.hasStatSpec() &&
!alloc.type.IsPolymorphic() && !alloc.hasCoarraySpec() &&
- !useAllocateRuntime && !box.isPointer()) {
+ !useAllocateRuntime && !box.isPointer() &&
+ !Fortran::semantics::HasCUDAAttr(alloc.getSymbol())) {
// Pointers must use PointerAllocate so that their deallocations
// can be validated.
genInlinedAllocation(alloc, box);
@@ -472,7 +471,12 @@ class AllocateStmtHelper {
genSetType(alloc, box, loc);
genSetDeferredLengthParameters(alloc, box);
genAllocateObjectBounds(alloc, box);
- mlir::Value stat = genRuntimeAllocate(builder, loc, box, errorManager);
+ mlir::Value stat;
+ if (!Fortran::semantics::HasCUDAAttr(alloc.getSymbol()))
+ stat = genRuntimeAllocate(builder, loc, box, errorManager);
+ else
+ stat =
+ genCudaAllocate(builder, loc, box, errorManager, alloc.getSymbol());
fir::factory::syncMutableBoxFromIRBox(builder, loc, box);
postAllocationAction(alloc);
errorManager.assignStat(builder, loc, stat);
@@ -602,7 +606,10 @@ class AllocateStmtHelper {
genSetDeferredLengthParameters(alloc, box);
genAllocateObjectBounds(alloc, box);
mlir::Value stat;
- if (isSource)
+ if (Fortran::semantics::HasCUDAAttr(alloc.getSymbol()))
+ stat =
+ genCudaAllocate(builder, loc, box, errorManager, alloc.getSymbol());
+ else if (isSource)
stat = genRuntimeAllocateSource(builder, loc, box, exv, errorManager);
else
stat = genRuntimeAllocate(builder, loc, box, errorManager);
@@ -717,6 +724,34 @@ class AllocateStmtHelper {
return nullptr;
}
+ mlir::Value genCudaAllocate(fir::FirOpBuilder &builder, mlir::Location loc,
+ const fir::MutableBoxValue &box,
+ ErrorManager &errorManager,
+ const Fortran::semantics::Symbol &sym) {
+ Fortran::lower::StatementContext stmtCtx;
+ fir::CUDADataAttributeAttr cudaAttr =
+ Fortran::lower::translateSymbolCUDADataAttribute(builder.getContext(),
+ sym);
+ mlir::Value errmsg = errMsgExpr ? errorManager.errMsgAddr : nullptr;
+ mlir::Value stream =
+ streamExpr
+ ? fir::getBase(converter.genExprValue(loc, *streamExpr, stmtCtx))
+ : nullptr;
+ mlir::Value pinned =
+ pinnedExpr
+ ? fir::getBase(converter.genExprAddr(loc, *pinnedExpr, stmtCtx))
+ : nullptr;
+ mlir::Value source = sourceExpr ? fir::getBase(sourceExv) : nullptr;
+
+ // Keep return type the same as a standard AllocatableAllocate call.
+ mlir::Type retTy = fir::runtime::getModel<int>()(builder.getContext());
+ return builder
+ .create<fir::CUDAAllocateOp>(
+ loc, retTy, box.getAddr(), errmsg, stream, pinned, source, cudaAttr,
+ errorManager.hasStatSpec() ? builder.getUnitAttr() : nullptr)
+ .getResult();
+ }
+
Fortran::lower::AbstractConverter &converter;
fir::FirOpBuilder &builder;
const Fortran::parser::AllocateStmt &stmt;
@@ -724,6 +759,8 @@ class AllocateStmtHelper {
const Fortran::lower::SomeExpr *moldExpr{nullptr};
const Fortran::lower::SomeExpr *statExpr{nullptr};
const Fortran::lower::SomeExpr *errMsgExpr{nullptr};
+ const Fortran::lower::SomeExpr *pinnedExpr{nullptr};
+ const Fortran::lower::SomeExpr *streamExpr{nullptr};
// If the allocate has a type spec, lenParams contains the
// value of the length parameters that were specified inside.
llvm::SmallVector<mlir::Value> lenParams;
diff --git a/flang/test/Lower/CUDA/cuda-allocatable.cuf b/flang/test/Lower/CUDA/cuda-allocatable.cuf
new file mode 100644
index 00000000000000..55223011e8d9e9
--- /dev/null
+++ b/flang/test/Lower/CUDA/cuda-allocatable.cuf
@@ -0,0 +1,107 @@
+! RUN: bbc -emit-hlfir -fcuda %s -o - | FileCheck %s
+
+! Test lowering of CUDA allocatable allocate/deallocate statements.
+
+subroutine sub1()
+ real, allocatable, device :: a(:)
+ allocate(a(10))
+end subroutine
+
+! CHECK-LABEL: func.func @_QPsub1()
+! CHECK: %[[BOX:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "a", uniq_name = "_QFsub1Ea"}
+! CHECK: %[[BOX_DECL:.*]]:2 = hlfir.declare %[[BOX]] {cuda_attr = #fir.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub1Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
+! CHECK: fir.call @_FortranAAllocatableSetBounds
+! CHECK: %{{.*}} = fir.cuda_allocate %[[BOX_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {cuda_attr = #fir.cuda<device>} -> i32
+
+subroutine sub2()
+ real, allocatable, managed :: a(:)
+ integer :: istat
+ allocate(a(10), stat=istat)
+end subroutine
+
+! CHECK-LABEL: func.func @_QPsub2()
+! CHECK: %[[BOX:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "a", uniq_name = "_QFsub2Ea"}
+! CHECK: %[[BOX_DECL:.*]]:2 = hlfir.declare %[[BOX]] {cuda_attr = #fir.cuda<managed>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub2Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
+! CHECK: %[[ISTAT:.*]] = fir.alloca i32 {bindc_name = "istat", uniq_name = "_QFsub2Eistat"}
+! CHECK: %[[ISTAT_DECL:.*]]:2 = hlfir.declare %[[ISTAT]] {uniq_name = "_QFsub2Eistat"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: fir.call @_FortranAAllocatableSetBounds
+! CHECK: %[[STAT:.*]] = fir.cuda_allocate %[[BOX_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {cuda_attr = #fir.cuda<managed>, hasStat} -> i32
+! CHECK: fir.store %[[STAT]] to %[[ISTAT_DECL]]#1 : !fir.ref<i32>
+
+subroutine sub3()
+ integer, allocatable, pinned :: a(:,:)
+ logical :: plog
+ allocate(a(20,30), pinned = plog)
+end subroutine
+
+! CHECK-LABEL: func.func @_QPsub3()
+! CHECK: %[[BOX:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?x?xi32>>> {bindc_name = "a", uniq_name = "_QFsub3Ea"}
+! CHECK: %[[BOX_DECL:.*]]:2 = hlfir.declare %[[BOX]] {cuda_attr = #fir.cuda<pinned>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub3Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>)
+! CHECK: %[[PLOG:.*]] = fir.alloca !fir.logical<4> {bindc_name = "plog", uniq_name = "_QFsub3Eplog"}
+! CHECK: %[[PLOG_DECL:.*]]:2 = hlfir.declare %5 {uniq_name = "_QFsub3Eplog"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK-2: fir.call @_FortranAAllocatableSetBounds
+! CHECK: %{{.*}} = fir.cuda_allocate %[[BOX_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>> pinned(%[[PLOG_DECL]]#1 : !fir.ref<!fir.logical<4>>) {cuda_attr = #fir.cuda<pinned>} -> i32
+
+subroutine sub4()
+ real, allocatable, unified :: a(:)
+ integer :: istream
+ allocate(a(10), stream=istream)
+end subroutine
+
+! CHECK-LABEL: func.func @_QPsub4()
+! CHECK: %[[BOX:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "a", uniq_name = "_QFsub4Ea"}
+! CHECK: %[[BOX_DECL:.*]]:2 = hlfir.declare %0 {cuda_attr = #fir.cuda<unified>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub4Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
+! CHECK: %[[ISTREAM:.*]] = fir.alloca i32 {bindc_name = "istream", uniq_name = "_QFsub4Eistream"}
+! CHECK: %[[ISTREAM_DECL:.*]]:2 = hlfir.declare %[[ISTREAM]] {uniq_name = "_QFsub4Eistream"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: fir.call @_FortranAAllocatableSetBounds
+! CHECK: %[[STREAM:.*]] = fir.load %[[ISTREAM_DECL]]#0 : !fir.ref<i32>
+! CHECK: %{{.*}} = fir.cuda_allocate %[[BOX_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> stream(%[[STREAM]] : i32) {cuda_attr = #fir.cuda<unified>} -> i32
+
+subroutine sub5()
+ real, allocatable, device :: a(:)
+ real, allocatable :: b(:)
+ allocate(a, source=b)
+end subroutine
+
+! CHECK-LABEL: func.func @_QPsub5()
+! CHECK: %[[BOX_A:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "a", uniq_name = "_QFsub5Ea"}
+! CHECK: %[[BOX_A_DECL:.*]]:2 = hlfir.declare %[[BOX]] {cuda_attr = #fir.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub5Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
+! CHECK: %[[BOX_B:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "b", uniq_name = "_QFsub5Eb"}
+! CHECK: %[[BOX_B_DECL:.*]]:2 = hlfir.declare %[[BOX_B]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub5Eb"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
+! CHECK: %[[LOAD_B:.*]] = fir.load %[[BOX_B_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
+! CHECK: fir.call @_FortranAAllocatableSetBounds
+! CHECK: %{{.*}} = fir.cuda_allocate %[[BOX_A_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> source(%[[LOAD_B]] : !fir.box<!fir.heap<!fir.array<?xf32>>>) {cuda_attr = #fir.cuda<device>} -> i32
+
+subroutine sub6()
+ real, allocatable, device :: a(:)
+ real, allocatable :: b(:)
+ allocate(a, mold=b)
+end subroutine
+
+! CHECK-LABEL: func.func @_QPsub6()
+! CHECK: %[[BOX_A:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "a", uniq_name = "_QFsub6Ea"}
+! CHECK: %[[BOX_A_DECL:.*]]:2 = hlfir.declare %[[BOX]] {cuda_attr = #fir.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub6Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
+! CHECK: %[[BOX_B:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "b", uniq_name = "_QFsub6Eb"}
+! CHECK: %[[BOX_B_DECL:.*]]:2 = hlfir.declare %[[BOX_B]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub6Eb"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
+! CHECK: %[[LOAD_B:.*]] = fir.load %[[BOX_B_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
+! CHECK: fir.call @_FortranAAllocatableApplyMold
+! CHECK: %{{.*}} = fir.cuda_allocate %[[BOX_A_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {cuda_attr = #fir.cuda<device>} -> i32
+
+subroutine sub7()
+ real, allocatable, device :: a(:)
+ integer :: istat
+ character(50) :: err
+ allocate(a(100), stat=istat, errmsg=err)
+end subroutine
+
+! CHECK-LABEL: func.func @_QPsub7()
+! CHECK: %[[BOX:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "a", uniq_name = "_QFsub7Ea"}
+! CHECK: %[[BOX_DECL:.*]]:2 = hlfir.declare %[[BOX]] {cuda_attr = #fir.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub7Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
+! CHECK: %[[ERR:.*]] = fir.alloca !fir.char<1,50> {bindc_name = "err", uniq_name = "_QFsub7Eerr"}
+! CHECK: %[[ERR_DECL:.*]]:2 = hlfir.declare %[[ERR]] typeparams %{{.*}} {uniq_name = "_QFsub7Eerr"} : (!fir.ref<!fir.char<1,50>>, index) -> (!fir.ref<!fir.char<1,50>>, !fir.ref<!fir.char<1,50>>)
+! CHECK: %[[ISTAT:.*]] = fir.alloca i32 {bindc_name = "istat", uniq_name = "_QFsub7Eistat"}
+! CHECK: %[[ISTAT_DECL:.*]]:2 = hlfir.declare %[[ISTAT]] {uniq_name = "_QFsub7Eistat"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[ERR_BOX:.*]] = fir.embox %[[ERR_DECL]]#1 : (!fir.ref<!fir.char<1,50>>) -> !fir.box<!fir.char<1,50>>
+! CHECK: fir.call @_FortranAAllocatableSetBounds
+! CHECK: %[[STAT:.*]] = fir.cuda_allocate %[[BOX_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> errmsg(%[[ERR_BOX]] : !fir.box<!fir.char<1,50>>) {cuda_attr = #fir.cuda<device>, hasStat} -> i32
+! CHECK: fir.store %[[STAT]] to %[[ISTAT_DECL]]#1 : !fir.ref<i32>
``````````
</details>
https://github.com/llvm/llvm-project/pull/88980
More information about the flang-commits
mailing list