[flang-commits] [flang] [flang][cuda] Use fir.cuda_deallocate for automatic deallocation (PR #89662)
Valentin Clement バレンタイン クレメン via flang-commits
flang-commits at lists.llvm.org
Mon Apr 22 13:37:24 PDT 2024
https://github.com/clementval created https://github.com/llvm/llvm-project/pull/89662
Automatic deallocation of allocatable that are cuda device variable must use the fir.cuda_deallocate operation. This patch update the automatic deallocation code generation to use this operation when the variable is a cuda variable.
This patch as also the side effect to correctly call `attachDeclarePostDeallocAction` for OpenACC declare variable on automatic deallocation as well. Update the code in `attachDeclarePostDeallocAction` so we do not attach on fir.result but on the correct last op.
>From 292e84309d4da052179a9708ade28a5ba23f9102 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?=
=?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?=
=?UTF-8?q?=E3=83=B3=29?= <clementval at gmail.com>
Date: Fri, 19 Apr 2024 14:49:56 -0700
Subject: [PATCH] [flang][cuda] Use fir.cuda_deallocate for automatic
deallocation (#89450)
Automatic deallocation of allocatable that are cuda device variable must
use the fir.cuda_deallocate operation. This patch update the automatic
deallocation code generation to use this operation when the variable is
a cuda variable.
---
flang/include/flang/Lower/Allocatable.h | 4 ++-
flang/lib/Lower/Allocatable.cpp | 12 +++++----
flang/lib/Lower/ConvertVariable.cpp | 5 ++--
flang/lib/Lower/OpenACC.cpp | 29 +++++++++++----------
flang/test/Lower/CUDA/cuda-allocatable.cuf | 30 ++++++++++++++++++++++
flang/test/Lower/OpenACC/acc-declare.f90 | 5 ++++
6 files changed, 64 insertions(+), 21 deletions(-)
diff --git a/flang/include/flang/Lower/Allocatable.h b/flang/include/flang/Lower/Allocatable.h
index d3c16de377c1d7a..e8738f0407e77ff 100644
--- a/flang/include/flang/Lower/Allocatable.h
+++ b/flang/include/flang/Lower/Allocatable.h
@@ -55,12 +55,14 @@ void genDeallocateStmt(AbstractConverter &converter,
void genDeallocateBox(AbstractConverter &converter,
const fir::MutableBoxValue &box, mlir::Location loc,
+ const Fortran::semantics::Symbol *sym = nullptr,
mlir::Value declaredTypeDesc = {});
/// Deallocate an allocatable if it is allocated at the end of its lifetime.
void genDeallocateIfAllocated(AbstractConverter &converter,
const fir::MutableBoxValue &box,
- mlir::Location loc);
+ mlir::Location loc,
+ const Fortran::semantics::Symbol *sym = nullptr);
/// Create a MutableBoxValue for an allocatable or pointer entity.
/// If the variables is a local variable that is not a dummy, it will be
diff --git a/flang/lib/Lower/Allocatable.cpp b/flang/lib/Lower/Allocatable.cpp
index 38f61528d7e28ad..8e84ea2fc5d5223 100644
--- a/flang/lib/Lower/Allocatable.cpp
+++ b/flang/lib/Lower/Allocatable.cpp
@@ -859,18 +859,20 @@ genDeallocate(fir::FirOpBuilder &builder,
void Fortran::lower::genDeallocateBox(
Fortran::lower::AbstractConverter &converter,
const fir::MutableBoxValue &box, mlir::Location loc,
- mlir::Value declaredTypeDesc) {
+ const Fortran::semantics::Symbol *sym, mlir::Value declaredTypeDesc) {
const Fortran::lower::SomeExpr *statExpr = nullptr;
const Fortran::lower::SomeExpr *errMsgExpr = nullptr;
ErrorManager errorManager;
errorManager.init(converter, loc, statExpr, errMsgExpr);
fir::FirOpBuilder &builder = converter.getFirOpBuilder();
- genDeallocate(builder, converter, loc, box, errorManager, declaredTypeDesc);
+ genDeallocate(builder, converter, loc, box, errorManager, declaredTypeDesc,
+ sym);
}
void Fortran::lower::genDeallocateIfAllocated(
Fortran::lower::AbstractConverter &converter,
- const fir::MutableBoxValue &box, mlir::Location loc) {
+ const fir::MutableBoxValue &box, mlir::Location loc,
+ const Fortran::semantics::Symbol *sym) {
fir::FirOpBuilder &builder = converter.getFirOpBuilder();
mlir::Value isAllocated =
fir::factory::genIsAllocatedOrAssociatedTest(builder, loc, box);
@@ -880,9 +882,9 @@ void Fortran::lower::genDeallocateIfAllocated(
eleType.isa<fir::RecordType>() && box.isPolymorphic()) {
mlir::Value declaredTypeDesc = builder.create<fir::TypeDescOp>(
loc, mlir::TypeAttr::get(eleType));
- genDeallocateBox(converter, box, loc, declaredTypeDesc);
+ genDeallocateBox(converter, box, loc, sym, declaredTypeDesc);
} else {
- genDeallocateBox(converter, box, loc);
+ genDeallocateBox(converter, box, loc, sym);
}
})
.end();
diff --git a/flang/lib/Lower/ConvertVariable.cpp b/flang/lib/Lower/ConvertVariable.cpp
index 2d2d9eba905bdd5..c40435c0977c743 100644
--- a/flang/lib/Lower/ConvertVariable.cpp
+++ b/flang/lib/Lower/ConvertVariable.cpp
@@ -916,13 +916,14 @@ static void instantiateLocal(Fortran::lower::AbstractConverter &converter,
break;
case VariableCleanUp::Deallocate:
auto *converterPtr = &converter;
- converter.getFctCtx().attachCleanup([converterPtr, loc, exv]() {
+ auto *sym = &var.getSymbol();
+ converter.getFctCtx().attachCleanup([converterPtr, loc, exv, sym]() {
const fir::MutableBoxValue *mutableBox =
exv.getBoxOf<fir::MutableBoxValue>();
assert(mutableBox &&
"trying to deallocate entity not lowered as allocatable");
Fortran::lower::genDeallocateIfAllocated(*converterPtr, *mutableBox,
- loc);
+ loc, sym);
});
}
}
diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp
index d933c07aba0e0c0..596c00c9ddb1556 100644
--- a/flang/lib/Lower/OpenACC.cpp
+++ b/flang/lib/Lower/OpenACC.cpp
@@ -4187,21 +4187,24 @@ void Fortran::lower::attachDeclarePostDeallocAction(
std::stringstream fctName;
fctName << converter.mangleName(sym) << declarePostDeallocSuffix.str();
- mlir::Operation &op = builder.getInsertionBlock()->back();
- if (op.hasAttr(mlir::acc::getDeclareActionAttrName())) {
- auto attr = op.getAttrOfType<mlir::acc::DeclareActionAttr>(
+ mlir::Operation *op = &builder.getInsertionBlock()->back();
+ if (mlir::isa<fir::ResultOp>(*op))
+ op = op->getPrevNode();
+ assert(op && "expect operation to attach the post deallocation action");
+ if (op->hasAttr(mlir::acc::getDeclareActionAttrName())) {
+ auto attr = op->getAttrOfType<mlir::acc::DeclareActionAttr>(
mlir::acc::getDeclareActionAttrName());
- op.setAttr(mlir::acc::getDeclareActionAttrName(),
- mlir::acc::DeclareActionAttr::get(
- builder.getContext(), attr.getPreAlloc(),
- attr.getPostAlloc(), attr.getPreDealloc(),
- /*postDealloc=*/builder.getSymbolRefAttr(fctName.str())));
+ op->setAttr(mlir::acc::getDeclareActionAttrName(),
+ mlir::acc::DeclareActionAttr::get(
+ builder.getContext(), attr.getPreAlloc(),
+ attr.getPostAlloc(), attr.getPreDealloc(),
+ /*postDealloc=*/builder.getSymbolRefAttr(fctName.str())));
} else {
- op.setAttr(mlir::acc::getDeclareActionAttrName(),
- mlir::acc::DeclareActionAttr::get(
- builder.getContext(),
- /*preAlloc=*/{}, /*postAlloc=*/{}, /*preDealloc=*/{},
- /*postDealloc=*/builder.getSymbolRefAttr(fctName.str())));
+ op->setAttr(mlir::acc::getDeclareActionAttrName(),
+ mlir::acc::DeclareActionAttr::get(
+ builder.getContext(),
+ /*preAlloc=*/{}, /*postAlloc=*/{}, /*preDealloc=*/{},
+ /*postDealloc=*/builder.getSymbolRefAttr(fctName.str())));
}
}
diff --git a/flang/test/Lower/CUDA/cuda-allocatable.cuf b/flang/test/Lower/CUDA/cuda-allocatable.cuf
index 251ff16a56c797c..eff5f13669e904c 100644
--- a/flang/test/Lower/CUDA/cuda-allocatable.cuf
+++ b/flang/test/Lower/CUDA/cuda-allocatable.cuf
@@ -17,6 +17,15 @@ end subroutine
! CHECK: %{{.*}} = fir.cuda_deallocate %[[BOX_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {cuda_attr = #fir.cuda<device>} -> i32
+! CHECK: %[[BOX_LOAD:.*]] = fir.load %[[BOX_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
+! CHECK: %[[ADDR:.*]] = fir.box_addr %[[BOX_LOAD]] : (!fir.box<!fir.heap<!fir.array<?xf32>>>) -> !fir.heap<!fir.array<?xf32>>
+! CHECK: %[[ADDR_I64:.*]] = fir.convert %[[ADDR]] : (!fir.heap<!fir.array<?xf32>>) -> i64
+! CHECK: %[[C0:.*]] = arith.constant 0 : i64
+! CHECK: %[[NE_C0:.*]] = arith.cmpi ne, %[[ADDR_I64]], %[[C0]] : i64
+! CHECK: fir.if %[[NE_C0]] {
+! CHECK: %{{.*}} = fir.cuda_deallocate %[[BOX_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {cuda_attr = #fir.cuda<device>} -> i32
+! CHECK: }
+
subroutine sub2()
real, allocatable, managed :: a(:)
integer :: istat
@@ -37,6 +46,10 @@ end subroutine
! CHECK: %[[STAT:.*]] = fir.cuda_deallocate %[[BOX_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {cuda_attr = #fir.cuda<managed>, hasStat} -> i32
! CHECK: fir.store %[[STAT]] to %[[ISTAT_DECL]]#1 : !fir.ref<i32>
+! CHECK: fir.if %{{.*}} {
+! CHECK: %{{.*}} = fir.cuda_deallocate %[[BOX_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {cuda_attr = #fir.cuda<managed>} -> i32
+! CHECK: }
+
subroutine sub3()
integer, allocatable, pinned :: a(:,:)
logical :: plog
@@ -50,6 +63,9 @@ end subroutine
! CHECK: %[[PLOG_DECL:.*]]:2 = hlfir.declare %5 {uniq_name = "_QFsub3Eplog"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
! CHECK-2: fir.call @_FortranAAllocatableSetBounds
! CHECK: %{{.*}} = fir.cuda_allocate %[[BOX_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>> pinned(%[[PLOG_DECL]]#1 : !fir.ref<!fir.logical<4>>) {cuda_attr = #fir.cuda<pinned>} -> i32
+! CHECK: fir.if %{{.*}} {
+! CHECK: %{{.*}} = fir.cuda_deallocate %[[BOX_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>> {cuda_attr = #fir.cuda<pinned>} -> i32
+! CHECK: }
subroutine sub4()
real, allocatable, device :: a(:)
@@ -65,6 +81,9 @@ end subroutine
! CHECK: fir.call @_FortranAAllocatableSetBounds
! CHECK: %[[STREAM:.*]] = fir.load %[[ISTREAM_DECL]]#0 : !fir.ref<i32>
! CHECK: %{{.*}} = fir.cuda_allocate %[[BOX_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> stream(%[[STREAM]] : i32) {cuda_attr = #fir.cuda<device>} -> i32
+! CHECK: fir.if %{{.*}} {
+! CHECK: %{{.*}} = fir.cuda_deallocate %[[BOX_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {cuda_attr = #fir.cuda<device>} -> i32
+! CHECK: }
subroutine sub5()
real, allocatable, device :: a(:)
@@ -80,6 +99,11 @@ end subroutine
! CHECK: %[[LOAD_B:.*]] = fir.load %[[BOX_B_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
! CHECK: fir.call @_FortranAAllocatableSetBounds
! CHECK: %{{.*}} = fir.cuda_allocate %[[BOX_A_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> source(%[[LOAD_B]] : !fir.box<!fir.heap<!fir.array<?xf32>>>) {cuda_attr = #fir.cuda<device>} -> i32
+! CHECK: fir.if
+! CHECK: fir.freemem
+! CHECK: fir.if %{{.*}} {
+! CHECK: %{{.*}} = fir.cuda_deallocate %[[BOX_A_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {cuda_attr = #fir.cuda<device>} -> i32
+! CHECK: }
subroutine sub6()
real, allocatable, device :: a(:)
@@ -95,6 +119,9 @@ end subroutine
! CHECK: %[[LOAD_B:.*]] = fir.load %[[BOX_B_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
! CHECK: fir.call @_FortranAAllocatableApplyMold
! CHECK: %{{.*}} = fir.cuda_allocate %[[BOX_A_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {cuda_attr = #fir.cuda<device>} -> i32
+! CHECK: fir.if %{{.*}} {
+! CHECK: %{{.*}} = fir.cuda_deallocate %[[BOX_A_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {cuda_attr = #fir.cuda<device>} -> i32
+! CHECK: }
subroutine sub7()
real, allocatable, device :: a(:)
@@ -120,3 +147,6 @@ end subroutine
! CHECK: %[[ERR_BOX:.*]] = fir.embox %[[ERR_DECL]]#1 : (!fir.ref<!fir.char<1,50>>) -> !fir.box<!fir.char<1,50>>
! CHECK: %[[STAT:.*]] = fir.cuda_deallocate %[[BOX_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> errmsg(%15 : !fir.box<!fir.char<1,50>>) {cuda_attr = #fir.cuda<device>, hasStat} -> i32
! CHECK: fir.store %[[STAT]] to %[[ISTAT_DECL]]#1 : !fir.ref<i32>
+! CHECK: fir.if %{{.*}} {
+! CHECK: %{{.*}} = fir.cuda_deallocate %[[BOX_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {cuda_attr = #fir.cuda<device>} -> i32
+! CHECK: }
diff --git a/flang/test/Lower/OpenACC/acc-declare.f90 b/flang/test/Lower/OpenACC/acc-declare.f90
index 401b654adeb61b6..5d3f9e3fe97e4a1 100644
--- a/flang/test/Lower/OpenACC/acc-declare.f90
+++ b/flang/test/Lower/OpenACC/acc-declare.f90
@@ -245,6 +245,11 @@ subroutine acc_declare_allocate()
! CHECK: fir.freemem %{{.*}} : !fir.heap<!fir.array<?xi32>>
! CHECK: fir.store %{{.*}} to %{{.*}} {acc.declare_action = #acc.declare_action<postDealloc = @_QMacc_declareFacc_declare_allocateEa_acc_declare_update_desc_post_dealloc>} : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
+! CHECK: fir.if
+! CHECK: fir.freemem %{{.*}} : !fir.heap<!fir.array<?xi32>>
+! CHECK: fir.store %{{.*}} to %{{.*}}#1 {acc.declare_action = #acc.declare_action<postDealloc = @_QMacc_declareFacc_declare_allocateEa_acc_declare_update_desc_post_dealloc>} : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
+! CHECK: }
+
end subroutine
! CHECK-LABEL: func.func private @_QMacc_declareFacc_declare_allocateEa_acc_declare_update_desc_post_alloc(
More information about the flang-commits
mailing list