[flang-commits] [flang] [flang][cuda] Do inline allocation/deallocation in device code (PR #106628)
Valentin Clement バレンタイン クレメン via flang-commits
flang-commits at lists.llvm.org
Thu Aug 29 14:15:31 PDT 2024
https://github.com/clementval updated https://github.com/llvm/llvm-project/pull/106628
>From c5c91c5ff338cc4ff1c13cdd7998998ec2132229 Mon Sep 17 00:00:00 2001
From: Valentin Clement <clementval at gmail.com>
Date: Thu, 29 Aug 2024 11:29:16 -0700
Subject: [PATCH 1/3] [flang][cuda] Do inline allocation/deallocation in device
code
ALLOCATE and DEALLOCATE statement can be inlined in device function.
This patch updates the condition that determined to inline these actions
in lowering.
---
flang/lib/Lower/Allocatable.cpp | 27 +++++++++++++++-------
flang/lib/Lower/Bridge.cpp | 26 ++-------------------
flang/test/Lower/CUDA/cuda-allocatable.cuf | 17 ++++++++++++++
3 files changed, 38 insertions(+), 32 deletions(-)
diff --git a/flang/lib/Lower/Allocatable.cpp b/flang/lib/Lower/Allocatable.cpp
index d4d999f5c84a09..fb8380ac7e8c51 100644
--- a/flang/lib/Lower/Allocatable.cpp
+++ b/flang/lib/Lower/Allocatable.cpp
@@ -15,6 +15,7 @@
#include "flang/Lower/AbstractConverter.h"
#include "flang/Lower/ConvertType.h"
#include "flang/Lower/ConvertVariable.h"
+#include "flang/Lower/Cuda.h"
#include "flang/Lower/IterationSpace.h"
#include "flang/Lower/Mangler.h"
#include "flang/Lower/OpenACC.h"
@@ -453,16 +454,22 @@ class AllocateStmtHelper {
void genSimpleAllocation(const Allocation &alloc,
const fir::MutableBoxValue &box) {
- if (!box.isDerived() && !errorManager.hasStatSpec() &&
- !alloc.type.IsPolymorphic() && !alloc.hasCoarraySpec() &&
- !useAllocateRuntime && !box.isPointer() &&
- !Fortran::semantics::HasCUDAAttr(alloc.getSymbol())) {
+ bool isCudaSymbol = Fortran::semantics::HasCUDAAttr(alloc.getSymbol());
+ bool isCudaDeviceContext = Fortran::lower::isCudaDeviceContext(builder);
+ bool inlineAllocation = !box.isDerived() && !errorManager.hasStatSpec() &&
+ !alloc.type.IsPolymorphic() &&
+ !alloc.hasCoarraySpec() && !useAllocateRuntime &&
+ !box.isPointer();
+
+ if (inlineAllocation &&
+ ((isCudaSymbol && isCudaDeviceContext) || !isCudaSymbol)) {
// Pointers must use PointerAllocate so that their deallocations
// can be validated.
genInlinedAllocation(alloc, box);
postAllocationAction(alloc);
return;
}
+
// Generate a sequence of runtime calls.
errorManager.genStatCheck(builder, loc);
genAllocateObjectInit(box);
@@ -473,7 +480,7 @@ class AllocateStmtHelper {
genSetDeferredLengthParameters(alloc, box);
genAllocateObjectBounds(alloc, box);
mlir::Value stat;
- if (!Fortran::semantics::HasCUDAAttr(alloc.getSymbol()))
+ if (!isCudaSymbol)
stat = genRuntimeAllocate(builder, loc, box, errorManager);
else
stat =
@@ -830,10 +837,14 @@ genDeallocate(fir::FirOpBuilder &builder,
mlir::Value declaredTypeDesc = {},
const Fortran::semantics::Symbol *symbol = nullptr) {
bool isCudaSymbol = symbol && Fortran::semantics::HasCUDAAttr(*symbol);
- // Deallocate intrinsic types inline.
- if (!box.isDerived() && !box.isPolymorphic() && !box.hasAssumedRank() &&
+ bool isCudaDeviceContext = Fortran::lower::isCudaDeviceContext(builder);
+ bool inlineDeallocation =
+ !box.isDerived() && !box.isPolymorphic() && !box.hasAssumedRank() &&
!box.isUnlimitedPolymorphic() && !errorManager.hasStatSpec() &&
- !useAllocateRuntime && !box.isPointer() && !isCudaSymbol) {
+ !useAllocateRuntime && !box.isPointer();
+ // Deallocate intrinsic types inline.
+ if (inlineDeallocation &&
+ ((isCudaSymbol && isCudaDeviceContext) || !isCudaSymbol)) {
// Pointers must use PointerDeallocate so that their deallocations
// can be validated.
mlir::Value ret = fir::factory::genFreemem(builder, loc, box);
diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp
index 078e17bea55859..90943fa92493ce 100644
--- a/flang/lib/Lower/Bridge.cpp
+++ b/flang/lib/Lower/Bridge.cpp
@@ -20,6 +20,7 @@
#include "flang/Lower/ConvertExprToHLFIR.h"
#include "flang/Lower/ConvertType.h"
#include "flang/Lower/ConvertVariable.h"
+#include "flang/Lower/Cuda.h"
#include "flang/Lower/HostAssociations.h"
#include "flang/Lower/IO.h"
#include "flang/Lower/IterationSpace.h"
@@ -4377,36 +4378,13 @@ class FirConverter : public Fortran::lower::AbstractConverter {
return temps;
}
- // Check if the insertion point is currently in a device context. HostDevice
- // subprogram are not considered fully device context so it will return false
- // for it.
- // If the insertion point is inside an OpenACC region op, it is considered
- // device context.
- static bool isCudaDeviceContext(fir::FirOpBuilder &builder) {
- if (builder.getRegion().getParentOfType<cuf::KernelOp>())
- return true;
- if (builder.getRegion()
- .getParentOfType<mlir::acc::ComputeRegionOpInterface>())
- return true;
- if (auto funcOp =
- builder.getRegion().getParentOfType<mlir::func::FuncOp>()) {
- if (auto cudaProcAttr =
- funcOp.getOperation()->getAttrOfType<cuf::ProcAttributeAttr>(
- cuf::getProcAttrName())) {
- return cudaProcAttr.getValue() != cuf::ProcAttribute::Host &&
- cudaProcAttr.getValue() != cuf::ProcAttribute::HostDevice;
- }
- }
- return false;
- }
-
void genDataAssignment(
const Fortran::evaluate::Assignment &assign,
const Fortran::evaluate::ProcedureRef *userDefinedAssignment) {
mlir::Location loc = getCurrentLocation();
fir::FirOpBuilder &builder = getFirOpBuilder();
- bool isInDeviceContext = isCudaDeviceContext(builder);
+ bool isInDeviceContext = Fortran::lower::isCudaDeviceContext(builder);
bool isCUDATransfer = (Fortran::evaluate::HasCUDADeviceAttrs(assign.lhs) ||
Fortran::evaluate::HasCUDADeviceAttrs(assign.rhs)) &&
diff --git a/flang/test/Lower/CUDA/cuda-allocatable.cuf b/flang/test/Lower/CUDA/cuda-allocatable.cuf
index cb6ca9af334fc5..fb72f88fe415ca 100644
--- a/flang/test/Lower/CUDA/cuda-allocatable.cuf
+++ b/flang/test/Lower/CUDA/cuda-allocatable.cuf
@@ -164,3 +164,20 @@ end subroutine
! CHECK: %{{.*}} = cuf.deallocate %[[BOX_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {data_attr = #cuf.cuda<device>} -> i32
! CHECK: }
! CHECK: cuf.free %[[BOX_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {data_attr = #cuf.cuda<device>}
+
+attributes(global) subroutine sub8()
+ real, device, allocatable :: a(:)
+ allocate(a(2))
+ deallocate(a)
+end subroutine
+
+! CHECK-LABEL: func.func @_QPsub8() attributes {cuf.proc_attr = #cuf.cuda_proc<global>}
+! CHECK: %[[DESC:.*]] = cuf.alloc !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "a", data_attr = #cuf.cuda<device>, uniq_name = "_QFsub8Ea"} -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
+! CHECK: %[[A:.*]]:2 = hlfir.declare %[[DESC]] {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub8Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
+! CHECK: %[[HEAP:.*]] = fir.allocmem !fir.array<?xf32>, %{{.*}} {fir.must_be_heap = true, uniq_name = "_QFsub8Ea.alloc"}
+! CHECK: %[[SHAPE:.*]] = fir.shape %{{.*}} : (index) -> !fir.shape<1>
+! CHECK: %[[EMBOX:.*]] = fir.embox %[[HEAP]](%[[SHAPE]]) : (!fir.heap<!fir.array<?xf32>>, !fir.shape<1>) -> !fir.box<!fir.heap<!fir.array<?xf32>>>
+! CHECK: fir.store %[[EMBOX]] to %[[A]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
+! CHECK: %[[BOX:.*]] = fir.load %[[A]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
+! CHECK: %[[BOXADDR:.*]] = fir.box_addr %[[BOX]] : (!fir.box<!fir.heap<!fir.array<?xf32>>>) -> !fir.heap<!fir.array<?xf32>>
+! CHECK: fir.freemem %[[BOXADDR]] : !fir.heap<!fir.array<?xf32>>
>From 6ac6f512d3915d108b819ec06e9367a3b6c78bc9 Mon Sep 17 00:00:00 2001
From: Valentin Clement <clementval at gmail.com>
Date: Thu, 29 Aug 2024 14:10:17 -0700
Subject: [PATCH 2/3] Add Cuda.h file
---
flang/include/flang/Lower/Cuda.h | 48 ++++++++++++++++++++++++++++++++
1 file changed, 48 insertions(+)
create mode 100644 flang/include/flang/Lower/Cuda.h
diff --git a/flang/include/flang/Lower/Cuda.h b/flang/include/flang/Lower/Cuda.h
new file mode 100644
index 00000000000000..138d3119bf6b68
--- /dev/null
+++ b/flang/include/flang/Lower/Cuda.h
@@ -0,0 +1,48 @@
+//===-- Lower/Cuda.h -- Cuda Fortran utilities ------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Coding style: https://mlir.llvm.org/getting_started/DeveloperGuide/
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef FORTRAN_LOWER_CUDA_H
+#define FORTRAN_LOWER_CUDA_H
+
+#include "flang/Optimizer/Builder/FIRBuilder.h"
+#include "flang/Optimizer/Dialect/CUF/CUFOps.h"
+#include "flang/Semantics/tools.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/OpenACC/OpenACC.h"
+
+namespace Fortran::lower {
+// Check if the insertion point is currently in a device context. HostDevice
+// subprogram are not considered fully device context so it will return false
+// for it.
+// If the insertion point is inside an OpenACC region op, it is considered
+// device context.
+static bool isCudaDeviceContext(fir::FirOpBuilder &builder) {
+ if (builder.getRegion().getParentOfType<cuf::KernelOp>())
+ return true;
+ if (builder.getRegion()
+ .getParentOfType<mlir::acc::ComputeRegionOpInterface>())
+ return true;
+ if (auto funcOp =
+ builder.getRegion().getParentOfType<mlir::func::FuncOp>()) {
+ if (auto cudaProcAttr =
+ funcOp.getOperation()->getAttrOfType<cuf::ProcAttributeAttr>(
+ cuf::getProcAttrName())) {
+ return cudaProcAttr.getValue() != cuf::ProcAttribute::Host &&
+ cudaProcAttr.getValue() != cuf::ProcAttribute::HostDevice;
+ }
+ }
+ return false;
+}
+} // end namespace Fortran::lower
+
+
+#endif // FORTRAN_LOWER_CUDA_H
>From 828b10700a5f745a8f5d55d11ed2bdaab29e2922 Mon Sep 17 00:00:00 2001
From: Valentin Clement <clementval at gmail.com>
Date: Thu, 29 Aug 2024 14:15:19 -0700
Subject: [PATCH 3/3] clang-format
---
flang/include/flang/Lower/Cuda.h | 6 ++----
1 file changed, 2 insertions(+), 4 deletions(-)
diff --git a/flang/include/flang/Lower/Cuda.h b/flang/include/flang/Lower/Cuda.h
index 138d3119bf6b68..e446b505ee6e45 100644
--- a/flang/include/flang/Lower/Cuda.h
+++ b/flang/include/flang/Lower/Cuda.h
@@ -31,18 +31,16 @@ static bool isCudaDeviceContext(fir::FirOpBuilder &builder) {
if (builder.getRegion()
.getParentOfType<mlir::acc::ComputeRegionOpInterface>())
return true;
- if (auto funcOp =
- builder.getRegion().getParentOfType<mlir::func::FuncOp>()) {
+ if (auto funcOp = builder.getRegion().getParentOfType<mlir::func::FuncOp>()) {
if (auto cudaProcAttr =
funcOp.getOperation()->getAttrOfType<cuf::ProcAttributeAttr>(
cuf::getProcAttrName())) {
return cudaProcAttr.getValue() != cuf::ProcAttribute::Host &&
- cudaProcAttr.getValue() != cuf::ProcAttribute::HostDevice;
+ cudaProcAttr.getValue() != cuf::ProcAttribute::HostDevice;
}
}
return false;
}
} // end namespace Fortran::lower
-
#endif // FORTRAN_LOWER_CUDA_H
More information about the flang-commits
mailing list