[flang-commits] [flang] a3ccaed - [flang][cuda] Allocate local descriptor in managed memory (#102060)
via flang-commits
flang-commits at lists.llvm.org
Tue Aug 6 11:17:15 PDT 2024
Author: Valentin Clement (バレンタイン クレメン)
Date: 2024-08-06T11:17:11-07:00
New Revision: a3ccaed3b9f6a1fe9b7f2ef019259f88072639b2
URL: https://github.com/llvm/llvm-project/commit/a3ccaed3b9f6a1fe9b7f2ef019259f88072639b2
DIFF: https://github.com/llvm/llvm-project/commit/a3ccaed3b9f6a1fe9b7f2ef019259f88072639b2.diff
LOG: [flang][cuda] Allocate local descriptor in managed memory (#102060)
This patch adds entry points to the runtime for allocating and freeing
descriptors in managed memory. These entry points currently only call
`CUFAllocManaged` and `CUFFreeManaged`, but could become more complicated in
the future.
`cuf.alloc` and `cuf.free` operations on local descriptors are converted
into calls to these runtime entry points.
Added:
flang/include/flang/Runtime/CUDA/descriptor.h
flang/runtime/CUDA/descriptor.cpp
Modified:
flang/lib/Optimizer/Transforms/CufOpConversion.cpp
flang/runtime/CUDA/CMakeLists.txt
flang/test/Fir/CUDA/cuda-allocate.fir
flang/unittests/Runtime/CUDA/AllocatorCUF.cpp
Removed:
################################################################################
diff --git a/flang/include/flang/Runtime/CUDA/descriptor.h b/flang/include/flang/Runtime/CUDA/descriptor.h
new file mode 100644
index 00000000000000..33b993b219f297
--- /dev/null
+++ b/flang/include/flang/Runtime/CUDA/descriptor.h
@@ -0,0 +1,30 @@
+//===-- include/flang/Runtime/CUDA/descriptor.h -----------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef FORTRAN_RUNTIME_CUDA_DESCRIPTOR_H_
+#define FORTRAN_RUNTIME_CUDA_DESCRIPTOR_H_
+
+#include "flang/Runtime/descriptor.h"
+#include "flang/Runtime/entry-names.h"
+#include <cstddef>
+
+namespace Fortran::runtime::cuf {
+
+extern "C" {
+
+// Allocate a descriptor in managed memory.
+Descriptor *RTDECL(CUFAllocDesciptor)(
+ std::size_t, const char *sourceFile = nullptr, int sourceLine = 0);
+
+// Deallocate a descriptor allocated in managed or unified memory.
+void RTDECL(CUFFreeDesciptor)(
+ Descriptor *, const char *sourceFile = nullptr, int sourceLine = 0);
+
+} // extern "C"
+} // namespace Fortran::runtime::cuf
+#endif // FORTRAN_RUNTIME_CUDA_DESCRIPTOR_H_
diff --git a/flang/lib/Optimizer/Transforms/CufOpConversion.cpp b/flang/lib/Optimizer/Transforms/CufOpConversion.cpp
index bdeaaab9f9d1d7..61c95843a34316 100644
--- a/flang/lib/Optimizer/Transforms/CufOpConversion.cpp
+++ b/flang/lib/Optimizer/Transforms/CufOpConversion.cpp
@@ -8,10 +8,13 @@
#include "flang/Common/Fortran.h"
#include "flang/Optimizer/Builder/Runtime/RTBuilder.h"
+#include "flang/Optimizer/CodeGen/TypeConverter.h"
#include "flang/Optimizer/Dialect/CUF/CUFOps.h"
#include "flang/Optimizer/Dialect/FIRDialect.h"
#include "flang/Optimizer/Dialect/FIROps.h"
#include "flang/Optimizer/HLFIR/HLFIROps.h"
+#include "flang/Optimizer/Support/DataLayout.h"
+#include "flang/Runtime/CUDA/descriptor.h"
#include "flang/Runtime/allocatable.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/DialectConversion.h"
@@ -25,6 +28,7 @@ namespace fir {
using namespace fir;
using namespace mlir;
using namespace Fortran::runtime;
+using namespace Fortran::runtime::cuf;
namespace {
@@ -75,11 +79,11 @@ static mlir::LogicalResult convertOpToCall(OpTy op,
}
struct CufAllocateOpConversion
- : public mlir::OpRewritePattern<cuf::AllocateOp> {
+ : public mlir::OpRewritePattern<::cuf::AllocateOp> {
using OpRewritePattern::OpRewritePattern;
mlir::LogicalResult
- matchAndRewrite(cuf::AllocateOp op,
+ matchAndRewrite(::cuf::AllocateOp op,
mlir::PatternRewriter &rewriter) const override {
// TODO: Allocation with source will need a new entry point in the runtime.
if (op.getSource())
@@ -108,16 +112,16 @@ struct CufAllocateOpConversion
mlir::func::FuncOp func =
fir::runtime::getRuntimeFunc<mkRTKey(AllocatableAllocate)>(loc,
builder);
- return convertOpToCall<cuf::AllocateOp>(op, rewriter, func);
+ return convertOpToCall<::cuf::AllocateOp>(op, rewriter, func);
}
};
struct CufDeallocateOpConversion
- : public mlir::OpRewritePattern<cuf::DeallocateOp> {
+ : public mlir::OpRewritePattern<::cuf::DeallocateOp> {
using OpRewritePattern::OpRewritePattern;
mlir::LogicalResult
- matchAndRewrite(cuf::DeallocateOp op,
+ matchAndRewrite(::cuf::DeallocateOp op,
mlir::PatternRewriter &rewriter) const override {
// TODO: Allocation of module variable will need more work as the descriptor
// will be duplicated and needs to be synced after allocation.
@@ -133,7 +137,84 @@ struct CufDeallocateOpConversion
mlir::func::FuncOp func =
fir::runtime::getRuntimeFunc<mkRTKey(AllocatableDeallocate)>(loc,
builder);
- return convertOpToCall<cuf::DeallocateOp>(op, rewriter, func);
+ return convertOpToCall<::cuf::DeallocateOp>(op, rewriter, func);
+ }
+};
+
+struct CufAllocOpConversion : public mlir::OpRewritePattern<::cuf::AllocOp> {
+ using OpRewritePattern::OpRewritePattern;
+
+ CufAllocOpConversion(mlir::MLIRContext *context, mlir::DataLayout *dl,
+ fir::LLVMTypeConverter *typeConverter)
+ : OpRewritePattern(context), dl{dl}, typeConverter{typeConverter} {}
+
+ mlir::LogicalResult
+ matchAndRewrite(::cuf::AllocOp op,
+ mlir::PatternRewriter &rewriter) const override {
+ auto boxTy = mlir::dyn_cast_or_null<fir::BaseBoxType>(op.getInType());
+
+ // Only convert cuf.alloc that allocates a descriptor.
+ if (!boxTy)
+ return failure();
+
+ auto mod = op->getParentOfType<mlir::ModuleOp>();
+ fir::FirOpBuilder builder(rewriter, mod);
+ mlir::Location loc = op.getLoc();
+ mlir::func::FuncOp func =
+ fir::runtime::getRuntimeFunc<mkRTKey(CUFAllocDesciptor)>(loc, builder);
+
+ auto fTy = func.getFunctionType();
+ mlir::Value sourceFile = fir::factory::locationToFilename(builder, loc);
+ mlir::Value sourceLine =
+ fir::factory::locationToLineNo(builder, loc, fTy.getInput(2));
+
+ mlir::Type structTy = typeConverter->convertBoxTypeAsStruct(boxTy);
+ std::size_t boxSize = dl->getTypeSizeInBits(structTy) / 8;
+ mlir::Value sizeInBytes =
+ builder.createIntegerConstant(loc, builder.getIndexType(), boxSize);
+
+ llvm::SmallVector<mlir::Value> args{fir::runtime::createArguments(
+ builder, loc, fTy, sizeInBytes, sourceFile, sourceLine)};
+ auto callOp = builder.create<fir::CallOp>(loc, func, args);
+ auto convOp = builder.createConvert(loc, op.getResult().getType(),
+ callOp.getResult(0));
+ rewriter.replaceOp(op, convOp);
+ return mlir::success();
+ }
+
+private:
+ mlir::DataLayout *dl;
+ fir::LLVMTypeConverter *typeConverter;
+};
+
+struct CufFreeOpConversion : public mlir::OpRewritePattern<::cuf::FreeOp> {
+ using OpRewritePattern::OpRewritePattern;
+
+ mlir::LogicalResult
+ matchAndRewrite(::cuf::FreeOp op,
+ mlir::PatternRewriter &rewriter) const override {
+ // Only convert cuf.free on descriptor.
+ if (!mlir::isa<fir::ReferenceType>(op.getDevptr().getType()))
+ return failure();
+ auto refTy = mlir::dyn_cast<fir::ReferenceType>(op.getDevptr().getType());
+ if (!mlir::isa<fir::BaseBoxType>(refTy.getEleTy()))
+ return failure();
+
+ auto mod = op->getParentOfType<mlir::ModuleOp>();
+ fir::FirOpBuilder builder(rewriter, mod);
+ mlir::Location loc = op.getLoc();
+ mlir::func::FuncOp func =
+ fir::runtime::getRuntimeFunc<mkRTKey(CUFFreeDesciptor)>(loc, builder);
+
+ auto fTy = func.getFunctionType();
+ mlir::Value sourceFile = fir::factory::locationToFilename(builder, loc);
+ mlir::Value sourceLine =
+ fir::factory::locationToLineNo(builder, loc, fTy.getInput(2));
+ llvm::SmallVector<mlir::Value> args{fir::runtime::createArguments(
+ builder, loc, fTy, op.getDevptr(), sourceFile, sourceLine)};
+ builder.create<fir::CallOp>(loc, func, args);
+ rewriter.eraseOp(op);
+ return mlir::success();
}
};
@@ -143,8 +224,22 @@ class CufOpConversion : public fir::impl::CufOpConversionBase<CufOpConversion> {
auto *ctx = &getContext();
mlir::RewritePatternSet patterns(ctx);
mlir::ConversionTarget target(*ctx);
- target.addIllegalOp<cuf::AllocateOp, cuf::DeallocateOp>();
- patterns.insert<CufAllocateOpConversion, CufDeallocateOpConversion>(ctx);
+
+ mlir::Operation *op = getOperation();
+ mlir::ModuleOp module = mlir::dyn_cast<mlir::ModuleOp>(op);
+ if (!module)
+ return signalPassFailure();
+
+ std::optional<mlir::DataLayout> dl =
+ fir::support::getOrSetDataLayout(module, /*allowDefaultLayout=*/false);
+ fir::LLVMTypeConverter typeConverter(module, /*applyTBAA=*/false,
+ /*forceUnifiedTBAATree=*/false, *dl);
+
+ target.addIllegalOp<::cuf::AllocOp, ::cuf::AllocateOp, ::cuf::DeallocateOp,
+ ::cuf::FreeOp>();
+ patterns.insert<CufAllocOpConversion>(ctx, &*dl, &typeConverter);
+ patterns.insert<CufAllocateOpConversion, CufDeallocateOpConversion,
+ CufFreeOpConversion>(ctx);
if (mlir::failed(mlir::applyPartialConversion(getOperation(), target,
std::move(patterns)))) {
mlir::emitError(mlir::UnknownLoc::get(ctx),
diff --git a/flang/runtime/CUDA/CMakeLists.txt b/flang/runtime/CUDA/CMakeLists.txt
index de1104f07ce6c2..88243536139e46 100644
--- a/flang/runtime/CUDA/CMakeLists.txt
+++ b/flang/runtime/CUDA/CMakeLists.txt
@@ -11,6 +11,7 @@ find_library(CUDA_RUNTIME_LIBRARY cuda HINTS ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTOR
add_flang_library(CufRuntime
allocator.cpp
+ descriptor.cpp
)
target_link_libraries(CufRuntime
PRIVATE
diff --git a/flang/runtime/CUDA/descriptor.cpp b/flang/runtime/CUDA/descriptor.cpp
new file mode 100644
index 00000000000000..e228c0f76aae09
--- /dev/null
+++ b/flang/runtime/CUDA/descriptor.cpp
@@ -0,0 +1,28 @@
+//===-- runtime/CUDA/descriptor.cpp ---------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "flang/Runtime/CUDA/descriptor.h"
+#include "flang/Runtime/CUDA/allocator.h"
+
+namespace Fortran::runtime::cuf {
+extern "C" {
+RT_EXT_API_GROUP_BEGIN
+
+Descriptor *RTDEF(CUFAllocDesciptor)(
+ std::size_t sizeInBytes, const char *sourceFile, int sourceLine) {
+ return reinterpret_cast<Descriptor *>(CUFAllocManaged(sizeInBytes));
+}
+
+void RTDEF(CUFFreeDesciptor)(
+ Descriptor *desc, const char *sourceFile, int sourceLine) {
+ CUFFreeManaged(reinterpret_cast<void *>(desc));
+}
+
+RT_EXT_API_GROUP_END
+}
+} // namespace Fortran::runtime::cuf
diff --git a/flang/test/Fir/CUDA/cuda-allocate.fir b/flang/test/Fir/CUDA/cuda-allocate.fir
index ab4a253f33dd89..1274d3921dd854 100644
--- a/flang/test/Fir/CUDA/cuda-allocate.fir
+++ b/flang/test/Fir/CUDA/cuda-allocate.fir
@@ -1,5 +1,7 @@
// RUN: fir-opt --cuf-convert %s | FileCheck %s
+module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>} {
+
func.func @_QPsub1() {
%0 = cuf.alloc !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "a", data_attr = #cuf.cuda<device>, uniq_name = "_QFsub1Ea"} -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
%4:2 = hlfir.declare %0 {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub1Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
@@ -8,14 +10,21 @@ func.func @_QPsub1() {
%c0_i32 = arith.constant 0 : i32
%9 = cuf.allocate %4#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {data_attr = #cuf.cuda<device>} -> i32
%10 = cuf.deallocate %4#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {data_attr = #cuf.cuda<device>} -> i32
+ cuf.free %4#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {data_attr = #cuf.cuda<device>}
return
}
+
// CHECK-LABEL: func.func @_QPsub1()
-// CHECK: %[[DESC:.*]] = cuf.alloc !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "a", data_attr = #cuf.cuda<device>, uniq_name = "_QFsub1Ea"} -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
+// CHECK: %[[DESC_RT_CALL:.*]] = fir.call @_FortranACUFAllocDesciptor(%{{.*}}, %{{.*}}, %{{.*}}) : (i64, !fir.ref<i8>, i32) -> !fir.ref<!fir.box<none>>
+// CHECK: %[[DESC:.*]] = fir.convert %[[DESC_RT_CALL]] : (!fir.ref<!fir.box<none>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
// CHECK: %[[DECL_DESC:.*]]:2 = hlfir.declare %[[DESC]] {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub1Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
// CHECK: %[[BOX_NONE:.*]] = fir.convert %[[DECL_DESC]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
// CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
// CHECK: %[[BOX_NONE:.*]] = fir.convert %[[DECL_DESC]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
// CHECK: %{{.*}} = fir.call @_FortranAAllocatableDeallocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: %[[BOX_NONE:.*]] = fir.convert %[[DECL_DESC]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
+// CHECK: fir.call @_FortranACUFFreeDesciptor(%[[BOX_NONE]], %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.ref<i8>, i32) -> none
+
+}
diff --git a/flang/unittests/Runtime/CUDA/AllocatorCUF.cpp b/flang/unittests/Runtime/CUDA/AllocatorCUF.cpp
index f372ae18c202f9..2355c47778cca7 100644
--- a/flang/unittests/Runtime/CUDA/AllocatorCUF.cpp
+++ b/flang/unittests/Runtime/CUDA/AllocatorCUF.cpp
@@ -10,12 +10,14 @@
#include "../../../runtime/terminator.h"
#include "flang/Common/Fortran.h"
#include "flang/Runtime/CUDA/allocator.h"
+#include "flang/Runtime/CUDA/descriptor.h"
#include "flang/Runtime/allocatable.h"
#include "flang/Runtime/allocator-registry.h"
#include "cuda.h"
using namespace Fortran::runtime;
+using namespace Fortran::runtime::cuf;
static OwningPtr<Descriptor> createAllocatable(
Fortran::common::TypeCategory tc, int kind, int rank = 1) {
@@ -87,3 +89,15 @@ TEST(AllocatableCUFTest, SimplePinnedAllocate) {
(*a, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__);
EXPECT_FALSE(a->IsAllocated());
}
+
+TEST(AllocatableCUFTest, DescriptorAllocationTest) {
+ using Fortran::common::TypeCategory;
+ Fortran::runtime::cuf::CUFRegisterAllocator();
+ ScopedContext ctx;
+ // REAL(4), DEVICE, ALLOCATABLE :: a(:)
+ auto a{createAllocatable(TypeCategory::Real, 4)};
+ Descriptor *desc = nullptr;
+ desc = RTNAME(CUFAllocDesciptor)(a->SizeInBytes());
+ EXPECT_TRUE(desc != nullptr);
+ RTNAME(CUFFreeDesciptor)(desc);
+}
More information about the flang-commits
mailing list