[flang-commits] [flang] a3ccaed - [flang][cuda] Allocate local descriptor in managed memory (#102060)

Tue Aug 6 11:17:15 PDT 2024

Author: Valentin Clement (バレンタイン クレメン)
Date: 2024-08-06T11:17:11-07:00
New Revision: a3ccaed3b9f6a1fe9b7f2ef019259f88072639b2

URL: https://github.com/llvm/llvm-project/commit/a3ccaed3b9f6a1fe9b7f2ef019259f88072639b2
DIFF: https://github.com/llvm/llvm-project/commit/a3ccaed3b9f6a1fe9b7f2ef019259f88072639b2.diff

LOG: [flang][cuda] Allocate local descriptor in managed memory (#102060)

This patch adds entry points to the runtime so that descriptors can be
allocated in managed memory. These entry points currently only call
`CUFAllocManaged` and `CUFFreeManaged`, but they could become more
complicated in the future.

`cuf.alloc` and `cuf.free` related to local descriptors are converted
into runtime calls.

Added: 
    flang/include/flang/Runtime/CUDA/descriptor.h
    flang/runtime/CUDA/descriptor.cpp

Modified: 
    flang/lib/Optimizer/Transforms/CufOpConversion.cpp
    flang/runtime/CUDA/CMakeLists.txt
    flang/test/Fir/CUDA/cuda-allocate.fir
    flang/unittests/Runtime/CUDA/AllocatorCUF.cpp

Removed: 
    


################################################################################
diff  --git a/flang/include/flang/Runtime/CUDA/descriptor.h b/flang/include/flang/Runtime/CUDA/descriptor.h
new file mode 100644
index 00000000000000..33b993b219f297

--- /dev/null
+++ b/flang/include/flang/Runtime/CUDA/descriptor.h
@@ -0,0 +1,30 @@
+//===-- include/flang/Runtime/CUDA/descriptor.h -----------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef FORTRAN_RUNTIME_CUDA_DESCRIPTOR_H_
+#define FORTRAN_RUNTIME_CUDA_DESCRIPTOR_H_
+
+#include "flang/Runtime/descriptor.h"
+#include "flang/Runtime/entry-names.h"
+#include <cstddef>
+
+namespace Fortran::runtime::cuf {
+
+extern "C" {
+// Allocate a descriptor in managed memory. The std::size_t argument is the
+// size of the allocation in bytes.
+Descriptor *RTDECL(CUFAllocDesciptor)(
+    std::size_t, const char *sourceFile = nullptr, int sourceLine = 0);
+
+// Deallocate a descriptor allocated in managed or unified memory.
+void RTDECL(CUFFreeDesciptor)(
+    Descriptor *, const char *sourceFile = nullptr, int sourceLine = 0);
+
+} // extern "C"
+} // namespace Fortran::runtime::cuf
+#endif // FORTRAN_RUNTIME_CUDA_DESCRIPTOR_H_

diff  --git a/flang/lib/Optimizer/Transforms/CufOpConversion.cpp b/flang/lib/Optimizer/Transforms/CufOpConversion.cpp
index bdeaaab9f9d1d7..61c95843a34316 100644
--- a/flang/lib/Optimizer/Transforms/CufOpConversion.cpp
+++ b/flang/lib/Optimizer/Transforms/CufOpConversion.cpp
@@ -8,10 +8,13 @@
 
 #include "flang/Common/Fortran.h"
 #include "flang/Optimizer/Builder/Runtime/RTBuilder.h"
+#include "flang/Optimizer/CodeGen/TypeConverter.h"
 #include "flang/Optimizer/Dialect/CUF/CUFOps.h"
 #include "flang/Optimizer/Dialect/FIRDialect.h"
 #include "flang/Optimizer/Dialect/FIROps.h"
 #include "flang/Optimizer/HLFIR/HLFIROps.h"
+#include "flang/Optimizer/Support/DataLayout.h"
+#include "flang/Runtime/CUDA/descriptor.h"
 #include "flang/Runtime/allocatable.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Transforms/DialectConversion.h"
@@ -25,6 +28,7 @@ namespace fir {
 using namespace fir;
 using namespace mlir;
 using namespace Fortran::runtime;
+using namespace Fortran::runtime::cuf;
 
 namespace {
 
@@ -75,11 +79,11 @@ static mlir::LogicalResult convertOpToCall(OpTy op,
 }
 
 struct CufAllocateOpConversion
-    : public mlir::OpRewritePattern<cuf::AllocateOp> {
+    : public mlir::OpRewritePattern<::cuf::AllocateOp> {
   using OpRewritePattern::OpRewritePattern;
 
   mlir::LogicalResult
-  matchAndRewrite(cuf::AllocateOp op,
+  matchAndRewrite(::cuf::AllocateOp op,
                   mlir::PatternRewriter &rewriter) const override {
     // TODO: Allocation with source will need a new entry point in the runtime.
     if (op.getSource())
@@ -108,16 +112,16 @@ struct CufAllocateOpConversion
     mlir::func::FuncOp func =
         fir::runtime::getRuntimeFunc<mkRTKey(AllocatableAllocate)>(loc,
                                                                    builder);
-    return convertOpToCall<cuf::AllocateOp>(op, rewriter, func);
+    return convertOpToCall<::cuf::AllocateOp>(op, rewriter, func);
   }
 };
 
 struct CufDeallocateOpConversion
-    : public mlir::OpRewritePattern<cuf::DeallocateOp> {
+    : public mlir::OpRewritePattern<::cuf::DeallocateOp> {
   using OpRewritePattern::OpRewritePattern;
 
   mlir::LogicalResult
-  matchAndRewrite(cuf::DeallocateOp op,
+  matchAndRewrite(::cuf::DeallocateOp op,
                   mlir::PatternRewriter &rewriter) const override {
     // TODO: Allocation of module variable will need more work as the descriptor
     // will be duplicated and needs to be synced after allocation.
@@ -133,7 +137,84 @@ struct CufDeallocateOpConversion
     mlir::func::FuncOp func =
         fir::runtime::getRuntimeFunc<mkRTKey(AllocatableDeallocate)>(loc,
                                                                      builder);
-    return convertOpToCall<cuf::DeallocateOp>(op, rewriter, func);
+    return convertOpToCall<::cuf::DeallocateOp>(op, rewriter, func);
+  }
+};
+
+struct CufAllocOpConversion : public mlir::OpRewritePattern<::cuf::AllocOp> {
+  using OpRewritePattern::OpRewritePattern;
+  // NOTE(review): the inherited constructors do not set dl/typeConverter.
+  CufAllocOpConversion(mlir::MLIRContext *context, mlir::DataLayout *dl,
+                       fir::LLVMTypeConverter *typeConverter)
+      : OpRewritePattern(context), dl{dl}, typeConverter{typeConverter} {}
+  // Rewrite a descriptor-allocating cuf.alloc into a CUFAllocDesciptor call.
+  mlir::LogicalResult
+  matchAndRewrite(::cuf::AllocOp op,
+                  mlir::PatternRewriter &rewriter) const override {
+    auto boxTy = mlir::dyn_cast_or_null<fir::BaseBoxType>(op.getInType());
+    // boxTy is null when the allocated type is not a box type.
+    // Only convert cuf.alloc that allocates a descriptor.
+    if (!boxTy)
+      return failure();
+
+    auto mod = op->getParentOfType<mlir::ModuleOp>();
+    fir::FirOpBuilder builder(rewriter, mod);
+    mlir::Location loc = op.getLoc();
+    mlir::func::FuncOp func =
+        fir::runtime::getRuntimeFunc<mkRTKey(CUFAllocDesciptor)>(loc, builder);
+    // Source file and line of the op; the line uses the callee's input 2 type.
+    auto fTy = func.getFunctionType();
+    mlir::Value sourceFile = fir::factory::locationToFilename(builder, loc);
+    mlir::Value sourceLine =
+        fir::factory::locationToLineNo(builder, loc, fTy.getInput(2));
+    // Size in bytes: data-layout bit size of the box's struct form, over 8.
+    mlir::Type structTy = typeConverter->convertBoxTypeAsStruct(boxTy);
+    std::size_t boxSize = dl->getTypeSizeInBits(structTy) / 8;
+    mlir::Value sizeInBytes =
+        builder.createIntegerConstant(loc, builder.getIndexType(), boxSize);
+    // Call the runtime, then convert its result to the cuf.alloc result type.
+    llvm::SmallVector<mlir::Value> args{fir::runtime::createArguments(
+        builder, loc, fTy, sizeInBytes, sourceFile, sourceLine)};
+    auto callOp = builder.create<fir::CallOp>(loc, func, args);
+    auto convOp = builder.createConvert(loc, op.getResult().getType(),
+                                        callOp.getResult(0));
+    rewriter.replaceOp(op, convOp);
+    return mlir::success();
+  }
+  // Pointers supplied at construction; this struct never frees them.
+private:
+  mlir::DataLayout *dl;
+  fir::LLVMTypeConverter *typeConverter;
+};
+
+struct CufFreeOpConversion : public mlir::OpRewritePattern<::cuf::FreeOp> {
+  using OpRewritePattern::OpRewritePattern;
+  // Lower cuf.free of a descriptor reference to a CUFFreeDesciptor call.
+  mlir::LogicalResult
+  matchAndRewrite(::cuf::FreeOp op,
+                  mlir::PatternRewriter &rewriter) const override {
+    // Only convert cuf.free on descriptor. dyn_cast returns a null type on
+    // mismatch, so one cast both checks for and extracts the reference type.
+    auto refTy = mlir::dyn_cast<fir::ReferenceType>(op.getDevptr().getType());
+    if (!refTy || !mlir::isa<fir::BaseBoxType>(refTy.getEleTy()))
+      return failure();
+
+    auto mod = op->getParentOfType<mlir::ModuleOp>();
+    fir::FirOpBuilder builder(rewriter, mod);
+    mlir::Location loc = op.getLoc();
+    mlir::func::FuncOp func =
+        fir::runtime::getRuntimeFunc<mkRTKey(CUFFreeDesciptor)>(loc, builder);
+    // Source file and line of the op; the line uses the callee's input 2 type.
+    auto fTy = func.getFunctionType();
+    mlir::Value sourceFile = fir::factory::locationToFilename(builder, loc);
+    mlir::Value sourceLine =
+        fir::factory::locationToLineNo(builder, loc, fTy.getInput(2));
+    llvm::SmallVector<mlir::Value> args{fir::runtime::createArguments(
+        builder, loc, fTy, op.getDevptr(), sourceFile, sourceLine)};
+    // The call's result is not used; the cuf.free op is then erased.
+    builder.create<fir::CallOp>(loc, func, args);
+    rewriter.eraseOp(op);
+    return mlir::success();
   }
 };
 
@@ -143,8 +224,22 @@ class CufOpConversion : public fir::impl::CufOpConversionBase<CufOpConversion> {
     auto *ctx = &getContext();
     mlir::RewritePatternSet patterns(ctx);
     mlir::ConversionTarget target(*ctx);
-    target.addIllegalOp<cuf::AllocateOp, cuf::DeallocateOp>();
-    patterns.insert<CufAllocateOpConversion, CufDeallocateOpConversion>(ctx);
+
+    mlir::Operation *op = getOperation();
+    mlir::ModuleOp module = mlir::dyn_cast<mlir::ModuleOp>(op);
+    if (!module)
+      return signalPassFailure();
+
+    std::optional<mlir::DataLayout> dl =
+        fir::support::getOrSetDataLayout(module, /*allowDefaultLayout=*/false);
+    fir::LLVMTypeConverter typeConverter(module, /*applyTBAA=*/false,
+                                         /*forceUnifiedTBAATree=*/false, *dl);
+
+    target.addIllegalOp<::cuf::AllocOp, ::cuf::AllocateOp, ::cuf::DeallocateOp,
+                        ::cuf::FreeOp>();
+    patterns.insert<CufAllocOpConversion>(ctx, &*dl, &typeConverter);
+    patterns.insert<CufAllocateOpConversion, CufDeallocateOpConversion,
+                    CufFreeOpConversion>(ctx);
     if (mlir::failed(mlir::applyPartialConversion(getOperation(), target,
                                                   std::move(patterns)))) {
       mlir::emitError(mlir::UnknownLoc::get(ctx),

diff  --git a/flang/runtime/CUDA/CMakeLists.txt b/flang/runtime/CUDA/CMakeLists.txt
index de1104f07ce6c2..88243536139e46 100644
--- a/flang/runtime/CUDA/CMakeLists.txt
+++ b/flang/runtime/CUDA/CMakeLists.txt
@@ -11,6 +11,7 @@ find_library(CUDA_RUNTIME_LIBRARY cuda HINTS ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTOR
 
 add_flang_library(CufRuntime
   allocator.cpp
+  descriptor.cpp
 )
 target_link_libraries(CufRuntime
   PRIVATE

diff  --git a/flang/runtime/CUDA/descriptor.cpp b/flang/runtime/CUDA/descriptor.cpp
new file mode 100644
index 00000000000000..e228c0f76aae09
--- /dev/null
+++ b/flang/runtime/CUDA/descriptor.cpp
@@ -0,0 +1,28 @@
+//===-- runtime/CUDA/descriptor.cpp ---------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "flang/Runtime/CUDA/descriptor.h"
+#include "flang/Runtime/CUDA/allocator.h"
+
+namespace Fortran::runtime::cuf {
+extern "C" {
+RT_EXT_API_GROUP_BEGIN
+// Returns CUFAllocManaged(sizeInBytes) as a Descriptor pointer; the source
+// position arguments are not used here.
+Descriptor *RTDEF(CUFAllocDesciptor)(
+    std::size_t sizeInBytes, const char *sourceFile, int sourceLine) {
+  return reinterpret_cast<Descriptor *>(CUFAllocManaged(sizeInBytes));
+}
+// Passes desc to CUFFreeManaged; the source position arguments are not used.
+void RTDEF(CUFFreeDesciptor)(
+    Descriptor *desc, const char *sourceFile, int sourceLine) {
+  CUFFreeManaged(reinterpret_cast<void *>(desc));
+}
+
+RT_EXT_API_GROUP_END
+}
+} // namespace Fortran::runtime::cuf

diff  --git a/flang/test/Fir/CUDA/cuda-allocate.fir b/flang/test/Fir/CUDA/cuda-allocate.fir
index ab4a253f33dd89..1274d3921dd854 100644
--- a/flang/test/Fir/CUDA/cuda-allocate.fir
+++ b/flang/test/Fir/CUDA/cuda-allocate.fir
@@ -1,5 +1,7 @@
 // RUN: fir-opt --cuf-convert %s | FileCheck %s
 
+module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>} {
+
 func.func @_QPsub1() {
   %0 = cuf.alloc !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "a", data_attr = #cuf.cuda<device>, uniq_name = "_QFsub1Ea"} -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
   %4:2 = hlfir.declare %0 {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub1Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
@@ -8,14 +10,21 @@ func.func @_QPsub1() {
   %c0_i32 = arith.constant 0 : i32
   %9 = cuf.allocate %4#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {data_attr = #cuf.cuda<device>} -> i32
   %10 = cuf.deallocate %4#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {data_attr = #cuf.cuda<device>} -> i32
+  cuf.free %4#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {data_attr = #cuf.cuda<device>}
   return
 }
 
+
 // CHECK-LABEL: func.func @_QPsub1()
-// CHECK: %[[DESC:.*]] = cuf.alloc !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "a", data_attr = #cuf.cuda<device>, uniq_name = "_QFsub1Ea"} -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
+// CHECK: %[[DESC_RT_CALL:.*]] = fir.call @_FortranACUFAllocDesciptor(%{{.*}}, %{{.*}}, %{{.*}}) : (i64, !fir.ref<i8>, i32) -> !fir.ref<!fir.box<none>>
+// CHECK: %[[DESC:.*]] = fir.convert %[[DESC_RT_CALL]] : (!fir.ref<!fir.box<none>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
 // CHECK: %[[DECL_DESC:.*]]:2 = hlfir.declare %[[DESC]] {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub1Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
 // CHECK: %[[BOX_NONE:.*]] = fir.convert %[[DECL_DESC]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
 // CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 
 // CHECK: %[[BOX_NONE:.*]] = fir.convert %[[DECL_DESC]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
 // CHECK: %{{.*}} = fir.call @_FortranAAllocatableDeallocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: %[[BOX_NONE:.*]] = fir.convert %[[DECL_DESC]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
+// CHECK: fir.call @_FortranACUFFreeDesciptor(%[[BOX_NONE]], %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.ref<i8>, i32) -> none
+
+}

diff  --git a/flang/unittests/Runtime/CUDA/AllocatorCUF.cpp b/flang/unittests/Runtime/CUDA/AllocatorCUF.cpp
index f372ae18c202f9..2355c47778cca7 100644
--- a/flang/unittests/Runtime/CUDA/AllocatorCUF.cpp
+++ b/flang/unittests/Runtime/CUDA/AllocatorCUF.cpp
@@ -10,12 +10,14 @@
 #include "../../../runtime/terminator.h"
 #include "flang/Common/Fortran.h"
 #include "flang/Runtime/CUDA/allocator.h"
+#include "flang/Runtime/CUDA/descriptor.h"
 #include "flang/Runtime/allocatable.h"
 #include "flang/Runtime/allocator-registry.h"
 
 #include "cuda.h"
 
 using namespace Fortran::runtime;
+using namespace Fortran::runtime::cuf;
 
 static OwningPtr<Descriptor> createAllocatable(
     Fortran::common::TypeCategory tc, int kind, int rank = 1) {
@@ -87,3 +89,15 @@ TEST(AllocatableCUFTest, SimplePinnedAllocate) {
   (*a, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__);
   EXPECT_FALSE(a->IsAllocated());
 }
+
+TEST(AllocatableCUFTest, DescriptorAllocationTest) {
+  using Fortran::common::TypeCategory;
+  Fortran::runtime::cuf::CUFRegisterAllocator();
+  ScopedContext ctx;
+  // REAL(4), DEVICE, ALLOCATABLE :: a(:)
+  auto a{createAllocatable(TypeCategory::Real, 4)};
+  // Allocate with the byte count reported by a->SizeInBytes(), then free.
+  Descriptor *desc{RTNAME(CUFAllocDesciptor)(a->SizeInBytes())};
+  EXPECT_TRUE(desc != nullptr);
+  RTNAME(CUFFreeDesciptor)(desc);
+}