[flang-commits] [flang] [flang][cuda] Allocate local descriptor in managed memory (PR #102060)

Mon Aug 5 18:12:28 PDT 2024

https://github.com/clementval updated https://github.com/llvm/llvm-project/pull/102060

>From c1c0921c3deef446c471d14191ff6368737ad732 Mon Sep 17 00:00:00 2001
From: Valentin Clement <clementval at gmail.com>
Date: Fri, 2 Aug 2024 14:43:39 -0700
Subject: [PATCH 1/2] [flang][cuda] Add entry point to allocate descriptor in
 managed memory

---
 flang/include/flang/Runtime/CUDA/descriptor.h |  30 +++++
 .../Optimizer/Transforms/CufOpConversion.cpp  | 111 ++++++++++++++++--
 flang/runtime/CUDA/CMakeLists.txt             |   1 +
 flang/runtime/CUDA/descriptor.cpp             |  28 +++++
 flang/test/Fir/CUDA/cuda-allocate.fir         |  11 +-
 flang/unittests/Runtime/CUDA/AllocatorCUF.cpp |  14 +++
 6 files changed, 186 insertions(+), 9 deletions(-)
 create mode 100644 flang/include/flang/Runtime/CUDA/descriptor.h
 create mode 100644 flang/runtime/CUDA/descriptor.cpp

diff --git a/flang/include/flang/Runtime/CUDA/descriptor.h b/flang/include/flang/Runtime/CUDA/descriptor.h
new file mode 100644
index 0000000000000..6b8b4c555a728
--- /dev/null
+++ b/flang/include/flang/Runtime/CUDA/descriptor.h
@@ -0,0 +1,30 @@
+//===-- include/flang/Runtime/CUDA/descriptor.h -----------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef FORTRAN_RUNTIME_CUDA_DESCRIPTOR_H_
+#define FORTRAN_RUNTIME_CUDA_DESCRIPTOR_H_
+
+#include "flang/Runtime/descriptor.h"
+#include "flang/Runtime/entry-names.h"
+#include <cstddef>
+
+namespace Fortran::runtime::cuf {
+
+extern "C" {
+
+// Allocate a descriptor in managed or unified memory.
+Descriptor *RTDECL(CUFAllocDesciptor)(
+    std::size_t, const char *sourceFile = nullptr, int sourceLine = 0);
+
+// Deallocate a descriptor allocated in managed or unified memory.
+void RTDECL(CUFFreeDesciptor)(
+    Descriptor *, const char *sourceFile = nullptr, int sourceLine = 0);
+
+} // extern "C"
+} // namespace Fortran::runtime::cuf
+#endif // FORTRAN_RUNTIME_CUDA_DESCRIPTOR_H_
diff --git a/flang/lib/Optimizer/Transforms/CufOpConversion.cpp b/flang/lib/Optimizer/Transforms/CufOpConversion.cpp
index bdeaaab9f9d1d..61c95843a3431 100644
--- a/flang/lib/Optimizer/Transforms/CufOpConversion.cpp
+++ b/flang/lib/Optimizer/Transforms/CufOpConversion.cpp
@@ -8,10 +8,13 @@
 
 #include "flang/Common/Fortran.h"
 #include "flang/Optimizer/Builder/Runtime/RTBuilder.h"
+#include "flang/Optimizer/CodeGen/TypeConverter.h"
 #include "flang/Optimizer/Dialect/CUF/CUFOps.h"
 #include "flang/Optimizer/Dialect/FIRDialect.h"
 #include "flang/Optimizer/Dialect/FIROps.h"
 #include "flang/Optimizer/HLFIR/HLFIROps.h"
+#include "flang/Optimizer/Support/DataLayout.h"
+#include "flang/Runtime/CUDA/descriptor.h"
 #include "flang/Runtime/allocatable.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Transforms/DialectConversion.h"
@@ -25,6 +28,7 @@ namespace fir {
 using namespace fir;
 using namespace mlir;
 using namespace Fortran::runtime;
+using namespace Fortran::runtime::cuf;
 
 namespace {
 
@@ -75,11 +79,11 @@ static mlir::LogicalResult convertOpToCall(OpTy op,
 }
 
 struct CufAllocateOpConversion
-    : public mlir::OpRewritePattern<cuf::AllocateOp> {
+    : public mlir::OpRewritePattern<::cuf::AllocateOp> {
   using OpRewritePattern::OpRewritePattern;
 
   mlir::LogicalResult
-  matchAndRewrite(cuf::AllocateOp op,
+  matchAndRewrite(::cuf::AllocateOp op,
                   mlir::PatternRewriter &rewriter) const override {
     // TODO: Allocation with source will need a new entry point in the runtime.
     if (op.getSource())
@@ -108,16 +112,16 @@ struct CufAllocateOpConversion
     mlir::func::FuncOp func =
         fir::runtime::getRuntimeFunc<mkRTKey(AllocatableAllocate)>(loc,
                                                                    builder);
-    return convertOpToCall<cuf::AllocateOp>(op, rewriter, func);
+    return convertOpToCall<::cuf::AllocateOp>(op, rewriter, func);
   }
 };
 
 struct CufDeallocateOpConversion
-    : public mlir::OpRewritePattern<cuf::DeallocateOp> {
+    : public mlir::OpRewritePattern<::cuf::DeallocateOp> {
   using OpRewritePattern::OpRewritePattern;
 
   mlir::LogicalResult
-  matchAndRewrite(cuf::DeallocateOp op,
+  matchAndRewrite(::cuf::DeallocateOp op,
                   mlir::PatternRewriter &rewriter) const override {
     // TODO: Allocation of module variable will need more work as the descriptor
     // will be duplicated and needs to be synced after allocation.
@@ -133,7 +137,84 @@ struct CufDeallocateOpConversion
     mlir::func::FuncOp func =
         fir::runtime::getRuntimeFunc<mkRTKey(AllocatableDeallocate)>(loc,
                                                                      builder);
-    return convertOpToCall<cuf::DeallocateOp>(op, rewriter, func);
+    return convertOpToCall<::cuf::DeallocateOp>(op, rewriter, func);
+  }
+};
+
+struct CufAllocOpConversion : public mlir::OpRewritePattern<::cuf::AllocOp> {
+  using OpRewritePattern::OpRewritePattern;
+
+  CufAllocOpConversion(mlir::MLIRContext *context, mlir::DataLayout *dl,
+                       fir::LLVMTypeConverter *typeConverter)
+      : OpRewritePattern(context), dl{dl}, typeConverter{typeConverter} {}
+
+  mlir::LogicalResult
+  matchAndRewrite(::cuf::AllocOp op,
+                  mlir::PatternRewriter &rewriter) const override {
+    auto boxTy = mlir::dyn_cast_or_null<fir::BaseBoxType>(op.getInType());
+
+    // Only convert cuf.alloc that allocates a descriptor.
+    if (!boxTy)
+      return failure();
+
+    auto mod = op->getParentOfType<mlir::ModuleOp>();
+    fir::FirOpBuilder builder(rewriter, mod);
+    mlir::Location loc = op.getLoc();
+    mlir::func::FuncOp func =
+        fir::runtime::getRuntimeFunc<mkRTKey(CUFAllocDesciptor)>(loc, builder);
+
+    auto fTy = func.getFunctionType();
+    mlir::Value sourceFile = fir::factory::locationToFilename(builder, loc);
+    mlir::Value sourceLine =
+        fir::factory::locationToLineNo(builder, loc, fTy.getInput(2));
+
+    mlir::Type structTy = typeConverter->convertBoxTypeAsStruct(boxTy);
+    std::size_t boxSize = dl->getTypeSizeInBits(structTy) / 8;
+    mlir::Value sizeInBytes =
+        builder.createIntegerConstant(loc, builder.getIndexType(), boxSize);
+
+    llvm::SmallVector<mlir::Value> args{fir::runtime::createArguments(
+        builder, loc, fTy, sizeInBytes, sourceFile, sourceLine)};
+    auto callOp = builder.create<fir::CallOp>(loc, func, args);
+    auto convOp = builder.createConvert(loc, op.getResult().getType(),
+                                        callOp.getResult(0));
+    rewriter.replaceOp(op, convOp);
+    return mlir::success();
+  }
+
+private:
+  mlir::DataLayout *dl;
+  fir::LLVMTypeConverter *typeConverter;
+};
+
+struct CufFreeOpConversion : public mlir::OpRewritePattern<::cuf::FreeOp> {
+  using OpRewritePattern::OpRewritePattern;
+
+  mlir::LogicalResult
+  matchAndRewrite(::cuf::FreeOp op,
+                  mlir::PatternRewriter &rewriter) const override {
+    // Only convert cuf.free on descriptor.
+    if (!mlir::isa<fir::ReferenceType>(op.getDevptr().getType()))
+      return failure();
+    auto refTy = mlir::dyn_cast<fir::ReferenceType>(op.getDevptr().getType());
+    if (!mlir::isa<fir::BaseBoxType>(refTy.getEleTy()))
+      return failure();
+
+    auto mod = op->getParentOfType<mlir::ModuleOp>();
+    fir::FirOpBuilder builder(rewriter, mod);
+    mlir::Location loc = op.getLoc();
+    mlir::func::FuncOp func =
+        fir::runtime::getRuntimeFunc<mkRTKey(CUFFreeDesciptor)>(loc, builder);
+
+    auto fTy = func.getFunctionType();
+    mlir::Value sourceFile = fir::factory::locationToFilename(builder, loc);
+    mlir::Value sourceLine =
+        fir::factory::locationToLineNo(builder, loc, fTy.getInput(2));
+    llvm::SmallVector<mlir::Value> args{fir::runtime::createArguments(
+        builder, loc, fTy, op.getDevptr(), sourceFile, sourceLine)};
+    builder.create<fir::CallOp>(loc, func, args);
+    rewriter.eraseOp(op);
+    return mlir::success();
   }
 };
 
@@ -143,8 +224,22 @@ class CufOpConversion : public fir::impl::CufOpConversionBase<CufOpConversion> {
     auto *ctx = &getContext();
     mlir::RewritePatternSet patterns(ctx);
     mlir::ConversionTarget target(*ctx);
-    target.addIllegalOp<cuf::AllocateOp, cuf::DeallocateOp>();
-    patterns.insert<CufAllocateOpConversion, CufDeallocateOpConversion>(ctx);
+
+    mlir::Operation *op = getOperation();
+    mlir::ModuleOp module = mlir::dyn_cast<mlir::ModuleOp>(op);
+    if (!module)
+      return signalPassFailure();
+
+    std::optional<mlir::DataLayout> dl =
+        fir::support::getOrSetDataLayout(module, /*allowDefaultLayout=*/false);
+    fir::LLVMTypeConverter typeConverter(module, /*applyTBAA=*/false,
+                                         /*forceUnifiedTBAATree=*/false, *dl);
+
+    target.addIllegalOp<::cuf::AllocOp, ::cuf::AllocateOp, ::cuf::DeallocateOp,
+                        ::cuf::FreeOp>();
+    patterns.insert<CufAllocOpConversion>(ctx, &*dl, &typeConverter);
+    patterns.insert<CufAllocateOpConversion, CufDeallocateOpConversion,
+                    CufFreeOpConversion>(ctx);
     if (mlir::failed(mlir::applyPartialConversion(getOperation(), target,
                                                   std::move(patterns)))) {
       mlir::emitError(mlir::UnknownLoc::get(ctx),
diff --git a/flang/runtime/CUDA/CMakeLists.txt b/flang/runtime/CUDA/CMakeLists.txt
index de1104f07ce6c..88243536139e4 100644
--- a/flang/runtime/CUDA/CMakeLists.txt
+++ b/flang/runtime/CUDA/CMakeLists.txt
@@ -11,6 +11,7 @@ find_library(CUDA_RUNTIME_LIBRARY cuda HINTS ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTOR
 
 add_flang_library(CufRuntime
   allocator.cpp
+  descriptor.cpp
 )
 target_link_libraries(CufRuntime
   PRIVATE
diff --git a/flang/runtime/CUDA/descriptor.cpp b/flang/runtime/CUDA/descriptor.cpp
new file mode 100644
index 0000000000000..bfac336b5b681
--- /dev/null
+++ b/flang/runtime/CUDA/descriptor.cpp
@@ -0,0 +1,28 @@
+//===-- runtime/CUDA/descriptor.cpp ---------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "flang/Runtime/CUDA/descriptor.h"
+#include "flang/Runtime/CUDA/allocator.h"
+
+namespace Fortran::runtime::cuf {
+extern "C" {
+RT_EXT_API_GROUP_BEGIN
+
+Descriptor *RTDEF(CUFAllocDesciptor)(
+    std::size_t sizeInBytes, const char *sourceFile, int sourceLine) {
+  return reinterpret_cast<Descriptor *>(CUFAllocManaged(sizeInBytes));
+}
+
+void RTDECL(CUFFreeDesciptor)(
+    Descriptor *desc, const char *sourceFile, int sourceLine) {
+  CUFFreeManaged(reinterpret_cast<void *>(desc));
+}
+
+RT_EXT_API_GROUP_END
+}
+} // namespace Fortran::runtime::cuf
diff --git a/flang/test/Fir/CUDA/cuda-allocate.fir b/flang/test/Fir/CUDA/cuda-allocate.fir
index ab4a253f33dd8..1274d3921dd85 100644
--- a/flang/test/Fir/CUDA/cuda-allocate.fir
+++ b/flang/test/Fir/CUDA/cuda-allocate.fir
@@ -1,5 +1,7 @@
 // RUN: fir-opt --cuf-convert %s | FileCheck %s
 
+module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>} {
+
 func.func @_QPsub1() {
   %0 = cuf.alloc !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "a", data_attr = #cuf.cuda<device>, uniq_name = "_QFsub1Ea"} -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
   %4:2 = hlfir.declare %0 {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub1Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
@@ -8,14 +10,21 @@ func.func @_QPsub1() {
   %c0_i32 = arith.constant 0 : i32
   %9 = cuf.allocate %4#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {data_attr = #cuf.cuda<device>} -> i32
   %10 = cuf.deallocate %4#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {data_attr = #cuf.cuda<device>} -> i32
+  cuf.free %4#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {data_attr = #cuf.cuda<device>}
   return
 }
 
+
 // CHECK-LABEL: func.func @_QPsub1()
-// CHECK: %[[DESC:.*]] = cuf.alloc !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "a", data_attr = #cuf.cuda<device>, uniq_name = "_QFsub1Ea"} -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
+// CHECK: %[[DESC_RT_CALL:.*]] = fir.call @_FortranACUFAllocDesciptor(%{{.*}}, %{{.*}}, %{{.*}}) : (i64, !fir.ref<i8>, i32) -> !fir.ref<!fir.box<none>>
+// CHECK: %[[DESC:.*]] = fir.convert %[[DESC_RT_CALL]] : (!fir.ref<!fir.box<none>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
 // CHECK: %[[DECL_DESC:.*]]:2 = hlfir.declare %[[DESC]] {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub1Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
 // CHECK: %[[BOX_NONE:.*]] = fir.convert %[[DECL_DESC]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
 // CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 
 // CHECK: %[[BOX_NONE:.*]] = fir.convert %[[DECL_DESC]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
 // CHECK: %{{.*}} = fir.call @_FortranAAllocatableDeallocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: %[[BOX_NONE:.*]] = fir.convert %[[DECL_DESC]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
+// CHECK: fir.call @_FortranACUFFreeDesciptor(%[[BOX_NONE]], %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.ref<i8>, i32) -> none
+
+}
diff --git a/flang/unittests/Runtime/CUDA/AllocatorCUF.cpp b/flang/unittests/Runtime/CUDA/AllocatorCUF.cpp
index f372ae18c202f..2355c47778cca 100644
--- a/flang/unittests/Runtime/CUDA/AllocatorCUF.cpp
+++ b/flang/unittests/Runtime/CUDA/AllocatorCUF.cpp
@@ -10,12 +10,14 @@
 #include "../../../runtime/terminator.h"
 #include "flang/Common/Fortran.h"
 #include "flang/Runtime/CUDA/allocator.h"
+#include "flang/Runtime/CUDA/descriptor.h"
 #include "flang/Runtime/allocatable.h"
 #include "flang/Runtime/allocator-registry.h"
 
 #include "cuda.h"
 
 using namespace Fortran::runtime;
+using namespace Fortran::runtime::cuf;
 
 static OwningPtr<Descriptor> createAllocatable(
     Fortran::common::TypeCategory tc, int kind, int rank = 1) {
@@ -87,3 +89,15 @@ TEST(AllocatableCUFTest, SimplePinnedAllocate) {
   (*a, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__);
   EXPECT_FALSE(a->IsAllocated());
 }
+
+TEST(AllocatableCUFTest, DescriptorAllocationTest) {
+  using Fortran::common::TypeCategory;
+  Fortran::runtime::cuf::CUFRegisterAllocator();
+  ScopedContext ctx;
+  // REAL(4), DEVICE, ALLOCATABLE :: a(:)
+  auto a{createAllocatable(TypeCategory::Real, 4)};
+  Descriptor *desc = nullptr;
+  desc = RTNAME(CUFAllocDesciptor)(a->SizeInBytes());
+  EXPECT_TRUE(desc != nullptr);
+  RTNAME(CUFFreeDesciptor)(desc);
+}

>From e7bd6545d5e926a9cb5e4edbae69d548820c69bd Mon Sep 17 00:00:00 2001
From: Valentin Clement <clementval at gmail.com>
Date: Mon, 5 Aug 2024 18:12:17 -0700
Subject: [PATCH 2/2] Switch to REDEF

---
 flang/runtime/CUDA/descriptor.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/flang/runtime/CUDA/descriptor.cpp b/flang/runtime/CUDA/descriptor.cpp
index bfac336b5b681..e228c0f76aae0 100644
--- a/flang/runtime/CUDA/descriptor.cpp
+++ b/flang/runtime/CUDA/descriptor.cpp
@@ -18,7 +18,7 @@ Descriptor *RTDEF(CUFAllocDesciptor)(
   return reinterpret_cast<Descriptor *>(CUFAllocManaged(sizeInBytes));
 }
 
-void RTDECL(CUFFreeDesciptor)(
+void RTDEF(CUFFreeDesciptor)(
     Descriptor *desc, const char *sourceFile, int sourceLine) {
   CUFFreeManaged(reinterpret_cast<void *>(desc));
 }