[flang-commits] [flang] [flang][cuda] Sync double descriptor after c_f_pointer call (PR #130194)
Valentin Clement バレンタイン クレメン via flang-commits
flang-commits at lists.llvm.org
Thu Mar 6 14:36:37 PST 2025
https://github.com/clementval created https://github.com/llvm/llvm-project/pull/130194
After a global device pointer is set through `c_f_pointer`, we need to sync the double descriptor so the version on the device is also up to date.
>From c5f2351a6fccbf5dce0db02cc728c2d280238172 Mon Sep 17 00:00:00 2001
From: Valentin Clement <clementval at gmail.com>
Date: Thu, 6 Mar 2025 13:58:19 -0800
Subject: [PATCH] [flang][cuda] Sync double descriptor after c_f_pointer call
---
flang/include/flang/Lower/Cuda.h | 21 ----------
.../flang/Optimizer/Builder/CUFCommon.h | 4 +-
.../Builder/Runtime/CUDA/Descriptor.h | 31 ++++++++++++++
flang/lib/Lower/Allocatable.cpp | 4 +-
flang/lib/Lower/Bridge.cpp | 2 +-
flang/lib/Optimizer/Builder/CMakeLists.txt | 1 +
flang/lib/Optimizer/Builder/CUFCommon.cpp | 40 +++++++++++++------
flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 13 ++++++
.../Builder/Runtime/CUDA/Descriptor.cpp | 34 ++++++++++++++++
.../Optimizer/Transforms/CUFOpConversion.cpp | 12 +-----
.../Transforms/SimplifyIntrinsics.cpp | 2 +-
flang/test/Lower/CUDA/cuda-pointer.cuf | 23 ++++++++++-
12 files changed, 138 insertions(+), 49 deletions(-)
create mode 100644 flang/include/flang/Optimizer/Builder/Runtime/CUDA/Descriptor.h
create mode 100644 flang/lib/Optimizer/Builder/Runtime/CUDA/Descriptor.cpp
diff --git a/flang/include/flang/Lower/Cuda.h b/flang/include/flang/Lower/Cuda.h
index d97045383d195..b6f849e3d63f0 100644
--- a/flang/include/flang/Lower/Cuda.h
+++ b/flang/include/flang/Lower/Cuda.h
@@ -20,27 +20,6 @@
#include "mlir/Dialect/OpenACC/OpenACC.h"
namespace Fortran::lower {
-// Check if the insertion point is currently in a device context. HostDevice
-// subprogram are not considered fully device context so it will return false
-// for it.
-// If the insertion point is inside an OpenACC region op, it is considered
-// device context.
-static bool inline isCudaDeviceContext(fir::FirOpBuilder &builder) {
- if (builder.getRegion().getParentOfType<cuf::KernelOp>())
- return true;
- if (builder.getRegion()
- .getParentOfType<mlir::acc::ComputeRegionOpInterface>())
- return true;
- if (auto funcOp = builder.getRegion().getParentOfType<mlir::func::FuncOp>()) {
- if (auto cudaProcAttr =
- funcOp.getOperation()->getAttrOfType<cuf::ProcAttributeAttr>(
- cuf::getProcAttrName())) {
- return cudaProcAttr.getValue() != cuf::ProcAttribute::Host &&
- cudaProcAttr.getValue() != cuf::ProcAttribute::HostDevice;
- }
- }
- return false;
-}
static inline unsigned getAllocatorIdx(const Fortran::semantics::Symbol &sym) {
std::optional<Fortran::common::CUDADataAttr> cudaAttr =
diff --git a/flang/include/flang/Optimizer/Builder/CUFCommon.h b/flang/include/flang/Optimizer/Builder/CUFCommon.h
index b99e330429622..e3c7b5098b83f 100644
--- a/flang/include/flang/Optimizer/Builder/CUFCommon.h
+++ b/flang/include/flang/Optimizer/Builder/CUFCommon.h
@@ -25,8 +25,10 @@ namespace cuf {
mlir::gpu::GPUModuleOp getOrCreateGPUModule(mlir::ModuleOp mod,
mlir::SymbolTable &symTab);
-bool isInCUDADeviceContext(mlir::Operation *op);
+bool isCUDADeviceContext(mlir::Operation *op);
+bool isCUDADeviceContext(mlir::Region &);
bool isRegisteredDeviceGlobal(fir::GlobalOp op);
+bool isRegisteredDeviceAttr(std::optional<cuf::DataAttribute> attr);
void genPointerSync(const mlir::Value box, fir::FirOpBuilder &builder);
diff --git a/flang/include/flang/Optimizer/Builder/Runtime/CUDA/Descriptor.h b/flang/include/flang/Optimizer/Builder/Runtime/CUDA/Descriptor.h
new file mode 100644
index 0000000000000..14d262bf22a70
--- /dev/null
+++ b/flang/include/flang/Optimizer/Builder/Runtime/CUDA/Descriptor.h
@@ -0,0 +1,31 @@
+//===-- Descriptor.h - CUDA descriptor runtime API calls --------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef FORTRAN_OPTIMIZER_BUILDER_RUNTIME_CUDA_DESCRIPTOR_H_
+#define FORTRAN_OPTIMIZER_BUILDER_RUNTIME_CUDA_DESCRIPTOR_H_
+
+#include "mlir/IR/Value.h"
+
+namespace mlir {
+class Location;
+} // namespace mlir
+
+namespace fir {
+class FirOpBuilder;
+}
+
+namespace fir::runtime::cuda {
+
+/// Generate runtime call to sync the double descriptor referenced by
+/// \p hostPtr.
+void genSyncGlobalDescriptor(fir::FirOpBuilder &builder, mlir::Location loc,
+ mlir::Value hostPtr);
+
+} // namespace fir::runtime::cuda
+
+#endif // FORTRAN_OPTIMIZER_BUILDER_RUNTIME_CUDA_DESCRIPTOR_H_
diff --git a/flang/lib/Lower/Allocatable.cpp b/flang/lib/Lower/Allocatable.cpp
index 3d21e7a3fa8d5..9938bd573d1fa 100644
--- a/flang/lib/Lower/Allocatable.cpp
+++ b/flang/lib/Lower/Allocatable.cpp
@@ -470,7 +470,7 @@ class AllocateStmtHelper {
void genSimpleAllocation(const Allocation &alloc,
const fir::MutableBoxValue &box) {
bool isCudaSymbol = Fortran::semantics::HasCUDAAttr(alloc.getSymbol());
- bool isCudaDeviceContext = Fortran::lower::isCudaDeviceContext(builder);
+ bool isCudaDeviceContext = cuf::isCUDADeviceContext(builder.getRegion());
bool inlineAllocation = !box.isDerived() && !errorManager.hasStatSpec() &&
!alloc.type.IsPolymorphic() &&
!alloc.hasCoarraySpec() && !useAllocateRuntime &&
@@ -862,7 +862,7 @@ genDeallocate(fir::FirOpBuilder &builder,
mlir::Value declaredTypeDesc = {},
const Fortran::semantics::Symbol *symbol = nullptr) {
bool isCudaSymbol = symbol && Fortran::semantics::HasCUDAAttr(*symbol);
- bool isCudaDeviceContext = Fortran::lower::isCudaDeviceContext(builder);
+ bool isCudaDeviceContext = cuf::isCUDADeviceContext(builder.getRegion());
bool inlineDeallocation =
!box.isDerived() && !box.isPolymorphic() && !box.hasAssumedRank() &&
!box.isUnlimitedPolymorphic() && !errorManager.hasStatSpec() &&
diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp
index 95f431983d442..e368974c92a3e 100644
--- a/flang/lib/Lower/Bridge.cpp
+++ b/flang/lib/Lower/Bridge.cpp
@@ -4689,7 +4689,7 @@ class FirConverter : public Fortran::lower::AbstractConverter {
mlir::Location loc = getCurrentLocation();
fir::FirOpBuilder &builder = getFirOpBuilder();
- bool isInDeviceContext = Fortran::lower::isCudaDeviceContext(builder);
+ bool isInDeviceContext = cuf::isCUDADeviceContext(builder.getRegion());
bool isCUDATransfer =
IsCUDADataTransfer(assign.lhs, assign.rhs) && !isInDeviceContext;
diff --git a/flang/lib/Optimizer/Builder/CMakeLists.txt b/flang/lib/Optimizer/Builder/CMakeLists.txt
index f0563d092e3dc..31ae395805faf 100644
--- a/flang/lib/Optimizer/Builder/CMakeLists.txt
+++ b/flang/lib/Optimizer/Builder/CMakeLists.txt
@@ -18,6 +18,7 @@ add_flang_library(FIRBuilder
Runtime/Assign.cpp
Runtime/Character.cpp
Runtime/Command.cpp
+ Runtime/CUDA/Descriptor.cpp
Runtime/Derived.cpp
Runtime/EnvironmentDefaults.cpp
Runtime/Exceptions.cpp
diff --git a/flang/lib/Optimizer/Builder/CUFCommon.cpp b/flang/lib/Optimizer/Builder/CUFCommon.cpp
index 39848205f47af..5f286c04a7ca0 100644
--- a/flang/lib/Optimizer/Builder/CUFCommon.cpp
+++ b/flang/lib/Optimizer/Builder/CUFCommon.cpp
@@ -12,6 +12,7 @@
#include "flang/Optimizer/HLFIR/HLFIROps.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
+#include "mlir/Dialect/OpenACC/OpenACC.h"
/// Retrieve or create the CUDA Fortran GPU module in the give in \p mod.
mlir::gpu::GPUModuleOp cuf::getOrCreateGPUModule(mlir::ModuleOp mod,
@@ -31,25 +32,34 @@ mlir::gpu::GPUModuleOp cuf::getOrCreateGPUModule(mlir::ModuleOp mod,
return gpuMod;
}
-bool cuf::isInCUDADeviceContext(mlir::Operation *op) {
- if (!op)
+bool cuf::isCUDADeviceContext(mlir::Operation *op) {
+ if (!op || !op->getParentRegion())
return false;
- if (op->getParentOfType<cuf::KernelOp>() ||
- op->getParentOfType<mlir::gpu::GPUFuncOp>())
+ return isCUDADeviceContext(*op->getParentRegion());
+}
+
+// Check if the insertion point is currently in a device context. HostDevice
+// subprograms are not considered fully device context, so this returns false
+// for them.
+// If the insertion point is inside an OpenACC region op, it is considered
+// device context.
+bool cuf::isCUDADeviceContext(mlir::Region &region) {
+ if (region.getParentOfType<cuf::KernelOp>())
+ return true;
+ if (region.getParentOfType<mlir::acc::ComputeRegionOpInterface>())
return true;
- if (auto funcOp = op->getParentOfType<mlir::func::FuncOp>()) {
- if (auto cudaProcAttr = funcOp->getAttrOfType<cuf::ProcAttributeAttr>(
- cuf::getProcAttrName())) {
- return cudaProcAttr.getValue() != cuf::ProcAttribute::Host;
+ if (auto funcOp = region.getParentOfType<mlir::func::FuncOp>()) {
+ if (auto cudaProcAttr =
+ funcOp.getOperation()->getAttrOfType<cuf::ProcAttributeAttr>(
+ cuf::getProcAttrName())) {
+ return cudaProcAttr.getValue() != cuf::ProcAttribute::Host &&
+ cudaProcAttr.getValue() != cuf::ProcAttribute::HostDevice;
}
}
return false;
}
-bool cuf::isRegisteredDeviceGlobal(fir::GlobalOp op) {
- if (op.getConstant())
- return false;
- auto attr = op.getDataAttr();
+bool cuf::isRegisteredDeviceAttr(std::optional<cuf::DataAttribute> attr) {
if (attr && (*attr == cuf::DataAttribute::Device ||
*attr == cuf::DataAttribute::Managed ||
*attr == cuf::DataAttribute::Constant))
@@ -57,6 +67,12 @@ bool cuf::isRegisteredDeviceGlobal(fir::GlobalOp op) {
return false;
}
+bool cuf::isRegisteredDeviceGlobal(fir::GlobalOp op) {
+ if (op.getConstant())
+ return false;
+ return isRegisteredDeviceAttr(op.getDataAttr());
+}
+
void cuf::genPointerSync(const mlir::Value box, fir::FirOpBuilder &builder) {
if (auto declareOp = box.getDefiningOp<hlfir::DeclareOp>()) {
if (auto addrOfOp = declareOp.getMemref().getDefiningOp<fir::AddrOfOp>()) {
diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
index c52b0cbaf2019..8370e82c10b67 100644
--- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
@@ -16,12 +16,14 @@
#include "flang/Optimizer/Builder/IntrinsicCall.h"
#include "flang/Common/static-multimap-view.h"
#include "flang/Optimizer/Builder/BoxValue.h"
+#include "flang/Optimizer/Builder/CUFCommon.h"
#include "flang/Optimizer/Builder/Character.h"
#include "flang/Optimizer/Builder/Complex.h"
#include "flang/Optimizer/Builder/FIRBuilder.h"
#include "flang/Optimizer/Builder/MutableBox.h"
#include "flang/Optimizer/Builder/PPCIntrinsicCall.h"
#include "flang/Optimizer/Builder/Runtime/Allocatable.h"
+#include "flang/Optimizer/Builder/Runtime/CUDA/Descriptor.h"
#include "flang/Optimizer/Builder/Runtime/Character.h"
#include "flang/Optimizer/Builder/Runtime/Command.h"
#include "flang/Optimizer/Builder/Runtime/Derived.h"
@@ -38,6 +40,7 @@
#include "flang/Optimizer/Dialect/FIROps.h"
#include "flang/Optimizer/Dialect/FIROpsSupport.h"
#include "flang/Optimizer/Dialect/Support/FIRContext.h"
+#include "flang/Optimizer/HLFIR/HLFIROps.h"
#include "flang/Optimizer/Support/FatalError.h"
#include "flang/Optimizer/Support/Utils.h"
#include "flang/Runtime/entry-names.h"
@@ -3254,6 +3257,16 @@ void IntrinsicLibrary::genCFPointer(llvm::ArrayRef<fir::ExtendedValue> args) {
fir::factory::associateMutableBox(builder, loc, *fPtr, getCPtrExtVal(*fPtr),
/*lbounds=*/mlir::ValueRange{});
+
+ // If the pointer is a registered CUDA Fortran variable, the descriptor needs
+ // to be synced.
+ if (auto declare = mlir::dyn_cast_or_null<hlfir::DeclareOp>(
+ fPtr->getAddr().getDefiningOp()))
+ if (mlir::isa<fir::AddrOfOp>(declare.getMemref().getDefiningOp()))
+ if (cuf::isRegisteredDeviceAttr(declare.getDataAttr()) &&
+ !cuf::isCUDADeviceContext(builder.getRegion()))
+ fir::runtime::cuda::genSyncGlobalDescriptor(builder, loc,
+ declare.getMemref());
}
// C_F_PROCPOINTER
diff --git a/flang/lib/Optimizer/Builder/Runtime/CUDA/Descriptor.cpp b/flang/lib/Optimizer/Builder/Runtime/CUDA/Descriptor.cpp
new file mode 100644
index 0000000000000..90662c094c65e
--- /dev/null
+++ b/flang/lib/Optimizer/Builder/Runtime/CUDA/Descriptor.cpp
@@ -0,0 +1,34 @@
+
+//===-- Descriptor.cpp -- CUDA descriptor runtime API calls ---------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Coding style: https://mlir.llvm.org/getting_started/DeveloperGuide/
+//
+//===----------------------------------------------------------------------===//
+
+#include "flang/Optimizer/Builder/Runtime/CUDA/Descriptor.h"
+#include "flang/Optimizer/Builder/FIRBuilder.h"
+#include "flang/Optimizer/Builder/Runtime/RTBuilder.h"
+#include "flang/Runtime/CUDA/descriptor.h"
+
+using namespace Fortran::runtime::cuda;
+
+void fir::runtime::cuda::genSyncGlobalDescriptor(fir::FirOpBuilder &builder,
+ mlir::Location loc,
+ mlir::Value hostPtr) {
+ mlir::func::FuncOp callee =
+ fir::runtime::getRuntimeFunc<mkRTKey(CUFSyncGlobalDescriptor)>(loc,
+ builder);
+ auto fTy = callee.getFunctionType();
+ mlir::Value sourceFile = fir::factory::locationToFilename(builder, loc);
+ mlir::Value sourceLine =
+ fir::factory::locationToLineNo(builder, loc, fTy.getInput(2));
+ llvm::SmallVector<mlir::Value> args{fir::runtime::createArguments(
+ builder, loc, fTy, hostPtr, sourceFile, sourceLine)};
+ builder.create<fir::CallOp>(loc, callee, args);
+}
diff --git a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
index 2ab2d84f1643d..0fbec8a204b8d 100644
--- a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
+++ b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
@@ -8,6 +8,7 @@
#include "flang/Optimizer/Transforms/CUFOpConversion.h"
#include "flang/Optimizer/Builder/CUFCommon.h"
+#include "flang/Optimizer/Builder/Runtime/CUDA/Descriptor.h"
#include "flang/Optimizer/Builder/Runtime/RTBuilder.h"
#include "flang/Optimizer/CodeGen/TypeConverter.h"
#include "flang/Optimizer/Dialect/CUF/CUFOps.h"
@@ -904,16 +905,7 @@ struct CUFSyncDescriptorOpConversion
auto hostAddr = builder.create<fir::AddrOfOp>(
loc, fir::ReferenceType::get(globalOp.getType()), op.getGlobalName());
- mlir::func::FuncOp callee =
- fir::runtime::getRuntimeFunc<mkRTKey(CUFSyncGlobalDescriptor)>(loc,
- builder);
- auto fTy = callee.getFunctionType();
- mlir::Value sourceFile = fir::factory::locationToFilename(builder, loc);
- mlir::Value sourceLine =
- fir::factory::locationToLineNo(builder, loc, fTy.getInput(2));
- llvm::SmallVector<mlir::Value> args{fir::runtime::createArguments(
- builder, loc, fTy, hostAddr, sourceFile, sourceLine)};
- builder.create<fir::CallOp>(loc, callee, args);
+ fir::runtime::cuda::genSyncGlobalDescriptor(builder, loc, hostAddr);
op.erase();
return mlir::success();
}
diff --git a/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp b/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp
index df2887ff1422e..2484f4f6b99a4 100644
--- a/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp
+++ b/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp
@@ -1277,7 +1277,7 @@ void SimplifyIntrinsicsPass::runOnOperation() {
fir::KindMapping kindMap = fir::getKindMapping(module);
module.walk([&](mlir::Operation *op) {
if (auto call = mlir::dyn_cast<fir::CallOp>(op)) {
- if (cuf::isInCUDADeviceContext(op))
+ if (cuf::isCUDADeviceContext(op))
return;
if (mlir::SymbolRefAttr callee = call.getCalleeAttr()) {
mlir::StringRef funcName = callee.getLeafReference().getValue();
diff --git a/flang/test/Lower/CUDA/cuda-pointer.cuf b/flang/test/Lower/CUDA/cuda-pointer.cuf
index 2a9dbe54c2922..e9614751673e0 100644
--- a/flang/test/Lower/CUDA/cuda-pointer.cuf
+++ b/flang/test/Lower/CUDA/cuda-pointer.cuf
@@ -2,10 +2,31 @@
! Test lowering of CUDA pointers.
+module mod1
+
+integer, device, pointer :: x(:)
+
+contains
+
subroutine allocate_pointer
real, device, pointer :: pr(:)
allocate(pr(10))
end
-! CHECK-LABEL: func.func @_QPallocate_pointer()
+! CHECK-LABEL: func.func @_QMmod1Pallocate_pointer()
! CHECK-COUNT-2: fir.embox %{{.*}} {allocator_idx = 2 : i32} : (!fir.ptr<!fir.array<?xf32>>, !fir.shape<1>) -> !fir.box<!fir.ptr<!fir.array<?xf32>>>
+
+subroutine c_f_pointer_sync
+ use iso_c_binding
+ use, intrinsic :: __fortran_builtins, only: c_devptr => __builtin_c_devptr
+ type(c_devptr) :: cd1
+ integer, parameter :: N = 2000
+ call c_f_pointer(cd1, x, (/ 2000 /))
+end
+
+! CHECK-LABEL: func.func @_QMmod1Pc_f_pointer_sync()
+! CHECK: %[[ADDR_X:.*]] = fir.address_of(@_QMmod1Ex) : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
+! CHECK: %[[CONV:.*]] = fir.convert %[[ADDR_X]] : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>) -> !fir.llvm_ptr<i8>
+! CHECK: fir.call @_FortranACUFSyncGlobalDescriptor(%[[CONV]], %{{.*}}, %{{.*}}) fastmath<contract> : (!fir.llvm_ptr<i8>, !fir.ref<i8>, i32) -> ()
+
+end module
More information about the flang-commits
mailing list