[flang] [llvm] [flang][cuda] Support non-allocatable module-level managed variables (PR #189753)
Zhen Wang via llvm-commits
llvm-commits at lists.llvm.org
Wed Apr 1 10:05:03 PDT 2026
https://github.com/wangzpgi updated https://github.com/llvm/llvm-project/pull/189753
>From 94cc58a8864c97a36d64742e17ebdcfd00c7620f Mon Sep 17 00:00:00 2001
From: Zhen Wang <zhenw at nvidia.com>
Date: Tue, 31 Mar 2026 13:57:36 -0700
Subject: [PATCH 1/2] [flang][cuda] Support non-allocatable module-level
managed variables
Add support for non-allocatable module-level CUDA managed variables
using pointer indirection through a companion global in
__nv_managed_data__. The CUDA runtime populates this pointer with the
unified memory address via __cudaRegisterManagedVar and __cudaInitModule.
- Create a .managed.ptr companion global in the __nv_managed_data__
section and register it with _FortranACUFRegisterManagedVariable
- Call __cudaInitModule once after all variables are registered to
populate managed pointers
- Annotate managed globals in gpu.module with nvvm.managed for PTX
.attribute(.managed) generation
- Suppress cuf.data_transfer for assignments to/from non-allocatable
module managed variables, since cudaMemcpy would target the shadow
address rather than the actual unified memory
- Preserve cuf.data_transfer for device_var = managed_var assignments
where explicit transfer is still required
---
flang-rt/lib/cuda/registration.cpp | 3 +
flang/include/flang/Evaluate/tools.h | 36 ++++++--
.../include/flang/Runtime/CUDA/registration.h | 5 ++
flang/lib/Optimizer/CodeGen/CodeGen.cpp | 9 ++
.../Transforms/CUDA/CUFAddConstructor.cpp | 84 +++++++++++++++----
.../Transforms/CUDA/CUFOpConversionLate.cpp | 22 ++++-
flang/test/Fir/CUDA/cuda-code-gen.mlir | 16 ++++
flang/test/Fir/CUDA/cuda-constructor-2.f90 | 37 +++++++-
flang/test/Fir/CUDA/cuda-device-address.mlir | 39 +++++++++
flang/test/Lower/CUDA/cuda-data-transfer.cuf | 36 ++++++++
10 files changed, 265 insertions(+), 22 deletions(-)
diff --git a/flang-rt/lib/cuda/registration.cpp b/flang-rt/lib/cuda/registration.cpp
index 8123220c2624c..58077d6a6a52b 100644
--- a/flang-rt/lib/cuda/registration.cpp
+++ b/flang-rt/lib/cuda/registration.cpp
@@ -27,6 +27,7 @@ extern void __cudaRegisterVar(void **fatCubinHandle, char *hostVar,
extern void __cudaRegisterManagedVar(void **fatCubinHandle,
void **hostVarPtrAddress, char *deviceAddress, const char *deviceName,
int ext, size_t size, int constant, int global);
+extern char __cudaInitModule(void **fatCubinHandle);
void *RTDECL(CUFRegisterModule)(void *data) {
void **fatHandle{__cudaRegisterFatBinary(data)};
@@ -50,6 +51,8 @@ void RTDEF(CUFRegisterManagedVariable)(
__cudaRegisterManagedVar(module, varSym, varName, varName, 0, size, 0, 0);
}
+void RTDEF(CUFInitModule)(void **module) { __cudaInitModule(module); }
+
} // extern "C"
} // namespace Fortran::runtime::cuda
diff --git a/flang/include/flang/Evaluate/tools.h b/flang/include/flang/Evaluate/tools.h
index 963452755064d..51dc0582fcdea 100644
--- a/flang/include/flang/Evaluate/tools.h
+++ b/flang/include/flang/Evaluate/tools.h
@@ -1311,6 +1311,28 @@ inline bool IsCUDAManagedOrUnifiedSymbol(const Symbol &sym) {
return false;
}
+// Non-allocatable module-level managed/unified variables use pointer
+// indirection through a companion global in __nv_managed_data__.
+// Explicit data transfers (cudaMemcpy) must be avoided for these
+// variables since they would target the shadow address rather than
+// the actual unified memory address.
+inline bool IsNonAllocatableModuleCUDAManagedSymbol(const Symbol &sym) {
+ const Symbol &ultimate = sym.GetUltimate();
+ if (!IsCUDAManagedOrUnifiedSymbol(ultimate))
+ return false;
+ if (ultimate.attrs().test(semantics::Attr::ALLOCATABLE))
+ return false;
+ return ultimate.owner().IsModule();
+}
+
+template <typename A>
+inline bool HasNonAllocatableModuleCUDAManagedSymbols(const A &expr) {
+ for (const Symbol &sym : CollectCudaSymbols(expr))
+ if (IsNonAllocatableModuleCUDAManagedSymbol(sym))
+ return true;
+ return false;
+}
+
// Get the number of distinct symbols with CUDA device
// attribute in the expression.
template <typename A> inline int GetNbOfCUDADeviceSymbols(const A &expr) {
@@ -1350,17 +1372,21 @@ inline bool IsCUDADataTransfer(const A &lhs, const B &rhs) {
int rhsNbManagedSymbols{GetNbOfCUDAManagedOrUnifiedSymbols(rhs)};
int rhsNbSymbols{GetNbOfCUDADeviceSymbols(rhs)};
+ if (HasNonAllocatableModuleCUDAManagedSymbols(lhs))
+ return false;
+
if (lhsNbManagedSymbols >= 1 && lhs.Rank() > 0 && rhsNbSymbols == 0 &&
rhsNbManagedSymbols == 0 && (IsVariable(rhs) || IsConstantExpr(rhs))) {
return true; // Managed arrays initialization is performed on the device.
}
- // Special cases performed on the host:
- // - Only managed or unifed symbols are involved on RHS and LHS.
- // - LHS is managed or unified and the RHS is host only.
+ // Cases where no explicit data transfer is needed:
+ // - Both sides involve only managed/unified symbols (host-accessible).
+ // - LHS is host-only and RHS has only managed/unified symbols.
+ // - LHS is managed/unified and RHS is host-only.
if ((lhsNbManagedSymbols >= 1 && rhsNbManagedSymbols == rhsNbSymbols) ||
- (lhsNbManagedSymbols == 0 && rhsNbManagedSymbols >= 1 &&
- rhsNbManagedSymbols == rhsNbSymbols) ||
+ (lhsNbManagedSymbols == 0 && !HasCUDADeviceAttrs(lhs) &&
+ rhsNbManagedSymbols >= 1 && rhsNbManagedSymbols == rhsNbSymbols) ||
(lhsNbManagedSymbols >= 1 && rhsNbSymbols == 0)) {
return false;
}
diff --git a/flang/include/flang/Runtime/CUDA/registration.h b/flang/include/flang/Runtime/CUDA/registration.h
index 15f013432fa04..74dbf9e189076 100644
--- a/flang/include/flang/Runtime/CUDA/registration.h
+++ b/flang/include/flang/Runtime/CUDA/registration.h
@@ -32,6 +32,11 @@ void RTDECL(CUFRegisterVariable)(
void RTDECL(CUFRegisterManagedVariable)(
void **module, void **varSym, char *varName, int64_t size);
+/// Initialize a CUDA module after all variables have been registered.
+/// Triggers the runtime to populate managed variable pointers with
+/// unified memory addresses.
+void RTDECL(CUFInitModule)(void **module);
+
} // extern "C"
} // namespace Fortran::runtime::cuda
diff --git a/flang/lib/Optimizer/CodeGen/CodeGen.cpp b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
index 25eb6194efa99..2d01463cf604d 100644
--- a/flang/lib/Optimizer/CodeGen/CodeGen.cpp
+++ b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
@@ -3448,6 +3448,15 @@ struct GlobalOpConversion : public fir::FIROpConversion<fir::GlobalOp> {
g.setAddrSpace(
static_cast<unsigned>(mlir::NVVM::NVVMMemorySpace::Constant));
+ if (gpuMod && global.getDataAttr() &&
+ *global.getDataAttr() == cuf::DataAttribute::Managed &&
+ !mlir::isa<fir::BaseBoxType>(global.getType())) {
+ g.setAddrSpace(
+ static_cast<unsigned>(mlir::NVVM::NVVMMemorySpace::Global));
+ g->setAttr(mlir::NVVM::NVVMDialect::getManagedAttrName(),
+ mlir::UnitAttr::get(global.getContext()));
+ }
+
rewriter.eraseOp(global);
return mlir::success();
}
diff --git a/flang/lib/Optimizer/Transforms/CUDA/CUFAddConstructor.cpp b/flang/lib/Optimizer/Transforms/CUDA/CUFAddConstructor.cpp
index baa8e591ee162..9ed76745c2610 100644
--- a/flang/lib/Optimizer/Transforms/CUDA/CUFAddConstructor.cpp
+++ b/flang/lib/Optimizer/Transforms/CUDA/CUFAddConstructor.cpp
@@ -41,6 +41,40 @@ namespace {
static constexpr llvm::StringRef cudaFortranCtorName{
"__cudaFortranConstructor"};
+static constexpr llvm::StringRef managedPtrSuffix{".managed.ptr"};
+
+/// Create an 8-byte pointer global in the __nv_managed_data__ section.
+/// The CUDA runtime populates this pointer with the unified memory address
+/// when the module is initialized via __cudaInitModule.
+static fir::GlobalOp createManagedPointerGlobal(fir::FirOpBuilder &builder,
+ mlir::ModuleOp mod,
+ fir::GlobalOp globalOp) {
+ mlir::MLIRContext *ctx = mod.getContext();
+ std::string ptrGlobalName = (globalOp.getSymName() + managedPtrSuffix).str();
+ auto ptrTy = fir::LLVMPointerType::get(ctx, mlir::IntegerType::get(ctx, 8));
+
+ mlir::OpBuilder::InsertionGuard guard(builder);
+ builder.setInsertionPointAfter(globalOp);
+
+ llvm::SmallVector<mlir::NamedAttribute> attrs;
+ attrs.push_back(
+ mlir::NamedAttribute(mlir::StringAttr::get(ctx, "section"),
+ mlir::StringAttr::get(ctx, "__nv_managed_data__")));
+
+ mlir::DenseElementsAttr initAttr = {};
+ auto ptrGlobal = fir::GlobalOp::create(
+ builder, globalOp.getLoc(), ptrGlobalName, /*isConstant=*/false,
+ /*isTarget=*/false, ptrTy, initAttr,
+ /*linkName=*/builder.createInternalLinkage(), attrs);
+
+ mlir::Region &region = ptrGlobal.getRegion();
+ mlir::Block *block = builder.createBlock(&region);
+ builder.setInsertionPointToStart(block);
+ mlir::Value zero = fir::ZeroOp::create(builder, globalOp.getLoc(), ptrTy);
+ fir::HasValueOp::create(builder, globalOp.getLoc(), zero);
+
+ return ptrGlobal;
+}
struct CUFAddConstructor
: public fir::impl::CUFAddConstructorBase<CUFAddConstructor> {
@@ -108,19 +142,15 @@ struct CUFAddConstructor
if (!attr)
continue;
- if (attr.getValue() == cuf::DataAttribute::Managed &&
- !mlir::isa<fir::BaseBoxType>(globalOp.getType()))
- TODO(loc, "registration of non-allocatable managed variables");
+ bool isNonAllocManagedGlobal =
+ attr.getValue() == cuf::DataAttribute::Managed &&
+ !mlir::isa<fir::BaseBoxType>(globalOp.getType());
mlir::func::FuncOp func;
switch (attr.getValue()) {
case cuf::DataAttribute::Device:
case cuf::DataAttribute::Constant:
case cuf::DataAttribute::Managed: {
- func = fir::runtime::getRuntimeFunc<mkRTKey(CUFRegisterVariable)>(
- loc, builder);
- auto fTy = func.getFunctionType();
-
// Global variable name
std::string gblNameStr = globalOp.getSymbol().getValue().str();
gblNameStr += '\0';
@@ -141,18 +171,44 @@ struct CUFAddConstructor
}
auto sizeVal = builder.createIntegerConstant(loc, idxTy, *size);
- // Global variable address
- mlir::Value addr = fir::AddrOfOp::create(
- builder, loc, globalOp.resultType(), globalOp.getSymbol());
-
- llvm::SmallVector<mlir::Value> args{fir::runtime::createArguments(
- builder, loc, fTy, registeredMod, addr, gblName, sizeVal)};
- fir::CallOp::create(builder, loc, func, args);
+ if (isNonAllocManagedGlobal) {
+ // Non-allocatable managed globals use pointer indirection:
+ // a companion pointer in __nv_managed_data__ holds the unified
+ // memory address, registered via __cudaRegisterManagedVar.
+ fir::GlobalOp ptrGlobal =
+ createManagedPointerGlobal(builder, mod, globalOp);
+ func = fir::runtime::getRuntimeFunc<mkRTKey(
+ CUFRegisterManagedVariable)>(loc, builder);
+ auto fTy = func.getFunctionType();
+ mlir::Value addr = fir::AddrOfOp::create(
+ builder, loc, ptrGlobal.resultType(), ptrGlobal.getSymbol());
+ llvm::SmallVector<mlir::Value> args{fir::runtime::createArguments(
+ builder, loc, fTy, registeredMod, addr, gblName, sizeVal)};
+ fir::CallOp::create(builder, loc, func, args);
+ } else {
+ func = fir::runtime::getRuntimeFunc<mkRTKey(CUFRegisterVariable)>(
+ loc, builder);
+ auto fTy = func.getFunctionType();
+ mlir::Value addr = fir::AddrOfOp::create(
+ builder, loc, globalOp.resultType(), globalOp.getSymbol());
+ llvm::SmallVector<mlir::Value> args{fir::runtime::createArguments(
+ builder, loc, fTy, registeredMod, addr, gblName, sizeVal)};
+ fir::CallOp::create(builder, loc, func, args);
+ }
} break;
default:
break;
}
}
+
+ // Initialize the module after all variables are registered so the
+ // runtime populates managed variable unified memory pointers.
+ mlir::func::FuncOp initFunc =
+ fir::runtime::getRuntimeFunc<mkRTKey(CUFInitModule)>(loc, builder);
+ auto initFTy = initFunc.getFunctionType();
+ llvm::SmallVector<mlir::Value> initArgs{
+ fir::runtime::createArguments(builder, loc, initFTy, registeredMod)};
+ fir::CallOp::create(builder, loc, initFunc, initArgs);
}
mlir::LLVM::ReturnOp::create(builder, loc, mlir::ValueRange{});
diff --git a/flang/lib/Optimizer/Transforms/CUDA/CUFOpConversionLate.cpp b/flang/lib/Optimizer/Transforms/CUDA/CUFOpConversionLate.cpp
index fe459712a6ba4..62f95f5d23c34 100644
--- a/flang/lib/Optimizer/Transforms/CUDA/CUFOpConversionLate.cpp
+++ b/flang/lib/Optimizer/Transforms/CUDA/CUFOpConversionLate.cpp
@@ -13,6 +13,7 @@
#include "flang/Optimizer/Dialect/CUF/CUFOps.h"
#include "flang/Optimizer/Dialect/FIRDialect.h"
#include "flang/Optimizer/Dialect/FIROps.h"
+#include "flang/Optimizer/Dialect/FIRType.h"
#include "flang/Optimizer/Transforms/Passes.h"
#include "flang/Runtime/CUDA/common.h"
#include "flang/Runtime/CUDA/descriptor.h"
@@ -48,6 +49,8 @@ static mlir::Value createConvertOp(mlir::PatternRewriter &rewriter,
return val;
}
+static constexpr llvm::StringRef managedPtrSuffix{".managed.ptr"};
+
struct CUFDeviceAddressOpConversion
: public mlir::OpRewritePattern<cuf::DeviceAddressOp> {
using OpRewritePattern::OpRewritePattern;
@@ -59,10 +62,25 @@ struct CUFDeviceAddressOpConversion
mlir::LogicalResult
matchAndRewrite(cuf::DeviceAddressOp op,
mlir::PatternRewriter &rewriter) const override {
- if (auto global = symTab.lookup<fir::GlobalOp>(
- op.getHostSymbol().getRootReference().getValue())) {
+ auto symName = op.getHostSymbol().getRootReference().getValue();
+ if (auto global = symTab.lookup<fir::GlobalOp>(symName)) {
auto mod = op->getParentOfType<mlir::ModuleOp>();
mlir::Location loc = op.getLoc();
+
+ // For non-allocatable managed globals, CUFAddConstructor created a
+ // companion pointer global (@sym.managed.ptr) that holds the unified
+ // memory address. Load from it instead of calling CUFGetDeviceAddress.
+ std::string ptrGlobalName = (symName + managedPtrSuffix).str();
+ if (auto ptrGlobal = symTab.lookup<fir::GlobalOp>(ptrGlobalName)) {
+ auto ptrRef = fir::AddrOfOp::create(
+ rewriter, loc, ptrGlobal.resultType(), ptrGlobal.getSymbol());
+ auto rawPtr = fir::LoadOp::create(rewriter, loc, ptrRef);
+ auto converted =
+ fir::ConvertOp::create(rewriter, loc, op.getType(), rawPtr);
+ rewriter.replaceOp(op, converted);
+ return success();
+ }
+
auto hostAddr = fir::AddrOfOp::create(
rewriter, loc, fir::ReferenceType::get(global.getType()),
op.getHostSymbol());
diff --git a/flang/test/Fir/CUDA/cuda-code-gen.mlir b/flang/test/Fir/CUDA/cuda-code-gen.mlir
index e83648f21bdf1..fc962f8de5039 100644
--- a/flang/test/Fir/CUDA/cuda-code-gen.mlir
+++ b/flang/test/Fir/CUDA/cuda-code-gen.mlir
@@ -312,3 +312,19 @@ module attributes {gpu.container_module, dlti.dl_spec = #dlti.dl_spec<#dlti.dl_e
// CHECK-LABEL: gpu.func @_QMkernelsPassign
// CHECK: %[[ADDROF:.*]] = llvm.mlir.addressof @_QMkernelsEinitial_val : !llvm.ptr<4>
// CHECK: %{{.*}} = llvm.addrspacecast %[[ADDROF]] : !llvm.ptr<4> to !llvm.ptr
+
+// -----
+
+// Test that non-allocatable managed globals inside gpu.module get
+// addr_space = 1 (Global) and the nvvm.managed annotation.
+
+module attributes {gpu.container_module, dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>} {
+ gpu.module @cuda_device_mod {
+ fir.global @_QMtestEmanx {data_attr = #cuf.cuda<managed>} : !fir.array<100xi32> {
+ %0 = fir.zero_bits !fir.array<100xi32>
+ fir.has_value %0 : !fir.array<100xi32>
+ }
+ }
+}
+
+// CHECK: llvm.mlir.global external @_QMtestEmanx() {addr_space = 1 : i32, nvvm.managed} : !llvm.array<100 x i32>
diff --git a/flang/test/Fir/CUDA/cuda-constructor-2.f90 b/flang/test/Fir/CUDA/cuda-constructor-2.f90
index f21d8f9c37637..d61ca4849ec37 100644
--- a/flang/test/Fir/CUDA/cuda-constructor-2.f90
+++ b/flang/test/Fir/CUDA/cuda-constructor-2.f90
@@ -29,7 +29,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr, dense<
// CHECK-DAG: %[[BOX:.*]] = fir.address_of(@_QMmtestsEndev) : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
// CHECK-DAG: %[[BOXREF:.*]] = fir.convert %[[BOX]] : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> !fir.ref<i8>
// CHECK-DAG: fir.call @_FortranACUFRegisterVariable(%[[MODULE:.*]], %[[BOXREF]], %{{.*}}, %{{.*}})
-//
+// CHECK: fir.call @_FortranACUFInitModule
// -----
@@ -78,3 +78,38 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<i8 = dense<8> : vector<2xi64>, i
// CHECK: llvm.func internal @__cudaFortranConstructor()
// CHECK: fir.address_of(@_QMmEa00)
// CHECK: fir.call @_FortranACUFRegisterVariable
+// CHECK: fir.call @_FortranACUFInitModule
+
+// -----
+
+// Non-allocatable managed global: should create pointer global in
+// __nv_managed_data__ and register with CUFRegisterManagedVariable.
+//
+// Fortran source:
+// module test
+// integer*4, managed :: manx(100)
+// contains
+// attributes(global) subroutine kernel()
+// end subroutine
+// end module
+
+module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f32, dense<32> : vector<2xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-linux-gnu"} {
+
+ fir.global @_QMtestEmanx {data_attr = #cuf.cuda<managed>} : !fir.array<100xi32> {
+ %0 = fir.zero_bits !fir.array<100xi32>
+ fir.has_value %0 : !fir.array<100xi32>
+ }
+
+ gpu.module @cuda_device_mod {
+ }
+}
+
+// Pointer global should be created with section attribute.
+// CHECK: fir.global internal @_QMtestEmanx.managed.ptr {section = "__nv_managed_data__"} : !fir.llvm_ptr<i8>
+// CHECK: fir.zero_bits !fir.llvm_ptr<i8>
+
+// Constructor should register with CUFRegisterManagedVariable then init module.
+// CHECK: llvm.func internal @__cudaFortranConstructor()
+// CHECK: fir.address_of(@_QMtestEmanx.managed.ptr) : !fir.ref<!fir.llvm_ptr<i8>>
+// CHECK: fir.call @_FortranACUFRegisterManagedVariable
+// CHECK: fir.call @_FortranACUFInitModule
diff --git a/flang/test/Fir/CUDA/cuda-device-address.mlir b/flang/test/Fir/CUDA/cuda-device-address.mlir
index e86208321b8ab..a2dae71557869 100644
--- a/flang/test/Fir/CUDA/cuda-device-address.mlir
+++ b/flang/test/Fir/CUDA/cuda-device-address.mlir
@@ -12,3 +12,42 @@ func.func @_QPxa(%arg0: !fir.ref<!fir.array<?xi32>> {cuf.data_attr = #cuf.cuda<d
// CHECK-LABEL: func.func @_QPxa
// CHECK: fir.call @_FortranACUFGetDeviceAddress
+
+// -----
+
+// Non-allocatable managed global with companion pointer global:
+// cuf.device_address should load from the pointer global instead of
+// calling CUFGetDeviceAddress.
+//
+// Fortran source:
+// module test
+// integer*4, managed :: manx(100)
+// end module
+// subroutine user()
+// use test
+// manx(1) = 42
+// end subroutine
+
+fir.global @_QMtestEmanx {data_attr = #cuf.cuda<managed>} : !fir.array<100xi32> {
+ %0 = fir.zero_bits !fir.array<100xi32>
+ fir.has_value %0 : !fir.array<100xi32>
+}
+
+fir.global internal @_QMtestEmanx.managed.ptr {section = "__nv_managed_data__"} : !fir.llvm_ptr<i8> {
+ %0 = fir.zero_bits !fir.llvm_ptr<i8>
+ fir.has_value %0 : !fir.llvm_ptr<i8>
+}
+
+func.func @_QPuser() {
+ %c100 = arith.constant 100 : index
+ %0 = cuf.device_address @_QMtestEmanx -> !fir.ref<!fir.array<100xi32>>
+ %1 = fir.shape %c100 : (index) -> !fir.shape<1>
+ %2 = fir.declare %0(%1) {uniq_name = "_QMtestEmanx"} : (!fir.ref<!fir.array<100xi32>>, !fir.shape<1>) -> !fir.ref<!fir.array<100xi32>>
+ return
+}
+
+// CHECK-LABEL: func.func @_QPuser
+// CHECK-NOT: fir.call @_FortranACUFGetDeviceAddress
+// CHECK: %[[PTR_REF:.*]] = fir.address_of(@_QMtestEmanx.managed.ptr) : !fir.ref<!fir.llvm_ptr<i8>>
+// CHECK: %[[RAW_PTR:.*]] = fir.load %[[PTR_REF]] : !fir.ref<!fir.llvm_ptr<i8>>
+// CHECK: %[[ADDR:.*]] = fir.convert %[[RAW_PTR]] : (!fir.llvm_ptr<i8>) -> !fir.ref<!fir.array<100xi32>>
diff --git a/flang/test/Lower/CUDA/cuda-data-transfer.cuf b/flang/test/Lower/CUDA/cuda-data-transfer.cuf
index 66c3a28f9aec4..1d0e510c110ee 100644
--- a/flang/test/Lower/CUDA/cuda-data-transfer.cuf
+++ b/flang/test/Lower/CUDA/cuda-data-transfer.cuf
@@ -637,3 +637,39 @@ end subroutine
! CHECK-LABEL: func.func @_QPsub34
! CHECK: cuf.data_transfer %{{.*}} to %{{.*}} {hasManagedOrUnifedSymbols, transfer_kind = #cuf.cuda_transfer<host_device>} : f16, !fir.box<!fir.array<?xf16>>
+
+module managed_mod
+ integer, managed :: marray(10)
+end module
+
+subroutine sub35()
+ use managed_mod
+ integer :: host_arr(10)
+ marray = host_arr
+ marray = 0
+end subroutine
+
+! CHECK-LABEL: func.func @_QPsub35()
+! CHECK-NOT: cuf.data_transfer
+
+! Test that host_var = managed_module_var does NOT generate cuf.data_transfer
+! (managed memory is host-accessible, so direct assignment suffices).
+subroutine sub36()
+ use managed_mod
+ integer :: host_arr(10)
+ host_arr = marray
+end subroutine
+
+! CHECK-LABEL: func.func @_QPsub36()
+! CHECK-NOT: cuf.data_transfer
+
+! Test that device_var = managed_module_var DOES generate cuf.data_transfer
+! (device memory requires explicit cudaMemcpy).
+subroutine sub37()
+ use managed_mod
+ integer, device :: dev_arr(10)
+ dev_arr = marray
+end subroutine
+
+! CHECK-LABEL: func.func @_QPsub37()
+! CHECK: cuf.data_transfer
>From 73f0df7e8fb4e1951d0ea263603a998dbc012624 Mon Sep 17 00:00:00 2001
From: Zhen Wang <zhenw at nvidia.com>
Date: Wed, 1 Apr 2026 10:04:40 -0700
Subject: [PATCH 2/2] Calling CUFInitModule conditionally only when
non-allocatable managed global is present
---
.../Transforms/CUDA/CUFAddConstructor.cpp | 20 +++++++++++--------
flang/test/Fir/CUDA/cuda-constructor-2.f90 | 4 ++--
2 files changed, 14 insertions(+), 10 deletions(-)
diff --git a/flang/lib/Optimizer/Transforms/CUDA/CUFAddConstructor.cpp b/flang/lib/Optimizer/Transforms/CUDA/CUFAddConstructor.cpp
index 9ed76745c2610..c1cb52d264afa 100644
--- a/flang/lib/Optimizer/Transforms/CUDA/CUFAddConstructor.cpp
+++ b/flang/lib/Optimizer/Transforms/CUDA/CUFAddConstructor.cpp
@@ -137,6 +137,7 @@ struct CUFAddConstructor
}
// Register variables
+ bool hasNonAllocManagedGlobal = false;
for (fir::GlobalOp globalOp : mod.getOps<fir::GlobalOp>()) {
auto attr = globalOp.getDataAttrAttr();
if (!attr)
@@ -172,6 +173,7 @@ struct CUFAddConstructor
auto sizeVal = builder.createIntegerConstant(loc, idxTy, *size);
if (isNonAllocManagedGlobal) {
+ hasNonAllocManagedGlobal = true;
// Non-allocatable managed globals use pointer indirection:
// a companion pointer in __nv_managed_data__ holds the unified
// memory address, registered via __cudaRegisterManagedVar.
@@ -201,14 +203,16 @@ struct CUFAddConstructor
}
}
- // Initialize the module after all variables are registered so the
- // runtime populates managed variable unified memory pointers.
- mlir::func::FuncOp initFunc =
- fir::runtime::getRuntimeFunc<mkRTKey(CUFInitModule)>(loc, builder);
- auto initFTy = initFunc.getFunctionType();
- llvm::SmallVector<mlir::Value> initArgs{
- fir::runtime::createArguments(builder, loc, initFTy, registeredMod)};
- fir::CallOp::create(builder, loc, initFunc, initArgs);
+ if (hasNonAllocManagedGlobal) {
+ // Initialize the module after all variables are registered so the
+ // runtime populates managed variable unified memory pointers.
+ mlir::func::FuncOp initFunc =
+ fir::runtime::getRuntimeFunc<mkRTKey(CUFInitModule)>(loc, builder);
+ mlir::FunctionType initFTy = initFunc.getFunctionType();
+ llvm::SmallVector<mlir::Value> initArgs{fir::runtime::createArguments(
+ builder, loc, initFTy, registeredMod)};
+ fir::CallOp::create(builder, loc, initFunc, initArgs);
+ }
}
mlir::LLVM::ReturnOp::create(builder, loc, mlir::ValueRange{});
diff --git a/flang/test/Fir/CUDA/cuda-constructor-2.f90 b/flang/test/Fir/CUDA/cuda-constructor-2.f90
index d61ca4849ec37..15f2f307b3a57 100644
--- a/flang/test/Fir/CUDA/cuda-constructor-2.f90
+++ b/flang/test/Fir/CUDA/cuda-constructor-2.f90
@@ -29,7 +29,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr, dense<
// CHECK-DAG: %[[BOX:.*]] = fir.address_of(@_QMmtestsEndev) : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
// CHECK-DAG: %[[BOXREF:.*]] = fir.convert %[[BOX]] : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> !fir.ref<i8>
// CHECK-DAG: fir.call @_FortranACUFRegisterVariable(%[[MODULE:.*]], %[[BOXREF]], %{{.*}}, %{{.*}})
-// CHECK: fir.call @_FortranACUFInitModule
+// CHECK-NOT: fir.call @_FortranACUFInitModule
// -----
@@ -78,7 +78,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<i8 = dense<8> : vector<2xi64>, i
// CHECK: llvm.func internal @__cudaFortranConstructor()
// CHECK: fir.address_of(@_QMmEa00)
// CHECK: fir.call @_FortranACUFRegisterVariable
-// CHECK: fir.call @_FortranACUFInitModule
+// CHECK-NOT: fir.call @_FortranACUFInitModule
// -----
More information about the llvm-commits
mailing list