[flang] [llvm] [flang][cuda] Support non-allocatable module-level managed variables (PR #189753)

Wed Apr 1 10:05:03 PDT 2026

https://github.com/wangzpgi updated https://github.com/llvm/llvm-project/pull/189753

>From 94cc58a8864c97a36d64742e17ebdcfd00c7620f Mon Sep 17 00:00:00 2001
From: Zhen Wang <zhenw at nvidia.com>
Date: Tue, 31 Mar 2026 13:57:36 -0700
Subject: [PATCH 1/2] [flang][cuda] Support non-allocatable module-level
 managed variables

Add support for non-allocatable module-level CUDA managed variables
using pointer indirection through a companion global placed in the
__nv_managed_data__ section. The CUDA runtime populates this pointer
with the unified memory address via __cudaRegisterManagedVar and
__cudaInitModule.

- Create a .managed.ptr companion global in the __nv_managed_data__
  section and register it with _FortranACUFRegisterManagedVariable
- Call __cudaInitModule once after all variables are registered to
  populate managed pointers
- Annotate managed globals in gpu.module with nvvm.managed for PTX
  .attribute(.managed) generation
- Suppress cuf.data_transfer for assignments to/from non-allocatable
  module managed variables, since cudaMemcpy would target the shadow
  address rather than the actual unified memory
- Preserve cuf.data_transfer for device_var = managed_var assignments
  where explicit transfer is still required
---
 flang-rt/lib/cuda/registration.cpp            |  3 +
 flang/include/flang/Evaluate/tools.h          | 36 ++++++--
 .../include/flang/Runtime/CUDA/registration.h |  5 ++
 flang/lib/Optimizer/CodeGen/CodeGen.cpp       |  9 ++
 .../Transforms/CUDA/CUFAddConstructor.cpp     | 84 +++++++++++++++----
 .../Transforms/CUDA/CUFOpConversionLate.cpp   | 22 ++++-
 flang/test/Fir/CUDA/cuda-code-gen.mlir        | 16 ++++
 flang/test/Fir/CUDA/cuda-constructor-2.f90    | 37 +++++++-
 flang/test/Fir/CUDA/cuda-device-address.mlir  | 39 +++++++++
 flang/test/Lower/CUDA/cuda-data-transfer.cuf  | 36 ++++++++
 10 files changed, 265 insertions(+), 22 deletions(-)

diff --git a/flang-rt/lib/cuda/registration.cpp b/flang-rt/lib/cuda/registration.cpp
index 8123220c2624c..58077d6a6a52b 100644
--- a/flang-rt/lib/cuda/registration.cpp
+++ b/flang-rt/lib/cuda/registration.cpp
@@ -27,6 +27,7 @@ extern void __cudaRegisterVar(void **fatCubinHandle, char *hostVar,
 extern void __cudaRegisterManagedVar(void **fatCubinHandle,
     void **hostVarPtrAddress, char *deviceAddress, const char *deviceName,
     int ext, size_t size, int constant, int global);
+extern char __cudaInitModule(void **fatCubinHandle);
 
 void *RTDECL(CUFRegisterModule)(void *data) {
   void **fatHandle{__cudaRegisterFatBinary(data)};
@@ -50,6 +51,8 @@ void RTDEF(CUFRegisterManagedVariable)(
   __cudaRegisterManagedVar(module, varSym, varName, varName, 0, size, 0, 0);
 }
 
+void RTDEF(CUFInitModule)(void **module) { __cudaInitModule(module); }
+
 } // extern "C"
 
 } // namespace Fortran::runtime::cuda
diff --git a/flang/include/flang/Evaluate/tools.h b/flang/include/flang/Evaluate/tools.h
index 963452755064d..51dc0582fcdea 100644
--- a/flang/include/flang/Evaluate/tools.h
+++ b/flang/include/flang/Evaluate/tools.h
@@ -1311,6 +1311,28 @@ inline bool IsCUDAManagedOrUnifiedSymbol(const Symbol &sym) {
   return false;
 }
 
+// Non-allocatable module-level managed/unified variables use pointer
+// indirection through a companion global in __nv_managed_data__.
+// Explicit data transfers (cudaMemcpy) must be avoided for these
+// variables since they would target the shadow address rather than
+// the actual unified memory address.
+inline bool IsNonAllocatableModuleCUDAManagedSymbol(const Symbol &sym) {
+  const Symbol &ultimate = sym.GetUltimate();
+  if (!IsCUDAManagedOrUnifiedSymbol(ultimate))
+    return false;
+  if (ultimate.attrs().test(semantics::Attr::ALLOCATABLE))
+    return false;
+  return ultimate.owner().IsModule();
+}
+
+template <typename A>
+inline bool HasNonAllocatableModuleCUDAManagedSymbols(const A &expr) {
+  for (const Symbol &sym : CollectCudaSymbols(expr))
+    if (IsNonAllocatableModuleCUDAManagedSymbol(sym))
+      return true;
+  return false;
+}
+
 // Get the number of distinct symbols with CUDA device
 // attribute in the expression.
 template <typename A> inline int GetNbOfCUDADeviceSymbols(const A &expr) {
@@ -1350,17 +1372,21 @@ inline bool IsCUDADataTransfer(const A &lhs, const B &rhs) {
   int rhsNbManagedSymbols{GetNbOfCUDAManagedOrUnifiedSymbols(rhs)};
   int rhsNbSymbols{GetNbOfCUDADeviceSymbols(rhs)};
 
+  if (HasNonAllocatableModuleCUDAManagedSymbols(lhs))
+    return false;
+
   if (lhsNbManagedSymbols >= 1 && lhs.Rank() > 0 && rhsNbSymbols == 0 &&
       rhsNbManagedSymbols == 0 && (IsVariable(rhs) || IsConstantExpr(rhs))) {
     return true; // Managed arrays initialization is performed on the device.
   }
 
-  // Special cases performed on the host:
-  // - Only managed or unifed symbols are involved on RHS and LHS.
-  // - LHS is managed or unified and the RHS is host only.
+  // Cases where no explicit data transfer is needed:
+  // - Both sides involve only managed/unified symbols (host-accessible).
+  // - LHS is host-only and RHS has only managed/unified symbols.
+  // - LHS is managed/unified and RHS is host-only.
   if ((lhsNbManagedSymbols >= 1 && rhsNbManagedSymbols == rhsNbSymbols) ||
-      (lhsNbManagedSymbols == 0 && rhsNbManagedSymbols >= 1 &&
-          rhsNbManagedSymbols == rhsNbSymbols) ||
+      (lhsNbManagedSymbols == 0 && !HasCUDADeviceAttrs(lhs) &&
+          rhsNbManagedSymbols >= 1 && rhsNbManagedSymbols == rhsNbSymbols) ||
       (lhsNbManagedSymbols >= 1 && rhsNbSymbols == 0)) {
     return false;
   }
diff --git a/flang/include/flang/Runtime/CUDA/registration.h b/flang/include/flang/Runtime/CUDA/registration.h
index 15f013432fa04..74dbf9e189076 100644
--- a/flang/include/flang/Runtime/CUDA/registration.h
+++ b/flang/include/flang/Runtime/CUDA/registration.h
@@ -32,6 +32,11 @@ void RTDECL(CUFRegisterVariable)(
 void RTDECL(CUFRegisterManagedVariable)(
     void **module, void **varSym, char *varName, int64_t size);
 
+/// Initialize a CUDA module after all variables have been registered.
+/// Triggers the runtime to populate managed variable pointers with
+/// unified memory addresses.
+void RTDECL(CUFInitModule)(void **module);
+
 } // extern "C"
 
 } // namespace Fortran::runtime::cuda
diff --git a/flang/lib/Optimizer/CodeGen/CodeGen.cpp b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
index 25eb6194efa99..2d01463cf604d 100644
--- a/flang/lib/Optimizer/CodeGen/CodeGen.cpp
+++ b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
@@ -3448,6 +3448,15 @@ struct GlobalOpConversion : public fir::FIROpConversion<fir::GlobalOp> {
       g.setAddrSpace(
           static_cast<unsigned>(mlir::NVVM::NVVMMemorySpace::Constant));
 
+    if (gpuMod && global.getDataAttr() &&
+        *global.getDataAttr() == cuf::DataAttribute::Managed &&
+        !mlir::isa<fir::BaseBoxType>(global.getType())) {
+      g.setAddrSpace(
+          static_cast<unsigned>(mlir::NVVM::NVVMMemorySpace::Global));
+      g->setAttr(mlir::NVVM::NVVMDialect::getManagedAttrName(),
+                 mlir::UnitAttr::get(global.getContext()));
+    }
+
     rewriter.eraseOp(global);
     return mlir::success();
   }
diff --git a/flang/lib/Optimizer/Transforms/CUDA/CUFAddConstructor.cpp b/flang/lib/Optimizer/Transforms/CUDA/CUFAddConstructor.cpp
index baa8e591ee162..9ed76745c2610 100644
--- a/flang/lib/Optimizer/Transforms/CUDA/CUFAddConstructor.cpp
+++ b/flang/lib/Optimizer/Transforms/CUDA/CUFAddConstructor.cpp
@@ -41,6 +41,40 @@ namespace {
 
 static constexpr llvm::StringRef cudaFortranCtorName{
     "__cudaFortranConstructor"};
+static constexpr llvm::StringRef managedPtrSuffix{".managed.ptr"};
+
+/// Create an 8-byte pointer global in the __nv_managed_data__ section.
+/// The CUDA runtime populates this pointer with the unified memory address
+/// when the module is initialized via __cudaInitModule.
+static fir::GlobalOp createManagedPointerGlobal(fir::FirOpBuilder &builder,
+                                                mlir::ModuleOp mod,
+                                                fir::GlobalOp globalOp) {
+  mlir::MLIRContext *ctx = mod.getContext();
+  std::string ptrGlobalName = (globalOp.getSymName() + managedPtrSuffix).str();
+  auto ptrTy = fir::LLVMPointerType::get(ctx, mlir::IntegerType::get(ctx, 8));
+
+  mlir::OpBuilder::InsertionGuard guard(builder);
+  builder.setInsertionPointAfter(globalOp);
+
+  llvm::SmallVector<mlir::NamedAttribute> attrs;
+  attrs.push_back(
+      mlir::NamedAttribute(mlir::StringAttr::get(ctx, "section"),
+                           mlir::StringAttr::get(ctx, "__nv_managed_data__")));
+
+  mlir::DenseElementsAttr initAttr = {};
+  auto ptrGlobal = fir::GlobalOp::create(
+      builder, globalOp.getLoc(), ptrGlobalName, /*isConstant=*/false,
+      /*isTarget=*/false, ptrTy, initAttr,
+      /*linkName=*/builder.createInternalLinkage(), attrs);
+
+  mlir::Region &region = ptrGlobal.getRegion();
+  mlir::Block *block = builder.createBlock(&region);
+  builder.setInsertionPointToStart(block);
+  mlir::Value zero = fir::ZeroOp::create(builder, globalOp.getLoc(), ptrTy);
+  fir::HasValueOp::create(builder, globalOp.getLoc(), zero);
+
+  return ptrGlobal;
+}
 
 struct CUFAddConstructor
     : public fir::impl::CUFAddConstructorBase<CUFAddConstructor> {
@@ -108,19 +142,15 @@ struct CUFAddConstructor
         if (!attr)
           continue;
 
-        if (attr.getValue() == cuf::DataAttribute::Managed &&
-            !mlir::isa<fir::BaseBoxType>(globalOp.getType()))
-          TODO(loc, "registration of non-allocatable managed variables");
+        bool isNonAllocManagedGlobal =
+            attr.getValue() == cuf::DataAttribute::Managed &&
+            !mlir::isa<fir::BaseBoxType>(globalOp.getType());
 
         mlir::func::FuncOp func;
         switch (attr.getValue()) {
         case cuf::DataAttribute::Device:
         case cuf::DataAttribute::Constant:
         case cuf::DataAttribute::Managed: {
-          func = fir::runtime::getRuntimeFunc<mkRTKey(CUFRegisterVariable)>(
-              loc, builder);
-          auto fTy = func.getFunctionType();
-
           // Global variable name
           std::string gblNameStr = globalOp.getSymbol().getValue().str();
           gblNameStr += '\0';
@@ -141,18 +171,44 @@ struct CUFAddConstructor
           }
           auto sizeVal = builder.createIntegerConstant(loc, idxTy, *size);
 
-          // Global variable address
-          mlir::Value addr = fir::AddrOfOp::create(
-              builder, loc, globalOp.resultType(), globalOp.getSymbol());
-
-          llvm::SmallVector<mlir::Value> args{fir::runtime::createArguments(
-              builder, loc, fTy, registeredMod, addr, gblName, sizeVal)};
-          fir::CallOp::create(builder, loc, func, args);
+          if (isNonAllocManagedGlobal) {
+            // Non-allocatable managed globals use pointer indirection:
+            // a companion pointer in __nv_managed_data__ holds the unified
+            // memory address, registered via __cudaRegisterManagedVar.
+            fir::GlobalOp ptrGlobal =
+                createManagedPointerGlobal(builder, mod, globalOp);
+            func = fir::runtime::getRuntimeFunc<mkRTKey(
+                CUFRegisterManagedVariable)>(loc, builder);
+            auto fTy = func.getFunctionType();
+            mlir::Value addr = fir::AddrOfOp::create(
+                builder, loc, ptrGlobal.resultType(), ptrGlobal.getSymbol());
+            llvm::SmallVector<mlir::Value> args{fir::runtime::createArguments(
+                builder, loc, fTy, registeredMod, addr, gblName, sizeVal)};
+            fir::CallOp::create(builder, loc, func, args);
+          } else {
+            func = fir::runtime::getRuntimeFunc<mkRTKey(CUFRegisterVariable)>(
+                loc, builder);
+            auto fTy = func.getFunctionType();
+            mlir::Value addr = fir::AddrOfOp::create(
+                builder, loc, globalOp.resultType(), globalOp.getSymbol());
+            llvm::SmallVector<mlir::Value> args{fir::runtime::createArguments(
+                builder, loc, fTy, registeredMod, addr, gblName, sizeVal)};
+            fir::CallOp::create(builder, loc, func, args);
+          }
         } break;
         default:
           break;
         }
       }
+
+      // Initialize the module after all variables are registered so the
+      // runtime populates managed variable unified memory pointers.
+      mlir::func::FuncOp initFunc =
+          fir::runtime::getRuntimeFunc<mkRTKey(CUFInitModule)>(loc, builder);
+      auto initFTy = initFunc.getFunctionType();
+      llvm::SmallVector<mlir::Value> initArgs{
+          fir::runtime::createArguments(builder, loc, initFTy, registeredMod)};
+      fir::CallOp::create(builder, loc, initFunc, initArgs);
     }
     mlir::LLVM::ReturnOp::create(builder, loc, mlir::ValueRange{});
 
diff --git a/flang/lib/Optimizer/Transforms/CUDA/CUFOpConversionLate.cpp b/flang/lib/Optimizer/Transforms/CUDA/CUFOpConversionLate.cpp
index fe459712a6ba4..62f95f5d23c34 100644
--- a/flang/lib/Optimizer/Transforms/CUDA/CUFOpConversionLate.cpp
+++ b/flang/lib/Optimizer/Transforms/CUDA/CUFOpConversionLate.cpp
@@ -13,6 +13,7 @@
 #include "flang/Optimizer/Dialect/CUF/CUFOps.h"
 #include "flang/Optimizer/Dialect/FIRDialect.h"
 #include "flang/Optimizer/Dialect/FIROps.h"
+#include "flang/Optimizer/Dialect/FIRType.h"
 #include "flang/Optimizer/Transforms/Passes.h"
 #include "flang/Runtime/CUDA/common.h"
 #include "flang/Runtime/CUDA/descriptor.h"
@@ -48,6 +49,8 @@ static mlir::Value createConvertOp(mlir::PatternRewriter &rewriter,
   return val;
 }
 
+static constexpr llvm::StringRef managedPtrSuffix{".managed.ptr"};
+
 struct CUFDeviceAddressOpConversion
     : public mlir::OpRewritePattern<cuf::DeviceAddressOp> {
   using OpRewritePattern::OpRewritePattern;
@@ -59,10 +62,25 @@ struct CUFDeviceAddressOpConversion
   mlir::LogicalResult
   matchAndRewrite(cuf::DeviceAddressOp op,
                   mlir::PatternRewriter &rewriter) const override {
-    if (auto global = symTab.lookup<fir::GlobalOp>(
-            op.getHostSymbol().getRootReference().getValue())) {
+    auto symName = op.getHostSymbol().getRootReference().getValue();
+    if (auto global = symTab.lookup<fir::GlobalOp>(symName)) {
       auto mod = op->getParentOfType<mlir::ModuleOp>();
       mlir::Location loc = op.getLoc();
+
+      // For non-allocatable managed globals, CUFAddConstructor created a
+      // companion pointer global (@sym.managed.ptr) that holds the unified
+      // memory address. Load from it instead of calling CUFGetDeviceAddress.
+      std::string ptrGlobalName = (symName + managedPtrSuffix).str();
+      if (auto ptrGlobal = symTab.lookup<fir::GlobalOp>(ptrGlobalName)) {
+        auto ptrRef = fir::AddrOfOp::create(
+            rewriter, loc, ptrGlobal.resultType(), ptrGlobal.getSymbol());
+        auto rawPtr = fir::LoadOp::create(rewriter, loc, ptrRef);
+        auto converted =
+            fir::ConvertOp::create(rewriter, loc, op.getType(), rawPtr);
+        rewriter.replaceOp(op, converted);
+        return success();
+      }
+
       auto hostAddr = fir::AddrOfOp::create(
           rewriter, loc, fir::ReferenceType::get(global.getType()),
           op.getHostSymbol());
diff --git a/flang/test/Fir/CUDA/cuda-code-gen.mlir b/flang/test/Fir/CUDA/cuda-code-gen.mlir
index e83648f21bdf1..fc962f8de5039 100644
--- a/flang/test/Fir/CUDA/cuda-code-gen.mlir
+++ b/flang/test/Fir/CUDA/cuda-code-gen.mlir
@@ -312,3 +312,19 @@ module attributes {gpu.container_module, dlti.dl_spec = #dlti.dl_spec<#dlti.dl_e
 // CHECK-LABEL:  gpu.func @_QMkernelsPassign
 // CHECK: %[[ADDROF:.*]] = llvm.mlir.addressof @_QMkernelsEinitial_val : !llvm.ptr<4>
 // CHECK: %{{.*}} = llvm.addrspacecast %[[ADDROF]] : !llvm.ptr<4> to !llvm.ptr
+
+// -----
+
+// Test that non-allocatable managed globals inside gpu.module get
+// addr_space = 1 (Global) and the nvvm.managed annotation.
+
+module attributes {gpu.container_module, dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>} {
+  gpu.module @cuda_device_mod {
+    fir.global @_QMtestEmanx {data_attr = #cuf.cuda<managed>} : !fir.array<100xi32> {
+      %0 = fir.zero_bits !fir.array<100xi32>
+      fir.has_value %0 : !fir.array<100xi32>
+    }
+  }
+}
+
+// CHECK: llvm.mlir.global external @_QMtestEmanx() {addr_space = 1 : i32, nvvm.managed} : !llvm.array<100 x i32>
diff --git a/flang/test/Fir/CUDA/cuda-constructor-2.f90 b/flang/test/Fir/CUDA/cuda-constructor-2.f90
index f21d8f9c37637..d61ca4849ec37 100644
--- a/flang/test/Fir/CUDA/cuda-constructor-2.f90
+++ b/flang/test/Fir/CUDA/cuda-constructor-2.f90
@@ -29,7 +29,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr, dense<
 // CHECK-DAG: %[[BOX:.*]] = fir.address_of(@_QMmtestsEndev) : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
 // CHECK-DAG: %[[BOXREF:.*]] = fir.convert %[[BOX]] : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> !fir.ref<i8>
 // CHECK-DAG: fir.call @_FortranACUFRegisterVariable(%[[MODULE:.*]], %[[BOXREF]], %{{.*}}, %{{.*}})
-//
+// CHECK: fir.call @_FortranACUFInitModule
 
 // -----
 
@@ -78,3 +78,38 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<i8 = dense<8> : vector<2xi64>, i
 // CHECK: llvm.func internal @__cudaFortranConstructor()
 // CHECK: fir.address_of(@_QMmEa00)
 // CHECK: fir.call @_FortranACUFRegisterVariable
+// CHECK: fir.call @_FortranACUFInitModule
+
+// -----
+
+// Non-allocatable managed global: should create pointer global in
+// __nv_managed_data__ and register with CUFRegisterManagedVariable.
+//
+// Fortran source:
+//   module test
+//     integer*4, managed :: manx(100)
+//   contains
+//     attributes(global) subroutine kernel()
+//     end subroutine
+//   end module
+
+module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f32, dense<32> : vector<2xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-linux-gnu"} {
+
+  fir.global @_QMtestEmanx {data_attr = #cuf.cuda<managed>} : !fir.array<100xi32> {
+    %0 = fir.zero_bits !fir.array<100xi32>
+    fir.has_value %0 : !fir.array<100xi32>
+  }
+
+  gpu.module @cuda_device_mod {
+  }
+}
+
+// Pointer global should be created with section attribute.
+// CHECK: fir.global internal @_QMtestEmanx.managed.ptr {section = "__nv_managed_data__"} : !fir.llvm_ptr<i8>
+// CHECK:   fir.zero_bits !fir.llvm_ptr<i8>
+
+// Constructor should register with CUFRegisterManagedVariable then init module.
+// CHECK: llvm.func internal @__cudaFortranConstructor()
+// CHECK: fir.address_of(@_QMtestEmanx.managed.ptr) : !fir.ref<!fir.llvm_ptr<i8>>
+// CHECK: fir.call @_FortranACUFRegisterManagedVariable
+// CHECK: fir.call @_FortranACUFInitModule
diff --git a/flang/test/Fir/CUDA/cuda-device-address.mlir b/flang/test/Fir/CUDA/cuda-device-address.mlir
index e86208321b8ab..a2dae71557869 100644
--- a/flang/test/Fir/CUDA/cuda-device-address.mlir
+++ b/flang/test/Fir/CUDA/cuda-device-address.mlir
@@ -12,3 +12,42 @@ func.func @_QPxa(%arg0: !fir.ref<!fir.array<?xi32>> {cuf.data_attr = #cuf.cuda<d
 
 // CHECK-LABEL: func.func @_QPxa
 // CHECK: fir.call @_FortranACUFGetDeviceAddress
+
+// -----
+
+// Non-allocatable managed global with companion pointer global:
+// cuf.device_address should load from the pointer global instead of
+// calling CUFGetDeviceAddress.
+//
+// Fortran source:
+//   module test
+//     integer*4, managed :: manx(100)
+//   end module
+//   subroutine user()
+//     use test
+//     manx(1) = 42
+//   end subroutine
+
+fir.global @_QMtestEmanx {data_attr = #cuf.cuda<managed>} : !fir.array<100xi32> {
+  %0 = fir.zero_bits !fir.array<100xi32>
+  fir.has_value %0 : !fir.array<100xi32>
+}
+
+fir.global internal @_QMtestEmanx.managed.ptr {section = "__nv_managed_data__"} : !fir.llvm_ptr<i8> {
+  %0 = fir.zero_bits !fir.llvm_ptr<i8>
+  fir.has_value %0 : !fir.llvm_ptr<i8>
+}
+
+func.func @_QPuser() {
+  %c100 = arith.constant 100 : index
+  %0 = cuf.device_address @_QMtestEmanx -> !fir.ref<!fir.array<100xi32>>
+  %1 = fir.shape %c100 : (index) -> !fir.shape<1>
+  %2 = fir.declare %0(%1) {uniq_name = "_QMtestEmanx"} : (!fir.ref<!fir.array<100xi32>>, !fir.shape<1>) -> !fir.ref<!fir.array<100xi32>>
+  return
+}
+
+// CHECK-LABEL: func.func @_QPuser
+// CHECK-NOT: fir.call @_FortranACUFGetDeviceAddress
+// CHECK: %[[PTR_REF:.*]] = fir.address_of(@_QMtestEmanx.managed.ptr) : !fir.ref<!fir.llvm_ptr<i8>>
+// CHECK: %[[RAW_PTR:.*]] = fir.load %[[PTR_REF]] : !fir.ref<!fir.llvm_ptr<i8>>
+// CHECK: %[[ADDR:.*]] = fir.convert %[[RAW_PTR]] : (!fir.llvm_ptr<i8>) -> !fir.ref<!fir.array<100xi32>>
diff --git a/flang/test/Lower/CUDA/cuda-data-transfer.cuf b/flang/test/Lower/CUDA/cuda-data-transfer.cuf
index 66c3a28f9aec4..1d0e510c110ee 100644
--- a/flang/test/Lower/CUDA/cuda-data-transfer.cuf
+++ b/flang/test/Lower/CUDA/cuda-data-transfer.cuf
@@ -637,3 +637,39 @@ end subroutine
 
 ! CHECK-LABEL: func.func @_QPsub34
 ! CHECK:  cuf.data_transfer %{{.*}} to %{{.*}} {hasManagedOrUnifedSymbols, transfer_kind = #cuf.cuda_transfer<host_device>} : f16, !fir.box<!fir.array<?xf16>>
+
+module managed_mod
+  integer, managed :: marray(10)
+end module
+
+subroutine sub35()
+  use managed_mod
+  integer :: host_arr(10)
+  marray = host_arr
+  marray = 0
+end subroutine
+
+! CHECK-LABEL: func.func @_QPsub35()
+! CHECK-NOT: cuf.data_transfer
+
+! Test that host_var = managed_module_var does NOT generate cuf.data_transfer
+! (managed memory is host-accessible, so direct assignment suffices).
+subroutine sub36()
+  use managed_mod
+  integer :: host_arr(10)
+  host_arr = marray
+end subroutine
+
+! CHECK-LABEL: func.func @_QPsub36()
+! CHECK-NOT: cuf.data_transfer
+
+! Test that device_var = managed_module_var DOES generate cuf.data_transfer
+! (device memory requires explicit cudaMemcpy).
+subroutine sub37()
+  use managed_mod
+  integer, device :: dev_arr(10)
+  dev_arr = marray
+end subroutine
+
+! CHECK-LABEL: func.func @_QPsub37()
+! CHECK: cuf.data_transfer

>From 73f0df7e8fb4e1951d0ea263603a998dbc012624 Mon Sep 17 00:00:00 2001
From: Zhen Wang <zhenw at nvidia.com>
Date: Wed, 1 Apr 2026 10:04:40 -0700
Subject: [PATCH 2/2] Call CUFInitModule only when a non-allocatable
 managed global is present

---
 .../Transforms/CUDA/CUFAddConstructor.cpp     | 20 +++++++++++--------
 flang/test/Fir/CUDA/cuda-constructor-2.f90    |  4 ++--
 2 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/flang/lib/Optimizer/Transforms/CUDA/CUFAddConstructor.cpp b/flang/lib/Optimizer/Transforms/CUDA/CUFAddConstructor.cpp
index 9ed76745c2610..c1cb52d264afa 100644
--- a/flang/lib/Optimizer/Transforms/CUDA/CUFAddConstructor.cpp
+++ b/flang/lib/Optimizer/Transforms/CUDA/CUFAddConstructor.cpp
@@ -137,6 +137,7 @@ struct CUFAddConstructor
       }
 
       // Register variables
+      bool hasNonAllocManagedGlobal = false;
       for (fir::GlobalOp globalOp : mod.getOps<fir::GlobalOp>()) {
         auto attr = globalOp.getDataAttrAttr();
         if (!attr)
@@ -172,6 +173,7 @@ struct CUFAddConstructor
           auto sizeVal = builder.createIntegerConstant(loc, idxTy, *size);
 
           if (isNonAllocManagedGlobal) {
+            hasNonAllocManagedGlobal = true;
             // Non-allocatable managed globals use pointer indirection:
             // a companion pointer in __nv_managed_data__ holds the unified
             // memory address, registered via __cudaRegisterManagedVar.
@@ -201,14 +203,16 @@ struct CUFAddConstructor
         }
       }
 
-      // Initialize the module after all variables are registered so the
-      // runtime populates managed variable unified memory pointers.
-      mlir::func::FuncOp initFunc =
-          fir::runtime::getRuntimeFunc<mkRTKey(CUFInitModule)>(loc, builder);
-      auto initFTy = initFunc.getFunctionType();
-      llvm::SmallVector<mlir::Value> initArgs{
-          fir::runtime::createArguments(builder, loc, initFTy, registeredMod)};
-      fir::CallOp::create(builder, loc, initFunc, initArgs);
+      if (hasNonAllocManagedGlobal) {
+        // Initialize the module after all variables are registered so the
+        // runtime populates managed variable unified memory pointers.
+        mlir::func::FuncOp initFunc =
+            fir::runtime::getRuntimeFunc<mkRTKey(CUFInitModule)>(loc, builder);
+        mlir::FunctionType initFTy = initFunc.getFunctionType();
+        llvm::SmallVector<mlir::Value> initArgs{fir::runtime::createArguments(
+            builder, loc, initFTy, registeredMod)};
+        fir::CallOp::create(builder, loc, initFunc, initArgs);
+      }
     }
     mlir::LLVM::ReturnOp::create(builder, loc, mlir::ValueRange{});
 
diff --git a/flang/test/Fir/CUDA/cuda-constructor-2.f90 b/flang/test/Fir/CUDA/cuda-constructor-2.f90
index d61ca4849ec37..15f2f307b3a57 100644
--- a/flang/test/Fir/CUDA/cuda-constructor-2.f90
+++ b/flang/test/Fir/CUDA/cuda-constructor-2.f90
@@ -29,7 +29,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr, dense<
 // CHECK-DAG: %[[BOX:.*]] = fir.address_of(@_QMmtestsEndev) : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
 // CHECK-DAG: %[[BOXREF:.*]] = fir.convert %[[BOX]] : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> !fir.ref<i8>
 // CHECK-DAG: fir.call @_FortranACUFRegisterVariable(%[[MODULE:.*]], %[[BOXREF]], %{{.*}}, %{{.*}})
-// CHECK: fir.call @_FortranACUFInitModule
+// CHECK-NOT: fir.call @_FortranACUFInitModule
 
 // -----
 
@@ -78,7 +78,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<i8 = dense<8> : vector<2xi64>, i
 // CHECK: llvm.func internal @__cudaFortranConstructor()
 // CHECK: fir.address_of(@_QMmEa00)
 // CHECK: fir.call @_FortranACUFRegisterVariable
-// CHECK: fir.call @_FortranACUFInitModule
+// CHECK-NOT: fir.call @_FortranACUFInitModule
 
 // -----