[flang-commits] [flang] [llvm] [flang][CUDA] Support module-scope variables in `global` kernels under `-gpu=mem:unified` (PR #196228)

Wed May 6 20:35:30 PDT 2026

https://github.com/wangzpgi created https://github.com/llvm/llvm-project/pull/196228

Under `-gpu=mem:unified`, plain Fortran module-scope variables referenced directly from a `global` kernel previously produced wrong results. This adds a `cuda-unified` option to two CUF passes, plus a new runtime entry point that they rely on:

- CUFDeviceGlobal: when set, plain (un-attributed, non-constant) module globals are mirrored into the GPU module as no-body declarations, so PTX emits `.extern .global ...`.
- CUFAddConstructor: when set, emits a CUFRegisterExternalVariable call for each such global from `__cudaFortranConstructor`.
- New runtime entry `CUFRegisterExternalVariable` wraps `__cudaRegisterHostVar` so the CUDA driver maps the device extern to the host pointer at module-load time. HMM/ATS handles migration from there.

From 70a34253de49dd6827b8d3238c7bb9b70420e00a Mon Sep 17 00:00:00 2001
From: Zhen Wang <zhenw at nvidia.com>
Date: Wed, 6 May 2026 20:25:47 -0700
Subject: [PATCH 1/2] Support module-scope variables in global kernels under
 -gpu=mem:unified

---
 flang-rt/lib/cuda/registration.cpp            | 15 ++++
 .../flang/Optimizer/Transforms/Passes.td      | 15 +++-
 .../include/flang/Runtime/CUDA/registration.h |  7 ++
 .../Transforms/CUDA/CUFAddConstructor.cpp     | 73 ++++++++++++++++++-
 .../Transforms/CUDA/CUFDeviceGlobal.cpp       | 20 ++++-
 flang/test/Fir/CUDA/cuda-constructor-2.f90    | 56 ++++++++++++++
 flang/test/Fir/CUDA/cuda-device-global.f90    | 40 ++++++++++
 .../Fir/CUDA/cuda-unified-module-global.f90   | 50 +++++++++++++
 8 files changed, 271 insertions(+), 5 deletions(-)
 create mode 100644 flang/test/Fir/CUDA/cuda-unified-module-global.f90

diff --git a/flang-rt/lib/cuda/registration.cpp b/flang-rt/lib/cuda/registration.cpp
index 58077d6a6a52b..1cfab3d2b20bd 100644
--- a/flang-rt/lib/cuda/registration.cpp
+++ b/flang-rt/lib/cuda/registration.cpp
@@ -27,6 +27,12 @@ extern void __cudaRegisterVar(void **fatCubinHandle, char *hostVar,
 extern void __cudaRegisterManagedVar(void **fatCubinHandle,
     void **hostVarPtrAddress, char *deviceAddress, const char *deviceName,
     int ext, size_t size, int constant, int global);
+// __cudaRegisterHostVar registers a host-resident variable so that a
+// device-side reference of the same name resolves to the host pointer at
+// module-load time. Used for -gpu=mem:unified so kernel accesses go through
+// the host storage and HMM/ATS handles migration. Exported from libcudart.
+extern void __cudaRegisterHostVar(void **fatCubinHandle, const char *deviceName,
+    char *hostVar, size_t size);
 extern char __cudaInitModule(void **fatCubinHandle);
 
 void *RTDECL(CUFRegisterModule)(void *data) {
@@ -46,6 +52,15 @@ void RTDEF(CUFRegisterVariable)(
   __cudaRegisterVar(module, varSym, varName, varName, 0, size, 0, 0);
 }
 
+void RTDEF(CUFRegisterExternalVariable)(
+    void **module, char *varSym, const char *varName, int64_t size) {
+  // Tell the CUDA driver to bind the device-side global <varName> to the
+  // host-resident storage at <varSym>. Kernel accesses to <varName> then go
+  // through the host address; HMM/ATS handles migration. Mirrors classic
+  // nvfortran's walk_cuda_register() under DATA_UNIFIED.
+  __cudaRegisterHostVar(module, varName, varSym, size);
+}
+
 void RTDEF(CUFRegisterManagedVariable)(
     void **module, void **varSym, char *varName, int64_t size) {
   __cudaRegisterManagedVar(module, varSym, varName, varName, 0, size, 0, 0);
diff --git a/flang/include/flang/Optimizer/Transforms/Passes.td b/flang/include/flang/Optimizer/Transforms/Passes.td
index e107672adf907..d9072e7aab4f7 100644
--- a/flang/include/flang/Optimizer/Transforms/Passes.td
+++ b/flang/include/flang/Optimizer/Transforms/Passes.td
@@ -517,7 +517,12 @@ def CUFDeviceGlobal :
   let options = [
     Option<"skipDeadDeclares", "skip-dead-declares", "bool",
            /*default=*/"true",
-           "Skip globals whose only use is a dead fir.declare">
+           "Skip globals whose only use is a dead fir.declare">,
+    Option<"cudaUnified", "cuda-unified", "bool", /*default=*/"false",
+           "Treat host module globals as unified memory (-gpu=mem:unified): "
+           "clone them into the GPU module as external declarations so PTX "
+           "emits `.extern .global ...`. The CUDA runtime maps the device "
+           "extern to the host pointer via __cudaRegisterHostVar.">
   ];
 }
 
@@ -526,6 +531,14 @@ def CUFAddConstructor : Pass<"cuf-add-constructor", "mlir::ModuleOp"> {
   let dependentDialects = [
     "cuf::CUFDialect", "mlir::func::FuncDialect", "mlir::DLTIDialect"
   ];
+  let options = [
+    Option<"cudaUnified", "cuda-unified", "bool", /*default=*/"false",
+           "Treat host module globals as unified memory (-gpu=mem:unified): "
+           "register every plain host module global that is mirrored in the "
+           "GPU module via CUFRegisterExternalVariable, so the CUDA driver "
+           "maps the device-side symbol to the host pointer at module-load "
+           "time and HMM/ATS handles migration.">
+  ];
 }
 
 def CUFGPUToLLVMConversion : Pass<"cuf-gpu-convert-to-llvm", "mlir::ModuleOp"> {
diff --git a/flang/include/flang/Runtime/CUDA/registration.h b/flang/include/flang/Runtime/CUDA/registration.h
index 74dbf9e189076..48a74dc1d6d03 100644
--- a/flang/include/flang/Runtime/CUDA/registration.h
+++ b/flang/include/flang/Runtime/CUDA/registration.h
@@ -28,6 +28,13 @@ void RTDECL(CUFRegisterFunction)(
 void RTDECL(CUFRegisterVariable)(
     void **module, char *varSym, const char *varName, int64_t size);
 
+/// Register a module-scope variable as host-resident under -gpu=mem:unified,
+/// so that the device-side symbol of the same name is mapped to the host
+/// pointer at module-load time. Wraps __cudaRegisterHostVar. Kernel accesses
+/// to the variable then reach the host storage directly via HMM/ATS.
+void RTDECL(CUFRegisterExternalVariable)(
+    void **module, char *varSym, const char *varName, int64_t size);
+
 /// Register a managed variable.
 void RTDECL(CUFRegisterManagedVariable)(
     void **module, void **varSym, char *varName, int64_t size);
diff --git a/flang/lib/Optimizer/Transforms/CUDA/CUFAddConstructor.cpp b/flang/lib/Optimizer/Transforms/CUDA/CUFAddConstructor.cpp
index 248905fdc70d5..6c97346b89d60 100644
--- a/flang/lib/Optimizer/Transforms/CUDA/CUFAddConstructor.cpp
+++ b/flang/lib/Optimizer/Transforms/CUDA/CUFAddConstructor.cpp
@@ -20,6 +20,7 @@
 #include "flang/Optimizer/Dialect/FIROpsSupport.h"
 #include "flang/Optimizer/Dialect/FIRType.h"
 #include "flang/Optimizer/Support/DataLayout.h"
+#include "flang/Optimizer/Transforms/Passes.h"
 #include "flang/Runtime/CUDA/registration.h"
 #include "flang/Runtime/entry-names.h"
 #include "mlir/Dialect/DLTI/DLTI.h"
@@ -76,12 +77,34 @@ static fir::GlobalOp createManagedPointerGlobal(fir::FirOpBuilder &builder,
   return ptrGlobal;
 }
 
+/// Return true if \p hostGlobal is a host module-scope global that has been
+/// mirrored in the GPU module as an external (no-body) declaration by the
+/// CUFDeviceGlobal pass under -gpu=mem:unified. Such globals must be
+/// registered with the CUDA driver via CUFRegisterExternalVariable so the
+/// device-side `.extern` symbol resolves to the host pointer at module-load
+/// time and HMM/ATS handles migration.
+static bool isCudaUnifiedExternalGlobal(fir::GlobalOp hostGlobal,
+                                        mlir::SymbolTable &gpuSymTable) {
+  if (hostGlobal.getDataAttrAttr())
+    return false;
+  if (hostGlobal.getConstant())
+    return false;
+  auto gpuGlobal = gpuSymTable.lookup<fir::GlobalOp>(hostGlobal.getSymName());
+  if (!gpuGlobal)
+    return false;
+  return !gpuGlobal.isInitialized();
+}
+
 static bool hasRegisteredGlobals(mlir::ModuleOp mod,
-                                 mlir::SymbolTable gpuSymTable) {
+                                 mlir::SymbolTable gpuSymTable,
+                                 bool cudaUnified) {
   for (fir::GlobalOp globalOp : mod.getOps<fir::GlobalOp>()) {
     auto attr = globalOp.getDataAttrAttr();
-    if (!attr)
+    if (!attr) {
+      if (cudaUnified && isCudaUnifiedExternalGlobal(globalOp, gpuSymTable))
+        return true;
       continue;
+    }
     if (!gpuSymTable.lookup(globalOp.getSymName()))
       continue;
     if (attr.getValue() == cuf::DataAttribute::Managed &&
@@ -110,6 +133,8 @@ static bool hasKernel(mlir::gpu::GPUModuleOp gpuMod) {
 struct CUFAddConstructor
     : public fir::impl::CUFAddConstructorBase<CUFAddConstructor> {
 
+  using CUFAddConstructorBase::CUFAddConstructorBase;
+
   void runOnOperation() override {
     mlir::ModuleOp mod = getOperation();
     mlir::SymbolTable symTab(mod);
@@ -151,7 +176,8 @@ struct CUFAddConstructor
     if (gpuMod) {
       mlir::SymbolTable gpuSymTable(gpuMod);
       bool needsModuleRegistration =
-          hasKernel(gpuMod) || hasRegisteredGlobals(mod, gpuSymTable);
+          hasKernel(gpuMod) ||
+          hasRegisteredGlobals(mod, gpuSymTable, cudaUnified);
       if (needsModuleRegistration) {
         auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(ctx);
         auto registeredMod = cuf::RegisterModuleOp::create(
@@ -241,6 +267,47 @@ struct CUFAddConstructor
           }
         }
 
+        // Register externally-linked module globals under -gpu=mem:unified.
+        // CUFDeviceGlobal cloned them into the GPU module with external
+        // linkage so PTX emits .extern; the CUDA driver patches the device
+        // reference to the host pointer at module-load time after this call.
+        // Works uniformly for fixed-shape (e.g. fir.array<5xi32>) and
+        // allocatable (fir.box<fir.heap<...>>) module globals -- the size
+        // computation is the same as the managed path above.
+        if (cudaUnified) {
+          for (fir::GlobalOp globalOp : mod.getOps<fir::GlobalOp>()) {
+            if (!isCudaUnifiedExternalGlobal(globalOp, gpuSymTable))
+              continue;
+
+            std::string gblNameStr = globalOp.getSymbol().getValue().str();
+            gblNameStr += '\0';
+            mlir::Value gblName = fir::getBase(
+                fir::factory::createStringLiteral(builder, loc, gblNameStr));
+
+            std::optional<uint64_t> size;
+            if (auto boxTy =
+                    mlir::dyn_cast<fir::BaseBoxType>(globalOp.getType())) {
+              mlir::Type structTy = typeConverter.convertBoxTypeAsStruct(boxTy);
+              size = dl->getTypeSizeInBits(structTy) / 8;
+            }
+            if (!size) {
+              size = fir::getTypeSizeAndAlignmentOrCrash(
+                         loc, globalOp.getType(), *dl, kindMap)
+                         .first;
+            }
+            auto sizeVal = builder.createIntegerConstant(loc, idxTy, *size);
+
+            mlir::func::FuncOp func = fir::runtime::getRuntimeFunc<mkRTKey(
+                CUFRegisterExternalVariable)>(loc, builder);
+            auto fTy = func.getFunctionType();
+            mlir::Value addr = fir::AddrOfOp::create(
+                builder, loc, globalOp.resultType(), globalOp.getSymbol());
+            llvm::SmallVector<mlir::Value> args{fir::runtime::createArguments(
+                builder, loc, fTy, registeredMod, addr, gblName, sizeVal)};
+            fir::CallOp::create(builder, loc, func, args);
+          }
+        }
+
         if (hasNonAllocManagedGlobal) {
           // Initialize the module after all variables are registered so the
           // runtime populates managed variable unified memory pointers.
diff --git a/flang/lib/Optimizer/Transforms/CUDA/CUFDeviceGlobal.cpp b/flang/lib/Optimizer/Transforms/CUDA/CUFDeviceGlobal.cpp
index 9970197627f29..326051327f86a 100644
--- a/flang/lib/Optimizer/Transforms/CUDA/CUFDeviceGlobal.cpp
+++ b/flang/lib/Optimizer/Transforms/CUDA/CUFDeviceGlobal.cpp
@@ -165,7 +165,25 @@ class CUFDeviceGlobal : public fir::impl::CUFDeviceGlobalBase<CUFDeviceGlobal> {
       if (gpuSymTable.lookup<fir::GlobalOp>(globalName)) {
         continue;
       }
-      gpuSymTable.insert(globalOp->clone());
+      auto *cloned = globalOp->clone();
+      // Under -gpu=mem:unified, plain host module-scope variables (no
+      // explicit CUF data attribute, not a constant) get a no-body
+      // declaration in the GPU module: clear the body, init value, and
+      // linkName. With no linkName, the LLVM lowering uses the default
+      // External linkage (see convertLinkage in CodeGen.cpp), so an
+      // initializer-less global emits as `.extern .global ...` in PTX.
+      // The host-side definition stays. CUFAddConstructor will emit
+      // CUFRegisterExternalVariable (= __cudaRegisterHostVar) so the CUDA
+      // runtime maps the device extern to the host pointer at module-load
+      // time, and HMM/ATS handles migration.
+      if (cudaUnified && !globalOp.getConstant() &&
+          !globalOp.getDataAttrAttr()) {
+        auto clonedGlobal = mlir::cast<fir::GlobalOp>(cloned);
+        clonedGlobal.getRegion().getBlocks().clear();
+        clonedGlobal.removeInitValAttr();
+        clonedGlobal.removeLinkNameAttr();
+      }
+      gpuSymTable.insert(cloned);
     }
   }
 };
diff --git a/flang/test/Fir/CUDA/cuda-constructor-2.f90 b/flang/test/Fir/CUDA/cuda-constructor-2.f90
index 452b89bea6b80..bb2a98c294558 100644
--- a/flang/test/Fir/CUDA/cuda-constructor-2.f90
+++ b/flang/test/Fir/CUDA/cuda-constructor-2.f90
@@ -1,4 +1,5 @@
 // RUN: fir-opt --split-input-file --cuf-add-constructor %s | FileCheck %s
+// RUN: fir-opt --split-input-file --cuf-add-constructor="cuda-unified=true" %s | FileCheck %s --check-prefixes=CHECK,UNIFIED
 
 module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.ident = "flang version 20.0.0 (https://github.com/llvm/llvm-project.git cae351f3453a0a26ec8eb2ddaf773c24a29d929e)", llvm.target_triple = "x86_64-unknown-linux-gnu"} {
 
@@ -201,3 +202,58 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr, dense<
 // CHECK: cuf.register_module @cuda_device_mod -> !llvm.ptr 
 // CHECK: fir.address_of(@_QMkernels_mEdev_var) : !fir.ref<f32> 
 // CHECK: fir.call @_FortranACUFRegisterVariable(%3, %4, %5, %6) : (!fir.ref<!fir.llvm_ptr<i8>>, !fir.ref<i8>, !fir.ref<i8>, i64) -> () 
+
+// -----
+
+// Under -gpu=mem:unified, a non-allocatable host module global referenced from
+// device code (mirrored as an external-linkage clone in the GPU module by the
+// CUFDeviceGlobal pass) must be registered with
+// _FortranACUFRegisterExternalVariable so the device-side `.extern` symbol
+// resolves to the host pointer at module-load time.
+
+module attributes {dlti.dl_spec = #dlti.dl_spec<i8 = dense<8> : vector<2xi64>, i16 = dense<16> : vector<2xi64>, i1 = dense<8> : vector<2xi64>, !llvm.ptr = dense<64> : vector<4xi64>, f80 = dense<128> : vector<2xi64>, i128 = dense<128> : vector<2xi64>, i64 = dense<64> : vector<2xi64>, !llvm.ptr<271> = dense<32> : vector<4xi64>, !llvm.ptr<272> = dense<64> : vector<4xi64>, f128 = dense<128> : vector<2xi64>, !llvm.ptr<270> = dense<32> : vector<4xi64>, f16 = dense<16> : vector<2xi64>, f64 = dense<64> : vector<2xi64>, i32 = dense<32> : vector<2xi64>, "dlti.stack_alignment" = 128 : i64, "dlti.endianness" = "little">, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-linux-gnu"} {
+  fir.global @_QMmtestsEm(dense<[1, 2, 3, 4, 5]> : tensor<5xi32>) : !fir.array<5xi32>
+  gpu.module @cuda_device_mod {
+    gpu.func @_QMmtestsPg1() kernel {
+      gpu.return
+    }
+    // External declaration: no body, no init value -- lowers to PTX
+    // `.extern .global ...`. CUFAddConstructor must register it so the
+    // CUDA runtime maps it to the host pointer at module-load time.
+    fir.global @_QMmtestsEm : !fir.array<5xi32>
+  }
+}
+
+// UNIFIED: llvm.func internal @__cudaFortranConstructor()
+// UNIFIED: cuf.register_module @cuda_device_mod -> !llvm.ptr
+// UNIFIED: fir.address_of(@_QMmtestsEm) : !fir.ref<!fir.array<5xi32>>
+// UNIFIED: fir.call @_FortranACUFRegisterExternalVariable
+// UNIFIED-NOT: fir.call @_FortranACUFInitModule
+
+// -----
+
+// Under -gpu=mem:unified, an allocatable host module global also gets
+// registered via _FortranACUFRegisterExternalVariable; the registered symbol
+// is the descriptor (fir.box<fir.heap<...>>). The host runtime allocates the
+// data buffer in HMM/ATS-accessible memory.
+
+module attributes {dlti.dl_spec = #dlti.dl_spec<i8 = dense<8> : vector<2xi64>, i16 = dense<16> : vector<2xi64>, i1 = dense<8> : vector<2xi64>, !llvm.ptr = dense<64> : vector<4xi64>, f80 = dense<128> : vector<2xi64>, i128 = dense<128> : vector<2xi64>, i64 = dense<64> : vector<2xi64>, !llvm.ptr<271> = dense<32> : vector<4xi64>, !llvm.ptr<272> = dense<64> : vector<4xi64>, f128 = dense<128> : vector<2xi64>, !llvm.ptr<270> = dense<32> : vector<4xi64>, f16 = dense<16> : vector<2xi64>, f64 = dense<64> : vector<2xi64>, i32 = dense<32> : vector<2xi64>, "dlti.stack_alignment" = 128 : i64, "dlti.endianness" = "little">, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-linux-gnu"} {
+  fir.global @_QMmtestsEma : !fir.box<!fir.heap<!fir.array<?xi32>>> {
+    %c0 = arith.constant 0 : index
+    %0 = fir.zero_bits !fir.heap<!fir.array<?xi32>>
+    %1 = fircg.ext_embox %0(%c0) {allocator_idx = 2 : i32} : (!fir.heap<!fir.array<?xi32>>, index) -> !fir.box<!fir.heap<!fir.array<?xi32>>>
+    fir.has_value %1 : !fir.box<!fir.heap<!fir.array<?xi32>>>
+  }
+  gpu.module @cuda_device_mod {
+    gpu.func @_QMmtestsPg1() kernel {
+      gpu.return
+    }
+    fir.global @_QMmtestsEma : !fir.box<!fir.heap<!fir.array<?xi32>>>
+  }
+}
+
+// UNIFIED: llvm.func internal @__cudaFortranConstructor()
+// UNIFIED: cuf.register_module @cuda_device_mod -> !llvm.ptr
+// UNIFIED: fir.address_of(@_QMmtestsEma) : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
+// UNIFIED: fir.call @_FortranACUFRegisterExternalVariable
+// UNIFIED-NOT: fir.call @_FortranACUFInitModule
diff --git a/flang/test/Fir/CUDA/cuda-device-global.f90 b/flang/test/Fir/CUDA/cuda-device-global.f90
index a35ab6d2ebc95..4f4dd222d9eae 100644
--- a/flang/test/Fir/CUDA/cuda-device-global.f90
+++ b/flang/test/Fir/CUDA/cuda-device-global.f90
@@ -1,5 +1,6 @@
 
 // RUN: fir-opt --split-input-file --cuf-device-global %s | FileCheck %s
+// RUN: fir-opt --split-input-file --cuf-device-global="cuda-unified=true" %s | FileCheck %s --check-prefix=UNIFIED
 
 
 module attributes {fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module} {
@@ -84,3 +85,42 @@ module attributes {fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.conta
 // CHECK-DAG: fir.global @_QMmEa
 // CHECK-DAG: fir.global @_QMmEb
 // CHECK-DAG: fir.global @_QMmEc
+
+// -----
+
+// Under -gpu=mem:unified (cuda-unified=true), plain host module-scope
+// variables referenced from device code are mirrored as no-body external
+// declarations in the GPU module. PTX lowers them as `.extern .global ...`.
+// CUFAddConstructor + the runtime then map the device-side extern to the
+// host pointer via __cudaRegisterHostVar.
+
+module attributes {fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module} {
+  fir.global @_QMmtestsEm(dense<[1, 2, 3, 4, 5]> : tensor<5xi32>) : !fir.array<5xi32>
+  func.func @_QMmtestsPg1() attributes {cuf.proc_attr = #cuf.cuda_proc<global>} {
+    %0 = fir.address_of(@_QMmtestsEm) : !fir.ref<!fir.array<5xi32>>
+    return
+  }
+}
+
+// Host-side definition is preserved.
+// UNIFIED: fir.global @_QMmtestsEm(dense<[1, 2, 3, 4, 5]> : tensor<5xi32>) : !fir.array<5xi32>
+// GPU-module clone is an external declaration (no init body, no `dense<...>`).
+// UNIFIED: gpu.module @cuda_device_mod
+// UNIFIED: fir.global @_QMmtestsEm : !fir.array<5xi32>
+// UNIFIED-NOT: fir.global @_QMmtestsEm{{.*}}dense
+
+// -----
+
+// Globals with an explicit CUF data attribute (device, managed, constant)
+// keep their existing definition-clone path even with cuda-unified=true.
+
+module attributes {fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module} {
+  fir.global @_QMmtestsEdev(dense<[1, 2, 3]> : tensor<3xi32>) {data_attr = #cuf.cuda<device>} : !fir.array<3xi32>
+  func.func @_QMmtestsPg1() attributes {cuf.proc_attr = #cuf.cuda_proc<global>} {
+    %0 = fir.address_of(@_QMmtestsEdev) : !fir.ref<!fir.array<3xi32>>
+    return
+  }
+}
+
+// UNIFIED: gpu.module @cuda_device_mod
+// UNIFIED: fir.global @_QMmtestsEdev(dense<[1, 2, 3]> : tensor<3xi32>) {data_attr = #cuf.cuda<device>} : !fir.array<3xi32>
diff --git a/flang/test/Fir/CUDA/cuda-unified-module-global.f90 b/flang/test/Fir/CUDA/cuda-unified-module-global.f90
new file mode 100644
index 0000000000000..360401ac6c523
--- /dev/null
+++ b/flang/test/Fir/CUDA/cuda-unified-module-global.f90
@@ -0,0 +1,50 @@
+// End-to-end check that under -gpu=mem:unified, a plain host module-scope
+// variable referenced from a global kernel (issue 2573):
+//   1. is mirrored into the GPU module by CUFDeviceGlobal so nvlink can
+//      resolve the device-side reference; and
+//   2. is registered with the CUDA driver via
+//      _FortranACUFRegisterExternalVariable (= __cudaRegisterHostVar) from
+//      __cudaFortranConstructor, so the device-side symbol is mapped to the
+//      host-resident storage at module-load time and HMM/ATS handles
+//      migration -- avoiding the bug where host and device kept separate
+//      copies that never synced.
+//
+// Pipeline: cuf-device-global with cuda-unified=true (clones the host
+// global into the GPU module as an external declaration), then
+// cuf-add-constructor with cuda-unified=true (emits the registration call
+// for the cloned global).
+
+// RUN: fir-opt --cuf-device-global="cuda-unified=true" --cuf-add-constructor="cuda-unified=true" %s | FileCheck %s
+
+module attributes {dlti.dl_spec = #dlti.dl_spec<i8 = dense<8> : vector<2xi64>, i16 = dense<16> : vector<2xi64>, i1 = dense<8> : vector<2xi64>, !llvm.ptr = dense<64> : vector<4xi64>, f80 = dense<128> : vector<2xi64>, i128 = dense<128> : vector<2xi64>, i64 = dense<64> : vector<2xi64>, !llvm.ptr<271> = dense<32> : vector<4xi64>, !llvm.ptr<272> = dense<64> : vector<4xi64>, f128 = dense<128> : vector<2xi64>, !llvm.ptr<270> = dense<32> : vector<4xi64>, f16 = dense<16> : vector<2xi64>, f64 = dense<64> : vector<2xi64>, i32 = dense<32> : vector<2xi64>, "dlti.stack_alignment" = 128 : i64, "dlti.endianness" = "little">, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-linux-gnu"} {
+  fir.global @_QMmtestsEm(dense<[1, 2, 3, 4, 5]> : tensor<5xi32>) : !fir.array<5xi32>
+
+  func.func @_QMmtestsPg1() attributes {cuf.proc_attr = #cuf.cuda_proc<global>} {
+    %0 = fir.address_of(@_QMmtestsEm) : !fir.ref<!fir.array<5xi32>>
+    return
+  }
+
+  gpu.module @cuda_device_mod {
+    gpu.func @_QMmtestsPg1() kernel {
+      gpu.return
+    }
+  }
+}
+
+// Host-side definition is preserved.
+// CHECK: fir.global @_QMmtestsEm(dense<[1, 2, 3, 4, 5]> : tensor<5xi32>) : !fir.array<5xi32>
+
+// GPU module gets an external declaration (no body, no init). PTX lowers
+// it as `.extern .global ...`; nvlink permits the extern because acclnk
+// is invoked with -unifiedmem -init=unified -cudalink. The constructor
+// below registers the host pointer via the CUDA driver.
+// CHECK: gpu.module @cuda_device_mod
+// CHECK: fir.global @_QMmtestsEm : !fir.array<5xi32>
+// CHECK-NOT: fir.global @_QMmtestsEm{{.*}}dense
+
+// Constructor registers the host pointer.
+// CHECK: llvm.func internal @__cudaFortranConstructor()
+// CHECK: cuf.register_module @cuda_device_mod -> !llvm.ptr
+// CHECK: fir.address_of(@_QMmtestsEm) : !fir.ref<!fir.array<5xi32>>
+// CHECK: fir.call @_FortranACUFRegisterExternalVariable
+// CHECK-NOT: fir.call @_FortranACUFInitModule

From ae174242e0fb0299d25cf95c9b5f30146d621040 Mon Sep 17 00:00:00 2001
From: Zhen Wang <zhenw at nvidia.com>
Date: Wed, 6 May 2026 20:35:01 -0700
Subject: [PATCH 2/2] update test

---
 flang/test/Fir/CUDA/cuda-unified-module-global.f90 | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/flang/test/Fir/CUDA/cuda-unified-module-global.f90 b/flang/test/Fir/CUDA/cuda-unified-module-global.f90
index 360401ac6c523..fc34c9c4686e6 100644
--- a/flang/test/Fir/CUDA/cuda-unified-module-global.f90
+++ b/flang/test/Fir/CUDA/cuda-unified-module-global.f90
@@ -1,13 +1,12 @@
 // End-to-end check that under -gpu=mem:unified, a plain host module-scope
-// variable referenced from a global kernel (issue 2573):
-//   1. is mirrored into the GPU module by CUFDeviceGlobal so nvlink can
-//      resolve the device-side reference; and
+// variable referenced from a global kernel:
+//   1. is mirrored into the GPU module by CUFDeviceGlobal as a no-body
+//      external declaration (so PTX gets `.extern .global ...`); and
 //   2. is registered with the CUDA driver via
 //      _FortranACUFRegisterExternalVariable (= __cudaRegisterHostVar) from
-//      __cudaFortranConstructor, so the device-side symbol is mapped to the
-//      host-resident storage at module-load time and HMM/ATS handles
-//      migration -- avoiding the bug where host and device kept separate
-//      copies that never synced.
+//      __cudaFortranConstructor, so the device-side symbol is mapped to
+//      the host-resident storage at module-load time and HMM/ATS handles
+//      migration.
 //
 // Pipeline: cuf-device-global with cuda-unified=true (clones the host
 // global into the GPU module as an external declaration), then