[flang-commits] [flang] [llvm] [flang][CUDA] Support module-scope variables in `global` kernels under `-gpu=mem:unified` (PR #196228)
Zhen Wang via flang-commits
flang-commits at lists.llvm.org
Wed May 6 20:35:30 PDT 2026
https://github.com/wangzpgi created https://github.com/llvm/llvm-project/pull/196228
Under `-gpu=mem:unified`, plain Fortran module-scope variables referenced directly from a `global` kernel previously produced wrong results, because the host and the device kept separate copies of the variable that never synced. This adds a `cuda-unified` option to the CUF passes:
- CUFDeviceGlobal: when set, plain (un-attributed, non-constant) module globals are mirrored into the GPU module as no-body declarations, so PTX emits `.extern .global ...`.
- CUFAddConstructor: when set, emits a CUFRegisterExternalVariable call for each such global from `__cudaFortranConstructor`.
- New runtime entry `CUFRegisterExternalVariable` wraps `__cudaRegisterHostVar` so the CUDA driver maps the device extern to the host pointer at module-load time. HMM/ATS handles migration from there.
From 70a34253de49dd6827b8d3238c7bb9b70420e00a Mon Sep 17 00:00:00 2001
From: Zhen Wang <zhenw at nvidia.com>
Date: Wed, 6 May 2026 20:25:47 -0700
Subject: [PATCH 1/2] Support module-scope variables in global kernels under
-gpu=mem:unified
---
flang-rt/lib/cuda/registration.cpp | 15 ++++
.../flang/Optimizer/Transforms/Passes.td | 15 +++-
.../include/flang/Runtime/CUDA/registration.h | 7 ++
.../Transforms/CUDA/CUFAddConstructor.cpp | 73 ++++++++++++++++++-
.../Transforms/CUDA/CUFDeviceGlobal.cpp | 20 ++++-
flang/test/Fir/CUDA/cuda-constructor-2.f90 | 56 ++++++++++++++
flang/test/Fir/CUDA/cuda-device-global.f90 | 40 ++++++++++
.../Fir/CUDA/cuda-unified-module-global.f90 | 50 +++++++++++++
8 files changed, 271 insertions(+), 5 deletions(-)
create mode 100644 flang/test/Fir/CUDA/cuda-unified-module-global.f90
diff --git a/flang-rt/lib/cuda/registration.cpp b/flang-rt/lib/cuda/registration.cpp
index 58077d6a6a52b..1cfab3d2b20bd 100644
--- a/flang-rt/lib/cuda/registration.cpp
+++ b/flang-rt/lib/cuda/registration.cpp
@@ -27,6 +27,12 @@ extern void __cudaRegisterVar(void **fatCubinHandle, char *hostVar,
extern void __cudaRegisterManagedVar(void **fatCubinHandle,
void **hostVarPtrAddress, char *deviceAddress, const char *deviceName,
int ext, size_t size, int constant, int global);
+// __cudaRegisterHostVar registers a host-resident variable so that a
+// device-side reference of the same name resolves to the host pointer at
+// module-load time. Used for -gpu=mem:unified so kernel accesses go through
+// the host storage and HMM/ATS handles migration. Exported from libcudart.
+extern void __cudaRegisterHostVar(void **fatCubinHandle, const char *deviceName,
+ char *hostVar, size_t size);
extern char __cudaInitModule(void **fatCubinHandle);
void *RTDECL(CUFRegisterModule)(void *data) {
@@ -46,6 +52,15 @@ void RTDEF(CUFRegisterVariable)(
__cudaRegisterVar(module, varSym, varName, varName, 0, size, 0, 0);
}
+void RTDEF(CUFRegisterExternalVariable)(
+ void **module, char *varSym, const char *varName, int64_t size) {
+ // Bind device-side global <varName> to the host storage at <varSym>; note
+ // __cudaRegisterHostVar takes the name before the host pointer. Kernel
+ // accesses then go through the host address; HMM/ATS handles migration.
+ // Mirrors classic nvfortran's walk_cuda_register() under DATA_UNIFIED.
+ __cudaRegisterHostVar(module, varName, varSym, size);
+}
+
void RTDEF(CUFRegisterManagedVariable)(
void **module, void **varSym, char *varName, int64_t size) {
__cudaRegisterManagedVar(module, varSym, varName, varName, 0, size, 0, 0);
diff --git a/flang/include/flang/Optimizer/Transforms/Passes.td b/flang/include/flang/Optimizer/Transforms/Passes.td
index e107672adf907..d9072e7aab4f7 100644
--- a/flang/include/flang/Optimizer/Transforms/Passes.td
+++ b/flang/include/flang/Optimizer/Transforms/Passes.td
@@ -517,7 +517,12 @@ def CUFDeviceGlobal :
let options = [
Option<"skipDeadDeclares", "skip-dead-declares", "bool",
/*default=*/"true",
- "Skip globals whose only use is a dead fir.declare">
+ "Skip globals whose only use is a dead fir.declare">,
+ Option<"cudaUnified", "cuda-unified", "bool", /*default=*/"false",
+ "Treat host module globals as unified memory (-gpu=mem:unified): "
+ "clone them into the GPU module as external declarations so PTX "
+ "emits `.extern .global ...`. The CUDA runtime maps the device "
+ "extern to the host pointer via __cudaRegisterHostVar.">
];
}
@@ -526,6 +531,14 @@ def CUFAddConstructor : Pass<"cuf-add-constructor", "mlir::ModuleOp"> {
let dependentDialects = [
"cuf::CUFDialect", "mlir::func::FuncDialect", "mlir::DLTIDialect"
];
+ let options = [
+ Option<"cudaUnified", "cuda-unified", "bool", /*default=*/"false",
+ "Treat host module globals as unified memory (-gpu=mem:unified): "
+ "register every plain host module global that is mirrored in the "
+ "GPU module via CUFRegisterExternalVariable, so the CUDA driver "
+ "maps the device-side symbol to the host pointer at module-load "
+ "time and HMM/ATS handles migration.">
+ ];
}
def CUFGPUToLLVMConversion : Pass<"cuf-gpu-convert-to-llvm", "mlir::ModuleOp"> {
diff --git a/flang/include/flang/Runtime/CUDA/registration.h b/flang/include/flang/Runtime/CUDA/registration.h
index 74dbf9e189076..48a74dc1d6d03 100644
--- a/flang/include/flang/Runtime/CUDA/registration.h
+++ b/flang/include/flang/Runtime/CUDA/registration.h
@@ -28,6 +28,13 @@ void RTDECL(CUFRegisterFunction)(
void RTDECL(CUFRegisterVariable)(
void **module, char *varSym, const char *varName, int64_t size);
+/// Register a module-scope variable as host-resident under -gpu=mem:unified,
+/// so that the device-side symbol of the same name is mapped to the host
+/// pointer at module-load time. Wraps __cudaRegisterHostVar. Kernel accesses
+/// to the variable then reach the host storage directly via HMM/ATS.
+void RTDECL(CUFRegisterExternalVariable)(
+ void **module, char *varSym, const char *varName, int64_t size);
+
/// Register a managed variable.
void RTDECL(CUFRegisterManagedVariable)(
void **module, void **varSym, char *varName, int64_t size);
diff --git a/flang/lib/Optimizer/Transforms/CUDA/CUFAddConstructor.cpp b/flang/lib/Optimizer/Transforms/CUDA/CUFAddConstructor.cpp
index 248905fdc70d5..6c97346b89d60 100644
--- a/flang/lib/Optimizer/Transforms/CUDA/CUFAddConstructor.cpp
+++ b/flang/lib/Optimizer/Transforms/CUDA/CUFAddConstructor.cpp
@@ -20,6 +20,7 @@
#include "flang/Optimizer/Dialect/FIROpsSupport.h"
#include "flang/Optimizer/Dialect/FIRType.h"
#include "flang/Optimizer/Support/DataLayout.h"
+#include "flang/Optimizer/Transforms/Passes.h"
#include "flang/Runtime/CUDA/registration.h"
#include "flang/Runtime/entry-names.h"
#include "mlir/Dialect/DLTI/DLTI.h"
@@ -76,12 +77,34 @@ static fir::GlobalOp createManagedPointerGlobal(fir::FirOpBuilder &builder,
return ptrGlobal;
}
+/// Return true if \p hostGlobal needs CUFRegisterExternalVariable under
+/// -gpu=mem:unified: it has no CUF data attribute, is not constant, and
+/// \p gpuSymTable holds a same-named fir.global without an initializer (the
+/// no-body external mirror created by the CUFDeviceGlobal pass). Registering
+/// it lets the device-side `.extern` symbol resolve to the host pointer at
+/// module-load time, after which HMM/ATS handles migration.
+static bool isCudaUnifiedExternalGlobal(fir::GlobalOp hostGlobal,
+ mlir::SymbolTable &gpuSymTable) {
+ if (hostGlobal.getDataAttrAttr())
+ return false;
+ if (hostGlobal.getConstant())
+ return false;
+ auto gpuGlobal = gpuSymTable.lookup<fir::GlobalOp>(hostGlobal.getSymName());
+ if (!gpuGlobal)
+ return false;
+ return !gpuGlobal.isInitialized();
+}
+
static bool hasRegisteredGlobals(mlir::ModuleOp mod,
- mlir::SymbolTable gpuSymTable) {
+ mlir::SymbolTable gpuSymTable,
+ bool cudaUnified) {
for (fir::GlobalOp globalOp : mod.getOps<fir::GlobalOp>()) {
auto attr = globalOp.getDataAttrAttr();
- if (!attr)
+ if (!attr) {
+ if (cudaUnified && isCudaUnifiedExternalGlobal(globalOp, gpuSymTable))
+ return true;
continue;
+ }
if (!gpuSymTable.lookup(globalOp.getSymName()))
continue;
if (attr.getValue() == cuf::DataAttribute::Managed &&
@@ -110,6 +133,8 @@ static bool hasKernel(mlir::gpu::GPUModuleOp gpuMod) {
struct CUFAddConstructor
: public fir::impl::CUFAddConstructorBase<CUFAddConstructor> {
+ using CUFAddConstructorBase::CUFAddConstructorBase;
+
void runOnOperation() override {
mlir::ModuleOp mod = getOperation();
mlir::SymbolTable symTab(mod);
@@ -151,7 +176,8 @@ struct CUFAddConstructor
if (gpuMod) {
mlir::SymbolTable gpuSymTable(gpuMod);
bool needsModuleRegistration =
- hasKernel(gpuMod) || hasRegisteredGlobals(mod, gpuSymTable);
+ hasKernel(gpuMod) ||
+ hasRegisteredGlobals(mod, gpuSymTable, cudaUnified);
if (needsModuleRegistration) {
auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(ctx);
auto registeredMod = cuf::RegisterModuleOp::create(
@@ -241,6 +267,47 @@ struct CUFAddConstructor
}
}
+ // Register externally-linked module globals under -gpu=mem:unified.
+ // CUFDeviceGlobal cloned them into the GPU module with external
+ // linkage so PTX emits .extern; the CUDA driver patches the device
+ // reference to the host pointer at module-load time after this call.
+ // Works uniformly for fixed-shape (e.g. fir.array<5xi32>) and
+ // allocatable (fir.box<fir.heap<...>>) module globals -- the size
+ // computation is the same as the managed path above.
+ if (cudaUnified) {
+ for (fir::GlobalOp globalOp : mod.getOps<fir::GlobalOp>()) {
+ if (!isCudaUnifiedExternalGlobal(globalOp, gpuSymTable))
+ continue;
+
+ std::string gblNameStr = globalOp.getSymbol().getValue().str();
+ gblNameStr += '\0';
+ mlir::Value gblName = fir::getBase(
+ fir::factory::createStringLiteral(builder, loc, gblNameStr));
+
+ std::optional<uint64_t> size;
+ if (auto boxTy =
+ mlir::dyn_cast<fir::BaseBoxType>(globalOp.getType())) {
+ mlir::Type structTy = typeConverter.convertBoxTypeAsStruct(boxTy);
+ size = dl->getTypeSizeInBits(structTy) / 8;
+ }
+ if (!size) {
+ size = fir::getTypeSizeAndAlignmentOrCrash(
+ loc, globalOp.getType(), *dl, kindMap)
+ .first;
+ }
+ auto sizeVal = builder.createIntegerConstant(loc, idxTy, *size);
+
+ mlir::func::FuncOp func = fir::runtime::getRuntimeFunc<mkRTKey(
+ CUFRegisterExternalVariable)>(loc, builder);
+ auto fTy = func.getFunctionType();
+ mlir::Value addr = fir::AddrOfOp::create(
+ builder, loc, globalOp.resultType(), globalOp.getSymbol());
+ llvm::SmallVector<mlir::Value> args{fir::runtime::createArguments(
+ builder, loc, fTy, registeredMod, addr, gblName, sizeVal)};
+ fir::CallOp::create(builder, loc, func, args);
+ }
+ }
+
if (hasNonAllocManagedGlobal) {
// Initialize the module after all variables are registered so the
// runtime populates managed variable unified memory pointers.
diff --git a/flang/lib/Optimizer/Transforms/CUDA/CUFDeviceGlobal.cpp b/flang/lib/Optimizer/Transforms/CUDA/CUFDeviceGlobal.cpp
index 9970197627f29..326051327f86a 100644
--- a/flang/lib/Optimizer/Transforms/CUDA/CUFDeviceGlobal.cpp
+++ b/flang/lib/Optimizer/Transforms/CUDA/CUFDeviceGlobal.cpp
@@ -165,7 +165,25 @@ class CUFDeviceGlobal : public fir::impl::CUFDeviceGlobalBase<CUFDeviceGlobal> {
if (gpuSymTable.lookup<fir::GlobalOp>(globalName)) {
continue;
}
- gpuSymTable.insert(globalOp->clone());
+ auto *cloned = globalOp->clone();
+ // Under -gpu=mem:unified, plain host module-scope variables (no
+ // explicit CUF data attribute, not a constant) get a no-body
+ // declaration in the GPU module: clear the body, init value, and
+ // linkName. With no linkName, the LLVM lowering uses the default
+ // External linkage (see convertLinkage in CodeGen.cpp), so an
+ // initializer-less global emits as `.extern .global ...` in PTX.
+ // The host-side definition stays. CUFAddConstructor will emit
+ // CUFRegisterExternalVariable (= __cudaRegisterHostVar) so the CUDA
+ // runtime maps the device extern to the host pointer at module-load
+ // time, and HMM/ATS handles migration.
+ if (cudaUnified && !globalOp.getConstant() &&
+ !globalOp.getDataAttrAttr()) {
+ auto clonedGlobal = mlir::cast<fir::GlobalOp>(cloned);
+ clonedGlobal.getRegion().getBlocks().clear();
+ clonedGlobal.removeInitValAttr();
+ clonedGlobal.removeLinkNameAttr();
+ }
+ gpuSymTable.insert(cloned);
}
}
};
diff --git a/flang/test/Fir/CUDA/cuda-constructor-2.f90 b/flang/test/Fir/CUDA/cuda-constructor-2.f90
index 452b89bea6b80..bb2a98c294558 100644
--- a/flang/test/Fir/CUDA/cuda-constructor-2.f90
+++ b/flang/test/Fir/CUDA/cuda-constructor-2.f90
@@ -1,4 +1,5 @@
// RUN: fir-opt --split-input-file --cuf-add-constructor %s | FileCheck %s
+// RUN: fir-opt --split-input-file --cuf-add-constructor="cuda-unified=true" %s | FileCheck %s --check-prefixes=CHECK,UNIFIED
module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.ident = "flang version 20.0.0 (https://github.com/llvm/llvm-project.git cae351f3453a0a26ec8eb2ddaf773c24a29d929e)", llvm.target_triple = "x86_64-unknown-linux-gnu"} {
@@ -201,3 +202,58 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr, dense<
// CHECK: cuf.register_module @cuda_device_mod -> !llvm.ptr
// CHECK: fir.address_of(@_QMkernels_mEdev_var) : !fir.ref<f32>
// CHECK: fir.call @_FortranACUFRegisterVariable(%3, %4, %5, %6) : (!fir.ref<!fir.llvm_ptr<i8>>, !fir.ref<i8>, !fir.ref<i8>, i64) -> ()
+
+// -----
+
+// Under -gpu=mem:unified, a non-allocatable host module global referenced from
+// device code (mirrored as an external-linkage clone in the GPU module by the
+// CUFDeviceGlobal pass) must be registered with
+// _FortranACUFRegisterExternalVariable so the device-side `.extern` symbol
+// resolves to the host pointer at module-load time.
+
+module attributes {dlti.dl_spec = #dlti.dl_spec<i8 = dense<8> : vector<2xi64>, i16 = dense<16> : vector<2xi64>, i1 = dense<8> : vector<2xi64>, !llvm.ptr = dense<64> : vector<4xi64>, f80 = dense<128> : vector<2xi64>, i128 = dense<128> : vector<2xi64>, i64 = dense<64> : vector<2xi64>, !llvm.ptr<271> = dense<32> : vector<4xi64>, !llvm.ptr<272> = dense<64> : vector<4xi64>, f128 = dense<128> : vector<2xi64>, !llvm.ptr<270> = dense<32> : vector<4xi64>, f16 = dense<16> : vector<2xi64>, f64 = dense<64> : vector<2xi64>, i32 = dense<32> : vector<2xi64>, "dlti.stack_alignment" = 128 : i64, "dlti.endianness" = "little">, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-linux-gnu"} {
+ fir.global @_QMmtestsEm(dense<[1, 2, 3, 4, 5]> : tensor<5xi32>) : !fir.array<5xi32>
+ gpu.module @cuda_device_mod {
+ gpu.func @_QMmtestsPg1() kernel {
+ gpu.return
+ }
+ // External declaration: no body, no init value -- lowers to PTX
+ // `.extern .global ...`. CUFAddConstructor must register it so the
+ // CUDA runtime maps it to the host pointer at module-load time.
+ fir.global @_QMmtestsEm : !fir.array<5xi32>
+ }
+}
+
+// UNIFIED: llvm.func internal @__cudaFortranConstructor()
+// UNIFIED: cuf.register_module @cuda_device_mod -> !llvm.ptr
+// UNIFIED: fir.address_of(@_QMmtestsEm) : !fir.ref<!fir.array<5xi32>>
+// UNIFIED: fir.call @_FortranACUFRegisterExternalVariable
+// UNIFIED-NOT: fir.call @_FortranACUFInitModule
+
+// -----
+
+// Under -gpu=mem:unified, an allocatable host module global also gets
+// registered via _FortranACUFRegisterExternalVariable; the registered symbol
+// is the descriptor (fir.box<fir.heap<...>>). The host runtime allocates the
+// data buffer in HMM/ATS-accessible memory.
+
+module attributes {dlti.dl_spec = #dlti.dl_spec<i8 = dense<8> : vector<2xi64>, i16 = dense<16> : vector<2xi64>, i1 = dense<8> : vector<2xi64>, !llvm.ptr = dense<64> : vector<4xi64>, f80 = dense<128> : vector<2xi64>, i128 = dense<128> : vector<2xi64>, i64 = dense<64> : vector<2xi64>, !llvm.ptr<271> = dense<32> : vector<4xi64>, !llvm.ptr<272> = dense<64> : vector<4xi64>, f128 = dense<128> : vector<2xi64>, !llvm.ptr<270> = dense<32> : vector<4xi64>, f16 = dense<16> : vector<2xi64>, f64 = dense<64> : vector<2xi64>, i32 = dense<32> : vector<2xi64>, "dlti.stack_alignment" = 128 : i64, "dlti.endianness" = "little">, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-linux-gnu"} {
+ fir.global @_QMmtestsEma : !fir.box<!fir.heap<!fir.array<?xi32>>> {
+ %c0 = arith.constant 0 : index
+ %0 = fir.zero_bits !fir.heap<!fir.array<?xi32>>
+ %1 = fircg.ext_embox %0(%c0) {allocator_idx = 2 : i32} : (!fir.heap<!fir.array<?xi32>>, index) -> !fir.box<!fir.heap<!fir.array<?xi32>>>
+ fir.has_value %1 : !fir.box<!fir.heap<!fir.array<?xi32>>>
+ }
+ gpu.module @cuda_device_mod {
+ gpu.func @_QMmtestsPg1() kernel {
+ gpu.return
+ }
+ fir.global @_QMmtestsEma : !fir.box<!fir.heap<!fir.array<?xi32>>>
+ }
+}
+
+// UNIFIED: llvm.func internal @__cudaFortranConstructor()
+// UNIFIED: cuf.register_module @cuda_device_mod -> !llvm.ptr
+// UNIFIED: fir.address_of(@_QMmtestsEma) : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
+// UNIFIED: fir.call @_FortranACUFRegisterExternalVariable
+// UNIFIED-NOT: fir.call @_FortranACUFInitModule
diff --git a/flang/test/Fir/CUDA/cuda-device-global.f90 b/flang/test/Fir/CUDA/cuda-device-global.f90
index a35ab6d2ebc95..4f4dd222d9eae 100644
--- a/flang/test/Fir/CUDA/cuda-device-global.f90
+++ b/flang/test/Fir/CUDA/cuda-device-global.f90
@@ -1,5 +1,6 @@
// RUN: fir-opt --split-input-file --cuf-device-global %s | FileCheck %s
+// RUN: fir-opt --split-input-file --cuf-device-global="cuda-unified=true" %s | FileCheck %s --check-prefix=UNIFIED
module attributes {fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module} {
@@ -84,3 +85,42 @@ module attributes {fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.conta
// CHECK-DAG: fir.global @_QMmEa
// CHECK-DAG: fir.global @_QMmEb
// CHECK-DAG: fir.global @_QMmEc
+
+// -----
+
+// Under -gpu=mem:unified (cuda-unified=true), plain host module-scope
+// variables referenced from device code are mirrored as no-body external
+// declarations in the GPU module. PTX lowers them as `.extern .global ...`.
+// CUFAddConstructor + the runtime then map the device-side extern to the
+// host pointer via __cudaRegisterHostVar.
+
+module attributes {fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module} {
+ fir.global @_QMmtestsEm(dense<[1, 2, 3, 4, 5]> : tensor<5xi32>) : !fir.array<5xi32>
+ func.func @_QMmtestsPg1() attributes {cuf.proc_attr = #cuf.cuda_proc<global>} {
+ %0 = fir.address_of(@_QMmtestsEm) : !fir.ref<!fir.array<5xi32>>
+ return
+ }
+}
+
+// Host-side definition is preserved.
+// UNIFIED: fir.global @_QMmtestsEm(dense<[1, 2, 3, 4, 5]> : tensor<5xi32>) : !fir.array<5xi32>
+// GPU-module clone is an external declaration (no init body, no `dense<...>`).
+// UNIFIED: gpu.module @cuda_device_mod
+// UNIFIED: fir.global @_QMmtestsEm : !fir.array<5xi32>
+// UNIFIED-NOT: fir.global @_QMmtestsEm{{.*}}dense
+
+// -----
+
+// Globals with an explicit CUF data attribute (device, managed, constant)
+// keep their existing definition-clone path even with cuda-unified=true.
+
+module attributes {fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module} {
+ fir.global @_QMmtestsEdev(dense<[1, 2, 3]> : tensor<3xi32>) {data_attr = #cuf.cuda<device>} : !fir.array<3xi32>
+ func.func @_QMmtestsPg1() attributes {cuf.proc_attr = #cuf.cuda_proc<global>} {
+ %0 = fir.address_of(@_QMmtestsEdev) : !fir.ref<!fir.array<3xi32>>
+ return
+ }
+}
+
+// UNIFIED: gpu.module @cuda_device_mod
+// UNIFIED: fir.global @_QMmtestsEdev(dense<[1, 2, 3]> : tensor<3xi32>) {data_attr = #cuf.cuda<device>} : !fir.array<3xi32>
diff --git a/flang/test/Fir/CUDA/cuda-unified-module-global.f90 b/flang/test/Fir/CUDA/cuda-unified-module-global.f90
new file mode 100644
index 0000000000000..360401ac6c523
--- /dev/null
+++ b/flang/test/Fir/CUDA/cuda-unified-module-global.f90
@@ -0,0 +1,50 @@
+// End-to-end check that under -gpu=mem:unified, a plain host module-scope
+// variable referenced from a global kernel (issue 2573):
+// 1. is mirrored into the GPU module by CUFDeviceGlobal so nvlink can
+// resolve the device-side reference; and
+// 2. is registered with the CUDA driver via
+// _FortranACUFRegisterExternalVariable (= __cudaRegisterHostVar) from
+// __cudaFortranConstructor, so the device-side symbol is mapped to the
+// host-resident storage at module-load time and HMM/ATS handles
+// migration -- avoiding the bug where host and device kept separate
+// copies that never synced.
+//
+// Pipeline: cuf-device-global with cuda-unified=true (clones the host
+// global into the GPU module as an external declaration), then
+// cuf-add-constructor with cuda-unified=true (emits the registration call
+// for the cloned global).
+
+// RUN: fir-opt --cuf-device-global="cuda-unified=true" --cuf-add-constructor="cuda-unified=true" %s | FileCheck %s
+
+module attributes {dlti.dl_spec = #dlti.dl_spec<i8 = dense<8> : vector<2xi64>, i16 = dense<16> : vector<2xi64>, i1 = dense<8> : vector<2xi64>, !llvm.ptr = dense<64> : vector<4xi64>, f80 = dense<128> : vector<2xi64>, i128 = dense<128> : vector<2xi64>, i64 = dense<64> : vector<2xi64>, !llvm.ptr<271> = dense<32> : vector<4xi64>, !llvm.ptr<272> = dense<64> : vector<4xi64>, f128 = dense<128> : vector<2xi64>, !llvm.ptr<270> = dense<32> : vector<4xi64>, f16 = dense<16> : vector<2xi64>, f64 = dense<64> : vector<2xi64>, i32 = dense<32> : vector<2xi64>, "dlti.stack_alignment" = 128 : i64, "dlti.endianness" = "little">, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-linux-gnu"} {
+ fir.global @_QMmtestsEm(dense<[1, 2, 3, 4, 5]> : tensor<5xi32>) : !fir.array<5xi32>
+
+ func.func @_QMmtestsPg1() attributes {cuf.proc_attr = #cuf.cuda_proc<global>} {
+ %0 = fir.address_of(@_QMmtestsEm) : !fir.ref<!fir.array<5xi32>>
+ return
+ }
+
+ gpu.module @cuda_device_mod {
+ gpu.func @_QMmtestsPg1() kernel {
+ gpu.return
+ }
+ }
+}
+
+// Host-side definition is preserved.
+// CHECK: fir.global @_QMmtestsEm(dense<[1, 2, 3, 4, 5]> : tensor<5xi32>) : !fir.array<5xi32>
+
+// GPU module gets an external declaration (no body, no init). PTX lowers
+// it as `.extern .global ...`; nvlink permits the extern because acclnk
+// is invoked with -unifiedmem -init=unified -cudalink. The constructor
+// below registers the host pointer via the CUDA driver.
+// CHECK: gpu.module @cuda_device_mod
+// CHECK: fir.global @_QMmtestsEm : !fir.array<5xi32>
+// CHECK-NOT: fir.global @_QMmtestsEm{{.*}}dense
+
+// Constructor registers the host pointer.
+// CHECK: llvm.func internal @__cudaFortranConstructor()
+// CHECK: cuf.register_module @cuda_device_mod -> !llvm.ptr
+// CHECK: fir.address_of(@_QMmtestsEm) : !fir.ref<!fir.array<5xi32>>
+// CHECK: fir.call @_FortranACUFRegisterExternalVariable
+// CHECK-NOT: fir.call @_FortranACUFInitModule
From ae174242e0fb0299d25cf95c9b5f30146d621040 Mon Sep 17 00:00:00 2001
From: Zhen Wang <zhenw at nvidia.com>
Date: Wed, 6 May 2026 20:35:01 -0700
Subject: [PATCH 2/2] update test
---
flang/test/Fir/CUDA/cuda-unified-module-global.f90 | 13 ++++++-------
1 file changed, 6 insertions(+), 7 deletions(-)
diff --git a/flang/test/Fir/CUDA/cuda-unified-module-global.f90 b/flang/test/Fir/CUDA/cuda-unified-module-global.f90
index 360401ac6c523..fc34c9c4686e6 100644
--- a/flang/test/Fir/CUDA/cuda-unified-module-global.f90
+++ b/flang/test/Fir/CUDA/cuda-unified-module-global.f90
@@ -1,13 +1,12 @@
// End-to-end check that under -gpu=mem:unified, a plain host module-scope
-// variable referenced from a global kernel (issue 2573):
-// 1. is mirrored into the GPU module by CUFDeviceGlobal so nvlink can
-// resolve the device-side reference; and
+// variable referenced from a global kernel:
+// 1. is mirrored into the GPU module by CUFDeviceGlobal as a no-body
+// external declaration (so PTX gets `.extern .global ...`); and
// 2. is registered with the CUDA driver via
// _FortranACUFRegisterExternalVariable (= __cudaRegisterHostVar) from
-// __cudaFortranConstructor, so the device-side symbol is mapped to the
-// host-resident storage at module-load time and HMM/ATS handles
-// migration -- avoiding the bug where host and device kept separate
-// copies that never synced.
+// __cudaFortranConstructor, so the device-side symbol is mapped to
+// the host-resident storage at module-load time and HMM/ATS handles
+// migration.
//
// Pipeline: cuf-device-global with cuda-unified=true (clones the host
// global into the GPU module as an external declaration), then
More information about the flang-commits
mailing list