[flang-commits] [flang] 38c36f9 - [flang][CUDA] Support module-scope variables in `global` kernels under `-gpu=mem:unified` (#196228)
via flang-commits
flang-commits at lists.llvm.org
Thu May 7 10:58:17 PDT 2026
Author: Zhen Wang
Date: 2026-05-07T17:58:12Z
New Revision: 38c36f9c8269661bae13bfbe006d78622897e645
URL: https://github.com/llvm/llvm-project/commit/38c36f9c8269661bae13bfbe006d78622897e645
DIFF: https://github.com/llvm/llvm-project/commit/38c36f9c8269661bae13bfbe006d78622897e645.diff
LOG: [flang][CUDA] Support module-scope variables in `global` kernels under `-gpu=mem:unified` (#196228)
Under `-gpu=mem:unified`, plain Fortran module-scope variables
referenced directly from a `global` kernel previously produced wrong
results. This adds a `cuda-unified` option to the CUF passes:
- CUFDeviceGlobal: when set, plain (un-attributed, non-constant) module
globals are mirrored into the GPU module as no-body declarations, so PTX
emits `.extern .global ...`.
- CUFAddConstructor: when set, emits a CUFRegisterExternalVariable call
for each such global from `__cudaFortranConstructor`.
- New runtime entry `CUFRegisterExternalVariable` wraps
`__cudaRegisterHostVar` so the CUDA driver maps the device extern to the
host pointer at module-load time. HMM/ATS handles migration from there.
Added:
flang/test/Fir/CUDA/cuda-unified-module-global.f90
Modified:
flang-rt/lib/cuda/registration.cpp
flang/include/flang/Optimizer/Transforms/Passes.td
flang/include/flang/Runtime/CUDA/registration.h
flang/lib/Optimizer/Transforms/CUDA/CUFAddConstructor.cpp
flang/lib/Optimizer/Transforms/CUDA/CUFDeviceGlobal.cpp
flang/test/Fir/CUDA/cuda-constructor-2.f90
flang/test/Fir/CUDA/cuda-device-global.f90
Removed:
################################################################################
diff --git a/flang-rt/lib/cuda/registration.cpp b/flang-rt/lib/cuda/registration.cpp
index 58077d6a6a52b..a35284a2460dc 100644
--- a/flang-rt/lib/cuda/registration.cpp
+++ b/flang-rt/lib/cuda/registration.cpp
@@ -27,6 +27,8 @@ extern void __cudaRegisterVar(void **fatCubinHandle, char *hostVar,
extern void __cudaRegisterManagedVar(void **fatCubinHandle,
void **hostVarPtrAddress, char *deviceAddress, const char *deviceName,
int ext, size_t size, int constant, int global);
+extern void __cudaRegisterHostVar(
+ void **fatCubinHandle, const char *deviceName, char *hostVar, size_t size);
extern char __cudaInitModule(void **fatCubinHandle);
void *RTDECL(CUFRegisterModule)(void *data) {
@@ -46,6 +48,14 @@ void RTDEF(CUFRegisterVariable)(
__cudaRegisterVar(module, varSym, varName, varName, 0, size, 0, 0);
}
+void RTDEF(CUFRegisterExternalVariable)(
+ void **module, char *varSym, const char *varName, int64_t size) {
+ // Tell the CUDA driver to bind the device-side global <varName> to the
+ // host-resident storage at <varSym>. Kernel accesses to <varName> then go
+ // through the host address; HMM/ATS handles migration.
+ __cudaRegisterHostVar(module, varName, varSym, size);
+}
+
void RTDEF(CUFRegisterManagedVariable)(
void **module, void **varSym, char *varName, int64_t size) {
__cudaRegisterManagedVar(module, varSym, varName, varName, 0, size, 0, 0);
diff --git a/flang/include/flang/Optimizer/Transforms/Passes.td b/flang/include/flang/Optimizer/Transforms/Passes.td
index e107672adf907..d9072e7aab4f7 100644
--- a/flang/include/flang/Optimizer/Transforms/Passes.td
+++ b/flang/include/flang/Optimizer/Transforms/Passes.td
@@ -517,7 +517,12 @@ def CUFDeviceGlobal :
let options = [
Option<"skipDeadDeclares", "skip-dead-declares", "bool",
/*default=*/"true",
- "Skip globals whose only use is a dead fir.declare">
+ "Skip globals whose only use is a dead fir.declare">,
+ Option<"cudaUnified", "cuda-unified", "bool", /*default=*/"false",
+ "Treat host module globals as unified memory (-gpu=mem:unified): "
+ "clone them into the GPU module as external declarations so PTX "
+ "emits `.extern .global ...`. The CUDA runtime maps the device "
+ "extern to the host pointer via __cudaRegisterHostVar.">
];
}
@@ -526,6 +531,14 @@ def CUFAddConstructor : Pass<"cuf-add-constructor", "mlir::ModuleOp"> {
let dependentDialects = [
"cuf::CUFDialect", "mlir::func::FuncDialect", "mlir::DLTIDialect"
];
+ let options = [
+ Option<"cudaUnified", "cuda-unified", "bool", /*default=*/"false",
+ "Treat host module globals as unified memory (-gpu=mem:unified): "
+ "register every plain host module global that is mirrored in the "
+ "GPU module via CUFRegisterExternalVariable, so the CUDA driver "
+ "maps the device-side symbol to the host pointer at module-load "
+ "time and HMM/ATS handles migration.">
+ ];
}
def CUFGPUToLLVMConversion : Pass<"cuf-gpu-convert-to-llvm", "mlir::ModuleOp"> {
diff --git a/flang/include/flang/Runtime/CUDA/registration.h b/flang/include/flang/Runtime/CUDA/registration.h
index 74dbf9e189076..48a74dc1d6d03 100644
--- a/flang/include/flang/Runtime/CUDA/registration.h
+++ b/flang/include/flang/Runtime/CUDA/registration.h
@@ -28,6 +28,13 @@ void RTDECL(CUFRegisterFunction)(
void RTDECL(CUFRegisterVariable)(
void **module, char *varSym, const char *varName, int64_t size);
+/// Register a module-scope variable as host-resident under -gpu=mem:unified,
+/// so that the device-side symbol of the same name is mapped to the host
+/// pointer at module-load time. Wraps __cudaRegisterHostVar. Kernel accesses
+/// to the variable then reach the host storage directly via HMM/ATS.
+void RTDECL(CUFRegisterExternalVariable)(
+ void **module, char *varSym, const char *varName, int64_t size);
+
/// Register a managed variable.
void RTDECL(CUFRegisterManagedVariable)(
void **module, void **varSym, char *varName, int64_t size);
diff --git a/flang/lib/Optimizer/Transforms/CUDA/CUFAddConstructor.cpp b/flang/lib/Optimizer/Transforms/CUDA/CUFAddConstructor.cpp
index 248905fdc70d5..466d65d16aadf 100644
--- a/flang/lib/Optimizer/Transforms/CUDA/CUFAddConstructor.cpp
+++ b/flang/lib/Optimizer/Transforms/CUDA/CUFAddConstructor.cpp
@@ -20,6 +20,7 @@
#include "flang/Optimizer/Dialect/FIROpsSupport.h"
#include "flang/Optimizer/Dialect/FIRType.h"
#include "flang/Optimizer/Support/DataLayout.h"
+#include "flang/Optimizer/Transforms/Passes.h"
#include "flang/Runtime/CUDA/registration.h"
#include "flang/Runtime/entry-names.h"
#include "mlir/Dialect/DLTI/DLTI.h"
@@ -76,12 +77,89 @@ static fir::GlobalOp createManagedPointerGlobal(fir::FirOpBuilder &builder,
return ptrGlobal;
}
+/// Return true if \p hostGlobal is a host module-scope global that has been
+/// mirrored in the GPU module as an external (no-body) declaration by the
+/// CUFDeviceGlobal pass under -gpu=mem:unified. Such globals must be
+/// registered with the CUDA driver via CUFRegisterExternalVariable so the
+/// device-side `.extern` symbol resolves to the host pointer at module-load
+/// time and HMM/ATS handles migration.
+static bool isCudaUnifiedExternalGlobal(fir::GlobalOp hostGlobal,
+ mlir::SymbolTable &gpuSymTable) {
+ if (hostGlobal.getDataAttrAttr())
+ return false;
+ if (hostGlobal.getConstant())
+ return false;
+ auto gpuGlobal = gpuSymTable.lookup<fir::GlobalOp>(hostGlobal.getSymName());
+ if (!gpuGlobal)
+ return false;
+ return !gpuGlobal.isInitialized();
+}
+
+/// Build a C-style name literal (`<symname>\0`) for use as the deviceName
+/// argument of a CUF registration runtime call.
+static mlir::Value buildGlobalNameLiteral(fir::FirOpBuilder &builder,
+ mlir::Location loc,
+ fir::GlobalOp globalOp) {
+ std::string nameStr = globalOp.getSymbol().getValue().str();
+ nameStr += '\0';
+ return fir::getBase(fir::factory::createStringLiteral(builder, loc, nameStr));
+}
+
+/// Compute the storage size in bytes of \p globalOp. For a box-typed global
+/// (e.g. an allocatable) the size is the descriptor size (after type
+/// conversion); otherwise it's the size of the global's declared type.
+static mlir::Value computeGlobalSize(fir::FirOpBuilder &builder,
+ mlir::Location loc, mlir::Type idxTy,
+ const mlir::DataLayout &dl,
+ const fir::KindMapping &kindMap,
+ fir::LLVMTypeConverter &typeConverter,
+ fir::GlobalOp globalOp) {
+ std::optional<uint64_t> size;
+ if (auto boxTy = mlir::dyn_cast<fir::BaseBoxType>(globalOp.getType())) {
+ mlir::Type structTy = typeConverter.convertBoxTypeAsStruct(boxTy);
+ size = dl.getTypeSizeInBits(structTy) / 8;
+ }
+ if (!size) {
+ size = fir::getTypeSizeAndAlignmentOrCrash(loc, globalOp.getType(), dl,
+ kindMap)
+ .first;
+ }
+ return builder.createIntegerConstant(loc, idxTy, *size);
+}
+
+/// Emit a call to a CUF registration runtime function with the canonical
+/// (module, addr, name, size) signature, where addr is the address of \p
+/// addrGlobal taken via fir.address_of and name/size describe \p nameGlobal.
+/// Used for CUFRegisterVariable, CUFRegisterManagedVariable, and
+/// CUFRegisterExternalVariable.
+static void
+emitCUFRegistrationCall(fir::FirOpBuilder &builder, mlir::Location loc,
+ mlir::Type idxTy, const mlir::DataLayout &dl,
+ const fir::KindMapping &kindMap,
+ fir::LLVMTypeConverter &typeConverter,
+ mlir::Value registeredMod, mlir::func::FuncOp func,
+ fir::GlobalOp addrGlobal, fir::GlobalOp nameGlobal) {
+ mlir::Value gblName = buildGlobalNameLiteral(builder, loc, nameGlobal);
+ mlir::Value sizeVal = computeGlobalSize(builder, loc, idxTy, dl, kindMap,
+ typeConverter, nameGlobal);
+ mlir::Value addr = fir::AddrOfOp::create(
+ builder, loc, addrGlobal.resultType(), addrGlobal.getSymbol());
+ llvm::SmallVector<mlir::Value> args{
+ fir::runtime::createArguments(builder, loc, func.getFunctionType(),
+ registeredMod, addr, gblName, sizeVal)};
+ fir::CallOp::create(builder, loc, func, args);
+}
+
static bool hasRegisteredGlobals(mlir::ModuleOp mod,
- mlir::SymbolTable gpuSymTable) {
+ mlir::SymbolTable gpuSymTable,
+ bool cudaUnified) {
for (fir::GlobalOp globalOp : mod.getOps<fir::GlobalOp>()) {
auto attr = globalOp.getDataAttrAttr();
- if (!attr)
+ if (!attr) {
+ if (cudaUnified && isCudaUnifiedExternalGlobal(globalOp, gpuSymTable))
+ return true;
continue;
+ }
if (!gpuSymTable.lookup(globalOp.getSymName()))
continue;
if (attr.getValue() == cuf::DataAttribute::Managed &&
@@ -110,6 +188,8 @@ static bool hasKernel(mlir::gpu::GPUModuleOp gpuMod) {
struct CUFAddConstructor
: public fir::impl::CUFAddConstructorBase<CUFAddConstructor> {
+ using CUFAddConstructorBase::CUFAddConstructorBase;
+
void runOnOperation() override {
mlir::ModuleOp mod = getOperation();
mlir::SymbolTable symTab(mod);
@@ -151,7 +231,8 @@ struct CUFAddConstructor
if (gpuMod) {
mlir::SymbolTable gpuSymTable(gpuMod);
bool needsModuleRegistration =
- hasKernel(gpuMod) || hasRegisteredGlobals(mod, gpuSymTable);
+ hasKernel(gpuMod) ||
+ hasRegisteredGlobals(mod, gpuSymTable, cudaUnified);
if (needsModuleRegistration) {
auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(ctx);
auto registeredMod = cuf::RegisterModuleOp::create(
@@ -185,31 +266,10 @@ struct CUFAddConstructor
attr.getValue() == cuf::DataAttribute::Managed &&
!mlir::isa<fir::BaseBoxType>(globalOp.getType());
- mlir::func::FuncOp func;
switch (attr.getValue()) {
case cuf::DataAttribute::Device:
case cuf::DataAttribute::Constant:
case cuf::DataAttribute::Managed: {
- // Global variable name
- std::string gblNameStr = globalOp.getSymbol().getValue().str();
- gblNameStr += '\0';
- mlir::Value gblName = fir::getBase(
- fir::factory::createStringLiteral(builder, loc, gblNameStr));
-
- // Global variable size
- std::optional<uint64_t> size;
- if (auto boxTy =
- mlir::dyn_cast<fir::BaseBoxType>(globalOp.getType())) {
- mlir::Type structTy = typeConverter.convertBoxTypeAsStruct(boxTy);
- size = dl->getTypeSizeInBits(structTy) / 8;
- }
- if (!size) {
- size = fir::getTypeSizeAndAlignmentOrCrash(
- loc, globalOp.getType(), *dl, kindMap)
- .first;
- }
- auto sizeVal = builder.createIntegerConstant(loc, idxTy, *size);
-
if (isNonAllocManagedGlobal) {
hasNonAllocManagedGlobal = true;
// Non-allocatable managed globals use pointer indirection:
@@ -217,23 +277,20 @@ struct CUFAddConstructor
// memory address, registered via __cudaRegisterManagedVar.
fir::GlobalOp ptrGlobal =
createManagedPointerGlobal(builder, mod, globalOp);
- func = fir::runtime::getRuntimeFunc<mkRTKey(
+ auto func = fir::runtime::getRuntimeFunc<mkRTKey(
CUFRegisterManagedVariable)>(loc, builder);
- auto fTy = func.getFunctionType();
- mlir::Value addr = fir::AddrOfOp::create(
- builder, loc, ptrGlobal.resultType(), ptrGlobal.getSymbol());
- llvm::SmallVector<mlir::Value> args{fir::runtime::createArguments(
- builder, loc, fTy, registeredMod, addr, gblName, sizeVal)};
- fir::CallOp::create(builder, loc, func, args);
+ emitCUFRegistrationCall(builder, loc, idxTy, *dl, kindMap,
+ typeConverter, registeredMod, func,
+ /*addrGlobal=*/ptrGlobal,
+ /*nameGlobal=*/globalOp);
} else {
- func = fir::runtime::getRuntimeFunc<mkRTKey(CUFRegisterVariable)>(
- loc, builder);
- auto fTy = func.getFunctionType();
- mlir::Value addr = fir::AddrOfOp::create(
- builder, loc, globalOp.resultType(), globalOp.getSymbol());
- llvm::SmallVector<mlir::Value> args{fir::runtime::createArguments(
- builder, loc, fTy, registeredMod, addr, gblName, sizeVal)};
- fir::CallOp::create(builder, loc, func, args);
+ auto func =
+ fir::runtime::getRuntimeFunc<mkRTKey(CUFRegisterVariable)>(
+ loc, builder);
+ emitCUFRegistrationCall(builder, loc, idxTy, *dl, kindMap,
+ typeConverter, registeredMod, func,
+ /*addrGlobal=*/globalOp,
+ /*nameGlobal=*/globalOp);
}
} break;
default:
@@ -241,6 +298,25 @@ struct CUFAddConstructor
}
}
+ // Register externally-linked module globals under -gpu=mem:unified.
+ // CUFDeviceGlobal cloned them into the GPU module with external
+ // linkage so PTX emits .extern; the CUDA driver patches the device
+ // reference to the host pointer at module-load time after this call.
+ // Works uniformly for fixed-shape (e.g. fir.array<5xi32>) and
+ // allocatable (fir.box<fir.heap<...>>) module globals.
+ if (cudaUnified) {
+ for (fir::GlobalOp globalOp : mod.getOps<fir::GlobalOp>()) {
+ if (!isCudaUnifiedExternalGlobal(globalOp, gpuSymTable))
+ continue;
+ auto func = fir::runtime::getRuntimeFunc<mkRTKey(
+ CUFRegisterExternalVariable)>(loc, builder);
+ emitCUFRegistrationCall(builder, loc, idxTy, *dl, kindMap,
+ typeConverter, registeredMod, func,
+ /*addrGlobal=*/globalOp,
+ /*nameGlobal=*/globalOp);
+ }
+ }
+
if (hasNonAllocManagedGlobal) {
// Initialize the module after all variables are registered so the
// runtime populates managed variable unified memory pointers.
diff --git a/flang/lib/Optimizer/Transforms/CUDA/CUFDeviceGlobal.cpp b/flang/lib/Optimizer/Transforms/CUDA/CUFDeviceGlobal.cpp
index 9970197627f29..326051327f86a 100644
--- a/flang/lib/Optimizer/Transforms/CUDA/CUFDeviceGlobal.cpp
+++ b/flang/lib/Optimizer/Transforms/CUDA/CUFDeviceGlobal.cpp
@@ -165,7 +165,25 @@ class CUFDeviceGlobal : public fir::impl::CUFDeviceGlobalBase<CUFDeviceGlobal> {
if (gpuSymTable.lookup<fir::GlobalOp>(globalName)) {
continue;
}
- gpuSymTable.insert(globalOp->clone());
+ auto *cloned = globalOp->clone();
+ // Under -gpu=mem:unified, plain host module-scope variables (no
+ // explicit CUF data attribute, not a constant) get a no-body
+ // declaration in the GPU module: clear the body, init value, and
+ // linkName. With no linkName, the LLVM lowering uses the default
+ // External linkage (see convertLinkage in CodeGen.cpp), so an
+ // initializer-less global emits as `.extern .global ...` in PTX.
+ // The host-side definition stays. CUFAddConstructor will emit
+ // CUFRegisterExternalVariable (= __cudaRegisterHostVar) so the CUDA
+ // runtime maps the device extern to the host pointer at module-load
+ // time, and HMM/ATS handles migration.
+ if (cudaUnified && !globalOp.getConstant() &&
+ !globalOp.getDataAttrAttr()) {
+ auto clonedGlobal = mlir::cast<fir::GlobalOp>(cloned);
+ clonedGlobal.getRegion().getBlocks().clear();
+ clonedGlobal.removeInitValAttr();
+ clonedGlobal.removeLinkNameAttr();
+ }
+ gpuSymTable.insert(cloned);
}
}
};
diff --git a/flang/test/Fir/CUDA/cuda-constructor-2.f90 b/flang/test/Fir/CUDA/cuda-constructor-2.f90
index 452b89bea6b80..bb2a98c294558 100644
--- a/flang/test/Fir/CUDA/cuda-constructor-2.f90
+++ b/flang/test/Fir/CUDA/cuda-constructor-2.f90
@@ -1,4 +1,5 @@
// RUN: fir-opt --split-input-file --cuf-add-constructor %s | FileCheck %s
+// RUN: fir-opt --split-input-file --cuf-add-constructor="cuda-unified=true" %s | FileCheck %s --check-prefixes=CHECK,UNIFIED
module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.ident = "flang version 20.0.0 (https://github.com/llvm/llvm-project.git cae351f3453a0a26ec8eb2ddaf773c24a29d929e)", llvm.target_triple = "x86_64-unknown-linux-gnu"} {
@@ -201,3 +202,58 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr, dense<
// CHECK: cuf.register_module @cuda_device_mod -> !llvm.ptr
// CHECK: fir.address_of(@_QMkernels_mEdev_var) : !fir.ref<f32>
// CHECK: fir.call @_FortranACUFRegisterVariable(%3, %4, %5, %6) : (!fir.ref<!fir.llvm_ptr<i8>>, !fir.ref<i8>, !fir.ref<i8>, i64) -> ()
+
+// -----
+
+// Under -gpu=mem:unified, a non-allocatable host module global referenced from
+// device code (mirrored as an external-linkage clone in the GPU module by the
+// CUFDeviceGlobal pass) must be registered with
+// _FortranACUFRegisterExternalVariable so the device-side `.extern` symbol
+// resolves to the host pointer at module-load time.
+
+module attributes {dlti.dl_spec = #dlti.dl_spec<i8 = dense<8> : vector<2xi64>, i16 = dense<16> : vector<2xi64>, i1 = dense<8> : vector<2xi64>, !llvm.ptr = dense<64> : vector<4xi64>, f80 = dense<128> : vector<2xi64>, i128 = dense<128> : vector<2xi64>, i64 = dense<64> : vector<2xi64>, !llvm.ptr<271> = dense<32> : vector<4xi64>, !llvm.ptr<272> = dense<64> : vector<4xi64>, f128 = dense<128> : vector<2xi64>, !llvm.ptr<270> = dense<32> : vector<4xi64>, f16 = dense<16> : vector<2xi64>, f64 = dense<64> : vector<2xi64>, i32 = dense<32> : vector<2xi64>, "dlti.stack_alignment" = 128 : i64, "dlti.endianness" = "little">, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-linux-gnu"} {
+ fir.global @_QMmtestsEm(dense<[1, 2, 3, 4, 5]> : tensor<5xi32>) : !fir.array<5xi32>
+ gpu.module @cuda_device_mod {
+ gpu.func @_QMmtestsPg1() kernel {
+ gpu.return
+ }
+ // External declaration: no body, no init value -- lowers to PTX
+ // `.extern .global ...`. CUFAddConstructor must register it so the
+ // CUDA runtime maps it to the host pointer at module-load time.
+ fir.global @_QMmtestsEm : !fir.array<5xi32>
+ }
+}
+
+// UNIFIED: llvm.func internal @__cudaFortranConstructor()
+// UNIFIED: cuf.register_module @cuda_device_mod -> !llvm.ptr
+// UNIFIED: fir.address_of(@_QMmtestsEm) : !fir.ref<!fir.array<5xi32>>
+// UNIFIED: fir.call @_FortranACUFRegisterExternalVariable
+// UNIFIED-NOT: fir.call @_FortranACUFInitModule
+
+// -----
+
+// Under -gpu=mem:unified, an allocatable host module global also gets
+// registered via _FortranACUFRegisterExternalVariable; the registered symbol
+// is the descriptor (fir.box<fir.heap<...>>). The host runtime allocates the
+// data buffer in HMM/ATS-accessible memory.
+
+module attributes {dlti.dl_spec = #dlti.dl_spec<i8 = dense<8> : vector<2xi64>, i16 = dense<16> : vector<2xi64>, i1 = dense<8> : vector<2xi64>, !llvm.ptr = dense<64> : vector<4xi64>, f80 = dense<128> : vector<2xi64>, i128 = dense<128> : vector<2xi64>, i64 = dense<64> : vector<2xi64>, !llvm.ptr<271> = dense<32> : vector<4xi64>, !llvm.ptr<272> = dense<64> : vector<4xi64>, f128 = dense<128> : vector<2xi64>, !llvm.ptr<270> = dense<32> : vector<4xi64>, f16 = dense<16> : vector<2xi64>, f64 = dense<64> : vector<2xi64>, i32 = dense<32> : vector<2xi64>, "dlti.stack_alignment" = 128 : i64, "dlti.endianness" = "little">, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-linux-gnu"} {
+ fir.global @_QMmtestsEma : !fir.box<!fir.heap<!fir.array<?xi32>>> {
+ %c0 = arith.constant 0 : index
+ %0 = fir.zero_bits !fir.heap<!fir.array<?xi32>>
+ %1 = fircg.ext_embox %0(%c0) {allocator_idx = 2 : i32} : (!fir.heap<!fir.array<?xi32>>, index) -> !fir.box<!fir.heap<!fir.array<?xi32>>>
+ fir.has_value %1 : !fir.box<!fir.heap<!fir.array<?xi32>>>
+ }
+ gpu.module @cuda_device_mod {
+ gpu.func @_QMmtestsPg1() kernel {
+ gpu.return
+ }
+ fir.global @_QMmtestsEma : !fir.box<!fir.heap<!fir.array<?xi32>>>
+ }
+}
+
+// UNIFIED: llvm.func internal @__cudaFortranConstructor()
+// UNIFIED: cuf.register_module @cuda_device_mod -> !llvm.ptr
+// UNIFIED: fir.address_of(@_QMmtestsEma) : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
+// UNIFIED: fir.call @_FortranACUFRegisterExternalVariable
+// UNIFIED-NOT: fir.call @_FortranACUFInitModule
diff --git a/flang/test/Fir/CUDA/cuda-device-global.f90 b/flang/test/Fir/CUDA/cuda-device-global.f90
index a35ab6d2ebc95..4f4dd222d9eae 100644
--- a/flang/test/Fir/CUDA/cuda-device-global.f90
+++ b/flang/test/Fir/CUDA/cuda-device-global.f90
@@ -1,5 +1,6 @@
// RUN: fir-opt --split-input-file --cuf-device-global %s | FileCheck %s
+// RUN: fir-opt --split-input-file --cuf-device-global="cuda-unified=true" %s | FileCheck %s --check-prefix=UNIFIED
module attributes {fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module} {
@@ -84,3 +85,42 @@ module attributes {fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.conta
// CHECK-DAG: fir.global @_QMmEa
// CHECK-DAG: fir.global @_QMmEb
// CHECK-DAG: fir.global @_QMmEc
+
+// -----
+
+// Under -gpu=mem:unified (cuda-unified=true), plain host module-scope
+// variables referenced from device code are mirrored as no-body external
+// declarations in the GPU module. PTX lowers them as `.extern .global ...`.
+// CUFAddConstructor + the runtime then map the device-side extern to the
+// host pointer via __cudaRegisterHostVar.
+
+module attributes {fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module} {
+ fir.global @_QMmtestsEm(dense<[1, 2, 3, 4, 5]> : tensor<5xi32>) : !fir.array<5xi32>
+ func.func @_QMmtestsPg1() attributes {cuf.proc_attr = #cuf.cuda_proc<global>} {
+ %0 = fir.address_of(@_QMmtestsEm) : !fir.ref<!fir.array<5xi32>>
+ return
+ }
+}
+
+// Host-side definition is preserved.
+// UNIFIED: fir.global @_QMmtestsEm(dense<[1, 2, 3, 4, 5]> : tensor<5xi32>) : !fir.array<5xi32>
+// GPU-module clone is an external declaration (no init body, no `dense<...>`).
+// UNIFIED: gpu.module @cuda_device_mod
+// UNIFIED: fir.global @_QMmtestsEm : !fir.array<5xi32>
+// UNIFIED-NOT: fir.global @_QMmtestsEm{{.*}}dense
+
+// -----
+
+// Globals with an explicit CUF data attribute (device, managed, constant)
+// keep their existing definition-clone path even with cuda-unified=true.
+
+module attributes {fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module} {
+ fir.global @_QMmtestsEdev(dense<[1, 2, 3]> : tensor<3xi32>) {data_attr = #cuf.cuda<device>} : !fir.array<3xi32>
+ func.func @_QMmtestsPg1() attributes {cuf.proc_attr = #cuf.cuda_proc<global>} {
+ %0 = fir.address_of(@_QMmtestsEdev) : !fir.ref<!fir.array<3xi32>>
+ return
+ }
+}
+
+// UNIFIED: gpu.module @cuda_device_mod
+// UNIFIED: fir.global @_QMmtestsEdev(dense<[1, 2, 3]> : tensor<3xi32>) {data_attr = #cuf.cuda<device>} : !fir.array<3xi32>
diff --git a/flang/test/Fir/CUDA/cuda-unified-module-global.f90 b/flang/test/Fir/CUDA/cuda-unified-module-global.f90
new file mode 100644
index 0000000000000..fc34c9c4686e6
--- /dev/null
+++ b/flang/test/Fir/CUDA/cuda-unified-module-global.f90
@@ -0,0 +1,49 @@
+// End-to-end check that under -gpu=mem:unified, a plain host module-scope
+// variable referenced from a global kernel:
+// 1. is mirrored into the GPU module by CUFDeviceGlobal as a no-body
+// external declaration (so PTX gets `.extern .global ...`); and
+// 2. is registered with the CUDA driver via
+// _FortranACUFRegisterExternalVariable (= __cudaRegisterHostVar) from
+// __cudaFortranConstructor, so the device-side symbol is mapped to
+// the host-resident storage at module-load time and HMM/ATS handles
+// migration.
+//
+// Pipeline: cuf-device-global with cuda-unified=true (clones the host
+// global into the GPU module as an external declaration), then
+// cuf-add-constructor with cuda-unified=true (emits the registration call
+// for the cloned global).
+
+// RUN: fir-opt --cuf-device-global="cuda-unified=true" --cuf-add-constructor="cuda-unified=true" %s | FileCheck %s
+
+module attributes {dlti.dl_spec = #dlti.dl_spec<i8 = dense<8> : vector<2xi64>, i16 = dense<16> : vector<2xi64>, i1 = dense<8> : vector<2xi64>, !llvm.ptr = dense<64> : vector<4xi64>, f80 = dense<128> : vector<2xi64>, i128 = dense<128> : vector<2xi64>, i64 = dense<64> : vector<2xi64>, !llvm.ptr<271> = dense<32> : vector<4xi64>, !llvm.ptr<272> = dense<64> : vector<4xi64>, f128 = dense<128> : vector<2xi64>, !llvm.ptr<270> = dense<32> : vector<4xi64>, f16 = dense<16> : vector<2xi64>, f64 = dense<64> : vector<2xi64>, i32 = dense<32> : vector<2xi64>, "dlti.stack_alignment" = 128 : i64, "dlti.endianness" = "little">, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-linux-gnu"} {
+ fir.global @_QMmtestsEm(dense<[1, 2, 3, 4, 5]> : tensor<5xi32>) : !fir.array<5xi32>
+
+ func.func @_QMmtestsPg1() attributes {cuf.proc_attr = #cuf.cuda_proc<global>} {
+ %0 = fir.address_of(@_QMmtestsEm) : !fir.ref<!fir.array<5xi32>>
+ return
+ }
+
+ gpu.module @cuda_device_mod {
+ gpu.func @_QMmtestsPg1() kernel {
+ gpu.return
+ }
+ }
+}
+
+// Host-side definition is preserved.
+// CHECK: fir.global @_QMmtestsEm(dense<[1, 2, 3, 4, 5]> : tensor<5xi32>) : !fir.array<5xi32>
+
+// GPU module gets an external declaration (no body, no init). PTX lowers
+// it as `.extern .global ...`; nvlink permits the extern because acclnk
+// is invoked with -unifiedmem -init=unified -cudalink. The constructor
+// below registers the host pointer via the CUDA driver.
+// CHECK: gpu.module @cuda_device_mod
+// CHECK: fir.global @_QMmtestsEm : !fir.array<5xi32>
+// CHECK-NOT: fir.global @_QMmtestsEm{{.*}}dense
+
+// Constructor registers the host pointer.
+// CHECK: llvm.func internal @__cudaFortranConstructor()
+// CHECK: cuf.register_module @cuda_device_mod -> !llvm.ptr
+// CHECK: fir.address_of(@_QMmtestsEm) : !fir.ref<!fir.array<5xi32>>
+// CHECK: fir.call @_FortranACUFRegisterExternalVariable
+// CHECK-NOT: fir.call @_FortranACUFInitModule
More information about the flang-commits
mailing list