[flang-commits] [flang] [flang][cuda] Fix unregistered allocator (PR #195924)
via flang-commits
flang-commits at lists.llvm.org
Tue May 5 13:08:41 PDT 2026
llvmorg-github-actions[bot] wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-flang-fir-hlfir
Author: Valentin Clement (バレンタイン クレメン) (clementval)
<details>
<summary>Changes</summary>
#<!-- -->194290 changed how we register the constructor and introduced an early return that skips adding the constructor to `llvm.mlir.global_ctors`. This leads to a runtime failure because the allocators for CUDA Fortran are never registered.
---
Full diff: https://github.com/llvm/llvm-project/pull/195924.diff
2 Files Affected:
- (modified) flang/lib/Optimizer/Transforms/CUDA/CUFAddConstructor.cpp (+92-94)
- (modified) flang/test/Fir/CUDA/cuda-constructor-2.f90 (+3-1)
``````````diff
diff --git a/flang/lib/Optimizer/Transforms/CUDA/CUFAddConstructor.cpp b/flang/lib/Optimizer/Transforms/CUDA/CUFAddConstructor.cpp
index af6019651db81..248905fdc70d5 100644
--- a/flang/lib/Optimizer/Transforms/CUDA/CUFAddConstructor.cpp
+++ b/flang/lib/Optimizer/Transforms/CUDA/CUFAddConstructor.cpp
@@ -150,110 +150,108 @@ struct CUFAddConstructor
auto gpuMod = symTab.lookup<mlir::gpu::GPUModuleOp>(cudaDeviceModuleName);
if (gpuMod) {
mlir::SymbolTable gpuSymTable(gpuMod);
- if (!hasKernel(gpuMod) && !hasRegisteredGlobals(mod, gpuSymTable)) {
- // No kernels and no globals to register means no GPU binary to
- // register. This happens for host TUs that USE a kernel module but
- // don't define any device code.
- mlir::LLVM::ReturnOp::create(builder, loc, mlir::ValueRange{});
- return;
- }
-
- auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(ctx);
- auto registeredMod = cuf::RegisterModuleOp::create(
- builder, loc, llvmPtrTy,
- mlir::SymbolRefAttr::get(ctx, gpuMod.getName()));
+ bool needsModuleRegistration =
+ hasKernel(gpuMod) || hasRegisteredGlobals(mod, gpuSymTable);
+ if (needsModuleRegistration) {
+ auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(ctx);
+ auto registeredMod = cuf::RegisterModuleOp::create(
+ builder, loc, llvmPtrTy,
+ mlir::SymbolRefAttr::get(ctx, gpuMod.getName()));
- fir::LLVMTypeConverter typeConverter(mod, /*applyTBAA=*/false,
- /*forceUnifiedTBAATree=*/false, *dl);
- // Register kernels
- for (auto func : gpuMod.getOps<mlir::gpu::GPUFuncOp>()) {
- if (func.isKernel()) {
- auto kernelName = mlir::SymbolRefAttr::get(
- builder.getStringAttr(cudaDeviceModuleName),
- {mlir::SymbolRefAttr::get(builder.getContext(), func.getName())});
- cuf::RegisterKernelOp::create(builder, loc, kernelName,
- registeredMod);
+ fir::LLVMTypeConverter typeConverter(
+ mod, /*applyTBAA=*/false, /*forceUnifiedTBAATree=*/false, *dl);
+ // Register kernels
+ for (auto func : gpuMod.getOps<mlir::gpu::GPUFuncOp>()) {
+ if (func.isKernel()) {
+ auto kernelName = mlir::SymbolRefAttr::get(
+ builder.getStringAttr(cudaDeviceModuleName),
+ {mlir::SymbolRefAttr::get(builder.getContext(),
+ func.getName())});
+ cuf::RegisterKernelOp::create(builder, loc, kernelName,
+ registeredMod);
+ }
}
- }
- // Register variables
- bool hasNonAllocManagedGlobal = false;
- for (fir::GlobalOp globalOp : mod.getOps<fir::GlobalOp>()) {
- auto attr = globalOp.getDataAttrAttr();
- if (!attr)
- continue;
- if (!gpuSymTable.lookup(globalOp.getSymName()))
- continue;
+ // Register variables
+ bool hasNonAllocManagedGlobal = false;
+ for (fir::GlobalOp globalOp : mod.getOps<fir::GlobalOp>()) {
+ auto attr = globalOp.getDataAttrAttr();
+ if (!attr)
+ continue;
+ if (!gpuSymTable.lookup(globalOp.getSymName()))
+ continue;
- bool isNonAllocManagedGlobal =
- attr.getValue() == cuf::DataAttribute::Managed &&
- !mlir::isa<fir::BaseBoxType>(globalOp.getType());
+ bool isNonAllocManagedGlobal =
+ attr.getValue() == cuf::DataAttribute::Managed &&
+ !mlir::isa<fir::BaseBoxType>(globalOp.getType());
- mlir::func::FuncOp func;
- switch (attr.getValue()) {
- case cuf::DataAttribute::Device:
- case cuf::DataAttribute::Constant:
- case cuf::DataAttribute::Managed: {
- // Global variable name
- std::string gblNameStr = globalOp.getSymbol().getValue().str();
- gblNameStr += '\0';
- mlir::Value gblName = fir::getBase(
- fir::factory::createStringLiteral(builder, loc, gblNameStr));
+ mlir::func::FuncOp func;
+ switch (attr.getValue()) {
+ case cuf::DataAttribute::Device:
+ case cuf::DataAttribute::Constant:
+ case cuf::DataAttribute::Managed: {
+ // Global variable name
+ std::string gblNameStr = globalOp.getSymbol().getValue().str();
+ gblNameStr += '\0';
+ mlir::Value gblName = fir::getBase(
+ fir::factory::createStringLiteral(builder, loc, gblNameStr));
- // Global variable size
- std::optional<uint64_t> size;
- if (auto boxTy =
- mlir::dyn_cast<fir::BaseBoxType>(globalOp.getType())) {
- mlir::Type structTy = typeConverter.convertBoxTypeAsStruct(boxTy);
- size = dl->getTypeSizeInBits(structTy) / 8;
- }
- if (!size) {
- size = fir::getTypeSizeAndAlignmentOrCrash(loc, globalOp.getType(),
- *dl, kindMap)
- .first;
- }
- auto sizeVal = builder.createIntegerConstant(loc, idxTy, *size);
+ // Global variable size
+ std::optional<uint64_t> size;
+ if (auto boxTy =
+ mlir::dyn_cast<fir::BaseBoxType>(globalOp.getType())) {
+ mlir::Type structTy = typeConverter.convertBoxTypeAsStruct(boxTy);
+ size = dl->getTypeSizeInBits(structTy) / 8;
+ }
+ if (!size) {
+ size = fir::getTypeSizeAndAlignmentOrCrash(
+ loc, globalOp.getType(), *dl, kindMap)
+ .first;
+ }
+ auto sizeVal = builder.createIntegerConstant(loc, idxTy, *size);
- if (isNonAllocManagedGlobal) {
- hasNonAllocManagedGlobal = true;
- // Non-allocatable managed globals use pointer indirection:
- // a companion pointer in __nv_managed_data__ holds the unified
- // memory address, registered via __cudaRegisterManagedVar.
- fir::GlobalOp ptrGlobal =
- createManagedPointerGlobal(builder, mod, globalOp);
- func = fir::runtime::getRuntimeFunc<mkRTKey(
- CUFRegisterManagedVariable)>(loc, builder);
- auto fTy = func.getFunctionType();
- mlir::Value addr = fir::AddrOfOp::create(
- builder, loc, ptrGlobal.resultType(), ptrGlobal.getSymbol());
- llvm::SmallVector<mlir::Value> args{fir::runtime::createArguments(
- builder, loc, fTy, registeredMod, addr, gblName, sizeVal)};
- fir::CallOp::create(builder, loc, func, args);
- } else {
- func = fir::runtime::getRuntimeFunc<mkRTKey(CUFRegisterVariable)>(
- loc, builder);
- auto fTy = func.getFunctionType();
- mlir::Value addr = fir::AddrOfOp::create(
- builder, loc, globalOp.resultType(), globalOp.getSymbol());
- llvm::SmallVector<mlir::Value> args{fir::runtime::createArguments(
- builder, loc, fTy, registeredMod, addr, gblName, sizeVal)};
- fir::CallOp::create(builder, loc, func, args);
+ if (isNonAllocManagedGlobal) {
+ hasNonAllocManagedGlobal = true;
+ // Non-allocatable managed globals use pointer indirection:
+ // a companion pointer in __nv_managed_data__ holds the unified
+ // memory address, registered via __cudaRegisterManagedVar.
+ fir::GlobalOp ptrGlobal =
+ createManagedPointerGlobal(builder, mod, globalOp);
+ func = fir::runtime::getRuntimeFunc<mkRTKey(
+ CUFRegisterManagedVariable)>(loc, builder);
+ auto fTy = func.getFunctionType();
+ mlir::Value addr = fir::AddrOfOp::create(
+ builder, loc, ptrGlobal.resultType(), ptrGlobal.getSymbol());
+ llvm::SmallVector<mlir::Value> args{fir::runtime::createArguments(
+ builder, loc, fTy, registeredMod, addr, gblName, sizeVal)};
+ fir::CallOp::create(builder, loc, func, args);
+ } else {
+ func = fir::runtime::getRuntimeFunc<mkRTKey(CUFRegisterVariable)>(
+ loc, builder);
+ auto fTy = func.getFunctionType();
+ mlir::Value addr = fir::AddrOfOp::create(
+ builder, loc, globalOp.resultType(), globalOp.getSymbol());
+ llvm::SmallVector<mlir::Value> args{fir::runtime::createArguments(
+ builder, loc, fTy, registeredMod, addr, gblName, sizeVal)};
+ fir::CallOp::create(builder, loc, func, args);
+ }
+ } break;
+ default:
+ break;
}
- } break;
- default:
- break;
}
- }
- if (hasNonAllocManagedGlobal) {
- // Initialize the module after all variables are registered so the
- // runtime populates managed variable unified memory pointers.
- mlir::func::FuncOp initFunc =
- fir::runtime::getRuntimeFunc<mkRTKey(CUFInitModule)>(loc, builder);
- mlir::FunctionType initFTy = initFunc.getFunctionType();
- llvm::SmallVector<mlir::Value> initArgs{fir::runtime::createArguments(
- builder, loc, initFTy, registeredMod)};
- fir::CallOp::create(builder, loc, initFunc, initArgs);
+ if (hasNonAllocManagedGlobal) {
+ // Initialize the module after all variables are registered so the
+ // runtime populates managed variable unified memory pointers.
+ mlir::func::FuncOp initFunc =
+ fir::runtime::getRuntimeFunc<mkRTKey(CUFInitModule)>(loc,
+ builder);
+ mlir::FunctionType initFTy = initFunc.getFunctionType();
+ llvm::SmallVector<mlir::Value> initArgs{fir::runtime::createArguments(
+ builder, loc, initFTy, registeredMod)};
+ fir::CallOp::create(builder, loc, initFunc, initArgs);
+ }
}
}
mlir::LLVM::ReturnOp::create(builder, loc, mlir::ValueRange{});
diff --git a/flang/test/Fir/CUDA/cuda-constructor-2.f90 b/flang/test/Fir/CUDA/cuda-constructor-2.f90
index 439a05569520f..452b89bea6b80 100644
--- a/flang/test/Fir/CUDA/cuda-constructor-2.f90
+++ b/flang/test/Fir/CUDA/cuda-constructor-2.f90
@@ -151,7 +151,8 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr, dense<
// -----
// Test that when the gpu.module has no kernels (e.g., host TU that USEs
-// a kernel module), the constructor returns early without registering.
+// a kernel module), the constructor registers allocators but skips module
+// registration.
module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-linux-gnu"} {
@@ -170,6 +171,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr, dense<
// CHECK: llvm.func internal @__cudaFortranConstructor()
// CHECK-NEXT: llvm.call @_FortranACUFRegisterAllocator()
// CHECK-NEXT: llvm.return
+// CHECK: llvm.mlir.global_ctors ctors = [@__cudaFortranConstructor]
// -----
``````````
</details>
https://github.com/llvm/llvm-project/pull/195924
More information about the flang-commits
mailing list