[flang-commits] [flang] [flang][cuda] Place box value kernel args in managed memory (PR #197116)
via flang-commits
flang-commits at lists.llvm.org
Tue May 12 01:09:33 PDT 2026
llvmorg-github-actions[bot] wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-flang-fir-hlfir
Author: khaki3
<details>
<summary>Changes</summary>
Example:
```fortran
type deviceArray
integer, allocatable, dimension(:,:), device :: Arr
end type deviceArray
type(deviceArray), allocatable, dimension(:) :: DA
allocate(DA(2))
allocate(DA(1)%Arr(32,32))
call mykernel<<<1,32>>>(DA(1)%Arr, 32) ! cudaErrorIllegalAddress
```
In this code, `DA(1)%Arr` is a device allocatable component inside a managed derived type. The compiler loads the descriptor, reboxes it on the host stack, and passes it to `cuf.kernel_launch`. Since `!fir.box` is lowered to a pointer in LLVM IR, the kernel receives a host-stack pointer it cannot dereference — causing `cudaErrorIllegalAddress`. The existing code only handled descriptors from global device variables.
**Fix:** For `!fir.box` value arguments in `CUFLaunchOpConversion`, allocate a managed descriptor via `_FortranACUFAllocDescriptor`, store the host box into it, pass the managed ref to the kernel, and free it after the launch.
---
Full diff: https://github.com/llvm/llvm-project/pull/197116.diff
2 Files Affected:
- (modified) flang/lib/Optimizer/Transforms/CUDA/CUFOpConversion.cpp (+58-4)
- (modified) flang/test/Fir/CUDA/cuda-launch.fir (+31)
``````````diff
diff --git a/flang/lib/Optimizer/Transforms/CUDA/CUFOpConversion.cpp b/flang/lib/Optimizer/Transforms/CUDA/CUFOpConversion.cpp
index b756b07457b74..03bda4b502b07 100644
--- a/flang/lib/Optimizer/Transforms/CUDA/CUFOpConversion.cpp
+++ b/flang/lib/Optimizer/Transforms/CUDA/CUFOpConversion.cpp
@@ -369,8 +369,11 @@ struct CUFLaunchOpConversion
using OpRewritePattern::OpRewritePattern;
CUFLaunchOpConversion(mlir::MLIRContext *context,
- const mlir::SymbolTable &symTab)
- : OpRewritePattern(context), symTab{symTab} {}
+ const mlir::SymbolTable &symTab,
+ const mlir::DataLayout *dl,
+ const fir::LLVMTypeConverter *converter)
+ : OpRewritePattern(context), symTab{symTab}, dl{dl},
+ converter{converter} {}
mlir::LogicalResult
matchAndRewrite(cuf::KernelLaunchOp op,
@@ -413,7 +416,10 @@ struct CUFLaunchOpConversion
procAttr =
funcOp->getAttrOfType<cuf::ProcAttributeAttr>(cuf::getProcAttrName());
}
+ auto mod = op->getParentOfType<mlir::ModuleOp>();
+ fir::FirOpBuilder builder(rewriter, mod);
llvm::SmallVector<mlir::Value> args;
+ llvm::SmallVector<mlir::Value> tempDescriptors;
for (mlir::Value arg : op.getArgs()) {
// If the argument is a global descriptor, make sure we pass the device
// copy of this descriptor and not the host one.
@@ -433,6 +439,35 @@ struct CUFLaunchOpConversion
}
}
}
+ // Box value arguments need to be placed in managed memory so
+ // the GPU kernel can access the descriptor. !fir.box is lowered
+ // to a pointer, so passing a host-stack box would give the
+ // kernel a host pointer it cannot dereference.
+ if (auto boxTy = mlir::dyn_cast<fir::BaseBoxType>(arg.getType())) {
+ auto refTy = fir::ReferenceType::get(boxTy);
+ mlir::func::FuncOp allocFunc =
+ fir::runtime::getRuntimeFunc<mkRTKey(CUFAllocDescriptor)>(
+ loc, builder);
+ auto allocFTy = allocFunc.getFunctionType();
+ mlir::Value sourceFile =
+ fir::factory::locationToFilename(builder, loc);
+ mlir::Value sourceLine = fir::factory::locationToLineNo(
+ builder, loc, allocFTy.getInput(2));
+ mlir::Type structTy = converter->convertBoxTypeAsStruct(boxTy);
+ std::size_t boxSize = dl->getTypeSizeInBits(structTy) / 8;
+ mlir::Value sizeInBytes = builder.createIntegerConstant(
+ loc, builder.getIndexType(), boxSize);
+ llvm::SmallVector<mlir::Value> allocArgs{
+ fir::runtime::createArguments(builder, loc, allocFTy, sizeInBytes,
+ sourceFile, sourceLine)};
+ auto allocCall =
+ fir::CallOp::create(builder, loc, allocFunc, allocArgs);
+ auto managedRef =
+ builder.createConvert(loc, refTy, allocCall.getResult(0));
+ fir::StoreOp::create(builder, loc, arg, managedRef);
+ tempDescriptors.push_back(managedRef);
+ arg = managedRef;
+ }
}
args.push_back(arg);
}
@@ -461,12 +496,30 @@ struct CUFLaunchOpConversion
gpuLaunchOp->setAttr(cuf::getProcAttrName(),
cuf::ProcAttributeAttr::get(
op.getContext(), cuf::ProcAttribute::Global));
- rewriter.replaceOp(op, gpuLaunchOp);
+ // Free temporary managed descriptors allocated for box value arguments.
+ if (!tempDescriptors.empty()) {
+ rewriter.setInsertionPointAfter(gpuLaunchOp);
+ for (mlir::Value desc : tempDescriptors) {
+ mlir::func::FuncOp freeFunc =
+ fir::runtime::getRuntimeFunc<mkRTKey(CUFFreeDescriptor)>(loc,
+ builder);
+ auto freeFTy = freeFunc.getFunctionType();
+ mlir::Value sourceFile = fir::factory::locationToFilename(builder, loc);
+ mlir::Value sourceLine =
+ fir::factory::locationToLineNo(builder, loc, freeFTy.getInput(2));
+ llvm::SmallVector<mlir::Value> freeArgs{fir::runtime::createArguments(
+ builder, loc, freeFTy, desc, sourceFile, sourceLine)};
+ fir::CallOp::create(builder, loc, freeFunc, freeArgs);
+ }
+ }
+ rewriter.eraseOp(op);
return mlir::success();
}
private:
const mlir::SymbolTable &symTab;
+ const mlir::DataLayout *dl;
+ const fir::LLVMTypeConverter *converter;
};
struct CUFSyncDescriptorOpConversion
@@ -560,7 +613,8 @@ void cuf::populateCUFToFIRConversionPatterns(
patterns.insert<CUFSyncDescriptorOpConversion>(patterns.getContext());
patterns.insert<CUFDataTransferOpConversion>(patterns.getContext(), symtab,
&dl, &converter);
- patterns.insert<CUFLaunchOpConversion>(patterns.getContext(), symtab);
+ patterns.insert<CUFLaunchOpConversion>(patterns.getContext(), symtab, &dl,
+ &converter);
}
void cuf::populateFIRCUFConversionPatterns(const mlir::SymbolTable &symtab,
diff --git a/flang/test/Fir/CUDA/cuda-launch.fir b/flang/test/Fir/CUDA/cuda-launch.fir
index 92db6ecaadc45..8269df9befdce 100644
--- a/flang/test/Fir/CUDA/cuda-launch.fir
+++ b/flang/test/Fir/CUDA/cuda-launch.fir
@@ -156,3 +156,34 @@ module attributes {gpu.container_module, dlti.dl_spec = #dlti.dl_spec<#dlti.dl_e
// CHECK: %[[DECL_STREAM:.*]]:2 = hlfir.declare %[[STREAM]] {uniq_name = "_QMtest_callFhostEstream"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
// CHECK: %[[TOKEN:.*]] = cuf.stream_cast %[[DECL_STREAM]]#0 : !fir.ref<i64>
// CHECK: gpu.launch_func [%[[TOKEN]]] @cuda_device_mod::@_QMdevptrPtest
+
+// -----
+
+// Test that box value arguments to kernel launches are placed in managed memory.
+module attributes {gpu.container_module, dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>} {
+ gpu.module @cuda_device_mod {
+ gpu.func @_QMtestmePmykernel(%arg0: !fir.ref<!fir.box<!fir.array<?x?xi32>>>, %arg1: i32) kernel {
+ gpu.return
+ }
+ }
+ func.func @_QMtestmePmykernel(%arg0: !fir.box<!fir.array<?x?xi32>> {cuf.data_attr = #cuf.cuda<device>}, %arg1: i32) attributes {cuf.proc_attr = #cuf.cuda_proc<global>} {
+ return
+ }
+ func.func @_QQmain() {
+ %c1_i32 = arith.constant 1 : i32
+ %c32_i32 = arith.constant 32 : i32
+ %0 = fir.alloca !fir.box<!fir.heap<!fir.array<?x?xi32>>>
+ %1 = fir.load %0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>
+ %2 = fir.rebox %1 : (!fir.box<!fir.heap<!fir.array<?x?xi32>>>) -> !fir.box<!fir.array<?x?xi32>>
+ cuf.kernel_launch @_QMtestmePmykernel<<<%c1_i32, %c1_i32, %c1_i32, %c32_i32, %c1_i32, %c1_i32>>>(%2, %c32_i32) : (!fir.box<!fir.array<?x?xi32>>, i32)
+ return
+ }
+}
+
+// CHECK-LABEL: func.func @_QQmain()
+// CHECK: %[[BOX:.*]] = fir.rebox
+// CHECK: %[[ALLOC_DESC:.*]] = fir.call @_FortranACUFAllocDescriptor(
+// CHECK: %[[MANAGED_REF:.*]] = fir.convert %[[ALLOC_DESC]]
+// CHECK: fir.store %[[BOX]] to %[[MANAGED_REF]]
+// CHECK: gpu.launch_func @cuda_device_mod::@_QMtestmePmykernel {{.*}} args(%[[MANAGED_REF]] : !fir.ref<!fir.box<!fir.array<?x?xi32>>>, %{{.*}} : i32)
+// CHECK: fir.call @_FortranACUFFreeDescriptor(
``````````
</details>
https://github.com/llvm/llvm-project/pull/197116
More information about the flang-commits
mailing list