[flang-commits] [flang] [flang][cuda] Don't allocate managed descriptors for OpenACC data-clause boxes (PR #201957)
via flang-commits
flang-commits at lists.llvm.org
Fri Jun 5 15:50:25 PDT 2026
llvmorg-github-actions[bot] wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-flang-fir-hlfir
Author: khaki3
<details>
<summary>Changes</summary>
Example:
```fortran
module m
real(8), allocatable :: u(:,:,:,:) ! device-resident allocatable
contains
subroutine core(uf, n)
integer :: n
real(8) :: uf(n,n,n,5)
!$acc kernels loop present(uf) ! descriptor built for uf
...
end subroutine
subroutine wrap(n)
call core(u, n)
end subroutine
end module
```
In this code, the descriptor for `present(uf)` is only used by `acc` data-clause ops (the kernel gets the clause result, not the box), so it never needs to be device-accessible. However, because the data it describes is device-resident, `isDeviceAllocation()` is true, so the descriptor is allocated in managed memory via `_FortranACUFAllocDescriptor` and never freed. This leaks the descriptor and leaves stale descriptors behind when the address is reused.
Fix: allocate a descriptor in managed memory only when it truly needs to be device-accessible — that is, when it is used by a CUDA Fortran kernel launch (`gpu.launch_func` with `cuf.proc_attr`), or when it is device-allocated and not consumed by an `acc` data-clause op. Rename `isUsedByGPULaunchFunc` → `isUsedByCUFKernelLaunch` and add `isUsedByOpenACCDataClause`.
---
Full diff: https://github.com/llvm/llvm-project/pull/201957.diff
2 Files Affected:
- (modified) flang/lib/Optimizer/CodeGen/CodeGen.cpp (+37-9)
- (modified) flang/test/Fir/CUDA/cuda-code-gen.mlir (+43-3)
``````````diff
diff --git a/flang/lib/Optimizer/CodeGen/CodeGen.cpp b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
index 6b1acba393170..cb77cde48fd55 100644
--- a/flang/lib/Optimizer/CodeGen/CodeGen.cpp
+++ b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
@@ -2099,11 +2099,37 @@ struct EmboxOpConversion : public EmboxCommonConversion<fir::EmboxOp> {
}
};
-/// Return true if any user of \p val is a gpu.launch_func operation,
-/// indicating the descriptor must be in device-accessible memory.
-static bool isUsedByGPULaunchFunc(mlir::Value val) {
+/// Return true if any user of \p val is a CUDA Fortran kernel launch
+/// (a gpu.launch_func carrying the `cuf.proc_attr` attribute set by
+/// CUFLaunchAttachAttr). For such launches the descriptor is passed by
+/// reference to the device kernel, so its storage must be device-accessible
+/// (managed) memory.
+///
+/// OpenACC kernel launches are also represented as gpu.launch_func at this
+/// point, but their box arguments are handled through the OpenACC data-clause
+/// lowering and do not require a device-accessible descriptor; forcing those
+/// into managed memory is unnecessary and leaks the descriptor (it is never
+/// freed). Hence only CUDA Fortran launches are considered here.
+static bool isUsedByCUFKernelLaunch(mlir::Value val) {
+ for (auto *user : val.getUsers())
+ if (auto launch = mlir::dyn_cast<mlir::gpu::LaunchFuncOp>(user))
+ if (launch->hasAttr(cuf::getProcAttrName()))
+ return true;
+ return false;
+}
+
+/// Return true if any user of \p val is an OpenACC data-clause operation (an op
+/// from the `acc` dialect, e.g. `acc.present`, `acc.copyin`, `acc.create`).
+///
+/// Such a box is host-side descriptor metadata for the OpenACC data clause: the
+/// data-clause result carries the device data, so the box itself does not need
+/// to be device-accessible. It must therefore NOT be placed in managed memory
+/// even when the data it describes is device-resident: a managed descriptor
+/// created here is never freed and leaves a stale descriptor behind when its
+/// address is later reused.
+static bool isUsedByOpenACCDataClause(mlir::Value val) {
for (auto *user : val.getUsers())
- if (mlir::isa<mlir::gpu::LaunchFuncOp>(user))
+ if (mlir::isa_and_nonnull<mlir::acc::OpenACCDialect>(user->getDialect()))
return true;
return false;
}
@@ -2350,8 +2376,9 @@ struct XEmboxOpConversion : public EmboxCommonConversion<fir::cg::XEmboxOp> {
if (fir::isDerivedTypeWithLenParams(boxTy))
TODO(loc, "fir.embox codegen of derived with length parameters");
bool needsDeviceAlloc =
- isDeviceAllocation(xbox.getMemref(), adaptor.getMemref()) ||
- isUsedByGPULaunchFunc(xbox);
+ isUsedByCUFKernelLaunch(xbox) ||
+ (isDeviceAllocation(xbox.getMemref(), adaptor.getMemref()) &&
+ !isUsedByOpenACCDataClause(xbox));
mlir::Value result = placeInMemoryIfNotGlobalInit(rewriter, loc, boxTy,
dest, needsDeviceAlloc);
rewriter.replaceOp(xbox, result);
@@ -2470,8 +2497,9 @@ struct XReboxOpConversion : public EmboxCommonConversion<fir::cg::XReboxOp> {
}
dest = insertBaseAddress(rewriter, loc, dest, base);
bool needsDeviceAlloc =
- isDeviceAllocation(rebox.getBox(), adaptor.getBox()) ||
- isUsedByGPULaunchFunc(rebox);
+ isUsedByCUFKernelLaunch(rebox) ||
+ (isDeviceAllocation(rebox.getBox(), adaptor.getBox()) &&
+ !isUsedByOpenACCDataClause(rebox));
mlir::Value result = placeInMemoryIfNotGlobalInit(
rewriter, rebox.getLoc(), destBoxTy, dest, needsDeviceAlloc);
rewriter.replaceOp(rebox, result);
@@ -3641,7 +3669,7 @@ struct LoadOpConversion : public fir::FIROpConversion<fir::LoadOp> {
genCUFAllocDescriptor(loc, rewriter, mod, boxTy, lowerTy());
}
}
- if (!newBoxStorage && isUsedByGPULaunchFunc(load)) {
+ if (!newBoxStorage && isUsedByCUFKernelLaunch(load)) {
auto mod = load->getParentOfType<mlir::ModuleOp>();
newBoxStorage =
genCUFAllocDescriptor(loc, rewriter, mod, boxTy, lowerTy());
diff --git a/flang/test/Fir/CUDA/cuda-code-gen.mlir b/flang/test/Fir/CUDA/cuda-code-gen.mlir
index cdaa775b5b49c..cb4512676c306 100644
--- a/flang/test/Fir/CUDA/cuda-code-gen.mlir
+++ b/flang/test/Fir/CUDA/cuda-code-gen.mlir
@@ -402,8 +402,9 @@ module {
// -----
-// Test that a rebox whose result is passed to gpu.launch_func gets a managed
-// descriptor so the GPU kernel can access it.
+// Test that a rebox whose result is passed to a CUDA Fortran kernel launch
+// (gpu.launch_func carrying cuf.proc_attr) gets a managed descriptor so the GPU
+// kernel can access it.
module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>, gpu.container_module} {
gpu.module @cuda_device_mod {
@@ -419,7 +420,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<f80, dense<128> :
%0 = fir.alloca !fir.box<!fir.heap<!fir.array<?x?xi32>>>
%1 = fir.load %0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>
%2 = fircg.ext_rebox %1 : (!fir.box<!fir.heap<!fir.array<?x?xi32>>>) -> !fir.box<!fir.array<?x?xi32>>
- gpu.launch_func @cuda_device_mod::@_QMtestmePmykernel blocks in (%c1, %c1, %c1) threads in (%c32, %c1, %c1) dynamic_shared_memory_size %c0_i32 args(%2 : !fir.box<!fir.array<?x?xi32>>, %c32_i32 : i32)
+ gpu.launch_func @cuda_device_mod::@_QMtestmePmykernel blocks in (%c1, %c1, %c1) threads in (%c32, %c1, %c1) dynamic_shared_memory_size %c0_i32 args(%2 : !fir.box<!fir.array<?x?xi32>>, %c32_i32 : i32) {cuf.proc_attr = #cuf.cuda_proc<global>}
return
}
}
@@ -427,3 +428,42 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<f80, dense<128> :
// CHECK-LABEL: llvm.func @_QQmain()
// CHECK: llvm.call @_FortranACUFAllocDescriptor(
// CHECK: gpu.launch_func @cuda_device_mod::@_QMtestmePmykernel
+
+// -----
+
+// A rebox of a device-resident global (isDeviceAllocation == true) whose result
+// is consumed by an OpenACC data-clause op (here acc.present) must NOT be placed
+// in managed memory. The box is host-side descriptor metadata for the data
+// clause; a managed descriptor created here is never freed and leaves a stale
+// descriptor behind when its address is reused. The descriptor storage should
+// therefore be a plain stack alloca and no _FortranACUFAllocDescriptor call
+// should be emitted for it.
+
+module attributes {gpu.container_module} {
+ fir.global @_QMm1Eda {data_attr = #cuf.cuda<device>} : !fir.box<!fir.heap<!fir.array<?x?xf32>>> {
+ %c0 = arith.constant 0 : index
+ %0 = fir.zero_bits !fir.heap<!fir.array<?x?xf32>>
+ %1 = fircg.ext_embox %0(%c0, %c0) {allocator_idx = 2 : i32} : (!fir.heap<!fir.array<?x?xf32>>, index, index) -> !fir.box<!fir.heap<!fir.array<?x?xf32>>>
+ fir.has_value %1 : !fir.box<!fir.heap<!fir.array<?x?xf32>>>
+ }
+ func.func @_QQmain() {
+ %0 = fir.address_of(@_QMm1Eda) : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>
+ %1 = fir.load %0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>
+ %2 = fircg.ext_rebox %1 : (!fir.box<!fir.heap<!fir.array<?x?xf32>>>) -> !fir.box<!fir.array<?x?xf32>>
+ %3 = acc.present var(%2 : !fir.box<!fir.array<?x?xf32>>) -> !fir.box<!fir.array<?x?xf32>> {name = "uf"}
+ acc.delete accVar(%3 : !fir.box<!fir.array<?x?xf32>>) {dataClause = #acc<data_clause acc_present>, name = "uf"}
+ return
+ }
+ gpu.module @cuda_device_mod {
+ fir.global @_QMm1Eda {data_attr = #cuf.cuda<device>} : !fir.box<!fir.heap<!fir.array<?x?xf32>>> {
+ %c0 = arith.constant 0 : index
+ %0 = fir.zero_bits !fir.heap<!fir.array<?x?xf32>>
+ %1 = fircg.ext_embox %0(%c0, %c0) {allocator_idx = 2 : i32} : (!fir.heap<!fir.array<?x?xf32>>, index, index) -> !fir.box<!fir.heap<!fir.array<?x?xf32>>>
+ fir.has_value %1 : !fir.box<!fir.heap<!fir.array<?x?xf32>>>
+ }
+ }
+}
+
+// CHECK-LABEL: llvm.func @_QQmain()
+// CHECK: llvm.alloca {{.*}} x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)>
+// CHECK-NOT: llvm.call @_FortranACUFAllocDescriptor
``````````
</details>
https://github.com/llvm/llvm-project/pull/201957
More information about the flang-commits mailing list