[flang-commits] [flang] [flang][cuda] Don't allocate managed descriptors for OpenACC data-clause boxes (PR #201957)

Fri Jun 5 15:50:25 PDT 2026

llvmorg-github-actions[bot] wrote:




@llvm/pr-subscribers-flang-fir-hlfir

Author: khaki3

<details>
<summary>Changes</summary>

Example:
```fortran
module m
  real(8), allocatable :: u(:,:,:,:)   ! device-resident allocatable
contains
  subroutine core(uf, n)
    integer :: n
    real(8) :: uf(n,n,n,5)
    !$acc kernels loop present(uf)      ! descriptor built for uf
    ...
  end subroutine
  subroutine wrap(n)
    call core(u, n)
  end subroutine
end module
```

In this code, the descriptor built for `present(uf)` is used only by `acc` data-clause ops (the kernel receives the clause result, not the box), so it never needs to be device-accessible. However, because `isDeviceAllocation()` returns true for it, the descriptor is allocated in managed memory via `_FortranACUFAllocDescriptor` and is never freed. This leaks the descriptor and leaves a stale descriptor behind when its address is later reused.

Fix: allocate a descriptor in managed memory only when it truly needs to be device-accessible, i.e. when it is used by a CUDA Fortran kernel launch (`gpu.launch_func` with `cuf.proc_attr`), or when `isDeviceAllocation()` is true and the descriptor is not consumed by an `acc` data-clause op. Rename `isUsedByGPULaunchFunc` → `isUsedByCUFKernelLaunch` and add `isUsedByOpenACCDataClause`.

---
Full diff: https://github.com/llvm/llvm-project/pull/201957.diff


2 Files Affected:

- (modified) flang/lib/Optimizer/CodeGen/CodeGen.cpp (+37-9) 
- (modified) flang/test/Fir/CUDA/cuda-code-gen.mlir (+43-3) 


``````````diff

diff --git a/flang/lib/Optimizer/CodeGen/CodeGen.cpp b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
index 6b1acba393170..cb77cde48fd55 100644
--- a/flang/lib/Optimizer/CodeGen/CodeGen.cpp
+++ b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
@@ -2099,11 +2099,37 @@ struct EmboxOpConversion : public EmboxCommonConversion<fir::EmboxOp> {
   }
 };
 
-/// Return true if any user of \p val is a gpu.launch_func operation,
-/// indicating the descriptor must be in device-accessible memory.
-static bool isUsedByGPULaunchFunc(mlir::Value val) {
+/// Return true if any user of \p val is a CUDA Fortran kernel launch
+/// (a gpu.launch_func carrying the `cuf.proc_attr` attribute set by
+/// CUFLaunchAttachAttr). For such launches the descriptor is passed by
+/// reference to the device kernel, so its storage must be device-accessible
+/// (managed) memory.
+///
+/// OpenACC kernel launches are also represented as gpu.launch_func at this
+/// point, but their box arguments are handled through the OpenACC data-clause
+/// lowering and do not require a device-accessible descriptor; forcing those
+/// into managed memory is unnecessary and leaks the descriptor (it is never
+/// freed). Hence only CUDA Fortran launches are considered here.
+static bool isUsedByCUFKernelLaunch(mlir::Value val) {
+  for (auto *user : val.getUsers())
+    if (auto launch = mlir::dyn_cast<mlir::gpu::LaunchFuncOp>(user))
+      if (launch->hasAttr(cuf::getProcAttrName()))
+        return true;
+  return false;
+}
+
+/// Return true if any user of \p val is an OpenACC data-clause operation (an op
+/// from the `acc` dialect, e.g. `acc.present`, `acc.copyin`, `acc.create`).
+///
+/// Such a box is host-side descriptor metadata for the OpenACC data clause: the
+/// data-clause result carries the device data, so the box itself does not need
+/// to be device-accessible. It must therefore NOT be placed in managed memory
+/// even when the data it describes is device-resident: a managed descriptor
+/// created here is never freed and leaves a stale descriptor behind when its
+/// address is later reused.
+static bool isUsedByOpenACCDataClause(mlir::Value val) {
   for (auto *user : val.getUsers())
-    if (mlir::isa<mlir::gpu::LaunchFuncOp>(user))
+    if (mlir::isa_and_nonnull<mlir::acc::OpenACCDialect>(user->getDialect()))
       return true;
   return false;
 }
@@ -2350,8 +2376,9 @@ struct XEmboxOpConversion : public EmboxCommonConversion<fir::cg::XEmboxOp> {
     if (fir::isDerivedTypeWithLenParams(boxTy))
       TODO(loc, "fir.embox codegen of derived with length parameters");
     bool needsDeviceAlloc =
-        isDeviceAllocation(xbox.getMemref(), adaptor.getMemref()) ||
-        isUsedByGPULaunchFunc(xbox);
+        isUsedByCUFKernelLaunch(xbox) ||
+        (isDeviceAllocation(xbox.getMemref(), adaptor.getMemref()) &&
+         !isUsedByOpenACCDataClause(xbox));
     mlir::Value result = placeInMemoryIfNotGlobalInit(rewriter, loc, boxTy,
                                                       dest, needsDeviceAlloc);
     rewriter.replaceOp(xbox, result);
@@ -2470,8 +2497,9 @@ struct XReboxOpConversion : public EmboxCommonConversion<fir::cg::XReboxOp> {
     }
     dest = insertBaseAddress(rewriter, loc, dest, base);
     bool needsDeviceAlloc =
-        isDeviceAllocation(rebox.getBox(), adaptor.getBox()) ||
-        isUsedByGPULaunchFunc(rebox);
+        isUsedByCUFKernelLaunch(rebox) ||
+        (isDeviceAllocation(rebox.getBox(), adaptor.getBox()) &&
+         !isUsedByOpenACCDataClause(rebox));
     mlir::Value result = placeInMemoryIfNotGlobalInit(
         rewriter, rebox.getLoc(), destBoxTy, dest, needsDeviceAlloc);
     rewriter.replaceOp(rebox, result);
@@ -3641,7 +3669,7 @@ struct LoadOpConversion : public fir::FIROpConversion<fir::LoadOp> {
               genCUFAllocDescriptor(loc, rewriter, mod, boxTy, lowerTy());
         }
       }
-      if (!newBoxStorage && isUsedByGPULaunchFunc(load)) {
+      if (!newBoxStorage && isUsedByCUFKernelLaunch(load)) {
         auto mod = load->getParentOfType<mlir::ModuleOp>();
         newBoxStorage =
             genCUFAllocDescriptor(loc, rewriter, mod, boxTy, lowerTy());
diff --git a/flang/test/Fir/CUDA/cuda-code-gen.mlir b/flang/test/Fir/CUDA/cuda-code-gen.mlir
index cdaa775b5b49c..cb4512676c306 100644
--- a/flang/test/Fir/CUDA/cuda-code-gen.mlir
+++ b/flang/test/Fir/CUDA/cuda-code-gen.mlir
@@ -402,8 +402,9 @@ module {
 
 // -----
 
-// Test that a rebox whose result is passed to gpu.launch_func gets a managed
-// descriptor so the GPU kernel can access it.
+// Test that a rebox whose result is passed to a CUDA Fortran kernel launch
+// (gpu.launch_func carrying cuf.proc_attr) gets a managed descriptor so the GPU
+// kernel can access it.
 
 module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>, gpu.container_module} {
   gpu.module @cuda_device_mod {
@@ -419,7 +420,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<f80, dense<128> :
     %0 = fir.alloca !fir.box<!fir.heap<!fir.array<?x?xi32>>>
     %1 = fir.load %0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>
     %2 = fircg.ext_rebox %1 : (!fir.box<!fir.heap<!fir.array<?x?xi32>>>) -> !fir.box<!fir.array<?x?xi32>>
-    gpu.launch_func @cuda_device_mod::@_QMtestmePmykernel blocks in (%c1, %c1, %c1) threads in (%c32, %c1, %c1) dynamic_shared_memory_size %c0_i32 args(%2 : !fir.box<!fir.array<?x?xi32>>, %c32_i32 : i32)
+    gpu.launch_func @cuda_device_mod::@_QMtestmePmykernel blocks in (%c1, %c1, %c1) threads in (%c32, %c1, %c1) dynamic_shared_memory_size %c0_i32 args(%2 : !fir.box<!fir.array<?x?xi32>>, %c32_i32 : i32) {cuf.proc_attr = #cuf.cuda_proc<global>}
     return
   }
 }
@@ -427,3 +428,42 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<f80, dense<128> :
 // CHECK-LABEL: llvm.func @_QQmain()
 // CHECK: llvm.call @_FortranACUFAllocDescriptor(
 // CHECK: gpu.launch_func @cuda_device_mod::@_QMtestmePmykernel
+
+// -----
+
+// A rebox of a device-resident global (isDeviceAllocation == true) whose result
+// is consumed by an OpenACC data-clause op (here acc.present) must NOT be placed
+// in managed memory. The box is host-side descriptor metadata for the data
+// clause; a managed descriptor created here is never freed and leaves a stale
+// descriptor behind when its address is reused. The descriptor storage should
+// therefore be a plain stack alloca and no _FortranACUFAllocDescriptor call
+// should be emitted for it.
+
+module attributes {gpu.container_module} {
+  fir.global @_QMm1Eda {data_attr = #cuf.cuda<device>} : !fir.box<!fir.heap<!fir.array<?x?xf32>>> {
+    %c0 = arith.constant 0 : index
+    %0 = fir.zero_bits !fir.heap<!fir.array<?x?xf32>>
+    %1 = fircg.ext_embox %0(%c0, %c0) {allocator_idx = 2 : i32} : (!fir.heap<!fir.array<?x?xf32>>, index, index) -> !fir.box<!fir.heap<!fir.array<?x?xf32>>>
+    fir.has_value %1 : !fir.box<!fir.heap<!fir.array<?x?xf32>>>
+  }
+  func.func @_QQmain() {
+    %0 = fir.address_of(@_QMm1Eda) : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>
+    %1 = fir.load %0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>
+    %2 = fircg.ext_rebox %1 : (!fir.box<!fir.heap<!fir.array<?x?xf32>>>) -> !fir.box<!fir.array<?x?xf32>>
+    %3 = acc.present var(%2 : !fir.box<!fir.array<?x?xf32>>) -> !fir.box<!fir.array<?x?xf32>> {name = "uf"}
+    acc.delete accVar(%3 : !fir.box<!fir.array<?x?xf32>>) {dataClause = #acc<data_clause acc_present>, name = "uf"}
+    return
+  }
+  gpu.module @cuda_device_mod {
+    fir.global @_QMm1Eda {data_attr = #cuf.cuda<device>} : !fir.box<!fir.heap<!fir.array<?x?xf32>>> {
+      %c0 = arith.constant 0 : index
+      %0 = fir.zero_bits !fir.heap<!fir.array<?x?xf32>>
+      %1 = fircg.ext_embox %0(%c0, %c0) {allocator_idx = 2 : i32} : (!fir.heap<!fir.array<?x?xf32>>, index, index) -> !fir.box<!fir.heap<!fir.array<?x?xf32>>>
+      fir.has_value %1 : !fir.box<!fir.heap<!fir.array<?x?xf32>>>
+    }
+  }
+}
+
+// CHECK-LABEL: llvm.func @_QQmain()
+// CHECK: llvm.alloca {{.*}} x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)>
+// CHECK-NOT: llvm.call @_FortranACUFAllocDescriptor

``````````

</details>


https://github.com/llvm/llvm-project/pull/201957