[flang-commits] [flang] [flang][cuda] Don't allocate managed descriptors for OpenACC data-clause boxes (PR #201957)

Fri Jun 5 15:49:50 PDT 2026

https://github.com/khaki3 created https://github.com/llvm/llvm-project/pull/201957

Example:
```fortran
module m
  real(8), allocatable :: u(:,:,:,:)   ! device-resident allocatable
contains
  subroutine core(uf, n)
    integer :: n
    real(8) :: uf(n,n,n,5)
    !$acc kernels loop present(uf)      ! descriptor built for uf
    ...
  end subroutine
  subroutine wrap(n)
    call core(u, n)
  end subroutine
end module
```

In this code, the descriptor for `present(uf)` is only used by `acc` data-clause ops (the kernel gets the clause result, not the box), so it never needs to be device-accessible. But `isDeviceAllocation()` is true, so it's allocated via `_FortranACUFAllocDescriptor` in managed memory and never freed — leaking it and leaving stale descriptors when the address is reused.

Fix: allocate a descriptor in managed memory only when it genuinely needs to be device-accessible, i.e. when it is used by a CUDA Fortran kernel launch (`gpu.launch_func` with `cuf.proc_attr`), or when `isDeviceAllocation()` is true and the descriptor is not consumed by an `acc` data-clause op. Rename `isUsedByGPULaunchFunc` → `isUsedByCUFKernelLaunch` and add `isUsedByOpenACCDataClause`.

From 68f3694b26e87d2cd26233b4f4300934bd256666 Mon Sep 17 00:00:00 2001
From: Kazuaki Matsumura <kmatsumura at nvidia.com>
Date: Fri, 5 Jun 2026 15:32:06 -0700
Subject: [PATCH] [flang][cuda] Don't allocate managed descriptors for OpenACC
 data-clause boxes

When lowering fircg.ext_embox/ext_rebox, the resulting Fortran descriptor is
placed in managed memory (via _FortranACUFAllocDescriptor) whenever
isDeviceAllocation() is true, so that device code can access it.

However, when the box result is consumed only by OpenACC data-clause
operations (acc dialect), the descriptor is host metadata: the data-clause
result carries the device data and the box itself does not need to be
device-accessible. Allocating it in managed memory is then unnecessary and
leaks it (it is never freed), which can lead to stale descriptors when the
freed address is later reused.

Restrict managed descriptor allocation to descriptors that are genuinely
device-accessible: those used by a CUDA Fortran kernel launch
(gpu.launch_func carrying cuf.proc_attr, as set by CUFLaunchAttachAttr), or
device-allocated and not consumed by an acc data-clause op. Rename
isUsedByGPULaunchFunc to isUsedByCUFKernelLaunch (now gated on cuf.proc_attr)
and add isUsedByOpenACCDataClause.
---
 flang/lib/Optimizer/CodeGen/CodeGen.cpp | 46 ++++++++++++++++++++-----
 flang/test/Fir/CUDA/cuda-code-gen.mlir  | 46 +++++++++++++++++++++++--
 2 files changed, 80 insertions(+), 12 deletions(-)

diff --git a/flang/lib/Optimizer/CodeGen/CodeGen.cpp b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
index 6b1acba393170..cb77cde48fd55 100644
--- a/flang/lib/Optimizer/CodeGen/CodeGen.cpp
+++ b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
@@ -2099,11 +2099,37 @@ struct EmboxOpConversion : public EmboxCommonConversion<fir::EmboxOp> {
   }
 };
 
-/// Return true if any user of \p val is a gpu.launch_func operation,
-/// indicating the descriptor must be in device-accessible memory.
-static bool isUsedByGPULaunchFunc(mlir::Value val) {
+/// Return true if any user of \p val is a CUDA Fortran kernel launch
+/// (a gpu.launch_func carrying the `cuf.proc_attr` attribute set by
+/// CUFLaunchAttachAttr). For such launches the descriptor is passed by
+/// reference to the device kernel, so its storage must be device-accessible
+/// (managed) memory.
+///
+/// OpenACC kernel launches are also represented as gpu.launch_func at this
+/// point, but their box arguments are handled through the OpenACC data-clause
+/// lowering and do not require a device-accessible descriptor; forcing those
+/// into managed memory is unnecessary and leaks the descriptor (it is never
+/// freed). Hence only CUDA Fortran launches are considered here.
+static bool isUsedByCUFKernelLaunch(mlir::Value val) {
+  for (auto *user : val.getUsers())
+    if (auto launch = mlir::dyn_cast<mlir::gpu::LaunchFuncOp>(user))
+      if (launch->hasAttr(cuf::getProcAttrName()))
+        return true;
+  return false;
+}
+
+/// Return true if any user of \p val is an OpenACC data-clause operation (an op
+/// from the `acc` dialect, e.g. `acc.present`, `acc.copyin`, `acc.create`).
+///
+/// Such a box is host-side descriptor metadata for the OpenACC data clause: the
+/// data-clause result carries the device data, so the box itself does not need
+/// to be device-accessible. It must therefore NOT be placed in managed memory
+/// even when the data it describes is device-resident: a managed descriptor
+/// created here is never freed and leaves a stale descriptor behind when its
+/// address is later reused.
+static bool isUsedByOpenACCDataClause(mlir::Value val) {
   for (auto *user : val.getUsers())
-    if (mlir::isa<mlir::gpu::LaunchFuncOp>(user))
+    if (mlir::isa_and_nonnull<mlir::acc::OpenACCDialect>(user->getDialect()))
       return true;
   return false;
 }
@@ -2350,8 +2376,9 @@ struct XEmboxOpConversion : public EmboxCommonConversion<fir::cg::XEmboxOp> {
     if (fir::isDerivedTypeWithLenParams(boxTy))
       TODO(loc, "fir.embox codegen of derived with length parameters");
     bool needsDeviceAlloc =
-        isDeviceAllocation(xbox.getMemref(), adaptor.getMemref()) ||
-        isUsedByGPULaunchFunc(xbox);
+        isUsedByCUFKernelLaunch(xbox) ||
+        (isDeviceAllocation(xbox.getMemref(), adaptor.getMemref()) &&
+         !isUsedByOpenACCDataClause(xbox));
     mlir::Value result = placeInMemoryIfNotGlobalInit(rewriter, loc, boxTy,
                                                       dest, needsDeviceAlloc);
     rewriter.replaceOp(xbox, result);
@@ -2470,8 +2497,9 @@ struct XReboxOpConversion : public EmboxCommonConversion<fir::cg::XReboxOp> {
     }
     dest = insertBaseAddress(rewriter, loc, dest, base);
     bool needsDeviceAlloc =
-        isDeviceAllocation(rebox.getBox(), adaptor.getBox()) ||
-        isUsedByGPULaunchFunc(rebox);
+        isUsedByCUFKernelLaunch(rebox) ||
+        (isDeviceAllocation(rebox.getBox(), adaptor.getBox()) &&
+         !isUsedByOpenACCDataClause(rebox));
     mlir::Value result = placeInMemoryIfNotGlobalInit(
         rewriter, rebox.getLoc(), destBoxTy, dest, needsDeviceAlloc);
     rewriter.replaceOp(rebox, result);
@@ -3641,7 +3669,7 @@ struct LoadOpConversion : public fir::FIROpConversion<fir::LoadOp> {
               genCUFAllocDescriptor(loc, rewriter, mod, boxTy, lowerTy());
         }
       }
-      if (!newBoxStorage && isUsedByGPULaunchFunc(load)) {
+      if (!newBoxStorage && isUsedByCUFKernelLaunch(load)) {
         auto mod = load->getParentOfType<mlir::ModuleOp>();
         newBoxStorage =
             genCUFAllocDescriptor(loc, rewriter, mod, boxTy, lowerTy());
diff --git a/flang/test/Fir/CUDA/cuda-code-gen.mlir b/flang/test/Fir/CUDA/cuda-code-gen.mlir
index cdaa775b5b49c..cb4512676c306 100644
--- a/flang/test/Fir/CUDA/cuda-code-gen.mlir
+++ b/flang/test/Fir/CUDA/cuda-code-gen.mlir
@@ -402,8 +402,9 @@ module {
 
 // -----
 
-// Test that a rebox whose result is passed to gpu.launch_func gets a managed
-// descriptor so the GPU kernel can access it.
+// Test that a rebox whose result is passed to a CUDA Fortran kernel launch
+// (gpu.launch_func carrying cuf.proc_attr) gets a managed descriptor so the GPU
+// kernel can access it.
 
 module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>, gpu.container_module} {
   gpu.module @cuda_device_mod {
@@ -419,7 +420,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<f80, dense<128> :
     %0 = fir.alloca !fir.box<!fir.heap<!fir.array<?x?xi32>>>
     %1 = fir.load %0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>
     %2 = fircg.ext_rebox %1 : (!fir.box<!fir.heap<!fir.array<?x?xi32>>>) -> !fir.box<!fir.array<?x?xi32>>
-    gpu.launch_func @cuda_device_mod::@_QMtestmePmykernel blocks in (%c1, %c1, %c1) threads in (%c32, %c1, %c1) dynamic_shared_memory_size %c0_i32 args(%2 : !fir.box<!fir.array<?x?xi32>>, %c32_i32 : i32)
+    gpu.launch_func @cuda_device_mod::@_QMtestmePmykernel blocks in (%c1, %c1, %c1) threads in (%c32, %c1, %c1) dynamic_shared_memory_size %c0_i32 args(%2 : !fir.box<!fir.array<?x?xi32>>, %c32_i32 : i32) {cuf.proc_attr = #cuf.cuda_proc<global>}
     return
   }
 }
@@ -427,3 +428,42 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<f80, dense<128> :
 // CHECK-LABEL: llvm.func @_QQmain()
 // CHECK: llvm.call @_FortranACUFAllocDescriptor(
 // CHECK: gpu.launch_func @cuda_device_mod::@_QMtestmePmykernel
+
+// -----
+
+// A rebox of a device-resident global (isDeviceAllocation == true) whose result
+// is consumed by an OpenACC data-clause op (here acc.present) must NOT be placed
+// in managed memory. The box is host-side descriptor metadata for the data
+// clause; a managed descriptor created here is never freed and leaves a stale
+// descriptor behind when its address is reused. The descriptor storage should
+// therefore be a plain stack alloca and no _FortranACUFAllocDescriptor call
+// should be emitted for it.
+
+module attributes {gpu.container_module} {
+  fir.global @_QMm1Eda {data_attr = #cuf.cuda<device>} : !fir.box<!fir.heap<!fir.array<?x?xf32>>> {
+    %c0 = arith.constant 0 : index
+    %0 = fir.zero_bits !fir.heap<!fir.array<?x?xf32>>
+    %1 = fircg.ext_embox %0(%c0, %c0) {allocator_idx = 2 : i32} : (!fir.heap<!fir.array<?x?xf32>>, index, index) -> !fir.box<!fir.heap<!fir.array<?x?xf32>>>
+    fir.has_value %1 : !fir.box<!fir.heap<!fir.array<?x?xf32>>>
+  }
+  func.func @_QQmain() {
+    %0 = fir.address_of(@_QMm1Eda) : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>
+    %1 = fir.load %0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>
+    %2 = fircg.ext_rebox %1 : (!fir.box<!fir.heap<!fir.array<?x?xf32>>>) -> !fir.box<!fir.array<?x?xf32>>
+    %3 = acc.present var(%2 : !fir.box<!fir.array<?x?xf32>>) -> !fir.box<!fir.array<?x?xf32>> {name = "uf"}
+    acc.delete accVar(%3 : !fir.box<!fir.array<?x?xf32>>) {dataClause = #acc<data_clause acc_present>, name = "uf"}
+    return
+  }
+  gpu.module @cuda_device_mod {
+    fir.global @_QMm1Eda {data_attr = #cuf.cuda<device>} : !fir.box<!fir.heap<!fir.array<?x?xf32>>> {
+      %c0 = arith.constant 0 : index
+      %0 = fir.zero_bits !fir.heap<!fir.array<?x?xf32>>
+      %1 = fircg.ext_embox %0(%c0, %c0) {allocator_idx = 2 : i32} : (!fir.heap<!fir.array<?x?xf32>>, index, index) -> !fir.box<!fir.heap<!fir.array<?x?xf32>>>
+      fir.has_value %1 : !fir.box<!fir.heap<!fir.array<?x?xf32>>>
+    }
+  }
+}
+
+// CHECK-LABEL: llvm.func @_QQmain()
+// CHECK: llvm.alloca {{.*}} x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)>
+// CHECK-NOT: llvm.call @_FortranACUFAllocDescriptor