[flang-commits] [flang] [flang][cuda] Don't allocate managed descriptors for OpenACC data-clause boxes (PR #201957)
via flang-commits
flang-commits at lists.llvm.org
Sun Jun 7 21:40:35 PDT 2026
https://github.com/khaki3 updated https://github.com/llvm/llvm-project/pull/201957
>From abe4316eb4c8b60276f628275b0eafd591710584 Mon Sep 17 00:00:00 2001
From: Kazuaki Matsumura <kmatsumura at nvidia.com>
Date: Sat, 6 Jun 2026 20:56:32 -0700
Subject: [PATCH] [flang][cuda] Don't allocate managed descriptors for OpenACC
data-clause boxes
When lowering fircg.ext_embox/ext_rebox, the resulting Fortran descriptor is
placed in managed memory (via _FortranACUFAllocDescriptor) whenever
isDeviceAllocation() is true, so that device code can access it.
However, when the box result is consumed only by OpenACC data-clause
operations (acc dialect), the descriptor is host metadata: the data-clause
result carries the device data and the box itself does not need to be
device-accessible. Allocating it in managed memory is then unnecessary and
leaks it (the descriptor is never freed), which can leave a stale descriptor
behind when its address is later reused.
Add isUsedByOpenACCDataClause and skip managed descriptor allocation for a
device-allocated box that has at least one user in the acc dialect (such as a
data-clause op). Boxes used by a GPU kernel launch are unaffected.
---
flang/lib/Optimizer/CodeGen/CodeGen.cpp | 26 ++++++++++++++---
flang/test/Fir/CUDA/cuda-code-gen.mlir | 39 +++++++++++++++++++++++++
2 files changed, 61 insertions(+), 4 deletions(-)
diff --git a/flang/lib/Optimizer/CodeGen/CodeGen.cpp b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
index 3cb66679d3daf..43aeada16c3f5 100644
--- a/flang/lib/Optimizer/CodeGen/CodeGen.cpp
+++ b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
@@ -2127,6 +2127,22 @@ static bool isUsedByGPULaunchFunc(mlir::Value val) {
return false;
}
+/// Return true if at least one user of \p val is an op from the `acc` dialect
+/// (e.g. `acc.present`, `acc.copyin`, `acc.create`); other users are ignored.
+///
+/// Such a box is host-side descriptor metadata for the OpenACC data clause: the
+/// data-clause result carries the device data, so the box itself does not need
+/// to be device-accessible. It must therefore NOT be placed in managed memory
+/// even when the data it describes is device-resident: a managed descriptor
+/// created here is never freed and leaves a stale descriptor behind when its
+/// address is later reused.
+static bool isUsedByOpenACCDataClause(mlir::Value val) {
+ for (auto *user : val.getUsers())
+ if (mlir::isa_and_nonnull<mlir::acc::OpenACCDialect>(user->getDialect()))
+ return true;
+ return false;
+}
+
static bool isDeviceAllocation(mlir::Value val, mlir::Value adaptorVal) {
if (val.getDefiningOp() &&
val.getDefiningOp()->getParentOfType<mlir::gpu::GPUModuleOp>())
@@ -2369,8 +2385,9 @@ struct XEmboxOpConversion : public EmboxCommonConversion<fir::cg::XEmboxOp> {
if (fir::isDerivedTypeWithLenParams(boxTy))
TODO(loc, "fir.embox codegen of derived with length parameters");
bool needsDeviceAlloc =
- isDeviceAllocation(xbox.getMemref(), adaptor.getMemref()) ||
- isUsedByGPULaunchFunc(xbox);
+ isUsedByGPULaunchFunc(xbox) ||
+ (isDeviceAllocation(xbox.getMemref(), adaptor.getMemref()) &&
+ !isUsedByOpenACCDataClause(xbox));
mlir::Value result = placeInMemoryIfNotGlobalInit(rewriter, loc, boxTy,
dest, needsDeviceAlloc);
rewriter.replaceOp(xbox, result);
@@ -2489,8 +2506,9 @@ struct XReboxOpConversion : public EmboxCommonConversion<fir::cg::XReboxOp> {
}
dest = insertBaseAddress(rewriter, loc, dest, base);
bool needsDeviceAlloc =
- isDeviceAllocation(rebox.getBox(), adaptor.getBox()) ||
- isUsedByGPULaunchFunc(rebox);
+ isUsedByGPULaunchFunc(rebox) ||
+ (isDeviceAllocation(rebox.getBox(), adaptor.getBox()) &&
+ !isUsedByOpenACCDataClause(rebox));
mlir::Value result = placeInMemoryIfNotGlobalInit(
rewriter, rebox.getLoc(), destBoxTy, dest, needsDeviceAlloc);
rewriter.replaceOp(rebox, result);
diff --git a/flang/test/Fir/CUDA/cuda-code-gen.mlir b/flang/test/Fir/CUDA/cuda-code-gen.mlir
index 6d301ce0e8b76..d1e8154b719ab 100644
--- a/flang/test/Fir/CUDA/cuda-code-gen.mlir
+++ b/flang/test/Fir/CUDA/cuda-code-gen.mlir
@@ -457,3 +457,42 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<i64, dense<64> :
// CHECK: %[[BOX:.*]] = builtin.unrealized_conversion_cast %[[DESC]] : !llvm.ptr to !fir.box<!fir.array<?x?xi32>>
// CHECK: llvm.store %{{.*}}, %[[DESC]]
// CHECK: gpu.launch_func {{.*}} args(%[[BOX]] : !fir.box<!fir.array<?x?xi32>>)
+
+// -----
+
+// A rebox of a device-resident global (isDeviceAllocation == true) whose result
+// is consumed by an OpenACC data-clause op (here acc.present) must NOT be placed
+// in managed memory. The box is host-side descriptor metadata for the data
+// clause; a managed descriptor created here is never freed and leaves a stale
+// descriptor behind when its address is reused. The descriptor storage should
+// therefore be a plain stack alloca and no _FortranACUFAllocDescriptor call
+// should be emitted for it.
+
+module attributes {gpu.container_module} {
+ fir.global @_QMm1Eda {data_attr = #cuf.cuda<device>} : !fir.box<!fir.heap<!fir.array<?x?xf32>>> {
+ %c0 = arith.constant 0 : index
+ %0 = fir.zero_bits !fir.heap<!fir.array<?x?xf32>>
+ %1 = fircg.ext_embox %0(%c0, %c0) {allocator_idx = 2 : i32} : (!fir.heap<!fir.array<?x?xf32>>, index, index) -> !fir.box<!fir.heap<!fir.array<?x?xf32>>>
+ fir.has_value %1 : !fir.box<!fir.heap<!fir.array<?x?xf32>>>
+ }
+ func.func @_QQmain() {
+ %0 = fir.address_of(@_QMm1Eda) : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>
+ %1 = fir.load %0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>
+ %2 = fircg.ext_rebox %1 : (!fir.box<!fir.heap<!fir.array<?x?xf32>>>) -> !fir.box<!fir.array<?x?xf32>>
+ %3 = acc.present var(%2 : !fir.box<!fir.array<?x?xf32>>) -> !fir.box<!fir.array<?x?xf32>> {name = "uf"}
+ acc.delete accVar(%3 : !fir.box<!fir.array<?x?xf32>>) {dataClause = #acc<data_clause acc_present>, name = "uf"}
+ return
+ }
+ gpu.module @cuda_device_mod {
+ fir.global @_QMm1Eda {data_attr = #cuf.cuda<device>} : !fir.box<!fir.heap<!fir.array<?x?xf32>>> {
+ %c0 = arith.constant 0 : index
+ %0 = fir.zero_bits !fir.heap<!fir.array<?x?xf32>>
+ %1 = fircg.ext_embox %0(%c0, %c0) {allocator_idx = 2 : i32} : (!fir.heap<!fir.array<?x?xf32>>, index, index) -> !fir.box<!fir.heap<!fir.array<?x?xf32>>>
+ fir.has_value %1 : !fir.box<!fir.heap<!fir.array<?x?xf32>>>
+ }
+ }
+}
+
+// CHECK-LABEL: llvm.func @_QQmain()
+// CHECK: llvm.alloca {{.*}} x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)>
+// CHECK-NOT: llvm.call @_FortranACUFAllocDescriptor
More information about the flang-commits
mailing list