[flang-commits] [flang] b542c92 - [flang][CUDA] Allocate converted kernel descriptors in device-accessible storage (#201950)
via flang-commits
flang-commits at lists.llvm.org
Sat Jun 6 16:38:27 PDT 2026
Author: Zhen Wang
Date: 2026-06-06T16:38:22-07:00
New Revision: b542c92494b6bd156bacedca2d24fea2215f3792
URL: https://github.com/llvm/llvm-project/commit/b542c92494b6bd156bacedca2d24fea2215f3792
DIFF: https://github.com/llvm/llvm-project/commit/b542c92494b6bd156bacedca2d24fea2215f3792.diff
LOG: [flang][CUDA] Allocate converted kernel descriptors in device-accessible storage (#201950)
Fix CUDA descriptor lowering when an `fir.embox` result reaches a
`gpu.launch_func` through an intermediate `fir.convert`.
CodeGen previously failed to recognize this use chain and could place
the descriptor in host stack storage. Since CUDA kernels may dereference
assumed-shape descriptors on the device, such descriptors must be
allocated through the CUDA descriptor allocation path. Teach the
GPU-launch-use check to look through `fir.convert` so these descriptors
are lowered with `_FortranACUFAllocDescriptor`.
Also adds a regression test for the `fir.embox -> fir.convert ->
gpu.launch_func` case.
Added:
Modified:
flang/lib/Optimizer/CodeGen/CodeGen.cpp
flang/test/Fir/CUDA/cuda-code-gen.mlir
Removed:
################################################################################
diff --git a/flang/lib/Optimizer/CodeGen/CodeGen.cpp b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
index 6b1acba393170..3cb66679d3daf 100644
--- a/flang/lib/Optimizer/CodeGen/CodeGen.cpp
+++ b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
@@ -65,6 +65,7 @@
#include "mlir/Target/LLVMIR/Import.h"
#include "mlir/Target/LLVMIR/ModuleTranslation.h"
#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/TypeSwitch.h"
#include "llvm/Support/CommandLine.h"
@@ -1580,6 +1581,10 @@ genCUFAllocDescriptor(mlir::Location loc,
.getResult();
}
+static bool isUsedByGPULaunchFunc(mlir::Value val);
+
+static bool isDeviceAllocation(mlir::Value val, mlir::Value adaptorVal);
+
/// Get the address of the type descriptor global variable that was created by
/// lowering for derived type \p recType.
template <typename ModOpTy>
@@ -2092,8 +2097,11 @@ struct EmboxOpConversion : public EmboxCommonConversion<fir::EmboxOp> {
"fir.embox codegen of derived with length parameters");
return mlir::failure();
}
- auto result =
- placeInMemoryIfNotGlobalInit(rewriter, embox.getLoc(), boxTy, dest);
+ bool needsDeviceAlloc =
+ isDeviceAllocation(embox.getMemref(), adaptor.getMemref()) ||
+ isUsedByGPULaunchFunc(embox);
+ auto result = placeInMemoryIfNotGlobalInit(rewriter, embox.getLoc(), boxTy,
+ dest, needsDeviceAlloc);
rewriter.replaceOp(embox, result);
return mlir::success();
}
@@ -2102,9 +2110,20 @@ struct EmboxOpConversion : public EmboxCommonConversion<fir::EmboxOp> {
/// Return true if \p val feeds a gpu.launch_func directly or via fir.convert,
/// indicating the descriptor must be in device-accessible memory.
static bool isUsedByGPULaunchFunc(mlir::Value val) {
- for (auto *user : val.getUsers())
- if (mlir::isa<mlir::gpu::LaunchFuncOp>(user))
- return true;
+ llvm::SmallPtrSet<mlir::Value, 4> visited;
+ llvm::SmallVector<mlir::Value, 4> worklist{val};
+ while (!worklist.empty()) {
+ mlir::Value current = worklist.pop_back_val();
+ if (!visited.insert(current).second)
+ continue;
+
+ for (auto *user : current.getUsers()) {
+ if (mlir::isa<mlir::gpu::LaunchFuncOp>(user))
+ return true;
+ if (auto convert = mlir::dyn_cast<fir::ConvertOp>(user))
+ worklist.push_back(convert.getResult());
+ }
+ }
return false;
}
diff --git a/flang/test/Fir/CUDA/cuda-code-gen.mlir b/flang/test/Fir/CUDA/cuda-code-gen.mlir
index cdaa775b5b49c..6d301ce0e8b76 100644
--- a/flang/test/Fir/CUDA/cuda-code-gen.mlir
+++ b/flang/test/Fir/CUDA/cuda-code-gen.mlir
@@ -427,3 +427,33 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<f80, dense<128> :
// CHECK-LABEL: llvm.func @_QQmain()
// CHECK: llvm.call @_FortranACUFAllocDescriptor(
// CHECK: gpu.launch_func @cuda_device_mod::@_QMtestmePmykernel
+
+// -----
+
+// Test that an embox whose converted result is passed to gpu.launch_func gets a
+// managed descriptor so the GPU kernel can access it.
+
+module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>, gpu.container_module} {
+ func.func @_QQmain() {
+ %c0_i32 = arith.constant 0 : i32
+ %c1 = arith.constant 1 : index
+ %c10 = arith.constant 10 : index
+ %c20 = arith.constant 20 : index
+ %base = fir.alloca !fir.array<10x20xi32>
+ %box = fircg.ext_embox %base(%c10, %c20) : (!fir.ref<!fir.array<10x20xi32>>, index, index) -> !fir.box<!fir.array<10x20xi32>>
+ %arg = fir.convert %box : (!fir.box<!fir.array<10x20xi32>>) -> !fir.box<!fir.array<?x?xi32>>
+ gpu.launch_func @cuda_device_mod::@kernel blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) dynamic_shared_memory_size %c0_i32 args(%arg : !fir.box<!fir.array<?x?xi32>>) {cuf.proc_attr = #cuf.cuda_proc<global>}
+ return
+ }
+ gpu.module @cuda_device_mod {
+ gpu.func @kernel(%arg0: !fir.box<!fir.array<?x?xi32>>) kernel {
+ gpu.return
+ }
+ }
+}
+
+// CHECK-LABEL: llvm.func @_QQmain()
+// CHECK: %[[DESC:.*]] = llvm.call @_FortranACUFAllocDescriptor(
+// CHECK: %[[BOX:.*]] = builtin.unrealized_conversion_cast %[[DESC]] : !llvm.ptr to !fir.box<!fir.array<?x?xi32>>
+// CHECK: llvm.store %{{.*}}, %[[DESC]]
+// CHECK: gpu.launch_func {{.*}} args(%[[BOX]] : !fir.box<!fir.array<?x?xi32>>)
More information about the flang-commits
mailing list