[flang-commits] [flang] b542c92 - [flang][CUDA] Allocate converted kernel descriptors in device-accessible storage (#201950)

Sat Jun 6 16:38:27 PDT 2026

Author: Zhen Wang
Date: 2026-06-06T16:38:22-07:00
New Revision: b542c92494b6bd156bacedca2d24fea2215f3792

URL: https://github.com/llvm/llvm-project/commit/b542c92494b6bd156bacedca2d24fea2215f3792
DIFF: https://github.com/llvm/llvm-project/commit/b542c92494b6bd156bacedca2d24fea2215f3792.diff

LOG: [flang][CUDA] Allocate converted kernel descriptors in device-accessible storage (#201950)

Fix CUDA descriptor lowering when an `fir.embox` result reaches a
`gpu.launch_func` through an intermediate `fir.convert`.

CodeGen previously failed to recognize this use chain and could place
the descriptor in host stack storage. Since CUDA kernels may dereference
assumed-shape descriptors on the device, such descriptors must be
allocated through the CUDA descriptor allocation path. Teach the
GPU-launch-use check to look through `fir.convert` so these descriptors
are lowered with `_FortranACUFAllocDescriptor`.

Also add a regression test for the `fir.embox -> fir.convert ->
gpu.launch_func` case.

Added: 
    

Modified: 
    flang/lib/Optimizer/CodeGen/CodeGen.cpp
    flang/test/Fir/CUDA/cuda-code-gen.mlir

Removed: 
    


################################################################################
diff  --git a/flang/lib/Optimizer/CodeGen/CodeGen.cpp b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
index 6b1acba393170..3cb66679d3daf 100644

--- a/flang/lib/Optimizer/CodeGen/CodeGen.cpp
+++ b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
@@ -65,6 +65,7 @@
 #include "mlir/Target/LLVMIR/Import.h"
 #include "mlir/Target/LLVMIR/ModuleTranslation.h"
 #include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/TypeSwitch.h"
 #include "llvm/Support/CommandLine.h"
 
@@ -1580,6 +1581,10 @@ genCUFAllocDescriptor(mlir::Location loc,
       .getResult();
 }
 
+static bool isUsedByGPULaunchFunc(mlir::Value val);
+
+static bool isDeviceAllocation(mlir::Value val, mlir::Value adaptorVal);
+
 /// Get the address of the type descriptor global variable that was created by
 /// lowering for derived type \p recType.
 template <typename ModOpTy>
@@ -2092,8 +2097,11 @@ struct EmboxOpConversion : public EmboxCommonConversion<fir::EmboxOp> {
            "fir.embox codegen of derived with length parameters");
       return mlir::failure();
     }
-    auto result =
-        placeInMemoryIfNotGlobalInit(rewriter, embox.getLoc(), boxTy, dest);
+    bool needsDeviceAlloc =
+        isDeviceAllocation(embox.getMemref(), adaptor.getMemref()) ||
+        isUsedByGPULaunchFunc(embox);
+    auto result = placeInMemoryIfNotGlobalInit(rewriter, embox.getLoc(), boxTy,
+                                               dest, needsDeviceAlloc);
     rewriter.replaceOp(embox, result);
     return mlir::success();
   }
@@ -2102,9 +2110,20 @@ struct EmboxOpConversion : public EmboxCommonConversion<fir::EmboxOp> {
 /// Return true if \p val, possibly through fir.convert ops, is used by a
 /// gpu.launch_func operation, so the descriptor must be device-accessible.
 static bool isUsedByGPULaunchFunc(mlir::Value val) {
-  for (auto *user : val.getUsers())
-    if (mlir::isa<mlir::gpu::LaunchFuncOp>(user))
-      return true;
+  llvm::SmallPtrSet<mlir::Value, 4> visited;
+  llvm::SmallVector<mlir::Value, 4> worklist{val};
+  while (!worklist.empty()) {
+    mlir::Value current = worklist.pop_back_val();
+    if (!visited.insert(current).second)
+      continue;
+
+    for (auto *user : current.getUsers()) {
+      if (mlir::isa<mlir::gpu::LaunchFuncOp>(user))
+        return true;
+      if (auto convert = mlir::dyn_cast<fir::ConvertOp>(user))
+        worklist.push_back(convert.getResult());
+    }
+  }
   return false;
 }
 

diff  --git a/flang/test/Fir/CUDA/cuda-code-gen.mlir b/flang/test/Fir/CUDA/cuda-code-gen.mlir
index cdaa775b5b49c..6d301ce0e8b76 100644
--- a/flang/test/Fir/CUDA/cuda-code-gen.mlir
+++ b/flang/test/Fir/CUDA/cuda-code-gen.mlir
@@ -427,3 +427,33 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<f80, dense<128> :
 // CHECK-LABEL: llvm.func @_QQmain()
 // CHECK: llvm.call @_FortranACUFAllocDescriptor(
 // CHECK: gpu.launch_func @cuda_device_mod::@_QMtestmePmykernel
+
+// -----
+
+// Test that an embox whose converted result is passed to gpu.launch_func gets a
+// managed descriptor so the GPU kernel can access it.
+
+module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>, gpu.container_module} {
+  func.func @_QQmain() {
+    %c0_i32 = arith.constant 0 : i32
+    %c1 = arith.constant 1 : index
+    %c10 = arith.constant 10 : index
+    %c20 = arith.constant 20 : index
+    %base = fir.alloca !fir.array<10x20xi32>
+    %box = fircg.ext_embox %base(%c10, %c20) : (!fir.ref<!fir.array<10x20xi32>>, index, index) -> !fir.box<!fir.array<10x20xi32>>
+    %arg = fir.convert %box : (!fir.box<!fir.array<10x20xi32>>) -> !fir.box<!fir.array<?x?xi32>>
+    gpu.launch_func @cuda_device_mod::@kernel blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) dynamic_shared_memory_size %c0_i32 args(%arg : !fir.box<!fir.array<?x?xi32>>) {cuf.proc_attr = #cuf.cuda_proc<global>}
+    return
+  }
+  gpu.module @cuda_device_mod {
+    gpu.func @kernel(%arg0: !fir.box<!fir.array<?x?xi32>>) kernel {
+      gpu.return
+    }
+  }
+}
+
+// CHECK-LABEL: llvm.func @_QQmain()
+// CHECK: %[[DESC:.*]] = llvm.call @_FortranACUFAllocDescriptor(
+// CHECK: %[[BOX:.*]] = builtin.unrealized_conversion_cast %[[DESC]] : !llvm.ptr to !fir.box<!fir.array<?x?xi32>>
+// CHECK: llvm.store %{{.*}}, %[[DESC]]
+// CHECK: gpu.launch_func {{.*}} args(%[[BOX]] : !fir.box<!fir.array<?x?xi32>>)