[flang-commits] [flang] [flang][cuda] Place box value kernel args in managed memory (PR #197116)

Tue May 12 01:09:33 PDT 2026

llvmorg-github-actions[bot] wrote:




@llvm/pr-subscribers-flang-fir-hlfir

Author: khaki3

<details>
<summary>Changes</summary>

Example:
```fortran
type deviceArray
  integer, allocatable, dimension(:,:), device :: Arr
end type deviceArray
type(deviceArray), allocatable, dimension(:) :: DA

allocate(DA(2))
allocate(DA(1)%Arr(32,32))
call mykernel<<<1,32>>>(DA(1)%Arr, 32)  ! cudaErrorIllegalAddress
```

In this code, `DA(1)%Arr` is a device allocatable component inside a managed derived type. The compiler loads the descriptor, reboxes it on the host stack, and passes it to `cuf.kernel_launch`. Since `!fir.box` is lowered to a pointer in LLVM IR, the kernel receives a host-stack pointer it cannot dereference — causing `cudaErrorIllegalAddress`. The existing code only handled descriptors from global device variables.

**Fix:** For `!fir.box` value arguments in `CUFLaunchOpConversion`, allocate a managed descriptor via `_FortranACUFAllocDescriptor`, store the host box into it, pass the managed ref to the kernel, and free it after the launch.

---
Full diff: https://github.com/llvm/llvm-project/pull/197116.diff


2 Files Affected:

- (modified) flang/lib/Optimizer/Transforms/CUDA/CUFOpConversion.cpp (+58-4) 
- (modified) flang/test/Fir/CUDA/cuda-launch.fir (+31) 


``````````diff

diff --git a/flang/lib/Optimizer/Transforms/CUDA/CUFOpConversion.cpp b/flang/lib/Optimizer/Transforms/CUDA/CUFOpConversion.cpp
index b756b07457b74..03bda4b502b07 100644
--- a/flang/lib/Optimizer/Transforms/CUDA/CUFOpConversion.cpp
+++ b/flang/lib/Optimizer/Transforms/CUDA/CUFOpConversion.cpp
@@ -369,8 +369,11 @@ struct CUFLaunchOpConversion
   using OpRewritePattern::OpRewritePattern;
 
   CUFLaunchOpConversion(mlir::MLIRContext *context,
-                        const mlir::SymbolTable &symTab)
-      : OpRewritePattern(context), symTab{symTab} {}
+                        const mlir::SymbolTable &symTab,
+                        const mlir::DataLayout *dl,
+                        const fir::LLVMTypeConverter *converter)
+      : OpRewritePattern(context), symTab{symTab}, dl{dl},
+        converter{converter} {}
 
   mlir::LogicalResult
   matchAndRewrite(cuf::KernelLaunchOp op,
@@ -413,7 +416,10 @@ struct CUFLaunchOpConversion
       procAttr =
           funcOp->getAttrOfType<cuf::ProcAttributeAttr>(cuf::getProcAttrName());
     }
+    auto mod = op->getParentOfType<mlir::ModuleOp>();
+    fir::FirOpBuilder builder(rewriter, mod);
     llvm::SmallVector<mlir::Value> args;
+    llvm::SmallVector<mlir::Value> tempDescriptors;
     for (mlir::Value arg : op.getArgs()) {
       // If the argument is a global descriptor, make sure we pass the device
       // copy of this descriptor and not the host one.
@@ -433,6 +439,35 @@ struct CUFLaunchOpConversion
             }
           }
         }
+        // Box value arguments need to be placed in managed memory so
+        // the GPU kernel can access the descriptor. !fir.box is lowered
+        // to a pointer, so passing a host-stack box would give the
+        // kernel a host pointer it cannot dereference.
+        if (auto boxTy = mlir::dyn_cast<fir::BaseBoxType>(arg.getType())) {
+          auto refTy = fir::ReferenceType::get(boxTy);
+          mlir::func::FuncOp allocFunc =
+              fir::runtime::getRuntimeFunc<mkRTKey(CUFAllocDescriptor)>(
+                  loc, builder);
+          auto allocFTy = allocFunc.getFunctionType();
+          mlir::Value sourceFile =
+              fir::factory::locationToFilename(builder, loc);
+          mlir::Value sourceLine = fir::factory::locationToLineNo(
+              builder, loc, allocFTy.getInput(2));
+          mlir::Type structTy = converter->convertBoxTypeAsStruct(boxTy);
+          std::size_t boxSize = dl->getTypeSizeInBits(structTy) / 8;
+          mlir::Value sizeInBytes = builder.createIntegerConstant(
+              loc, builder.getIndexType(), boxSize);
+          llvm::SmallVector<mlir::Value> allocArgs{
+              fir::runtime::createArguments(builder, loc, allocFTy, sizeInBytes,
+                                            sourceFile, sourceLine)};
+          auto allocCall =
+              fir::CallOp::create(builder, loc, allocFunc, allocArgs);
+          auto managedRef =
+              builder.createConvert(loc, refTy, allocCall.getResult(0));
+          fir::StoreOp::create(builder, loc, arg, managedRef);
+          tempDescriptors.push_back(managedRef);
+          arg = managedRef;
+        }
       }
       args.push_back(arg);
     }
@@ -461,12 +496,30 @@ struct CUFLaunchOpConversion
       gpuLaunchOp->setAttr(cuf::getProcAttrName(),
                            cuf::ProcAttributeAttr::get(
                                op.getContext(), cuf::ProcAttribute::Global));
-    rewriter.replaceOp(op, gpuLaunchOp);
+    // Free temporary managed descriptors allocated for box value arguments.
+    if (!tempDescriptors.empty()) {
+      rewriter.setInsertionPointAfter(gpuLaunchOp);
+      for (mlir::Value desc : tempDescriptors) {
+        mlir::func::FuncOp freeFunc =
+            fir::runtime::getRuntimeFunc<mkRTKey(CUFFreeDescriptor)>(loc,
+                                                                     builder);
+        auto freeFTy = freeFunc.getFunctionType();
+        mlir::Value sourceFile = fir::factory::locationToFilename(builder, loc);
+        mlir::Value sourceLine =
+            fir::factory::locationToLineNo(builder, loc, freeFTy.getInput(2));
+        llvm::SmallVector<mlir::Value> freeArgs{fir::runtime::createArguments(
+            builder, loc, freeFTy, desc, sourceFile, sourceLine)};
+        fir::CallOp::create(builder, loc, freeFunc, freeArgs);
+      }
+    }
+    rewriter.eraseOp(op);
     return mlir::success();
   }
 
 private:
   const mlir::SymbolTable &symTab;
+  const mlir::DataLayout *dl;
+  const fir::LLVMTypeConverter *converter;
 };
 
 struct CUFSyncDescriptorOpConversion
@@ -560,7 +613,8 @@ void cuf::populateCUFToFIRConversionPatterns(
   patterns.insert<CUFSyncDescriptorOpConversion>(patterns.getContext());
   patterns.insert<CUFDataTransferOpConversion>(patterns.getContext(), symtab,
                                                &dl, &converter);
-  patterns.insert<CUFLaunchOpConversion>(patterns.getContext(), symtab);
+  patterns.insert<CUFLaunchOpConversion>(patterns.getContext(), symtab, &dl,
+                                         &converter);
 }
 
 void cuf::populateFIRCUFConversionPatterns(const mlir::SymbolTable &symtab,
diff --git a/flang/test/Fir/CUDA/cuda-launch.fir b/flang/test/Fir/CUDA/cuda-launch.fir
index 92db6ecaadc45..8269df9befdce 100644
--- a/flang/test/Fir/CUDA/cuda-launch.fir
+++ b/flang/test/Fir/CUDA/cuda-launch.fir
@@ -156,3 +156,34 @@ module attributes {gpu.container_module, dlti.dl_spec = #dlti.dl_spec<#dlti.dl_e
 // CHECK: %[[DECL_STREAM:.*]]:2 = hlfir.declare %[[STREAM]] {uniq_name = "_QMtest_callFhostEstream"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
 // CHECK: %[[TOKEN:.*]] = cuf.stream_cast %[[DECL_STREAM]]#0 : !fir.ref<i64>
 // CHECK: gpu.launch_func [%[[TOKEN]]] @cuda_device_mod::@_QMdevptrPtest
+
+// -----
+
+// Test that box value arguments to kernel launches are placed in managed memory.
+module attributes {gpu.container_module, dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>} {
+  gpu.module @cuda_device_mod {
+    gpu.func @_QMtestmePmykernel(%arg0: !fir.ref<!fir.box<!fir.array<?x?xi32>>>, %arg1: i32) kernel {
+      gpu.return
+    }
+  }
+  func.func @_QMtestmePmykernel(%arg0: !fir.box<!fir.array<?x?xi32>> {cuf.data_attr = #cuf.cuda<device>}, %arg1: i32) attributes {cuf.proc_attr = #cuf.cuda_proc<global>} {
+    return
+  }
+  func.func @_QQmain() {
+    %c1_i32 = arith.constant 1 : i32
+    %c32_i32 = arith.constant 32 : i32
+    %0 = fir.alloca !fir.box<!fir.heap<!fir.array<?x?xi32>>>
+    %1 = fir.load %0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>
+    %2 = fir.rebox %1 : (!fir.box<!fir.heap<!fir.array<?x?xi32>>>) -> !fir.box<!fir.array<?x?xi32>>
+    cuf.kernel_launch @_QMtestmePmykernel<<<%c1_i32, %c1_i32, %c1_i32, %c32_i32, %c1_i32, %c1_i32>>>(%2, %c32_i32) : (!fir.box<!fir.array<?x?xi32>>, i32)
+    return
+  }
+}
+
+// CHECK-LABEL: func.func @_QQmain()
+// CHECK: %[[BOX:.*]] = fir.rebox
+// CHECK: %[[ALLOC_DESC:.*]] = fir.call @_FortranACUFAllocDescriptor(
+// CHECK: %[[MANAGED_REF:.*]] = fir.convert %[[ALLOC_DESC]]
+// CHECK: fir.store %[[BOX]] to %[[MANAGED_REF]]
+// CHECK: gpu.launch_func @cuda_device_mod::@_QMtestmePmykernel {{.*}} args(%[[MANAGED_REF]] : !fir.ref<!fir.box<!fir.array<?x?xi32>>>, %{{.*}} : i32)
+// CHECK: fir.call @_FortranACUFFreeDescriptor(

``````````

</details>


https://github.com/llvm/llvm-project/pull/197116