[flang-commits] [flang] [flang][cuda] Pass assumed-shape arrays of bind(c) attributes(global) kernels by base address (PR #199313)
via flang-commits
flang-commits at lists.llvm.org
Fri May 22 20:41:34 PDT 2026
llvmorg-github-actions[bot] wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-flang-fir-hlfir
Author: Zhen Wang (wangzpgi)
<details>
<summary>Changes</summary>
For a `bind(c) attributes(global)` kernel with an assumed-shape array dummy, the dummy is lowered as a CFI descriptor (`!fir.box<...>`). `cudaLaunchKernel` cannot deliver a descriptor to the device, so the C `__global__` reads a pointer to a 48-byte descriptor as `float *` and writes garbage.
Pass these dummies as a base address (`!fir.ref<!fir.array<?xT>>`) instead, matching what `dimension(*)` already does. Allocatable / pointer / polymorphic are unaffected.
Before — for `mykernel(d_arr, n) bind(c) attributes(global)` taking `real(c_float), dimension(:), device :: d_arr`:
```
cuf.kernel_launch @<!-- -->mykernel<<<...>>>(%box, %n) : (!fir.box<!fir.array<?xf32>>, i32)
```
After:
```
%data = fir.box_addr %src : (!fir.box<...>) -> !fir.ref<!fir.array<?xf32>>
cuf.kernel_launch @<!-- -->mykernel<<<...>>>(%data, %n) : (!fir.ref<!fir.array<?xf32>>, i32)
```
---
Full diff: https://github.com/llvm/llvm-project/pull/199313.diff
2 Files Affected:
- (modified) flang/lib/Lower/CallInterface.cpp (+23-2)
- (added) flang/test/Lower/CUDA/cuda-kernel-bindc-shape.cuf (+122)
``````````diff
diff --git a/flang/lib/Lower/CallInterface.cpp b/flang/lib/Lower/CallInterface.cpp
index e9059581c690a..14e45563ee96f 100644
--- a/flang/lib/Lower/CallInterface.cpp
+++ b/flang/lib/Lower/CallInterface.cpp
@@ -873,6 +873,12 @@ class Fortran::lower::CallInterfaceImpl {
void buildExplicitInterface(
const Fortran::evaluate::characteristics::Procedure &procedure) {
bool isBindC = procedure.IsBindC();
+ bool isCudaGlobalKernel =
+ procedure.cudaSubprogramAttrs &&
+ (*procedure.cudaSubprogramAttrs ==
+ Fortran::common::CUDASubprogramAttrs::Global ||
+ *procedure.cudaSubprogramAttrs ==
+ Fortran::common::CUDASubprogramAttrs::Grid_Global);
// Handle result
if (const std::optional<Fortran::evaluate::characteristics::FunctionResult>
&result = procedure.functionResult) {
@@ -899,7 +905,7 @@ class Fortran::lower::CallInterfaceImpl {
handleImplicitDummy(&argCharacteristics, dummy, entity);
else
handleExplicitDummy(&argCharacteristics, dummy, entity,
- isBindC);
+ isBindC, isCudaGlobalKernel);
},
[&](const Fortran::evaluate::characteristics::DummyProcedure
&dummy) {
@@ -1111,7 +1117,7 @@ class Fortran::lower::CallInterfaceImpl {
void handleExplicitDummy(
const DummyCharacteristics *characteristics,
const Fortran::evaluate::characteristics::DummyDataObject &obj,
- const FortranEntity &entity, bool isBindC) {
+ const FortranEntity &entity, bool isBindC, bool isCudaGlobalKernel) {
using Attrs = Fortran::evaluate::characteristics::DummyDataObject::Attr;
bool isValueAttr = false;
@@ -1171,6 +1177,21 @@ class Fortran::lower::CallInterfaceImpl {
attrs);
addPassedArg(PassEntityBy::MutableBox, entity, characteristics);
} else if (obj.IsPassedByDescriptor(isBindC)) {
+ // bind(c) attributes(global): pass assumed-shape arrays by base address
+ // (cudaLaunchKernel cannot deliver a CFI descriptor to the device).
+ using ShapeAttr = Fortran::evaluate::characteristics::TypeAndShape::Attr;
+ constexpr Fortran::evaluate::characteristics::TypeAndShape::Attrs
+ shapeOnlyDescriptor{ShapeAttr::AssumedShape, ShapeAttr::DeferredShape,
+ ShapeAttr::AssumedRank};
+ if (isBindC && isCudaGlobalKernel &&
+ (obj.type.attrs() & shapeOnlyDescriptor).any() &&
+ !obj.type.type().IsPolymorphic()) {
+ mlir::Type passType = fir::ReferenceType::get(type);
+ addFirOperand(passType, nextPassedArgPosition(), Property::BaseAddress,
+ attrs);
+ addPassedArg(PassEntityBy::BaseAddress, entity, characteristics);
+ return;
+ }
// Pass as fir.box or fir.class
addFirOperand(boxType, nextPassedArgPosition(), Property::Box, attrs);
addPassedArg(PassEntityBy::Box, entity, characteristics);
diff --git a/flang/test/Lower/CUDA/cuda-kernel-bindc-shape.cuf b/flang/test/Lower/CUDA/cuda-kernel-bindc-shape.cuf
new file mode 100644
index 0000000000000..0d4d48ae3571d
--- /dev/null
+++ b/flang/test/Lower/CUDA/cuda-kernel-bindc-shape.cuf
@@ -0,0 +1,122 @@
+! RUN: bbc -emit-hlfir -fcuda %s -o - | FileCheck %s
+
+! Test that bind(c) attributes(global) kernels with assumed-shape /
+! deferred-shape / assumed-rank array dummies are lowered to a raw base
+! address (!fir.ref<...>) instead of a CFI descriptor (!fir.box<...>).
+! cudaLaunchKernel cannot deliver a CFI descriptor to the device, so the
+! C-interop ABI of these arguments must be a plain pointer.
+
+module m
+contains
+
+ ! ---- Positive case 1: assumed-shape, 1-D ----------------------------
+ subroutine call_kernel_1d(d)
+ real, device :: d(:)
+ interface
+ attributes(global) subroutine k1d(a, n) bind(c, name='k1d')
+ use iso_c_binding
+ real(c_float), dimension(:), device :: a
+ integer, value :: n
+ end subroutine
+ end interface
+ call k1d<<<8, 128>>>(d, 1024)
+ end subroutine
+
+! CHECK-LABEL: func.func @_QMmPcall_kernel_1d
+! CHECK: cuf.kernel_launch @k1d<<<{{.*}}>>>(%{{.*}}, %{{.*}}) : (!fir.ref<!fir.array<?xf32>>, i32)
+
+ ! ---- Positive case 2: assumed-shape, 2-D ----------------------------
+ subroutine call_kernel_2d(d)
+ real, device :: d(:,:)
+ interface
+ attributes(global) subroutine k2d(a) bind(c, name='k2d')
+ use iso_c_binding
+ real(c_float), dimension(:,:), device :: a
+ end subroutine
+ end interface
+ call k2d<<<8, 128>>>(d)
+ end subroutine
+
+! CHECK-LABEL: func.func @_QMmPcall_kernel_2d
+! CHECK: cuf.kernel_launch @k2d<<<{{.*}}>>>(%{{.*}}) : (!fir.ref<!fir.array<?x?xf32>>)
+
+ ! ---- Positive case 3: grid_global -----------------------------------
+ subroutine call_kernel_grid_global(d)
+ real, device :: d(:)
+ interface
+ attributes(grid_global) subroutine kgg(a) bind(c, name='kgg')
+ use iso_c_binding
+ real(c_float), dimension(:), device :: a
+ end subroutine
+ end interface
+ call kgg<<<8, 128>>>(d)
+ end subroutine
+
+! CHECK-LABEL: func.func @_QMmPcall_kernel_grid_global
+! CHECK: cuf.kernel_launch @kgg<<<{{.*}}>>>(%{{.*}}) : (!fir.ref<!fir.array<?xf32>>)
+
+ ! ---- Negative case 1: attributes(global) without bind(c) -----------
+ ! Pure CUDA Fortran kernel; descriptor ABI preserved.
+ subroutine call_kernel_no_bindc(d)
+ real, device :: d(:)
+ interface
+ attributes(global) subroutine knobindc(a)
+ real, dimension(:), device :: a
+ end subroutine
+ end interface
+ call knobindc<<<8, 128>>>(d)
+ end subroutine
+
+! CHECK-LABEL: func.func @_QMmPcall_kernel_no_bindc
+! CHECK: cuf.kernel_launch @_QPknobindc<<<{{.*}}>>>(%{{.*}}) : (!fir.box<!fir.array<?xf32>>)
+
+ ! ---- Negative case 2: bind(c) but not attributes(global) ----------
+ ! Regular C-interop function (host); descriptor ABI preserved.
+ subroutine call_host_bindc(d)
+ real :: d(:)
+ interface
+ subroutine hostfn(a) bind(c, name='hostfn')
+ use iso_c_binding
+ real(c_float), dimension(:) :: a
+ end subroutine
+ end interface
+ call hostfn(d)
+ end subroutine
+
+! CHECK-LABEL: func.func @_QMmPcall_host_bindc
+! CHECK: fir.call @hostfn(%{{.*}}) {{.*}}: (!fir.box<!fir.array<?xf32>>) -> ()
+
+ ! ---- Negative case 3: bind(c) attributes(global) + dimension(*) -----
+ ! Already a base-address ABI; just confirm it didn't regress.
+ subroutine call_kernel_assumed_size(d)
+ real, device :: d(*)
+ interface
+ attributes(global) subroutine kstar(a) bind(c, name='kstar')
+ use iso_c_binding
+ real(c_float), dimension(*), device :: a
+ end subroutine
+ end interface
+ call kstar<<<8, 128>>>(d)
+ end subroutine
+
+! CHECK-LABEL: func.func @_QMmPcall_kernel_assumed_size
+! CHECK: cuf.kernel_launch @kstar<<<{{.*}}>>>(%{{.*}}) : (!fir.ref<!fir.array<?xf32>>)
+
+ ! ---- Negative case 4: allocatable assumed-shape -------------------
+ ! Allocatable is handled before the descriptor branch; remains MutableBox
+ ! at the kernel-launch site.
+ subroutine call_kernel_allocatable(d)
+ real, allocatable, device :: d(:)
+ interface
+ attributes(global) subroutine kalloc(a) bind(c, name='kalloc')
+ use iso_c_binding
+ real(c_float), dimension(:), allocatable, device :: a
+ end subroutine
+ end interface
+ call kalloc<<<8, 128>>>(d)
+ end subroutine
+
+! CHECK-LABEL: func.func @_QMmPcall_kernel_allocatable
+! CHECK: cuf.kernel_launch @kalloc<<<{{.*}}>>>(%{{.*}}) : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
+
+end module
``````````
</details>
https://github.com/llvm/llvm-project/pull/199313
More information about the flang-commits
mailing list