[flang-commits] [flang] [flang][cuda] Pass assumed-shape arrays of bind(c) attributes(global) kernels by base address (PR #199313)
Zhen Wang via flang-commits
flang-commits at lists.llvm.org
Sat May 23 10:23:55 PDT 2026
https://github.com/wangzpgi updated https://github.com/llvm/llvm-project/pull/199313
>From 21ddaedb3e1dae1b5938b4e0aa8f001b0b541302 Mon Sep 17 00:00:00 2001
From: Zhen Wang <zhenw at nvidia.com>
Date: Fri, 22 May 2026 20:32:16 -0700
Subject: [PATCH 1/3] Pass assumed-shape arrays of bind(c) attributes(global)
kernels by base address
---
flang/lib/Lower/CallInterface.cpp | 26 +++-
.../Lower/CUDA/cuda-kernel-bindc-shape.cuf | 122 ++++++++++++++++++
2 files changed, 146 insertions(+), 2 deletions(-)
create mode 100644 flang/test/Lower/CUDA/cuda-kernel-bindc-shape.cuf
diff --git a/flang/lib/Lower/CallInterface.cpp b/flang/lib/Lower/CallInterface.cpp
index e9059581c690a..9f57e72079577 100644
--- a/flang/lib/Lower/CallInterface.cpp
+++ b/flang/lib/Lower/CallInterface.cpp
@@ -873,6 +873,12 @@ class Fortran::lower::CallInterfaceImpl {
void buildExplicitInterface(
const Fortran::evaluate::characteristics::Procedure &procedure) {
bool isBindC = procedure.IsBindC();
+ bool isCudaGlobalKernel =
+ procedure.cudaSubprogramAttrs &&
+ (*procedure.cudaSubprogramAttrs ==
+ Fortran::common::CUDASubprogramAttrs::Global ||
+ *procedure.cudaSubprogramAttrs ==
+ Fortran::common::CUDASubprogramAttrs::Grid_Global);
// Handle result
if (const std::optional<Fortran::evaluate::characteristics::FunctionResult>
&result = procedure.functionResult) {
@@ -899,7 +905,7 @@ class Fortran::lower::CallInterfaceImpl {
handleImplicitDummy(&argCharacteristics, dummy, entity);
else
handleExplicitDummy(&argCharacteristics, dummy, entity,
- isBindC);
+ isBindC, isCudaGlobalKernel);
},
[&](const Fortran::evaluate::characteristics::DummyProcedure
&dummy) {
@@ -1111,7 +1117,7 @@ class Fortran::lower::CallInterfaceImpl {
void handleExplicitDummy(
const DummyCharacteristics *characteristics,
const Fortran::evaluate::characteristics::DummyDataObject &obj,
- const FortranEntity &entity, bool isBindC) {
+ const FortranEntity &entity, bool isBindC, bool isCudaGlobalKernel) {
using Attrs = Fortran::evaluate::characteristics::DummyDataObject::Attr;
bool isValueAttr = false;
@@ -1171,6 +1177,22 @@ class Fortran::lower::CallInterfaceImpl {
attrs);
addPassedArg(PassEntityBy::MutableBox, entity, characteristics);
} else if (obj.IsPassedByDescriptor(isBindC)) {
+ // bind(c) attributes(global): pass assumed-shape arrays by base address
+ // (cudaLaunchKernel cannot deliver a CFI descriptor to the device).
+ // corank > 0 is already TODO'd above, so no need to re-check here.
+ using ShapeAttr = Fortran::evaluate::characteristics::TypeAndShape::Attr;
+ constexpr Fortran::evaluate::characteristics::TypeAndShape::Attrs
+ shapeOnlyDescriptor{ShapeAttr::AssumedShape, ShapeAttr::DeferredShape,
+ ShapeAttr::AssumedRank};
+ if (isBindC && isCudaGlobalKernel &&
+ (obj.type.attrs() & shapeOnlyDescriptor).any() &&
+ !obj.type.type().IsPolymorphic()) {
+ mlir::Type passType = fir::ReferenceType::get(type);
+ addFirOperand(passType, nextPassedArgPosition(), Property::BaseAddress,
+ attrs);
+ addPassedArg(PassEntityBy::BaseAddress, entity, characteristics);
+ return;
+ }
// Pass as fir.box or fir.class
addFirOperand(boxType, nextPassedArgPosition(), Property::Box, attrs);
addPassedArg(PassEntityBy::Box, entity, characteristics);
diff --git a/flang/test/Lower/CUDA/cuda-kernel-bindc-shape.cuf b/flang/test/Lower/CUDA/cuda-kernel-bindc-shape.cuf
new file mode 100644
index 0000000000000..0d4d48ae3571d
--- /dev/null
+++ b/flang/test/Lower/CUDA/cuda-kernel-bindc-shape.cuf
@@ -0,0 +1,122 @@
+! RUN: bbc -emit-hlfir -fcuda %s -o - | FileCheck %s
+
+! Test that bind(c) attributes(global) kernels with assumed-shape /
+! deferred-shape / assumed-rank array dummies are lowered to a raw base
+! address (!fir.ref<...>) instead of a CFI descriptor (!fir.box<...>).
+! cudaLaunchKernel cannot deliver a CFI descriptor to the device, so the
+! C-interop ABI of these arguments must be a plain pointer.
+
+module m
+contains
+
+ ! ---- Positive case 1: assumed-shape, 1-D ----------------------------
+ subroutine call_kernel_1d(d)
+ real, device :: d(:)
+ interface
+ attributes(global) subroutine k1d(a, n) bind(c, name='k1d')
+ use iso_c_binding
+ real(c_float), dimension(:), device :: a
+ integer, value :: n
+ end subroutine
+ end interface
+ call k1d<<<8, 128>>>(d, 1024)
+ end subroutine
+
+! CHECK-LABEL: func.func @_QMmPcall_kernel_1d
+! CHECK: cuf.kernel_launch @k1d<<<{{.*}}>>>(%{{.*}}, %{{.*}}) : (!fir.ref<!fir.array<?xf32>>, i32)
+
+ ! ---- Positive case 2: assumed-shape, 2-D ----------------------------
+ subroutine call_kernel_2d(d)
+ real, device :: d(:,:)
+ interface
+ attributes(global) subroutine k2d(a) bind(c, name='k2d')
+ use iso_c_binding
+ real(c_float), dimension(:,:), device :: a
+ end subroutine
+ end interface
+ call k2d<<<8, 128>>>(d)
+ end subroutine
+
+! CHECK-LABEL: func.func @_QMmPcall_kernel_2d
+! CHECK: cuf.kernel_launch @k2d<<<{{.*}}>>>(%{{.*}}) : (!fir.ref<!fir.array<?x?xf32>>)
+
+ ! ---- Positive case 3: grid_global -----------------------------------
+ subroutine call_kernel_grid_global(d)
+ real, device :: d(:)
+ interface
+ attributes(grid_global) subroutine kgg(a) bind(c, name='kgg')
+ use iso_c_binding
+ real(c_float), dimension(:), device :: a
+ end subroutine
+ end interface
+ call kgg<<<8, 128>>>(d)
+ end subroutine
+
+! CHECK-LABEL: func.func @_QMmPcall_kernel_grid_global
+! CHECK: cuf.kernel_launch @kgg<<<{{.*}}>>>(%{{.*}}) : (!fir.ref<!fir.array<?xf32>>)
+
+ ! ---- Negative case 1: attributes(global) without bind(c) -----------
+ ! Pure CUDA Fortran kernel; descriptor ABI preserved.
+ subroutine call_kernel_no_bindc(d)
+ real, device :: d(:)
+ interface
+ attributes(global) subroutine knobindc(a)
+ real, dimension(:), device :: a
+ end subroutine
+ end interface
+ call knobindc<<<8, 128>>>(d)
+ end subroutine
+
+! CHECK-LABEL: func.func @_QMmPcall_kernel_no_bindc
+! CHECK: cuf.kernel_launch @_QPknobindc<<<{{.*}}>>>(%{{.*}}) : (!fir.box<!fir.array<?xf32>>)
+
+ ! ---- Negative case 2: bind(c) but not attributes(global) ----------
+ ! Regular C-interop function (host); descriptor ABI preserved.
+ subroutine call_host_bindc(d)
+ real :: d(:)
+ interface
+ subroutine hostfn(a) bind(c, name='hostfn')
+ use iso_c_binding
+ real(c_float), dimension(:) :: a
+ end subroutine
+ end interface
+ call hostfn(d)
+ end subroutine
+
+! CHECK-LABEL: func.func @_QMmPcall_host_bindc
+! CHECK: fir.call @hostfn(%{{.*}}) {{.*}}: (!fir.box<!fir.array<?xf32>>) -> ()
+
+ ! ---- Negative case 3: bind(c) attributes(global) + dimension(*) -----
+ ! Already a base-address ABI; just confirm it didn't regress.
+ subroutine call_kernel_assumed_size(d)
+ real, device :: d(*)
+ interface
+ attributes(global) subroutine kstar(a) bind(c, name='kstar')
+ use iso_c_binding
+ real(c_float), dimension(*), device :: a
+ end subroutine
+ end interface
+ call kstar<<<8, 128>>>(d)
+ end subroutine
+
+! CHECK-LABEL: func.func @_QMmPcall_kernel_assumed_size
+! CHECK: cuf.kernel_launch @kstar<<<{{.*}}>>>(%{{.*}}) : (!fir.ref<!fir.array<?xf32>>)
+
+ ! ---- Negative case 4: allocatable assumed-shape -------------------
+ ! Allocatable is handled before the descriptor branch; remains MutableBox
+ ! at the kernel-launch site.
+ subroutine call_kernel_allocatable(d)
+ real, allocatable, device :: d(:)
+ interface
+ attributes(global) subroutine kalloc(a) bind(c, name='kalloc')
+ use iso_c_binding
+ real(c_float), dimension(:), allocatable, device :: a
+ end subroutine
+ end interface
+ call kalloc<<<8, 128>>>(d)
+ end subroutine
+
+! CHECK-LABEL: func.func @_QMmPcall_kernel_allocatable
+! CHECK: cuf.kernel_launch @kalloc<<<{{.*}}>>>(%{{.*}}) : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
+
+end module
>From a00da371c511d0b5e155c0536f9ecdc8060e4969 Mon Sep 17 00:00:00 2001
From: Zhen Wang <zhenw at nvidia.com>
Date: Fri, 22 May 2026 20:40:23 -0700
Subject: [PATCH 2/3] update comment
---
flang/lib/Lower/CallInterface.cpp | 1 -
1 file changed, 1 deletion(-)
diff --git a/flang/lib/Lower/CallInterface.cpp b/flang/lib/Lower/CallInterface.cpp
index 9f57e72079577..14e45563ee96f 100644
--- a/flang/lib/Lower/CallInterface.cpp
+++ b/flang/lib/Lower/CallInterface.cpp
@@ -1179,7 +1179,6 @@ class Fortran::lower::CallInterfaceImpl {
} else if (obj.IsPassedByDescriptor(isBindC)) {
// bind(c) attributes(global): pass assumed-shape arrays by base address
// (cudaLaunchKernel cannot deliver a CFI descriptor to the device).
- // corank > 0 is already TODO'd above, so no need to re-check here.
using ShapeAttr = Fortran::evaluate::characteristics::TypeAndShape::Attr;
constexpr Fortran::evaluate::characteristics::TypeAndShape::Attrs
shapeOnlyDescriptor{ShapeAttr::AssumedShape, ShapeAttr::DeferredShape,
>From 56822cc76940ef76922c0ef13a89e2c7a7ff2b1e Mon Sep 17 00:00:00 2001
From: Zhen Wang <zhenw at nvidia.com>
Date: Sat, 23 May 2026 10:06:51 -0700
Subject: [PATCH 3/3] add doc, add test
---
flang/docs/CUDA.md | 51 +++++++++++++++++++
flang/docs/index.md | 1 +
.../Lower/CUDA/cuda-kernel-bindc-shape.cuf | 37 +++++++++++++-
3 files changed, 87 insertions(+), 2 deletions(-)
create mode 100644 flang/docs/CUDA.md
diff --git a/flang/docs/CUDA.md b/flang/docs/CUDA.md
new file mode 100644
index 0000000000000..fdaaae6632e64
--- /dev/null
+++ b/flang/docs/CUDA.md
@@ -0,0 +1,51 @@
+<!--===- docs/CUDA.md
+
+ Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ See https://llvm.org/LICENSE.txt for license information.
+ SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+-->
+
+# CUDA Fortran lowering notes
+
+```{contents}
+---
+local:
+---
+```
+
+Catalog of CUDA Fortran lowering decisions in Flang that diverge from the
+Fortran 2018 standard, for cases the [CUDA Fortran Programming
+Guide](https://docs.nvidia.com/hpc-sdk/compilers/cuda-fortran-prog-guide/index.html)
+does not specify.
+
+## `BIND(C) ATTRIBUTES(GLOBAL)` shape-only dummy arrays
+
+For a `BIND(C)` procedure with `ATTRIBUTES(GLOBAL)` or
+`ATTRIBUTES(GRID_GLOBAL)`, dummies whose only descriptor requirement is shape
+(`AssumedShape`, `DeferredShape`, or `AssumedRank`) are passed by base address
+(`!fir.ref<T>`) instead of by `CFI_cdesc_t *` (`!fir.box<T>`). `ALLOCATABLE`,
+`POINTER`, polymorphic, coarray, and `BIND(C)` assumed-length character dummies
+are unaffected and keep the standard Fortran 2018 lowering.
+
+```fortran
+interface
+ attributes(global) subroutine f(d, n) bind(c, name='f')
+ use iso_c_binding
+ real(c_float), dimension(:), device :: d
+ integer, value :: n
+ end subroutine
+end interface
+! interoperates with: extern "C" __global__ void f(float *d, int n);
+```
+
+Reason: Fortran 2018 ยง18.3.7 prescribes a CFI descriptor for these shape
+attributes under `BIND(C)`, but the
+[`cudaLaunchKernel`](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__EXECUTION.html)
+ABI requires `args[i]` to point to a value of the type the C kernel declares
+for parameter `i`. A descriptor pointer in `args[0]` would be dereferenced as
+`T *` on the device, accessing host descriptor memory and producing illegal
+accesses.
+
+Implementation: `flang/lib/Lower/CallInterface.cpp` in `handleExplicitDummy`,
+gated on `procedure.cudaSubprogramAttrs` and the dummy's shape attributes.
diff --git a/flang/docs/index.md b/flang/docs/index.md
index 2e7150188c8d4..c300b8a215417 100644
--- a/flang/docs/index.md
+++ b/flang/docs/index.md
@@ -65,6 +65,7 @@ on how to get in touch with us and to learn more about the current status.
Character
ComplexOperations
ControlFlowGraph
+ CUDA
DebugGeneration
DoConcurrent
DoConcurrentConversionToOpenMP
diff --git a/flang/test/Lower/CUDA/cuda-kernel-bindc-shape.cuf b/flang/test/Lower/CUDA/cuda-kernel-bindc-shape.cuf
index 0d4d48ae3571d..7f3687244cfdc 100644
--- a/flang/test/Lower/CUDA/cuda-kernel-bindc-shape.cuf
+++ b/flang/test/Lower/CUDA/cuda-kernel-bindc-shape.cuf
@@ -55,6 +55,23 @@ contains
! CHECK-LABEL: func.func @_QMmPcall_kernel_grid_global
! CHECK: cuf.kernel_launch @kgg<<<{{.*}}>>>(%{{.*}}) : (!fir.ref<!fir.array<?xf32>>)
+ ! ---- Positive case 4: assumed-rank ----------------------------------
+ ! AssumedRank shape attribute requires a descriptor under the standard
+ ! BIND(C) ABI; the override forces a base address for the CUDA-C ABI.
+ subroutine call_kernel_assumed_rank(d)
+ real, device :: d(..)
+ interface
+ attributes(global) subroutine krank(a) bind(c, name='krank')
+ use iso_c_binding
+ real(c_float), dimension(..), device :: a
+ end subroutine
+ end interface
+ call krank<<<8, 128>>>(d)
+ end subroutine
+
+! CHECK-LABEL: func.func @_QMmPcall_kernel_assumed_rank
+! CHECK: cuf.kernel_launch @krank<<<{{.*}}>>>(%{{.*}}) : (!fir.ref<{{.*}}>)
+
! ---- Negative case 1: attributes(global) without bind(c) -----------
! Pure CUDA Fortran kernel; descriptor ABI preserved.
subroutine call_kernel_no_bindc(d)
@@ -103,8 +120,8 @@ contains
! CHECK: cuf.kernel_launch @kstar<<<{{.*}}>>>(%{{.*}}) : (!fir.ref<!fir.array<?xf32>>)
! ---- Negative case 4: allocatable assumed-shape -------------------
- ! Allocatable is handled before the descriptor branch; remains MutableBox
- ! at the kernel-launch site.
+ ! Allocatable goes through the MutableBox branch in handleExplicitDummy
+ ! before the override is reachable.
subroutine call_kernel_allocatable(d)
real, allocatable, device :: d(:)
interface
@@ -119,4 +136,20 @@ contains
! CHECK-LABEL: func.func @_QMmPcall_kernel_allocatable
! CHECK: cuf.kernel_launch @kalloc<<<{{.*}}>>>(%{{.*}}) : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
+ ! ---- Negative case 5: pointer assumed-shape -----------------------
+ ! Pointer also goes through the MutableBox branch, same as allocatable.
+ subroutine call_kernel_pointer(d)
+ real, pointer, device :: d(:)
+ interface
+ attributes(global) subroutine kptr(a) bind(c, name='kptr')
+ use iso_c_binding
+ real(c_float), dimension(:), pointer, device :: a
+ end subroutine
+ end interface
+ call kptr<<<8, 128>>>(d)
+ end subroutine
+
+! CHECK-LABEL: func.func @_QMmPcall_kernel_pointer
+! CHECK: cuf.kernel_launch @kptr<<<{{.*}}>>>(%{{.*}}) : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>)
+
end module
More information about the flang-commits
mailing list