[flang-commits] [flang] [flang][cuda] Pass assumed-shape arrays of bind(c) attributes(global) kernels by base address (PR #199313)
Zhen Wang via flang-commits
flang-commits at lists.llvm.org
Tue May 26 10:59:37 PDT 2026
https://github.com/wangzpgi updated https://github.com/llvm/llvm-project/pull/199313
>From 21ddaedb3e1dae1b5938b4e0aa8f001b0b541302 Mon Sep 17 00:00:00 2001
From: Zhen Wang <zhenw at nvidia.com>
Date: Fri, 22 May 2026 20:32:16 -0700
Subject: [PATCH 1/6] Pass assumed-shape arrays of bind(c) attributes(global)
kernels by base address
---
flang/lib/Lower/CallInterface.cpp | 26 +++-
.../Lower/CUDA/cuda-kernel-bindc-shape.cuf | 122 ++++++++++++++++++
2 files changed, 146 insertions(+), 2 deletions(-)
create mode 100644 flang/test/Lower/CUDA/cuda-kernel-bindc-shape.cuf
diff --git a/flang/lib/Lower/CallInterface.cpp b/flang/lib/Lower/CallInterface.cpp
index e9059581c690a..9f57e72079577 100644
--- a/flang/lib/Lower/CallInterface.cpp
+++ b/flang/lib/Lower/CallInterface.cpp
@@ -873,6 +873,12 @@ class Fortran::lower::CallInterfaceImpl {
void buildExplicitInterface(
const Fortran::evaluate::characteristics::Procedure &procedure) {
bool isBindC = procedure.IsBindC();
+ bool isCudaGlobalKernel =
+ procedure.cudaSubprogramAttrs &&
+ (*procedure.cudaSubprogramAttrs ==
+ Fortran::common::CUDASubprogramAttrs::Global ||
+ *procedure.cudaSubprogramAttrs ==
+ Fortran::common::CUDASubprogramAttrs::Grid_Global);
// Handle result
if (const std::optional<Fortran::evaluate::characteristics::FunctionResult>
&result = procedure.functionResult) {
@@ -899,7 +905,7 @@ class Fortran::lower::CallInterfaceImpl {
handleImplicitDummy(&argCharacteristics, dummy, entity);
else
handleExplicitDummy(&argCharacteristics, dummy, entity,
- isBindC);
+ isBindC, isCudaGlobalKernel);
},
[&](const Fortran::evaluate::characteristics::DummyProcedure
&dummy) {
@@ -1111,7 +1117,7 @@ class Fortran::lower::CallInterfaceImpl {
void handleExplicitDummy(
const DummyCharacteristics *characteristics,
const Fortran::evaluate::characteristics::DummyDataObject &obj,
- const FortranEntity &entity, bool isBindC) {
+ const FortranEntity &entity, bool isBindC, bool isCudaGlobalKernel) {
using Attrs = Fortran::evaluate::characteristics::DummyDataObject::Attr;
bool isValueAttr = false;
@@ -1171,6 +1177,22 @@ class Fortran::lower::CallInterfaceImpl {
attrs);
addPassedArg(PassEntityBy::MutableBox, entity, characteristics);
} else if (obj.IsPassedByDescriptor(isBindC)) {
+ // bind(c) attributes(global): pass assumed-shape arrays by base address
+ // (cudaLaunchKernel cannot deliver a CFI descriptor to the device).
+ // corank > 0 is already TODO'd above, so no need to re-check here.
+ using ShapeAttr = Fortran::evaluate::characteristics::TypeAndShape::Attr;
+ constexpr Fortran::evaluate::characteristics::TypeAndShape::Attrs
+ shapeOnlyDescriptor{ShapeAttr::AssumedShape, ShapeAttr::DeferredShape,
+ ShapeAttr::AssumedRank};
+ if (isBindC && isCudaGlobalKernel &&
+ (obj.type.attrs() & shapeOnlyDescriptor).any() &&
+ !obj.type.type().IsPolymorphic()) {
+ mlir::Type passType = fir::ReferenceType::get(type);
+ addFirOperand(passType, nextPassedArgPosition(), Property::BaseAddress,
+ attrs);
+ addPassedArg(PassEntityBy::BaseAddress, entity, characteristics);
+ return;
+ }
// Pass as fir.box or fir.class
addFirOperand(boxType, nextPassedArgPosition(), Property::Box, attrs);
addPassedArg(PassEntityBy::Box, entity, characteristics);
diff --git a/flang/test/Lower/CUDA/cuda-kernel-bindc-shape.cuf b/flang/test/Lower/CUDA/cuda-kernel-bindc-shape.cuf
new file mode 100644
index 0000000000000..0d4d48ae3571d
--- /dev/null
+++ b/flang/test/Lower/CUDA/cuda-kernel-bindc-shape.cuf
@@ -0,0 +1,122 @@
+! RUN: bbc -emit-hlfir -fcuda %s -o - | FileCheck %s
+
+! Test that bind(c) attributes(global) kernels with assumed-shape /
+! deferred-shape / assumed-rank array dummies are lowered to a raw base
+! address (!fir.ref<...>) instead of a CFI descriptor (!fir.box<...>).
+! cudaLaunchKernel cannot deliver a CFI descriptor to the device, so the
+! C-interop ABI of these arguments must be a plain pointer.
+
+module m
+contains
+
+ ! ---- Positive case 1: assumed-shape, 1-D ----------------------------
+ subroutine call_kernel_1d(d)
+ real, device :: d(:)
+ interface
+ attributes(global) subroutine k1d(a, n) bind(c, name='k1d')
+ use iso_c_binding
+ real(c_float), dimension(:), device :: a
+ integer, value :: n
+ end subroutine
+ end interface
+ call k1d<<<8, 128>>>(d, 1024)
+ end subroutine
+
+! CHECK-LABEL: func.func @_QMmPcall_kernel_1d
+! CHECK: cuf.kernel_launch @k1d<<<{{.*}}>>>(%{{.*}}, %{{.*}}) : (!fir.ref<!fir.array<?xf32>>, i32)
+
+ ! ---- Positive case 2: assumed-shape, 2-D ----------------------------
+ subroutine call_kernel_2d(d)
+ real, device :: d(:,:)
+ interface
+ attributes(global) subroutine k2d(a) bind(c, name='k2d')
+ use iso_c_binding
+ real(c_float), dimension(:,:), device :: a
+ end subroutine
+ end interface
+ call k2d<<<8, 128>>>(d)
+ end subroutine
+
+! CHECK-LABEL: func.func @_QMmPcall_kernel_2d
+! CHECK: cuf.kernel_launch @k2d<<<{{.*}}>>>(%{{.*}}) : (!fir.ref<!fir.array<?x?xf32>>)
+
+ ! ---- Positive case 3: grid_global -----------------------------------
+ subroutine call_kernel_grid_global(d)
+ real, device :: d(:)
+ interface
+ attributes(grid_global) subroutine kgg(a) bind(c, name='kgg')
+ use iso_c_binding
+ real(c_float), dimension(:), device :: a
+ end subroutine
+ end interface
+ call kgg<<<8, 128>>>(d)
+ end subroutine
+
+! CHECK-LABEL: func.func @_QMmPcall_kernel_grid_global
+! CHECK: cuf.kernel_launch @kgg<<<{{.*}}>>>(%{{.*}}) : (!fir.ref<!fir.array<?xf32>>)
+
+ ! ---- Negative case 1: attributes(global) without bind(c) -----------
+ ! Pure CUDA Fortran kernel; descriptor ABI preserved.
+ subroutine call_kernel_no_bindc(d)
+ real, device :: d(:)
+ interface
+ attributes(global) subroutine knobindc(a)
+ real, dimension(:), device :: a
+ end subroutine
+ end interface
+ call knobindc<<<8, 128>>>(d)
+ end subroutine
+
+! CHECK-LABEL: func.func @_QMmPcall_kernel_no_bindc
+! CHECK: cuf.kernel_launch @_QPknobindc<<<{{.*}}>>>(%{{.*}}) : (!fir.box<!fir.array<?xf32>>)
+
+ ! ---- Negative case 2: bind(c) but not attributes(global) ----------
+ ! Regular C-interop function (host); descriptor ABI preserved.
+ subroutine call_host_bindc(d)
+ real :: d(:)
+ interface
+ subroutine hostfn(a) bind(c, name='hostfn')
+ use iso_c_binding
+ real(c_float), dimension(:) :: a
+ end subroutine
+ end interface
+ call hostfn(d)
+ end subroutine
+
+! CHECK-LABEL: func.func @_QMmPcall_host_bindc
+! CHECK: fir.call @hostfn(%{{.*}}) {{.*}}: (!fir.box<!fir.array<?xf32>>) -> ()
+
+ ! ---- Negative case 3: bind(c) attributes(global) + dimension(*) -----
+ ! Already a base-address ABI; just confirm it didn't regress.
+ subroutine call_kernel_assumed_size(d)
+ real, device :: d(*)
+ interface
+ attributes(global) subroutine kstar(a) bind(c, name='kstar')
+ use iso_c_binding
+ real(c_float), dimension(*), device :: a
+ end subroutine
+ end interface
+ call kstar<<<8, 128>>>(d)
+ end subroutine
+
+! CHECK-LABEL: func.func @_QMmPcall_kernel_assumed_size
+! CHECK: cuf.kernel_launch @kstar<<<{{.*}}>>>(%{{.*}}) : (!fir.ref<!fir.array<?xf32>>)
+
+ ! ---- Negative case 4: allocatable assumed-shape -------------------
+ ! Allocatable is handled before the descriptor branch; remains MutableBox
+ ! at the kernel-launch site.
+ subroutine call_kernel_allocatable(d)
+ real, allocatable, device :: d(:)
+ interface
+ attributes(global) subroutine kalloc(a) bind(c, name='kalloc')
+ use iso_c_binding
+ real(c_float), dimension(:), allocatable, device :: a
+ end subroutine
+ end interface
+ call kalloc<<<8, 128>>>(d)
+ end subroutine
+
+! CHECK-LABEL: func.func @_QMmPcall_kernel_allocatable
+! CHECK: cuf.kernel_launch @kalloc<<<{{.*}}>>>(%{{.*}}) : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
+
+end module
>From a00da371c511d0b5e155c0536f9ecdc8060e4969 Mon Sep 17 00:00:00 2001
From: Zhen Wang <zhenw at nvidia.com>
Date: Fri, 22 May 2026 20:40:23 -0700
Subject: [PATCH 2/6] update comment
---
flang/lib/Lower/CallInterface.cpp | 1 -
1 file changed, 1 deletion(-)
diff --git a/flang/lib/Lower/CallInterface.cpp b/flang/lib/Lower/CallInterface.cpp
index 9f57e72079577..14e45563ee96f 100644
--- a/flang/lib/Lower/CallInterface.cpp
+++ b/flang/lib/Lower/CallInterface.cpp
@@ -1179,7 +1179,6 @@ class Fortran::lower::CallInterfaceImpl {
} else if (obj.IsPassedByDescriptor(isBindC)) {
// bind(c) attributes(global): pass assumed-shape arrays by base address
// (cudaLaunchKernel cannot deliver a CFI descriptor to the device).
- // corank > 0 is already TODO'd above, so no need to re-check here.
using ShapeAttr = Fortran::evaluate::characteristics::TypeAndShape::Attr;
constexpr Fortran::evaluate::characteristics::TypeAndShape::Attrs
shapeOnlyDescriptor{ShapeAttr::AssumedShape, ShapeAttr::DeferredShape,
>From 56822cc76940ef76922c0ef13a89e2c7a7ff2b1e Mon Sep 17 00:00:00 2001
From: Zhen Wang <zhenw at nvidia.com>
Date: Sat, 23 May 2026 10:06:51 -0700
Subject: [PATCH 3/6] add doc, add test
---
flang/docs/CUDA.md | 51 +++++++++++++++++++
flang/docs/index.md | 1 +
.../Lower/CUDA/cuda-kernel-bindc-shape.cuf | 37 +++++++++++++-
3 files changed, 87 insertions(+), 2 deletions(-)
create mode 100644 flang/docs/CUDA.md
diff --git a/flang/docs/CUDA.md b/flang/docs/CUDA.md
new file mode 100644
index 0000000000000..fdaaae6632e64
--- /dev/null
+++ b/flang/docs/CUDA.md
@@ -0,0 +1,51 @@
+<!--===- docs/CUDA.md
+
+ Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ See https://llvm.org/LICENSE.txt for license information.
+ SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+-->
+
+# CUDA Fortran lowering notes
+
+```{contents}
+---
+local:
+---
+```
+
+Catalog of CUDA Fortran lowering decisions in Flang that diverge from the
+Fortran 2018 standard, for cases the [CUDA Fortran Programming
+Guide](https://docs.nvidia.com/hpc-sdk/compilers/cuda-fortran-prog-guide/index.html)
+does not specify.
+
+## `BIND(C) ATTRIBUTES(GLOBAL)` shape-only dummy arrays
+
+For a `BIND(C)` procedure with `ATTRIBUTES(GLOBAL)` or
+`ATTRIBUTES(GRID_GLOBAL)`, dummies whose only descriptor requirement is shape
+(`AssumedShape`, `DeferredShape`, or `AssumedRank`) are passed by base address
+(`!fir.ref<T>`) instead of by `CFI_cdesc_t *` (`!fir.box<T>`). `ALLOCATABLE`,
+`POINTER`, polymorphic, coarray, and `BIND(C)` assumed-length character dummies
+are unaffected and keep the standard Fortran 2018 lowering.
+
+```fortran
+interface
+ attributes(global) subroutine f(d, n) bind(c, name='f')
+ use iso_c_binding
+ real(c_float), dimension(:), device :: d
+ integer, value :: n
+ end subroutine
+end interface
+! interoperates with: extern "C" __global__ void f(float *d, int n);
+```
+
+Reason: Fortran 2018 ยง18.3.7 prescribes a CFI descriptor for these shape
+attributes under `BIND(C)`, but the
+[`cudaLaunchKernel`](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__EXECUTION.html)
+ABI requires `args[i]` to point to a value of the type the C kernel declares
+for parameter `i`. A descriptor pointer in `args[0]` would be dereferenced as
+`T *` on the device, accessing host descriptor memory and producing illegal
+accesses.
+
+Implementation: `flang/lib/Lower/CallInterface.cpp` in `handleExplicitDummy`,
+gated on `procedure.cudaSubprogramAttrs` and the dummy's shape attributes.
diff --git a/flang/docs/index.md b/flang/docs/index.md
index 2e7150188c8d4..c300b8a215417 100644
--- a/flang/docs/index.md
+++ b/flang/docs/index.md
@@ -65,6 +65,7 @@ on how to get in touch with us and to learn more about the current status.
Character
ComplexOperations
ControlFlowGraph
+ CUDA
DebugGeneration
DoConcurrent
DoConcurrentConversionToOpenMP
diff --git a/flang/test/Lower/CUDA/cuda-kernel-bindc-shape.cuf b/flang/test/Lower/CUDA/cuda-kernel-bindc-shape.cuf
index 0d4d48ae3571d..7f3687244cfdc 100644
--- a/flang/test/Lower/CUDA/cuda-kernel-bindc-shape.cuf
+++ b/flang/test/Lower/CUDA/cuda-kernel-bindc-shape.cuf
@@ -55,6 +55,23 @@ contains
! CHECK-LABEL: func.func @_QMmPcall_kernel_grid_global
! CHECK: cuf.kernel_launch @kgg<<<{{.*}}>>>(%{{.*}}) : (!fir.ref<!fir.array<?xf32>>)
+ ! ---- Positive case 4: assumed-rank ----------------------------------
+ ! AssumedRank shape attribute requires a descriptor under the standard
+ ! BIND(C) ABI; the override forces a base address for the CUDA-C ABI.
+ subroutine call_kernel_assumed_rank(d)
+ real, device :: d(..)
+ interface
+ attributes(global) subroutine krank(a) bind(c, name='krank')
+ use iso_c_binding
+ real(c_float), dimension(..), device :: a
+ end subroutine
+ end interface
+ call krank<<<8, 128>>>(d)
+ end subroutine
+
+! CHECK-LABEL: func.func @_QMmPcall_kernel_assumed_rank
+! CHECK: cuf.kernel_launch @krank<<<{{.*}}>>>(%{{.*}}) : (!fir.ref<{{.*}}>)
+
! ---- Negative case 1: attributes(global) without bind(c) -----------
! Pure CUDA Fortran kernel; descriptor ABI preserved.
subroutine call_kernel_no_bindc(d)
@@ -103,8 +120,8 @@ contains
! CHECK: cuf.kernel_launch @kstar<<<{{.*}}>>>(%{{.*}}) : (!fir.ref<!fir.array<?xf32>>)
! ---- Negative case 4: allocatable assumed-shape -------------------
- ! Allocatable is handled before the descriptor branch; remains MutableBox
- ! at the kernel-launch site.
+ ! Allocatable goes through the MutableBox branch in handleExplicitDummy
+ ! before the override is reachable.
subroutine call_kernel_allocatable(d)
real, allocatable, device :: d(:)
interface
@@ -119,4 +136,20 @@ contains
! CHECK-LABEL: func.func @_QMmPcall_kernel_allocatable
! CHECK: cuf.kernel_launch @kalloc<<<{{.*}}>>>(%{{.*}}) : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
+ ! ---- Negative case 5: pointer assumed-shape -----------------------
+ ! Pointer also goes through the MutableBox branch, same as allocatable.
+ subroutine call_kernel_pointer(d)
+ real, pointer, device :: d(:)
+ interface
+ attributes(global) subroutine kptr(a) bind(c, name='kptr')
+ use iso_c_binding
+ real(c_float), dimension(:), pointer, device :: a
+ end subroutine
+ end interface
+ call kptr<<<8, 128>>>(d)
+ end subroutine
+
+! CHECK-LABEL: func.func @_QMmPcall_kernel_pointer
+! CHECK: cuf.kernel_launch @kptr<<<{{.*}}>>>(%{{.*}}) : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>)
+
end module
>From 80430e7b8f092d17d92d15a04c12b563167f3d72 Mon Sep 17 00:00:00 2001
From: Zhen Wang <zhenw at nvidia.com>
Date: Sat, 23 May 2026 22:55:08 -0700
Subject: [PATCH 4/6] update doc and drop deferred shape
---
flang/docs/CUDA.md | 24 ++++++++++++------------
flang/lib/Lower/CallInterface.cpp | 8 ++++----
2 files changed, 16 insertions(+), 16 deletions(-)
diff --git a/flang/docs/CUDA.md b/flang/docs/CUDA.md
index fdaaae6632e64..73c38c28b78fc 100644
--- a/flang/docs/CUDA.md
+++ b/flang/docs/CUDA.md
@@ -14,19 +14,20 @@ local:
---
```
-Catalog of CUDA Fortran lowering decisions in Flang that diverge from the
+List of CUDA Fortran lowering decisions in Flang that diverge from the
Fortran 2018 standard, for cases the [CUDA Fortran Programming
Guide](https://docs.nvidia.com/hpc-sdk/compilers/cuda-fortran-prog-guide/index.html)
does not specify.
-## `BIND(C) ATTRIBUTES(GLOBAL)` shape-only dummy arrays
+## `BIND(C) ATTRIBUTES(GLOBAL)` assumed-shape and assumed-rank dummies
For a `BIND(C)` procedure with `ATTRIBUTES(GLOBAL)` or
-`ATTRIBUTES(GRID_GLOBAL)`, dummies whose only descriptor requirement is shape
-(`AssumedShape`, `DeferredShape`, or `AssumedRank`) are passed by base address
-(`!fir.ref<T>`) instead of by `CFI_cdesc_t *` (`!fir.box<T>`). `ALLOCATABLE`,
-`POINTER`, polymorphic, coarray, and `BIND(C)` assumed-length character dummies
-are unaffected and keep the standard Fortran 2018 lowering.
+`ATTRIBUTES(GRID_GLOBAL)`, an assumed-shape (`dimension(:)`) or assumed-rank
+(`dimension(..)`) dummy is passed by base address (`!fir.ref<T>`) instead of by
+`CFI_cdesc_t *` (`!fir.box<T>`). `ALLOCATABLE` and `POINTER` dummies take an
+earlier descriptor-of-mutable path and are unaffected. To deliver a CFI
+descriptor to the kernel, drop `BIND(C)`: a plain `ATTRIBUTES(GLOBAL)` kernel
+keeps the descriptor-passing lowering.
```fortran
interface
@@ -44,8 +45,7 @@ attributes under `BIND(C)`, but the
[`cudaLaunchKernel`](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__EXECUTION.html)
ABI requires `args[i]` to point to a value of the type the C kernel declares
for parameter `i`. A descriptor pointer in `args[0]` would be dereferenced as
-`T *` on the device, accessing host descriptor memory and producing illegal
-accesses.
-
-Implementation: `flang/lib/Lower/CallInterface.cpp` in `handleExplicitDummy`,
-gated on `procedure.cudaSubprogramAttrs` and the dummy's shape attributes.
+`T *` on the device: the kernel reads descriptor metadata bytes (`base_addr`,
+`elem_len`, dim info, ...) as element data, producing wrong results, and when
+the descriptor resides in host memory the device load additionally faults with
+an illegal-access error.
diff --git a/flang/lib/Lower/CallInterface.cpp b/flang/lib/Lower/CallInterface.cpp
index 14e45563ee96f..2a5f2de4d8d7b 100644
--- a/flang/lib/Lower/CallInterface.cpp
+++ b/flang/lib/Lower/CallInterface.cpp
@@ -1177,12 +1177,12 @@ class Fortran::lower::CallInterfaceImpl {
attrs);
addPassedArg(PassEntityBy::MutableBox, entity, characteristics);
} else if (obj.IsPassedByDescriptor(isBindC)) {
- // bind(c) attributes(global): pass assumed-shape arrays by base address
- // (cudaLaunchKernel cannot deliver a CFI descriptor to the device).
+ // bind(c) attributes(global): pass assumed-shape/assumed-rank dummies by
+ // base address (cudaLaunchKernel cannot deliver a CFI descriptor to the
+ // device).
using ShapeAttr = Fortran::evaluate::characteristics::TypeAndShape::Attr;
constexpr Fortran::evaluate::characteristics::TypeAndShape::Attrs
- shapeOnlyDescriptor{ShapeAttr::AssumedShape, ShapeAttr::DeferredShape,
- ShapeAttr::AssumedRank};
+ shapeOnlyDescriptor{ShapeAttr::AssumedShape, ShapeAttr::AssumedRank};
if (isBindC && isCudaGlobalKernel &&
(obj.type.attrs() & shapeOnlyDescriptor).any() &&
!obj.type.type().IsPolymorphic()) {
>From 7ddc928276dd29cda2ad192972e4b2bb83ef44e7 Mon Sep 17 00:00:00 2001
From: Zhen Wang <zhenw at nvidia.com>
Date: Tue, 26 May 2026 09:56:07 -0700
Subject: [PATCH 5/6] update doc & test comment
---
flang/docs/CUDA.md | 23 ++++++++++++-------
.../Lower/CUDA/cuda-kernel-bindc-shape.cuf | 6 ++---
2 files changed, 18 insertions(+), 11 deletions(-)
diff --git a/flang/docs/CUDA.md b/flang/docs/CUDA.md
index 73c38c28b78fc..ab67719ba248d 100644
--- a/flang/docs/CUDA.md
+++ b/flang/docs/CUDA.md
@@ -6,7 +6,7 @@
-->
-# CUDA Fortran lowering notes
+# CUDA Fortran
```{contents}
---
@@ -14,20 +14,27 @@ local:
---
```
-List of CUDA Fortran lowering decisions in Flang that diverge from the
-Fortran 2018 standard, for cases the [CUDA Fortran Programming
+Implementation notes for Flang's CUDA Fortran support.
+
+## Lowering decisions
+
+List of CUDA Fortran lowering decisions in Flang for cases where CUDA
+Fortran interoperability requires behavior that is not specified by the
+[CUDA Fortran Programming
Guide](https://docs.nvidia.com/hpc-sdk/compilers/cuda-fortran-prog-guide/index.html)
-does not specify.
+or by standard `BIND(C)` lowering alone.
-## `BIND(C) ATTRIBUTES(GLOBAL)` assumed-shape and assumed-rank dummies
+### `BIND(C) ATTRIBUTES(GLOBAL)` assumed-shape and assumed-rank dummies
For a `BIND(C)` procedure with `ATTRIBUTES(GLOBAL)` or
`ATTRIBUTES(GRID_GLOBAL)`, an assumed-shape (`dimension(:)`) or assumed-rank
(`dimension(..)`) dummy is passed by base address (`!fir.ref<T>`) instead of by
`CFI_cdesc_t *` (`!fir.box<T>`). `ALLOCATABLE` and `POINTER` dummies take an
-earlier descriptor-of-mutable path and are unaffected. To deliver a CFI
-descriptor to the kernel, drop `BIND(C)`: a plain `ATTRIBUTES(GLOBAL)` kernel
-keeps the descriptor-passing lowering.
+earlier descriptor-of-mutable path and are unaffected. A kernel declared
+without `BIND(C)` (i.e., a plain `ATTRIBUTES(GLOBAL)` or
+`ATTRIBUTES(GRID_GLOBAL)` Fortran kernel) uses the standard
+descriptor-passing lowering instead, which is the form to use when the
+kernel needs to access a CFI descriptor.
```fortran
interface
diff --git a/flang/test/Lower/CUDA/cuda-kernel-bindc-shape.cuf b/flang/test/Lower/CUDA/cuda-kernel-bindc-shape.cuf
index 7f3687244cfdc..8e8b0a7842c84 100644
--- a/flang/test/Lower/CUDA/cuda-kernel-bindc-shape.cuf
+++ b/flang/test/Lower/CUDA/cuda-kernel-bindc-shape.cuf
@@ -1,8 +1,8 @@
! RUN: bbc -emit-hlfir -fcuda %s -o - | FileCheck %s
-! Test that bind(c) attributes(global) kernels with assumed-shape /
-! deferred-shape / assumed-rank array dummies are lowered to a raw base
-! address (!fir.ref<...>) instead of a CFI descriptor (!fir.box<...>).
+! Test that bind(c) attributes(global) kernels with assumed-shape or
+! assumed-rank array dummies are lowered to a raw base address
+! (!fir.ref<...>) instead of a CFI descriptor (!fir.box<...>).
! cudaLaunchKernel cannot deliver a CFI descriptor to the device, so the
! C-interop ABI of these arguments must be a plain pointer.
>From eb5a9002ff5318b446b65af39f5702d734646281 Mon Sep 17 00:00:00 2001
From: Zhen Wang <zhenw at nvidia.com>
Date: Tue, 26 May 2026 10:59:19 -0700
Subject: [PATCH 6/6] update doc
---
flang/docs/CUDA.md | 7 +++++--
1 file changed, 5 insertions(+), 2 deletions(-)
diff --git a/flang/docs/CUDA.md b/flang/docs/CUDA.md
index ab67719ba248d..291ac24c5274f 100644
--- a/flang/docs/CUDA.md
+++ b/flang/docs/CUDA.md
@@ -21,8 +21,7 @@ Implementation notes for Flang's CUDA Fortran support.
List of CUDA Fortran lowering decisions in Flang for cases where CUDA
Fortran interoperability requires behavior that is not specified by the
[CUDA Fortran Programming
-Guide](https://docs.nvidia.com/hpc-sdk/compilers/cuda-fortran-prog-guide/index.html)
-or by standard `BIND(C)` lowering alone.
+Guide](https://docs.nvidia.com/hpc-sdk/compilers/cuda-fortran-prog-guide/index.html).
### `BIND(C) ATTRIBUTES(GLOBAL)` assumed-shape and assumed-rank dummies
@@ -56,3 +55,7 @@ for parameter `i`. A descriptor pointer in `args[0]` would be dereferenced as
`elem_len`, dim info, ...) as element data, producing wrong results, and when
the descriptor resides in host memory the device load additionally faults with
an illegal-access error.
+
+This also aligns with the reference compiler behavior, where
+`BIND(C) ATTRIBUTES(GLOBAL)` array arguments are lowered to raw device
+pointers regardless of array shape.
More information about the flang-commits
mailing list