[flang-commits] [flang] [flang][cuda] Pass assumed-shape arrays of bind(c) attributes(global) kernels by base address (PR #199313)

Zhen Wang via flang-commits flang-commits at lists.llvm.org
Sat May 23 10:23:55 PDT 2026


https://github.com/wangzpgi updated https://github.com/llvm/llvm-project/pull/199313

>From 21ddaedb3e1dae1b5938b4e0aa8f001b0b541302 Mon Sep 17 00:00:00 2001
From: Zhen Wang <zhenw at nvidia.com>
Date: Fri, 22 May 2026 20:32:16 -0700
Subject: [PATCH 1/3] Pass assumed-shape arrays of bind(c) attributes(global)
 kernels by base address

---
 flang/lib/Lower/CallInterface.cpp             |  26 +++-
 .../Lower/CUDA/cuda-kernel-bindc-shape.cuf    | 122 ++++++++++++++++++
 2 files changed, 146 insertions(+), 2 deletions(-)
 create mode 100644 flang/test/Lower/CUDA/cuda-kernel-bindc-shape.cuf

diff --git a/flang/lib/Lower/CallInterface.cpp b/flang/lib/Lower/CallInterface.cpp
index e9059581c690a..9f57e72079577 100644
--- a/flang/lib/Lower/CallInterface.cpp
+++ b/flang/lib/Lower/CallInterface.cpp
@@ -873,6 +873,12 @@ class Fortran::lower::CallInterfaceImpl {
   void buildExplicitInterface(
       const Fortran::evaluate::characteristics::Procedure &procedure) {
     bool isBindC = procedure.IsBindC();
+    bool isCudaGlobalKernel =
+        procedure.cudaSubprogramAttrs &&
+        (*procedure.cudaSubprogramAttrs ==
+             Fortran::common::CUDASubprogramAttrs::Global ||
+         *procedure.cudaSubprogramAttrs ==
+             Fortran::common::CUDASubprogramAttrs::Grid_Global);
     // Handle result
     if (const std::optional<Fortran::evaluate::characteristics::FunctionResult>
             &result = procedure.functionResult) {
@@ -899,7 +905,7 @@ class Fortran::lower::CallInterfaceImpl {
                   handleImplicitDummy(&argCharacteristics, dummy, entity);
                 else
                   handleExplicitDummy(&argCharacteristics, dummy, entity,
-                                      isBindC);
+                                      isBindC, isCudaGlobalKernel);
               },
               [&](const Fortran::evaluate::characteristics::DummyProcedure
                       &dummy) {
@@ -1111,7 +1117,7 @@ class Fortran::lower::CallInterfaceImpl {
   void handleExplicitDummy(
       const DummyCharacteristics *characteristics,
       const Fortran::evaluate::characteristics::DummyDataObject &obj,
-      const FortranEntity &entity, bool isBindC) {
+      const FortranEntity &entity, bool isBindC, bool isCudaGlobalKernel) {
     using Attrs = Fortran::evaluate::characteristics::DummyDataObject::Attr;
 
     bool isValueAttr = false;
@@ -1171,6 +1177,22 @@ class Fortran::lower::CallInterfaceImpl {
                     attrs);
       addPassedArg(PassEntityBy::MutableBox, entity, characteristics);
     } else if (obj.IsPassedByDescriptor(isBindC)) {
+      // bind(c) attributes(global): pass assumed-shape arrays by base address
+      // (cudaLaunchKernel cannot deliver a CFI descriptor to the device).
+      // corank > 0 is already TODO'd above, so no need to re-check here.
+      using ShapeAttr = Fortran::evaluate::characteristics::TypeAndShape::Attr;
+      constexpr Fortran::evaluate::characteristics::TypeAndShape::Attrs
+          shapeOnlyDescriptor{ShapeAttr::AssumedShape, ShapeAttr::DeferredShape,
+                              ShapeAttr::AssumedRank};
+      if (isBindC && isCudaGlobalKernel &&
+          (obj.type.attrs() & shapeOnlyDescriptor).any() &&
+          !obj.type.type().IsPolymorphic()) {
+        mlir::Type passType = fir::ReferenceType::get(type);
+        addFirOperand(passType, nextPassedArgPosition(), Property::BaseAddress,
+                      attrs);
+        addPassedArg(PassEntityBy::BaseAddress, entity, characteristics);
+        return;
+      }
       // Pass as fir.box or fir.class
       addFirOperand(boxType, nextPassedArgPosition(), Property::Box, attrs);
       addPassedArg(PassEntityBy::Box, entity, characteristics);
diff --git a/flang/test/Lower/CUDA/cuda-kernel-bindc-shape.cuf b/flang/test/Lower/CUDA/cuda-kernel-bindc-shape.cuf
new file mode 100644
index 0000000000000..0d4d48ae3571d
--- /dev/null
+++ b/flang/test/Lower/CUDA/cuda-kernel-bindc-shape.cuf
@@ -0,0 +1,122 @@
+! RUN: bbc -emit-hlfir -fcuda %s -o - | FileCheck %s
+
+! Test that bind(c) attributes(global) kernels with assumed-shape /
+! deferred-shape / assumed-rank array dummies are lowered to a raw base
+! address (!fir.ref<...>) instead of a CFI descriptor (!fir.box<...>).
+! cudaLaunchKernel cannot deliver a CFI descriptor to the device, so the
+! C-interop ABI of these arguments must be a plain pointer.
+
+module m
+contains
+
+  ! ---- Positive case 1: assumed-shape, 1-D ----------------------------
+  subroutine call_kernel_1d(d)
+    real, device :: d(:)
+    interface
+      attributes(global) subroutine k1d(a, n) bind(c, name='k1d')
+        use iso_c_binding
+        real(c_float), dimension(:), device :: a
+        integer, value :: n
+      end subroutine
+    end interface
+    call k1d<<<8, 128>>>(d, 1024)
+  end subroutine
+
+! CHECK-LABEL: func.func @_QMmPcall_kernel_1d
+! CHECK: cuf.kernel_launch @k1d<<<{{.*}}>>>(%{{.*}}, %{{.*}}) : (!fir.ref<!fir.array<?xf32>>, i32)
+
+  ! ---- Positive case 2: assumed-shape, 2-D ----------------------------
+  subroutine call_kernel_2d(d)
+    real, device :: d(:,:)
+    interface
+      attributes(global) subroutine k2d(a) bind(c, name='k2d')
+        use iso_c_binding
+        real(c_float), dimension(:,:), device :: a
+      end subroutine
+    end interface
+    call k2d<<<8, 128>>>(d)
+  end subroutine
+
+! CHECK-LABEL: func.func @_QMmPcall_kernel_2d
+! CHECK: cuf.kernel_launch @k2d<<<{{.*}}>>>(%{{.*}}) : (!fir.ref<!fir.array<?x?xf32>>)
+
+  ! ---- Positive case 3: grid_global -----------------------------------
+  subroutine call_kernel_grid_global(d)
+    real, device :: d(:)
+    interface
+      attributes(grid_global) subroutine kgg(a) bind(c, name='kgg')
+        use iso_c_binding
+        real(c_float), dimension(:), device :: a
+      end subroutine
+    end interface
+    call kgg<<<8, 128>>>(d)
+  end subroutine
+
+! CHECK-LABEL: func.func @_QMmPcall_kernel_grid_global
+! CHECK: cuf.kernel_launch @kgg<<<{{.*}}>>>(%{{.*}}) : (!fir.ref<!fir.array<?xf32>>)
+
+  ! ---- Negative case 1: attributes(global) without bind(c) -----------
+  ! Pure CUDA Fortran kernel; descriptor ABI preserved.
+  subroutine call_kernel_no_bindc(d)
+    real, device :: d(:)
+    interface
+      attributes(global) subroutine knobindc(a)
+        real, dimension(:), device :: a
+      end subroutine
+    end interface
+    call knobindc<<<8, 128>>>(d)
+  end subroutine
+
+! CHECK-LABEL: func.func @_QMmPcall_kernel_no_bindc
+! CHECK: cuf.kernel_launch @_QPknobindc<<<{{.*}}>>>(%{{.*}}) : (!fir.box<!fir.array<?xf32>>)
+
+  ! ---- Negative case 2: bind(c) but not attributes(global) ----------
+  ! Regular C-interop function (host); descriptor ABI preserved.
+  subroutine call_host_bindc(d)
+    real :: d(:)
+    interface
+      subroutine hostfn(a) bind(c, name='hostfn')
+        use iso_c_binding
+        real(c_float), dimension(:) :: a
+      end subroutine
+    end interface
+    call hostfn(d)
+  end subroutine
+
+! CHECK-LABEL: func.func @_QMmPcall_host_bindc
+! CHECK: fir.call @hostfn(%{{.*}}) {{.*}}: (!fir.box<!fir.array<?xf32>>) -> ()
+
+  ! ---- Negative case 3: bind(c) attributes(global) + dimension(*) -----
+  ! Already a base-address ABI; just confirm it didn't regress.
+  subroutine call_kernel_assumed_size(d)
+    real, device :: d(*)
+    interface
+      attributes(global) subroutine kstar(a) bind(c, name='kstar')
+        use iso_c_binding
+        real(c_float), dimension(*), device :: a
+      end subroutine
+    end interface
+    call kstar<<<8, 128>>>(d)
+  end subroutine
+
+! CHECK-LABEL: func.func @_QMmPcall_kernel_assumed_size
+! CHECK: cuf.kernel_launch @kstar<<<{{.*}}>>>(%{{.*}}) : (!fir.ref<!fir.array<?xf32>>)
+
+  ! ---- Negative case 4: allocatable assumed-shape -------------------
+  ! Allocatable is handled before the descriptor branch; remains MutableBox
+  ! at the kernel-launch site.
+  subroutine call_kernel_allocatable(d)
+    real, allocatable, device :: d(:)
+    interface
+      attributes(global) subroutine kalloc(a) bind(c, name='kalloc')
+        use iso_c_binding
+        real(c_float), dimension(:), allocatable, device :: a
+      end subroutine
+    end interface
+    call kalloc<<<8, 128>>>(d)
+  end subroutine
+
+! CHECK-LABEL: func.func @_QMmPcall_kernel_allocatable
+! CHECK: cuf.kernel_launch @kalloc<<<{{.*}}>>>(%{{.*}}) : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
+
+end module

>From a00da371c511d0b5e155c0536f9ecdc8060e4969 Mon Sep 17 00:00:00 2001
From: Zhen Wang <zhenw at nvidia.com>
Date: Fri, 22 May 2026 20:40:23 -0700
Subject: [PATCH 2/3] update comment

---
 flang/lib/Lower/CallInterface.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/flang/lib/Lower/CallInterface.cpp b/flang/lib/Lower/CallInterface.cpp
index 9f57e72079577..14e45563ee96f 100644
--- a/flang/lib/Lower/CallInterface.cpp
+++ b/flang/lib/Lower/CallInterface.cpp
@@ -1179,7 +1179,6 @@ class Fortran::lower::CallInterfaceImpl {
     } else if (obj.IsPassedByDescriptor(isBindC)) {
       // bind(c) attributes(global): pass assumed-shape arrays by base address
       // (cudaLaunchKernel cannot deliver a CFI descriptor to the device).
-      // corank > 0 is already TODO'd above, so no need to re-check here.
       using ShapeAttr = Fortran::evaluate::characteristics::TypeAndShape::Attr;
       constexpr Fortran::evaluate::characteristics::TypeAndShape::Attrs
           shapeOnlyDescriptor{ShapeAttr::AssumedShape, ShapeAttr::DeferredShape,

>From 56822cc76940ef76922c0ef13a89e2c7a7ff2b1e Mon Sep 17 00:00:00 2001
From: Zhen Wang <zhenw at nvidia.com>
Date: Sat, 23 May 2026 10:06:51 -0700
Subject: [PATCH 3/3] add doc, add test

---
 flang/docs/CUDA.md                            | 51 +++++++++++++++++++
 flang/docs/index.md                           |  1 +
 .../Lower/CUDA/cuda-kernel-bindc-shape.cuf    | 37 +++++++++++++-
 3 files changed, 87 insertions(+), 2 deletions(-)
 create mode 100644 flang/docs/CUDA.md

diff --git a/flang/docs/CUDA.md b/flang/docs/CUDA.md
new file mode 100644
index 0000000000000..fdaaae6632e64
--- /dev/null
+++ b/flang/docs/CUDA.md
@@ -0,0 +1,51 @@
+<!--===- docs/CUDA.md
+
+   Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+   See https://llvm.org/LICENSE.txt for license information.
+   SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+-->
+
+# CUDA Fortran lowering notes
+
+```{contents}
+---
+local:
+---
+```
+
+Catalog of CUDA Fortran lowering decisions in Flang that diverge from the
+Fortran 2018 standard, for cases the [CUDA Fortran Programming
+Guide](https://docs.nvidia.com/hpc-sdk/compilers/cuda-fortran-prog-guide/index.html)
+does not specify.
+
+## `BIND(C) ATTRIBUTES(GLOBAL)` shape-only dummy arrays
+
+For a `BIND(C)` procedure with `ATTRIBUTES(GLOBAL)` or
+`ATTRIBUTES(GRID_GLOBAL)`, dummies whose only descriptor requirement is shape
+(`AssumedShape`, `DeferredShape`, or `AssumedRank`) are passed by base address
+(`!fir.ref<T>`) instead of by `CFI_cdesc_t *` (`!fir.box<T>`). `ALLOCATABLE`,
+`POINTER`, polymorphic, coarray, and `BIND(C)` assumed-length character dummies
+are unaffected and keep the standard Fortran 2018 lowering.
+
+```fortran
+interface
+  attributes(global) subroutine f(d, n) bind(c, name='f')
+    use iso_c_binding
+    real(c_float), dimension(:), device :: d
+    integer, value :: n
+  end subroutine
+end interface
+! interoperates with: extern "C" __global__ void f(float *d, int n);
+```
+
+Reason: Fortran 2018 ยง18.3.7 prescribes a CFI descriptor for these shape
+attributes under `BIND(C)`, but the
+[`cudaLaunchKernel`](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__EXECUTION.html)
+ABI requires `args[i]` to point to a value of the type the C kernel declares
+for parameter `i`. A descriptor pointer in `args[0]` would be dereferenced as
+`T *` on the device, accessing host descriptor memory and producing illegal
+accesses.
+
+Implementation: `flang/lib/Lower/CallInterface.cpp` in `handleExplicitDummy`,
+gated on `procedure.cudaSubprogramAttrs` and the dummy's shape attributes.
diff --git a/flang/docs/index.md b/flang/docs/index.md
index 2e7150188c8d4..c300b8a215417 100644
--- a/flang/docs/index.md
+++ b/flang/docs/index.md
@@ -65,6 +65,7 @@ on how to get in touch with us and to learn more about the current status.
    Character
    ComplexOperations
    ControlFlowGraph
+   CUDA
    DebugGeneration
    DoConcurrent
    DoConcurrentConversionToOpenMP
diff --git a/flang/test/Lower/CUDA/cuda-kernel-bindc-shape.cuf b/flang/test/Lower/CUDA/cuda-kernel-bindc-shape.cuf
index 0d4d48ae3571d..7f3687244cfdc 100644
--- a/flang/test/Lower/CUDA/cuda-kernel-bindc-shape.cuf
+++ b/flang/test/Lower/CUDA/cuda-kernel-bindc-shape.cuf
@@ -55,6 +55,23 @@ contains
 ! CHECK-LABEL: func.func @_QMmPcall_kernel_grid_global
 ! CHECK: cuf.kernel_launch @kgg<<<{{.*}}>>>(%{{.*}}) : (!fir.ref<!fir.array<?xf32>>)
 
+  ! ---- Positive case 4: assumed-rank ----------------------------------
+  ! AssumedRank shape attribute requires a descriptor under the standard
+  ! BIND(C) ABI; the override forces a base address for the CUDA-C ABI.
+  subroutine call_kernel_assumed_rank(d)
+    real, device :: d(..)
+    interface
+      attributes(global) subroutine krank(a) bind(c, name='krank')
+        use iso_c_binding
+        real(c_float), dimension(..), device :: a
+      end subroutine
+    end interface
+    call krank<<<8, 128>>>(d)
+  end subroutine
+
+! CHECK-LABEL: func.func @_QMmPcall_kernel_assumed_rank
+! CHECK: cuf.kernel_launch @krank<<<{{.*}}>>>(%{{.*}}) : (!fir.ref<{{.*}}>)
+
   ! ---- Negative case 1: attributes(global) without bind(c) -----------
   ! Pure CUDA Fortran kernel; descriptor ABI preserved.
   subroutine call_kernel_no_bindc(d)
@@ -103,8 +120,8 @@ contains
 ! CHECK: cuf.kernel_launch @kstar<<<{{.*}}>>>(%{{.*}}) : (!fir.ref<!fir.array<?xf32>>)
 
   ! ---- Negative case 4: allocatable assumed-shape -------------------
-  ! Allocatable is handled before the descriptor branch; remains MutableBox
-  ! at the kernel-launch site.
+  ! Allocatable goes through the MutableBox branch in handleExplicitDummy
+  ! before the override is reachable.
   subroutine call_kernel_allocatable(d)
     real, allocatable, device :: d(:)
     interface
@@ -119,4 +136,20 @@ contains
 ! CHECK-LABEL: func.func @_QMmPcall_kernel_allocatable
 ! CHECK: cuf.kernel_launch @kalloc<<<{{.*}}>>>(%{{.*}}) : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
 
+  ! ---- Negative case 5: pointer assumed-shape -----------------------
+  ! Pointer also goes through the MutableBox branch, same as allocatable.
+  subroutine call_kernel_pointer(d)
+    real, pointer, device :: d(:)
+    interface
+      attributes(global) subroutine kptr(a) bind(c, name='kptr')
+        use iso_c_binding
+        real(c_float), dimension(:), pointer, device :: a
+      end subroutine
+    end interface
+    call kptr<<<8, 128>>>(d)
+  end subroutine
+
+! CHECK-LABEL: func.func @_QMmPcall_kernel_pointer
+! CHECK: cuf.kernel_launch @kptr<<<{{.*}}>>>(%{{.*}}) : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>)
+
 end module



More information about the flang-commits mailing list