[flang-commits] [flang] [flang][cuda] Pass assumed-shape arrays of bind(c) attributes(global) kernels by base address (PR #199313)

Zhen Wang via flang-commits flang-commits at lists.llvm.org
Fri May 22 20:40:56 PDT 2026


https://github.com/wangzpgi created https://github.com/llvm/llvm-project/pull/199313

For a `bind(c) attributes(global)` kernel with an assumed-shape array dummy, the dummy is lowered as a CFI descriptor (`!fir.box<...>`). `cudaLaunchKernel` cannot deliver a descriptor to the device, so the C `__global__` reads a pointer to a 48-byte descriptor as `float *` and writes garbage.
Pass these dummies as a base address (`!fir.ref<!fir.array<?xT>>`) instead, matching what `dimension(*)` already does. Allocatable / pointer / polymorphic are unaffected.

Before — for `mykernel(d_arr, n) bind(c) attributes(global)` taking `real(c_float), dimension(:), device :: d_arr`:

```
cuf.kernel_launch @mykernel<<<...>>>(%box, %n) : (!fir.box<!fir.array<?xf32>>, i32)
```
After:
```
%data = fir.box_addr %src : (!fir.box<...>) -> !fir.ref<!fir.array<?xf32>>
cuf.kernel_launch @mykernel<<<...>>>(%data, %n) : (!fir.ref<!fir.array<?xf32>>, i32)
```

>From 21ddaedb3e1dae1b5938b4e0aa8f001b0b541302 Mon Sep 17 00:00:00 2001
From: Zhen Wang <zhenw at nvidia.com>
Date: Fri, 22 May 2026 20:32:16 -0700
Subject: [PATCH 1/2] Pass assumed-shape arrays of bind(c) attributes(global)
 kernels by base address

---
 flang/lib/Lower/CallInterface.cpp             |  26 +++-
 .../Lower/CUDA/cuda-kernel-bindc-shape.cuf    | 122 ++++++++++++++++++
 2 files changed, 146 insertions(+), 2 deletions(-)
 create mode 100644 flang/test/Lower/CUDA/cuda-kernel-bindc-shape.cuf

diff --git a/flang/lib/Lower/CallInterface.cpp b/flang/lib/Lower/CallInterface.cpp
index e9059581c690a..9f57e72079577 100644
--- a/flang/lib/Lower/CallInterface.cpp
+++ b/flang/lib/Lower/CallInterface.cpp
@@ -873,6 +873,12 @@ class Fortran::lower::CallInterfaceImpl {
   void buildExplicitInterface(
       const Fortran::evaluate::characteristics::Procedure &procedure) {
     bool isBindC = procedure.IsBindC();
+    bool isCudaGlobalKernel =
+        procedure.cudaSubprogramAttrs &&
+        (*procedure.cudaSubprogramAttrs ==
+             Fortran::common::CUDASubprogramAttrs::Global ||
+         *procedure.cudaSubprogramAttrs ==
+             Fortran::common::CUDASubprogramAttrs::Grid_Global);
     // Handle result
     if (const std::optional<Fortran::evaluate::characteristics::FunctionResult>
             &result = procedure.functionResult) {
@@ -899,7 +905,7 @@ class Fortran::lower::CallInterfaceImpl {
                   handleImplicitDummy(&argCharacteristics, dummy, entity);
                 else
                   handleExplicitDummy(&argCharacteristics, dummy, entity,
-                                      isBindC);
+                                      isBindC, isCudaGlobalKernel);
               },
               [&](const Fortran::evaluate::characteristics::DummyProcedure
                       &dummy) {
@@ -1111,7 +1117,7 @@ class Fortran::lower::CallInterfaceImpl {
   void handleExplicitDummy(
       const DummyCharacteristics *characteristics,
       const Fortran::evaluate::characteristics::DummyDataObject &obj,
-      const FortranEntity &entity, bool isBindC) {
+      const FortranEntity &entity, bool isBindC, bool isCudaGlobalKernel) {
     using Attrs = Fortran::evaluate::characteristics::DummyDataObject::Attr;
 
     bool isValueAttr = false;
@@ -1171,6 +1177,22 @@ class Fortran::lower::CallInterfaceImpl {
                     attrs);
       addPassedArg(PassEntityBy::MutableBox, entity, characteristics);
     } else if (obj.IsPassedByDescriptor(isBindC)) {
+      // bind(c) attributes(global): pass assumed-shape arrays by base address
+      // (cudaLaunchKernel cannot deliver a CFI descriptor to the device).
+      // corank > 0 is already TODO'd above, so no need to re-check here.
+      using ShapeAttr = Fortran::evaluate::characteristics::TypeAndShape::Attr;
+      constexpr Fortran::evaluate::characteristics::TypeAndShape::Attrs
+          shapeOnlyDescriptor{ShapeAttr::AssumedShape, ShapeAttr::DeferredShape,
+                              ShapeAttr::AssumedRank};
+      if (isBindC && isCudaGlobalKernel &&
+          (obj.type.attrs() & shapeOnlyDescriptor).any() &&
+          !obj.type.type().IsPolymorphic()) {
+        mlir::Type passType = fir::ReferenceType::get(type);
+        addFirOperand(passType, nextPassedArgPosition(), Property::BaseAddress,
+                      attrs);
+        addPassedArg(PassEntityBy::BaseAddress, entity, characteristics);
+        return;
+      }
       // Pass as fir.box or fir.class
       addFirOperand(boxType, nextPassedArgPosition(), Property::Box, attrs);
       addPassedArg(PassEntityBy::Box, entity, characteristics);
diff --git a/flang/test/Lower/CUDA/cuda-kernel-bindc-shape.cuf b/flang/test/Lower/CUDA/cuda-kernel-bindc-shape.cuf
new file mode 100644
index 0000000000000..0d4d48ae3571d
--- /dev/null
+++ b/flang/test/Lower/CUDA/cuda-kernel-bindc-shape.cuf
@@ -0,0 +1,122 @@
+! RUN: bbc -emit-hlfir -fcuda %s -o - | FileCheck %s
+
+! Test that bind(c) attributes(global) kernels with assumed-shape /
+! deferred-shape / assumed-rank array dummies are lowered to a raw base
+! address (!fir.ref<...>) instead of a CFI descriptor (!fir.box<...>).
+! cudaLaunchKernel cannot deliver a CFI descriptor to the device, so the
+! C-interop ABI of these arguments must be a plain pointer.
+
+module m
+contains
+
+  ! ---- Positive case 1: assumed-shape, 1-D ----------------------------
+  subroutine call_kernel_1d(d)
+    real, device :: d(:)
+    interface
+      attributes(global) subroutine k1d(a, n) bind(c, name='k1d')
+        use iso_c_binding
+        real(c_float), dimension(:), device :: a
+        integer, value :: n
+      end subroutine
+    end interface
+    call k1d<<<8, 128>>>(d, 1024)
+  end subroutine
+
+! CHECK-LABEL: func.func @_QMmPcall_kernel_1d
+! CHECK: cuf.kernel_launch @k1d<<<{{.*}}>>>(%{{.*}}, %{{.*}}) : (!fir.ref<!fir.array<?xf32>>, i32)
+
+  ! ---- Positive case 2: assumed-shape, 2-D ----------------------------
+  subroutine call_kernel_2d(d)
+    real, device :: d(:,:)
+    interface
+      attributes(global) subroutine k2d(a) bind(c, name='k2d')
+        use iso_c_binding
+        real(c_float), dimension(:,:), device :: a
+      end subroutine
+    end interface
+    call k2d<<<8, 128>>>(d)
+  end subroutine
+
+! CHECK-LABEL: func.func @_QMmPcall_kernel_2d
+! CHECK: cuf.kernel_launch @k2d<<<{{.*}}>>>(%{{.*}}) : (!fir.ref<!fir.array<?x?xf32>>)
+
+  ! ---- Positive case 3: grid_global -----------------------------------
+  subroutine call_kernel_grid_global(d)
+    real, device :: d(:)
+    interface
+      attributes(grid_global) subroutine kgg(a) bind(c, name='kgg')
+        use iso_c_binding
+        real(c_float), dimension(:), device :: a
+      end subroutine
+    end interface
+    call kgg<<<8, 128>>>(d)
+  end subroutine
+
+! CHECK-LABEL: func.func @_QMmPcall_kernel_grid_global
+! CHECK: cuf.kernel_launch @kgg<<<{{.*}}>>>(%{{.*}}) : (!fir.ref<!fir.array<?xf32>>)
+
+  ! ---- Negative case 1: attributes(global) without bind(c) -----------
+  ! Pure CUDA Fortran kernel; descriptor ABI preserved.
+  subroutine call_kernel_no_bindc(d)
+    real, device :: d(:)
+    interface
+      attributes(global) subroutine knobindc(a)
+        real, dimension(:), device :: a
+      end subroutine
+    end interface
+    call knobindc<<<8, 128>>>(d)
+  end subroutine
+
+! CHECK-LABEL: func.func @_QMmPcall_kernel_no_bindc
+! CHECK: cuf.kernel_launch @_QPknobindc<<<{{.*}}>>>(%{{.*}}) : (!fir.box<!fir.array<?xf32>>)
+
+  ! ---- Negative case 2: bind(c) but not attributes(global) ----------
+  ! Regular C-interop function (host); descriptor ABI preserved.
+  subroutine call_host_bindc(d)
+    real :: d(:)
+    interface
+      subroutine hostfn(a) bind(c, name='hostfn')
+        use iso_c_binding
+        real(c_float), dimension(:) :: a
+      end subroutine
+    end interface
+    call hostfn(d)
+  end subroutine
+
+! CHECK-LABEL: func.func @_QMmPcall_host_bindc
+! CHECK: fir.call @hostfn(%{{.*}}) {{.*}}: (!fir.box<!fir.array<?xf32>>) -> ()
+
+  ! ---- Negative case 3: bind(c) attributes(global) + dimension(*) -----
+  ! Already a base-address ABI; just confirm it didn't regress.
+  subroutine call_kernel_assumed_size(d)
+    real, device :: d(*)
+    interface
+      attributes(global) subroutine kstar(a) bind(c, name='kstar')
+        use iso_c_binding
+        real(c_float), dimension(*), device :: a
+      end subroutine
+    end interface
+    call kstar<<<8, 128>>>(d)
+  end subroutine
+
+! CHECK-LABEL: func.func @_QMmPcall_kernel_assumed_size
+! CHECK: cuf.kernel_launch @kstar<<<{{.*}}>>>(%{{.*}}) : (!fir.ref<!fir.array<?xf32>>)
+
+  ! ---- Negative case 4: allocatable assumed-shape -------------------
+  ! Allocatable is handled before the descriptor branch; remains MutableBox
+  ! at the kernel-launch site.
+  subroutine call_kernel_allocatable(d)
+    real, allocatable, device :: d(:)
+    interface
+      attributes(global) subroutine kalloc(a) bind(c, name='kalloc')
+        use iso_c_binding
+        real(c_float), dimension(:), allocatable, device :: a
+      end subroutine
+    end interface
+    call kalloc<<<8, 128>>>(d)
+  end subroutine
+
+! CHECK-LABEL: func.func @_QMmPcall_kernel_allocatable
+! CHECK: cuf.kernel_launch @kalloc<<<{{.*}}>>>(%{{.*}}) : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
+
+end module

>From a00da371c511d0b5e155c0536f9ecdc8060e4969 Mon Sep 17 00:00:00 2001
From: Zhen Wang <zhenw at nvidia.com>
Date: Fri, 22 May 2026 20:40:23 -0700
Subject: [PATCH 2/2] update comment

---
 flang/lib/Lower/CallInterface.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/flang/lib/Lower/CallInterface.cpp b/flang/lib/Lower/CallInterface.cpp
index 9f57e72079577..14e45563ee96f 100644
--- a/flang/lib/Lower/CallInterface.cpp
+++ b/flang/lib/Lower/CallInterface.cpp
@@ -1179,7 +1179,6 @@ class Fortran::lower::CallInterfaceImpl {
     } else if (obj.IsPassedByDescriptor(isBindC)) {
       // bind(c) attributes(global): pass assumed-shape arrays by base address
       // (cudaLaunchKernel cannot deliver a CFI descriptor to the device).
-      // corank > 0 is already TODO'd above, so no need to re-check here.
       using ShapeAttr = Fortran::evaluate::characteristics::TypeAndShape::Attr;
       constexpr Fortran::evaluate::characteristics::TypeAndShape::Attrs
           shapeOnlyDescriptor{ShapeAttr::AssumedShape, ShapeAttr::DeferredShape,



More information about the flang-commits mailing list