[flang-commits] [flang] [flang][cuda] Add CUFDeviceFuncTransform pass (PR #174487)

Mon Jan 5 13:47:20 PST 2026

https://github.com/clementval created https://github.com/llvm/llvm-project/pull/174487

This pass transform CUDA Fortran func.func to gpu.func and put them in the gpu.module. 

>From fca025148d41f1f7d6e0bcbc1c4309a4d1f83e67 Mon Sep 17 00:00:00 2001
From: Valentin Clement <clementval at gmail.com>
Date: Mon, 5 Jan 2026 13:33:51 -0800
Subject: [PATCH] [flang][cuda] Add CUFDeviceFuncTransform pass

---
 .../flang/Optimizer/Transforms/Passes.td      |  10 +
 flang/lib/Optimizer/Transforms/CMakeLists.txt |   1 +
 .../CUDA/CUFDeviceFuncTransform.cpp           | 248 ++++++++++++++++++
 .../CUDA/cuda-device-func-transform-cc90.mlir |   8 +
 .../Fir/CUDA/cuda-device-func-transform.mlir  | 165 ++++++++++++
 5 files changed, 432 insertions(+)
 create mode 100644 flang/lib/Optimizer/Transforms/CUDA/CUFDeviceFuncTransform.cpp
 create mode 100644 flang/test/Fir/CUDA/cuda-device-func-transform-cc90.mlir
 create mode 100644 flang/test/Fir/CUDA/cuda-device-func-transform.mlir

diff --git a/flang/include/flang/Optimizer/Transforms/Passes.td b/flang/include/flang/Optimizer/Transforms/Passes.td
index a2a3341bfa667..47ffc4be93b33 100644
--- a/flang/include/flang/Optimizer/Transforms/Passes.td
+++ b/flang/include/flang/Optimizer/Transforms/Passes.td
@@ -523,6 +523,16 @@ def CUFComputeSharedMemoryOffsetsAndSize
   ];
 }
 
+def CUFDeviceFuncTransform
+    : Pass<"cuf-transform-device-func", "::mlir::ModuleOp"> {
+  let summary = "Transform device function to GPU func";
+  let dependentDialects = ["mlir::gpu::GPUDialect",
+                           "mlir::cf::ControlFlowDialect",
+                           "mlir::NVVM::NVVMDialect"];
+  let options = [Option<"computeCap", "compute-capability", "int",
+                        /*default=*/"0", "CUDA compute capability version">];
+}
+
 def CUFLaunchAttachAttr : Pass<"cuf-launch-attach-attr", ""> {
   let summary = "Attach CUDA attribute to CUF kernel generated launch";
   let description = [{
diff --git a/flang/lib/Optimizer/Transforms/CMakeLists.txt b/flang/lib/Optimizer/Transforms/CMakeLists.txt
index 1e2d6e8f15e92..4496e80aa7c40 100644
--- a/flang/lib/Optimizer/Transforms/CMakeLists.txt
+++ b/flang/lib/Optimizer/Transforms/CMakeLists.txt
@@ -10,6 +10,7 @@ add_flang_library(FIRTransforms
   ConstantArgumentGlobalisation.cpp
   ControlFlowConverter.cpp
   CUDA/CUFAllocationConversion.cpp
+  CUDA/CUFDeviceFuncTransform.cpp
   CUDA/CUFLaunchAttachAttr.cpp
   CUDA/CUFPredefinedVarToGPU.cpp
   CUFAddConstructor.cpp
diff --git a/flang/lib/Optimizer/Transforms/CUDA/CUFDeviceFuncTransform.cpp b/flang/lib/Optimizer/Transforms/CUDA/CUFDeviceFuncTransform.cpp
new file mode 100644
index 0000000000000..4be174fe7a4ea
--- /dev/null
+++ b/flang/lib/Optimizer/Transforms/CUDA/CUFDeviceFuncTransform.cpp
@@ -0,0 +1,248 @@
+//===-- CUFDeviceFuncTransform.cpp ----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "flang/Optimizer/Builder/CUFCommon.h"
+#include "flang/Optimizer/Builder/Todo.h"
+#include "flang/Optimizer/Dialect/CUF/CUFOps.h"
+#include "flang/Optimizer/Dialect/FIRAttr.h"
+#include "flang/Optimizer/Dialect/FIRDialect.h"
+#include "flang/Optimizer/Dialect/FIROpsSupport.h"
+#include "flang/Optimizer/Support/InternalNames.h"
+#include "flang/Optimizer/Transforms/Passes.h"
+#include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/Index/IR/IndexDialect.h"
+#include "mlir/Dialect/Index/IR/IndexOps.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
+#include "mlir/IR/IRMapping.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Transforms/RegionUtils.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/StringSet.h"
+
+namespace fir {
+#define GEN_PASS_DEF_CUFDEVICEFUNCTRANSFORM
+#include "flang/Optimizer/Transforms/Passes.h.inc"
+} // namespace fir
+
+using namespace mlir;
+
+namespace {
+
+class CUFDeviceFuncTransform
+    : public fir::impl::CUFDeviceFuncTransformBase<CUFDeviceFuncTransform> {
+  using CUFDeviceFuncTransformBase<
+      CUFDeviceFuncTransform>::CUFDeviceFuncTransformBase;
+
+  static gpu::GPUFuncOp createGPUFuncOp(mlir::func::FuncOp funcOp,
+                                        bool isGlobal, int computeCap) {
+    mlir::OpBuilder builder(funcOp.getContext());
+
+    mlir::Region &funcOpBody = funcOp.getBody();
+    SetVector<Value> operands;
+    for (mlir::Value operand : funcOp.getArguments())
+      operands.insert(operand);
+
+    llvm::SmallVector<mlir::Type> funcOperandTypes;
+    llvm::SmallVector<mlir::Type> funcResultTypes;
+    funcOperandTypes.reserve(funcOp.getArgumentTypes().size());
+    funcResultTypes.reserve(funcOp.getResultTypes().size());
+    for (mlir::Type opTy : funcOp.getArgumentTypes())
+      funcOperandTypes.push_back(opTy);
+    for (mlir::Type resTy : funcOp.getResultTypes())
+      funcResultTypes.push_back(resTy);
+
+    mlir::Location loc = funcOp.getLoc();
+
+    mlir::FunctionType type = mlir::FunctionType::get(
+        funcOp.getContext(), funcOperandTypes, funcResultTypes);
+
+    auto deviceFuncOp =
+        gpu::GPUFuncOp::create(builder, loc, funcOp.getName(), type,
+                               mlir::TypeRange{}, mlir::TypeRange{});
+    if (isGlobal)
+      deviceFuncOp->setAttr(gpu::GPUDialect::getKernelFuncAttrName(),
+                            builder.getUnitAttr());
+
+    mlir::Region &deviceFuncBody = deviceFuncOp.getBody();
+    mlir::Block &entryBlock = deviceFuncBody.front();
+
+    mlir::IRMapping map;
+    for (const auto &operand : enumerate(operands))
+      map.map(operand.value(), entryBlock.getArgument(operand.index()));
+
+    funcOpBody.cloneInto(&deviceFuncBody, map);
+
+    deviceFuncOp.walk([](func::ReturnOp op) {
+      mlir::OpBuilder replacer(op);
+      gpu::ReturnOp gpuReturnOp = gpu::ReturnOp::create(replacer, op.getLoc());
+      gpuReturnOp->setOperands(op.getOperands());
+      op.erase();
+    });
+
+    mlir::Block &funcOpEntry = funcOp.front();
+    mlir::Block *clonedFuncOpEntry = map.lookup(&funcOpEntry);
+
+    entryBlock.getOperations().splice(entryBlock.getOperations().end(),
+                                      clonedFuncOpEntry->getOperations());
+    clonedFuncOpEntry->erase();
+
+    auto launchBoundsAttr =
+        funcOp.getOperation()->getAttrOfType<cuf::LaunchBoundsAttr>(
+            cuf::getLaunchBoundsAttrName());
+    if (launchBoundsAttr) {
+      auto maxTPB = launchBoundsAttr.getMaxTPB().getInt();
+      auto maxntid =
+          builder.getDenseI32ArrayAttr({static_cast<int32_t>(maxTPB), 1, 1});
+      deviceFuncOp->setAttr(NVVM::NVVMDialect::getMaxntidAttrName(), maxntid);
+      deviceFuncOp->setAttr(NVVM::NVVMDialect::getMinctasmAttrName(),
+                            launchBoundsAttr.getMinBPM());
+      if (computeCap >= 90 && launchBoundsAttr.getUpperBoundClusterSize())
+        deviceFuncOp->setAttr(NVVM::NVVMDialect::getClusterMaxBlocksAttrName(),
+                              launchBoundsAttr.getUpperBoundClusterSize());
+    }
+
+    return deviceFuncOp;
+  }
+
+  static void createHostStub(mlir::func::FuncOp funcOp,
+                             mlir::SymbolTable &symTab, mlir::ModuleOp mod) {
+    mlir::Location loc = funcOp.getLoc();
+    mlir::OpBuilder modBuilder(mod.getBodyRegion());
+    modBuilder.setInsertionPointToEnd(mod.getBody());
+    auto emptyStub = func::FuncOp::create(modBuilder, loc, funcOp.getName(),
+                                          funcOp.getFunctionType());
+    emptyStub.setVisibility(funcOp.getVisibility());
+    emptyStub->setAttrs(funcOp->getAttrs());
+    auto entryBlock = emptyStub.addEntryBlock();
+    modBuilder.setInsertionPointToEnd(entryBlock);
+    func::ReturnOp::create(modBuilder, loc);
+
+    symTab.erase(funcOp);
+    symTab.insert(emptyStub);
+  }
+
+  static bool isDeviceFunc(mlir::func::FuncOp funcOp) {
+    if (auto cudaProcAttr =
+            funcOp.getOperation()->getAttrOfType<cuf::ProcAttributeAttr>(
+                cuf::getProcAttrName()))
+      if (cudaProcAttr.getValue() == cuf::ProcAttribute::Device ||
+          cudaProcAttr.getValue() == cuf::ProcAttribute::Global ||
+          cudaProcAttr.getValue() == cuf::ProcAttribute::GridGlobal ||
+          cudaProcAttr.getValue() == cuf::ProcAttribute::HostDevice)
+        return true;
+    return false;
+  }
+
+  void runOnOperation() override {
+    // Working on Module operation because inserting/removing function from the
+    // module is not thread-safe.
+    ModuleOp mod = getOperation();
+    mlir::SymbolTable symbolTable(getOperation());
+
+    auto *ctx = getOperation().getContext();
+    mlir::OpBuilder builder(ctx);
+
+    gpu::GPUModuleOp gpuMod = cuf::getOrCreateGPUModule(mod, symbolTable);
+    mlir::SymbolTable gpuModSymTab(gpuMod);
+
+    llvm::SetVector<mlir::func::FuncOp> funcsToClone;
+    llvm::SetVector<mlir::func::FuncOp> deviceFuncs;
+    llvm::SetVector<mlir::func::FuncOp> keepInModule;
+    llvm::StringSet<> deviceFuncNames;
+
+    // Look for all function to migrate to the GPU module.
+    mod.walk([&](mlir::func::FuncOp op) {
+      if (isDeviceFunc(op)) {
+        deviceFuncs.insert(op);
+        deviceFuncNames.insert(op.getSymName());
+      }
+    });
+
+    auto processCallOp = [&](fir::CallOp op) {
+      if (op.getCallee()) {
+        auto func = symbolTable.lookup<mlir::func::FuncOp>(
+            op.getCallee()->getLeafReference());
+        if (deviceFuncs.count(func) == 0)
+          funcsToClone.insert(func);
+      }
+    };
+
+    // Gather all function called by device functions.
+    for (auto funcOp : deviceFuncs) {
+      funcOp.walk([&](fir::CallOp op) { processCallOp(op); });
+      funcOp.walk([&](fir::DispatchOp op) {
+        TODO(op.getLoc(), "type-bound procedure call with dynamic dispatch "
+                          "in device procedure");
+      });
+    }
+
+    // Functions that are referenced in a derived-type binding table must be
+    // kept in the host module to avoid LLVM dialect verification errors.
+    for (auto globalOp : mod.getOps<fir::GlobalOp>()) {
+      if (globalOp.getName().contains(fir::kBindingTableSeparator)) {
+        globalOp.walk([&](fir::AddrOfOp addrOfOp) {
+          if (deviceFuncNames.contains(addrOfOp.getSymbol().getLeafReference()))
+            keepInModule.insert(
+                *llvm::find_if(deviceFuncs, [&](mlir::func::FuncOp f) {
+                  return f.getSymName() ==
+                         addrOfOp.getSymbol().getLeafReference();
+                }));
+        });
+      }
+    }
+
+    // Gather all functions called by CUF kernels.
+    mod.walk([&](cuf::KernelOp kernelOp) {
+      kernelOp.walk([&](fir::CallOp op) { processCallOp(op); });
+      kernelOp.walk([&](fir::DispatchOp op) {
+        TODO(op.getLoc(),
+             "type-bound procedure call with dynamic dispatch in cuf kernel");
+      });
+    });
+
+    for (auto funcOp : funcsToClone)
+      gpuModSymTab.insert(funcOp->clone());
+
+    for (auto funcOp : deviceFuncs) {
+      auto cudaProcAttr =
+          funcOp.getOperation()->getAttrOfType<cuf::ProcAttributeAttr>(
+              cuf::getProcAttrName());
+      auto isGlobal = cudaProcAttr.getValue() == cuf::ProcAttribute::Global ||
+                      cudaProcAttr.getValue() == cuf::ProcAttribute::GridGlobal;
+      if (funcOp.isDeclaration()) {
+        mlir::Operation *clonedFuncOp = funcOp->clone();
+        if (isGlobal) {
+          clonedFuncOp->setAttr(gpu::GPUDialect::getKernelFuncAttrName(),
+                                builder.getUnitAttr());
+          clonedFuncOp->removeAttr(cuf::getProcAttrName());
+        }
+        gpuModSymTab.insert(clonedFuncOp);
+      } else {
+        gpu::GPUFuncOp deviceFuncOp =
+            createGPUFuncOp(funcOp, isGlobal, computeCap);
+        gpuModSymTab.insert(deviceFuncOp);
+
+        if (cudaProcAttr.getValue() != cuf::ProcAttribute::HostDevice) {
+          // If the function is a global, we need to keep the host side
+          // declaration for the kernel registration. Currently we just
+          // erase its body but in the future, the body should be rewritten
+          // to be able to launch CUDA Fortran kernel from C code.
+          if (isGlobal || keepInModule.contains(funcOp))
+            createHostStub(funcOp, symbolTable, mod);
+          else
+            funcOp.erase();
+        }
+      }
+    }
+  }
+};
+
+} // end anonymous namespace
diff --git a/flang/test/Fir/CUDA/cuda-device-func-transform-cc90.mlir b/flang/test/Fir/CUDA/cuda-device-func-transform-cc90.mlir
new file mode 100644
index 0000000000000..793388fc9dd5d
--- /dev/null
+++ b/flang/test/Fir/CUDA/cuda-device-func-transform-cc90.mlir
@@ -0,0 +1,8 @@
+// RUN: fir-opt --split-input-file --cuf-transform-device-func="compute-capability=90" %s | FileCheck %s
+
+func.func @_QPsub_maxtnid() attributes {cuf.launch_bounds = #cuf.launch_bounds<maxTPB = 256 : i64, minBPM = 2 : i64, upperBoundClusterSize = 3 : i64>, cuf.proc_attr = #cuf.cuda_proc<global>} {
+  %cst = arith.constant 2.000000e+00 : f32
+  return
+}
+
+// CHECK: gpu.func @_QPsub_maxtnid() kernel attributes {nvvm.cluster_max_blocks = 3 : i64, nvvm.maxntid = array<i32: 256, 1, 1>, nvvm.minctasm = 2 : i64}
diff --git a/flang/test/Fir/CUDA/cuda-device-func-transform.mlir b/flang/test/Fir/CUDA/cuda-device-func-transform.mlir
new file mode 100644
index 0000000000000..1ce21aeecffe5
--- /dev/null
+++ b/flang/test/Fir/CUDA/cuda-device-func-transform.mlir
@@ -0,0 +1,165 @@
+// RUN: fir-opt --split-input-file --cuf-transform-device-func %s | FileCheck %s
+
+func.func @_QPsub_device1() attributes {cuf.proc_attr = #cuf.cuda_proc<device>} {
+  return
+}
+
+func.func @_QPsub_device2(%arg0: !fir.ref<f32> {fir.bindc_name = "i", cuf.proc_attr = #cuf.cuda_proc<device>}) attributes {cuf.proc_attr = #cuf.cuda_proc<device>} {
+  %0 = fir.declare %arg0 {uniq_name = "_QFsub1Ei"} : (!fir.ref<f32>) -> !fir.ref<f32>
+  %cst = arith.constant 2.000000e+00 : f32
+  fir.store %cst to %0 : !fir.ref<f32>
+  return
+}
+
+func.func @_QPsub_global1() attributes {cuf.proc_attr = #cuf.cuda_proc<global>} {
+  %cst = arith.constant 2.000000e+00 : f32
+  return
+}
+
+func.func @_QPsub_host_device1() attributes {cuf.proc_attr = #cuf.cuda_proc<host_device>} {
+  return
+}
+
+func.func private @_QMmod1Psub1(!fir.ref<!fir.array<10xi32>> {cuf.data_attr = #cuf.cuda<device>}) attributes {cuf.proc_attr = #cuf.cuda_proc<global>}
+
+// CHECK-LABEL:  module attributes {gpu.container_module}
+
+// CHECK-NOT: func.func @_QPsub_device1()
+
+// CHECK-NOT: func.func @_QPsub_device2(%arg0: !fir.ref<f32> {fir.bindc_name = "i", cuf.data_attr = #cuf.cuda<device>}) attributes {cuf.proc_attr = #cuf.cuda_proc<device>}
+
+// CHECK: func.func @_QPsub_host_device1()
+
+// CHECK-LABEL: gpu.module @cuda_device_mod
+
+// CHECK: gpu.func @_QPsub_device1()
+
+// CHECK: gpu.func @_QPsub_device2(%[[ARG0:.*]]: !fir.ref<f32>) {
+// CHECK:   %[[DECL:.*]] = fir.declare %[[ARG0]] {uniq_name = "_QFsub1Ei"} : (!fir.ref<f32>) -> !fir.ref<f32>
+// CHECK:   %[[CST:.*]] = arith.constant 2.000000e+00 : f32
+// CHECK:   fir.store %[[CST]] to %[[DECL]] : !fir.ref<f32>
+// CHECK:   gpu.return
+// CHECK: }
+
+// CHECK: gpu.func @_QPsub_global1() kernel
+
+// CHECK: gpu.func @_QPsub_host_device1()
+
+// CHECK: func.func private @_QMmod1Psub1(!fir.ref<!fir.array<10xi32>> {cuf.data_attr = #cuf.cuda<device>}) attributes {gpu.kernel}
+
+// CHECK: func.func @_QPsub_global1() attributes {cuf.proc_attr = #cuf.cuda_proc<global>}
+// CHECK-NEXT: return
+
+// -----
+
+func.func @_QPdevsub() -> i32 attributes {cuf.proc_attr = #cuf.cuda_proc<device>} {
+  %c1_i32 = arith.constant 1 : i32
+  return %c1_i32 : i32
+}
+
+// CHECK-LABEL: gpu.module @cuda_device_mod
+
+// CHECK: gpu.func @_QPdevsub() -> i32
+
+// CHECK: gpu.return %{{.*}} : i32
+
+// -----
+
+func.func @hostFuncUsedInDevice() {
+  return
+}
+
+func.func @_QPsub_device4() attributes {cuf.proc_attr = #cuf.cuda_proc<device>} {
+  fir.call @hostFuncUsedInDevice() : () -> ()
+  return
+}
+
+// CHECK-LABEL: module attributes {gpu.container_module}
+// CHECK: func.func @hostFuncUsedInDevice()
+// CHECK: gpu.module @cuda_device_mod
+// CHECK: func.func @hostFuncUsedInDevice()
+// CHECK: gpu.func @_QPsub_device4()
+// CHECK: fir.call @hostFuncUsedInDevice() : () -> ()
+
+// -----
+
+func.func @_QPsub_grid_global1() attributes {cuf.proc_attr = #cuf.cuda_proc<grid_global>} {
+  %cst = arith.constant 2.000000e+00 : f32
+  return
+}
+
+// CHEC-LABEL: gpu.module @cuda_device_mod {
+// CHECK: gpu.func @_QPsub_grid_global1() kernel
+
+// CHECK-LABEL: func.func @_QPsub_grid_global1()
+
+// -----
+
+func.func @hostFuncUsedInDevice() {
+  return
+}
+
+func.func @_QPsub_host() {
+  %c1 = arith.constant 1 : index
+  %c1_i32 = arith.constant 1 : i32
+  cuf.kernel<<<%c1_i32, %c1_i32>>> (%arg0 : index) = (%c1 : index) to (%c1 : index)  step (%c1 : index) {
+    fir.call @hostFuncUsedInDevice() : () -> ()
+    "fir.end"() : () -> ()
+  }
+  return
+}
+
+// CHECK-LABEL: func.func @hostFuncUsedInDevice()
+// CHECK-LABEL: gpu.module @cuda_device_mod
+// CHECK: func.func @hostFuncUsedInDevice()
+
+// -----
+
+func.func @_QPpartialsumshflshflr8(%arg0: !fir.ref<!fir.array<?xf64>> {cuf.data_attr = #cuf.cuda<device>, fir.bindc_name = "a"}, %arg1: i32 {fir.bindc_name = "n"}) attributes {cuf.proc_attr = #cuf.cuda_proc<global>} {
+  %c2_i32 = arith.constant 2 : i32
+  %c10_i32 = arith.constant 10 : i32
+  %c0 = arith.constant 0 : index
+  %0 = fir.dummy_scope : !fir.dscope
+  %1 = fir.alloca i32
+  fir.store %arg1 to %1 : !fir.ref<i32>
+  %2 = fir.declare %1 dummy_scope %0 {fortran_attrs = #fir.var_attrs<intent_in, value>, uniq_name = "_QFpartialsumshflshflr8En"} : (!fir.ref<i32>, !fir.dscope) -> !fir.ref<i32>
+  %9 = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFpartialsumshflshflr8Ei"}
+  %10 = fir.declare %9 {uniq_name = "_QFpartialsumshflshflr8Ei"} : (!fir.ref<i32>) -> !fir.ref<i32>
+  %13 = fir.alloca i32 {bindc_name = "__builtin_warpsize", uniq_name = "_QM__fortran_builtinsEC__builtin_warpsize"}
+  %14 = fir.declare %13 {uniq_name = "_QM__fortran_builtinsEC__builtin_warpsize"} : (!fir.ref<i32>) -> !fir.ref<i32>
+  %15 = fir.load %2 : !fir.ref<i32>
+  %16 = fir.convert %15 : (i32) -> index
+  %17 = arith.cmpi sgt, %16, %c0 : index
+  %18 = arith.select %17, %16, %c0 : index
+  %19 = fir.shape %18 : (index) -> !fir.shape<1>
+  %20 = fir.declare %arg0(%19) dummy_scope %0 {data_attr = #cuf.cuda<device>, uniq_name = "_QFpartialsumshflshflr8Ea"} : (!fir.ref<!fir.array<?xf64>>, !fir.shape<1>, !fir.dscope) -> !fir.ref<!fir.array<?xf64>>
+  cf.br ^bb1
+^bb1:  // 2 preds: ^bb0, ^bb2
+  %21 = fir.load %10 : !fir.ref<i32>
+  %22 = arith.cmpi slt, %21, %c10_i32 : i32
+  cf.cond_br %22, ^bb2, ^bb3
+^bb2:  // pred: ^bb1
+  %23 = fir.load %10 : !fir.ref<i32>
+  %24 = arith.muli %23, %c2_i32 : i32
+  %25 = fir.convert %24 : (i32) -> f64
+  %26 = fir.convert %23 : (i32) -> i64
+  %27 = fir.array_coor %20(%19) %26 : (!fir.ref<!fir.array<?xf64>>, !fir.shape<1>, i64) -> !fir.ref<f64>
+  fir.store %25 to %27 : !fir.ref<f64>
+  cf.br ^bb1
+^bb3:  // pred: ^bb1
+  return
+}
+
+// CHECK-LABEL: gpu.module @cuda_device_mod
+// CHECK: gpu.func @_QPpartialsumshflshflr8(%arg0: !fir.ref<!fir.array<?xf64>>, %arg1: i32) kernel
+      
+// CHECK: func.func @_QPpartialsumshflshflr8
+
+// -----
+
+func.func @_QPsub_maxtnid() attributes {cuf.launch_bounds = #cuf.launch_bounds<maxTPB = 256 : i64, minBPM = 2 : i64, upperBoundClusterSize = 3 : i64>, cuf.proc_attr = #cuf.cuda_proc<global>} {
+  %cst = arith.constant 2.000000e+00 : f32
+  return
+}
+
+// CHECK: gpu.func @_QPsub_maxtnid() kernel attributes {nvvm.maxntid = array<i32: 256, 1, 1>, nvvm.minctasm = 2 : i64}