[flang-commits] [flang] [flang][cuda] Add CUFDeviceFuncTransform pass (PR #174487)
Valentin Clement バレンタイン クレメン via flang-commits
flang-commits at lists.llvm.org
Mon Jan 5 13:47:20 PST 2026
https://github.com/clementval created https://github.com/llvm/llvm-project/pull/174487
This pass transform CUDA Fortran func.func to gpu.func and put them in the gpu.module.
>From fca025148d41f1f7d6e0bcbc1c4309a4d1f83e67 Mon Sep 17 00:00:00 2001
From: Valentin Clement <clementval at gmail.com>
Date: Mon, 5 Jan 2026 13:33:51 -0800
Subject: [PATCH] [flang][cuda] Add CUFDeviceFuncTransform pass
---
.../flang/Optimizer/Transforms/Passes.td | 10 +
flang/lib/Optimizer/Transforms/CMakeLists.txt | 1 +
.../CUDA/CUFDeviceFuncTransform.cpp | 248 ++++++++++++++++++
.../CUDA/cuda-device-func-transform-cc90.mlir | 8 +
.../Fir/CUDA/cuda-device-func-transform.mlir | 165 ++++++++++++
5 files changed, 432 insertions(+)
create mode 100644 flang/lib/Optimizer/Transforms/CUDA/CUFDeviceFuncTransform.cpp
create mode 100644 flang/test/Fir/CUDA/cuda-device-func-transform-cc90.mlir
create mode 100644 flang/test/Fir/CUDA/cuda-device-func-transform.mlir
diff --git a/flang/include/flang/Optimizer/Transforms/Passes.td b/flang/include/flang/Optimizer/Transforms/Passes.td
index a2a3341bfa667..47ffc4be93b33 100644
--- a/flang/include/flang/Optimizer/Transforms/Passes.td
+++ b/flang/include/flang/Optimizer/Transforms/Passes.td
@@ -523,6 +523,16 @@ def CUFComputeSharedMemoryOffsetsAndSize
];
}
+def CUFDeviceFuncTransform
+ : Pass<"cuf-transform-device-func", "::mlir::ModuleOp"> {
+ let summary = "Transform device function to GPU func";
+ let dependentDialects = ["mlir::gpu::GPUDialect",
+ "mlir::cf::ControlFlowDialect",
+ "mlir::NVVM::NVVMDialect"];
+ let options = [Option<"computeCap", "compute-capability", "int",
+ /*default=*/"0", "CUDA compute capability version">];
+}
+
def CUFLaunchAttachAttr : Pass<"cuf-launch-attach-attr", ""> {
let summary = "Attach CUDA attribute to CUF kernel generated launch";
let description = [{
diff --git a/flang/lib/Optimizer/Transforms/CMakeLists.txt b/flang/lib/Optimizer/Transforms/CMakeLists.txt
index 1e2d6e8f15e92..4496e80aa7c40 100644
--- a/flang/lib/Optimizer/Transforms/CMakeLists.txt
+++ b/flang/lib/Optimizer/Transforms/CMakeLists.txt
@@ -10,6 +10,7 @@ add_flang_library(FIRTransforms
ConstantArgumentGlobalisation.cpp
ControlFlowConverter.cpp
CUDA/CUFAllocationConversion.cpp
+ CUDA/CUFDeviceFuncTransform.cpp
CUDA/CUFLaunchAttachAttr.cpp
CUDA/CUFPredefinedVarToGPU.cpp
CUFAddConstructor.cpp
diff --git a/flang/lib/Optimizer/Transforms/CUDA/CUFDeviceFuncTransform.cpp b/flang/lib/Optimizer/Transforms/CUDA/CUFDeviceFuncTransform.cpp
new file mode 100644
index 0000000000000..4be174fe7a4ea
--- /dev/null
+++ b/flang/lib/Optimizer/Transforms/CUDA/CUFDeviceFuncTransform.cpp
@@ -0,0 +1,248 @@
+//===-- CUFDeviceFuncTransform.cpp ----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "flang/Optimizer/Builder/CUFCommon.h"
+#include "flang/Optimizer/Builder/Todo.h"
+#include "flang/Optimizer/Dialect/CUF/CUFOps.h"
+#include "flang/Optimizer/Dialect/FIRAttr.h"
+#include "flang/Optimizer/Dialect/FIRDialect.h"
+#include "flang/Optimizer/Dialect/FIROpsSupport.h"
+#include "flang/Optimizer/Support/InternalNames.h"
+#include "flang/Optimizer/Transforms/Passes.h"
+#include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/Index/IR/IndexDialect.h"
+#include "mlir/Dialect/Index/IR/IndexOps.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
+#include "mlir/IR/IRMapping.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Transforms/RegionUtils.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/StringSet.h"
+
+namespace fir {
+#define GEN_PASS_DEF_CUFDEVICEFUNCTRANSFORM
+#include "flang/Optimizer/Transforms/Passes.h.inc"
+} // namespace fir
+
+using namespace mlir;
+
+namespace {
+
+class CUFDeviceFuncTransform
+ : public fir::impl::CUFDeviceFuncTransformBase<CUFDeviceFuncTransform> {
+ using CUFDeviceFuncTransformBase<
+ CUFDeviceFuncTransform>::CUFDeviceFuncTransformBase;
+
+ static gpu::GPUFuncOp createGPUFuncOp(mlir::func::FuncOp funcOp,
+ bool isGlobal, int computeCap) {
+ mlir::OpBuilder builder(funcOp.getContext());
+
+ mlir::Region &funcOpBody = funcOp.getBody();
+ SetVector<Value> operands;
+ for (mlir::Value operand : funcOp.getArguments())
+ operands.insert(operand);
+
+ llvm::SmallVector<mlir::Type> funcOperandTypes;
+ llvm::SmallVector<mlir::Type> funcResultTypes;
+ funcOperandTypes.reserve(funcOp.getArgumentTypes().size());
+ funcResultTypes.reserve(funcOp.getResultTypes().size());
+ for (mlir::Type opTy : funcOp.getArgumentTypes())
+ funcOperandTypes.push_back(opTy);
+ for (mlir::Type resTy : funcOp.getResultTypes())
+ funcResultTypes.push_back(resTy);
+
+ mlir::Location loc = funcOp.getLoc();
+
+ mlir::FunctionType type = mlir::FunctionType::get(
+ funcOp.getContext(), funcOperandTypes, funcResultTypes);
+
+ auto deviceFuncOp =
+ gpu::GPUFuncOp::create(builder, loc, funcOp.getName(), type,
+ mlir::TypeRange{}, mlir::TypeRange{});
+ if (isGlobal)
+ deviceFuncOp->setAttr(gpu::GPUDialect::getKernelFuncAttrName(),
+ builder.getUnitAttr());
+
+ mlir::Region &deviceFuncBody = deviceFuncOp.getBody();
+ mlir::Block &entryBlock = deviceFuncBody.front();
+
+ mlir::IRMapping map;
+ for (const auto &operand : enumerate(operands))
+ map.map(operand.value(), entryBlock.getArgument(operand.index()));
+
+ funcOpBody.cloneInto(&deviceFuncBody, map);
+
+ deviceFuncOp.walk([](func::ReturnOp op) {
+ mlir::OpBuilder replacer(op);
+ gpu::ReturnOp gpuReturnOp = gpu::ReturnOp::create(replacer, op.getLoc());
+ gpuReturnOp->setOperands(op.getOperands());
+ op.erase();
+ });
+
+ mlir::Block &funcOpEntry = funcOp.front();
+ mlir::Block *clonedFuncOpEntry = map.lookup(&funcOpEntry);
+
+ entryBlock.getOperations().splice(entryBlock.getOperations().end(),
+ clonedFuncOpEntry->getOperations());
+ clonedFuncOpEntry->erase();
+
+ auto launchBoundsAttr =
+ funcOp.getOperation()->getAttrOfType<cuf::LaunchBoundsAttr>(
+ cuf::getLaunchBoundsAttrName());
+ if (launchBoundsAttr) {
+ auto maxTPB = launchBoundsAttr.getMaxTPB().getInt();
+ auto maxntid =
+ builder.getDenseI32ArrayAttr({static_cast<int32_t>(maxTPB), 1, 1});
+ deviceFuncOp->setAttr(NVVM::NVVMDialect::getMaxntidAttrName(), maxntid);
+ deviceFuncOp->setAttr(NVVM::NVVMDialect::getMinctasmAttrName(),
+ launchBoundsAttr.getMinBPM());
+ if (computeCap >= 90 && launchBoundsAttr.getUpperBoundClusterSize())
+ deviceFuncOp->setAttr(NVVM::NVVMDialect::getClusterMaxBlocksAttrName(),
+ launchBoundsAttr.getUpperBoundClusterSize());
+ }
+
+ return deviceFuncOp;
+ }
+
+ static void createHostStub(mlir::func::FuncOp funcOp,
+ mlir::SymbolTable &symTab, mlir::ModuleOp mod) {
+ mlir::Location loc = funcOp.getLoc();
+ mlir::OpBuilder modBuilder(mod.getBodyRegion());
+ modBuilder.setInsertionPointToEnd(mod.getBody());
+ auto emptyStub = func::FuncOp::create(modBuilder, loc, funcOp.getName(),
+ funcOp.getFunctionType());
+ emptyStub.setVisibility(funcOp.getVisibility());
+ emptyStub->setAttrs(funcOp->getAttrs());
+ auto entryBlock = emptyStub.addEntryBlock();
+ modBuilder.setInsertionPointToEnd(entryBlock);
+ func::ReturnOp::create(modBuilder, loc);
+
+ symTab.erase(funcOp);
+ symTab.insert(emptyStub);
+ }
+
+ static bool isDeviceFunc(mlir::func::FuncOp funcOp) {
+ if (auto cudaProcAttr =
+ funcOp.getOperation()->getAttrOfType<cuf::ProcAttributeAttr>(
+ cuf::getProcAttrName()))
+ if (cudaProcAttr.getValue() == cuf::ProcAttribute::Device ||
+ cudaProcAttr.getValue() == cuf::ProcAttribute::Global ||
+ cudaProcAttr.getValue() == cuf::ProcAttribute::GridGlobal ||
+ cudaProcAttr.getValue() == cuf::ProcAttribute::HostDevice)
+ return true;
+ return false;
+ }
+
+ void runOnOperation() override {
+ // Working on Module operation because inserting/removing function from the
+ // module is not thread-safe.
+ ModuleOp mod = getOperation();
+ mlir::SymbolTable symbolTable(getOperation());
+
+ auto *ctx = getOperation().getContext();
+ mlir::OpBuilder builder(ctx);
+
+ gpu::GPUModuleOp gpuMod = cuf::getOrCreateGPUModule(mod, symbolTable);
+ mlir::SymbolTable gpuModSymTab(gpuMod);
+
+ llvm::SetVector<mlir::func::FuncOp> funcsToClone;
+ llvm::SetVector<mlir::func::FuncOp> deviceFuncs;
+ llvm::SetVector<mlir::func::FuncOp> keepInModule;
+ llvm::StringSet<> deviceFuncNames;
+
+ // Look for all function to migrate to the GPU module.
+ mod.walk([&](mlir::func::FuncOp op) {
+ if (isDeviceFunc(op)) {
+ deviceFuncs.insert(op);
+ deviceFuncNames.insert(op.getSymName());
+ }
+ });
+
+ auto processCallOp = [&](fir::CallOp op) {
+ if (op.getCallee()) {
+ auto func = symbolTable.lookup<mlir::func::FuncOp>(
+ op.getCallee()->getLeafReference());
+ if (deviceFuncs.count(func) == 0)
+ funcsToClone.insert(func);
+ }
+ };
+
+ // Gather all function called by device functions.
+ for (auto funcOp : deviceFuncs) {
+ funcOp.walk([&](fir::CallOp op) { processCallOp(op); });
+ funcOp.walk([&](fir::DispatchOp op) {
+ TODO(op.getLoc(), "type-bound procedure call with dynamic dispatch "
+ "in device procedure");
+ });
+ }
+
+ // Functions that are referenced in a derived-type binding table must be
+ // kept in the host module to avoid LLVM dialect verification errors.
+ for (auto globalOp : mod.getOps<fir::GlobalOp>()) {
+ if (globalOp.getName().contains(fir::kBindingTableSeparator)) {
+ globalOp.walk([&](fir::AddrOfOp addrOfOp) {
+ if (deviceFuncNames.contains(addrOfOp.getSymbol().getLeafReference()))
+ keepInModule.insert(
+ *llvm::find_if(deviceFuncs, [&](mlir::func::FuncOp f) {
+ return f.getSymName() ==
+ addrOfOp.getSymbol().getLeafReference();
+ }));
+ });
+ }
+ }
+
+ // Gather all functions called by CUF kernels.
+ mod.walk([&](cuf::KernelOp kernelOp) {
+ kernelOp.walk([&](fir::CallOp op) { processCallOp(op); });
+ kernelOp.walk([&](fir::DispatchOp op) {
+ TODO(op.getLoc(),
+ "type-bound procedure call with dynamic dispatch in cuf kernel");
+ });
+ });
+
+ for (auto funcOp : funcsToClone)
+ gpuModSymTab.insert(funcOp->clone());
+
+ for (auto funcOp : deviceFuncs) {
+ auto cudaProcAttr =
+ funcOp.getOperation()->getAttrOfType<cuf::ProcAttributeAttr>(
+ cuf::getProcAttrName());
+ auto isGlobal = cudaProcAttr.getValue() == cuf::ProcAttribute::Global ||
+ cudaProcAttr.getValue() == cuf::ProcAttribute::GridGlobal;
+ if (funcOp.isDeclaration()) {
+ mlir::Operation *clonedFuncOp = funcOp->clone();
+ if (isGlobal) {
+ clonedFuncOp->setAttr(gpu::GPUDialect::getKernelFuncAttrName(),
+ builder.getUnitAttr());
+ clonedFuncOp->removeAttr(cuf::getProcAttrName());
+ }
+ gpuModSymTab.insert(clonedFuncOp);
+ } else {
+ gpu::GPUFuncOp deviceFuncOp =
+ createGPUFuncOp(funcOp, isGlobal, computeCap);
+ gpuModSymTab.insert(deviceFuncOp);
+
+ if (cudaProcAttr.getValue() != cuf::ProcAttribute::HostDevice) {
+ // If the function is a global, we need to keep the host side
+ // declaration for the kernel registration. Currently we just
+ // erase its body but in the future, the body should be rewritten
+ // to be able to launch CUDA Fortran kernel from C code.
+ if (isGlobal || keepInModule.contains(funcOp))
+ createHostStub(funcOp, symbolTable, mod);
+ else
+ funcOp.erase();
+ }
+ }
+ }
+ }
+};
+
+} // end anonymous namespace
diff --git a/flang/test/Fir/CUDA/cuda-device-func-transform-cc90.mlir b/flang/test/Fir/CUDA/cuda-device-func-transform-cc90.mlir
new file mode 100644
index 0000000000000..793388fc9dd5d
--- /dev/null
+++ b/flang/test/Fir/CUDA/cuda-device-func-transform-cc90.mlir
@@ -0,0 +1,8 @@
+// RUN: fir-opt --split-input-file --cuf-transform-device-func="compute-capability=90" %s | FileCheck %s
+
+func.func @_QPsub_maxtnid() attributes {cuf.launch_bounds = #cuf.launch_bounds<maxTPB = 256 : i64, minBPM = 2 : i64, upperBoundClusterSize = 3 : i64>, cuf.proc_attr = #cuf.cuda_proc<global>} {
+ %cst = arith.constant 2.000000e+00 : f32
+ return
+}
+
+// CHECK: gpu.func @_QPsub_maxtnid() kernel attributes {nvvm.cluster_max_blocks = 3 : i64, nvvm.maxntid = array<i32: 256, 1, 1>, nvvm.minctasm = 2 : i64}
diff --git a/flang/test/Fir/CUDA/cuda-device-func-transform.mlir b/flang/test/Fir/CUDA/cuda-device-func-transform.mlir
new file mode 100644
index 0000000000000..1ce21aeecffe5
--- /dev/null
+++ b/flang/test/Fir/CUDA/cuda-device-func-transform.mlir
@@ -0,0 +1,165 @@
+// RUN: fir-opt --split-input-file --cuf-transform-device-func %s | FileCheck %s
+
+func.func @_QPsub_device1() attributes {cuf.proc_attr = #cuf.cuda_proc<device>} {
+ return
+}
+
+func.func @_QPsub_device2(%arg0: !fir.ref<f32> {fir.bindc_name = "i", cuf.proc_attr = #cuf.cuda_proc<device>}) attributes {cuf.proc_attr = #cuf.cuda_proc<device>} {
+ %0 = fir.declare %arg0 {uniq_name = "_QFsub1Ei"} : (!fir.ref<f32>) -> !fir.ref<f32>
+ %cst = arith.constant 2.000000e+00 : f32
+ fir.store %cst to %0 : !fir.ref<f32>
+ return
+}
+
+func.func @_QPsub_global1() attributes {cuf.proc_attr = #cuf.cuda_proc<global>} {
+ %cst = arith.constant 2.000000e+00 : f32
+ return
+}
+
+func.func @_QPsub_host_device1() attributes {cuf.proc_attr = #cuf.cuda_proc<host_device>} {
+ return
+}
+
+func.func private @_QMmod1Psub1(!fir.ref<!fir.array<10xi32>> {cuf.data_attr = #cuf.cuda<device>}) attributes {cuf.proc_attr = #cuf.cuda_proc<global>}
+
+// CHECK-LABEL: module attributes {gpu.container_module}
+
+// CHECK-NOT: func.func @_QPsub_device1()
+
+// CHECK-NOT: func.func @_QPsub_device2(%arg0: !fir.ref<f32> {fir.bindc_name = "i", cuf.data_attr = #cuf.cuda<device>}) attributes {cuf.proc_attr = #cuf.cuda_proc<device>}
+
+// CHECK: func.func @_QPsub_host_device1()
+
+// CHECK-LABEL: gpu.module @cuda_device_mod
+
+// CHECK: gpu.func @_QPsub_device1()
+
+// CHECK: gpu.func @_QPsub_device2(%[[ARG0:.*]]: !fir.ref<f32>) {
+// CHECK: %[[DECL:.*]] = fir.declare %[[ARG0]] {uniq_name = "_QFsub1Ei"} : (!fir.ref<f32>) -> !fir.ref<f32>
+// CHECK: %[[CST:.*]] = arith.constant 2.000000e+00 : f32
+// CHECK: fir.store %[[CST]] to %[[DECL]] : !fir.ref<f32>
+// CHECK: gpu.return
+// CHECK: }
+
+// CHECK: gpu.func @_QPsub_global1() kernel
+
+// CHECK: gpu.func @_QPsub_host_device1()
+
+// CHECK: func.func private @_QMmod1Psub1(!fir.ref<!fir.array<10xi32>> {cuf.data_attr = #cuf.cuda<device>}) attributes {gpu.kernel}
+
+// CHECK: func.func @_QPsub_global1() attributes {cuf.proc_attr = #cuf.cuda_proc<global>}
+// CHECK-NEXT: return
+
+// -----
+
+func.func @_QPdevsub() -> i32 attributes {cuf.proc_attr = #cuf.cuda_proc<device>} {
+ %c1_i32 = arith.constant 1 : i32
+ return %c1_i32 : i32
+}
+
+// CHECK-LABEL: gpu.module @cuda_device_mod
+
+// CHECK: gpu.func @_QPdevsub() -> i32
+
+// CHECK: gpu.return %{{.*}} : i32
+
+// -----
+
+func.func @hostFuncUsedInDevice() {
+ return
+}
+
+func.func @_QPsub_device4() attributes {cuf.proc_attr = #cuf.cuda_proc<device>} {
+ fir.call @hostFuncUsedInDevice() : () -> ()
+ return
+}
+
+// CHECK-LABEL: module attributes {gpu.container_module}
+// CHECK: func.func @hostFuncUsedInDevice()
+// CHECK: gpu.module @cuda_device_mod
+// CHECK: func.func @hostFuncUsedInDevice()
+// CHECK: gpu.func @_QPsub_device4()
+// CHECK: fir.call @hostFuncUsedInDevice() : () -> ()
+
+// -----
+
+func.func @_QPsub_grid_global1() attributes {cuf.proc_attr = #cuf.cuda_proc<grid_global>} {
+ %cst = arith.constant 2.000000e+00 : f32
+ return
+}
+
+// CHEC-LABEL: gpu.module @cuda_device_mod {
+// CHECK: gpu.func @_QPsub_grid_global1() kernel
+
+// CHECK-LABEL: func.func @_QPsub_grid_global1()
+
+// -----
+
+func.func @hostFuncUsedInDevice() {
+ return
+}
+
+func.func @_QPsub_host() {
+ %c1 = arith.constant 1 : index
+ %c1_i32 = arith.constant 1 : i32
+ cuf.kernel<<<%c1_i32, %c1_i32>>> (%arg0 : index) = (%c1 : index) to (%c1 : index) step (%c1 : index) {
+ fir.call @hostFuncUsedInDevice() : () -> ()
+ "fir.end"() : () -> ()
+ }
+ return
+}
+
+// CHECK-LABEL: func.func @hostFuncUsedInDevice()
+// CHECK-LABEL: gpu.module @cuda_device_mod
+// CHECK: func.func @hostFuncUsedInDevice()
+
+// -----
+
+func.func @_QPpartialsumshflshflr8(%arg0: !fir.ref<!fir.array<?xf64>> {cuf.data_attr = #cuf.cuda<device>, fir.bindc_name = "a"}, %arg1: i32 {fir.bindc_name = "n"}) attributes {cuf.proc_attr = #cuf.cuda_proc<global>} {
+ %c2_i32 = arith.constant 2 : i32
+ %c10_i32 = arith.constant 10 : i32
+ %c0 = arith.constant 0 : index
+ %0 = fir.dummy_scope : !fir.dscope
+ %1 = fir.alloca i32
+ fir.store %arg1 to %1 : !fir.ref<i32>
+ %2 = fir.declare %1 dummy_scope %0 {fortran_attrs = #fir.var_attrs<intent_in, value>, uniq_name = "_QFpartialsumshflshflr8En"} : (!fir.ref<i32>, !fir.dscope) -> !fir.ref<i32>
+ %9 = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFpartialsumshflshflr8Ei"}
+ %10 = fir.declare %9 {uniq_name = "_QFpartialsumshflshflr8Ei"} : (!fir.ref<i32>) -> !fir.ref<i32>
+ %13 = fir.alloca i32 {bindc_name = "__builtin_warpsize", uniq_name = "_QM__fortran_builtinsEC__builtin_warpsize"}
+ %14 = fir.declare %13 {uniq_name = "_QM__fortran_builtinsEC__builtin_warpsize"} : (!fir.ref<i32>) -> !fir.ref<i32>
+ %15 = fir.load %2 : !fir.ref<i32>
+ %16 = fir.convert %15 : (i32) -> index
+ %17 = arith.cmpi sgt, %16, %c0 : index
+ %18 = arith.select %17, %16, %c0 : index
+ %19 = fir.shape %18 : (index) -> !fir.shape<1>
+ %20 = fir.declare %arg0(%19) dummy_scope %0 {data_attr = #cuf.cuda<device>, uniq_name = "_QFpartialsumshflshflr8Ea"} : (!fir.ref<!fir.array<?xf64>>, !fir.shape<1>, !fir.dscope) -> !fir.ref<!fir.array<?xf64>>
+ cf.br ^bb1
+^bb1: // 2 preds: ^bb0, ^bb2
+ %21 = fir.load %10 : !fir.ref<i32>
+ %22 = arith.cmpi slt, %21, %c10_i32 : i32
+ cf.cond_br %22, ^bb2, ^bb3
+^bb2: // pred: ^bb1
+ %23 = fir.load %10 : !fir.ref<i32>
+ %24 = arith.muli %23, %c2_i32 : i32
+ %25 = fir.convert %24 : (i32) -> f64
+ %26 = fir.convert %23 : (i32) -> i64
+ %27 = fir.array_coor %20(%19) %26 : (!fir.ref<!fir.array<?xf64>>, !fir.shape<1>, i64) -> !fir.ref<f64>
+ fir.store %25 to %27 : !fir.ref<f64>
+ cf.br ^bb1
+^bb3: // pred: ^bb1
+ return
+}
+
+// CHECK-LABEL: gpu.module @cuda_device_mod
+// CHECK: gpu.func @_QPpartialsumshflshflr8(%arg0: !fir.ref<!fir.array<?xf64>>, %arg1: i32) kernel
+
+// CHECK: func.func @_QPpartialsumshflshflr8
+
+// -----
+
+func.func @_QPsub_maxtnid() attributes {cuf.launch_bounds = #cuf.launch_bounds<maxTPB = 256 : i64, minBPM = 2 : i64, upperBoundClusterSize = 3 : i64>, cuf.proc_attr = #cuf.cuda_proc<global>} {
+ %cst = arith.constant 2.000000e+00 : f32
+ return
+}
+
+// CHECK: gpu.func @_QPsub_maxtnid() kernel attributes {nvvm.maxntid = array<i32: 256, 1, 1>, nvvm.minctasm = 2 : i64}
More information about the flang-commits
mailing list