[flang-commits] [flang] 7670d88 - [flang][cuda] Set kernel intent(in) as const __restrict__ (#203652)
via flang-commits
flang-commits at lists.llvm.org
Fri Jun 12 16:49:14 PDT 2026
Author: Valentin Clement (バレンタイン クレメン)
Date: 2026-06-12T23:49:07Z
New Revision: 7670d88e7ea753095f753e6616b66f5e74da42ef
URL: https://github.com/llvm/llvm-project/commit/7670d88e7ea753095f753e6616b66f5e74da42ef
DIFF: https://github.com/llvm/llvm-project/commit/7670d88e7ea753095f753e6616b66f5e74da42ef.diff
LOG: [flang][cuda] Set kernel intent(in) as const __restrict__ (#203652)
Set the `llvm.readonly` and `llvm.noalias` attributes on `intent(in)` kernel
arguments so that the backend generates `ld.global.nc` loads.
Added:
Modified:
flang/lib/Optimizer/Transforms/CUDA/CUFDeviceFuncTransform.cpp
flang/test/Fir/CUDA/cuda-device-func-transform.mlir
Removed:
################################################################################
diff --git a/flang/lib/Optimizer/Transforms/CUDA/CUFDeviceFuncTransform.cpp b/flang/lib/Optimizer/Transforms/CUDA/CUFDeviceFuncTransform.cpp
index 88382cafcc3bd..d08940de0a739 100644
--- a/flang/lib/Optimizer/Transforms/CUDA/CUFDeviceFuncTransform.cpp
+++ b/flang/lib/Optimizer/Transforms/CUDA/CUFDeviceFuncTransform.cpp
@@ -41,6 +41,41 @@ class CUFDeviceFuncTransform
using CUFDeviceFuncTransformBase<
CUFDeviceFuncTransform>::CUFDeviceFuncTransformBase;
+ // Decorate INTENT(IN) kernel arguments like C "const __restrict__". NVPTX
+ // only tags loads as invariant (and lowers them to ld.global.nc) when a
+ // kernel pointer parameter is both readonly and noalias; see
+ // NVPTXTagInvariantLoads. Walks every fir.declare nested in funcOp and sets both attributes on the matching deviceFuncOp argument.
+ static void setIntentInKernelArgAttrs(mlir::func::FuncOp funcOp,
+ gpu::GPUFuncOp deviceFuncOp) { // NOTE(review): the visible call site runs this before its isGlobal check, i.e. not only for kernels -- confirm intended
+ mlir::UnitAttr unitAttr = mlir::UnitAttr::get(funcOp.getContext());
+
+ auto markArg = [&](unsigned argIndex) { // sets both attributes on one device-function argument (0-based index)
+ if (argIndex >= deviceFuncOp.getNumArguments()) // out-of-range index: leave the function untouched
+ return; // NOTE(review): no check here that the argument is pointer-typed -- confirm by-value arguments cannot reach this point
+ deviceFuncOp.setArgAttr(
+ argIndex, mlir::LLVM::LLVMDialect::getReadonlyAttrName(), unitAttr);
+ deviceFuncOp.setArgAttr(
+ argIndex, mlir::LLVM::LLVMDialect::getNoAliasAttrName(), unitAttr);
+ };
+
+ funcOp.walk([&](fir::DeclareOp declareOp) {
+ auto var =
+ mlir::cast<fir::FortranVariableOpInterface>(declareOp.getOperation());
+ if (!var.isIntentIn()) // only INTENT(IN) declarations are decorated
+ return;
+ if (std::optional<uint32_t> dummyArgNo = declareOp.getDummyArgNo()) { // preferred path: the declare carries its dummy argument number
+ // Dummy argument numbers are 1-based in FIR.
+ markArg(*dummyArgNo - 1);
+ return;
+ }
+ if (auto blockArg = // fallback when no dummy argument number is recorded
+ mlir::dyn_cast<mlir::BlockArgument>(declareOp.getMemref()))
+ if (blockArg.getOwner()->isEntryBlock() &&
+ blockArg.getOwner()->getParentOp() == funcOp) // must be an entry-block argument of funcOp itself
+ markArg(blockArg.getArgNumber());
+ });
+ }
+
static gpu::GPUFuncOp createGPUFuncOp(mlir::func::FuncOp funcOp,
bool isGlobal, int computeCap) {
mlir::OpBuilder builder(funcOp.getContext());
@@ -67,6 +102,9 @@ class CUFDeviceFuncTransform
auto deviceFuncOp =
gpu::GPUFuncOp::create(builder, loc, funcOp.getName(), type,
mlir::TypeRange{}, mlir::TypeRange{});
+ if (mlir::ArrayAttr argAttrs = funcOp.getAllArgAttrs())
+ deviceFuncOp.setAllArgAttrs(argAttrs);
+ setIntentInKernelArgAttrs(funcOp, deviceFuncOp);
if (isGlobal)
deviceFuncOp.setKernel(true);
diff --git a/flang/test/Fir/CUDA/cuda-device-func-transform.mlir b/flang/test/Fir/CUDA/cuda-device-func-transform.mlir
index cc183f0144e82..ad15b93ef43ef 100644
--- a/flang/test/Fir/CUDA/cuda-device-func-transform.mlir
+++ b/flang/test/Fir/CUDA/cuda-device-func-transform.mlir
@@ -34,7 +34,7 @@ func.func private @_QMmod1Psub1(!fir.ref<!fir.array<10xi32>> {cuf.data_attr = #c
// CHECK: gpu.func @_QPsub_device1()
-// CHECK: gpu.func @_QPsub_device2(%[[ARG0:.*]]: !fir.ref<f32>) {
+// CHECK: gpu.func @_QPsub_device2(%[[ARG0:.*]]: !fir.ref<f32>
// CHECK: %[[DECL:.*]] = fir.declare %[[ARG0]] {uniq_name = "_QFsub1Ei"} : (!fir.ref<f32>) -> !fir.ref<f32>
// CHECK: %[[CST:.*]] = arith.constant 2.000000e+00 : f32
// CHECK: fir.store %[[CST]] to %[[DECL]] : !fir.ref<f32>
@@ -151,12 +151,26 @@ func.func @_QPpartialsumshflshflr8(%arg0: !fir.ref<!fir.array<?xf64>> {cuf.data_
}
// CHECK-LABEL: gpu.module @cuda_device_mod
-// CHECK: gpu.func @_QPpartialsumshflshflr8(%arg0: !fir.ref<!fir.array<?xf64>>, %arg1: i32) kernel
+// CHECK: gpu.func @_QPpartialsumshflshflr8({{.*}}) kernel
// CHECK: func.func @_QPpartialsumshflshflr8
// -----
+func.func @_QPldg_attrs(%arg0: !fir.ref<!fir.array<?xf32>> {fir.bindc_name = "a"}, %arg1: !fir.ref<!fir.array<?xf32>> {fir.bindc_name = "b"}) attributes {cuf.proc_attr = #cuf.cuda_proc<global>} {
+ %c10 = arith.constant 10 : index
+ %scope = fir.dummy_scope : !fir.dscope
+ %shape = fir.shape %c10 : (index) -> !fir.shape<1>
+ %0 = fir.declare %arg0(%shape) dummy_scope %scope arg 1 {fortran_attrs = #fir.var_attrs<intent_out>, uniq_name = "_QFldg_attrsEa"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, !fir.dscope) -> !fir.ref<!fir.array<?xf32>> // dummy 1 ("a") is intent_out
+ %1 = fir.declare %arg1(%shape) dummy_scope %scope arg 2 {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFldg_attrsEb"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, !fir.dscope) -> !fir.ref<!fir.array<?xf32>> // dummy 2 ("b") is intent_in; the CHECK line after this function expects llvm.noalias and llvm.readonly on it
+ return
+}
+
+// CHECK-LABEL: gpu.module @cuda_device_mod
+// CHECK: gpu.func @_QPldg_attrs(%{{.*}}: !fir.ref<!fir.array<?xf32>>{{.*}}, %{{.*}}: !fir.ref<!fir.array<?xf32>> {{{.*}}llvm.noalias, llvm.readonly}) kernel
+
+// -----
+
func.func @_QPsub_maxtnid() attributes {cuf.launch_bounds = #cuf.launch_bounds<maxTPB = 256 : i64, minBPM = 2 : i64, upperBoundClusterSize = 3 : i64>, cuf.proc_attr = #cuf.cuda_proc<global>} {
%cst = arith.constant 2.000000e+00 : f32
return
More information about the flang-commits
mailing list