[flang-commits] [flang] [flang][cuda] Carry over the cuf.proc_attr attribute to gpu.launch_func (PR #124325)
via flang-commits
flang-commits at lists.llvm.org
Fri Jan 24 11:04:24 PST 2025
llvmbot wrote:
@llvm/pr-subscribers-flang-fir-hlfir
Author: Valentin Clement (バレンタイン クレメン) (clementval)
Changes:
This information is needed to launch a `grid_global` kernel as a cooperative kernel. `gpu.launch_func` has no attribute of its own that can carry it, so carry over the `cuf.proc_attr` attribute from the kernel's `func.func`; it can then be leveraged in the later conversion to a function call.
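For reference, a minimal CUDA Fortran sketch of the kind of kernel this targets, reconstructed from the mangled name in the new test (`_QMdevptrPtest`), so treat it as an approximation rather than code from the patch: a subroutine declared with `attributes(grid_global)` gets `cuf.proc_attr = #cuf.cuda_proc<grid_global>` on its `func.func` and must be launched as a cooperative kernel, which is why the attribute needs to survive on the generated `gpu.launch_func`.

```fortran
! Illustration only: approximate source behind the new test case,
! reconstructed from the mangled name _QMdevptrPtest.
module devptr
contains
  ! A grid_global (cooperative) kernel: flang attaches
  ! cuf.proc_attr = #cuf.cuda_proc<grid_global> to the matching func.func,
  ! and this patch forwards that attribute to gpu.launch_func.
  attributes(grid_global) subroutine test()
  end subroutine test
end module devptr
```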
---
Full diff: https://github.com/llvm/llvm-project/pull/124325.diff
2 Files Affected:
- (modified) flang/lib/Optimizer/Transforms/CUFOpConversion.cpp (+5)
- (modified) flang/test/Fir/CUDA/cuda-launch.fir (+21)
``````````diff
diff --git a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
index b0d6b0f0993a61..6a6a56bb63c366 100644
--- a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
+++ b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
@@ -806,6 +806,7 @@ struct CUFLaunchOpConversion
rewriter.getContext(),
op.getCallee().getLeafReference().getValue())});
mlir::Value clusterDimX, clusterDimY, clusterDimZ;
+ cuf::ProcAttributeAttr procAttr;
if (auto funcOp = symTab.lookup<mlir::func::FuncOp>(
op.getCallee().getLeafReference())) {
if (auto clusterDimsAttr = funcOp->getAttrOfType<cuf::ClusterDimsAttr>(
@@ -817,6 +818,8 @@ struct CUFLaunchOpConversion
clusterDimZ = rewriter.create<mlir::arith::ConstantIndexOp>(
loc, clusterDimsAttr.getZ().getInt());
}
+ procAttr =
+ funcOp->getAttrOfType<cuf::ProcAttributeAttr>(cuf::getProcAttrName());
}
llvm::SmallVector<mlir::Value> args;
for (mlir::Value arg : op.getArgs()) {
@@ -851,6 +854,8 @@ struct CUFLaunchOpConversion
gpuLaunchOp.getClusterSizeYMutable().assign(clusterDimY);
gpuLaunchOp.getClusterSizeZMutable().assign(clusterDimZ);
}
+ if (procAttr)
+ gpuLaunchOp->setAttr(cuf::getProcAttrName(), procAttr);
rewriter.replaceOp(op, gpuLaunchOp);
return mlir::success();
}
diff --git a/flang/test/Fir/CUDA/cuda-launch.fir b/flang/test/Fir/CUDA/cuda-launch.fir
index 8432b9ec926e38..7833fc7b490bfb 100644
--- a/flang/test/Fir/CUDA/cuda-launch.fir
+++ b/flang/test/Fir/CUDA/cuda-launch.fir
@@ -104,3 +104,24 @@ module attributes {gpu.container_module, dlti.dl_spec = #dlti.dl_spec<#dlti.dl_e
// CHECK: %[[DEVADDR:.*]] = fir.call @_FortranACUFGetDeviceAddress(%[[CONV_ADDR]], %{{.*}}, %{{.*}}) : (!fir.llvm_ptr<i8>, !fir.ref<i8>, i32) -> !fir.llvm_ptr<i8>
// CHECK: %[[CONV_DEVADDR:.*]] = fir.convert %[[DEVADDR]] : (!fir.llvm_ptr<i8>) -> !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>
// CHECK: gpu.launch_func @cuda_device_mod::@_QMdevptrPtest blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) dynamic_shared_memory_size %{{.*}} args(%[[CONV_DEVADDR]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>)
+
+// -----
+
+module attributes {gpu.container_module, dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>} {
+ gpu.module @cuda_device_mod {
+ gpu.func @_QMdevptrPtest() kernel {
+ gpu.return
+ }
+ }
+ func.func @_QMdevptrPtest() attributes {cuf.proc_attr = #cuf.cuda_proc<grid_global>} {
+ return
+ }
+ func.func @_QQmain() {
+ %c1_i32 = arith.constant 1 : i32
+ cuf.kernel_launch @_QMdevptrPtest<<<%c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32>>>()
+ return
+ }
+}
+
+// CHECK-LABEL: func.func @_QQmain()
+// CHECK: gpu.launch_func @cuda_device_mod::@_QMdevptrPtest blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) dynamic_shared_memory_size %{{.*}} {cuf.proc_attr = #cuf.cuda_proc<grid_global>}
``````````
https://github.com/llvm/llvm-project/pull/124325