[flang-commits] [flang] [flang][cuda] Handle launch of cooperative kernel (PR #124362)
via flang-commits
flang-commits at lists.llvm.org
Fri Jan 24 14:44:42 PST 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-flang-fir-hlfir
Author: Valentin Clement (バレンタイン クレメン) (clementval)
<details>
<summary>Changes</summary>
Add `CUFLaunchCooperativeKernel` entry points and lower gpu.launch_func with grid_global attribute to this entry point.
---
Full diff: https://github.com/llvm/llvm-project/pull/124362.diff
4 Files Affected:
- (modified) flang/include/flang/Runtime/CUDA/kernel.h (+4)
- (modified) flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp (+12-6)
- (modified) flang/runtime/CUDA/kernel.cpp (+65)
- (modified) flang/test/Fir/CUDA/cuda-gpu-launch-func.mlir (+35)
``````````diff
diff --git a/flang/include/flang/Runtime/CUDA/kernel.h b/flang/include/flang/Runtime/CUDA/kernel.h
index 85afda09e347ae..1f812b580327af 100644
--- a/flang/include/flang/Runtime/CUDA/kernel.h
+++ b/flang/include/flang/Runtime/CUDA/kernel.h
@@ -28,6 +28,10 @@ void RTDEF(CUFLaunchClusterKernel)(const void *kernelName, intptr_t clusterX,
intptr_t gridZ, intptr_t blockX, intptr_t blockY, intptr_t blockZ,
int32_t smem, void **params, void **extra);
+void RTDEF(CUFLaunchCooperativeKernel)(const void *kernelName, intptr_t gridX,
+ intptr_t gridY, intptr_t gridZ, intptr_t blockX, intptr_t blockY,
+ intptr_t blockZ, int32_t smem, void **params, void **extra);
+
} // extern "C"
#endif // FORTRAN_RUNTIME_CUDA_KERNEL_H_
diff --git a/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp b/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp
index 60aa401e1cc8cc..c469b5a95b0447 100644
--- a/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp
+++ b/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp
@@ -139,20 +139,26 @@ struct GPULaunchKernelConversion
adaptor.getBlockSizeY(), adaptor.getBlockSizeZ(),
dynamicMemorySize, kernelArgs, nullPtr});
} else {
- auto funcOp = mod.lookupSymbol<mlir::LLVM::LLVMFuncOp>(
- RTNAME_STRING(CUFLaunchKernel));
+ auto procAttr =
+ op->getAttrOfType<cuf::ProcAttributeAttr>(cuf::getProcAttrName());
+ bool isGridGlobal =
+ procAttr && procAttr.getValue() == cuf::ProcAttribute::GridGlobal;
+ llvm::StringRef fctName = isGridGlobal
+ ? RTNAME_STRING(CUFLaunchCooperativeKernel)
+ : RTNAME_STRING(CUFLaunchKernel);
+ auto funcOp = mod.lookupSymbol<mlir::LLVM::LLVMFuncOp>(fctName);
auto funcTy = mlir::LLVM::LLVMFunctionType::get(
voidTy,
{ptrTy, llvmIntPtrType, llvmIntPtrType, llvmIntPtrType,
llvmIntPtrType, llvmIntPtrType, llvmIntPtrType, i32Ty, ptrTy, ptrTy},
/*isVarArg=*/false);
- auto cufLaunchKernel = mlir::SymbolRefAttr::get(
- mod.getContext(), RTNAME_STRING(CUFLaunchKernel));
+ auto cufLaunchKernel =
+ mlir::SymbolRefAttr::get(mod.getContext(), fctName);
if (!funcOp) {
mlir::OpBuilder::InsertionGuard insertGuard(rewriter);
rewriter.setInsertionPointToStart(mod.getBody());
- auto launchKernelFuncOp = rewriter.create<mlir::LLVM::LLVMFuncOp>(
- loc, RTNAME_STRING(CUFLaunchKernel), funcTy);
+ auto launchKernelFuncOp =
+ rewriter.create<mlir::LLVM::LLVMFuncOp>(loc, fctName, funcTy);
launchKernelFuncOp.setVisibility(
mlir::SymbolTable::Visibility::Private);
}
diff --git a/flang/runtime/CUDA/kernel.cpp b/flang/runtime/CUDA/kernel.cpp
index bdc04ccb17672b..02d89fb8423a5b 100644
--- a/flang/runtime/CUDA/kernel.cpp
+++ b/flang/runtime/CUDA/kernel.cpp
@@ -151,4 +151,69 @@ void RTDEF(CUFLaunchClusterKernel)(const void *kernel, intptr_t clusterX,
CUDA_REPORT_IF_ERROR(cudaLaunchKernelExC(&config, kernel, params));
}
+void RTDEF(CUFLaunchCooperativeKernel)(const void *kernel, intptr_t gridX,
+ intptr_t gridY, intptr_t gridZ, intptr_t blockX, intptr_t blockY,
+ intptr_t blockZ, int32_t smem, void **params, void **extra) {
+ dim3 gridDim;
+ gridDim.x = gridX;
+ gridDim.y = gridY;
+ gridDim.z = gridZ;
+ dim3 blockDim;
+ blockDim.x = blockX;
+ blockDim.y = blockY;
+ blockDim.z = blockZ;
+ unsigned nbNegGridDim{0};
+ if (gridX < 0) {
+ ++nbNegGridDim;
+ }
+ if (gridY < 0) {
+ ++nbNegGridDim;
+ }
+ if (gridZ < 0) {
+ ++nbNegGridDim;
+ }
+ if (nbNegGridDim == 1) {
+ int maxBlocks, nbBlocks, dev, multiProcCount;
+ cudaError_t err1, err2;
+ nbBlocks = blockDim.x * blockDim.y * blockDim.z;
+ cudaGetDevice(&dev);
+ err1 = cudaDeviceGetAttribute(
+ &multiProcCount, cudaDevAttrMultiProcessorCount, dev);
+ err2 = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+ &maxBlocks, kernel, nbBlocks, smem);
+ if (err1 == cudaSuccess && err2 == cudaSuccess) {
+ maxBlocks = multiProcCount * maxBlocks;
+ }
+ if (maxBlocks > 0) {
+ if (gridX > 0) {
+ maxBlocks = maxBlocks / gridDim.x;
+ }
+ if (gridY > 0) {
+ maxBlocks = maxBlocks / gridDim.y;
+ }
+ if (gridZ > 0) {
+ maxBlocks = maxBlocks / gridDim.z;
+ }
+ if (maxBlocks < 1) {
+ maxBlocks = 1;
+ }
+ if (gridX < 0) {
+ gridDim.x = maxBlocks;
+ }
+ if (gridY < 0) {
+ gridDim.y = maxBlocks;
+ }
+ if (gridZ < 0) {
+ gridDim.z = maxBlocks;
+ }
+ }
+ } else if (nbNegGridDim > 1) {
+ Fortran::runtime::Terminator terminator{__FILE__, __LINE__};
+ terminator.Crash("Too many invalid grid dimensions");
+ }
+ cudaStream_t stream = 0; // TODO stream managment
+ CUDA_REPORT_IF_ERROR(cudaLaunchCooperativeKernel(
+ kernel, gridDim, blockDim, params, smem, stream));
+}
+
} // extern "C"
diff --git a/flang/test/Fir/CUDA/cuda-gpu-launch-func.mlir b/flang/test/Fir/CUDA/cuda-gpu-launch-func.mlir
index 3db2336c90a7d4..0827e378c7c07e 100644
--- a/flang/test/Fir/CUDA/cuda-gpu-launch-func.mlir
+++ b/flang/test/Fir/CUDA/cuda-gpu-launch-func.mlir
@@ -131,3 +131,38 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr<272>, d
// CHECK-LABEL: llvm.func @_QQmain()
// CHECK: %[[KERNEL_PTR:.*]] = llvm.mlir.addressof @_QMmod1Psub1
// CHECK: llvm.call @_FortranACUFLaunchClusterKernel(%[[KERNEL_PTR]], {{.*}})
+
+// -----
+
+module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.ident = "flang version 20.0.0 (git at github.com:clementval/llvm-project.git ddcfd4d2dc17bf66cee8c3ef6284118684a2b0e6)", llvm.target_triple = "x86_64-unknown-linux-gnu"} {
+ llvm.func @_QMmod1Phost_sub() {
+ %0 = llvm.mlir.constant(1 : i32) : i32
+ %1 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> {alignment = 8 : i64} : (i32) -> !llvm.ptr
+ %2 = llvm.mlir.constant(40 : i64) : i64
+ %3 = llvm.mlir.constant(16 : i32) : i32
+ %4 = llvm.mlir.constant(25 : i32) : i32
+ %5 = llvm.mlir.constant(21 : i32) : i32
+ %6 = llvm.mlir.constant(17 : i32) : i32
+ %7 = llvm.mlir.constant(1 : index) : i64
+ %8 = llvm.mlir.constant(27 : i32) : i32
+ %9 = llvm.mlir.constant(6 : i32) : i32
+ %10 = llvm.mlir.constant(1 : i32) : i32
+ %11 = llvm.mlir.constant(0 : i32) : i32
+ %12 = llvm.mlir.constant(10 : index) : i64
+ %13 = llvm.mlir.addressof @_QQclX91d13f6e74caa2f03965d7a7c6a8fdd5 : !llvm.ptr
+ %14 = llvm.call @_FortranACUFMemAlloc(%2, %11, %13, %6) : (i64, i32, !llvm.ptr, i32) -> !llvm.ptr
+ gpu.launch_func @cuda_device_mod::@_QMmod1Psub1 blocks in (%7, %7, %7) threads in (%12, %7, %7) : i64 dynamic_shared_memory_size %11 args(%14 : !llvm.ptr) {cuf.proc_attr = #cuf.cuda_proc<grid_global>}
+ llvm.return
+ }
+ llvm.func @_QMmod1Psub1(!llvm.ptr) -> ()
+ llvm.mlir.global linkonce constant @_QQclX91d13f6e74caa2f03965d7a7c6a8fdd5() {addr_space = 0 : i32} : !llvm.array<2 x i8> {
+ %0 = llvm.mlir.constant("a\00") : !llvm.array<2 x i8>
+ llvm.return %0 : !llvm.array<2 x i8>
+ }
+ llvm.func @_FortranACUFMemAlloc(i64, i32, !llvm.ptr, i32) -> !llvm.ptr attributes {fir.runtime, sym_visibility = "private"}
+ llvm.func @_FortranACUFMemFree(!llvm.ptr, i32, !llvm.ptr, i32) -> !llvm.struct<()> attributes {fir.runtime, sym_visibility = "private"}
+ gpu.binary @cuda_device_mod [#gpu.object<#nvvm.target, "">]
+}
+
+// CHECK-LABEL: llvm.func @_QMmod1Phost_sub()
+// CHECK: llvm.call @_FortranACUFLaunchCooperativeKernel
``````````
</details>
https://github.com/llvm/llvm-project/pull/124362
More information about the flang-commits
mailing list