[flang-commits] [flang] b05fec9 - [flang][cuda] Convert gpu.launch_func to CUFLaunchClusterKernel when cluster dims are present (#113959)
via flang-commits
flang-commits at lists.llvm.org
Tue Oct 29 10:02:12 PDT 2024
Author: Valentin Clement (バレンタイン クレメン)
Date: 2024-10-29T10:02:08-07:00
New Revision: b05fec97d59898a63a3e303122bbc7fc5e29ced8
URL: https://github.com/llvm/llvm-project/commit/b05fec97d59898a63a3e303122bbc7fc5e29ced8
DIFF: https://github.com/llvm/llvm-project/commit/b05fec97d59898a63a3e303122bbc7fc5e29ced8.diff
LOG: [flang][cuda] Convert gpu.launch_func to CUFLaunchClusterKernel when cluster dims are present (#113959)
Kernel launches in CUF are converted to `gpu.launch_func`. When the kernel
has `cluster_dims` specified, these get carried over to the
`gpu.launch_func` operation. This patch updates the special conversion
of `gpu.launch_func` so that, when cluster dims are present, it targets the
newly added `CUFLaunchClusterKernel` entry point.
Added:
Modified:
flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp
flang/test/Fir/CUDA/cuda-gpu-launch-func.mlir
Removed:
################################################################################
diff --git a/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp b/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp
index 5645ce6e6858c8..c64f35542a6e59 100644
--- a/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp
+++ b/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp
@@ -76,11 +76,6 @@ struct GPULaunchKernelConversion
mlir::LogicalResult
matchAndRewrite(mlir::gpu::LaunchFuncOp op, OpAdaptor adaptor,
mlir::ConversionPatternRewriter &rewriter) const override {
-
- if (op.hasClusterSize()) {
- return mlir::failure();
- }
-
mlir::Location loc = op.getLoc();
auto *ctx = rewriter.getContext();
mlir::ModuleOp mod = op->getParentOfType<mlir::ModuleOp>();
@@ -107,37 +102,65 @@ struct GPULaunchKernelConversion
rewriter.create<LLVM::AddressOfOp>(loc, ptrTy, kernel.getName());
}
- auto funcOp = mod.lookupSymbol<mlir::LLVM::LLVMFuncOp>(
- RTNAME_STRING(CUFLaunchKernel));
-
auto llvmIntPtrType = mlir::IntegerType::get(
ctx, this->getTypeConverter()->getPointerBitwidth(0));
auto voidTy = mlir::LLVM::LLVMVoidType::get(ctx);
- auto funcTy = mlir::LLVM::LLVMFunctionType::get(
- voidTy,
- {ptrTy, llvmIntPtrType, llvmIntPtrType, llvmIntPtrType, llvmIntPtrType,
- llvmIntPtrType, llvmIntPtrType, i32Ty, ptrTy, ptrTy},
- /*isVarArg=*/false);
-
- auto cufLaunchKernel = mlir::SymbolRefAttr::get(
- mod.getContext(), RTNAME_STRING(CUFLaunchKernel));
- if (!funcOp) {
- mlir::OpBuilder::InsertionGuard insertGuard(rewriter);
- rewriter.setInsertionPointToStart(mod.getBody());
- auto launchKernelFuncOp = rewriter.create<mlir::LLVM::LLVMFuncOp>(
- loc, RTNAME_STRING(CUFLaunchKernel), funcTy);
- launchKernelFuncOp.setVisibility(mlir::SymbolTable::Visibility::Private);
- }
mlir::Value nullPtr = rewriter.create<LLVM::ZeroOp>(loc, ptrTy);
- rewriter.replaceOpWithNewOp<mlir::LLVM::CallOp>(
- op, funcTy, cufLaunchKernel,
- mlir::ValueRange{kernelPtr, adaptor.getGridSizeX(),
- adaptor.getGridSizeY(), adaptor.getGridSizeZ(),
- adaptor.getBlockSizeX(), adaptor.getBlockSizeY(),
- adaptor.getBlockSizeZ(), dynamicMemorySize, kernelArgs,
- nullPtr});
+ if (op.hasClusterSize()) {
+ auto funcOp = mod.lookupSymbol<mlir::LLVM::LLVMFuncOp>(
+ RTNAME_STRING(CUFLaunchClusterKernel));
+ auto funcTy = mlir::LLVM::LLVMFunctionType::get(
+ voidTy,
+ {ptrTy, llvmIntPtrType, llvmIntPtrType, llvmIntPtrType,
+ llvmIntPtrType, llvmIntPtrType, llvmIntPtrType, llvmIntPtrType,
+ llvmIntPtrType, llvmIntPtrType, i32Ty, ptrTy, ptrTy},
+ /*isVarArg=*/false);
+ auto cufLaunchClusterKernel = mlir::SymbolRefAttr::get(
+ mod.getContext(), RTNAME_STRING(CUFLaunchClusterKernel));
+ if (!funcOp) {
+ mlir::OpBuilder::InsertionGuard insertGuard(rewriter);
+ rewriter.setInsertionPointToStart(mod.getBody());
+ auto launchKernelFuncOp = rewriter.create<mlir::LLVM::LLVMFuncOp>(
+ loc, RTNAME_STRING(CUFLaunchClusterKernel), funcTy);
+ launchKernelFuncOp.setVisibility(
+ mlir::SymbolTable::Visibility::Private);
+ }
+ rewriter.replaceOpWithNewOp<mlir::LLVM::CallOp>(
+ op, funcTy, cufLaunchClusterKernel,
+ mlir::ValueRange{kernelPtr, adaptor.getClusterSizeX(),
+ adaptor.getClusterSizeY(), adaptor.getClusterSizeZ(),
+ adaptor.getGridSizeX(), adaptor.getGridSizeY(),
+ adaptor.getGridSizeZ(), adaptor.getBlockSizeX(),
+ adaptor.getBlockSizeY(), adaptor.getBlockSizeZ(),
+ dynamicMemorySize, kernelArgs, nullPtr});
+ } else {
+ auto funcOp = mod.lookupSymbol<mlir::LLVM::LLVMFuncOp>(
+ RTNAME_STRING(CUFLaunchKernel));
+ auto funcTy = mlir::LLVM::LLVMFunctionType::get(
+ voidTy,
+ {ptrTy, llvmIntPtrType, llvmIntPtrType, llvmIntPtrType,
+ llvmIntPtrType, llvmIntPtrType, llvmIntPtrType, i32Ty, ptrTy, ptrTy},
+ /*isVarArg=*/false);
+ auto cufLaunchKernel = mlir::SymbolRefAttr::get(
+ mod.getContext(), RTNAME_STRING(CUFLaunchKernel));
+ if (!funcOp) {
+ mlir::OpBuilder::InsertionGuard insertGuard(rewriter);
+ rewriter.setInsertionPointToStart(mod.getBody());
+ auto launchKernelFuncOp = rewriter.create<mlir::LLVM::LLVMFuncOp>(
+ loc, RTNAME_STRING(CUFLaunchKernel), funcTy);
+ launchKernelFuncOp.setVisibility(
+ mlir::SymbolTable::Visibility::Private);
+ }
+ rewriter.replaceOpWithNewOp<mlir::LLVM::CallOp>(
+ op, funcTy, cufLaunchKernel,
+ mlir::ValueRange{kernelPtr, adaptor.getGridSizeX(),
+ adaptor.getGridSizeY(), adaptor.getGridSizeZ(),
+ adaptor.getBlockSizeX(), adaptor.getBlockSizeY(),
+ adaptor.getBlockSizeZ(), dynamicMemorySize,
+ kernelArgs, nullPtr});
+ }
return mlir::success();
}
diff --git a/flang/test/Fir/CUDA/cuda-gpu-launch-func.mlir b/flang/test/Fir/CUDA/cuda-gpu-launch-func.mlir
index f10bd82f978dc4..7fede7c6c17b78 100644
--- a/flang/test/Fir/CUDA/cuda-gpu-launch-func.mlir
+++ b/flang/test/Fir/CUDA/cuda-gpu-launch-func.mlir
@@ -1,4 +1,4 @@
-// RUN: fir-opt --cuf-gpu-convert-to-llvm %s | FileCheck %s
+// RUN: fir-opt --split-input-file --cuf-gpu-convert-to-llvm %s | FileCheck %s
module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.ident = "flang version 20.0.0 (git at github.com:clementval/llvm-project.git ddcfd4d2dc17bf66cee8c3ef6284118684a2b0e6)", llvm.target_triple = "x86_64-unknown-linux-gnu"} {
llvm.func @_QMmod1Phost_sub() {
@@ -102,3 +102,25 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<i1, dense<8> : ve
// CHECK: %[[KERNEL_PTR:.*]] = llvm.mlir.addressof @_QMmod1Psub1 : !llvm.ptr
// CHECK: llvm.call @_FortranACUFLaunchKernel(%[[KERNEL_PTR]], {{.*}})
+
+// -----
+
+module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.ident = "flang version 20.0.0 (git at github.com:clementval/llvm-project.git 4116c1370ff76adf1e58eb3c39d0a14721794c70)", llvm.target_triple = "x86_64-unknown-linux-gnu"} {
+ llvm.func @_FortranACUFLaunchClusterKernel(!llvm.ptr, i64, i64, i64, i64, i64, i64, i64, i64, i64, i32, !llvm.ptr, !llvm.ptr) attributes {sym_visibility = "private"}
+ llvm.func @_QMmod1Psub1() attributes {cuf.cluster_dims = #cuf.cluster_dims<x = 2 : i64, y = 2 : i64, z = 1 : i64>} {
+ llvm.return
+ }
+ llvm.func @_QQmain() attributes {fir.bindc_name = "test"} {
+ %0 = llvm.mlir.constant(1 : index) : i64
+ %1 = llvm.mlir.constant(2 : index) : i64
+ %2 = llvm.mlir.constant(0 : i32) : i32
+ %3 = llvm.mlir.constant(10 : index) : i64
+ gpu.launch_func @cuda_device_mod::@_QMmod1Psub1 clusters in (%1, %1, %0) blocks in (%3, %3, %0) threads in (%3, %3, %0) : i64 dynamic_shared_memory_size %2
+ llvm.return
+ }
+ gpu.binary @cuda_device_mod [#gpu.object<#nvvm.target, "">]
+}
+
+// CHECK-LABEL: llvm.func @_QQmain()
+// CHECK: %[[KERNEL_PTR:.*]] = llvm.mlir.addressof @_QMmod1Psub1
+// CHECK: llvm.call @_FortranACUFLaunchClusterKernel(%[[KERNEL_PTR]], {{.*}})
More information about the flang-commits
mailing list