[Mlir-commits] [mlir] a63db3f - [mlir][gpu] Modifies `gpu.launch_func` to allow lowering it after gpu-to-llvm.
Fabian Mora
llvmlistbot at llvm.org
Fri Aug 11 14:56:45 PDT 2023
Author: Fabian Mora
Date: 2023-08-11T21:56:37Z
New Revision: a63db3f5f5dc5bfad9a63492c34fc1f6bd012e96
URL: https://github.com/llvm/llvm-project/commit/a63db3f5f5dc5bfad9a63492c34fc1f6bd012e96
DIFF: https://github.com/llvm/llvm-project/commit/a63db3f5f5dc5bfad9a63492c34fc1f6bd012e96.diff
LOG: [mlir][gpu] Modifies `gpu.launch_func` to allow lowering it after gpu-to-llvm.
**For an explanation of these patches see D154153.**
Commit message:
In order to lower `gpu.launch_func` after running `gpu-to-llvm`, the op must be
able to handle lowered types, e.g. `index` -> `i64`. This patch also allows the op
to refer to GPU binaries, not only GPU modules.
Depends on D154132.
Reviewed By: mehdi_amini
Differential Revision: https://reviews.llvm.org/D154137
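As a quick sketch of the intent (not taken from the patch; assumes %arg0 and a
@kernels module as in the tests further down), a launch whose dimensions were
already lowered to `i64` can now be expressed directly:

    %c8 = llvm.mlir.constant(8 : i64) : i64
    gpu.launch_func @kernels::@kernel_1
        blocks in (%c8, %c8, %c8) threads in (%c8, %c8, %c8) : i64
        args(%arg0 : f32)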
Added:
Modified:
mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
mlir/test/Dialect/GPU/invalid.mlir
mlir/test/Dialect/GPU/ops.mlir
Removed:
################################################################################
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
index 5be3a8fa2a9a37..2f8cb968f18b00 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -431,14 +431,19 @@ def GPU_GPUFuncOp : GPU_Op<"func", [
let hasVerifier = 1;
}
-def GPU_LaunchFuncOp : GPU_Op<"launch_func",
- [GPU_AsyncOpInterface, AttrSizedOperandSegments]>,
+def LaunchIndx : AnyTypeOf<[Index, I32, I64]>;
+
+def GPU_LaunchFuncOp :GPU_Op<"launch_func", [
+ GPU_AsyncOpInterface, AttrSizedOperandSegments,
+ AllTypesMatch<["gridSizeX", "gridSizeY", "gridSizeZ", "blockSizeX",
+ "blockSizeY", "blockSizeZ"]>]>,
Arguments<(ins Variadic<GPU_AsyncToken>:$asyncDependencies,
SymbolRefAttr:$kernel,
- Index:$gridSizeX, Index:$gridSizeY, Index:$gridSizeZ,
- Index:$blockSizeX, Index:$blockSizeY, Index:$blockSizeZ,
+ LaunchIndx:$gridSizeX, LaunchIndx:$gridSizeY, LaunchIndx:$gridSizeZ,
+ LaunchIndx:$blockSizeX, LaunchIndx:$blockSizeY, LaunchIndx:$blockSizeZ,
Optional<I32>:$dynamicSharedMemorySize,
- Variadic<AnyType>:$kernelOperands)>,
+ Variadic<AnyType>:$kernelOperands,
+ Optional<AnyType>:$asyncObject)>,
Results<(outs Optional<GPU_AsyncToken>:$asyncToken)> {
let summary = "Launches a function as a GPU kernel";
@@ -529,7 +534,11 @@ def GPU_LaunchFuncOp : GPU_Op<"launch_func",
"KernelDim3":$blockSize, "Value":$dynamicSharedMemorySize,
"ValueRange":$kernelOperands,
CArg<"Type", "nullptr">:$asyncTokenType,
- CArg<"ValueRange", "{}">:$asyncDependencies)>
+ CArg<"ValueRange", "{}">:$asyncDependencies)>,
+ OpBuilder<(ins "SymbolRefAttr":$kernel, "KernelDim3":$gridSize,
+ "KernelDim3":$blockSize, "Value":$dynamicSharedMemorySize,
+ "ValueRange":$kernelOperands,
+ CArg<"Value", "nullptr">:$asyncObject)>
];
let extraClassDeclaration = [{
@@ -559,9 +568,11 @@ def GPU_LaunchFuncOp : GPU_Op<"launch_func",
let assemblyFormat = [{
custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
+ (`<` $asyncObject^ `:` type($asyncObject) `>`)?
$kernel
- `blocks` `in` ` ` `(`$gridSizeX`,` $gridSizeY`,` $gridSizeZ`)`
- `threads` `in` ` ` `(`$blockSizeX`,` $blockSizeY`,` $blockSizeZ`)`
+ `blocks` `in` ` ` `(` $gridSizeX `,` $gridSizeY `,` $gridSizeZ `)`
+ `threads` `in` ` ` `(` $blockSizeX `,` $blockSizeY `,` $blockSizeZ `)`
+ custom<LaunchDimType>(type($gridSizeX))
(`dynamic_shared_memory_size` $dynamicSharedMemorySize^)?
custom<LaunchFuncOperands>($kernelOperands, type($kernelOperands)) attr-dict
}];
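With this format, the optional async object is printed in angle brackets ahead of
the kernel symbol, and the dimension type, when present, follows the `threads`
clause. A sketch mirroring the updated ops.mlir test (assuming %stream and %n are
defined):

    gpu.launch_func <%stream : !llvm.ptr> @kernels::@kernel_1
        blocks in (%n, %n, %n) threads in (%n, %n, %n) : i64
        args(%f : f32)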
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
index e5c7080f74d466..3fcc816d09a7c6 100644
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -326,12 +326,22 @@ LogicalResult GPUDialect::verifyOperationAttribute(Operation *op,
LaunchFuncOp::getKernelAttrName(launchOp->getName())))
return success();
- // Check that `launch_func` refers to a well-formed GPU kernel module.
- StringAttr kernelModuleName = launchOp.getKernelModuleName();
- auto kernelModule = module.lookupSymbol<GPUModuleOp>(kernelModuleName);
+ // Check that `launch_func` refers to a well-formed GPU kernel container.
+ StringAttr kernelContainerName = launchOp.getKernelModuleName();
+ Operation *kernelContainer = module.lookupSymbol(kernelContainerName);
+ if (!kernelContainer)
+ return launchOp.emitOpError()
+ << "kernel container '" << kernelContainerName.getValue()
+ << "' is undefined";
+
+ // If the container is a GPU binary op return success.
+ if (isa<BinaryOp>(kernelContainer))
+ return success();
+
+ auto kernelModule = dyn_cast<GPUModuleOp>(kernelContainer);
if (!kernelModule)
return launchOp.emitOpError()
- << "kernel module '" << kernelModuleName.getValue()
+ << "kernel module '" << kernelContainerName.getValue()
<< "' is undefined";
// Check that `launch_func` refers to a well-formed kernel function.
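In other words, `launch_func` may now resolve to a `gpu.binary` (introduced in
D154132) as well as to a `gpu.module`. A hedged sketch; the object attribute
syntax follows D154132 and the "BLOB" payload is a placeholder:

    gpu.binary @kernels [#gpu.object<#nvvm.target, "BLOB">]  // placeholder payload
    // ... later, inside a host function:
    gpu.launch_func @kernels::@kernel_1 blocks in (%n, %n, %n) threads in (%n, %n, %n)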
@@ -988,13 +998,45 @@ void LaunchFuncOp::build(OpBuilder &builder, OperationState &result,
auto kernelSymbol =
SymbolRefAttr::get(kernelModule.getNameAttr(),
{SymbolRefAttr::get(kernelFunc.getNameAttr())});
- result.addAttribute(getKernelAttrName(result.name), kernelSymbol);
- SmallVector<int32_t, 9> segmentSizes(9, 1);
- segmentSizes.front() = asyncDependencies.size();
- segmentSizes[segmentSizes.size() - 2] = dynamicSharedMemorySize ? 1 : 0;
- segmentSizes.back() = static_cast<int32_t>(kernelOperands.size());
- result.addAttribute(getOperandSegmentSizeAttr(),
- builder.getDenseI32ArrayAttr(segmentSizes));
+
+ Properties &prop = result.getOrAddProperties<Properties>();
+ prop.kernel = kernelSymbol;
+ size_t segmentSizesLen = std::size(prop.operandSegmentSizes);
+ // Initialize the segment sizes to 1.
+ for (auto &sz : prop.operandSegmentSizes)
+ sz = 1;
+ prop.operandSegmentSizes[0] = asyncDependencies.size();
+ prop.operandSegmentSizes[segmentSizesLen - 3] =
+ dynamicSharedMemorySize ? 1 : 0;
+ prop.operandSegmentSizes[segmentSizesLen - 2] =
+ static_cast<int32_t>(kernelOperands.size());
+ prop.operandSegmentSizes[segmentSizesLen - 1] = 0;
+}
+
+void LaunchFuncOp::build(OpBuilder &builder, OperationState &result,
+ SymbolRefAttr kernel, KernelDim3 gridSize,
+ KernelDim3 getBlockSize, Value dynamicSharedMemorySize,
+ ValueRange kernelOperands, Value asyncObject) {
+ // Add grid and block sizes as op operands, followed by the data operands.
+ result.addOperands({gridSize.x, gridSize.y, gridSize.z, getBlockSize.x,
+ getBlockSize.y, getBlockSize.z});
+ if (dynamicSharedMemorySize)
+ result.addOperands(dynamicSharedMemorySize);
+ result.addOperands(kernelOperands);
+ if (asyncObject)
+ result.addOperands(asyncObject);
+ Properties &prop = result.getOrAddProperties<Properties>();
+ prop.kernel = kernel;
+ size_t segmentSizesLen = std::size(prop.operandSegmentSizes);
+ // Initialize the segment sizes to 1.
+ for (auto &sz : prop.operandSegmentSizes)
+ sz = 1;
+ prop.operandSegmentSizes[0] = 0;
+ prop.operandSegmentSizes[segmentSizesLen - 3] =
+ dynamicSharedMemorySize ? 1 : 0;
+ prop.operandSegmentSizes[segmentSizesLen - 2] =
+ static_cast<int32_t>(kernelOperands.size());
+ prop.operandSegmentSizes[segmentSizesLen - 1] = asyncObject ? 1 : 0;
}
StringAttr LaunchFuncOp::getKernelModuleName() {
@@ -1037,6 +1079,22 @@ LogicalResult LaunchFuncOp::verify() {
return success();
}
+static ParseResult parseLaunchDimType(OpAsmParser &parser, Type &dimTy) {
+ if (succeeded(parser.parseOptionalColon())) {
+ if (parser.parseType(dimTy))
+ return failure();
+ } else {
+ dimTy = IndexType::get(parser.getContext());
+ }
+ return success();
+}
+
+static void printLaunchDimType(OpAsmPrinter &printer, Operation *op,
+ Type dimTy) {
+ if (!dimTy.isIndex())
+ printer << ": " << dimTy;
+}
+
static ParseResult parseLaunchFuncOperands(
OpAsmParser &parser,
SmallVectorImpl<OpAsmParser::UnresolvedOperand> &argNames,
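The net effect of the custom directive above: `index`-typed launches keep their
suffix-free syntax, while lowered integer launches carry an explicit type. A
sketch (assuming %idx : index and %i32 : i32 are defined):

    gpu.launch_func @kernels::@kernel_1 blocks in (%idx, %idx, %idx) threads in (%idx, %idx, %idx)
    gpu.launch_func @kernels::@kernel_1 blocks in (%i32, %i32, %i32) threads in (%i32, %i32, %i32) : i32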
diff --git a/mlir/test/Dialect/GPU/invalid.mlir b/mlir/test/Dialect/GPU/invalid.mlir
index bebbcebe66ce75..c8c0b7d24bc3ab 100644
--- a/mlir/test/Dialect/GPU/invalid.mlir
+++ b/mlir/test/Dialect/GPU/invalid.mlir
@@ -57,7 +57,7 @@ module attributes {gpu.container_module} {
func.func @launch_func_missing_callee_attribute(%sz : index) {
// expected-error at +1 {{'gpu.launch_func' op requires attribute 'kernel'}}
"gpu.launch_func"(%sz, %sz, %sz, %sz, %sz, %sz)
- {operandSegmentSizes = array<i32: 0, 1, 1, 1, 1, 1, 1, 0, 0>}
+ {operandSegmentSizes = array<i32: 0, 1, 1, 1, 1, 1, 1, 0, 0, 0>}
: (index, index, index, index, index, index) -> ()
return
}
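The trailing zero added here is the segment for the new optional $asyncObject
operand, giving the op ten segments in total: the async dependencies, the six
launch dimensions, the dynamic shared memory size, the kernel operands, and the
async object. A hypothetical generic-form instance carrying a stream:

    "gpu.launch_func"(%sz, %sz, %sz, %sz, %sz, %sz, %stream)
        {kernel = @kernels::@kernel_1,
         operandSegmentSizes = array<i32: 0, 1, 1, 1, 1, 1, 1, 0, 0, 1>}
        : (index, index, index, index, index, index, !llvm.ptr) -> ()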
@@ -77,7 +77,7 @@ module attributes {gpu.container_module} {
module attributes {gpu.container_module} {
func.func @launch_func_undefined_module(%sz : index) {
- // expected-error at +1 {{kernel module 'kernels' is undefined}}
+ // expected-error at +1 {{kernel container 'kernels' is undefined}}
gpu.launch_func @kernels::@kernel_1 blocks in (%sz, %sz, %sz) threads in (%sz, %sz, %sz)
return
}
diff --git a/mlir/test/Dialect/GPU/ops.mlir b/mlir/test/Dialect/GPU/ops.mlir
index 7e1f64f69119f5..0ef7cfb854e3e4 100644
--- a/mlir/test/Dialect/GPU/ops.mlir
+++ b/mlir/test/Dialect/GPU/ops.mlir
@@ -128,8 +128,10 @@ module attributes {gpu.container_module} {
%1 = "op"() : () -> (memref<?xf32, 1>)
// CHECK: %{{.*}} = arith.constant 8
%cst = arith.constant 8 : index
+ %cstI64 = arith.constant 8 : i64
%c0 = arith.constant 0 : i32
%t0 = gpu.wait async
+ %lowStream = llvm.mlir.null : !llvm.ptr
// CHECK: gpu.launch_func @kernels::@kernel_1 blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) args(%{{.*}} : f32, %{{.*}} : memref<?xf32, 1>)
gpu.launch_func @kernels::@kernel_1 blocks in (%cst, %cst, %cst) threads in (%cst, %cst, %cst) args(%0 : f32, %1 : memref<?xf32, 1>)
@@ -142,6 +144,12 @@ module attributes {gpu.container_module} {
// CHECK: %{{.*}} = gpu.launch_func async [%{{.*}}] @kernels::@kernel_2 blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}})
%t1 = gpu.launch_func async [%t0] @kernels::@kernel_2 blocks in (%cst, %cst, %cst) threads in (%cst, %cst, %cst)
+ // CHECK: gpu.launch_func <%{{.*}} : !llvm.ptr> @kernels::@kernel_1 blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) : i64 args(%{{.*}} : f32, %{{.*}} : memref<?xf32, 1>)
+ gpu.launch_func <%lowStream : !llvm.ptr> @kernels::@kernel_1 blocks in (%cstI64, %cstI64, %cstI64) threads in (%cstI64, %cstI64, %cstI64) : i64 args(%0 : f32, %1 : memref<?xf32, 1>)
+
+ // CHECK: gpu.launch_func @kernels::@kernel_1 blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) : i32 args(%{{.*}} : f32, %{{.*}} : memref<?xf32, 1>)
+ gpu.launch_func @kernels::@kernel_1 blocks in (%c0, %c0, %c0) threads in (%c0, %c0, %c0) : i32 args(%0 : f32, %1 : memref<?xf32, 1>)
+
// CHECK: %[[VALUES:.*]]:2 = call
%values:2 = func.call @two_value_generator() : () -> (f32, memref<?xf32, 1>)
// CHECK: gpu.launch_func @kernels::@kernel_1 {{.*}} args(%[[VALUES]]#0 : f32, %[[VALUES]]#1 : memref<?xf32, 1>)