[Mlir-commits] [mlir] [mlir][gpu] Add Support for Cluster of Thread Blocks in `gpu.launch` (PR #76924)
Guray Ozen
llvmlistbot at llvm.org
Thu Jan 4 06:33:51 PST 2024
https://github.com/grypp updated https://github.com/llvm/llvm-project/pull/76924
>From c1862bd473fcdf174e6b2dd310e3eb7b3ba3a7e5 Mon Sep 17 00:00:00 2001
From: Guray Ozen <guray.ozen at gmail.com>
Date: Thu, 4 Jan 2024 10:01:25 +0100
Subject: [PATCH 1/3] [mlir][gpu] Add Support for CGA Clusters in `gpu.launch`
This PR extends `gpu.launch` to support a new feature called CGA clusters (clusters of thread blocks). When using `gpu.launch`, one can now specify a cluster size, although it is optional. If provided, the outliner will transform the `gpu.launch` with the cluster size into a `gpu.launch_func`.
Previously, PR #72871 introduced the required support for clusters in the MLIR compiler and its CUDA runtime. This PR builds upon that work.
---
mlir/include/mlir/Dialect/GPU/IR/GPUOps.td | 54 +++++++++++--
mlir/lib/Dialect/GPU/IR/GPUDialect.cpp | 79 ++++++++++++++++---
.../GPU/Transforms/KernelOutlining.cpp | 20 +++--
.../SCFToGPU/no_blocks_no_threads.mlir | 4 +-
mlir/test/Dialect/GPU/invalid.mlir | 2 +-
mlir/test/Dialect/GPU/outlining.mlir | 74 +++++++++++++++++
6 files changed, 210 insertions(+), 23 deletions(-)
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
index efef61b5c6e712..a816d663596169 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -676,8 +676,11 @@ def GPU_LaunchOp : GPU_Op<"launch", [
DeclareOpInterfaceMethods<InferIntRangeInterface>,
RecursiveMemoryEffects]>,
Arguments<(ins Variadic<GPU_AsyncToken>:$asyncDependencies,
- Index:$gridSizeX, Index:$gridSizeY, Index:$gridSizeZ,
+ Index:$gridSizeX, Index:$gridSizeY, Index:$gridSizeZ,
Index:$blockSizeX, Index:$blockSizeY, Index:$blockSizeZ,
+ Optional<Index>:$clusterSizeX,
+ Optional<Index>:$clusterSizeY,
+ Optional<Index>:$clusterSizeZ,
Optional<I32>:$dynamicSharedMemorySize)>,
Results<(outs Optional<GPU_AsyncToken>:$asyncToken)> {
let summary = "GPU kernel launch operation";
@@ -700,8 +703,11 @@ def GPU_LaunchOp : GPU_Op<"launch", [
to the amount of dynamic shared memory a kernel's workgroup should be
allocated; when this operand is not present, a zero size is assumed.
- The body region has at least _twelve_ arguments, grouped as follows:
+ The body region has at least _twelve_ arguments, or _eighteen_ if cluster
+ dimensions are present, grouped as follows:
+ - three optional arguments that contain cluster identifiers along x,y,z
+ dimensions;
- three arguments that contain block identifiers along x,y,z dimensions;
- three arguments that contain thread identifiers along x,y,z dimensions;
- operands of the `gpu.launch` operation as is (i.e. the operands for
@@ -713,6 +719,7 @@ def GPU_LaunchOp : GPU_Op<"launch", [
```
operation ::= `gpu.launch` (`async` (`[` ssa-id-list `]`)? )?
+ ( `clusters` `(` ssa-id-list `)` `in` ssa-reassignment )?
`blocks` `(` ssa-id-list `)` `in` ssa-reassignment
`threads` `(` ssa-id-list `)` `in` ssa-reassignment
(dynamic_shared_memory_size ssa-use)?
@@ -763,6 +770,16 @@ def GPU_LaunchOp : GPU_Op<"launch", [
// Assuming %val1 is defined outside the gpu.launch region.
%42 = load %workgroup[%bx] : memref<32xf32, 3>
}
+
+ // Launch with clusters.
+ gpu.launch clusters(%cx, %cy, %cz) in (%sz_cx = %0, %sz_cy = %1, %sz_cz = %2)
+ blocks(%bx, %by, %bz) in (%sz_bx = %3, %sz_by = %4, %sz_bz = %5)
+ threads(%tx, %ty, %tz) in (%sz_tx = %6, %sz_ty = %7, %sz_tz = %8)
+ {
+ // Cluster, block and thread identifiers, as well as cluster/block/grid
+ // sizes are immediately usable inside body region.
+ "some_op"(%cx, %bx, %tx) : (index, index, index) -> ()
+ }
```
Rationale: using operation/block arguments gives analyses a clear way of
@@ -784,7 +801,10 @@ def GPU_LaunchOp : GPU_Op<"launch", [
CArg<"Type", "nullptr">:$asyncTokenType,
CArg<"ValueRange", "{}">:$asyncDependencies,
CArg<"TypeRange", "{}">:$workgroupAttributions,
- CArg<"TypeRange", "{}">:$privateAttributions)>
+ CArg<"TypeRange", "{}">:$privateAttributions,
+ CArg<"Value", "nullptr">:$clusterSizeX,
+ CArg<"Value", "nullptr">:$clusterSizeY,
+ CArg<"Value", "nullptr">:$clusterSizeZ)>
];
let extraClassDeclaration = [{
@@ -792,17 +812,24 @@ def GPU_LaunchOp : GPU_Op<"launch", [
KernelDim3 getBlockIds();
/// Get the SSA values corresponding to kernel thread identifiers.
KernelDim3 getThreadIds();
+ /// Get the SSA values corresponding to kernel cluster identifiers.
+ std::optional<KernelDim3> getClusterIds();
/// Get the SSA values corresponding to kernel grid size.
KernelDim3 getGridSize();
/// Get the SSA values corresponding to kernel block size.
KernelDim3 getBlockSize();
+ /// Get the SSA values corresponding to kernel cluster size.
+ std::optional<KernelDim3> getClusterSize();
/// Get the SSA values passed as operands to specify the grid size.
KernelDim3 getGridSizeOperandValues();
/// Get the SSA values passed as operands to specify the block size.
KernelDim3 getBlockSizeOperandValues();
+ /// Get the SSA values passed as operands to specify the cluster size.
+ std::optional<KernelDim3> getClusterSizeOperandValues();
static StringRef getBlocksKeyword() { return "blocks"; }
+ static StringRef getClustersKeyword() { return "clusters"; }
static StringRef getThreadsKeyword() { return "threads"; }
static StringRef getDynamicSharedMemorySizeKeyword() {
return "dynamic_shared_memory_size";
@@ -816,6 +843,21 @@ def GPU_LaunchOp : GPU_Op<"launch", [
/// placed in the leading positions of the argument list.
static constexpr unsigned kNumConfigRegionAttributes = 12;
+ /// Returns true if cluster size is specified.
+ bool hasClusterSize() {
+ if (getClusterSizeX() && getClusterSizeY() && getClusterSizeZ())
+ return true;
+ return false;
+ }
+ /// Returns the number of operands including cluster size
+ unsigned getNumConfigOperands() {
+ return kNumConfigOperands + (hasClusterSize() ? 3 : 0);
+ }
+ /// Returns the number of region attributes including cluster size
+ unsigned getNumConfigRegionAttributes() {
+ return kNumConfigRegionAttributes + (hasClusterSize() ? 6 : 0);
+ }
+
/// Returns the keywords used in the custom syntax for this Op.
static StringRef getWorkgroupKeyword() { return "workgroup"; }
static StringRef getPrivateKeyword() { return "private"; }
@@ -831,7 +873,7 @@ def GPU_LaunchOp : GPU_Op<"launch", [
/// the workgroup memory
ArrayRef<BlockArgument> getWorkgroupAttributions() {
auto begin =
- std::next(getBody().args_begin(), kNumConfigRegionAttributes);
+ std::next(getBody().args_begin(), getNumConfigRegionAttributes());
auto end = std::next(begin, getNumWorkgroupAttributions());
return {begin, end};
}
@@ -842,7 +884,7 @@ def GPU_LaunchOp : GPU_Op<"launch", [
/// Returns the number of buffers located in the private memory.
unsigned getNumPrivateAttributions() {
- return getBody().getNumArguments() - kNumConfigRegionAttributes -
+ return getBody().getNumArguments() - getNumConfigRegionAttributes() -
getNumWorkgroupAttributions();
}
@@ -853,7 +895,7 @@ def GPU_LaunchOp : GPU_Op<"launch", [
// memory.
auto begin =
std::next(getBody().args_begin(),
- kNumConfigRegionAttributes + getNumWorkgroupAttributions());
+ getNumConfigRegionAttributes() + getNumWorkgroupAttributions());
return {begin, getBody().args_end()};
}
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
index dd482f305fcbc8..459b93cf033a0e 100644
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -646,7 +646,8 @@ void LaunchOp::build(OpBuilder &builder, OperationState &result,
Value getBlockSizeZ, Value dynamicSharedMemorySize,
Type asyncTokenType, ValueRange asyncDependencies,
TypeRange workgroupAttributions,
- TypeRange privateAttributions) {
+ TypeRange privateAttributions, Value clusterSizeX,
+ Value clusterSizeY, Value clusterSizeZ) {
// Add a WorkGroup attribution attribute. This attribute is required to
// identify private attributions in the list of block argguments.
result.addAttribute(getNumWorkgroupAttributionsAttrName(),
@@ -660,6 +661,8 @@ void LaunchOp::build(OpBuilder &builder, OperationState &result,
// Add grid and block sizes as op operands, followed by the data operands.
result.addOperands({gridSizeX, gridSizeY, gridSizeZ, getBlockSizeX,
getBlockSizeY, getBlockSizeZ});
+ if (clusterSizeX && clusterSizeY && clusterSizeZ)
+ result.addOperands({clusterSizeX, clusterSizeY, clusterSizeZ});
if (dynamicSharedMemorySize)
result.addOperands(dynamicSharedMemorySize);
@@ -678,9 +681,14 @@ void LaunchOp::build(OpBuilder &builder, OperationState &result,
body->addArgument(argTy, result.location);
kernelRegion->push_back(body);
// Fill OperandSegmentSize Attribute.
- SmallVector<int32_t, 8> segmentSizes(8, 1);
+ SmallVector<int32_t, 11> segmentSizes(11, 1);
segmentSizes.front() = asyncDependencies.size();
segmentSizes.back() = dynamicSharedMemorySize ? 1 : 0;
+ if (!clusterSizeX) {
+ segmentSizes[7] = 0;
+ segmentSizes[8] = 0;
+ segmentSizes[9] = 0;
+ }
result.addAttribute(getOperandSegmentSizeAttr(),
builder.getDenseI32ArrayAttr(segmentSizes));
}
@@ -709,6 +717,22 @@ KernelDim3 LaunchOp::getBlockSize() {
return KernelDim3{args[9], args[10], args[11]};
}
+std::optional<KernelDim3> LaunchOp::getClusterIds() {
+ assert(!getBody().empty() && "LaunchOp body must not be empty.");
+ if (!hasClusterSize())
+ return std::nullopt;
+ auto args = getBody().getArguments();
+ return KernelDim3{args[12], args[13], args[14]};
+}
+
+std::optional<KernelDim3> LaunchOp::getClusterSize() {
+ assert(!getBody().empty() && "LaunchOp body must not be empty.");
+ if (!hasClusterSize())
+ return std::nullopt;
+ auto args = getBody().getArguments();
+ return KernelDim3{args[15], args[16], args[17]};
+}
+
KernelDim3 LaunchOp::getGridSizeOperandValues() {
auto operands = getOperands().drop_front(getAsyncDependencies().size());
return KernelDim3{operands[0], operands[1], operands[2]};
@@ -719,6 +743,13 @@ KernelDim3 LaunchOp::getBlockSizeOperandValues() {
return KernelDim3{operands[3], operands[4], operands[5]};
}
+std::optional<KernelDim3> LaunchOp::getClusterSizeOperandValues() {
+ auto operands = getOperands().drop_front(getAsyncDependencies().size());
+ if (!hasClusterSize())
+ return std::nullopt;
+ return KernelDim3{operands[6], operands[7], operands[8]};
+}
+
LogicalResult LaunchOp::verifyRegions() {
// Kernel launch takes kNumConfigOperands leading operands for grid/block
// sizes and transforms them into kNumConfigRegionAttributes region arguments
@@ -778,6 +809,12 @@ void LaunchOp::print(OpAsmPrinter &p) {
p << " [" << getAsyncDependencies() << ']';
}
// Print the launch configuration.
+ if (getClusterSizeX()) {
+ p << ' ' << getClustersKeyword();
+ printSizeAssignment(p, getClusterSize().value(),
+ getClusterSizeOperandValues().value(),
+ getClusterIds().value());
+ }
p << ' ' << getBlocksKeyword();
printSizeAssignment(p, getGridSize(), getGridSizeOperandValues(),
getBlockIds());
@@ -831,6 +868,7 @@ parseSizeAssignment(OpAsmParser &parser,
/// Parses a Launch operation.
/// operation ::= `gpu.launch` (`async` `[` ssa-id-list `]`)?
+/// `clusters` `(` ssa-id-list `)` `in` ssa-reassignment (Optional)
/// `blocks` `(` ssa-id-list `)` `in` ssa-reassignment
/// `threads` `(` ssa-id-list `)` `in` ssa-reassignment
/// memory-attribution
@@ -840,7 +878,6 @@ ParseResult LaunchOp::parse(OpAsmParser &parser, OperationState &result) {
// Sizes of the grid and block.
SmallVector<OpAsmParser::UnresolvedOperand, LaunchOp::kNumConfigOperands>
sizes(LaunchOp::kNumConfigOperands);
- MutableArrayRef<OpAsmParser::UnresolvedOperand> sizesRef(sizes);
// Actual (data) operands passed to the kernel.
SmallVector<OpAsmParser::UnresolvedOperand, 4> dataOperands;
@@ -848,7 +885,6 @@ ParseResult LaunchOp::parse(OpAsmParser &parser, OperationState &result) {
// Region arguments to be created.
SmallVector<OpAsmParser::UnresolvedOperand, 16> regionArgs(
LaunchOp::kNumConfigRegionAttributes);
- MutableArrayRef<OpAsmParser::UnresolvedOperand> regionArgsRef(regionArgs);
// Parse optional async dependencies.
SmallVector<OpAsmParser::UnresolvedOperand, 4> asyncDependencies;
@@ -861,6 +897,24 @@ ParseResult LaunchOp::parse(OpAsmParser &parser, OperationState &result) {
if (parser.getNumResults() > 0)
result.types.push_back(asyncTokenType);
+ bool hasCluster = false;
+ if (succeeded(
+ parser.parseOptionalKeyword(LaunchOp::getClustersKeyword().data()))) {
+ hasCluster = true;
+ sizes.resize(9);
+ regionArgs.resize(18);
+ }
+ MutableArrayRef<OpAsmParser::UnresolvedOperand> sizesRef(sizes);
+ MutableArrayRef<OpAsmParser::UnresolvedOperand> regionArgsRef(regionArgs);
+
+ // The last three segments assign the cluster size. In the region argument
+ // list, these are the last 6 arguments.
+ if (hasCluster) {
+ if (parseSizeAssignment(parser, sizesRef.drop_front(6),
+ regionArgsRef.slice(15, 3),
+ regionArgsRef.slice(12, 3)))
+ return failure();
+ }
// Parse the size assignment segments: the first segment assigns grid sizes
// and defines values for block identifiers; the second segment assigns block
// sizes and defines values for thread identifiers. In the region argument
@@ -898,7 +952,7 @@ ParseResult LaunchOp::parse(OpAsmParser &parser, OperationState &result) {
// LaunchOp::getNumWorkgroupAttributionsAttrName().
Type index = parser.getBuilder().getIndexType();
SmallVector<Type, LaunchOp::kNumConfigRegionAttributes> dataTypes(
- LaunchOp::kNumConfigRegionAttributes, index);
+ LaunchOp::kNumConfigRegionAttributes + 6, index);
SmallVector<OpAsmParser::Argument> regionArguments;
for (auto ssaValueAndType : llvm::zip(regionArgs, dataTypes)) {
@@ -916,8 +970,9 @@ ParseResult LaunchOp::parse(OpAsmParser &parser, OperationState &result) {
// Store the number of operands we just parsed as the number of workgroup
// memory attributions.
- unsigned numWorkgroupAttrs =
- regionArguments.size() - LaunchOp::kNumConfigRegionAttributes;
+ unsigned numWorkgroupAttrs = regionArguments.size() -
+ LaunchOp::kNumConfigRegionAttributes -
+ (hasCluster ? 6 : 0);
result.addAttribute(LaunchOp::getNumWorkgroupAttributionsAttrName(),
builder.getI64IntegerAttr(numWorkgroupAttrs));
@@ -934,8 +989,14 @@ ParseResult LaunchOp::parse(OpAsmParser &parser, OperationState &result) {
parser.parseOptionalAttrDict(result.attributes))
return failure();
- SmallVector<int32_t, 8> segmentSizes(8, 1);
+ SmallVector<int32_t, 11> segmentSizes(11, 1);
segmentSizes.front() = asyncDependencies.size();
+
+ if (!hasCluster) {
+ segmentSizes[7] = 0;
+ segmentSizes[8] = 0;
+ segmentSizes[9] = 0;
+ }
segmentSizes.back() = hasDynamicSharedMemorySize ? 1 : 0;
result.addAttribute(LaunchOp::getOperandSegmentSizeAttr(),
parser.getBuilder().getDenseI32ArrayAttr(segmentSizes));
@@ -992,7 +1053,7 @@ BlockArgument LaunchOp::addWorkgroupAttribution(Type type, Location loc) {
(*this)->setAttr(attrName,
IntegerAttr::get(attr.getType(), attr.getValue() + 1));
return getBody().insertArgument(
- LaunchOp::kNumConfigRegionAttributes + attr.getInt(), type, loc);
+ LaunchOp::getNumConfigRegionAttributes() + attr.getInt(), type, loc);
}
/// Adds a new block argument that corresponds to buffers located in
diff --git a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
index 7432a58f18b442..2436113dc4239c 100644
--- a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
@@ -49,15 +49,21 @@ static void createForAllDimensions(OpBuilder &builder, Location loc,
/// entry block of `launchOpBody`, to the corresponding result value of the
/// added operations.
static void injectGpuIndexOperations(Location loc, Region &launchFuncOpBody,
- Region &launchOpBody, IRMapping &map) {
+ Region &launchOpBody, IRMapping &map,
+ bool hasCluster = false) {
OpBuilder builder(loc->getContext());
Block &firstBlock = launchOpBody.front();
builder.setInsertionPointToStart(&launchFuncOpBody.front());
- SmallVector<Value, 12> indexOps;
+ SmallVector<Value> indexOps;
+ // The order is important here, as it must match the order of the arguments
createForAllDimensions<gpu::BlockIdOp>(builder, loc, indexOps);
createForAllDimensions<gpu::ThreadIdOp>(builder, loc, indexOps);
createForAllDimensions<gpu::GridDimOp>(builder, loc, indexOps);
createForAllDimensions<gpu::BlockDimOp>(builder, loc, indexOps);
+ if (hasCluster) {
+ createForAllDimensions<gpu::ClusterIdOp>(builder, loc, indexOps);
+ createForAllDimensions<gpu::ClusterDimOp>(builder, loc, indexOps);
+ }
// Replace the leading 12 function args with the respective thread/block index
// operations. Iterate backwards since args are erased and indices change.
for (const auto &indexOp : enumerate(indexOps))
@@ -212,9 +218,11 @@ static gpu::GPUFuncOp outlineKernelFuncImpl(gpu::LaunchOp launchOp,
IRMapping map;
// Map the arguments corresponding to the launch parameters like blockIdx,
- // threadIdx, etc.
+ // threadIdx, etc. If cluster is present, then we also generate clusterIdx and
+ // clusterDim.
Region &outlinedFuncBody = outlinedFunc.getBody();
- injectGpuIndexOperations(loc, outlinedFuncBody, launchOpBody, map);
+ injectGpuIndexOperations(loc, outlinedFuncBody, launchOpBody, map,
+ launchOp.hasClusterSize());
// Map memory attributions from the LaunOp op to the GPUFuncOp attributions.
for (const auto &[launchArg, funcArg] :
@@ -278,12 +286,14 @@ static void convertToLaunchFuncOp(gpu::LaunchOp launchOp,
// The launch op has an optional dynamic shared memory size. If it doesn't
// exist, we use zero.
Value asyncToken = launchOp.getAsyncToken();
+ std::optional<gpu::KernelDim3> clusterSize =
+ launchOp.getClusterSizeOperandValues();
auto launchFunc = builder.create<gpu::LaunchFuncOp>(
launchOp.getLoc(), kernelFunc, launchOp.getGridSizeOperandValues(),
launchOp.getBlockSizeOperandValues(),
launchOp.getDynamicSharedMemorySize(), operands,
asyncToken ? asyncToken.getType() : nullptr,
- launchOp.getAsyncDependencies());
+ launchOp.getAsyncDependencies(), clusterSize);
launchOp.replaceAllUsesWith(launchFunc);
launchOp.erase();
}
diff --git a/mlir/test/Conversion/SCFToGPU/no_blocks_no_threads.mlir b/mlir/test/Conversion/SCFToGPU/no_blocks_no_threads.mlir
index a058365a104a1f..79eef8ae7eb856 100644
--- a/mlir/test/Conversion/SCFToGPU/no_blocks_no_threads.mlir
+++ b/mlir/test/Conversion/SCFToGPU/no_blocks_no_threads.mlir
@@ -17,8 +17,8 @@ func.func @one_d_loop(%A : memref<?xf32>, %B : memref<?xf32>) {
// CHECK-BLOCKS-NEXT: %{{.*}} = arith.constant 1 : index
// CHECK-BLOCKS-NEXT: %[[ONE:.*]] = arith.constant 1 : index
- // CHECK-THREADS-NEXT: gpu.launch blocks(%[[B0:.*]], %[[B1:.*]], %[[B2:.*]]) in (%{{.*}} = %[[ONE]], %{{.*}} = %[[ONE]], %{{.*}}0 = %[[ONE]]) threads(%[[T0:.*]], %[[T1:.*]], %[[T2:.*]]) in (%{{.*}} = %[[BOUND]], %{{.*}} = %[[ONE]], %{{.*}} = %[[ONE]])
- // CHECK-BLOCKS-NEXT: gpu.launch blocks(%[[B0:.*]], %[[B1:.*]], %[[B2:.*]]) in (%{{.*}} = %[[BOUND]], %{{.*}} = %[[ONE]], %{{.*}}0 = %[[ONE]]) threads(%[[T0:.*]], %[[T1:.*]], %[[T2:.*]]) in (%{{.*}} = %[[ONE]], %{{.*}} = %[[ONE]], %{{.*}} = %[[ONE]])
+ // CHECK-THREADS-NEXT: gpu.launch blocks(%[[B0:.*]], %[[B1:.*]], %[[B2:.*]]) in (%{{.*}} = %[[ONE]], %{{.*}} = %[[ONE]], %{{.*}} = %[[ONE]]) threads(%[[T0:.*]], %[[T1:.*]], %[[T2:.*]]) in (%{{.*}} = %[[BOUND]], %{{.*}} = %[[ONE]], %{{.*}} = %[[ONE]])
+ // CHECK-BLOCKS-NEXT: gpu.launch blocks(%[[B0:.*]], %[[B1:.*]], %[[B2:.*]]) in (%{{.*}} = %[[BOUND]], %{{.*}} = %[[ONE]], %{{.*}} = %[[ONE]]) threads(%[[T0:.*]], %[[T1:.*]], %[[T2:.*]]) in (%{{.*}} = %[[ONE]], %{{.*}} = %[[ONE]], %{{.*}} = %[[ONE]])
affine.for %i = 0 to 42 {
// CHECK-THREADS-NEXT: %[[INDEX:.*]] = arith.addi %{{.*}}, %[[T0]]
// CHECK-THREADS-NEXT: memref.load %{{.*}}[%[[INDEX]]]
diff --git a/mlir/test/Dialect/GPU/invalid.mlir b/mlir/test/Dialect/GPU/invalid.mlir
index 8a34d64326072b..4d3a898fdd1565 100644
--- a/mlir/test/Dialect/GPU/invalid.mlir
+++ b/mlir/test/Dialect/GPU/invalid.mlir
@@ -16,7 +16,7 @@ func.func @no_region_attrs(%sz : index) {
^bb1(%bx: index, %by: index, %bz: index,
%tx: index, %ty: index, %tz: index):
gpu.terminator
- }) {operandSegmentSizes = array<i32: 0, 1, 1, 1, 1, 1, 1, 0>} : (index, index, index, index, index, index) -> ()
+ }) {operandSegmentSizes = array<i32: 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0>} : (index, index, index, index, index, index) -> ()
return
}
diff --git a/mlir/test/Dialect/GPU/outlining.mlir b/mlir/test/Dialect/GPU/outlining.mlir
index 8020f6dfa65b74..601add9a9f91c0 100644
--- a/mlir/test/Dialect/GPU/outlining.mlir
+++ b/mlir/test/Dialect/GPU/outlining.mlir
@@ -407,3 +407,77 @@ func.func @launch_memory_attributions_1(%arg0 : memref<*xf32>) {
}
// CHECK-DL-LABEL: gpu.module @launch_memory_attributions_1_kernel attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<index, 32 : i32>>}
+
+// -----
+// CHECK: module attributes {gpu.container_module}
+
+// CHECK-LABEL: func @launch_cluster()
+func.func @launch_cluster() {
+ // CHECK: %[[ARG0:.*]] = "op"() : () -> f32
+ %0 = "op"() : () -> (f32)
+ // CHECK: %[[ARG1:.*]] = "op"() : () -> memref<?xf32, 1>
+ %1 = "op"() : () -> (memref<?xf32, 1>)
+ // CHECK: %[[CDIMX:.*]] = arith.constant 1
+ %cDimX = arith.constant 1 : index
+ // CHECK: %[[CDIMY:.*]] = arith.constant 2
+ %cDimY = arith.constant 2 : index
+ // CHECK: %[[CDIMZ:.*]] = arith.constant 1
+ %cDimZ = arith.constant 1 : index
+ // CHECK: %[[GDIMX:.*]] = arith.constant 8
+ %gDimX = arith.constant 8 : index
+ // CHECK: %[[GDIMY:.*]] = arith.constant 12
+ %gDimY = arith.constant 12 : index
+ // CHECK: %[[GDIMZ:.*]] = arith.constant 16
+ %gDimZ = arith.constant 16 : index
+ // CHECK: %[[BDIMX:.*]] = arith.constant 20
+ %bDimX = arith.constant 20 : index
+ // CHECK: %[[BDIMY:.*]] = arith.constant 24
+ %bDimY = arith.constant 24 : index
+ // CHECK: %[[BDIMZ:.*]] = arith.constant 28
+ %bDimZ = arith.constant 28 : index
+
+ // CHECK: gpu.launch_func @launch_cluster_kernel::@launch_cluster_kernel clusters in (%[[CDIMX]], %[[CDIMY]], %[[CDIMZ]]) blocks in (%[[GDIMX]], %[[GDIMY]], %[[GDIMZ]]) threads in (%[[BDIMX]], %[[BDIMY]], %[[BDIMZ]]) args(%[[ARG0]] : f32, %[[ARG1]] : memref<?xf32, 1>)
+ // CHECK-NOT: gpu.launch blocks
+ gpu.launch clusters(%cx, %cy, %cz) in (%cluster_x = %cDimX, %cluster_y = %cDimY,
+ %cluster_z = %cDimZ)
+ blocks(%bx, %by, %bz) in (%grid_x = %gDimX, %grid_y = %gDimY,
+ %grid_z = %gDimZ)
+ threads(%tx, %ty, %tz) in (%block_x = %bDimX, %block_y = %bDimY,
+ %block_z = %bDimZ) {
+ "use"(%0): (f32) -> ()
+ "some_op"(%cx, %bx, %block_x) : (index, index, index) -> ()
+ %42 = memref.load %1[%tx] : memref<?xf32, 1>
+ gpu.terminator
+ }
+ return
+}
+
+// CHECK-LABEL: gpu.module @launch_cluster_kernel
+// CHECK-NEXT: gpu.func @launch_cluster_kernel
+// CHECK-SAME: (%[[KERNEL_ARG0:.*]]: f32, %[[KERNEL_ARG1:.*]]: memref<?xf32, 1>)
+// CHECK-SAME: gpu.known_block_size = array<i32: 20, 24, 28>
+// CHECK-SAME: gpu.known_grid_size = array<i32: 8, 12, 16>
+// CHECK-NEXT: %[[BID:.*]] = gpu.block_id x
+// CHECK-NEXT: = gpu.block_id y
+// CHECK-NEXT: = gpu.block_id z
+// CHECK-NEXT: %[[TID:.*]] = gpu.thread_id x
+// CHECK-NEXT: = gpu.thread_id y
+// CHECK-NEXT: = gpu.thread_id z
+// CHECK-NEXT: = gpu.grid_dim x
+// CHECK-NEXT: = gpu.grid_dim y
+// CHECK-NEXT: = gpu.grid_dim z
+// CHECK-NEXT: %[[BDIM:.*]] = gpu.block_dim x
+// CHECK-NEXT: = gpu.block_dim y
+// CHECK-NEXT: = gpu.block_dim z
+// CHECK-NEXT: %[[CID:.*]] = gpu.cluster_id x
+// CHECK-NEXT: = gpu.cluster_id y
+// CHECK-NEXT: = gpu.cluster_id z
+// CHECK-NEXT: %[[CDIM:.*]] = gpu.cluster_dim x
+// CHECK-NEXT: = gpu.cluster_dim y
+// CHECK-NEXT: = gpu.cluster_dim z
+// CHECK-NEXT: cf.br ^[[BLOCK:.*]]
+// CHECK-NEXT: ^[[BLOCK]]:
+// CHECK-NEXT: "use"(%[[KERNEL_ARG0]]) : (f32) -> ()
+// CHECK-NEXT: "some_op"(%[[CID]], %[[BID]], %[[BDIM]]) : (index, index, index) -> ()
+// CHECK-NEXT: = memref.load %[[KERNEL_ARG1]][%[[TID]]] : memref<?xf32, 1>
+
>From 500315765604672febe2b52994724bf844acb5cd Mon Sep 17 00:00:00 2001
From: Guray Ozen <guray.ozen at gmail.com>
Date: Thu, 4 Jan 2024 10:03:42 +0100
Subject: [PATCH 2/3] remove whitespace
---
mlir/include/mlir/Dialect/GPU/IR/GPUOps.td | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
index a816d663596169..712ceefdda6fa5 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -676,7 +676,7 @@ def GPU_LaunchOp : GPU_Op<"launch", [
DeclareOpInterfaceMethods<InferIntRangeInterface>,
RecursiveMemoryEffects]>,
Arguments<(ins Variadic<GPU_AsyncToken>:$asyncDependencies,
- Index:$gridSizeX, Index:$gridSizeY, Index:$gridSizeZ,
+ Index:$gridSizeX, Index:$gridSizeY, Index:$gridSizeZ,
Index:$blockSizeX, Index:$blockSizeY, Index:$blockSizeZ,
Optional<Index>:$clusterSizeX,
Optional<Index>:$clusterSizeY,
>From dd95d78c4c432499e3128deffae9d2bb6395298c Mon Sep 17 00:00:00 2001
From: Guray Ozen <guray.ozen at gmail.com>
Date: Thu, 4 Jan 2024 15:33:38 +0100
Subject: [PATCH 3/3] add verifier and address @apaszke comments
---
mlir/include/mlir/Dialect/GPU/IR/GPUOps.td | 1 +
mlir/lib/Dialect/GPU/IR/GPUDialect.cpp | 9 ++++++++-
2 files changed, 9 insertions(+), 1 deletion(-)
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
index 712ceefdda6fa5..8d4a110ee801f0 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -913,6 +913,7 @@ def GPU_LaunchOp : GPU_Op<"launch", [
let hasCanonicalizer = 1;
let hasCustomAssemblyFormat = 1;
let hasRegionVerifier = 1;
+ let hasVerifier = 1;
}
def GPU_PrintfOp : GPU_Op<"printf", [MemoryEffects<[MemWrite]>]>,
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
index 459b93cf033a0e..81326e67f3caad 100644
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -750,6 +750,13 @@ std::optional<KernelDim3> LaunchOp::getClusterSizeOperandValues() {
return KernelDim3{operands[6], operands[7], operands[8]};
}
+LogicalResult LaunchOp::verify() {
+ if (!(hasClusterSize()) &&
+ (getClusterSizeX() || getClusterSizeY() || getClusterSizeZ()))
+ return emitOpError() << "cluster size must be all present";
+ return success();
+}
+
LogicalResult LaunchOp::verifyRegions() {
// Kernel launch takes kNumConfigOperands leading operands for grid/block
// sizes and transforms them into kNumConfigRegionAttributes region arguments
@@ -809,7 +816,7 @@ void LaunchOp::print(OpAsmPrinter &p) {
p << " [" << getAsyncDependencies() << ']';
}
// Print the launch configuration.
- if (getClusterSizeX()) {
+ if (hasClusterSize()) {
p << ' ' << getClustersKeyword();
printSizeAssignment(p, getClusterSize().value(),
getClusterSizeOperandValues().value(),
More information about the Mlir-commits
mailing list