[Mlir-commits] [mlir] [mlir][gpu] Add Support for Cluster of Thread Blocks in `gpu.launch` (PR #76924)

Thu Jan 4 06:33:51 PST 2024

https://github.com/grypp updated https://github.com/llvm/llvm-project/pull/76924

>From c1862bd473fcdf174e6b2dd310e3eb7b3ba3a7e5 Mon Sep 17 00:00:00 2001
From: Guray Ozen <guray.ozen at gmail.com>
Date: Thu, 4 Jan 2024 10:01:25 +0100
Subject: [PATCH 1/3] [mlir][gpu] Add Support for CGA Clusters in `gpu.launch`

This PR improves `gpu.launch` to handle a new feature called cga cluster. Now, when using `gpu.launch`, one can include a cluster size, although it's opitional. If provided, the outliner will transform `gpu.launch` with the cluster size into `gpu.launch_func`.

Previously, PR #72871 introduced the required support for clusters in the MLIR compiler and its CUDA runtime. This PR builds upon that work.
---
 mlir/include/mlir/Dialect/GPU/IR/GPUOps.td    | 54 +++++++++++--
 mlir/lib/Dialect/GPU/IR/GPUDialect.cpp        | 79 ++++++++++++++++---
 .../GPU/Transforms/KernelOutlining.cpp        | 20 +++--
 .../SCFToGPU/no_blocks_no_threads.mlir        |  4 +-
 mlir/test/Dialect/GPU/invalid.mlir            |  2 +-
 mlir/test/Dialect/GPU/outlining.mlir          | 74 +++++++++++++++++
 6 files changed, 210 insertions(+), 23 deletions(-)

diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
index efef61b5c6e712..a816d663596169 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -676,8 +676,11 @@ def GPU_LaunchOp : GPU_Op<"launch", [
       DeclareOpInterfaceMethods<InferIntRangeInterface>,
       RecursiveMemoryEffects]>,
     Arguments<(ins Variadic<GPU_AsyncToken>:$asyncDependencies,
-               Index:$gridSizeX, Index:$gridSizeY, Index:$gridSizeZ,
+               Index:$gridSizeX, Index:$gridSizeY, Index:$gridSizeZ,               
                Index:$blockSizeX, Index:$blockSizeY, Index:$blockSizeZ,
+               Optional<Index>:$clusterSizeX, 
+               Optional<Index>:$clusterSizeY, 
+               Optional<Index>:$clusterSizeZ,
                Optional<I32>:$dynamicSharedMemorySize)>,
     Results<(outs Optional<GPU_AsyncToken>:$asyncToken)> {
   let summary = "GPU kernel launch operation";
@@ -700,8 +703,11 @@ def GPU_LaunchOp : GPU_Op<"launch", [
     to the amount of dynamic shared memory a kernel's workgroup should be
     allocated; when this operand is not present, a zero size is assumed.
 
-    The body region has at least _twelve_ arguments, grouped as follows:
+    The body region has at least _twelve_ arguments, or _eighteen_ if cluster 
+    dimensions are present, grouped as follows:
 
+    -   three optional arguments that contain cluster identifiers along x,y,z
+        dimensions;
     -   three arguments that contain block identifiers along x,y,z dimensions;
     -   three arguments that contain thread identifiers along x,y,z dimensions;
     -   operands of the `gpu.launch` operation as is (i.e. the operands for
@@ -713,6 +719,7 @@ def GPU_LaunchOp : GPU_Op<"launch", [
 
     ```
     operation ::= `gpu.launch` (`async` (`[` ssa-id-list `]`)? )?
+                             ( `clusters` `(` ssa-id-list `)` `in` ssa-reassignment )?
                              `blocks` `(` ssa-id-list `)` `in` ssa-reassignment
                              `threads` `(` ssa-id-list `)` `in` ssa-reassignment
                              (dynamic_shared_memory_size ssa-use)?
@@ -763,6 +770,16 @@ def GPU_LaunchOp : GPU_Op<"launch", [
       // Assuming %val1 is defined outside the gpu.launch region.
       %42 = load %workgroup[%bx] : memref<32xf32, 3>
     }
+
+    // Launch with clusters.
+    gpu.launch clusters(%cx, %cy, %cz) in (%sz_cx = %0, %sz_cy = %1, %sz_cz = %2)
+               blocks(%bx, %by, %bz) in (%sz_bx = %3, %sz_by = %4, %sz_bz = %5)
+               threads(%tx, %ty, %tz) in (%sz_tx = %6, %sz_ty = %7, %sz_tz = %8)
+    {
+      // Cluster, block and thread identifiers, as well as cluster/block/grid 
+      // sizes are immediately usable inside body region.
+      "some_op"(%cx, %bx, %tx) : (index, index, index) -> ()
+    }
     ```
 
     Rationale: using operation/block arguments gives analyses a clear way of
@@ -784,7 +801,10 @@ def GPU_LaunchOp : GPU_Op<"launch", [
       CArg<"Type", "nullptr">:$asyncTokenType,
       CArg<"ValueRange", "{}">:$asyncDependencies,
       CArg<"TypeRange", "{}">:$workgroupAttributions,
-      CArg<"TypeRange", "{}">:$privateAttributions)>
+      CArg<"TypeRange", "{}">:$privateAttributions,
+      CArg<"Value", "nullptr">:$clusterSizeX,
+      CArg<"Value", "nullptr">:$clusterSizeY,
+      CArg<"Value", "nullptr">:$clusterSizeZ)>
   ];
 
   let extraClassDeclaration = [{
@@ -792,17 +812,24 @@ def GPU_LaunchOp : GPU_Op<"launch", [
     KernelDim3 getBlockIds();
     /// Get the SSA values corresponding to kernel thread identifiers.
     KernelDim3 getThreadIds();
+    /// Get the SSA values corresponding to kernel cluster identifiers.
+    std::optional<KernelDim3> getClusterIds();
     /// Get the SSA values corresponding to kernel grid size.
     KernelDim3 getGridSize();
     /// Get the SSA values corresponding to kernel block size.
     KernelDim3 getBlockSize();
+    /// Get the SSA values corresponding to kernel cluster size.
+    std::optional<KernelDim3> getClusterSize();
 
     /// Get the SSA values passed as operands to specify the grid size.
     KernelDim3 getGridSizeOperandValues();
     /// Get the SSA values passed as operands to specify the block size.
     KernelDim3 getBlockSizeOperandValues();
+    /// Get the SSA values passed as operands to specify the cluster size.
+    std::optional<KernelDim3> getClusterSizeOperandValues();
 
     static StringRef getBlocksKeyword() { return "blocks"; }
+    static StringRef getClustersKeyword() { return "clusters"; }
     static StringRef getThreadsKeyword() { return "threads"; }
     static StringRef getDynamicSharedMemorySizeKeyword() {
       return "dynamic_shared_memory_size";
@@ -816,6 +843,21 @@ def GPU_LaunchOp : GPU_Op<"launch", [
     /// placed in the leading positions of the argument list.
     static constexpr unsigned kNumConfigRegionAttributes = 12;
 
+    /// Returns true if cluster size is specified.
+    bool hasClusterSize() {
+      if (getClusterSizeX() && getClusterSizeY() && getClusterSizeZ())
+        return true;
+      return false;
+    }
+    /// Returns the number of operands including cluster size
+    unsigned getNumConfigOperands() {
+      return kNumConfigOperands + (hasClusterSize() ? 3 : 0);
+    }
+    /// Returns the number of region attributes including cluster size 
+    unsigned getNumConfigRegionAttributes() {
+      return kNumConfigRegionAttributes + (hasClusterSize() ? 6 : 0);
+    }
+
     /// Returns the keywords used in the custom syntax for this Op.
     static StringRef getWorkgroupKeyword() { return "workgroup"; }
     static StringRef getPrivateKeyword() { return "private"; }
@@ -831,7 +873,7 @@ def GPU_LaunchOp : GPU_Op<"launch", [
     /// the workgroup memory
     ArrayRef<BlockArgument> getWorkgroupAttributions() {
       auto begin =
-          std::next(getBody().args_begin(), kNumConfigRegionAttributes);
+          std::next(getBody().args_begin(), getNumConfigRegionAttributes());
       auto end = std::next(begin, getNumWorkgroupAttributions());
       return {begin, end};
     }
@@ -842,7 +884,7 @@ def GPU_LaunchOp : GPU_Op<"launch", [
 
     /// Returns the number of buffers located in the private memory.
     unsigned getNumPrivateAttributions() {
-      return getBody().getNumArguments() - kNumConfigRegionAttributes -
+      return getBody().getNumArguments() - getNumConfigRegionAttributes() -
           getNumWorkgroupAttributions();
     }
 
@@ -853,7 +895,7 @@ def GPU_LaunchOp : GPU_Op<"launch", [
       // memory.
       auto begin =
           std::next(getBody().args_begin(),
-                    kNumConfigRegionAttributes + getNumWorkgroupAttributions());
+                    getNumConfigRegionAttributes() + getNumWorkgroupAttributions());
       return {begin, getBody().args_end()};
     }
 
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
index dd482f305fcbc8..459b93cf033a0e 100644
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -646,7 +646,8 @@ void LaunchOp::build(OpBuilder &builder, OperationState &result,
                      Value getBlockSizeZ, Value dynamicSharedMemorySize,
                      Type asyncTokenType, ValueRange asyncDependencies,
                      TypeRange workgroupAttributions,
-                     TypeRange privateAttributions) {
+                     TypeRange privateAttributions, Value clusterSizeX,
+                     Value clusterSizeY, Value clusterSizeZ) {
   // Add a WorkGroup attribution attribute. This attribute is required to
   // identify private attributions in the list of block argguments.
   result.addAttribute(getNumWorkgroupAttributionsAttrName(),
@@ -660,6 +661,8 @@ void LaunchOp::build(OpBuilder &builder, OperationState &result,
   // Add grid and block sizes as op operands, followed by the data operands.
   result.addOperands({gridSizeX, gridSizeY, gridSizeZ, getBlockSizeX,
                       getBlockSizeY, getBlockSizeZ});
+  if (clusterSizeX && clusterSizeY && clusterSizeZ)
+    result.addOperands({clusterSizeX, clusterSizeY, clusterSizeZ});
   if (dynamicSharedMemorySize)
     result.addOperands(dynamicSharedMemorySize);
 
@@ -678,9 +681,14 @@ void LaunchOp::build(OpBuilder &builder, OperationState &result,
     body->addArgument(argTy, result.location);
   kernelRegion->push_back(body);
   // Fill OperandSegmentSize Attribute.
-  SmallVector<int32_t, 8> segmentSizes(8, 1);
+  SmallVector<int32_t, 11> segmentSizes(11, 1);
   segmentSizes.front() = asyncDependencies.size();
   segmentSizes.back() = dynamicSharedMemorySize ? 1 : 0;
+  if (!clusterSizeX) {
+    segmentSizes[7] = 0;
+    segmentSizes[8] = 0;
+    segmentSizes[9] = 0;
+  }
   result.addAttribute(getOperandSegmentSizeAttr(),
                       builder.getDenseI32ArrayAttr(segmentSizes));
 }
@@ -709,6 +717,22 @@ KernelDim3 LaunchOp::getBlockSize() {
   return KernelDim3{args[9], args[10], args[11]};
 }
 
+std::optional<KernelDim3> LaunchOp::getClusterIds() {
+  assert(!getBody().empty() && "LaunchOp body must not be empty.");
+  if (!hasClusterSize())
+    return std::nullopt;
+  auto args = getBody().getArguments();
+  return KernelDim3{args[12], args[13], args[14]};
+}
+
+std::optional<KernelDim3> LaunchOp::getClusterSize() {
+  assert(!getBody().empty() && "LaunchOp body must not be empty.");
+  if (!hasClusterSize())
+    return std::nullopt;
+  auto args = getBody().getArguments();
+  return KernelDim3{args[15], args[16], args[17]};
+}
+
 KernelDim3 LaunchOp::getGridSizeOperandValues() {
   auto operands = getOperands().drop_front(getAsyncDependencies().size());
   return KernelDim3{operands[0], operands[1], operands[2]};
@@ -719,6 +743,13 @@ KernelDim3 LaunchOp::getBlockSizeOperandValues() {
   return KernelDim3{operands[3], operands[4], operands[5]};
 }
 
+std::optional<KernelDim3> LaunchOp::getClusterSizeOperandValues() {
+  auto operands = getOperands().drop_front(getAsyncDependencies().size());
+  if (!hasClusterSize())
+    return std::nullopt;
+  return KernelDim3{operands[6], operands[7], operands[8]};
+}
+
 LogicalResult LaunchOp::verifyRegions() {
   // Kernel launch takes kNumConfigOperands leading operands for grid/block
   // sizes and transforms them into kNumConfigRegionAttributes region arguments
@@ -778,6 +809,12 @@ void LaunchOp::print(OpAsmPrinter &p) {
       p << " [" << getAsyncDependencies() << ']';
   }
   // Print the launch configuration.
+  if (getClusterSizeX()) {
+    p << ' ' << getClustersKeyword();
+    printSizeAssignment(p, getClusterSize().value(),
+                        getClusterSizeOperandValues().value(),
+                        getClusterIds().value());
+  }
   p << ' ' << getBlocksKeyword();
   printSizeAssignment(p, getGridSize(), getGridSizeOperandValues(),
                       getBlockIds());
@@ -831,6 +868,7 @@ parseSizeAssignment(OpAsmParser &parser,
 
 /// Parses a Launch operation.
 /// operation ::= `gpu.launch` (`async` `[` ssa-id-list `]`)?
+///       `clusters` `(` ssa-id-list `)` `in` ssa-reassignment (Optional)
 ///       `blocks` `(` ssa-id-list `)` `in` ssa-reassignment
 ///       `threads` `(` ssa-id-list `)` `in` ssa-reassignment
 ///       memory-attribution
@@ -840,7 +878,6 @@ ParseResult LaunchOp::parse(OpAsmParser &parser, OperationState &result) {
   // Sizes of the grid and block.
   SmallVector<OpAsmParser::UnresolvedOperand, LaunchOp::kNumConfigOperands>
       sizes(LaunchOp::kNumConfigOperands);
-  MutableArrayRef<OpAsmParser::UnresolvedOperand> sizesRef(sizes);
 
   // Actual (data) operands passed to the kernel.
   SmallVector<OpAsmParser::UnresolvedOperand, 4> dataOperands;
@@ -848,7 +885,6 @@ ParseResult LaunchOp::parse(OpAsmParser &parser, OperationState &result) {
   // Region arguments to be created.
   SmallVector<OpAsmParser::UnresolvedOperand, 16> regionArgs(
       LaunchOp::kNumConfigRegionAttributes);
-  MutableArrayRef<OpAsmParser::UnresolvedOperand> regionArgsRef(regionArgs);
 
   // Parse optional async dependencies.
   SmallVector<OpAsmParser::UnresolvedOperand, 4> asyncDependencies;
@@ -861,6 +897,24 @@ ParseResult LaunchOp::parse(OpAsmParser &parser, OperationState &result) {
   if (parser.getNumResults() > 0)
     result.types.push_back(asyncTokenType);
 
+  bool hasCluster = false;
+  if (succeeded(
+          parser.parseOptionalKeyword(LaunchOp::getClustersKeyword().data()))) {
+    hasCluster = true;
+    sizes.resize(9);
+    regionArgs.resize(18);
+  }
+  MutableArrayRef<OpAsmParser::UnresolvedOperand> sizesRef(sizes);
+  MutableArrayRef<OpAsmParser::UnresolvedOperand> regionArgsRef(regionArgs);
+
+  // Last three segment assigns the cluster size. In the region argument
+  // list, this is last 6 arguments.
+  if (hasCluster) {
+    if (parseSizeAssignment(parser, sizesRef.drop_front(6),
+                            regionArgsRef.slice(15, 3),
+                            regionArgsRef.slice(12, 3)))
+      return failure();
+  }
   // Parse the size assignment segments: the first segment assigns grid sizes
   // and defines values for block identifiers; the second segment assigns block
   // sizes and defines values for thread identifiers.  In the region argument
@@ -898,7 +952,7 @@ ParseResult LaunchOp::parse(OpAsmParser &parser, OperationState &result) {
   // LaunchOp::getNumWorkgroupAttributionsAttrName().
   Type index = parser.getBuilder().getIndexType();
   SmallVector<Type, LaunchOp::kNumConfigRegionAttributes> dataTypes(
-      LaunchOp::kNumConfigRegionAttributes, index);
+      LaunchOp::kNumConfigRegionAttributes + 6, index);
 
   SmallVector<OpAsmParser::Argument> regionArguments;
   for (auto ssaValueAndType : llvm::zip(regionArgs, dataTypes)) {
@@ -916,8 +970,9 @@ ParseResult LaunchOp::parse(OpAsmParser &parser, OperationState &result) {
 
   // Store the number of operands we just parsed as the number of workgroup
   // memory attributions.
-  unsigned numWorkgroupAttrs =
-      regionArguments.size() - LaunchOp::kNumConfigRegionAttributes;
+  unsigned numWorkgroupAttrs = regionArguments.size() -
+                               LaunchOp::kNumConfigRegionAttributes -
+                               (hasCluster ? 6 : 0);
   result.addAttribute(LaunchOp::getNumWorkgroupAttributionsAttrName(),
                       builder.getI64IntegerAttr(numWorkgroupAttrs));
 
@@ -934,8 +989,14 @@ ParseResult LaunchOp::parse(OpAsmParser &parser, OperationState &result) {
       parser.parseOptionalAttrDict(result.attributes))
     return failure();
 
-  SmallVector<int32_t, 8> segmentSizes(8, 1);
+  SmallVector<int32_t, 11> segmentSizes(11, 1);
   segmentSizes.front() = asyncDependencies.size();
+
+  if (!hasCluster) {
+    segmentSizes[7] = 0;
+    segmentSizes[8] = 0;
+    segmentSizes[9] = 0;
+  }
   segmentSizes.back() = hasDynamicSharedMemorySize ? 1 : 0;
   result.addAttribute(LaunchOp::getOperandSegmentSizeAttr(),
                       parser.getBuilder().getDenseI32ArrayAttr(segmentSizes));
@@ -992,7 +1053,7 @@ BlockArgument LaunchOp::addWorkgroupAttribution(Type type, Location loc) {
   (*this)->setAttr(attrName,
                    IntegerAttr::get(attr.getType(), attr.getValue() + 1));
   return getBody().insertArgument(
-      LaunchOp::kNumConfigRegionAttributes + attr.getInt(), type, loc);
+      LaunchOp::getNumConfigRegionAttributes() + attr.getInt(), type, loc);
 }
 
 /// Adds a new block argument that corresponds to buffers located in
diff --git a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
index 7432a58f18b442..2436113dc4239c 100644
--- a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
@@ -49,15 +49,21 @@ static void createForAllDimensions(OpBuilder &builder, Location loc,
 /// entry block of `launchOpBody`, to the corresponding result value of the
 /// added operations.
 static void injectGpuIndexOperations(Location loc, Region &launchFuncOpBody,
-                                     Region &launchOpBody, IRMapping &map) {
+                                     Region &launchOpBody, IRMapping &map,
+                                     bool hasCluster = false) {
   OpBuilder builder(loc->getContext());
   Block &firstBlock = launchOpBody.front();
   builder.setInsertionPointToStart(&launchFuncOpBody.front());
-  SmallVector<Value, 12> indexOps;
+  SmallVector<Value> indexOps;
+  // The order is important here, as it must match the order of the arguments
   createForAllDimensions<gpu::BlockIdOp>(builder, loc, indexOps);
   createForAllDimensions<gpu::ThreadIdOp>(builder, loc, indexOps);
   createForAllDimensions<gpu::GridDimOp>(builder, loc, indexOps);
   createForAllDimensions<gpu::BlockDimOp>(builder, loc, indexOps);
+  if (hasCluster) {
+    createForAllDimensions<gpu::ClusterIdOp>(builder, loc, indexOps);
+    createForAllDimensions<gpu::ClusterDimOp>(builder, loc, indexOps);
+  }
   // Replace the leading 12 function args with the respective thread/block index
   // operations. Iterate backwards since args are erased and indices change.
   for (const auto &indexOp : enumerate(indexOps))
@@ -212,9 +218,11 @@ static gpu::GPUFuncOp outlineKernelFuncImpl(gpu::LaunchOp launchOp,
   IRMapping map;
 
   // Map the arguments corresponding to the launch parameters like blockIdx,
-  // threadIdx, etc.
+  // threadIdx, etc. If cluster is present, then we also generate clusterIdx and
+  // clusterDim.
   Region &outlinedFuncBody = outlinedFunc.getBody();
-  injectGpuIndexOperations(loc, outlinedFuncBody, launchOpBody, map);
+  injectGpuIndexOperations(loc, outlinedFuncBody, launchOpBody, map,
+                           launchOp.hasClusterSize());
 
   // Map memory attributions from the LaunOp op to the GPUFuncOp attributions.
   for (const auto &[launchArg, funcArg] :
@@ -278,12 +286,14 @@ static void convertToLaunchFuncOp(gpu::LaunchOp launchOp,
   // The launch op has an optional dynamic shared memory size. If it doesn't
   // exist, we use zero.
   Value asyncToken = launchOp.getAsyncToken();
+  std::optional<gpu::KernelDim3> clusterSize =
+      launchOp.getClusterSizeOperandValues();
   auto launchFunc = builder.create<gpu::LaunchFuncOp>(
       launchOp.getLoc(), kernelFunc, launchOp.getGridSizeOperandValues(),
       launchOp.getBlockSizeOperandValues(),
       launchOp.getDynamicSharedMemorySize(), operands,
       asyncToken ? asyncToken.getType() : nullptr,
-      launchOp.getAsyncDependencies());
+      launchOp.getAsyncDependencies(), clusterSize);
   launchOp.replaceAllUsesWith(launchFunc);
   launchOp.erase();
 }
diff --git a/mlir/test/Conversion/SCFToGPU/no_blocks_no_threads.mlir b/mlir/test/Conversion/SCFToGPU/no_blocks_no_threads.mlir
index a058365a104a1f..79eef8ae7eb856 100644
--- a/mlir/test/Conversion/SCFToGPU/no_blocks_no_threads.mlir
+++ b/mlir/test/Conversion/SCFToGPU/no_blocks_no_threads.mlir
@@ -17,8 +17,8 @@ func.func @one_d_loop(%A : memref<?xf32>, %B : memref<?xf32>) {
   // CHECK-BLOCKS-NEXT: %{{.*}} = arith.constant 1 : index
   // CHECK-BLOCKS-NEXT: %[[ONE:.*]] = arith.constant 1 : index
 
-  // CHECK-THREADS-NEXT: gpu.launch blocks(%[[B0:.*]], %[[B1:.*]], %[[B2:.*]]) in (%{{.*}} = %[[ONE]], %{{.*}} = %[[ONE]], %{{.*}}0 = %[[ONE]]) threads(%[[T0:.*]], %[[T1:.*]], %[[T2:.*]]) in (%{{.*}} = %[[BOUND]], %{{.*}} = %[[ONE]], %{{.*}} = %[[ONE]])
-  // CHECK-BLOCKS-NEXT: gpu.launch blocks(%[[B0:.*]], %[[B1:.*]], %[[B2:.*]]) in (%{{.*}} = %[[BOUND]], %{{.*}} = %[[ONE]], %{{.*}}0 = %[[ONE]]) threads(%[[T0:.*]], %[[T1:.*]], %[[T2:.*]]) in (%{{.*}} = %[[ONE]], %{{.*}} = %[[ONE]], %{{.*}} = %[[ONE]])
+  // CHECK-THREADS-NEXT: gpu.launch blocks(%[[B0:.*]], %[[B1:.*]], %[[B2:.*]]) in (%{{.*}} = %[[ONE]], %{{.*}} = %[[ONE]], %{{.*}} = %[[ONE]]) threads(%[[T0:.*]], %[[T1:.*]], %[[T2:.*]]) in (%{{.*}} = %[[BOUND]], %{{.*}} = %[[ONE]], %{{.*}} = %[[ONE]])
+  // CHECK-BLOCKS-NEXT: gpu.launch blocks(%[[B0:.*]], %[[B1:.*]], %[[B2:.*]]) in (%{{.*}} = %[[BOUND]], %{{.*}} = %[[ONE]], %{{.*}} = %[[ONE]]) threads(%[[T0:.*]], %[[T1:.*]], %[[T2:.*]]) in (%{{.*}} = %[[ONE]], %{{.*}} = %[[ONE]], %{{.*}} = %[[ONE]])
   affine.for %i = 0 to 42 {
   // CHECK-THREADS-NEXT: %[[INDEX:.*]] = arith.addi %{{.*}}, %[[T0]]
   // CHECK-THREADS-NEXT: memref.load %{{.*}}[%[[INDEX]]]
diff --git a/mlir/test/Dialect/GPU/invalid.mlir b/mlir/test/Dialect/GPU/invalid.mlir
index 8a34d64326072b..4d3a898fdd1565 100644
--- a/mlir/test/Dialect/GPU/invalid.mlir
+++ b/mlir/test/Dialect/GPU/invalid.mlir
@@ -16,7 +16,7 @@ func.func @no_region_attrs(%sz : index) {
   ^bb1(%bx: index, %by: index, %bz: index,
        %tx: index, %ty: index, %tz: index):
     gpu.terminator
-  }) {operandSegmentSizes = array<i32: 0, 1, 1, 1, 1, 1, 1, 0>} : (index, index, index, index, index, index) -> ()
+  }) {operandSegmentSizes = array<i32: 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0>} : (index, index, index, index, index, index) -> ()
   return
 }
 
diff --git a/mlir/test/Dialect/GPU/outlining.mlir b/mlir/test/Dialect/GPU/outlining.mlir
index 8020f6dfa65b74..601add9a9f91c0 100644
--- a/mlir/test/Dialect/GPU/outlining.mlir
+++ b/mlir/test/Dialect/GPU/outlining.mlir
@@ -407,3 +407,77 @@ func.func @launch_memory_attributions_1(%arg0 : memref<*xf32>) {
 }
 
 // CHECK-DL-LABEL: gpu.module @launch_memory_attributions_1_kernel attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<index, 32 : i32>>}
+
+// -----
+// CHECK: module attributes {gpu.container_module}
+
+// CHECK-LABEL: func @launch_cluster()
+func.func @launch_cluster() {
+  // CHECK: %[[ARG0:.*]] = "op"() : () -> f32
+  %0 = "op"() : () -> (f32)
+  // CHECK: %[[ARG1:.*]] = "op"() : () -> memref<?xf32, 1>
+  %1 = "op"() : () -> (memref<?xf32, 1>)
+  // CHECK: %[[CDIMX:.*]] = arith.constant 1
+  %cDimX = arith.constant 1 : index
+  // CHECK: %[[CDIMY:.*]] = arith.constant 2
+  %cDimY = arith.constant 2 : index
+  // CHECK: %[[CDIMZ:.*]] = arith.constant 1
+  %cDimZ = arith.constant 1 : index
+  // CHECK: %[[GDIMX:.*]] = arith.constant 8
+  %gDimX = arith.constant 8 : index
+  // CHECK: %[[GDIMY:.*]] = arith.constant 12
+  %gDimY = arith.constant 12 : index
+  // CHECK: %[[GDIMZ:.*]] = arith.constant 16
+  %gDimZ = arith.constant 16 : index
+  // CHECK: %[[BDIMX:.*]] = arith.constant 20
+  %bDimX = arith.constant 20 : index
+  // CHECK: %[[BDIMY:.*]] = arith.constant 24
+  %bDimY = arith.constant 24 : index
+  // CHECK: %[[BDIMZ:.*]] = arith.constant 28
+  %bDimZ = arith.constant 28 : index
+
+  // CHECK: gpu.launch_func @launch_cluster_kernel::@launch_cluster_kernel clusters in (%[[CDIMX]], %[[CDIMY]], %[[CDIMZ]]) blocks in (%[[GDIMX]], %[[GDIMY]], %[[GDIMZ]]) threads in (%[[BDIMX]], %[[BDIMY]], %[[BDIMZ]]) args(%[[ARG0]] : f32, %[[ARG1]] : memref<?xf32, 1>)
+  // CHECK-NOT: gpu.launch blocks
+  gpu.launch clusters(%cx, %cy, %cz) in (%cluster_x = %cDimX, %cluster_y = %cDimY,
+                                       %cluster_z = %cDimZ)
+             blocks(%bx, %by, %bz) in (%grid_x = %gDimX, %grid_y = %gDimY,
+                                       %grid_z = %gDimZ)
+             threads(%tx, %ty, %tz) in (%block_x = %bDimX, %block_y = %bDimY,
+                                        %block_z = %bDimZ) {
+    "use"(%0): (f32) -> ()
+    "some_op"(%cx, %bx, %block_x) : (index, index, index) -> ()
+    %42 = memref.load %1[%tx] : memref<?xf32, 1>
+    gpu.terminator
+  }
+  return
+}
+
+// CHECK-LABEL: gpu.module @launch_cluster_kernel
+// CHECK-NEXT: gpu.func @launch_cluster_kernel
+// CHECK-SAME: (%[[KERNEL_ARG0:.*]]: f32, %[[KERNEL_ARG1:.*]]: memref<?xf32, 1>)
+// CHECK-SAME: gpu.known_block_size = array<i32: 20, 24, 28>
+// CHECK-SAME: gpu.known_grid_size = array<i32: 8, 12, 16>
+// CHECK-NEXT: %[[BID:.*]] = gpu.block_id x
+// CHECK-NEXT: = gpu.block_id y
+// CHECK-NEXT: = gpu.block_id z
+// CHECK-NEXT: %[[TID:.*]] = gpu.thread_id x
+// CHECK-NEXT: = gpu.thread_id y
+// CHECK-NEXT: = gpu.thread_id z
+// CHECK-NEXT: = gpu.grid_dim x
+// CHECK-NEXT: = gpu.grid_dim y
+// CHECK-NEXT: = gpu.grid_dim z
+// CHECK-NEXT: %[[BDIM:.*]] = gpu.block_dim x
+// CHECK-NEXT: = gpu.block_dim y
+// CHECK-NEXT: = gpu.block_dim z
+// CHECK-NEXT: %[[CID:.*]] = gpu.cluster_id x
+// CHECK-NEXT: = gpu.cluster_id y
+// CHECK-NEXT: = gpu.cluster_id z
+// CHECK-NEXT: %[[CDIM:.*]] = gpu.cluster_dim x
+// CHECK-NEXT: = gpu.cluster_dim y
+// CHECK-NEXT: = gpu.cluster_dim z
+// CHECK-NEXT: cf.br ^[[BLOCK:.*]]
+// CHECK-NEXT: ^[[BLOCK]]:
+// CHECK-NEXT: "use"(%[[KERNEL_ARG0]]) : (f32) -> ()
+// CHECK-NEXT: "some_op"(%[[CID]], %[[BID]], %[[BDIM]]) : (index, index, index) -> ()
+// CHECK-NEXT: = memref.load %[[KERNEL_ARG1]][%[[TID]]] : memref<?xf32, 1>
+

>From 500315765604672febe2b52994724bf844acb5cd Mon Sep 17 00:00:00 2001
From: Guray Ozen <guray.ozen at gmail.com>
Date: Thu, 4 Jan 2024 10:03:42 +0100
Subject: [PATCH 2/3] remove whitespace

---
 mlir/include/mlir/Dialect/GPU/IR/GPUOps.td | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
index a816d663596169..712ceefdda6fa5 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -676,7 +676,7 @@ def GPU_LaunchOp : GPU_Op<"launch", [
       DeclareOpInterfaceMethods<InferIntRangeInterface>,
       RecursiveMemoryEffects]>,
     Arguments<(ins Variadic<GPU_AsyncToken>:$asyncDependencies,
-               Index:$gridSizeX, Index:$gridSizeY, Index:$gridSizeZ,               
+               Index:$gridSizeX, Index:$gridSizeY, Index:$gridSizeZ,
                Index:$blockSizeX, Index:$blockSizeY, Index:$blockSizeZ,
                Optional<Index>:$clusterSizeX, 
                Optional<Index>:$clusterSizeY, 

>From dd95d78c4c432499e3128deffae9d2bb6395298c Mon Sep 17 00:00:00 2001
From: Guray Ozen <guray.ozen at gmail.com>
Date: Thu, 4 Jan 2024 15:33:38 +0100
Subject: [PATCH 3/3] add verifier and address @apaszke comments

---
 mlir/include/mlir/Dialect/GPU/IR/GPUOps.td | 1 +
 mlir/lib/Dialect/GPU/IR/GPUDialect.cpp     | 9 ++++++++-
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
index 712ceefdda6fa5..8d4a110ee801f0 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -913,6 +913,7 @@ def GPU_LaunchOp : GPU_Op<"launch", [
   let hasCanonicalizer = 1;
   let hasCustomAssemblyFormat = 1;
   let hasRegionVerifier = 1;
+  let hasVerifier = 1;
 }
 
 def GPU_PrintfOp : GPU_Op<"printf", [MemoryEffects<[MemWrite]>]>,
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
index 459b93cf033a0e..81326e67f3caad 100644
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -750,6 +750,13 @@ std::optional<KernelDim3> LaunchOp::getClusterSizeOperandValues() {
   return KernelDim3{operands[6], operands[7], operands[8]};
 }
 
+LogicalResult LaunchOp::verify() {
+  if (!(hasClusterSize()) &&
+      (getClusterSizeX() || getClusterSizeY() || getClusterSizeZ()))
+    return emitOpError() << "cluster size must be all present";
+  return success();
+}
+
 LogicalResult LaunchOp::verifyRegions() {
   // Kernel launch takes kNumConfigOperands leading operands for grid/block
   // sizes and transforms them into kNumConfigRegionAttributes region arguments
@@ -809,7 +816,7 @@ void LaunchOp::print(OpAsmPrinter &p) {
       p << " [" << getAsyncDependencies() << ']';
   }
   // Print the launch configuration.
-  if (getClusterSizeX()) {
+  if (hasClusterSize()) {
     p << ' ' << getClustersKeyword();
     printSizeAssignment(p, getClusterSize().value(),
                         getClusterSizeOperandValues().value(),