[Mlir-commits] [mlir] [mlir][gpu] Add Support for Cluster of Thread Blocks in `gpu.launch` (PR #76924)
llvmlistbot at llvm.org
Thu Jan 4 01:04:34 PST 2024
llvmbot wrote:
@llvm/pr-subscribers-mlir
Author: Guray Ozen (grypp)
This PR extends `gpu.launch` with support for clusters of thread blocks (CGAs). A cluster size can now be given as an optional set of operands; when it is present, the kernel outliner rewrites the `gpu.launch` into a `gpu.launch_func` that carries the cluster size along.

PR #72871 previously introduced the required cluster support in the MLIR compiler and its CUDA runtime; this PR builds on that work.
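The new syntax, in brief (a minimal sketch adapted from the examples added in this patch; the constants and SSA value names are illustrative):

```mlir
func.func @clustered_launch() {
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %c32 = arith.constant 32 : index
  // The `clusters(...)` clause is optional. When present, the body region
  // receives six extra leading arguments: cluster ids and cluster sizes.
  gpu.launch clusters(%cx, %cy, %cz) in (%sz_cx = %c2, %sz_cy = %c1, %sz_cz = %c1)
             blocks(%bx, %by, %bz) in (%sz_bx = %c2, %sz_by = %c1, %sz_bz = %c1)
             threads(%tx, %ty, %tz) in (%sz_tx = %c32, %sz_ty = %c1, %sz_tz = %c1) {
    "some_op"(%cx, %bx, %tx) : (index, index, index) -> ()
    gpu.terminator
  }
  return
}
```

Running `-gpu-kernel-outlining` on this produces a `gpu.launch_func` carrying the same cluster dimensions, using the `gpu.launch_func` cluster support from PR #72871.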
---
Patch is 24.14 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/76924.diff
6 Files Affected:
- (modified) mlir/include/mlir/Dialect/GPU/IR/GPUOps.td (+48-6)
- (modified) mlir/lib/Dialect/GPU/IR/GPUDialect.cpp (+70-9)
- (modified) mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp (+15-5)
- (modified) mlir/test/Conversion/SCFToGPU/no_blocks_no_threads.mlir (+2-2)
- (modified) mlir/test/Dialect/GPU/invalid.mlir (+1-1)
- (modified) mlir/test/Dialect/GPU/outlining.mlir (+74)
``````````diff
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
index efef61b5c6e712..a816d663596169 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -676,8 +676,11 @@ def GPU_LaunchOp : GPU_Op<"launch", [
DeclareOpInterfaceMethods<InferIntRangeInterface>,
RecursiveMemoryEffects]>,
Arguments<(ins Variadic<GPU_AsyncToken>:$asyncDependencies,
- Index:$gridSizeX, Index:$gridSizeY, Index:$gridSizeZ,
+ Index:$gridSizeX, Index:$gridSizeY, Index:$gridSizeZ,
Index:$blockSizeX, Index:$blockSizeY, Index:$blockSizeZ,
+ Optional<Index>:$clusterSizeX,
+ Optional<Index>:$clusterSizeY,
+ Optional<Index>:$clusterSizeZ,
Optional<I32>:$dynamicSharedMemorySize)>,
Results<(outs Optional<GPU_AsyncToken>:$asyncToken)> {
let summary = "GPU kernel launch operation";
@@ -700,8 +703,11 @@ def GPU_LaunchOp : GPU_Op<"launch", [
to the amount of dynamic shared memory a kernel's workgroup should be
allocated; when this operand is not present, a zero size is assumed.
- The body region has at least _twelve_ arguments, grouped as follows:
+ The body region has at least _twelve_ arguments, or _eighteen_ if cluster
+ dimensions are present, grouped as follows:
+ - three optional arguments that contain cluster identifiers along x,y,z
+ dimensions;
- three arguments that contain block identifiers along x,y,z dimensions;
- three arguments that contain thread identifiers along x,y,z dimensions;
- operands of the `gpu.launch` operation as is (i.e. the operands for
@@ -713,6 +719,7 @@ def GPU_LaunchOp : GPU_Op<"launch", [
```
operation ::= `gpu.launch` (`async` (`[` ssa-id-list `]`)? )?
+ ( `clusters` `(` ssa-id-list `)` `in` ssa-reassignment )?
`blocks` `(` ssa-id-list `)` `in` ssa-reassignment
`threads` `(` ssa-id-list `)` `in` ssa-reassignment
(dynamic_shared_memory_size ssa-use)?
@@ -763,6 +770,16 @@ def GPU_LaunchOp : GPU_Op<"launch", [
// Assuming %val1 is defined outside the gpu.launch region.
%42 = load %workgroup[%bx] : memref<32xf32, 3>
}
+
+ // Launch with clusters.
+ gpu.launch clusters(%cx, %cy, %cz) in (%sz_cx = %0, %sz_cy = %1, %sz_cz = %2)
+ blocks(%bx, %by, %bz) in (%sz_bx = %3, %sz_by = %4, %sz_bz = %5)
+ threads(%tx, %ty, %tz) in (%sz_tx = %6, %sz_ty = %7, %sz_tz = %8)
+ {
+ // Cluster, block and thread identifiers, as well as cluster/block/grid
+ // sizes are immediately usable inside body region.
+ "some_op"(%cx, %bx, %tx) : (index, index, index) -> ()
+ }
```
Rationale: using operation/block arguments gives analyses a clear way of
@@ -784,7 +801,10 @@ def GPU_LaunchOp : GPU_Op<"launch", [
CArg<"Type", "nullptr">:$asyncTokenType,
CArg<"ValueRange", "{}">:$asyncDependencies,
CArg<"TypeRange", "{}">:$workgroupAttributions,
- CArg<"TypeRange", "{}">:$privateAttributions)>
+ CArg<"TypeRange", "{}">:$privateAttributions,
+ CArg<"Value", "nullptr">:$clusterSizeX,
+ CArg<"Value", "nullptr">:$clusterSizeY,
+ CArg<"Value", "nullptr">:$clusterSizeZ)>
];
let extraClassDeclaration = [{
@@ -792,17 +812,24 @@ def GPU_LaunchOp : GPU_Op<"launch", [
KernelDim3 getBlockIds();
/// Get the SSA values corresponding to kernel thread identifiers.
KernelDim3 getThreadIds();
+ /// Get the SSA values corresponding to kernel cluster identifiers.
+ std::optional<KernelDim3> getClusterIds();
/// Get the SSA values corresponding to kernel grid size.
KernelDim3 getGridSize();
/// Get the SSA values corresponding to kernel block size.
KernelDim3 getBlockSize();
+ /// Get the SSA values corresponding to kernel cluster size.
+ std::optional<KernelDim3> getClusterSize();
/// Get the SSA values passed as operands to specify the grid size.
KernelDim3 getGridSizeOperandValues();
/// Get the SSA values passed as operands to specify the block size.
KernelDim3 getBlockSizeOperandValues();
+ /// Get the SSA values passed as operands to specify the cluster size.
+ std::optional<KernelDim3> getClusterSizeOperandValues();
static StringRef getBlocksKeyword() { return "blocks"; }
+ static StringRef getClustersKeyword() { return "clusters"; }
static StringRef getThreadsKeyword() { return "threads"; }
static StringRef getDynamicSharedMemorySizeKeyword() {
return "dynamic_shared_memory_size";
@@ -816,6 +843,21 @@ def GPU_LaunchOp : GPU_Op<"launch", [
/// placed in the leading positions of the argument list.
static constexpr unsigned kNumConfigRegionAttributes = 12;
+ /// Returns true if cluster size is specified.
+ bool hasClusterSize() {
+ if (getClusterSizeX() && getClusterSizeY() && getClusterSizeZ())
+ return true;
+ return false;
+ }
+ /// Returns the number of operands, including cluster size operands if present.
+ unsigned getNumConfigOperands() {
+ return kNumConfigOperands + (hasClusterSize() ? 3 : 0);
+ }
+ /// Returns the number of region attributes, including cluster ids and sizes if present.
+ unsigned getNumConfigRegionAttributes() {
+ return kNumConfigRegionAttributes + (hasClusterSize() ? 6 : 0);
+ }
+
/// Returns the keywords used in the custom syntax for this Op.
static StringRef getWorkgroupKeyword() { return "workgroup"; }
static StringRef getPrivateKeyword() { return "private"; }
@@ -831,7 +873,7 @@ def GPU_LaunchOp : GPU_Op<"launch", [
/// the workgroup memory
ArrayRef<BlockArgument> getWorkgroupAttributions() {
auto begin =
- std::next(getBody().args_begin(), kNumConfigRegionAttributes);
+ std::next(getBody().args_begin(), getNumConfigRegionAttributes());
auto end = std::next(begin, getNumWorkgroupAttributions());
return {begin, end};
}
@@ -842,7 +884,7 @@ def GPU_LaunchOp : GPU_Op<"launch", [
/// Returns the number of buffers located in the private memory.
unsigned getNumPrivateAttributions() {
- return getBody().getNumArguments() - kNumConfigRegionAttributes -
+ return getBody().getNumArguments() - getNumConfigRegionAttributes() -
getNumWorkgroupAttributions();
}
@@ -853,7 +895,7 @@ def GPU_LaunchOp : GPU_Op<"launch", [
// memory.
auto begin =
std::next(getBody().args_begin(),
- kNumConfigRegionAttributes + getNumWorkgroupAttributions());
+ getNumConfigRegionAttributes() + getNumWorkgroupAttributions());
return {begin, getBody().args_end()};
}
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
index dd482f305fcbc8..459b93cf033a0e 100644
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -646,7 +646,8 @@ void LaunchOp::build(OpBuilder &builder, OperationState &result,
Value getBlockSizeZ, Value dynamicSharedMemorySize,
Type asyncTokenType, ValueRange asyncDependencies,
TypeRange workgroupAttributions,
- TypeRange privateAttributions) {
+ TypeRange privateAttributions, Value clusterSizeX,
+ Value clusterSizeY, Value clusterSizeZ) {
// Add a WorkGroup attribution attribute. This attribute is required to
// identify private attributions in the list of block argguments.
result.addAttribute(getNumWorkgroupAttributionsAttrName(),
@@ -660,6 +661,8 @@ void LaunchOp::build(OpBuilder &builder, OperationState &result,
// Add grid and block sizes as op operands, followed by the data operands.
result.addOperands({gridSizeX, gridSizeY, gridSizeZ, getBlockSizeX,
getBlockSizeY, getBlockSizeZ});
+ if (clusterSizeX && clusterSizeY && clusterSizeZ)
+ result.addOperands({clusterSizeX, clusterSizeY, clusterSizeZ});
if (dynamicSharedMemorySize)
result.addOperands(dynamicSharedMemorySize);
@@ -678,9 +681,14 @@ void LaunchOp::build(OpBuilder &builder, OperationState &result,
body->addArgument(argTy, result.location);
kernelRegion->push_back(body);
// Fill OperandSegmentSize Attribute.
- SmallVector<int32_t, 8> segmentSizes(8, 1);
+ SmallVector<int32_t, 11> segmentSizes(11, 1);
segmentSizes.front() = asyncDependencies.size();
segmentSizes.back() = dynamicSharedMemorySize ? 1 : 0;
+ if (!clusterSizeX) {
+ segmentSizes[7] = 0;
+ segmentSizes[8] = 0;
+ segmentSizes[9] = 0;
+ }
result.addAttribute(getOperandSegmentSizeAttr(),
builder.getDenseI32ArrayAttr(segmentSizes));
}
@@ -709,6 +717,22 @@ KernelDim3 LaunchOp::getBlockSize() {
return KernelDim3{args[9], args[10], args[11]};
}
+std::optional<KernelDim3> LaunchOp::getClusterIds() {
+ assert(!getBody().empty() && "LaunchOp body must not be empty.");
+ if (!hasClusterSize())
+ return std::nullopt;
+ auto args = getBody().getArguments();
+ return KernelDim3{args[12], args[13], args[14]};
+}
+
+std::optional<KernelDim3> LaunchOp::getClusterSize() {
+ assert(!getBody().empty() && "LaunchOp body must not be empty.");
+ if (!hasClusterSize())
+ return std::nullopt;
+ auto args = getBody().getArguments();
+ return KernelDim3{args[15], args[16], args[17]};
+}
+
KernelDim3 LaunchOp::getGridSizeOperandValues() {
auto operands = getOperands().drop_front(getAsyncDependencies().size());
return KernelDim3{operands[0], operands[1], operands[2]};
@@ -719,6 +743,13 @@ KernelDim3 LaunchOp::getBlockSizeOperandValues() {
return KernelDim3{operands[3], operands[4], operands[5]};
}
+std::optional<KernelDim3> LaunchOp::getClusterSizeOperandValues() {
+ auto operands = getOperands().drop_front(getAsyncDependencies().size());
+ if (!hasClusterSize())
+ return std::nullopt;
+ return KernelDim3{operands[6], operands[7], operands[8]};
+}
+
LogicalResult LaunchOp::verifyRegions() {
// Kernel launch takes kNumConfigOperands leading operands for grid/block
// sizes and transforms them into kNumConfigRegionAttributes region arguments
@@ -778,6 +809,12 @@ void LaunchOp::print(OpAsmPrinter &p) {
p << " [" << getAsyncDependencies() << ']';
}
// Print the launch configuration.
+ if (getClusterSizeX()) {
+ p << ' ' << getClustersKeyword();
+ printSizeAssignment(p, getClusterSize().value(),
+ getClusterSizeOperandValues().value(),
+ getClusterIds().value());
+ }
p << ' ' << getBlocksKeyword();
printSizeAssignment(p, getGridSize(), getGridSizeOperandValues(),
getBlockIds());
@@ -831,6 +868,7 @@ parseSizeAssignment(OpAsmParser &parser,
/// Parses a Launch operation.
/// operation ::= `gpu.launch` (`async` `[` ssa-id-list `]`)?
+/// (`clusters` `(` ssa-id-list `)` `in` ssa-reassignment)?
/// `blocks` `(` ssa-id-list `)` `in` ssa-reassignment
/// `threads` `(` ssa-id-list `)` `in` ssa-reassignment
/// memory-attribution
@@ -840,7 +878,6 @@ ParseResult LaunchOp::parse(OpAsmParser &parser, OperationState &result) {
// Sizes of the grid and block.
SmallVector<OpAsmParser::UnresolvedOperand, LaunchOp::kNumConfigOperands>
sizes(LaunchOp::kNumConfigOperands);
- MutableArrayRef<OpAsmParser::UnresolvedOperand> sizesRef(sizes);
// Actual (data) operands passed to the kernel.
SmallVector<OpAsmParser::UnresolvedOperand, 4> dataOperands;
@@ -848,7 +885,6 @@ ParseResult LaunchOp::parse(OpAsmParser &parser, OperationState &result) {
// Region arguments to be created.
SmallVector<OpAsmParser::UnresolvedOperand, 16> regionArgs(
LaunchOp::kNumConfigRegionAttributes);
- MutableArrayRef<OpAsmParser::UnresolvedOperand> regionArgsRef(regionArgs);
// Parse optional async dependencies.
SmallVector<OpAsmParser::UnresolvedOperand, 4> asyncDependencies;
@@ -861,6 +897,24 @@ ParseResult LaunchOp::parse(OpAsmParser &parser, OperationState &result) {
if (parser.getNumResults() > 0)
result.types.push_back(asyncTokenType);
+ bool hasCluster = false;
+ if (succeeded(
+ parser.parseOptionalKeyword(LaunchOp::getClustersKeyword().data()))) {
+ hasCluster = true;
+ sizes.resize(9);
+ regionArgs.resize(18);
+ }
+ MutableArrayRef<OpAsmParser::UnresolvedOperand> sizesRef(sizes);
+ MutableArrayRef<OpAsmParser::UnresolvedOperand> regionArgsRef(regionArgs);
+
+ // The last three size segments assign the cluster size. In the region
+ // argument list, these are the last six arguments.
+ if (hasCluster) {
+ if (parseSizeAssignment(parser, sizesRef.drop_front(6),
+ regionArgsRef.slice(15, 3),
+ regionArgsRef.slice(12, 3)))
+ return failure();
+ }
// Parse the size assignment segments: the first segment assigns grid sizes
// and defines values for block identifiers; the second segment assigns block
// sizes and defines values for thread identifiers. In the region argument
@@ -898,7 +952,7 @@ ParseResult LaunchOp::parse(OpAsmParser &parser, OperationState &result) {
// LaunchOp::getNumWorkgroupAttributionsAttrName().
Type index = parser.getBuilder().getIndexType();
SmallVector<Type, LaunchOp::kNumConfigRegionAttributes> dataTypes(
- LaunchOp::kNumConfigRegionAttributes, index);
+ LaunchOp::kNumConfigRegionAttributes + 6, index);
SmallVector<OpAsmParser::Argument> regionArguments;
for (auto ssaValueAndType : llvm::zip(regionArgs, dataTypes)) {
@@ -916,8 +970,9 @@ ParseResult LaunchOp::parse(OpAsmParser &parser, OperationState &result) {
// Store the number of operands we just parsed as the number of workgroup
// memory attributions.
- unsigned numWorkgroupAttrs =
- regionArguments.size() - LaunchOp::kNumConfigRegionAttributes;
+ unsigned numWorkgroupAttrs = regionArguments.size() -
+ LaunchOp::kNumConfigRegionAttributes -
+ (hasCluster ? 6 : 0);
result.addAttribute(LaunchOp::getNumWorkgroupAttributionsAttrName(),
builder.getI64IntegerAttr(numWorkgroupAttrs));
@@ -934,8 +989,14 @@ ParseResult LaunchOp::parse(OpAsmParser &parser, OperationState &result) {
parser.parseOptionalAttrDict(result.attributes))
return failure();
- SmallVector<int32_t, 8> segmentSizes(8, 1);
+ SmallVector<int32_t, 11> segmentSizes(11, 1);
segmentSizes.front() = asyncDependencies.size();
+
+ if (!hasCluster) {
+ segmentSizes[7] = 0;
+ segmentSizes[8] = 0;
+ segmentSizes[9] = 0;
+ }
segmentSizes.back() = hasDynamicSharedMemorySize ? 1 : 0;
result.addAttribute(LaunchOp::getOperandSegmentSizeAttr(),
parser.getBuilder().getDenseI32ArrayAttr(segmentSizes));
@@ -992,7 +1053,7 @@ BlockArgument LaunchOp::addWorkgroupAttribution(Type type, Location loc) {
(*this)->setAttr(attrName,
IntegerAttr::get(attr.getType(), attr.getValue() + 1));
return getBody().insertArgument(
- LaunchOp::kNumConfigRegionAttributes + attr.getInt(), type, loc);
+ LaunchOp::getNumConfigRegionAttributes() + attr.getInt(), type, loc);
}
/// Adds a new block argument that corresponds to buffers located in
diff --git a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
index 7432a58f18b442..2436113dc4239c 100644
--- a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
@@ -49,15 +49,21 @@ static void createForAllDimensions(OpBuilder &builder, Location loc,
/// entry block of `launchOpBody`, to the corresponding result value of the
/// added operations.
static void injectGpuIndexOperations(Location loc, Region &launchFuncOpBody,
- Region &launchOpBody, IRMapping &map) {
+ Region &launchOpBody, IRMapping &map,
+ bool hasCluster = false) {
OpBuilder builder(loc->getContext());
Block &firstBlock = launchOpBody.front();
builder.setInsertionPointToStart(&launchFuncOpBody.front());
- SmallVector<Value, 12> indexOps;
+ SmallVector<Value> indexOps;
+ // The order is important here, as it must match the order of the launch
+ // op's region arguments.
createForAllDimensions<gpu::BlockIdOp>(builder, loc, indexOps);
createForAllDimensions<gpu::ThreadIdOp>(builder, loc, indexOps);
createForAllDimensions<gpu::GridDimOp>(builder, loc, indexOps);
createForAllDimensions<gpu::BlockDimOp>(builder, loc, indexOps);
+ if (hasCluster) {
+ createForAllDimensions<gpu::ClusterIdOp>(builder, loc, indexOps);
+ createForAllDimensions<gpu::ClusterDimOp>(builder, loc, indexOps);
+ }
// Replace the leading 12 function args with the respective thread/block index
// operations. Iterate backwards since args are erased and indices change.
for (const auto &indexOp : enumerate(indexOps))
@@ -212,9 +218,11 @@ static gpu::GPUFuncOp outlineKernelFuncImpl(gpu::LaunchOp launchOp,
IRMapping map;
// Map the arguments corresponding to the launch parameters like blockIdx,
- // threadIdx, etc.
+ // threadIdx, etc. If a cluster is present, we also generate clusterIdx and
+ // clusterDim.
Region &outlinedFuncBody = outlinedFunc.getBody();
- injectGpuIndexOperations(loc, outlinedFuncBody, launchOpBody, map);
+ injectGpuIndexOperations(loc, outlinedFuncBody, launchOpBody, map,
+ launchOp.hasClusterSize());
// Map memory attributions from the LaunOp op to the GPUFuncOp attributions.
for (const auto &[launchArg, funcArg] :
@@ -278,12 +286,14 @@ static void convertToLaunchFuncOp(gpu::LaunchOp launchOp,
// The launch op has an optional dynamic shared memory size. If it doesn't
// exist, we use zero.
Value asyncToken = launchOp.getAsyncToken();
+ std::optional<gpu::KernelDim3> clusterSize =
+ launchOp.getClusterSizeOperandValues();
auto launchFunc = builder.create<gpu::LaunchFuncOp>(
launchOp.getLoc(), kernelFunc, launchOp.getGridSizeOperandValues(),
launchOp.getBlockSizeOperandValues(),
launchOp.getDynamicSharedMemorySize(), operands,
asyncToken ? asyncToken.getType() : nullptr,
- launchOp.getAsyncDependencies());
+ launchOp.getAsyncDependencies(), clusterSize);
launchOp.replaceAllUsesWith(launchFunc);
launchOp.erase();
}
diff --git a/mlir/test/Conversion/SCFToGPU/no_blocks_no_threads.mlir b/mlir/test/Conversion/SCFToGPU/no_blocks_no_threads.mlir
index a058365a104a1f..79eef8ae7eb856 100644
--- a/mlir/test/Conversion/SCFToGPU/no_blocks_no_threads.mlir
+++ b/mlir/test/Conversion/SCFToGPU/no_blocks_no_threads.mlir
@@ -17,8 +17,8 @@ func.func @one_d_loop(%A : memref<?xf32>, %B : memref<?xf32>) {
// CHECK-BLOCKS-NEXT: %{{.*}} = arith.constant 1 : index
// CHECK-BLOCKS-NEXT: %[[ONE:.*]] = arith.constant 1 : index
- // CHECK-THREADS-NEXT: gpu.launch blocks(%[[B0:.*]], %[[B1:.*]], %[[B2:.*]]) in (%{{.*}} = %[[ONE]], %{{.*}} = %[[ONE]], %{{.*}}0 = %[[ONE]]) threads(%[[T0:.*]], %[[T1:.*]], %[[T2:.*]]) in (%{{.*}} = %[[BOUND]], %{{.*}} = %[[ONE]], %{{.*}} = %[[ONE]])
- // CHECK-BLOCKS-NEXT: gpu.launch blocks(%[[B0:.*]], %[[B1:.*]], %[[B2:.*]]) in (%{{.*}} = %[[BOUND]], %{{.*}} = %[[ONE]], %{{.*}}0 = %[[ONE]]) threads(%[[T0:.*]], %[[T1:.*]], %[[T2:.*]]) in (%{{.*}} = %[[ONE]], %{{.*}} = %[[ONE]], %{{.*}} = %[[ONE]])
+ // CHECK-THREADS-NEXT: gpu.launch blocks(%[[B0:.*]], %[[B1:.*]], %[[B2:.*]]) in (%{{.*}} = %[[ONE]], %{{.*}} = %[[ONE]], %{{.*}} = %[[ONE]]) threads(%[[T0:.*]], %[[T1:.*]], %[[T2:.*]]) in (%{{.*}} = %[[BOUND]], %{{.*}} = %[[ONE]], %{{.*}} = %[[ONE]])
+ // CHECK-BLOCKS-NEXT: gpu.launch blocks(%[[B0:.*]], %[[B1:.*]], %[[B2:.*]]) in (%{{.*}} = %[[BOUND]], %{{.*}} = %[[ONE]], %{{.*}} = %[[ONE]]) threads(%[[T0:.*]], %[[T1:.*]], %[[T2:.*]]) in (%{{.*}} = %[[ONE]], %{{.*}} = %[[ONE]], %{{.*}} = %[[ONE]])
affine.for %i = 0 to 42 {
// CHECK-THREADS-NEXT: %[[INDEX:.*]] = arith.addi %{{.*}}, %[[T0]]
// CHECK-THREADS-NEXT: memref.load %{{.*}}[%[[INDEX]]]
diff --git a/mlir/test/Dialect/GPU/invalid.mlir b/mlir/test/Dialect/GPU/invalid.mlir
index 8a3...
[truncated]
``````````
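For building the op programmatically, here is a sketch against the extended builder signature shown in the diff above. The helper name `buildClusteredLaunch` and the constant values are illustrative, not part of the patch:

```cpp
#include <cassert>

#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/IR/Builders.h"

using namespace mlir;

// Build a gpu.launch with a 2x1x1 cluster at the builder's insertion point.
// Operand order after the async dependencies is: grid sizes, block sizes,
// optional cluster sizes, optional dynamic shared memory size.
static gpu::LaunchOp buildClusteredLaunch(OpBuilder &builder, Location loc) {
  Value one = builder.create<arith::ConstantIndexOp>(loc, 1);
  Value two = builder.create<arith::ConstantIndexOp>(loc, 2);
  auto launch = builder.create<gpu::LaunchOp>(
      loc, /*gridSizeX=*/two, /*gridSizeY=*/one, /*gridSizeZ=*/one,
      /*blockSizeX=*/two, /*blockSizeY=*/one, /*blockSizeZ=*/one,
      /*dynamicSharedMemorySize=*/Value(), /*asyncTokenType=*/Type(),
      /*asyncDependencies=*/ValueRange(),
      /*workgroupAttributions=*/TypeRange(),
      /*privateAttributions=*/TypeRange(),
      /*clusterSizeX=*/two, /*clusterSizeY=*/one, /*clusterSizeZ=*/one);
  // With all three cluster operands set, hasClusterSize() is true and the
  // body region has 18 leading config arguments instead of 12. A
  // gpu.terminator must still be added to the body before verification.
  assert(launch.hasClusterSize());
  return launch;
}
```

Passing null `Value`s for the three cluster operands (the defaults) reproduces the old behavior, so existing callers of the builder are unaffected.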
https://github.com/llvm/llvm-project/pull/76924