[Mlir-commits] [mlir] e68a7be - [mlir][transform] Add failing test for GPU transform dialect
Guray Ozen
llvmlistbot at llvm.org
Wed Oct 5 04:10:23 PDT 2022
Author: Guray Ozen
Date: 2022-10-05T13:10:13+02:00
New Revision: e68a7bed599708409602316346c2f6fa3e500d8e
URL: https://github.com/llvm/llvm-project/commit/e68a7bed599708409602316346c2f6fa3e500d8e
DIFF: https://github.com/llvm/llvm-project/commit/e68a7bed599708409602316346c2f6fa3e500d8e.diff
LOG: [mlir][transform] Add failing test for GPU transform dialect
The GPU transform dialect currently has restrictions, and there are several situations in which the transform dialect cannot be used.
This update adds tests covering those failing cases in the GPU transform dialect.
Differential Revision: https://reviews.llvm.org/D135063
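For orientation, every test added below follows the same shape: a payload function, a transform.sequence (or a transform.with_pdl_patterns wrapper) that applies the GPU mapping op, and an expected-error check verified by --verify-diagnostics. A minimal sketch of that pattern, condensed from the first case in the new file (the function and value names here are illustrative; the full RUN line in the file also passes --split-input-file, -canonicalize and -cse):

// RUN: mlir-opt --test-transform-dialect-interpreter --verify-diagnostics %s

// Payload: the matched op is not a gpu.launch, so the mapping is expected to fail.
func.func @not_a_gpu_launch() -> () {
  %0 = tensor.empty() : tensor<4xf32>
  return
}

transform.sequence failures(propagate) {
^bb0(%arg0: !pdl.operation):
  %match = transform.structured.match ops{["tensor.empty"]} in %arg0
  // expected-error @below {{Given target is not gpu.launch}}
  %1 = transform.gpu.map_nested_foreach_to_threads %match
}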
Added:
mlir/test/Dialect/GPU/transform-gpu-failing.mlir
Modified:
mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp
Removed:
################################################################################
diff --git a/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp b/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp
index 3c1ef3df840d2..54f0573a70daf 100644
--- a/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp
+++ b/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp
@@ -365,8 +365,9 @@ static DiagnosedSilenceableFailure rewriteOneForeachThreadToGpuThreads(
llvm::zip(threadOps, blockDim, globalBlockDims)) {
if (blockDim > globalBlockDim) {
return failureHelper(
- "The GPU threads are fewer than the loop trip counts. "
- "Try to tile scf.foreach_thread before mapping.");
+ "The requested GPU threads are fewer than the number of loop trip "
+ "counts. Try to tile scf.foreach_thread before mapping or set small "
+ "blockDim.");
}
if (blockDim == globalBlockDim)
continue;
@@ -464,7 +465,7 @@ DiagnosedSilenceableFailure transform::MapNestedForeachToThreads::applyToOne(
rewriter.setInsertionPoint(target);
diag = mlir::transform::gpu::mapNestedForeachToThreadsImpl(
- rewriter, target, blockDim, getSyncAfterDistribute(), llvm::None);
+ rewriter, target, blockDim, getSyncAfterDistribute(), transformOp);
if (diag.succeeded()) {
diag =
alterGpuLaunch(rewriter, gpuLaunch, transformOp, llvm::None, llvm::None,
diff --git a/mlir/test/Dialect/GPU/transform-gpu-failing.mlir b/mlir/test/Dialect/GPU/transform-gpu-failing.mlir
new file mode 100644
index 0000000000000..dc0c3230f0165
--- /dev/null
+++ b/mlir/test/Dialect/GPU/transform-gpu-failing.mlir
@@ -0,0 +1,290 @@
+// RUN: mlir-opt --test-transform-dialect-interpreter --split-input-file -canonicalize -cse --verify-diagnostics %s
+
+func.func @map_nested_foreach_to_threads_not_gpu_launch() -> () {
+ %1 = tensor.empty() : tensor<4xf32>
+ return
+}
+transform.sequence failures(propagate) {
+^bb0(%arg0: !pdl.operation):
+ %funcop = transform.structured.match ops{["tensor.empty"]} in %arg0
+ // expected-error @below {{Given target is not gpu.launch}}
+ %1 = transform.gpu.map_nested_foreach_to_threads %funcop
+}
+
+// -----
+
+func.func @map_nested_foreach_to_threads_excessive_threads(%x: memref<2 x 32 x f32>, %y: memref<2 x 32 x f32>, %t: memref<32 x f32>, %alpha : f32, %stream : !gpu.async.token) -> memref<2 x 32 x f32> {
+ %one = arith.constant 1 : index
+ %c900 = arith.constant 900 : index
+ %c9 = arith.constant 9 : index
+ %c7 = arith.constant 7 : index
+ %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one)
+ threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one)
+ {
+ scf.foreach_thread (%i, %j) in (%c7, %c900) {
+ %4 = memref.load %x[%i, %j] : memref<2 x 32 x f32>
+ %5 = memref.load %y[%i, %j] : memref<2 x 32 x f32>
+ %6 = math.fma %alpha, %4, %5 : f32
+ memref.store %6, %y[%i, %j] : memref<2 x 32 x f32>
+ } {thread_dim_mapping = [1, 0, 2]}
+ gpu.terminator
+ }
+
+ %name2 = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one)
+ threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one)
+ {
+ scf.foreach_thread (%i, %j) in (%c7, %c9) {
+ %4 = memref.load %x[%i, %j] : memref<2 x 32 x f32>
+ %5 = memref.load %y[%i, %j] : memref<2 x 32 x f32>
+ %6 = math.fma %alpha, %4, %5 : f32
+ memref.store %6, %y[%i, %j] : memref<2 x 32 x f32>
+ } {thread_dim_mapping = [1, 0, 2]}
+ gpu.terminator
+ }
+
+ return %y : memref<2 x 32 x f32>
+}
+transform.with_pdl_patterns {
+^bb0(%arg0: !pdl.operation):
+ transform.sequence %arg0 failures(propagate) {
+ ^bb1(%arg1: !pdl.operation):
+ %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0
+ // expected-error @below {{Trying to launch a GPU kernel with gridDim = (1, 1, 1) blockDim = (1200, 9, 1). It is larger than the limits.}}
+ // expected-note @below {{"blockDim" is very large}}
+ transform.gpu.map_nested_foreach_to_threads %funcop { blockDim = [1200, 9, 1] }
+ }
+}
+
+// -----
+
+func.func @map_nested_foreach_to_threads_fewer_threads(%x: memref<2 x 32 x f32>, %y: memref<2 x 32 x f32>, %t: memref<32 x f32>, %alpha : f32, %stream : !gpu.async.token) -> memref<2 x 32 x f32> {
+ %one = arith.constant 1 : index
+ %c900 = arith.constant 900 : index
+ %c9 = arith.constant 9 : index
+ %c7 = arith.constant 7 : index
+ %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one)
+ threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one)
+ {
+ scf.foreach_thread (%i, %j) in (%c7, %c900) {
+ %4 = memref.load %x[%i, %j] : memref<2 x 32 x f32>
+ %5 = memref.load %y[%i, %j] : memref<2 x 32 x f32>
+ %6 = math.fma %alpha, %4, %5 : f32
+ memref.store %6, %y[%i, %j] : memref<2 x 32 x f32>
+ } {thread_dim_mapping = [1, 0, 2]}
+ gpu.terminator
+ }
+
+ %name2 = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one)
+ threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one)
+ {
+ scf.foreach_thread (%i, %j) in (%c7, %c9) {
+ %4 = memref.load %x[%i, %j] : memref<2 x 32 x f32>
+ %5 = memref.load %y[%i, %j] : memref<2 x 32 x f32>
+ %6 = math.fma %alpha, %4, %5 : f32
+ memref.store %6, %y[%i, %j] : memref<2 x 32 x f32>
+ } {thread_dim_mapping = [1, 0, 2]}
+ gpu.terminator
+ }
+
+ return %y : memref<2 x 32 x f32>
+}
+transform.with_pdl_patterns {
+^bb0(%arg0: !pdl.operation):
+ transform.sequence %arg0 failures(propagate) {
+ ^bb1(%arg1: !pdl.operation):
+ %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0
+ // expected-error @below {{The requested GPU threads are fewer than the number of loop trip counts. Try to tile scf.foreach_thread before mapping or set small blockDim.}}
+ transform.gpu.map_nested_foreach_to_threads %funcop { blockDim = [128, 4, 1] }
+ }
+}
+
+// -----
+
+func.func @map_nested_foreach_to_threads_dynamic_trip_count(%x: memref<2 x 32 x f32>, %y: memref<2 x 32 x f32>, %t: memref<32 x f32>, %alpha : f32, %stream : !gpu.async.token, %c9 : index, %c7 : index) -> memref<2 x 32 x f32> {
+ %one = arith.constant 1 : index
+ %c900 = arith.constant 900 : index
+ %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one)
+ threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one)
+ {
+ scf.foreach_thread (%i, %j) in (%c7, %c900) {
+ %4 = memref.load %x[%i, %j] : memref<2 x 32 x f32>
+ %5 = memref.load %y[%i, %j] : memref<2 x 32 x f32>
+ %6 = math.fma %alpha, %4, %5 : f32
+ memref.store %6, %y[%i, %j] : memref<2 x 32 x f32>
+ } {thread_dim_mapping = [1, 0, 2]}
+ gpu.terminator
+ }
+ return %y : memref<2 x 32 x f32>
+}
+
+transform.with_pdl_patterns {
+^bb0(%arg0: !pdl.operation):
+ transform.sequence %arg0 failures(propagate) {
+ ^bb1(%arg1: !pdl.operation):
+ %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0
+ // expected-error @below {{unsupported dynamic blockdim size}}
+ transform.gpu.map_nested_foreach_to_threads %funcop { blockDim = [128, 4, 1] }
+ }
+}
+
+// -----
+
+func.func @map_nested_foreach_to_threads_4d_loop(%x: memref<2x32x32x32xf32>, %y: memref<2x32x32x32xf32>, %stream : !gpu.async.token) -> memref<2x32x32x32xf32> {
+ %one = arith.constant 1 : index
+ %c2 = arith.constant 2 : index
+ %c32 = arith.constant 32 : index
+ %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one)
+ threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one)
+ {
+ scf.foreach_thread (%i, %j, %k, %l) in (%c2, %c32,%c32,%c32) {
+ %4 = memref.load %x[%i, %j, %k, %l] : memref<2x32x32x32xf32>
+ memref.store %4, %y[%i, %j, %k, %l] : memref<2x32x32x32xf32>
+ } {thread_dim_mapping = [1, 0, 2]}
+ gpu.terminator
+ }
+ return %y : memref<2x32x32x32xf32>
+}
+
+transform.with_pdl_patterns {
+^bb0(%arg0: !pdl.operation):
+ transform.sequence %arg0 failures(propagate) {
+ ^bb1(%arg1: !pdl.operation):
+ %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0
+ // expected-error @below {{scf.foreach_thread with rank > 3 does not lower to gpu.thread_id}}
+ transform.gpu.map_nested_foreach_to_threads %funcop { blockDim = [128, 4, 1] }
+ }
+}
+
+// -----
+
+func.func @map_nested_foreach_to_threads_not_buffer(%x: tensor<32x32xf32>, %y: tensor<32x32xf32>, %z: tensor<32x32xf32>, %stream : !gpu.async.token) {
+ %one = arith.constant 1 : index
+ %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one)
+ threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one)
+ {
+ %t = linalg.matmul ins(%x, %y: tensor<32x32xf32>, tensor<32x32xf32>) outs(%z : tensor<32x32xf32>) -> tensor<32x32xf32>
+ gpu.terminator
+ }
+ return
+}
+
+transform.with_pdl_patterns {
+^bb0(%arg0: !pdl.operation):
+ transform.sequence %arg0 failures(propagate) {
+ ^bb1(%arg1: !pdl.operation):
+ %matmul = transform.structured.match ops{["linalg.matmul"]} in %arg0
+ %foreach, %tiled = transform.structured.tile_to_foreach_thread_op %matmul num_threads [10, 20, 30]
+ %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0
+ // expected-error @below {{only bufferized scf.foreach_thread lowers to gpu.thread_id}}
+ transform.gpu.map_nested_foreach_to_threads %funcop { blockDim = [128, 4, 1] }
+ }
+}
+
+// -----
+
+
+func.func @map_foreach_to_blocks_not_gpu_launch() -> () {
+ // expected-note @below {{when applied to this payload op}}
+ %1 = tensor.empty() : tensor<4xf32>
+ return
+}
+transform.sequence failures(propagate) {
+^bb0(%arg0: !pdl.operation):
+ %funcop = transform.structured.match ops{["tensor.empty"]} in %arg0
+ // expected-error @below {{Given target is not gpu.launch}}
+ %1 = transform.gpu.map_foreach_to_blocks %funcop
+}
+
+// -----
+
+func.func @map_foreach_to_blocks_not_unique(%x: memref<2 x 32 x f32>, %y: memref<2 x 32 x f32>, %t: memref<32 x f32>, %alpha : f32, %stream : !gpu.async.token) -> memref<2 x 32 x f32> {
+ %one = arith.constant 1 : index
+ %c900 = arith.constant 900 : index
+ %c9 = arith.constant 9 : index
+ %c7 = arith.constant 7 : index
+ // expected-note @below {{when applied to this payload op}}
+ %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one)
+ threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one)
+ {
+ scf.foreach_thread (%i, %j) in (%c7, %c900) {
+ %4 = memref.load %x[%i, %j] : memref<2 x 32 x f32>
+ %5 = memref.load %y[%i, %j] : memref<2 x 32 x f32>
+ %6 = math.fma %alpha, %4, %5 : f32
+ memref.store %6, %y[%i, %j] : memref<2 x 32 x f32>
+ } {thread_dim_mapping = [1, 0, 2]}
+
+ scf.foreach_thread (%i, %j) in (%c7, %c9) {
+ %4 = memref.load %x[%i, %j] : memref<2 x 32 x f32>
+ %5 = memref.load %y[%i, %j] : memref<2 x 32 x f32>
+ %6 = math.fma %alpha, %4, %5 : f32
+ memref.store %6, %y[%i, %j] : memref<2 x 32 x f32>
+ } {thread_dim_mapping = [1, 0, 2]}
+ gpu.terminator
+ }
+
+ return %y : memref<2 x 32 x f32>
+}
+
+transform.sequence failures(propagate) {
+^bb0(%arg0: !pdl.operation):
+ %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0
+ // expected-error @below {{could not find a unique topLevel scf.foreach_thread}}
+ %1 = transform.gpu.map_foreach_to_blocks %funcop
+}
+
+// -----
+
+// expected-note @below {{when applied to this payload op}}
+func.func @map_foreach_to_blocks_large_loop(%x: memref<2 x 32 x f32>, %y: memref<2 x 32 x f32>, %t: memref<32 x f32>, %alpha : f32, %stream : !gpu.async.token) -> memref<2 x 32 x f32> {
+ %one = arith.constant 1 : index
+ %c65537 = arith.constant 65537 : index
+ %c9 = arith.constant 9 : index
+ %c7 = arith.constant 7 : index
+
+ scf.foreach_thread (%i, %j) in (%c7, %c65537) {
+ %4 = memref.load %x[%i, %j] : memref<2 x 32 x f32>
+ %5 = memref.load %y[%i, %j] : memref<2 x 32 x f32>
+ %6 = math.fma %alpha, %4, %5 : f32
+ memref.store %6, %y[%i, %j] : memref<2 x 32 x f32>
+ } {thread_dim_mapping = [0, 1, 2]}
+
+ scf.foreach_thread (%i, %j) in (%c7, %c9) {
+ %4 = memref.load %x[%i, %j] : memref<2 x 32 x f32>
+ %5 = memref.load %y[%i, %j] : memref<2 x 32 x f32>
+ %6 = math.fma %alpha, %4, %5 : f32
+ memref.store %6, %y[%i, %j] : memref<2 x 32 x f32>
+ } {thread_dim_mapping = [1, 0, 2]}
+
+ return %y : memref<2 x 32 x f32>
+}
+
+transform.sequence failures(propagate) {
+^bb0(%arg0: !pdl.operation):
+ %funcop = transform.structured.match ops{["func.func"]} in %arg0
+ // expected-error @below {{could not find a unique topLevel scf.foreach_thread}}
+ %1 = transform.gpu.map_foreach_to_blocks %funcop { generate_gpu_launch }
+}
+
+// -----
+
+func.func @map_foreach_to_blocks_large_loop(%x: memref<2 x 32 x f32>, %y: memref<2 x 32 x f32>, %t: memref<32 x f32>, %alpha : f32, %stream : !gpu.async.token) -> memref<2 x 32 x f32> {
+ %one = arith.constant 1 : index
+ %c65535 = arith.constant 65535 : index
+ scf.foreach_thread (%i, %j) in (%c65535, %c65535) {
+ %4 = memref.load %x[%i, %j] : memref<2 x 32 x f32>
+ %5 = memref.load %y[%i, %j] : memref<2 x 32 x f32>
+ %6 = math.fma %alpha, %4, %5 : f32
+ memref.store %6, %y[%i, %j] : memref<2 x 32 x f32>
+ } {thread_dim_mapping = [0, 1, 2]}
+ return %y : memref<2 x 32 x f32>
+}
+
+transform.sequence failures(propagate) {
+^bb0(%arg0: !pdl.operation):
+ %funcop = transform.structured.match ops{["func.func"]} in %arg0
+ // expected-error @below {{Trying to launch a GPU kernel with gridDim = (65535, 65535, 1) blockDim = (1, 1, 1). It is larger than the limits.}}
+ %1 = transform.gpu.map_foreach_to_blocks %funcop { generate_gpu_launch }
+}
+
+// -----
+