[Mlir-commits] [mlir] [mlir][affine]make threadid op is valid symbol. (PR #118478)
llvmlistbot at llvm.org
llvmlistbot at llvm.org
Tue Dec 3 04:05:32 PST 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-mlir
Author: lonely eagle (linuxlonelyeagle)
<details>
<summary>Changes</summary>
As title.
Consider the following code:
```
#map = affine_map<()[s0] -> (s0 mod 32)>
module {
gpu.module @<!-- -->gpu {
gpu.func @<!-- -->gemm(%arg0: memref<?x?xf32>) kernel {
%c3 = arith.constant 3 : index
%dim = memref.dim %arg0, %c3 : memref<?x?xf32>
%c0 = arith.constant 0 : index
%0 = affine.apply #map()[%thread_id_x]
affine.for %arg3 = %c0 to %dim step 32 {
%thread_id_x = gpu.thread_id x
%0 = affine.apply #map()[%thread_id_x]
%c128 = arith.constant 128 : index
affine.for %arg4 = %0 to %c128 step 8 {
%c32 = arith.constant 32 : index
}
}
gpu.return
}
```
The code above is fine.The following code causes problems.The reason is that affine.for does not have `AffineScope`.But affine.for should not be able to introduce `AffineScope`.
```
#map = affine_map<()[s0] -> (s0 mod 32)>
module {
gpu.module @<!-- -->gpu {
gpu.func @<!-- -->gemm(%arg0: memref<?x?xf32>) kernel {
%c3 = arith.constant 3 : index
%dim = memref.dim %arg0, %c3 : memref<?x?xf32>
%c0 = arith.constant 0 : index
affine.for %arg3 = %c0 to %dim step 32 {
%thread_id_x = gpu.thread_id x
%0 = affine.apply #map()[%thread_id_x]
%c128 = arith.constant 128 : index
affine.for %arg4 = %0 to %c128 step 8 {
%c32 = arith.constant 32 : index
}
}
gpu.return
}
}
}
```
Why do we need to do this?
```
module {
gpu.module @<!-- -->gpu {
gpu.func @<!-- -->gemm(%arg0: memref<?x?xf32>) kernel {
%c3 = arith.constant 3 : index
%dim = memref.dim %arg0, %c3 : memref<?x?xf32>
%c0 = arith.constant 0 : index
affine.for %arg3 = %c0 to %dim step 32 {
//thread load op.
}
gpu.return
}
}
}
```
Here is the result of `a separate thread loadOp` (after lower, although there is no `memref.load` in it yet).The importance of having the `threadid` as a legal symbol is demonstrated here, although I could have put the op in the Regon of the `funcOp`, but that would have added an unreasonable amount of complexity.
```
%thread_id_x = gpu.thread_id x
%0 = affine.apply #map()[%thread_id_x]
%c128 = arith.constant 128 : index
affine.for %arg4 = %0 to %c128 step 8 {
%c32 = arith.constant 32 : index
}
```
---
Full diff: https://github.com/llvm/llvm-project/pull/118478.diff
4 Files Affected:
- (modified) mlir/lib/Dialect/Affine/IR/AffineOps.cpp (+6)
- (modified) mlir/lib/Dialect/Affine/IR/CMakeLists.txt (+1)
- (modified) mlir/test/Dialect/Affine/ops.mlir (+36)
- (modified) mlir/test/Dialect/GPU/transform-gpu.mlir (+28-28)
``````````diff
diff --git a/mlir/lib/Dialect/Affine/IR/AffineOps.cpp b/mlir/lib/Dialect/Affine/IR/AffineOps.cpp
index dceebbfec586c8..cf355515deb63d 100644
--- a/mlir/lib/Dialect/Affine/IR/AffineOps.cpp
+++ b/mlir/lib/Dialect/Affine/IR/AffineOps.cpp
@@ -8,6 +8,7 @@
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Affine/IR/AffineValueMap.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/UB/IR/UBOps.h"
#include "mlir/Dialect/Utils/StaticValueUtils.h"
@@ -410,6 +411,7 @@ bool mlir::affine::isValidSymbol(Value value) {
/// A value can be used as a symbol for `region` iff it meets one of the
/// following conditions:
/// *) It is a constant.
+/// *) It is a threadId Op.
/// *) It is the result of an affine apply operation with symbol arguments.
/// *) It is a result of the dim op on a memref whose corresponding size is
/// a valid symbol.
@@ -443,6 +445,10 @@ bool mlir::affine::isValidSymbol(Value value, Region *region) {
if (matchPattern(defOp, m_Constant(&operandCst)))
return true;
+ // ThreadId operation is ok.
+ if (isa<gpu::ThreadIdOp>(defOp))
+ return true;
+
// Affine apply operation is ok if all of its operands are ok.
if (auto applyOp = dyn_cast<AffineApplyOp>(defOp))
return applyOp.isValidSymbol(region);
diff --git a/mlir/lib/Dialect/Affine/IR/CMakeLists.txt b/mlir/lib/Dialect/Affine/IR/CMakeLists.txt
index 7f7a01be891e05..9dad5cdb28cbc4 100644
--- a/mlir/lib/Dialect/Affine/IR/CMakeLists.txt
+++ b/mlir/lib/Dialect/Affine/IR/CMakeLists.txt
@@ -22,4 +22,5 @@ add_mlir_dialect_library(MLIRAffineDialect
MLIRSideEffectInterfaces
MLIRUBDialect
MLIRValueBoundsOpInterface
+ MLIRGPUDialect
)
diff --git a/mlir/test/Dialect/Affine/ops.mlir b/mlir/test/Dialect/Affine/ops.mlir
index c6bfb688db1c1d..5bd556619f3d5a 100644
--- a/mlir/test/Dialect/Affine/ops.mlir
+++ b/mlir/test/Dialect/Affine/ops.mlir
@@ -324,3 +324,39 @@ module attributes {gpu.container_module} {
// CHECK: affine.for %[[VAL_4:.*]] = %[[VAL_3]] to %[[VAL_2]] step 32 {
// CHECK: }
// CHECK: gpu.return
+
+// -----
+
+#map = affine_map<()[s0] -> (s0 mod 32)>
+
+// CHECK: #[[$ATTR_0:.+]] = affine_map<()[s0] -> (s0 mod 32)>
+
+module {
+ gpu.module @gpu {
+ gpu.func @affine_thread_id(%arg0: memref<?x?xf32>) kernel {
+ %c3 = arith.constant 3 : index
+ %dim = memref.dim %arg0, %c3 : memref<?x?xf32>
+ %c0 = arith.constant 0 : index
+ affine.for %arg3 = %c0 to %dim step 32 {
+ %thread_id_x = gpu.thread_id x
+ %0 = affine.apply #map()[%thread_id_x]
+ %c128 = arith.constant 128 : index
+ affine.for %arg4 = %0 to %c128 step 8 {
+ %c32 = arith.constant 32 : index
+ }
+ }
+ gpu.return
+ }
+ }
+}
+
+// CHECK-LABEL: @affine_thread_id
+// CHECK-SAME: (%[[VAL_0:.*]]: memref<?x?xf32>) kernel {
+// CHECK: %[[VAL_1:.*]] = arith.constant 3 : index
+// CHECK: %[[VAL_2:.*]] = memref.dim %[[VAL_0]], %[[VAL_1]] : memref<?x?xf32>
+// CHECK: %[[VAL_3:.*]] = arith.constant 0 : index
+// CHECK: affine.for %[[VAL_4:.*]] = %[[VAL_3]] to %[[VAL_2]] step 32 {
+// CHECK: %[[VAL_5:.*]] = gpu.thread_id x
+// CHECK: %[[VAL_6:.*]] = affine.apply #[[$ATTR_0]](){{\[}}%[[VAL_5]]]
+// CHECK: %[[VAL_7:.*]] = arith.constant 128 : index
+// CHECK: affine.for %[[VAL_8:.*]] = %[[VAL_6]] to %[[VAL_7]] step 8 {
diff --git a/mlir/test/Dialect/GPU/transform-gpu.mlir b/mlir/test/Dialect/GPU/transform-gpu.mlir
index 72572c6a38de12..6018eb40bac2a8 100644
--- a/mlir/test/Dialect/GPU/transform-gpu.mlir
+++ b/mlir/test/Dialect/GPU/transform-gpu.mlir
@@ -43,7 +43,7 @@ module attributes {transform.with_named_sequence} {
!type = memref<2 x 32 x f32>
!type1d = memref<32 x f32>
-// CHECK-DAG: #[[$MAP:.*]] = affine_map<(d0) -> (d0 floordiv 128)>
+// CHECK-DAG: #[[$MAP:.*]] = affine_map<()[s0] -> (s0 floordiv 128)>
// CHECK-LABEL: func.func @warpgroup_3d(
// CHECK-SAME: %[[ARGX:[0-9a-z]+]]: memref<2x32xf32>
@@ -61,7 +61,7 @@ func.func @warpgroup_3d(%x: !type, %y: !type, %t: !type1d, %alpha : f32, %stream
// CHECK: gpu.launch
// CHECK: %[[TIDX:.*]] = gpu.thread_id x
// CHECK: %[[TIDY:.*]] = gpu.thread_id y
-// CHECK-DAG: %[[WG:.*]] = affine.apply #[[$MAP]](%[[TIDX]])
+// CHECK-DAG: %[[WG:.*]] = affine.apply #[[$MAP]]()[%[[TIDX]]]
// CHECK-DAG: %[[CMPX:.*]] = arith.cmpi ult, %[[TIDX]], %[[C384]] : index
// CHECK-DAG: %[[CMPY:.*]] = arith.cmpi ult, %[[TIDY]], %[[C1]] : index
// CHECK: %[[COND:.*]] = arith.andi %[[CMPX]], %[[CMPY]] : i1
@@ -95,7 +95,7 @@ module attributes {transform.with_named_sequence} {
!type = memref<2 x 32 x f32>
!type1d = memref<32 x f32>
-// CHECK-DAG: #[[$MAP:.*]] = affine_map<(d0) -> (d0 floordiv 16)>
+// CHECK-DAG: #map = affine_map<()[s0] -> (s0 floordiv 16)>
// CHECK-LABEL: func.func @warp_3d(
// CHECK-SAME: %[[ARGX:[0-9a-z]+]]: memref<2x32xf32>
@@ -114,7 +114,7 @@ func.func @warp_3d(%x: !type, %y: !type, %t: !type1d, %alpha : f32, %stream : !g
// CHECK: gpu.launch
// CHECK: %[[TIDX:.*]] = gpu.thread_id x
// CHECK: %[[TIDY:.*]] = gpu.thread_id y
-// CHECK-DAG: %[[W:.*]] = affine.apply #[[$MAP]](%[[TIDX]])
+// CHECK-DAG: %[[W:.*]] = affine.apply #[[$MAP]]()[%[[TIDX]]]
// CHECK-DAG: %[[CMPX:.*]] = arith.cmpi ult, %[[TIDX]], %[[C32]] : index
// CHECK-DAG: %[[CMPY:.*]] = arith.cmpi ult, %[[TIDY]], %[[C3]] : index
// CHECK: %[[COND:.*]] = arith.andi %[[CMPX]], %[[CMPY]] : i1
@@ -354,9 +354,9 @@ module attributes {transform.with_named_sequence} {
!type = memref<2 x 32 x f32>
!type1d = memref<32 x f32>
-// CHECK-DAG: #[[$MAPWGLIN:.*]] = affine_map<(d0, d1, d2) -> (d0 + d1 * 32 + d2 * 256)>
-// CHECK-DAG: #[[$MAPWGX:.*]] = affine_map<(d0, d1) -> (((d0 + d1 * 32) floordiv 128) mod 2)>
-// CHECK-DAG: #[[$MAPWGY:.*]] = affine_map<(d0, d1, d2) -> (d2 + ((d0 + d1 * 32) floordiv 128) floordiv 2)>
+// CHECK-DAG: #[[$MAPWGLIN:.*]] = affine_map<()[s0, s1, s2] -> (s0 + s1 * 32 + s2 * 256)>
+// CHECK-DAG: #[[$MAPWGX:.*]] = affine_map<()[s0, s1] -> (((s0 + s1 * 32) floordiv 128) mod 2)>
+// CHECK-DAG: #[[$MAPWGY:.*]] = affine_map<()[s0, s1, s2] -> (s2 + ((s0 + s1 * 32) floordiv 128) floordiv 2)>
// CHECK-LABEL: func.func @warpgroup_linear(
// CHECK-SAME: %[[ARGX:[0-9a-z]+]]: memref<2x32xf32>
@@ -376,9 +376,9 @@ func.func @warpgroup_linear(%x: !type, %y: !type, %t: !type1d, %alpha : f32, %st
// CHECK-DAG: %[[TIDX:.*]] = gpu.thread_id x
// CHECK-DAG: %[[TIDY:.*]] = gpu.thread_id y
// CHECK-DAG: %[[TIDZ:.*]] = gpu.thread_id z
-// CHECK-DAG: %[[WIDLIN:.*]] = affine.apply #[[$MAPWGLIN]](%[[TIDX]], %[[TIDY]], %[[TIDZ]])
-// CHECK-DAG: %[[WIDX:.*]] = affine.apply #[[$MAPWGX]](%[[TIDX]], %[[TIDY]])
-// CHECK-DAG: %[[WIDY:.*]] = affine.apply #[[$MAPWGY]](%[[TIDX]], %[[TIDY]], %[[TIDZ]])
+// CHECK-DAG: %[[WIDLIN:.*]] = affine.apply #[[$MAPWGLIN]]()[%[[TIDX]], %[[TIDY]], %[[TIDZ]]]
+// CHECK-DAG: %[[WIDX:.*]] = affine.apply #[[$MAPWGX]]()[%[[TIDX]], %[[TIDY]]]
+// CHECK-DAG: %[[WIDY:.*]] = affine.apply #[[$MAPWGY]]()[%[[TIDX]], %[[TIDY]], %[[TIDZ]]]
// CHECK-DAG: %[[CMPLIN:.*]] = arith.cmpi ult, %[[WIDLIN]], %[[C768]] : index
// CHECK: scf.if %[[CMPLIN]]
// CHECK: memref.load %[[ARGX]][%[[WIDX]], %[[WIDY]]]
@@ -410,9 +410,9 @@ module attributes {transform.with_named_sequence} {
!type = memref<2 x 32 x f32>
!type1d = memref<32 x f32>
-// CHECK-DAG: #[[$MAPWLIN:.*]] = affine_map<(d0, d1, d2) -> (d0 + d1 * 32 + d2 * 256)>
-// CHECK-DAG: #[[$MAPWX:.*]] = affine_map<(d0, d1, d2) -> ((d1 + d2 * 8 + d0 floordiv 32) mod 2)>
-// CHECK-DAG: #[[$MAPWY:.*]] = affine_map<(d0, d1, d2) -> ((d1 + d2 * 8 + d0 floordiv 32) floordiv 2)>
+// CHECK-DAG: #[[$MAPWLIN:.*]] = affine_map<()[s0, s1, s2] -> (s0 + s1 * 32 + s2 * 256)>
+// CHECK-DAG: #[[$MAPWX:.*]] = affine_map<()[s0, s1, s2] -> ((s1 + s2 * 8 + s0 floordiv 32) mod 2)>
+// CHECK-DAG: #[[$MAPWY:.*]] = affine_map<()[s0, s1, s2] -> ((s1 + s2 * 8 + s0 floordiv 32) floordiv 2)>
// CHECK-LABEL: func.func @warp_linear(
// CHECK-SAME: %[[ARGX:[0-9a-z]+]]: memref<2x32xf32>
@@ -432,9 +432,9 @@ func.func @warp_linear(%x: !type, %y: !type, %t: !type1d, %alpha : f32, %stream
// CHECK-DAG: %[[TIDX:.*]] = gpu.thread_id x
// CHECK-DAG: %[[TIDY:.*]] = gpu.thread_id y
// CHECK-DAG: %[[TIDZ:.*]] = gpu.thread_id z
-// CHECK-DAG: %[[WIDLIN:.*]] = affine.apply #[[$MAPWLIN]](%[[TIDX]], %[[TIDY]], %[[TIDZ]])
-// CHECK-DAG: %[[WIDX:.*]] = affine.apply #[[$MAPWX]](%[[TIDX]], %[[TIDY]], %[[TIDZ]])
-// CHECK-DAG: %[[WIDY:.*]] = affine.apply #[[$MAPWY]](%[[TIDX]], %[[TIDY]], %[[TIDZ]])
+// CHECK-DAG: %[[WIDLIN:.*]] = affine.apply #[[$MAPWLIN]]()[%[[TIDX]], %[[TIDY]], %[[TIDZ]]]
+// CHECK-DAG: %[[WIDX:.*]] = affine.apply #[[$MAPWX]]()[%[[TIDX]], %[[TIDY]], %[[TIDZ]]]
+// CHECK-DAG: %[[WIDY:.*]] = affine.apply #[[$MAPWY]]()[%[[TIDX]], %[[TIDY]], %[[TIDZ]]]
// CHECK-DAG: %[[CMPLIN:.*]] = arith.cmpi ult, %[[WIDLIN]], %[[C192]] : index
// CHECK: scf.if %[[CMPLIN]]
// CHECK: memref.load %[[ARGX]][%[[WIDX]], %[[WIDY]]]
@@ -466,12 +466,12 @@ module attributes {transform.with_named_sequence} {
!type = memref<2 x 32 x f32>
!type1d = memref<32 x f32>
-// CHECK-DAG: #[[$MAPWX:.*]] = affine_map<(d0, d1) -> (((d0 + d1 * 18) floordiv 32) mod 3)>
-// CHECK-DAG: #[[$MAPWY:.*]] = affine_map<(d0, d1) -> ((((d0 + d1 * 18) floordiv 32) mod 6) floordiv 3)>
+// CHECK-DAG: #[[$MAPWX:.*]] = affine_map<()[s0, s1] -> (((s0 + s1 * 18) floordiv 32) mod 3)>
+// CHECK-DAG: #[[$MAPWY:.*]] = affine_map<()[s0, s1] -> ((((s0 + s1 * 18) floordiv 32) mod 6) floordiv 3)>
-// CHECK-DAG: #[[$MAPLIN:.*]] = affine_map<(d0, d1) -> (d0 + d1 * 18)>
-// CHECK-DAG: #[[$MAPLX:.*]] = affine_map<(d0, d1) -> ((d0 + d1 * 18) mod 10)>
-// CHECK-DAG: #[[$MAPLY:.*]] = affine_map<(d0, d1) -> ((d0 + d1 * 18) floordiv 10)>
+// CHECK-DAG: #[[$MAPLIN:.*]] = affine_map<()[s0, s1] -> (s0 + s1 * 18)>
+// CHECK-DAG: #[[$MAPLX:.*]] = affine_map<()[s0, s1] -> ((s0 + s1 * 18) mod 10)>
+// CHECK-DAG: #[[$MAPLY:.*]] = affine_map<()[s0, s1] -> ((s0 + s1 * 18) floordiv 10)>
// CHECK-LABEL: func.func @map_multi_level_linear(
func.func @map_multi_level_linear(%x: !type, %y: !type, %t: !type1d, %alpha : f32, %stream : !gpu.async.token) -> !type {
@@ -504,9 +504,9 @@ func.func @map_multi_level_linear(%x: !type, %y: !type, %t: !type1d, %alpha : f3
memref.store %6, %y[%i, %j] : !type
} { mapping = [#gpu.thread<y>, #gpu.thread<x>]}
- // CHECK-DAG: %[[LIN:.*]] = affine.apply #[[$MAPLIN]](%[[TIDX]], %[[TIDY]])
- // CHECK-DAG: %[[WIDX:.*]] = affine.apply #[[$MAPWX]](%[[TIDX]], %[[TIDY]])
- // CHECK-DAG: %[[WIDY:.*]] = affine.apply #[[$MAPWY]](%[[TIDX]], %[[TIDY]])
+ // CHECK-DAG: %[[LIN:.*]] = affine.apply #[[$MAPLIN]]()[%[[TIDX]], %[[TIDY]]]
+ // CHECK-DAG: %[[WIDX:.*]] = affine.apply #[[$MAPWX]]()[%[[TIDX]], %[[TIDY]]]
+ // CHECK-DAG: %[[WIDY:.*]] = affine.apply #[[$MAPWY]]()[%[[TIDX]], %[[TIDY]]]
// CHECK-DAG: %[[CMPLIN:.*]] = arith.cmpi ult, %[[LIN]], %[[C192]] : index
// CHECK: scf.if %[[CMPLIN]]
scf.forall (%i, %j, %k) in (%c3, %c2, %c1) {
@@ -515,8 +515,8 @@ func.func @map_multi_level_linear(%x: !type, %y: !type, %t: !type1d, %alpha : f3
memref.store %8, %y[%i, %j] : !type
} {mapping = [#gpu.warp<linear_dim_0>, #gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_2>] }
- // CHECK-DAG: %[[LIDX:.*]] = affine.apply #[[$MAPLX]](%[[TIDX]], %[[TIDY]])
- // CHECK-DAG: %[[LIDY:.*]] = affine.apply #[[$MAPLY]](%[[TIDX]], %[[TIDY]])
+ // CHECK-DAG: %[[LIDX:.*]] = affine.apply #[[$MAPLX]]()[%[[TIDX]], %[[TIDY]]]
+ // CHECK-DAG: %[[LIDY:.*]] = affine.apply #[[$MAPLY]]()[%[[TIDX]], %[[TIDY]]]
// CHECK-DAG: %[[COND:.*]] = arith.cmpi ult, %[[LIN]], %[[C20]] : index
// CHECK: scf.if %[[COND]]
// CHECK: memref.load %{{.*}}[%[[LIDX]]] : memref<32xf32>
@@ -648,7 +648,7 @@ module attributes {transform.with_named_sequence} {
#map1 = affine_map<(d0) -> (d0 * 32)>
// CHECK-DAG: #[[$MAPB:.*]] = affine_map<(d0) -> (d0 * 128)>
-// CHECK-DAG: #[[$MAPW:.*]] = affine_map<(d0, d1, d2) -> (d2 * 32 + ((d0 + d1 * 4) floordiv 32) * 32)>
+// CHECK-DAG: #[[$MAPW:.*]] = affine_map<()[s0, s1, s2] -> (s2 * 32 + ((s0 + s1 * 4) floordiv 32) * 32)>
// CHECK-LABEL: func.func @simple_fill(
func.func @simple_fill(%arg0: memref<128xf32>) -> memref<128xf32> {
@@ -667,7 +667,7 @@ func.func @simple_fill(%arg0: memref<128xf32>) -> memref<128xf32> {
// CHECK: %[[TIDX:.*]] = gpu.thread_id x
// CHECK: %[[TIDY:.*]] = gpu.thread_id y
// CHECK: %[[TIDZ:.*]] = gpu.thread_id z
-// CHECK: %[[THX:.*]] = affine.apply #[[$MAPW]](%[[TIDX]], %[[TIDY]], %[[TIDZ]])
+// CHECK: %[[THX:.*]] = affine.apply #[[$MAPW]]()[%[[TIDX]], %[[TIDY]], %[[TIDZ]]]
// CHECK-NOT: scf.if
// CHECK: memref.subview %{{.*}}[%[[THX]]]
%1 = affine.apply #map1(%arg2)
``````````
</details>
https://github.com/llvm/llvm-project/pull/118478
More information about the Mlir-commits
mailing list