[Mlir-commits] [mlir] ba7f3e1 - [mlir][Transform] Fix support for mapping to GPU warps and to linear ids
Nicolas Vasilache
llvmlistbot at llvm.org
Mon Mar 20 05:25:11 PDT 2023
Author: Nicolas Vasilache
Date: 2023-03-20T05:23:17-07:00
New Revision: ba7f3e1d1e50212bdc8cc438185519fd7257aa44
URL: https://github.com/llvm/llvm-project/commit/ba7f3e1d1e50212bdc8cc438185519fd7257aa44
DIFF: https://github.com/llvm/llvm-project/commit/ba7f3e1d1e50212bdc8cc438185519fd7257aa44.diff
LOG: [mlir][Transform] Fix support for mapping to GPU warps and to linear ids
c59465e1203dd78d06e15f7ddf62141807dbd5a7 introduced mapping to warps and
linear GPU ids.
In the implementation, the delinearization basis is reversed from [x, y, z]
to [z, y, x] order to properly compute the strides and allow delinearization.
Prior to this commit, we forgot to reverse it back to [x, y, z] order
before materializing the indices.
Fix this oversight.
Added:
Modified:
mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp
mlir/test/Dialect/GPU/transform-gpu.mlir
Removed:
################################################################################
diff --git a/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp b/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp
index 0f566e4bdea5..550d8c1f7e44 100644
--- a/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp
+++ b/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp
@@ -140,12 +140,15 @@ struct GpuWarpIdBuilder : public GpuIdBuilder {
OpFoldResult warpIdOfr = makeComposedFoldedAffineApply(
rewriter, loc, d0.floorDiv(kWarpSize), {linearId});
Value warpId = getValueOrCreateConstantIndexOp(rewriter, loc, warpIdOfr);
+ // Sizes in [x, y, z] -> [z, y x] order to properly compute strides in
+ // "row-major" order.
SmallVector<int64_t> reverseBasisSizes(
llvm::reverse(this->availableMappingSizes));
SmallVector<int64_t> strides = computeStrides(reverseBasisSizes);
SmallVector<AffineExpr> delinearizingExprs = delinearize(d0, strides);
SmallVector<Value> ids;
- for (AffineExpr e : delinearizingExprs)
+ // Reverse back to be in [x, y, z] order.
+ for (AffineExpr e : llvm::reverse(delinearizingExprs))
ids.push_back(makeComposedAffineApply(rewriter, loc, e, warpId));
// clang-format off
@@ -191,13 +194,16 @@ struct GpuLinearIdBuilder : public GpuIdBuilder {
// Build the linear thread id and decompose it in the basis of
// `forallMappingSizes`.
Value linearId = buildLinearThreadId(rewriter, loc, this->blockDimsOfr);
+ // Sizes in [x, y, z] -> [z, y x] order to properly compute strides in
+ // "row-major" order.
SmallVector<int64_t> reverseBasisSizes(llvm::reverse(forallMappingSizes));
SmallVector<int64_t> strides = computeStrides(reverseBasisSizes);
AffineExpr d0;
bindDims(rewriter.getContext(), d0);
SmallVector<AffineExpr> delinearizingExprs = delinearize(d0, strides);
SmallVector<Value> ids;
- for (AffineExpr e : delinearizingExprs)
+ // Reverse back to be in [x, y, z] order.
+ for (AffineExpr e : llvm::reverse(delinearizingExprs))
ids.push_back(makeComposedAffineApply(rewriter, loc, e, linearId));
// clang-format off
diff --git a/mlir/test/Dialect/GPU/transform-gpu.mlir b/mlir/test/Dialect/GPU/transform-gpu.mlir
index e54af051c344..e485d4107a64 100644
--- a/mlir/test/Dialect/GPU/transform-gpu.mlir
+++ b/mlir/test/Dialect/GPU/transform-gpu.mlir
@@ -241,12 +241,12 @@ transform.sequence failures(propagate) {
!type = memref<2 x 32 x f32>
!type1d = memref<32 x f32>
-// CHECK-DAG: #[[$MAPWY:.*]] = affine_map<(d0, d1) -> (((d0 + d1 * 12) floordiv 32) floordiv 4)>
-// CHECK-DAG: #[[$MAPWX:.*]] = affine_map<(d0, d1) -> ((((d0 + d1 * 12) floordiv 32) mod 4) floordiv 2)>
+// CHECK-DAG: #[[$MAPWX:.*]] = affine_map<(d0, d1) -> (((d0 + d1 * 12) floordiv 32) mod 3)>
+// CHECK-DAG: #[[$MAPWY:.*]] = affine_map<(d0, d1) -> ((((d0 + d1 * 12) floordiv 32) mod 6) floordiv 3)>
// CHECK-DAG: #[[$MAPLIN:.*]] = affine_map<(d0, d1) -> (d0 + d1 * 12)>
-// CHECK-DAG: #[[$MAPLY:.*]] = affine_map<(d0, d1) -> ((d0 + d1 * 12) floordiv 20)>
-// CHECK-DAG: #[[$MAPLX:.*]] = affine_map<(d0, d1) -> (((d0 + d1 * 12) mod 20) floordiv 10)>
+// CHECK-DAG: #[[$MAPLX:.*]] = affine_map<(d0, d1) -> ((d0 + d1 * 12) mod 10)>
+// CHECK-DAG: #[[$MAPLY:.*]] = affine_map<(d0, d1) -> (((d0 + d1 * 12) mod 20) floordiv 10)>
// CHECK-LABEL: func.func @map_multi_level(
func.func @map_multi_level(%x: !type, %y: !type, %t: !type1d, %alpha : f32, %stream : !gpu.async.token) -> !type {
@@ -277,11 +277,11 @@ func.func @map_multi_level(%x: !type, %y: !type, %t: !type1d, %alpha : f32, %str
memref.store %6, %y[%i, %j] : !type
} { mapping = [#gpu.thread<y>, #gpu.thread<x>]}
- // CHECK-DAG: %[[WIDY:.*]] = affine.apply #[[$MAPWY]](%[[TIDX]], %[[TIDY]])
// CHECK-DAG: %[[WIDX:.*]] = affine.apply #[[$MAPWX]](%[[TIDX]], %[[TIDY]])
+ // CHECK-DAG: %[[WIDY:.*]] = affine.apply #[[$MAPWY]](%[[TIDX]], %[[TIDY]])
// CHECK-DAG: %[[CMPX:.*]] = arith.cmpi ult, %[[WIDX]], %[[C1]] : index
// CHECK-DAG: %[[CMPY:.*]] = arith.cmpi ult, %[[WIDY]], %[[C1]] : index
- // CHECK: %[[COND:.*]] = arith.andi %[[CMPY]], %[[CMPX]] : i1
+ // CHECK: %[[COND:.*]] = arith.andi %[[CMPX]], %[[CMPY]] : i1
// CHECK: scf.if %[[COND]]
scf.forall (%i) in (%c1) {
%7 = memref.load %t[%i] : !type1d
@@ -290,10 +290,12 @@ func.func @map_multi_level(%x: !type, %y: !type, %t: !type1d, %alpha : f32, %str
} {mapping = [#gpu.warp<x>] }
// CHECK-DAG: %[[LIN:.*]] = affine.apply #[[$MAPLIN]](%[[TIDX]], %[[TIDY]])
+ // CHECK-DAG: %[[LIDX:.*]] = affine.apply #[[$MAPLX]](%[[TIDX]], %[[TIDY]])
// CHECK-DAG: %[[LIDY:.*]] = affine.apply #[[$MAPLY]](%[[TIDX]], %[[TIDY]])
- // CHECK-DAG: %[[LIDZ:.*]] = affine.apply #[[$MAPLX]](%[[TIDX]], %[[TIDY]])
// CHECK-DAG: %[[COND:.*]] = arith.cmpi ult, %[[LIN]], %[[C20]] : index
// CHECK: scf.if %[[COND]]
+ // CHECK: memref.load %{{.*}}[%[[LIDX]]] : memref<32xf32>
+ // CHECK: memref.store %{{.*}}[%[[LIDY]]] : memref<32xf32>
scf.forall (%i, %j) in (%c10, %c2) {
%7 = memref.load %t[%i] : !type1d
%8 = arith.addf %alpha, %7 : f32
@@ -308,6 +310,6 @@ transform.sequence failures(propagate) {
^bb1(%arg0: !pdl.operation):
%funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!pdl.operation) -> !pdl.operation
transform.gpu.map_nested_forall_to_threads %funcop
- block_dims = [12, 11, 1] warp_dims = [2, 2, 1]
+ block_dims = [12, 11, 1] warp_dims = [3, 2, 1]
: (!pdl.operation) -> ()
}
More information about the Mlir-commits
mailing list