[Mlir-commits] [mlir] [mlir] Clamp UnPackOp tiling sizes from operand tile (PR #112429)
llvmlistbot at llvm.org
Tue Oct 29 07:11:13 PDT 2024
Max191 (https://github.com/Max191) updated https://github.com/llvm/llvm-project/pull/112429
From a14950490e938f0475dcb7d7ca26f08b8e92907d Mon Sep 17 00:00:00 2001
From: Max Dawkins <max.dawkins at gmail.com>
Date: Mon, 14 Oct 2024 14:21:37 -0500
Subject: [PATCH 1/2] [mlir] Clamp UnPackOp tiling sizes from operand tile
Signed-off-by: Max Dawkins <max.dawkins at gmail.com>
---
.../SCF/Transforms/TileUsingInterface.cpp | 10 +++++---
.../Tensor/IR/TensorTilingInterfaceImpl.cpp | 25 +++++++++++++++----
.../tile-and-fuse-consumer.mlir | 14 ++++++-----
3 files changed, 35 insertions(+), 14 deletions(-)
diff --git a/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp b/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp
index e2feb10b314540..1f53d595c372cd 100644
--- a/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp
+++ b/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp
@@ -1880,13 +1880,17 @@ mlir::scf::tileAndFuseConsumerOfSlice(RewriterBase &rewriter,
candidateSliceOp, "containingOp's result yield with stride");
}
- // 10. Try to get iter domain position from input position.
+ // 10. Try to get iter domain position from input position. Use
+ // clonedConsumerOp instead of tiledConsumerOp, because computing the
+ // iteration domain may require index computation based on the result size.
+ // The sizes and offsets are the same either way, but using tiledConsumerOp
+ // can introduce an unnecessary chain of extra index computations.
SmallVector<OpFoldResult> iterDomainOffsets, iterDomainSizes;
- if (failed(tiledConsumerOp.getIterationDomainTileFromOperandTile(
+ if (failed(clonedConsumerOp.getIterationDomainTileFromOperandTile(
rewriter, operandNumber, offsets, sizes, iterDomainOffsets,
iterDomainSizes))) {
return rewriter.notifyMatchFailure(
- tiledConsumerOp,
+ clonedConsumerOp,
"can't get iter domain position from input position");
}
diff --git a/mlir/lib/Dialect/Tensor/IR/TensorTilingInterfaceImpl.cpp b/mlir/lib/Dialect/Tensor/IR/TensorTilingInterfaceImpl.cpp
index ba41904b370991..f14054544d0439 100644
--- a/mlir/lib/Dialect/Tensor/IR/TensorTilingInterfaceImpl.cpp
+++ b/mlir/lib/Dialect/Tensor/IR/TensorTilingInterfaceImpl.cpp
@@ -16,6 +16,7 @@
#include "mlir/Dialect/Tensor/IR/Tensor.h"
#include "mlir/Dialect/Tensor/Utils/Utils.h"
#include "mlir/Dialect/Utils/IndexingUtils.h"
+#include "mlir/Interfaces/InferTypeOpInterface.h"
#include "mlir/Interfaces/TilingInterface.h"
#include "mlir/Interfaces/ValueBoundsOpInterface.h"
@@ -621,6 +622,12 @@ struct UnPackOpTiling
SmallVectorImpl<OpFoldResult> &resultOffsets,
SmallVectorImpl<OpFoldResult> &resultSizes) const {
auto unPackOp = cast<UnPackOp>(op);
+ // If the operand tile is the dest, then no adjustment is needed.
+ if (operandNumber == unPackOp.getDestMutable().getOperandNumber()) {
+ resultOffsets = llvm::to_vector(offsets);
+ resultSizes = llvm::to_vector(sizes);
+ return success();
+ }
Location loc = unPackOp.getLoc();
int64_t numTiles = unPackOp.getInnerDimsPos().size();
@@ -629,6 +636,11 @@ struct UnPackOpTiling
// The tiling is applied on interchanged dimensions. We have to undo the
// interchange to map sizes and offsets to the original input.
int64_t outputRank = unPackOp.getDestRank();
+ ReifiedRankedShapedTypeDims reifiedReturnShapes;
+ if (failed(reifyResultShapes(b, unPackOp, reifiedReturnShapes))) {
+ return failure();
+ }
+ SmallVector<OpFoldResult> outputMixedSizes = reifiedReturnShapes.front();
SmallVector<OpFoldResult> origOffsets(destOffsets);
SmallVector<OpFoldResult> origSizes(destSizes);
applyPermToRange(origOffsets, origSizes,
@@ -640,18 +652,21 @@ struct UnPackOpTiling
for (auto dim : llvm::seq<int64_t>(0, outputRank)) {
using AV = affine::AffineValueExpr;
affine::AffineBuilder ab(b, loc);
- AffineExpr dim0, dim1, sym;
+ AffineExpr dim0, dim1, sym0;
bindDims(b.getContext(), dim0, dim1);
- bindSymbols(b.getContext(), sym);
+ bindSymbols(b.getContext(), sym0);
if (dimAndTileMapping.count(dim)) {
// If the data dimension is tiled, the i-th index is the product of
// offset_i and tile_i, and the i-th size is the product of sizes_i and
- // tile_i.
+ // tile_i. The sizes are clamped so a tile never extends past the end of the unpack result.
auto avOffset = AV(dim0).bind(origOffsets[dim]);
auto avSize = AV(dim0).bind(origSizes[dim]);
- auto avTileSize = AV(sym).bind(dimAndTileMapping[dim]);
+ auto avTileSize = AV(sym0).bind(dimAndTileMapping[dim]);
+ auto avResultSize = AV(dim0).bind(outputMixedSizes[dim]);
resultOffsets.push_back(ab.mul(avOffset, avTileSize));
- resultSizes.push_back(ab.mul(avSize, avTileSize));
+ auto avResultOffset = AV(dim1).bind(resultOffsets.back());
+ resultSizes.push_back(ab.min({ab.mul(avSize, avTileSize),
+ ab.sub(avResultSize, avResultOffset)}));
} else {
resultOffsets.push_back(origOffsets[dim]);
resultSizes.push_back(origSizes[dim]);
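
To make the clamp concrete, here is a minimal standalone sketch of the per-dimension arithmetic (plain C++ with illustrative names, not the MLIR builder code above): the result offset scales the operand offset by the inner tile size, and the result size is the full tile extent clamped against what remains of the reified result dimension.

// Minimal standalone sketch of the per-dimension clamp built above with
// ab.mul/ab.min. Names and values are illustrative; the real code emits
// affine.apply/affine.min over OpFoldResults.
#include <algorithm>
#include <cassert>
#include <cstdint>

int64_t unpackResultOffset(int64_t srcOffset, int64_t innerTile) {
  return srcOffset * innerTile; // ab.mul(avOffset, avTileSize)
}

int64_t unpackResultSize(int64_t srcOffset, int64_t srcSize,
                         int64_t innerTile, int64_t resultDimSize) {
  int64_t fullSize = srcSize * innerTile; // ab.mul(avSize, avTileSize)
  int64_t remaining = resultDimSize - unpackResultOffset(srcOffset, innerTile);
  return std::min(fullSize, remaining); // ab.min({fullSize, remaining})
}

int main() {
  // Values from the tests in this PR: inner tile 32, operand tile of size 32
  // at offset 32 along the packed dimension.
  assert(unpackResultSize(32, 32, 32, 2048) == 1024); // aligned: min is a no-op
  assert(unpackResultSize(32, 32, 32, 2047) == 1023); // unaligned: clamped
  return 0;
}
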
diff --git a/mlir/test/Interfaces/TilingInterface/tile-and-fuse-consumer.mlir b/mlir/test/Interfaces/TilingInterface/tile-and-fuse-consumer.mlir
index f5f703d95e2d5b..d4ed3e3419a70a 100644
--- a/mlir/test/Interfaces/TilingInterface/tile-and-fuse-consumer.mlir
+++ b/mlir/test/Interfaces/TilingInterface/tile-and-fuse-consumer.mlir
@@ -265,7 +265,7 @@ module {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0 = arith.constant 0 : index
- %1 = scf.forall (%arg3, %arg4) in (2, 2) shared_outs(%arg5 = %arg2) -> (tensor<64x32xf32>) {
+ %1 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 32) step (32, 32) shared_outs(%arg5 = %arg2) -> (tensor<64x32xf32>) {
%extracted_slice = tensor.extract_slice %arg5[%arg3, %arg4] [32, 32] [1, 1] : tensor<64x32xf32> to tensor<32x32xf32>
%3 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%arg0, %arg1 : tensor<32x32xf32>, tensor<32x32xf32>) outs(%extracted_slice : tensor<32x32xf32>) {
^bb0(%in: f32, %in_16: f32, %out: f32):
@@ -292,26 +292,28 @@ module attributes {transform.with_named_sequence} {
transform.yield
}
}
-// CHECK: #[[UNPACK_RESULT_MAP:.*]] = affine_map<(d0) -> (d0 * 32)>
+// CHECK-DAG: #[[UNPACK_RESULT_OFFSET_MAP:.*]] = affine_map<(d0) -> (d0 * 32)>
+// CHECK-DAG: #[[UNPACK_RESULT_SIZE_MAP:.*]] = affine_map<(d0) -> (1024, d0 * -32 + 2048)>
// CHECK: func.func @fuse_unpack_consumer_into_scf_forall(
// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: tensor<32x32xf32>
// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: tensor<32x32xf32>
// CHECK-SAME: %[[ARG2:[a-zA-Z0-9]+]]: tensor<64x32xf32>)
// CHECK: %[[OUT_INIT:.*]] = tensor.empty() : tensor<2048xf32>
-// CHECK: %[[FINAL_RESULT:.*]]:2 = scf.forall (%[[IV1:.*]], %[[IV2:.*]]) in (2, 2)
+// CHECK: %[[FINAL_RESULT:.*]]:2 = scf.forall (%[[IV1:.*]], %[[IV2:.*]]) = (0, 0) to (64, 32) step (32, 32)
// CHECK-SAME: shared_outs(%[[FIRST_OUT_ARG:.*]] = %[[ARG2]], %[[UNPACK_OUT_ARG:.*]] = %[[OUT_INIT]])
// CHECK-SAME: {
// CHECK: %[[GENERIC_OUT_SLICE:.*]] = tensor.extract_slice %[[FIRST_OUT_ARG]][%[[IV1]], %[[IV2]]] [32, 32] [1, 1]
// CHECK: %[[GENERIC_OUT:.*]] = linalg.generic
// CHECK-SAME: outs(%[[GENERIC_OUT_SLICE]] :
-// CHECK: %[[UNPACK_RESULT_OFFSET:.*]] = affine.apply #[[UNPACK_RESULT_MAP]](%[[IV1]])
-// CHECK: %[[TILED_UNPACK_DEST:.*]] = tensor.extract_slice %[[UNPACK_OUT_ARG]][%[[UNPACK_RESULT_OFFSET]]] [1024] [1]
+// CHECK-DAG: %[[UNPACK_RESULT_OFFSET:.*]] = affine.apply #[[UNPACK_RESULT_OFFSET_MAP]](%[[IV1]])
+// CHECK-DAG: %[[UNPACK_RESULT_SIZE:.*]] = affine.min #[[UNPACK_RESULT_SIZE_MAP]](%[[IV1]])
+// CHECK: %[[TILED_UNPACK_DEST:.*]] = tensor.extract_slice %[[UNPACK_OUT_ARG]][%[[UNPACK_RESULT_OFFSET]]] [%[[UNPACK_RESULT_SIZE]]] [1]
// CHECK: %[[TILED_UNPACK_OUT:.*]] = tensor.unpack %[[GENERIC_OUT]]
// CHECK-SAME: outer_dims_perm = [0] inner_dims_pos = [0] inner_tiles = [32]
// CHECK-SAME: into %[[TILED_UNPACK_DEST]]
// CHECK: scf.forall.in_parallel {
// CHECK: tensor.parallel_insert_slice %[[GENERIC_OUT]] into %[[FIRST_OUT_ARG]][%[[IV1]], %[[IV2]]] [32, 32] [1, 1]
-// CHECK: tensor.parallel_insert_slice %[[TILED_UNPACK_OUT]] into %[[UNPACK_OUT_ARG]][%[[UNPACK_RESULT_OFFSET]]] [1024] [1]
+// CHECK: tensor.parallel_insert_slice %[[TILED_UNPACK_OUT]] into %[[UNPACK_OUT_ARG]][%[[UNPACK_RESULT_OFFSET]]] [%[[UNPACK_RESULT_SIZE]]] [1]
// CHECK: }
// CHECK: }
// CHECK: return %[[FINAL_RESULT]]#1 :
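
A note on the updated CHECK lines above: for the aligned 2048-element result, the new size map is semantically a no-op, so this test only changes form (an affine.min appears) rather than the values computed. A quick standalone check of the two maps, assuming the loop IV takes the values 0 and 32 from the forall bounds:

// Standalone evaluation (illustrative, not part of the patch) of the
// CHECK'd maps: affine_map<(d0) -> (d0 * 32)> for the offset and
// affine_map<(d0) -> (1024, d0 * -32 + 2048)> under affine.min for the size.
#include <algorithm>
#include <cassert>
#include <cstdint>

int main() {
  for (int64_t d0 : {0, 32}) { // IV1 values of the scf.forall
    int64_t offset = d0 * 32;
    int64_t size = std::min<int64_t>(1024, d0 * -32 + 2048);
    assert(size == 1024);          // clamp never fires when aligned
    assert(offset + size <= 2048); // tile stays in bounds
  }
  return 0;
}
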
From ab7a5b267189205cca7917e9b7b0c4ca73bc083c Mon Sep 17 00:00:00 2001
From: Max Dawkins <max.dawkins at gmail.com>
Date: Tue, 29 Oct 2024 10:10:56 -0400
Subject: [PATCH 2/2] address comments
Signed-off-by: Max Dawkins <max.dawkins at gmail.com>
---
.../Tensor/IR/TensorTilingInterfaceImpl.cpp | 3 +-
.../tile-and-fuse-consumer.mlir | 61 +++++++++++++++++++
2 files changed, 62 insertions(+), 2 deletions(-)
diff --git a/mlir/lib/Dialect/Tensor/IR/TensorTilingInterfaceImpl.cpp b/mlir/lib/Dialect/Tensor/IR/TensorTilingInterfaceImpl.cpp
index f14054544d0439..68c3d1cabb11cb 100644
--- a/mlir/lib/Dialect/Tensor/IR/TensorTilingInterfaceImpl.cpp
+++ b/mlir/lib/Dialect/Tensor/IR/TensorTilingInterfaceImpl.cpp
@@ -637,9 +637,8 @@ struct UnPackOpTiling
// interchange to map sizes and offsets to the original input.
int64_t outputRank = unPackOp.getDestRank();
ReifiedRankedShapedTypeDims reifiedReturnShapes;
- if (failed(reifyResultShapes(b, unPackOp, reifiedReturnShapes))) {
+ if (failed(reifyResultShapes(b, unPackOp, reifiedReturnShapes)))
return failure();
- }
SmallVector<OpFoldResult> outputMixedSizes = reifiedReturnShapes.front();
SmallVector<OpFoldResult> origOffsets(destOffsets);
SmallVector<OpFoldResult> origSizes(destSizes);
diff --git a/mlir/test/Interfaces/TilingInterface/tile-and-fuse-consumer.mlir b/mlir/test/Interfaces/TilingInterface/tile-and-fuse-consumer.mlir
index d4ed3e3419a70a..6dddc88681a325 100644
--- a/mlir/test/Interfaces/TilingInterface/tile-and-fuse-consumer.mlir
+++ b/mlir/test/Interfaces/TilingInterface/tile-and-fuse-consumer.mlir
@@ -320,6 +320,67 @@ module attributes {transform.with_named_sequence} {
// -----
+#map = affine_map<(d0, d1) -> (d0, d1)>
+module {
+ func.func @fuse_unaligned_unpack_consumer_into_scf_forall(%arg0: tensor<32x32xf32>, %arg1: tensor<32x32xf32>, %arg2: tensor<64x32xf32>) -> tensor<2047xf32> {
+ %c4 = arith.constant 4 : index
+ %c64 = arith.constant 64 : index
+ %c0 = arith.constant 0 : index
+ %1 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 32) step (32, 32) shared_outs(%arg5 = %arg2) -> (tensor<64x32xf32>) {
+ %extracted_slice = tensor.extract_slice %arg5[%arg3, %arg4] [32, 32] [1, 1] : tensor<64x32xf32> to tensor<32x32xf32>
+ %3 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%arg0, %arg1 : tensor<32x32xf32>, tensor<32x32xf32>) outs(%extracted_slice : tensor<32x32xf32>) {
+ ^bb0(%in: f32, %in_16: f32, %out: f32):
+ %13 = arith.mulf %in, %in_16 : f32
+ %14 = arith.addf %out, %13 : f32
+ linalg.yield %14 : f32
+ } -> tensor<32x32xf32>
+ scf.forall.in_parallel {
+ tensor.parallel_insert_slice %3 into %arg5[%arg3, %arg4] [32, 32] [1, 1] : tensor<32x32xf32> into tensor<64x32xf32>
+ }
+ }
+ %output = tensor.empty() : tensor<2047xf32>
+ %unpack = tensor.unpack %1 outer_dims_perm = [0] inner_dims_pos = [0] inner_tiles = [32] into %output : tensor<64x32xf32> -> tensor<2047xf32>
+ return %unpack : tensor<2047xf32>
+ }
+}
+
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg1 : !transform.any_op {transform.readonly}) {
+ %slice_op = transform.structured.match ops{["tensor.parallel_insert_slice"]} in %arg1
+ : (!transform.any_op) -> !transform.any_op
+ %a, %b = transform.test.fuse_consumer %slice_op
+ : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+ transform.yield
+ }
+}
+// CHECK-DAG: #[[UNPACK_RESULT_OFFSET_MAP:.*]] = affine_map<(d0) -> (d0 * 32)>
+// CHECK-DAG: #[[UNPACK_RESULT_SIZE_MAP:.*]] = affine_map<(d0) -> (1024, d0 * -32 + 2047)>
+// CHECK: func.func @fuse_unaligned_unpack_consumer_into_scf_forall(
+// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: tensor<32x32xf32>
+// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: tensor<32x32xf32>
+// CHECK-SAME: %[[ARG2:[a-zA-Z0-9]+]]: tensor<64x32xf32>)
+// CHECK: %[[OUT_INIT:.*]] = tensor.empty() : tensor<2047xf32>
+// CHECK: %[[FINAL_RESULT:.*]]:2 = scf.forall (%[[IV1:.*]], %[[IV2:.*]]) = (0, 0) to (64, 32) step (32, 32)
+// CHECK-SAME: shared_outs(%[[FIRST_OUT_ARG:.*]] = %[[ARG2]], %[[UNPACK_OUT_ARG:.*]] = %[[OUT_INIT]])
+// CHECK-SAME: {
+// CHECK: %[[GENERIC_OUT_SLICE:.*]] = tensor.extract_slice %[[FIRST_OUT_ARG]][%[[IV1]], %[[IV2]]] [32, 32] [1, 1]
+// CHECK: %[[GENERIC_OUT:.*]] = linalg.generic
+// CHECK-SAME: outs(%[[GENERIC_OUT_SLICE]] :
+// CHECK-DAG: %[[UNPACK_RESULT_OFFSET:.*]] = affine.apply #[[UNPACK_RESULT_OFFSET_MAP]](%[[IV1]])
+// CHECK-DAG: %[[UNPACK_RESULT_SIZE:.*]] = affine.min #[[UNPACK_RESULT_SIZE_MAP]](%[[IV1]])
+// CHECK: %[[TILED_UNPACK_DEST:.*]] = tensor.extract_slice %[[UNPACK_OUT_ARG]][%[[UNPACK_RESULT_OFFSET]]] [%[[UNPACK_RESULT_SIZE]]] [1]
+// CHECK: %[[TILED_UNPACK_OUT:.*]] = tensor.unpack %[[GENERIC_OUT]]
+// CHECK-SAME: outer_dims_perm = [0] inner_dims_pos = [0] inner_tiles = [32]
+// CHECK-SAME: into %[[TILED_UNPACK_DEST]]
+// CHECK: scf.forall.in_parallel {
+// CHECK: tensor.parallel_insert_slice %[[GENERIC_OUT]] into %[[FIRST_OUT_ARG]][%[[IV1]], %[[IV2]]] [32, 32] [1, 1]
+// CHECK: tensor.parallel_insert_slice %[[TILED_UNPACK_OUT]] into %[[UNPACK_OUT_ARG]][%[[UNPACK_RESULT_OFFSET]]] [%[[UNPACK_RESULT_SIZE]]] [1]
+// CHECK: }
+// CHECK: }
+// CHECK: return %[[FINAL_RESULT]]#1 :
+
+// -----
+
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
func.func @fuse_pack_consumer_into_scf_forall(%arg0: tensor<32x32xf32>, %arg1: tensor<32x32xf32>, %arg2: tensor<64x32xf32>) -> tensor<4x32x16xf32> {
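
The numbers in the new @fuse_unaligned_unpack_consumer_into_scf_forall test above follow directly from the clamp (a quick illustrative check, same arithmetic as the sketch earlier): with a 2047-element result, the second tile is cut short.

// Illustrative check of the unaligned test's size map,
// affine_map<(d0) -> (1024, d0 * -32 + 2047)> under affine.min.
#include <algorithm>
#include <cassert>
#include <cstdint>

int main() {
  assert(std::min<int64_t>(1024, 0 * -32 + 2047) == 1024);  // first tile: full
  assert(std::min<int64_t>(1024, 32 * -32 + 2047) == 1023); // last tile: clamped to fit tensor<2047xf32>
  return 0;
}
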