[llvm-branch-commits] [mlir] dbf9bed - [mlir][Linalg] Add a hoistPaddingOnTensors transformation
Nicolas Vasilache via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Mon Jan 25 04:46:57 PST 2021
Author: Nicolas Vasilache
Date: 2021-01-25T12:41:18Z
New Revision: dbf9bedf40792cf8c5492a27b61809737793b9c7
URL: https://github.com/llvm/llvm-project/commit/dbf9bedf40792cf8c5492a27b61809737793b9c7
DIFF: https://github.com/llvm/llvm-project/commit/dbf9bedf40792cf8c5492a27b61809737793b9c7.diff
LOG: [mlir][Linalg] Add a hoistPaddingOnTensors transformation
This transformation anchors on a padding op whose result is only used as an input
to a Linalg op and pulls it out of a given number of loops.
The result is a packing of padded tiles that is amortized just before
the outermost loop from which the pad operation is hoisted.
Differential revision: https://reviews.llvm.org/D95243
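For readers who want to try the new entry point, here is a minimal sketch, modeled on the test pass added in this commit, of how a caller might drive it. The function name `hoistAllSimplePads` and the choice to walk from a generic `Operation *` root are illustrative only; the test pass below walks a `FuncOp` instead.

```
#include "mlir/Dialect/Linalg/IR/LinalgOps.h"
#include "mlir/Dialect/Linalg/Transforms/Hoisting.h"

// Hoist every linalg.simple_pad found under `root` across at most two
// enclosing loops. Ops that fail the prerequisites are left untouched.
void hoistAllSimplePads(mlir::Operation *root) {
  root->walk([](mlir::linalg::SimplePadOp simplePadOp) {
    // On success, `simplePadOp` is updated in place to refer to the clone
    // created inside the packing loop; the result is ignored here, as in
    // the test pass.
    (void)mlir::linalg::hoistPaddingOnTensors(simplePadOp, /*nLoops=*/2);
  });
}
```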
Added:
mlir/test/Dialect/Linalg/hoist-padding.mlir
Modified:
mlir/include/mlir/Dialect/Linalg/Transforms/Hoisting.h
mlir/include/mlir/Dialect/StandardOps/IR/Ops.td
mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp
mlir/lib/Dialect/StandardOps/IR/Ops.cpp
mlir/test/lib/Transforms/TestLinalgTransforms.cpp
Removed:
################################################################################
diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/Hoisting.h b/mlir/include/mlir/Dialect/Linalg/Transforms/Hoisting.h
index ed585d1f5cf5..4d44b3717991 100644
--- a/mlir/include/mlir/Dialect/Linalg/Transforms/Hoisting.h
+++ b/mlir/include/mlir/Dialect/Linalg/Transforms/Hoisting.h
@@ -11,8 +11,10 @@
namespace mlir {
class FuncOp;
+struct LogicalResult;
namespace linalg {
+class SimplePadOp;
/// Hoist alloc/dealloc pairs and alloca op out of immediately enclosing
/// scf::ForOp if both conditions are true:
@@ -40,6 +42,44 @@ void hoistRedundantVectorTransfers(FuncOp func);
/// instead of buffers.
void hoistRedundantVectorTransfersOnTensor(FuncOp func);
+/// Mechanically hoist padding operations on tensors by at most `nLoops` into
+/// a new, generally larger tensor. This achieves packing of multiple padding
+/// ops into a larger tensor. On success, `simplePadOp` is replaced by the
+/// cloned version in the packing loop so the caller can continue reasoning
+/// about the padding operation.
+///
+/// Example in pseudo-mlir:
+/// =======================
+///
+/// If hoistPaddingOnTensors is called with `nLoops` = 2 on the following IR:
+/// ```
+/// scf.for (%i, %j, %k)
+/// %st0 = subtensor f(%i, %k) : ... to tensor<?x?xf32>
+/// %0 = linalg.simple_pad %st0 pad %pad :
+/// tensor<?x?xf32> to tensor<4x8xf32>
+/// compute(%0)
+/// ```
+///
+/// IR resembling the following is produced:
+///
+/// ```
+/// scf.for (%i) {
+/// %packed_init = linalg.init_tensor range(%j) : tensor<?x4x8xf32>
+/// %packed = scf.for (%k) iter_args(%p : %packed_init)
+/// %st0 = subtensor f(%i, %k) : ... to tensor<?x?xf32>
+/// %0 = linalg.simple_pad %st0 pad %pad :
+/// tensor<?x?xf32> to tensor<4x8xf32>
+/// %1 = subtensor_insert %0 into %p[%k, 0, 0][1, 4, 8][1, 1, 1] :
+/// tensor<4x8xf32> into tensor<?x4x8xf32>
+/// scf.yield %1: tensor<?x4x8xf32>
+/// } -> tensor<?x4x8xf32>
+/// scf.for (%j, %k) {
+/// %st0 = subtensor %packed [%k, 0, 0][1, 4, 8][1, 1, 1] :
+/// tensor<?x4x8xf32> to tensor<4x8xf32>
+/// compute(%st0)
+/// }
+/// }
+/// ```
+LogicalResult hoistPaddingOnTensors(SimplePadOp &simplePadOp, unsigned nLoops);
+
} // namespace linalg
} // namespace mlir
diff --git a/mlir/include/mlir/Dialect/StandardOps/IR/Ops.td b/mlir/include/mlir/Dialect/StandardOps/IR/Ops.td
index ce1907cb6435..1c21b1639b7e 100644
--- a/mlir/include/mlir/Dialect/StandardOps/IR/Ops.td
+++ b/mlir/include/mlir/Dialect/StandardOps/IR/Ops.td
@@ -3058,6 +3058,17 @@ def SubTensorOp : BaseOpWithOffsetSizesAndStrides<
// Build a SubTensorOp with all dynamic entries and custom result type.
OpBuilderDAG<(ins "RankedTensorType":$resultType, "Value":$source,
"ValueRange":$offsets, "ValueRange":$sizes, "ValueRange":$strides,
+ CArg<"ArrayRef<NamedAttribute>", "{}">:$attrs)>,
+ // Build a SubTensorOp with mixed static and dynamic entries and inferred
+ // result type.
+ OpBuilderDAG<(ins "Value":$source, "ArrayRef<OpFoldResult>":$offsets,
+ "ArrayRef<OpFoldResult>":$sizes, "ArrayRef<OpFoldResult>":$strides,
+ CArg<"ArrayRef<NamedAttribute>", "{}">:$attrs)>,
+ // Build a SubTensorOp with mixed static and dynamic entries and custom
+ // result type. If the type passed is nullptr, it is inferred.
+ OpBuilderDAG<(ins "RankedTensorType":$resultType, "Value":$source,
+ "ArrayRef<OpFoldResult>":$offsets, "ArrayRef<OpFoldResult>":$sizes,
+ "ArrayRef<OpFoldResult>":$strides,
CArg<"ArrayRef<NamedAttribute>", "{}">:$attrs)>
];
@@ -3154,6 +3165,11 @@ def SubTensorInsertOp : BaseOpWithOffsetSizesAndStrides<
// Build a SubTensorInsertOp with all dynamic entries.
OpBuilderDAG<(ins "Value":$source, "Value":$dest, "ValueRange":$offsets,
"ValueRange":$sizes, "ValueRange":$strides,
+ CArg<"ArrayRef<NamedAttribute>", "{}">:$attrs)>,
+ // Build a SubTensorInsertOp with mixed static and dynamic entries.
+ OpBuilderDAG<(ins "Value":$source, "Value":$dest,
+ "ArrayRef<OpFoldResult>":$offsets, "ArrayRef<OpFoldResult>":$sizes,
+ "ArrayRef<OpFoldResult>":$strides,
CArg<"ArrayRef<NamedAttribute>", "{}">:$attrs)>
];
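As a hedged illustration of the new mixed builders declared above (the helper name `takeRow`, the shapes, and the variable names are hypothetical, not part of the commit), the following sketch mixes a dynamic offset with static offsets, sizes, and strides via `OpFoldResult`, relying on the inferred-result-type form:

```
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/IR/Builders.h"

using namespace mlir;

// Take a 1x8 slice of a 2-D tensor starting at row `iv`: the row offset is a
// dynamic Value, everything else is a static IntegerAttr.
static Value takeRow(OpBuilder &b, Location loc, Value source, Value iv) {
  SmallVector<OpFoldResult> offsets = {iv, b.getIndexAttr(0)};
  SmallVector<OpFoldResult> sizes = {b.getIndexAttr(1), b.getIndexAttr(8)};
  SmallVector<OpFoldResult> strides(2, b.getIndexAttr(1));
  // The result type is inferred when it is not passed explicitly.
  return b.create<SubTensorOp>(loc, source, offsets, sizes, strides);
}
```

The hoisting code below uses the same pattern, passing `simplePadOp.getResultType()` explicitly to the custom-result-type form.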
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp b/mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp
index 98d61fa6a8d9..7f1ead8ca386 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp
@@ -334,3 +334,253 @@ void mlir::linalg::hoistRedundantVectorTransfers(FuncOp func) {
});
}
}
+
+/// Ensure prerequisites that guarantee pad op hoisting can occur.
+/// Return failure when hoisting cannot be performed, i.e. if any of the
+/// following holds:
+/// 1. There exists a use of `simplePadOp` that is not a linalg input operand.
+/// 2. There isn't an enclosing `outermostEnclosingForOp` loop.
+/// 3. There exists an op with a region that is dominated by
+/// `outermostEnclosingForOp` and that isn't a LoopLikeInterface or a
+/// LinalgOp.
+///
+/// While ensuring prerequisites:
+/// 1. Fill the `backwardSlice` to contain the topologically sorted ops
+/// dominated by `outermostEnclosingForOp`.
+/// 2. Fill the `packingLoops` to contain only the enclosing loops of
+/// `backwardSlice` whose IV is actually used in computing padding. Loops that
+/// remain in `backwardSlice` but that are not in `packingLoops` are
+/// dimensions of reuse.
+static LogicalResult
+hoistPaddingOnTensorsPrerequisites(linalg::SimplePadOp simplePadOp, int nLevels,
+ llvm::SetVector<Operation *> &backwardSlice,
+ llvm::SetVector<Operation *> &packingLoops) {
+ // Bail on any use that isn't an input of a Linalg op.
+ // Hoisting of inplace updates happens after vectorization.
+ for (OpOperand &use : simplePadOp.result().getUses()) {
+ auto linalgUser = dyn_cast<linalg::LinalgOp>(use.getOwner());
+ if (!linalgUser || !linalgUser.isInputTensor(&use))
+ return failure();
+ }
+
+ // Get at most nLevels of enclosing loops.
+ SmallVector<LoopLikeOpInterface> reverseEnclosingLoops;
+ Operation *outermostEnclosingForOp = nullptr,
+ *nextEnclosingForOp =
+ simplePadOp->getParentOfType<LoopLikeOpInterface>();
+ while (nLevels-- > 0 && nextEnclosingForOp) {
+ outermostEnclosingForOp = nextEnclosingForOp;
+ reverseEnclosingLoops.push_back(outermostEnclosingForOp);
+ nextEnclosingForOp =
+ nextEnclosingForOp->getParentOfType<LoopLikeOpInterface>();
+ }
+ if (!outermostEnclosingForOp)
+ return failure();
+
+ // Get the backwards slice from `simplePadOp` that is dominated by the
+ // outermost enclosing loop.
+ DominanceInfo domInfo(outermostEnclosingForOp);
+ getBackwardSlice(simplePadOp, &backwardSlice, [&](Operation *op) {
+ return domInfo.dominates(outermostEnclosingForOp, op);
+ });
+
+ // Bail on any op with a region that is not a LoopLikeInterface or a LinalgOp.
+ if (llvm::any_of(backwardSlice, [](Operation *op) {
+ return op->getNumRegions() > 0 && !isa<LoopLikeOpInterface>(op) &&
+ !isa<LinalgOp>(op);
+ }))
+ return failure();
+
+ // Filter out the loops whose induction variable is not used to compute the
+ // padded result. As a first approximation, just look for IVs that have no use
+ // in the backwardSlice.
+ // These are the dimensions of reuse that we can exploit to reduce the amount
+ // of work / memory.
+ // TODO: would this optimization compose better as a canonicalization?
+ for (LoopLikeOpInterface loop : reverseEnclosingLoops) {
+ auto forOp = dyn_cast<scf::ForOp>(loop.getOperation());
+ if (!forOp)
+ continue;
+ for (Operation *user : forOp.getInductionVar().getUsers()) {
+ if (backwardSlice.contains(user)) {
+ packingLoops.insert(forOp);
+ break;
+ }
+ }
+ }
+
+ // Backward slice is a topologically sorted list of ops starting at
+ // `outermostEnclosingForOp`.
+ assert(outermostEnclosingForOp == backwardSlice.front());
+
+ return success();
+}
+
+static Value buildLoopTripCount(OpBuilder &b, Operation *op) {
+ MLIRContext *ctx = op->getContext();
+ AffineExpr lb, ub, step = getAffineSymbolExpr(0, ctx);
+ bindDims(ctx, lb, ub);
+ scf::ForOp forOp = cast<scf::ForOp>(op);
+ return b.create<AffineApplyOp>(
+ op->getLoc(), AffineMap::get(2, 1, {(ub - lb).ceilDiv(step)}, ctx),
+ ValueRange{forOp.lowerBound(), forOp.upperBound(), forOp.step()});
+}
+
+/// Mechanically hoist padding operations on tensors by at most `nLoops` into a
+/// new, generally larger tensor. This achieves packing of multiple padding ops
+/// into a larger tensor. On success, `simplePadOp` is replaced by the cloned
+/// version in the packing loop so the caller can continue reasoning about the
+/// padding operation.
+///
+/// Example in pseudo-mlir:
+/// =======================
+///
+/// If hoistPaddingOnTensors is called with `nLoops` = 2 on the following IR:
+/// ```
+/// scf.for (%i, %j, %k)
+/// %st0 = subtensor f(%i, %k) : ... to tensor<?x?xf32>
+/// %0 = linalg.simple_pad %st0 pad %pad :
+/// tensor<?x?xf32> to tensor<4x8xf32>
+/// compute(%0)
+/// ```
+///
+/// IR resembling the following is produced:
+///
+/// ```
+/// scf.for (%i) {
+/// %packed_init = linalg.init_tensor range(%j) : tensor<?x4x8xf32>
+/// %packed = scf.for (%k) iter_args(%p : %packed_init)
+/// %st0 = subtensor f(%i, %k) : ... to tensor<?x?xf32>
+/// %0 = linalg.simple_pad %st0 pad %pad :
+/// tensor<?x?xf32> to tensor<4x8xf32>
+/// %1 = subtensor_insert %0 into %p[%k, 0, 0][1, 4, 8][1, 1, 1] :
+/// tensor<4x8xf32> into tensor<?x4x8xf32>
+/// scf.yield %1: tensor<?x4x8xf32>
+/// } -> tensor<?x4x8xf32>
+/// scf.for (%j, %k) {
+/// %st0 = subtensor %packed [%k, 0, 0][1, 4, 8][1, 1, 1] :
+/// tensor<?x4x8xf32> to tensor<4x8xf32>
+/// compute(%st0)
+/// }
+/// }
+/// ```
+LogicalResult mlir::linalg::hoistPaddingOnTensors(SimplePadOp &simplePadOp,
+ unsigned nLoops) {
+ llvm::SetVector<Operation *> backwardSlice, packingLoops;
+ if (failed(hoistPaddingOnTensorsPrerequisites(simplePadOp, nLoops,
+ backwardSlice, packingLoops)))
+ return failure();
+
+ // Update actual number of loops, which may be smaller.
+ nLoops = packingLoops.size();
+
+ Location loc = simplePadOp->getLoc();
+ RankedTensorType paddedTensorType = simplePadOp.getResultType();
+ unsigned paddedRank = paddedTensorType.getRank();
+
+ // Backward slice is a topologically sorted list of ops starting at
+ // `outermostEnclosingForOp`.
+ Operation *outermostEnclosingForOp = backwardSlice.front();
+ // Set the insertion point right before the outermost considered loop, above
+ // which we hoist.
+ OpBuilder b(outermostEnclosingForOp);
+
+ // Create the packed tensor<?x?x..?xpadded_shape> into which we amortize
+ // padding.
+ SmallVector<int64_t> packedShape(nLoops, ShapedType::kDynamicSize);
+ // TODO: go grab dims when necessary, for now SimplePadOp returns a static
+ // tensor.
+ llvm::append_range(packedShape, paddedTensorType.getShape());
+ auto packedTensorType =
+ RankedTensorType::get(packedShape, paddedTensorType.getElementType());
+ auto dynamicSizes = llvm::to_vector<4>(llvm::map_range(
+ packingLoops, [&](Operation *op) { return buildLoopTripCount(b, op); }));
+ Value packedTensor = b.create<linalg::InitTensorOp>(
+ loc, dynamicSizes, packedTensorType.getShape(),
+ packedTensorType.getElementType());
+
+ // Clone the operations involved in the backward slice, iteratively stepping
+ // into the loops that we encounter.
+ // The implementation proceeds in a stack-like fashion:
+ // 1. Iteratively clone and step into the loops, pushing the `packedTensor`
+ // deeper in the stack.
+ // 2. Create a SubTensorInsert at the top of the stack.
+ // 3. Iteratively pop and yield the result of the SubTensorInsertOp across
+ // the cloned loops.
+ SmallVector<Value> clonedLoopIvs;
+ clonedLoopIvs.reserve(nLoops);
+ BlockAndValueMapping bvm;
+ // Stack step 1. iteratively clone loops and push `packedTensor`.
+ // Insert `simplePadOp` into the backwardSlice so we clone it too.
+ backwardSlice.insert(simplePadOp);
+ for (Operation *op : backwardSlice) {
+ if (op->getNumRegions() == 0) {
+ b.clone(*op, bvm);
+ continue;
+ }
+ // TODO: support more cases as they appear.
+ auto forOp = dyn_cast<scf::ForOp>(op);
+ assert(forOp && "Expected scf::ForOp when hoisting pad ops");
+ // Unused loop, just skip it.
+ if (!packingLoops.contains(forOp))
+ continue;
+ auto clonedForOp =
+ b.create<scf::ForOp>(loc, forOp.lowerBound(), forOp.upperBound(),
+ forOp.step(), packedTensor);
+ assert(clonedForOp->getNumRegions() == 1);
+ clonedLoopIvs.push_back(clonedForOp.getInductionVar());
+ b.setInsertionPointToStart(&clonedForOp->getRegion(0).front());
+ bvm.map(forOp.getInductionVar(), clonedLoopIvs.back());
+ packedTensor = clonedForOp.getRegionIterArgs().front();
+ }
+
+ // Stack step 2. create SubTensorInsertOp at the top of the stack.
+ // offsets = [clonedLoopIvs, 0 .. 0].
+ SmallVector<OpFoldResult> offsets(clonedLoopIvs.begin(), clonedLoopIvs.end());
+ offsets.append(paddedRank, b.getIndexAttr(0));
+ // sizes = [1 .. 1, paddedShape].
+ SmallVector<OpFoldResult> sizes(nLoops, b.getIndexAttr(1));
+ for (int64_t sz : paddedTensorType.getShape()) {
+ // TODO: go grab dims when necessary, for now SimplePadOp returns a static
+ // tensor.
+ assert(!ShapedType::isDynamic(sz) && "padded tensor needs static sizes");
+ sizes.push_back(b.getIndexAttr(sz));
+ }
+ // strides = [1 .. 1].
+ SmallVector<OpFoldResult> strides(nLoops + paddedRank, b.getIndexAttr(1));
+
+ Value inserted =
+ b.create<SubTensorInsertOp>(loc, bvm.lookup(simplePadOp.result()),
+ packedTensor, offsets, sizes, strides);
+
+ // Stack step 3. iteratively pop the stack and propagate the yield.
+ Value valueToYield = inserted;
+ for (Value iv : llvm::reverse(clonedLoopIvs)) {
+ auto forOp = scf::getForInductionVarOwner(iv);
+ b.setInsertionPointToEnd(&forOp.getRegion().front());
+ b.create<scf::YieldOp>(loc, valueToYield);
+ valueToYield = forOp.getResult(0);
+ }
+
+ // Now the packed tensor is ready, replace the original padding op by a
+ // 1x..x1 SubTensor [originalLoopIvs, 0 .. 0][1 .. 1, paddedShape][1 .. 1].
+ b.setInsertionPoint(simplePadOp);
+ SmallVector<Value> originalLoopIvs =
+ llvm::to_vector<4>(llvm::map_range(packingLoops, [](Operation *loop) {
+ return cast<scf::ForOp>(loop).getInductionVar();
+ }));
+ // offsets = [originalLoopIvs, 0 .. 0].
+ offsets.assign(originalLoopIvs.begin(), originalLoopIvs.end());
+ offsets.append(paddedRank, b.getIndexAttr(0));
+ // sizes = [1 .. 1, paddedShape] (defined above).
+ // strides = [1 .. 1] (defined above).
+ packedTensor =
+ scf::getForInductionVarOwner(clonedLoopIvs.front())->getResult(0);
+ simplePadOp.replaceAllUsesWith(
+ b.create<SubTensorOp>(loc, simplePadOp.getResultType(), packedTensor,
+ offsets, sizes, strides)
+ ->getResult(0));
+
+ // Make the newly cloned `simplePadOp` available to the caller before erasing
+ // the original op: the `bvm` lookup needs the original op's result.
+ SimplePadOp newSimplePadOp =
+ cast<SimplePadOp>(bvm.lookup(simplePadOp.result()).getDefiningOp());
+ simplePadOp.erase();
+ simplePadOp = newSimplePadOp;
+
+ return success();
+}
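A brief worked example of the trip-count computation above, using illustrative names: for a packing loop `scf.for %k = %c0 to %N step %c4`, `buildLoopTripCount` emits an `affine.apply` of `(ub - lb) ceildiv step`, here `ceil(%N / 4)`, and that value becomes the corresponding leading `?` size of the packed `linalg.init_tensor` result, i.e. one packed slot per iteration of the hoisted loop.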
diff --git a/mlir/lib/Dialect/StandardOps/IR/Ops.cpp b/mlir/lib/Dialect/StandardOps/IR/Ops.cpp
index b8671cfe48fe..3982345080d3 100644
--- a/mlir/lib/Dialect/StandardOps/IR/Ops.cpp
+++ b/mlir/lib/Dialect/StandardOps/IR/Ops.cpp
@@ -3505,6 +3505,69 @@ void mlir::SubTensorOp::build(OpBuilder &b, OperationState &result,
staticStridesVector, offsets, sizes, strides, attrs);
}
+/// Dispatch `ofr` into either `dynamicVec` if it is a Value or into `staticVec`
+/// otherwise. In the dynamic case, `sentinel` is appended to `staticVec` to
+/// represent the dynamic value `?`.
+static void unpackOpFoldResult(OpFoldResult ofr,
+ SmallVectorImpl<Value> &dynamicVec,
+ SmallVectorImpl<int64_t> &staticVec,
+ int64_t sentinel) {
+ Value v = ofr.dyn_cast<Value>();
+ if (v) {
+ dynamicVec.push_back(v);
+ staticVec.push_back(sentinel);
+ } else {
+ APInt apInt = ofr.dyn_cast<Attribute>().cast<IntegerAttr>().getValue();
+ staticVec.push_back(apInt.getSExtValue());
+ }
+}
+
+static void unpackOpFoldResults(ArrayRef<OpFoldResult> ofrs,
+ SmallVector<Value> &dynamicVec,
+ SmallVector<int64_t> &staticVec,
+ int64_t sentinel) {
+ for (auto ofr : ofrs)
+ unpackOpFoldResult(ofr, dynamicVec, staticVec, sentinel);
+}
+
+// Build a SubTensorOp with mixed static and dynamic entries and custom result
+// type. If the type passed is nullptr, it is inferred.
+void mlir::SubTensorOp::build(OpBuilder &b, OperationState &result,
+ RankedTensorType resultType, Value source,
+ ArrayRef<OpFoldResult> offsets,
+ ArrayRef<OpFoldResult> sizes,
+ ArrayRef<OpFoldResult> strides,
+ ArrayRef<NamedAttribute> attrs) {
+ SmallVector<int64_t> staticOffsets, staticSizes, staticStrides;
+ SmallVector<Value> dynamicOffsets, dynamicSizes, dynamicStrides;
+ unpackOpFoldResults(offsets, dynamicOffsets, staticOffsets,
+ ShapedType::kDynamicStrideOrOffset);
+ unpackOpFoldResults(sizes, dynamicSizes, staticSizes,
+ ShapedType::kDynamicSize);
+ unpackOpFoldResults(strides, dynamicStrides, staticStrides,
+ ShapedType::kDynamicStrideOrOffset);
+ auto sourceRankedTensorType = source.getType().cast<RankedTensorType>();
+ // Structuring implementation this way avoids duplication between builders.
+ if (!resultType) {
+ resultType =
+ SubTensorOp::inferResultType(sourceRankedTensorType, staticOffsets,
+ staticSizes, staticStrides)
+ .cast<RankedTensorType>();
+ }
+ build(b, result, resultType, source, staticOffsets, staticSizes,
+ staticStrides, dynamicOffsets, dynamicSizes, dynamicStrides, attrs);
+}
+
+// Build a SubTensorOp with mixed static and dynamic entries and inferred result
+// type.
+void mlir::SubTensorOp::build(OpBuilder &b, OperationState &result,
+ Value source, ArrayRef<OpFoldResult> offsets,
+ ArrayRef<OpFoldResult> sizes,
+ ArrayRef<OpFoldResult> strides,
+ ArrayRef<NamedAttribute> attrs) {
+ build(b, result, RankedTensorType(), source, offsets, sizes, strides, attrs);
+}
+
/// Verifier for SubTensorOp.
static LogicalResult verify(SubTensorOp op) {
// Verify result type against inferred type.
@@ -3600,6 +3663,25 @@ void mlir::SubTensorInsertOp::build(OpBuilder &b, OperationState &result,
staticStridesVector, offsets, sizes, strides, attrs);
}
+// Build a SubTensorInsertOp with mixed static and dynamic entries.
+void mlir::SubTensorInsertOp::build(OpBuilder &b, OperationState &result,
+ Value source, Value dest,
+ ArrayRef<OpFoldResult> offsets,
+ ArrayRef<OpFoldResult> sizes,
+ ArrayRef<OpFoldResult> strides,
+ ArrayRef<NamedAttribute> attrs) {
+ SmallVector<int64_t> staticOffsets, staticSizes, staticStrides;
+ SmallVector<Value> dynamicOffsets, dynamicSizes, dynamicStrides;
+ unpackOpFoldResults(offsets, dynamicOffsets, staticOffsets,
+ ShapedType::kDynamicStrideOrOffset);
+ unpackOpFoldResults(sizes, dynamicSizes, staticSizes,
+ ShapedType::kDynamicSize);
+ unpackOpFoldResults(strides, dynamicStrides, staticStrides,
+ ShapedType::kDynamicStrideOrOffset);
+ build(b, result, source, dest, staticOffsets, staticSizes, staticStrides,
+ dynamicOffsets, dynamicSizes, dynamicStrides, attrs);
+}
+
/// Verifier for SubViewOp.
static LogicalResult verify(SubTensorInsertOp op) {
if (op.getType() != op.dest().getType())
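To illustrate the dispatch performed by `unpackOpFoldResults` with hypothetical inputs: given `offsets = {%iv, IndexAttr 0}` and the sentinel `ShapedType::kDynamicStrideOrOffset`, the helper yields `staticOffsets = {kDynamicStrideOrOffset, 0}` and `dynamicOffsets = {%iv}`, which is exactly the static/dynamic split the pre-existing all-static and all-dynamic builders take.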
diff --git a/mlir/test/Dialect/Linalg/hoist-padding.mlir b/mlir/test/Dialect/Linalg/hoist-padding.mlir
new file mode 100644
index 000000000000..27750ea8a024
--- /dev/null
+++ b/mlir/test/Dialect/Linalg/hoist-padding.mlir
@@ -0,0 +1,85 @@
+// RUN: mlir-opt %s -test-linalg-transform-patterns=test-hoist-padding-2-level -canonicalize | FileCheck %s
+
+#map0 = affine_map<(d0)[s0] -> (2, -d0 + s0)>
+#map1 = affine_map<(d0)[s0] -> (4, -d0 + s0)>
+#map2 = affine_map<(d0)[s0] -> (3, -d0 + s0)>
+#map3 = affine_map<(d0, d1) -> (2, d0 - d1)>
+#map4 = affine_map<(d0, d1) -> (3, d0 - d1)>
+
+// CHECK-LABEL: func @matmul_tensors
+func @matmul_tensors(
+ %arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>, %arg2: tensor<?x?xf32>)
+ -> tensor<?x?xf32>
+{
+ %c2 = constant 2 : index
+ %c3 = constant 3 : index
+ %c4 = constant 4 : index
+ %cst = constant 0.000000e+00 : f32
+ %c0 = constant 0 : index
+ %c1 = constant 1 : index
+ %0 = dim %arg0, %c0 : tensor<?x?xf32>
+ %1 = dim %arg0, %c1 : tensor<?x?xf32>
+ %2 = dim %arg1, %c1 : tensor<?x?xf32>
+
+ // CHECK: scf.for
+ // CHECK: linalg.init_tensor [%{{.*}}, 2, 4] : tensor<?x2x4xf32>
+ // 1-D loop
+ // CHECK: %[[A:.*]] = scf.for
+ // CHECK-NOT: scf.for
+ // CHECK: subtensor %{{.*}} [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
+ // CHECK: linalg.simple_pad %{{.*}} : tensor<?x?xf32> to tensor<2x4xf32> pad f32
+ // CHECK: subtensor_insert %{{.*}} into %{{.*}}[%{{.*}}, 0, 0]
+ // CHECK-SAME: [1, 2, 4] [1, 1, 1] : tensor<2x4xf32> into tensor<?x2x4xf32>
+ // 2-D loop
+ // CHECK: linalg.init_tensor [%{{.*}}, %{{.*}}, 4, 3] : tensor<?x?x4x3xf32>
+ // CHECK: %[[B:.*]] = scf.for
+ // CHECK: scf.for
+ // CHECK-NOT: scf.for
+ // CHECK: subtensor %{{.*}} [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
+ // CHECK: linalg.simple_pad %{{.*}} : tensor<?x?xf32> to tensor<4x3xf32> pad f32
+ // CHECK: subtensor_insert %{{.*}} into %{{.*}}[%{{.*}}, %{{.*}}, 0, 0]
+ // CHECK-SAME: [1, 1, 4, 3] [1, 1, 1, 1] : tensor<4x3xf32> into tensor<?x?x4x3xf32>
+ // 2-D loop
+ // CHECK: scf.for %[[J:[0-9a-zA-Z]+]]
+ // CHECK: scf.for %[[K:[0-9a-zA-Z]+]]
+ // CHECK-NOT: scf.for
+ // CHECK: %[[stA:.*]] = subtensor %[[A]][%[[K]], 0, 0] [1, 2, 4] [1, 1, 1] :
+ // CHECK-SAME: tensor<?x2x4xf32> to tensor<2x4xf32>
+ // CHECK: %[[stB:.*]] = subtensor %[[B]][%[[K]], %[[J]], 0, 0] [1, 1, 4, 3] [1, 1, 1, 1] :
+ // CHECK-SAME: tensor<?x?x4x3xf32> to tensor<4x3xf32>
+ // CHECK: %[[stC:.*]] = linalg.simple_pad %{{.*}} pad %{{.*}} :
+ // CHECK-SAME: tensor<?x?xf32> to tensor<2x3xf32> pad f32
+ // CHECK: linalg.matmul ins(%[[stA]], %[[stB]] : tensor<2x4xf32>, tensor<4x3xf32>)
+ // CHECK-SAME: outs(%[[stC]] : tensor<2x3xf32>) -> tensor<2x3xf32>
+ %3 = scf.for %arg3 = %c0 to %0 step %c2 iter_args(%arg4 = %arg2) -> (tensor<?x?xf32>) {
+ %4 = scf.for %arg5 = %c0 to %2 step %c3 iter_args(%arg6 = %arg4) -> (tensor<?x?xf32>) {
+ %5 = scf.for %arg7 = %c0 to %1 step %c4 iter_args(%arg8 = %arg6) -> (tensor<?x?xf32>) {
+ %6 = dim %arg0, %c0 : tensor<?x?xf32>
+ %7 = affine.min #map0(%arg3)[%6]
+ %8 = dim %arg0, %c1 : tensor<?x?xf32>
+ %9 = affine.min #map1(%arg7)[%8]
+ %10 = subtensor %arg0[%arg3, %arg7] [%7, %9] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
+ %11 = dim %arg1, %c0 : tensor<?x?xf32>
+ %12 = affine.min #map1(%arg7)[%11]
+ %13 = dim %arg1, %c1 : tensor<?x?xf32>
+ %14 = affine.min #map2(%arg5)[%13]
+ %15 = subtensor %arg1[%arg7, %arg5] [%12, %14] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
+ %16 = dim %arg8, %c0 : tensor<?x?xf32>
+ %17 = affine.min #map3(%16, %arg3)
+ %18 = dim %arg8, %c1 : tensor<?x?xf32>
+ %19 = affine.min #map4(%18, %arg5)
+ %20 = subtensor %arg8[%arg3, %arg5] [%17, %19] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
+ %21 = linalg.simple_pad %10 pad %cst : tensor<?x?xf32> to tensor<2x4xf32> pad f32
+ %22 = linalg.simple_pad %15 pad %cst : tensor<?x?xf32> to tensor<4x3xf32> pad f32
+ %23 = linalg.simple_pad %20 pad %cst : tensor<?x?xf32> to tensor<2x3xf32> pad f32
+ %24 = linalg.matmul ins(%21, %22 : tensor<2x4xf32>, tensor<4x3xf32>) outs(%23 : tensor<2x3xf32>) -> tensor<2x3xf32>
+ %25 = subtensor %24[0, 0] [%7, %14] [1, 1] : tensor<2x3xf32> to tensor<?x?xf32>
+ %26 = subtensor_insert %25 into %arg8[%arg3, %arg5] [%17, %19] [%c1, %c1] : tensor<?x?xf32> into tensor<?x?xf32>
+ scf.yield %26 : tensor<?x?xf32>
+ }
+ scf.yield %5 : tensor<?x?xf32>
+ }
+ scf.yield %4 : tensor<?x?xf32>
+ }
+ return %3 : tensor<?x?xf32>
+}
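A note on what the CHECK lines above exercise, tied back to the prerequisites: for the pad of the A tile (`%21`), only the IV of the innermost `%arg7` loop (step 4) appears in the slice computation within the two considered loops, so a single packing loop remains and the pack is `tensor<?x2x4xf32>`, with `?` equal to that loop's trip count, `ceil(dim(%arg0, 1) / 4)`. The pad of the B tile (`%22`) uses both `%arg7` and `%arg5`, so two packing loops remain and the pack is `tensor<?x?x4x3xf32>`. The pad of the C tile (`%23`) feeds the matmul's `outs` operand rather than an input, so the prerequisites reject it and it stays inside the innermost compute loop, as checked by the `%[[stC]]` lines.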
diff --git a/mlir/test/lib/Transforms/TestLinalgTransforms.cpp b/mlir/test/lib/Transforms/TestLinalgTransforms.cpp
index 87f81dbbf1fd..a322b627756e 100644
--- a/mlir/test/lib/Transforms/TestLinalgTransforms.cpp
+++ b/mlir/test/lib/Transforms/TestLinalgTransforms.cpp
@@ -13,6 +13,7 @@
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/GPU/GPUDialect.h"
#include "mlir/Dialect/Linalg/IR/LinalgOps.h"
+#include "mlir/Dialect/Linalg/Transforms/Hoisting.h"
#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
#include "mlir/Dialect/Linalg/Utils/Utils.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
@@ -82,6 +83,9 @@ struct TestLinalgTransforms
Option<bool> testTileAndPadPattern{
*this, "test-tile-and-pad-pattern",
llvm::cl::desc("Test tile and pad pattern"), llvm::cl::init(false)};
+ Option<bool> testHoistPadding2Levels{*this, "test-hoist-padding-2-level",
+ llvm::cl::desc("Test hoist padding"),
+ llvm::cl::init(false)};
};
} // end anonymous namespace
@@ -546,6 +550,11 @@ void TestLinalgTransforms::runOnFunction() {
return applyAffineMinSCFCanonicalizationPatterns(getFunction());
if (testTileAndPadPattern)
return applyTileAndPadPattern(getFunction());
+ if (testHoistPadding2Levels) {
+ getFunction().walk([](linalg::SimplePadOp simplePadOp) {
+ linalg::hoistPaddingOnTensors(simplePadOp, 2);
+ });
+ }
}
namespace mlir {