[Mlir-commits] [mlir] [NFC] Simplify the tiling implementation using cloning. (PR #72178)
llvmlistbot at llvm.org
llvmlistbot at llvm.org
Mon Nov 13 16:43:28 PST 2023
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-mlir-tensor
Author: None (MaheshRavishankar)
<details>
<summary>Changes</summary>
The current implementation of tiling using `scf.for` is convoluted to make sure that the destination passing style of the untiled program is preserved. The addition of support to tile using `scf.forall` (adapted from the transform operation in Linalg) in https://github.com/llvm/llvm-project/pull/67083 used cloning of the tiled operations to better streamline the implementation. This PR adapts the other tiling methods to use a similar approach, making the transformations (and handling destination passing style semantics) more systematic.
---
Patch is 44.42 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/72178.diff
6 Files Affected:
- (modified) mlir/include/mlir/Dialect/SCF/Transforms/TileUsingInterface.h (+6-6)
- (modified) mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp (+274-228)
- (modified) mlir/test/Dialect/Tensor/tiling.mlir (+2-2)
- (modified) mlir/test/Interfaces/TilingInterface/tile-and-fuse-using-interface.mlir (+2-2)
- (modified) mlir/test/Interfaces/TilingInterface/tile-using-interface.mlir (+82-44)
- (modified) mlir/test/lib/Interfaces/TilingInterface/TestTilingInterface.cpp (+2)
``````````diff
diff --git a/mlir/include/mlir/Dialect/SCF/Transforms/TileUsingInterface.h b/mlir/include/mlir/Dialect/SCF/Transforms/TileUsingInterface.h
index 81325b62791c44b..2f8f337bb8057ce 100644
--- a/mlir/include/mlir/Dialect/SCF/Transforms/TileUsingInterface.h
+++ b/mlir/include/mlir/Dialect/SCF/Transforms/TileUsingInterface.h
@@ -83,6 +83,12 @@ FailureOr<SCFTilingResult> tileUsingSCFForOp(RewriterBase &rewriter,
TilingInterface op,
const SCFTilingOptions &options);
+/// Method to tile an op that implements the `TilingInterface` using
+/// `scf.forall`.
+FailureOr<SCFTilingResult>
+tileUsingSCFForallOp(RewriterBase &rewriter, TilingInterface op,
+ const SCFTilingOptions &options);
+
/// Options used to control tile + fuse.
struct SCFTileAndFuseOptions {
/// The tiling options used to control the tiling of the consumer.
@@ -93,12 +99,6 @@ struct SCFTileAndFuseOptions {
}
};
-/// Method to tile an op that implements the `TilingInterface` using
-/// `scf.forall`.
-FailureOr<SCFTilingResult>
-tileUsingSCFForallOp(RewriterBase &rewriter, TilingInterface op,
- const SCFTilingOptions &options);
-
/// Fuse the producer of the source of `candidateSliceOp` by computing the
/// required slice of the producer in-place. Note that the method
/// replaces the uses of `candidateSliceOp` with the tiled and fused producer
diff --git a/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp b/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp
index df162d29a48eb89..01102e786acf2be 100644
--- a/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp
+++ b/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp
@@ -128,6 +128,9 @@ static Operation *cloneOpAndUpdateDestinationArgs(RewriterBase &rewriter,
Operation *op,
ValueRange newDestArgs) {
Operation *clonedOp = rewriter.clone(*op);
+ if (newDestArgs.empty()) {
+ return clonedOp;
+ }
if (auto destinationStyleOp =
dyn_cast<DestinationStyleOpInterface>(clonedOp)) {
destinationStyleOp.getDpsInitsMutable().assign(newDestArgs);
@@ -139,15 +142,17 @@ static Operation *cloneOpAndUpdateDestinationArgs(RewriterBase &rewriter,
/// - `loopRanges` specifies the lb, ub and step of the untiled iteration space.
/// - `tileSizes` is the tile sizes to use. Zero represent untiled loops.
/// - In `offsets` and `sizes` return the multi-dimensional offset and size of
-/// the
-/// tile processed within the inner most loop.
+/// the tile processed within the inner most loop.
+/// Note that this methods adds `scf.yield` operation for all but the innermost
+/// loop. These yield the value returned by the immediately inner loop. The
+/// caller is expected to add the scf.yield operation for the innermost loop.
static SmallVector<scf::ForOp> generateTileLoopNest(
OpBuilder &builder, Location loc, ArrayRef<Range> loopRanges,
ArrayRef<OpFoldResult> tileSizes, SmallVector<OpFoldResult> &offsets,
- SmallVector<OpFoldResult> &sizes) {
- assert(!loopRanges.empty() && "expected at least one loop range");
- assert(loopRanges.size() == tileSizes.size() &&
- "expected as many tile sizes as loop ranges");
+ SmallVector<OpFoldResult> &sizes, ValueRange destinationTensors = {}) {
+ if (loopRanges.empty()) {
+ return {};
+ }
OpBuilder::InsertionGuard guard(builder);
SmallVector<scf::ForOp> loops;
offsets.resize(loopRanges.size());
@@ -169,136 +174,98 @@ static SmallVector<scf::ForOp> generateTileLoopNest(
}
auto loop = builder.create<scf::ForOp>(
- loc, offset, size, tileSize, ValueRange{},
+ loc, offset, size, tileSize, destinationTensors,
[&](OpBuilder &bodyBuilder, Location bodyLoc, Value iv,
ValueRange /*iterArgs*/) {
sizes[loopRange.index()] =
getBoundedTileSize(bodyBuilder, bodyLoc, loopRange.value(), iv,
getAsOpFoldResult(tileSize));
- builder.create<scf::YieldOp>(loc);
});
offsets[loopRange.index()] = loop.getInductionVar();
loops.push_back(loop);
- builder.setInsertionPoint(loop.getBody()->getTerminator());
+ builder.setInsertionPointToEnd(loop.getBody());
+ destinationTensors = loop.getRegionIterArgs();
}
- return loops;
-}
-/// For a value to be yielded (`yieldedValue`) from within a loop nest `loops`,
-/// construct the destructive update pattern that inserts the yielded
-/// value into a destination tensor provided by `initValue` at offset
-/// `tileOffsets` and size `tileSizes`. For example,
-///
-/// ```mlir
-/// scf.for %iv0 = ... {
-/// %0 = tiled_op
-/// }
-/// ```
-///
-/// is transformed to
-///
-/// ```mlir
-/// scf.for %iv0 = ... iter_args(%arg = %0) {
-/// %1 = tensor.extract_slice %arg
-/// %2 = tiled_op
-/// %3 = tensor.insert_slice %2 into %arg
-/// scf.yield %3
-/// }
-/// ```
-/// TODO: This API can be cleaned up by using `SubsetExtractOpInterface`.
-static SmallVector<Value>
-yieldTiledValues(RewriterBase &rewriter, ValueRange initValues,
- ValueRange yieldedValues,
- ArrayRef<SmallVector<OpFoldResult>> tileOffsetsList,
- ArrayRef<SmallVector<OpFoldResult>> tileSizesList,
- MutableArrayRef<scf::ForOp> loops) {
- NewYieldValuesFn yieldValueFn =
- [&](OpBuilder &b, Location loc,
- ArrayRef<BlockArgument> newBBArgs) -> SmallVector<Value> {
- SmallVector<Value> inserts;
- for (const auto &yieldedValue : llvm::enumerate(yieldedValues)) {
- ArrayRef<OpFoldResult> tileOffsets =
- tileOffsetsList[yieldedValue.index()];
- ArrayRef<OpFoldResult> tileSizes = tileSizesList[yieldedValue.index()];
- SmallVector<OpFoldResult> tileStrides(tileOffsets.size(),
- b.getIndexAttr(1));
- Value insert = b.create<tensor::InsertSliceOp>(
- loc, yieldedValue.value(), newBBArgs[yieldedValue.index()],
- tileOffsets, tileSizes, tileStrides);
- inserts.push_back(insert);
- }
- return inserts;
- };
-
- SmallVector<scf::ForOp> newLoops =
- replaceLoopNestWithNewYields(rewriter, loops, initValues, yieldValueFn,
- /*replaceIterOperandsUsesInLoop =*/false);
- for (const auto &loop : llvm::enumerate(loops)) {
- loops[loop.index()] = newLoops[loop.index()];
+ // Add the scf.yield operations for all the outer loops.
+ for (auto [outerLoop, innerLoop] :
+ llvm::zip(MutableArrayRef(loops).drop_back(),
+ MutableArrayRef(loops).drop_front())) {
+ builder.setInsertionPointToEnd(outerLoop.getBody());
+ builder.create<scf::YieldOp>(outerLoop.getLoc(), innerLoop.getResults());
}
- return llvm::to_vector(llvm::map_range(
- loops.front().getResults().take_back(yieldedValues.size()),
- [](OpResult r) -> Value { return r; }));
+ return loops;
}
-/// If the tiled operation is destination passing style, update the
-/// slice of the destination used (which refers to the untiled destination)
-/// to use the corresponding region argument of the innermost loop.
-///
-/// ```mlir
-/// %0 =
-/// scf.for %iv0 = ... iter_args(%arg = %0) {
-/// %1 = tensor.extract_slice %0
-/// %2 = tiled_op
-/// %3 = tensor.insert_slice %2 into %arg
-/// scf.yield %3
-/// }
-/// ```
-///
-/// is transformed to
-///
-/// ```mlir
-/// scf.for %iv0 = ... iter_args(%arg = %0) {
-/// %1 = tensor.extract_slice %arg
-/// %2 = tiled_op
-/// %3 = tensor.insert_slice %2 into %arg
-/// scf.yield %3
-/// }
-/// ```
-static void
-updateDestinationOperandsForTiledOp(OpBuilder &builder,
- ValueRange tiledOpDestinationValues,
- ValueRange bbArgsList) {
- for (const auto &destValue : llvm::enumerate(tiledOpDestinationValues)) {
- auto sliceOp = destValue.value().getDefiningOp<tensor::ExtractSliceOp>();
- if (!sliceOp)
- continue;
- sliceOp.setOperand(0, bbArgsList[destValue.index()]);
+/// Method to add new init values to a loop nest. Updates `loops` in-place with
+/// new loops that use the `newInitValues`.
+/// The outer-loops are updated to yield the new result values of the inner
+/// loop. For the innermost loop, the call back `getNewYields` is invoked to get
+/// the additional values to yield form the innermost loop.
+static void addInitOperandsToLoopNest(
+ RewriterBase &rewriter, MutableArrayRef<scf::ForOp> loops,
+ ValueRange newInitValues,
+ llvm::function_ref<SmallVector<Value>(RewriterBase &rewriter, Value iv,
+ ValueRange newRegionIterArgs)>
+ getNewYieldValsFn) {
+ SmallVector<scf::ForOp> newLoops;
+ if (loops.empty()) {
+ return;
+ }
+ OpBuilder::InsertionGuard g(rewriter);
+ rewriter.setInsertionPoint(loops.front());
+ for (auto &loop : loops) {
+ rewriter.setInsertionPoint(loop);
+
+ // Create a new loop with the new init values for this loop.
+ SmallVector<Value> newInits = llvm::to_vector(loop.getInitArgs());
+ newInits.append(newInitValues.begin(), newInitValues.end());
+ auto newLoop = rewriter.create<scf::ForOp>(
+ loop.getLoc(), loop.getLowerBound(), loop.getUpperBound(),
+ loop.getStep(), newInits,
+ [&](OpBuilder &b, Location loc, Value iv, ValueRange iterArgs) {});
+
+ // Merge the body of the new loop with the body of the old loops.
+ SmallVector<Value> sourceBlockArgs;
+ sourceBlockArgs.push_back(newLoop.getInductionVar());
+ auto newRegionIterArgs = newLoop.getRegionIterArgs();
+ sourceBlockArgs.append(
+ newRegionIterArgs.begin(),
+ std::next(newRegionIterArgs.begin(), loop.getNumResults()));
+ rewriter.mergeBlocks(loop.getBody(), newLoop.getBody(), sourceBlockArgs);
+ rewriter.replaceOp(loop,
+ newLoop.getResults().take_front(loop.getNumResults()));
+ loop = newLoop;
+ newInitValues = newLoop.getRegionIterArgs().take_back(newInitValues.size());
}
-}
-/// Helper method to yield the values of the tiled op, as well as
-/// update the destination operands of the tiled op, if it is
-/// a destination passing style op.
-static SmallVector<Value>
-yieldTiledValues(RewriterBase &rewriter, ArrayRef<Value> initValues,
- TilingResult tilingResult,
- ArrayRef<SmallVector<OpFoldResult>> tileOffsetsList,
- ArrayRef<SmallVector<OpFoldResult>> tileSizesList,
- MutableArrayRef<scf::ForOp> loops) {
- SmallVector<Value> replacements =
- yieldTiledValues(rewriter, initValues, tilingResult.tiledValues,
- tileOffsetsList, tileSizesList, loops);
- for (auto tiledOp : tilingResult.tiledOps) {
- if (auto dstOp = dyn_cast<DestinationStyleOpInterface>(tiledOp)) {
- auto innerMostLoop = loops.back();
- SmallVector<Value> tiledOpDestinationTensors =
- llvm::to_vector(dstOp.getDpsInits());
- updateDestinationOperandsForTiledOp(rewriter, tiledOpDestinationTensors,
- innerMostLoop.getRegionIterArgs());
- }
+ // Update the loop body of the innermost loop to get new yield values.
+ scf::ForOp innerMostLoop = loops.back();
+ auto innerMostYieldOp =
+ cast<scf::YieldOp>(innerMostLoop.getBody()->getTerminator());
+ rewriter.setInsertionPoint(innerMostYieldOp);
+ SmallVector<Value> newYieldVals =
+ getNewYieldValsFn(rewriter, innerMostLoop.getInductionVar(),
+ innerMostLoop.getRegionIterArgs());
+ SmallVector<Value> newYieldOperands =
+ llvm::to_vector(innerMostYieldOp->getOperands());
+ newYieldOperands.append(newYieldVals);
+ rewriter.replaceOpWithNewOp<scf::YieldOp>(innerMostYieldOp, newYieldOperands);
+
+ // Make all other loops except the innermost loops yield the values returned
+ // by the inner loop.
+ for (auto [outerLoop, innerLoop] :
+ llvm::zip(loops.drop_back(), loops.drop_front())) {
+ auto outerLoopYield =
+ cast<scf::YieldOp>(outerLoop.getBody()->getTerminator());
+ SmallVector<Value> newYields =
+ llvm::to_vector(outerLoopYield.getOperands());
+ ValueRange additionalYields =
+ innerLoop.getResults().take_back(newInitValues.size());
+ newYields.append(additionalYields.begin(), additionalYields.end());
+ rewriter.setInsertionPoint(outerLoopYield);
+ rewriter.replaceOpWithNewOp<scf::YieldOp>(outerLoopYield, newYields);
}
- return replacements;
}
/// Implementation of tiling transformation of `op` that implements the
@@ -317,10 +284,6 @@ mlir::scf::tileUsingSCFForOp(RewriterBase &rewriter, TilingInterface op,
// 1. Get the range of the loops that are represented by the operation.
SmallVector<Range> iterationDomain = op.getIterationDomain(rewriter);
size_t numLoops = iterationDomain.size();
- if (numLoops == 0) {
- return rewriter.notifyMatchFailure(
- op, "unable to tile op with no iteration domain");
- }
// 2. Materialize the tile sizes. Enforce the convention that "tiling by zero"
// skips tiling a particular dimension. This convention is significantly
@@ -333,6 +296,14 @@ mlir::scf::tileUsingSCFForOp(RewriterBase &rewriter, TilingInterface op,
tileSizeVector.append(numLoops - tileSizeVector.size(), zero);
}
+ // 3. Find the destination tensors to use for the operation.
+ SmallVector<Value> destinationTensors;
+ if (failed(tensor::getOrCreateDestinations(rewriter, op.getLoc(), op,
+ destinationTensors))) {
+ return rewriter.notifyMatchFailure(op,
+ "unable to create destination tensors");
+ }
+
SmallVector<OpFoldResult> offsets, sizes;
SmallVector<scf::ForOp> forLoops;
{
@@ -354,11 +325,12 @@ mlir::scf::tileUsingSCFForOp(RewriterBase &rewriter, TilingInterface op,
applyPermutationToVector(tileSizeVector, interchangeVector);
}
- // 3. Materialize an empty loop nest that iterates over the tiles. These
+ // 4. Materialize an empty loop nest that iterates over the tiles. These
// loops for now do not return any values even if the original operation has
// results.
forLoops = generateTileLoopNest(rewriter, op.getLoc(), iterationDomain,
- tileSizeVector, offsets, sizes);
+ tileSizeVector, offsets, sizes,
+ destinationTensors);
if (!interchangeVector.empty()) {
auto inversePermutation = invertPermutationVector(interchangeVector);
@@ -375,17 +347,29 @@ mlir::scf::tileUsingSCFForOp(RewriterBase &rewriter, TilingInterface op,
}
});
- // 4. Generate the tiled implementation within the inner most loop.
- if (!forLoops.empty())
- rewriter.setInsertionPoint(forLoops.back().getBody()->getTerminator());
- FailureOr<TilingResult> tiledImplementation =
- op.getTiledImplementation(rewriter, offsets, sizes);
+ // 5. Generate the tiled implementation within the inner most loop.
+ SmallVector<Value> clonedOpDestination = destinationTensors;
+ if (!forLoops.empty()) {
+ rewriter.setInsertionPointToEnd(forLoops.back().getBody());
+ clonedOpDestination =
+ llvm::map_to_vector(forLoops.back().getRegionIterArgs(),
+ [](BlockArgument b) -> Value { return b; });
+ }
- if (op->getNumResults() == 0) {
- return scf::SCFTilingResult{
- tiledImplementation->tiledOps, getAsOperations(forLoops), {}};
+ // 5a. Clone the operation within the loop body.
+ auto clonedOp = cast<TilingInterface>(
+ cloneOpAndUpdateDestinationArgs(rewriter, op, clonedOpDestination));
+
+ // 5b. Tile the cloned operation.
+ FailureOr<TilingResult> tiledImplementation =
+ clonedOp.getTiledImplementation(rewriter, offsets, sizes);
+ if (failed(tiledImplementation)) {
+ return rewriter.notifyMatchFailure(op, "failed to tile operation");
}
+ // 5c. Delete the cloned operation.
+ rewriter.eraseOp(clonedOp);
+
// If loops are empty, the tiled op is used as the replacement for the untiled
// op.
if (forLoops.empty()) {
@@ -394,30 +378,39 @@ mlir::scf::tileUsingSCFForOp(RewriterBase &rewriter, TilingInterface op,
tiledImplementation->tiledValues};
}
- // 5. Yield all the results of the tiled operation. The surrounding loop
- // nest is modified to insert a destructive update pattern to yield
- // from the loop nest values to replace the untiled op with.
+ if (op->getNumResults() == 0) {
+ // The innermost loop does not have a `scf.yield` yet. There is nothing to
+ // return, so generate an empty `scf.yield` operation.
+ rewriter.setInsertionPointToEnd(forLoops.back().getBody());
+ rewriter.create<scf::YieldOp>(op->getLoc());
+ return scf::SCFTilingResult{
+ tiledImplementation->tiledOps, getAsOperations(forLoops), {}};
+ }
+
+ // 6. Yield all the results of the tiled operation.
int64_t numResults = op->getNumResults();
SmallVector<SmallVector<OpFoldResult>> resultOffsetsList(numResults),
resultSizesList(numResults);
- for (const auto &result : llvm::enumerate(op->getResults())) {
- if (failed(op.getResultTilePosition(rewriter, result.index(), offsets,
- sizes,
- resultOffsetsList[result.index()],
- resultSizesList[result.index()]))) {
+ SmallVector<Value> yieldedValues;
+ for (auto [index, tiledValue] :
+ llvm::enumerate(tiledImplementation->tiledValues)) {
+ SmallVector<OpFoldResult> resultOffsets, resultSizes;
+ if (failed(op.getResultTilePosition(rewriter, index, offsets, sizes,
+ resultOffsets, resultSizes))) {
return rewriter.notifyMatchFailure(
op, "failed to get slice of result produced");
}
+ SmallVector<OpFoldResult> resultStrides(resultOffsets.size(),
+ rewriter.getIndexAttr(1));
+ auto insertSlice = rewriter.create<tensor::InsertSliceOp>(
+ op->getLoc(), tiledValue, clonedOpDestination[index], resultOffsets,
+ resultSizes, resultStrides);
+ yieldedValues.push_back(insertSlice);
}
+ rewriter.create<scf::YieldOp>(op->getLoc(), yieldedValues);
- SmallVector<Value> destinationTensors;
- if (failed(tensor::getOrCreateDestinations(rewriter, op.getLoc(), op,
- destinationTensors)))
- return rewriter.notifyMatchFailure(op, "failed to get destinations");
-
- SmallVector<Value> replacements = yieldTiledValues(
- rewriter, destinationTensors, tiledImplementation.value(),
- resultOffsetsList, resultSizesList, forLoops);
+ SmallVector<Value> replacements = llvm::map_to_vector(
+ forLoops.front().getResults(), [](OpResult r) -> Value { return r; });
LLVM_DEBUG({
if (!forLoops.empty()) {
llvm::dbgs() << "After tiled implementation :\n";
@@ -457,42 +450,59 @@ mlir::scf::tileReductionUsingScf(RewriterBase &b,
reductionDims.push_back(idx);
}
- // 1. create the inital tensor value.
+ // 2. create the inital tensor value.
FailureOr<Operation *> identityTensor =
op.generateInitialTensorForPartialReduction(b, loc, tileSizesVector,
reductionDims);
if (failed(identityTensor))
return b.notifyMatchFailure(op,
"cannot create a tensor of identity value.");
- // 2. Create the nested loops.
+ // 3. Create the nested loops.
SmallVector<OpFoldResult> offsets, sizes;
- SmallVector<scf::ForOp> loops = generateTileLoopNest(
- b, loc, iterationDomain, tileSizesVector, offsets, sizes);
+ SmallVector<scf::ForOp> loops =
+ generateTileLoopNest(b, loc, iterationDomain, tileSizesVector, offsets,
+ sizes, identityTensor.value()->getResults());
- // 3. Generate the tiled implementation within the inner most loop.
- b.setInsertionPoint(loops.back().getBody()->getTerminator());
- Operation *parallelOp = op.tileToPartialReduction(
- b, loc, (*identityTensor)->getResults(), offsets, sizes, ...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/72178
More information about the Mlir-commits
mailing list