[llvm-branch-commits] [mlir] 7af3f6e - Revert "[mlir][SCF] Allow using a custom operation to generate loops with `ml…"
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Thu Sep 18 09:29:32 PDT 2025
Author: MaheshRavishankar
Date: 2025-09-18T09:29:29-07:00
New Revision: 7af3f6e0317e84900e6683ac0ea3dc60b805904e
URL: https://github.com/llvm/llvm-project/commit/7af3f6e0317e84900e6683ac0ea3dc60b805904e
DIFF: https://github.com/llvm/llvm-project/commit/7af3f6e0317e84900e6683ac0ea3dc60b805904e.diff
LOG: Revert "[mlir][SCF] Allow using a custom operation to generate loops with `ml…"
This reverts commit b8649098a7fcf598406d8d8b7d68891d1444e9c8.
Added:
Modified:
mlir/include/mlir/Dialect/SCF/Transforms/TileUsingInterface.h
mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp
mlir/test/lib/Interfaces/TilingInterface/TestTilingInterfaceTransformOps.cpp
mlir/test/lib/Interfaces/TilingInterface/TestTilingInterfaceTransformOps.td
Removed:
mlir/test/Interfaces/TilingInterface/tile-using-custom-op.mlir
################################################################################
diff --git a/mlir/include/mlir/Dialect/SCF/Transforms/TileUsingInterface.h b/mlir/include/mlir/Dialect/SCF/Transforms/TileUsingInterface.h
index 6b05ade37881c..3205da6e448fc 100644
--- a/mlir/include/mlir/Dialect/SCF/Transforms/TileUsingInterface.h
+++ b/mlir/include/mlir/Dialect/SCF/Transforms/TileUsingInterface.h
@@ -33,14 +33,6 @@ using SCFTileSizeComputationFunction =
/// Options to use to control tiling.
struct SCFTilingOptions {
- /// Specify which loop construct to use for tile and fuse.
- enum class LoopType { ForOp, ForallOp, CustomOp };
- LoopType loopType = LoopType::ForOp;
- SCFTilingOptions &setLoopType(LoopType type) {
- loopType = type;
- return *this;
- }
-
/// Computation function that returns the tile sizes to use for each loop.
/// Returning a tile size of zero implies no tiling for that loop. If the
/// size of the returned vector is smaller than the number of loops, the inner
@@ -58,17 +50,6 @@ struct SCFTilingOptions {
/// proper interaction with folding.
SCFTilingOptions &setTileSizes(ArrayRef<OpFoldResult> tileSizes);
- /// The interchange vector to reorder the tiled loops.
- SmallVector<int64_t> interchangeVector = {};
- SCFTilingOptions &setInterchange(ArrayRef<int64_t> interchange) {
- interchangeVector = llvm::to_vector(interchange);
- return *this;
- }
-
- //-------------------------------------------------------------------------//
- // Options related to tiling using `scf.forall`.
- //-------------------------------------------------------------------------//
-
/// Computation function that returns the number of threads to use for
/// each loop. Returning a num threads of zero implies no tiling for that
/// loop. If the size of the returned vector is smaller than the number of
@@ -89,6 +70,21 @@ struct SCFTilingOptions {
/// function that computes num threads at the point they are needed.
SCFTilingOptions &setNumThreads(ArrayRef<OpFoldResult> numThreads);
+ /// The interchange vector to reorder the tiled loops.
+ SmallVector<int64_t> interchangeVector = {};
+ SCFTilingOptions &setInterchange(ArrayRef<int64_t> interchange) {
+ interchangeVector = llvm::to_vector(interchange);
+ return *this;
+ }
+
+ /// Specify which loop construct to use for tile and fuse.
+ enum class LoopType { ForOp, ForallOp };
+ LoopType loopType = LoopType::ForOp;
+ SCFTilingOptions &setLoopType(LoopType type) {
+ loopType = type;
+ return *this;
+ }
+
/// Specify mapping of loops to devices. This is only respected when the loop
/// constructs support such a mapping (like `scf.forall`). Will be ignored
/// when using loop constructs that dont support such a mapping (like
@@ -121,98 +117,6 @@ struct SCFTilingOptions {
reductionDims.insert(dims.begin(), dims.end());
return *this;
}
-
- //-------------------------------------------------------------------------//
- // Options related to tiling using custom loop.
- //-------------------------------------------------------------------------//
-
- // For generating the inter-tile loops using a custom loop, two callback
- // functions are needed
- // 1. That generates the "loop header", i.e. the loop that iterates over the
- // different tiles.
- // 2. That generates the loop terminator
- //
- // For `scf.forall` case the call back to generate loop header would generate
- //
- // ```mlir
- // scf.forall (...) = ... {
- // ..
- // }
- // ```
- //
- // and the call back to generate the loop terminator would generate the
- // `scf.in_parallel` region
- //
- // ```mlir
- // scf.forall (...) = ... {
- // scf.in_parallel {
- // tensor.parallel_insert_slice ...
- // }
- // }
- // ```
- //
-
- // Information that is to be returned by the callback to generate the loop
- // header needed for the rest of the tiled codegeneration.
- // - `loops`: The generated loops
- // - `tileOffset`: The values that represent the offset of the iteration space
- // tile
- // - `tileSizes` : The values that represent the size of the iteration space
- // tile.
- // - `destinationTensors` : The tensors to use as destinations during tiling.
- struct CustomLoopHeaderInfo {
- SmallVector<LoopLikeOpInterface> loops;
- SmallVector<OpFoldResult> tileOffset;
- SmallVector<OpFoldResult> tileSizes;
- SmallVector<Value> destinationTensors;
- };
-
- // Type of the callback function that generates the loop headers.
- // - `loopRanges` : Values that represent the full size of the iteration space
- // being tiled.
- // - `giveTileSizes` : The tile sizes that are to be used to tile the
- // iteration
- // space.
- // - `destinationTensors` : The tensors to use as destinations for the results
- // of the tiled loop for loops that implement
- // `DestinationStyleOpInterface`.
- // Returns the `CustomLoopHeaderInfo` object (described above). it is expected
- // that this function sets the insertion point of `rewriter` to the program
- // point where the intra-tile loop computation is to be generated.
- using GenerateLoopHeaderFn = std::function<FailureOr<CustomLoopHeaderInfo>(
- RewriterBase &rewriter, Location loc, ArrayRef<Range> loopRanges,
- ArrayRef<OpFoldResult> givenTileSizes, ValueRange destinationTensors)>;
-
- // Type of the callback function that generates the loop terminator.
- // - `tiledResults` : Tiles of the result computed for the iteration space
- // tile
- // - `resultOffsets` : For each of the `tiledResults`, the offset at which
- // the result tile is to be "inserted" back into the
- // destination tensor.
- // - `resultSizes` : For each of the `tiledResults`, the size of the result
- // tile
- // that is to be "inserted" back into the destination
- // tensor.
- // Returns the `CustomLoopHeaderInfo` object (described above)
- using GenerateLoopTerminatorFn = std::function<LogicalResult(
- RewriterBase &rewriter, Location loc, ValueRange tiledResults,
- ArrayRef<SmallVector<OpFoldResult>> resultOffsets,
- ArrayRef<SmallVector<OpFoldResult>> resultSizes,
- ValueRange destinationTensors)>;
-
- // Callback function to generate the inter-tile loop header.
- GenerateLoopHeaderFn generateLoopHeaderFn = nullptr;
- // Callback function to generate the inter-tile loop terminator.
- GenerateLoopTerminatorFn generateLoopTerminatorFn = nullptr;
- // Helper function to set the callbacks for inter-tile loop header and
- // terminator functions when using a custom operation for the loop.
- SCFTilingOptions &
- setCustomLoopGenerationFns(GenerateLoopHeaderFn headerFn,
- GenerateLoopTerminatorFn terminatorFn) {
- generateLoopHeaderFn = std::move(headerFn);
- generateLoopTerminatorFn = std::move(terminatorFn);
- return *this;
- }
};
/// Transformation information returned after tiling.
diff --git a/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp b/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp
index c3899473289e2..834c02126fa53 100644
--- a/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp
+++ b/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp
@@ -155,18 +155,18 @@ getUserTileSizesAndNumThreads(RewriterBase &rewriter, TilingInterface op,
static LogicalResult checkTileSizes(TilingInterface op,
scf::SCFTilingOptions::LoopType loopType,
ReductionTilingStrategy reductionStrategy,
- ArrayRef<OpFoldResult> givenTileSizes,
+ ArrayRef<OpFoldResult> tileSizes,
ArrayRef<OpFoldResult> numThreads) {
auto iterators = op.getLoopIteratorTypes();
- assert(iterators.size() == givenTileSizes.size() &&
+ assert(iterators.size() == tileSizes.size() &&
"expected as many tile size values as number of loops");
assert((numThreads.empty() || (numThreads.size() == iterators.size())) &&
"when specified, expected number of threads to use for each loop");
bool isParallelTiling = false;
- for (auto [index, iterator, givenTileSize] :
- llvm::enumerate(iterators, givenTileSizes)) {
- if (!isConstantIntValue(givenTileSize, 0)) {
+ for (auto [index, iterator, tileSize] :
+ llvm::enumerate(iterators, tileSizes)) {
+ if (!isConstantIntValue(tileSize, 0)) {
isParallelTiling |= iterator == utils::IteratorType::parallel;
}
@@ -186,7 +186,7 @@ static LogicalResult checkTileSizes(TilingInterface op,
}
if (std::optional<int64_t> constTileSize =
- getConstantIntValue(givenTileSize)) {
+ getConstantIntValue(tileSize)) {
if (constTileSize.value() > 0 &&
iterator != utils::IteratorType::parallel) {
op.emitWarning() << "tiling is not thread safe at axis #" << index;
@@ -207,11 +207,11 @@ static LogicalResult checkTileSizes(TilingInterface op,
/// Get the reduction dims that are tiled. This accounts for reduction dims
/// that are specified as tiled, but the tile size is 0.
static SetVector<unsigned>
-getSanitizedReductionDims(ArrayRef<OpFoldResult> givenTileSizes,
+getSanitizedReductionDims(ArrayRef<OpFoldResult> tileSizes,
const scf::SCFTilingOptions &options) {
SetVector<unsigned> reductionDims;
for (auto dim : options.reductionDims) {
- if (isConstantIntValue(givenTileSizes[dim], 0))
+ if (isConstantIntValue(tileSizes[dim], 0))
continue;
reductionDims.insert(dim);
}
@@ -236,14 +236,14 @@ static bool tileDividesIterationDomain(Range loopRange) {
/// `tileSize`, i.e., `min(tileSize, range.end() - offset)`.
static OpFoldResult getBoundedTileSize(OpBuilder &b, Location loc,
Range loopRange, OpFoldResult offset,
- OpFoldResult givenTileSize) {
- std::optional<int64_t> ts = getConstantIntValue(givenTileSize);
+ OpFoldResult tileSize) {
+ std::optional<int64_t> ts = getConstantIntValue(tileSize);
if (ts && ts.value() == 1)
- return givenTileSize;
+ return tileSize;
if (tileDividesIterationDomain(
- Range{loopRange.offset, loopRange.size, givenTileSize}))
- return givenTileSize;
+ Range{loopRange.offset, loopRange.size, tileSize}))
+ return tileSize;
// The tile size to use (to avoid out of bounds access) is minimum of
// `tileSize` and `ub - iv`, where `iv` is the induction variable of the tiled
@@ -254,15 +254,15 @@ static OpFoldResult getBoundedTileSize(OpBuilder &b, Location loc,
AffineMap minMap = AffineMap::get(1, 2, {s0 - d0, s1}, b.getContext());
Value size = getValueOrCreateConstantIndexOp(b, loc, loopRange.size);
return affine::makeComposedFoldedAffineMin(
- b, loc, minMap, SmallVector<OpFoldResult>{offset, size, givenTileSize});
+ b, loc, minMap, SmallVector<OpFoldResult>{offset, size, tileSize});
}
/// Returns true if the maximum tile offset `tileSize * numThreads-1` is less
/// than `iterationSize`.
-static bool canOmitTileOffsetInBoundsCheck(OpFoldResult givenTileSize,
+static bool canOmitTileOffsetInBoundsCheck(OpFoldResult tileSize,
OpFoldResult numThreads,
OpFoldResult iterationSize) {
- std::optional<int64_t> tileSizeConst = getConstantIntValue(givenTileSize);
+ std::optional<int64_t> tileSizeConst = getConstantIntValue(tileSize);
std::optional<int64_t> numThreadsConst = getConstantIntValue(numThreads);
std::optional<int64_t> iterSizeConst = getConstantIntValue(iterationSize);
if (!tileSizeConst || !numThreadsConst || !iterSizeConst)
@@ -274,51 +274,114 @@ static bool canOmitTileOffsetInBoundsCheck(OpFoldResult givenTileSize,
/// `offset`s and `size`s of the tile of the iteration space that the
/// innermost loop body of the generated tiled loops corresponds to.
static std::tuple<SmallVector<OpFoldResult>, SmallVector<OpFoldResult>>
-getTileOffsetAndSizes(RewriterBase &rewriter, Location loc, ValueRange ivs,
+getTileOffsetAndSizes(RewriterBase &rewriter, Location loc,
+ ReductionTilingStrategy strategy, ValueRange ivs,
ArrayRef<Range> iterationDomain,
- ArrayRef<OpFoldResult> givenTileSizes) {
+ ArrayRef<OpFoldResult> tileSizes,
+ ArrayRef<OpFoldResult> numThreads,
+ const llvm::SetVector<unsigned> &reductionDims) {
SmallVector<OpFoldResult> offsets, sizes;
int materializedLoopNum = 0;
- for (auto [givenTileSize, loopRange] :
- llvm::zip_equal(givenTileSizes, iterationDomain)) {
-
- // Non-tiled cases, set the offset and size to the
- // `loopRange.offset/size`.
- if (isZeroInteger(givenTileSize)) {
- offsets.push_back(loopRange.offset);
- sizes.push_back(loopRange.size);
- continue;
+
+ if (!numThreads.empty()) {
+ AffineExpr d0, d1, s0, s1;
+ AffineExpr offsetExpr, residualTileSizeExpr;
+ bindDims(rewriter.getContext(), d0, d1);
+ bindSymbols(rewriter.getContext(), s0, s1);
+ offsetExpr = d0 + d1 * s0;
+ residualTileSizeExpr = s1 - (d0 + d1 * s0);
+
+ for (auto [index, nt, tileSize, loopRange] :
+ llvm::enumerate(numThreads, tileSizes, iterationDomain)) {
+
+ // Non-tiled cases, set the offset and size to the
+ // `loopRange.offset/size`.
+ if (isZeroInteger(nt)) {
+ offsets.push_back(loopRange.offset);
+ sizes.push_back(loopRange.size);
+ continue;
+ }
+
+ Value iv = ivs[materializedLoopNum++];
+ OpFoldResult offset = affine::makeComposedFoldedAffineApply(
+ rewriter, loc, offsetExpr,
+ ArrayRef<OpFoldResult>{loopRange.offset, iv, tileSize});
+ OpFoldResult residualTileSize = affine::makeComposedFoldedAffineApply(
+ rewriter, loc, residualTileSizeExpr,
+ {loopRange.offset, nt, tileSize, loopRange.size});
+
+ OpFoldResult size = tileSize;
+ if (!isZeroInteger(residualTileSize)) {
+ OpFoldResult sizeMinusOffsetPerThread =
+ affine::makeComposedFoldedAffineApply(rewriter, loc, s0 - d0,
+ {offset, loopRange.size});
+ size = affine::makeComposedFoldedAffineMin(
+ rewriter, loc,
+ AffineMap::getMultiDimIdentityMap(2, rewriter.getContext()),
+ {sizeMinusOffsetPerThread, tileSize});
+ }
+
+ // Consider the case where the original loop was `[0, 100)`.
+ // If number of threads are `7`, the tile size would be computed as
+ // `ceilDiv(100, 7) = 15`. For the last thread (thread_id = 6)
+ // - `offset = 0 + 6 * 15 = 105`
+ // - `tileSize = min(15, 100 - 105) = -5`
+ // To avoid negative tile sizes, we need to do a further
+ // `nonNegativeTileSize = affine.max(0, tileSize)`.
+ // This `max` can be avoided if
+ // `offset + tileSize * (numThreads - 1) < (ub - lb)`
+ if (!canOmitTileOffsetInBoundsCheck(tileSize, nt, loopRange.size)) {
+ AffineMap maxMap =
+ AffineMap::getMultiDimIdentityMap(2, rewriter.getContext());
+ size = affine::makeComposedFoldedAffineMax(
+ rewriter, loc, maxMap, {rewriter.getIndexAttr(0), size});
+ }
+
+ offsets.push_back(offset);
+ sizes.push_back(size);
}
+ return {offsets, sizes};
+ } else {
+ for (auto [tileSize, loopRange] :
+ llvm::zip_equal(tileSizes, iterationDomain)) {
+
+ // Non-tiled cases, set the offset and size to the
+ // `loopRange.offset/size`.
+ if (isZeroInteger(tileSize)) {
+ offsets.push_back(loopRange.offset);
+ sizes.push_back(loopRange.size);
+ continue;
+ }
- Value iv = ivs[materializedLoopNum++];
- OpFoldResult offset = getAsOpFoldResult(iv);
- offsets.push_back(offset);
- OpFoldResult size =
- getBoundedTileSize(rewriter, loc, loopRange, offset, givenTileSize);
- sizes.push_back(size);
+ Value iv = ivs[materializedLoopNum++];
+ OpFoldResult offset = getAsOpFoldResult(iv);
+ offsets.push_back(offset);
+ OpFoldResult size =
+ getBoundedTileSize(rewriter, loc, loopRange, offset, tileSize);
+ sizes.push_back(size);
+ }
+ return {offsets, sizes};
}
- return {offsets, sizes};
}
/// Function to return the bounds of the loops to be generated.
static std::tuple<SmallVector<OpFoldResult>, SmallVector<OpFoldResult>,
SmallVector<OpFoldResult>>
getLoopBounds(RewriterBase &rewriter, Location loc, ArrayRef<Range> loopRanges,
- ArrayRef<OpFoldResult> givenTileSizes) {
+ ArrayRef<OpFoldResult> tileSizes) {
SmallVector<OpFoldResult> lbs, ubs, steps;
- for (auto [loopRange, givenTileSize] :
- llvm::zip_equal(loopRanges, givenTileSizes)) {
+ for (auto [loopRange, tileSize] : llvm::zip_equal(loopRanges, tileSizes)) {
// No loop if the tile size is 0.
- if (isZeroInteger(givenTileSize))
+ if (isZeroInteger(tileSize))
continue;
lbs.push_back(loopRange.offset);
ubs.push_back(loopRange.size);
- steps.push_back(givenTileSize);
+ steps.push_back(tileSize);
}
return {lbs, ubs, steps};
}
-/// Typedef for function that allows returning additional yielded values during
+/// A function that allows returning additional yielded values during
/// `yieldTiledValuesAndReplace`.
/// - `ivs` induction variable for the loop.
/// - `newBbArgs` basic block arguments corresponding to newly added iter_args.
@@ -339,30 +402,6 @@ using YieldTiledValuesFn = std::function<LogicalResult(
SmallVector<SmallVector<OpFoldResult>> &resultOffsets,
SmallVector<SmallVector<OpFoldResult>> &resultSizes)>;
-/// Typedef for function that implements the body of a tiled loop.
-/// - `ivs` induction variable for the loop.
-/// - `tileOffsets` represents offsets for the tiled iteration space.
-/// - `tileSizes` represents the sizes for the tiled iteraiton space.
-/// - `outerDestinationTensors` tensor that holds the result. Is same size
-/// as the destination operands of the original operations.
-/// - `tiledResults` results of the tiled computation, corresponds to
-/// tiles of the original operation computed by the loop body.
-/// Should be same size as the `destinationTensors`
-/// - `resultOffsets` is of the same size as `tiledResults` and represents
-/// the offset to use when writing the corresponding element from
-/// `tiledResults` into `destinationTensors`.
-/// - `resultOffsets` is of the same size as `tiledResults` and represents
-/// the size to use when writing the corresponding element from
-/// `tiledResults` into `destinationTensors`.
-/// In case the method needs to return `failure()` the method is expected
-/// to clean up any inserted operations.
-using GenerateTiledBodyFn = std::function<LogicalResult(
- RewriterBase &rewriter, Location Loc, ValueRange ivs,
- ArrayRef<OpFoldResult> tileOffsets, ArrayRef<OpFoldResult> tileSizes,
- ValueRange outerDestinationTensors, SmallVector<Value> &tiledResults,
- SmallVector<SmallVector<OpFoldResult>> &resultOffsets,
- SmallVector<SmallVector<OpFoldResult>> &resultSizes)>;
-
/// Clones the operation and updates the destination if the operation
/// implements the `DestinationStyleOpInterface`.
static Operation *cloneOpAndUpdateDestinationArgs(RewriterBase &rewriter,
@@ -378,25 +417,26 @@ static Operation *cloneOpAndUpdateDestinationArgs(RewriterBase &rewriter,
/// Generate the tile-loop nest using `scf.for` operation.
/// - `loopRanges` specifies the lb, ub and step of the untiled iteration space.
-/// - `givenTileSizes` is the tile sizes to use. Zero represent untiled loops.
-/// - `outerDestinationTensors` are the init values to use for the outer most
-/// loop.
-/// - `tiledBodyFn` is called to generated the loop body of the inner
+/// - `tileSizes` is the tile sizes to use. Zero represent untiled loops.
+/// - `destinationTensors` are the init values to use for the outer most loop.
+/// - `yieldTiledValuesFn` is called to generated the loop body of the inner
/// most
/// loop.
-/// Returns the generated `scf.for` loops on success.
-static FailureOr<SmallVector<LoopLikeOpInterface>> generateLoopNestUsingForOp(
+/// - `loops` is an in-out parameter into which the generated loops are
+/// populated.
+static LogicalResult generateLoopNestUsingForOp(
RewriterBase &rewriter, Location loc, ArrayRef<Range> loopRanges,
- ArrayRef<OpFoldResult> givenTileSizes, ValueRange outerDestinationTensors,
- GenerateTiledBodyFn tiledBodyFn) {
+ ArrayRef<OpFoldResult> tileSizes, ValueRange destinationTensors,
+ YieldTiledValuesFn yieldTiledValuesFn,
+ SmallVector<LoopLikeOpInterface> &loops) {
assert(!loopRanges.empty() && "unexpected empty loop ranges");
- assert(loopRanges.size() == givenTileSizes.size() &&
+ assert(loopRanges.size() == tileSizes.size() &&
"expected as many tile sizes as loop ranges");
OpBuilder::InsertionGuard guard(rewriter);
SmallVector<OpFoldResult> lbs, ubs, steps;
std::tie(lbs, ubs, steps) =
- getLoopBounds(rewriter, loc, loopRanges, givenTileSizes);
+ getLoopBounds(rewriter, loc, loopRanges, tileSizes);
SmallVector<Value> lbVals =
getValueOrCreateConstantIndexOp(rewriter, loc, lbs);
SmallVector<Value> ubVals =
@@ -405,42 +445,34 @@ static FailureOr<SmallVector<LoopLikeOpInterface>> generateLoopNestUsingForOp(
getValueOrCreateConstantIndexOp(rewriter, loc, steps);
SmallVector<Value> ivs;
- SmallVector<LoopLikeOpInterface> loops;
- ValueRange innerDestinationTensors(outerDestinationTensors);
for (auto [lb, ub, step] : llvm::zip_equal(lbVals, ubVals, stepVals)) {
auto loop =
- scf::ForOp::create(rewriter, loc, lb, ub, step, innerDestinationTensors,
+ scf::ForOp::create(rewriter, loc, lb, ub, step, destinationTensors,
[](OpBuilder &bodyBuilder, Location bodyLoc,
Value iv, ValueRange /*iterArgs*/) {});
loops.push_back(loop);
ivs.push_back(loop.getInductionVar());
rewriter.setInsertionPointToEnd(loop.getBody());
- innerDestinationTensors = loop.getRegionIterArgs();
+ destinationTensors = loop.getRegionIterArgs();
}
- // Compute the `offsets` and `sizes` to use for tiling.
- SmallVector<OpFoldResult> offsets, sizes;
- std::tie(offsets, sizes) =
- getTileOffsetAndSizes(rewriter, loc, ivs, loopRanges, givenTileSizes);
-
SmallVector<Value> tiledResults;
SmallVector<SmallVector<OpFoldResult>> resultOffsets, resultSizes;
- if (failed(tiledBodyFn(rewriter, loc, ivs, offsets, sizes,
- innerDestinationTensors, tiledResults, resultOffsets,
- resultSizes))) {
+ if (failed(yieldTiledValuesFn(rewriter, loc, ivs, destinationTensors,
+ tiledResults, resultOffsets, resultSizes))) {
return rewriter.notifyMatchFailure(
loc, "failed to generate inner tile loop body");
}
if (loops.empty())
- return loops;
+ return success();
- assert(tiledResults.size() == innerDestinationTensors.size() &&
+ assert(tiledResults.size() == destinationTensors.size() &&
"Number of results of body should be equal to number of iter args");
// 6. Yield all the results of the tiled operation.
SmallVector<Value> yieldedValues;
for (auto [tiledValue, destinationTensor, resultOffset, resultSize] :
- llvm::zip_equal(tiledResults, innerDestinationTensors, resultOffsets,
+ llvm::zip_equal(tiledResults, destinationTensors, resultOffsets,
resultSizes)) {
SmallVector<OpFoldResult> resultStride(resultOffset.size(),
rewriter.getIndexAttr(1));
@@ -459,108 +491,27 @@ static FailureOr<SmallVector<LoopLikeOpInterface>> generateLoopNestUsingForOp(
cast<scf::ForOp>(outerLoop.getOperation()).getBody());
scf::YieldOp::create(rewriter, outerLoop.getLoc(), innerLoop->getResults());
}
- return loops;
-}
-
-/// Compute the `OpFoldResult`s that represents the multi-dimensional
-/// `offset`s and `size`s of the tile of the iteration space that the
-/// innermost loop body of the generated tiled loops corresponds to
-/// when tiling using `forall` op. This is handle separately dut to
-/// the special case handling needed for when the tiling is done by
-/// specifying number of threads.
-static std::tuple<SmallVector<OpFoldResult>, SmallVector<OpFoldResult>>
-getTileOffsetAndSizesWithForAllOp(RewriterBase &rewriter, Location loc,
- ValueRange ivs,
- ArrayRef<Range> iterationDomain,
- ArrayRef<OpFoldResult> givenTileSizes,
- ArrayRef<OpFoldResult> numThreads) {
- if (numThreads.empty()) {
- return getTileOffsetAndSizes(rewriter, loc, ivs, iterationDomain,
- givenTileSizes);
- }
-
- SmallVector<OpFoldResult> offsets, sizes;
- int materializedLoopNum = 0;
-
- AffineExpr d0, d1, s0, s1;
- AffineExpr offsetExpr, residualTileSizeExpr;
- bindDims(rewriter.getContext(), d0, d1);
- bindSymbols(rewriter.getContext(), s0, s1);
- offsetExpr = d0 + d1 * s0;
- residualTileSizeExpr = s1 - (d0 + d1 * s0);
-
- for (auto [index, nt, givenTileSize, loopRange] :
- llvm::enumerate(numThreads, givenTileSizes, iterationDomain)) {
-
- // Non-tiled cases, set the offset and size to the
- // `loopRange.offset/size`.
- if (isZeroInteger(nt)) {
- offsets.push_back(loopRange.offset);
- sizes.push_back(loopRange.size);
- continue;
- }
-
- Value iv = ivs[materializedLoopNum++];
- OpFoldResult offset = affine::makeComposedFoldedAffineApply(
- rewriter, loc, offsetExpr,
- ArrayRef<OpFoldResult>{loopRange.offset, iv, givenTileSize});
- OpFoldResult residualTileSize = affine::makeComposedFoldedAffineApply(
- rewriter, loc, residualTileSizeExpr,
- {loopRange.offset, nt, givenTileSize, loopRange.size});
-
- OpFoldResult size = givenTileSize;
- if (!isZeroInteger(residualTileSize)) {
- OpFoldResult sizeMinusOffsetPerThread =
- affine::makeComposedFoldedAffineApply(rewriter, loc, s0 - d0,
- {offset, loopRange.size});
- size = affine::makeComposedFoldedAffineMin(
- rewriter, loc,
- AffineMap::getMultiDimIdentityMap(2, rewriter.getContext()),
- {sizeMinusOffsetPerThread, givenTileSize});
- }
-
- // Consider the case where the original loop was `[0, 100)`.
- // If number of threads are `7`, the tile size would be computed as
- // `ceilDiv(100, 7) = 15`. For the last thread (thread_id = 6)
- // - `offset = 0 + 6 * 15 = 105`
- // - `tileSize = min(15, 100 - 105) = -5`
- // To avoid negative tile sizes, we need to do a further
- // `nonNegativeTileSize = affine.max(0, tileSize)`.
- // This `max` can be avoided if
- // `offset + tileSize * (numThreads - 1) < (ub - lb)`
- if (!canOmitTileOffsetInBoundsCheck(givenTileSize, nt, loopRange.size)) {
- AffineMap maxMap =
- AffineMap::getMultiDimIdentityMap(2, rewriter.getContext());
- size = affine::makeComposedFoldedAffineMax(
- rewriter, loc, maxMap, {rewriter.getIndexAttr(0), size});
- }
-
- offsets.push_back(offset);
- sizes.push_back(size);
- }
- return {offsets, sizes};
+ return success();
}
/// Generate the tile-loop nest using `scf.forall` operation.
/// - `loopRanges` specifies the lb, ub and step of the untiled iteration space.
-/// - `giventileSizes` is the tile sizes to use. Zero represent untiled loops.
-/// - `outerDestinationTensors` are the init values to use for the loop.
+/// - `tileSizes` is the tile sizes to use. Zero represent untiled loops.
+/// - `destinationTensors` are the init values to use for the outer most loop.
/// - `mappingVector` is the mapping attributes to use for loop construction.
/// Can be empty.
-/// - `tiledBodyFn` is called to generated the loop body of the inner
+/// - `yieldTiledValuesFn` is called to generated the loop body of the inner
/// most
/// loop.
-/// Returns the generated `scf.forall` loop on success.
-static FailureOr<SmallVector<LoopLikeOpInterface>>
-generateLoopNestUsingForallOp(RewriterBase &rewriter, Location loc,
- ArrayRef<Range> loopRanges,
- ArrayRef<OpFoldResult> givenTileSizes,
- ArrayRef<OpFoldResult> numThreads,
- ArrayRef<Attribute> mappingVector,
- ValueRange outerDestinationTensors,
- GenerateTiledBodyFn tiledBodyFn) {
+/// - `loops` is an in-out parameter into which the generated loops are
+/// populated.
+static LogicalResult generateLoopNestUsingForallOp(
+ RewriterBase &rewriter, Location loc, ArrayRef<Range> loopRanges,
+ ArrayRef<OpFoldResult> tileSizes, ArrayRef<OpFoldResult> numThreads,
+ ArrayRef<Attribute> mappingVector, ValueRange destinationTensors,
+ YieldTiledValuesFn tiledBodyFn, SmallVector<LoopLikeOpInterface> &loops) {
assert(!loopRanges.empty() && "unexpected empty loop ranges");
- assert(loopRanges.size() == givenTileSizes.size() &&
+ assert(loopRanges.size() == tileSizes.size() &&
"expected as many tile sizes as loop ranges");
OpBuilder::InsertionGuard guard(rewriter);
@@ -571,7 +522,6 @@ generateLoopNestUsingForallOp(RewriterBase &rewriter, Location loc,
scf::ForallOp forallOp;
bool useNumThreads = !numThreads.empty();
- SmallVector<LoopLikeOpInterface> loops;
if (useNumThreads) {
// Prune the zero numthreads.
SmallVector<OpFoldResult> nonZeroNumThreads;
@@ -581,35 +531,29 @@ generateLoopNestUsingForallOp(RewriterBase &rewriter, Location loc,
nonZeroNumThreads.push_back(nt);
}
forallOp = scf::ForallOp::create(rewriter, loc, nonZeroNumThreads,
- outerDestinationTensors, mappingAttr);
+ destinationTensors, mappingAttr);
} else {
SmallVector<OpFoldResult> lbs, ubs, steps;
std::tie(lbs, ubs, steps) =
- getLoopBounds(rewriter, loc, loopRanges, givenTileSizes);
+ getLoopBounds(rewriter, loc, loopRanges, tileSizes);
forallOp = scf::ForallOp::create(rewriter, loc, lbs, ubs, steps,
- outerDestinationTensors, mappingAttr);
+ destinationTensors, mappingAttr);
}
loops.push_back(forallOp);
rewriter.setInsertionPoint(forallOp.getTerminator());
- ValueRange innerDestinationTensors = forallOp.getRegionOutArgs();
- SmallVector<Value> ivs = forallOp.getInductionVars();
-
- // Compute the `offsets` and `sizes` to use for tiling.
- SmallVector<OpFoldResult> offsets, sizes;
- std::tie(offsets, sizes) = getTileOffsetAndSizesWithForAllOp(
- rewriter, loc, ivs, loopRanges, givenTileSizes, numThreads);
+ destinationTensors = forallOp.getRegionOutArgs();
SmallVector<Value> tiledResults;
SmallVector<SmallVector<OpFoldResult>> resultOffsets, resultSizes;
- if (failed(tiledBodyFn(rewriter, loc, ivs, offsets, sizes,
- innerDestinationTensors, tiledResults, resultOffsets,
+ if (failed(tiledBodyFn(rewriter, loc, forallOp.getInductionVars(),
+ destinationTensors, tiledResults, resultOffsets,
resultSizes)))
return rewriter.notifyMatchFailure(loc, "failed to generate loop body");
rewriter.setInsertionPointToEnd(forallOp.getTerminator().getBody());
for (auto [tiledValue, destinationTensor, resultOffset, resultSize] :
- llvm::zip_equal(tiledResults, innerDestinationTensors, resultOffsets,
+ llvm::zip_equal(tiledResults, destinationTensors, resultOffsets,
resultSizes)) {
SmallVector<OpFoldResult> resultStride(resultOffset.size(),
rewriter.getIndexAttr(1));
@@ -618,105 +562,41 @@ generateLoopNestUsingForallOp(RewriterBase &rewriter, Location loc,
destinationTensor, resultOffset,
resultSize, resultStride);
}
- return loops;
-}
-
-/// Generate the tile-loop nest using custom loop operation.
-/// - `loopRanges` specifies the lb, ub and step of the untiled iteration space.
-/// - `tileSizes` is the tile sizes to use. Zero represent untiled loops.
-/// - `destinationTensors` are the init values to use for the outer most loop.
-/// - `mappingVector` is the mapping attributes to use for loop construction.
-/// Can be empty.
-/// - `tiledBodyFn` is called to generated the loop body of the inner
-/// most
-/// loop.
-/// Returns the generated `scf.forall` loop on success.
-static FailureOr<SmallVector<LoopLikeOpInterface>>
-generateLoopNestUsingCustomOp(
- RewriterBase &rewriter, Location loc, ArrayRef<Range> loopRanges,
- ArrayRef<OpFoldResult> givenTileSizes, ValueRange outerDestinationTensors,
- const scf::SCFTilingOptions::GenerateLoopHeaderFn &generateLoopHeaderFn,
- const scf::SCFTilingOptions::GenerateLoopTerminatorFn
- &generateLoopTerminatorFn,
- GenerateTiledBodyFn tiledBodyFn) {
- assert(!loopRanges.empty() && "unexpected empty loop ranges");
- assert(loopRanges.size() == givenTileSizes.size() &&
- "expected as many tile sizes as loop ranges");
- assert(generateLoopHeaderFn && generateLoopTerminatorFn &&
- "expected loop header/terminator generation function");
- OpBuilder::InsertionGuard guard(rewriter);
-
- FailureOr<scf::SCFTilingOptions::CustomLoopHeaderInfo> loopHeaderInfo =
- generateLoopHeaderFn(rewriter, loc, loopRanges, givenTileSizes,
- outerDestinationTensors);
- if (failed(loopHeaderInfo)) {
- return failure();
- }
-
- SmallVector<Value> ivs;
- SmallVector<Value> tiledResults;
- SmallVector<SmallVector<OpFoldResult>> resultOffsets, resultSizes;
- if (failed(tiledBodyFn(rewriter, loc, ivs, loopHeaderInfo->tileOffset,
- loopHeaderInfo->tileSizes,
- loopHeaderInfo->destinationTensors, tiledResults,
- resultOffsets, resultSizes))) {
- return failure();
- }
-
- if (failed(generateLoopTerminatorFn(rewriter, loc, tiledResults,
- resultOffsets, resultSizes,
- loopHeaderInfo->destinationTensors))) {
- return failure();
- }
-
- return loopHeaderInfo->loops;
+ return success();
}
/// Generate the tile-loop nest using the loop construct specifed in `options`.
/// - `options`: Tiling options specified.
/// - `loopRanges` specifies the lb, ub and step of the untiled iteration space.
/// - `tileSizes` is the tile sizes to use. Zero represent untiled loops.
-/// - `outerDestinationTensors` are the init values to use for the outer most
-/// loop.
+/// - `destinationTensors` are the init values to use for the outer most loop.
/// - `yieldTiledValuesFn` is called to generated the loop body of the inner
/// most
/// loop.
-/// Returns the generated loops on success.
-static FailureOr<SmallVector<LoopLikeOpInterface>> generateLoopNest(
- RewriterBase &rewriter, Location loc, const scf::SCFTilingOptions &options,
- ArrayRef<Range> loopRanges, ArrayRef<OpFoldResult> givenTileSizes,
- ArrayRef<OpFoldResult> numThreads, ValueRange destinationTensors,
- GenerateTiledBodyFn tiledBodyFn) {
+/// - `loops` is an in-out parameter into which the generated loops are
+/// populated.
+static LogicalResult generateLoopNest(
+ RewriterBase &rewriter, Location loc,
+ scf::SCFTilingOptions::LoopType loopType, ArrayRef<Range> loopRanges,
+ ArrayRef<OpFoldResult> tileSizes, ArrayRef<OpFoldResult> numThreads,
+ ValueRange destinationTensors, ArrayRef<Attribute> mappingVector,
+ YieldTiledValuesFn tiledBodyFn, SmallVector<LoopLikeOpInterface> &loops) {
// If the tile sizes are all zero, no loops are generated. Just call the
// callback function to handle untiled case.
- if (llvm::all_of(givenTileSizes, isZeroInteger)) {
+ if (llvm::all_of(tileSizes, isZeroInteger)) {
SmallVector<Value> tiledResults;
SmallVector<SmallVector<OpFoldResult>> resultOffsets, resultSizes;
- auto tileOffsets =
- llvm::map_to_vector(loopRanges, [](Range r) { return r.offset; });
- auto tileSizes =
- llvm::map_to_vector(loopRanges, [](Range r) { return r.size; });
- if (failed(tiledBodyFn(rewriter, loc, ValueRange{}, tileOffsets, tileSizes,
- destinationTensors, tiledResults, resultOffsets,
- resultSizes))) {
- return failure();
- }
- return SmallVector<LoopLikeOpInterface>{};
+ return tiledBodyFn(rewriter, loc, ValueRange{}, destinationTensors,
+ tiledResults, resultOffsets, resultSizes);
}
- if (options.loopType == scf::SCFTilingOptions::LoopType::ForOp) {
- return generateLoopNestUsingForOp(rewriter, loc, loopRanges, givenTileSizes,
- destinationTensors, tiledBodyFn);
+ if (loopType == scf::SCFTilingOptions::LoopType::ForOp) {
+ return generateLoopNestUsingForOp(rewriter, loc, loopRanges, tileSizes,
+ destinationTensors, tiledBodyFn, loops);
}
- if (options.loopType == scf::SCFTilingOptions::LoopType::ForallOp) {
+ if (loopType == scf::SCFTilingOptions::LoopType::ForallOp) {
return generateLoopNestUsingForallOp(
- rewriter, loc, loopRanges, givenTileSizes, numThreads,
- options.mappingVector, destinationTensors, tiledBodyFn);
- }
- if (options.loopType == scf::SCFTilingOptions::LoopType::CustomOp) {
- return generateLoopNestUsingCustomOp(
- rewriter, loc, loopRanges, givenTileSizes, destinationTensors,
- options.generateLoopHeaderFn, options.generateLoopTerminatorFn,
- tiledBodyFn);
+ rewriter, loc, loopRanges, tileSizes, numThreads, mappingVector,
+ destinationTensors, tiledBodyFn, loops);
}
return rewriter.notifyMatchFailure(loc, "unhandled loop type");
}
@@ -724,7 +604,7 @@ static FailureOr<SmallVector<LoopLikeOpInterface>> generateLoopNest(
static FailureOr<SmallVector<Value>> createInitialTensorsForTiling(
RewriterBase &rewriter, TilingInterface op,
ReductionTilingStrategy reductionStrategy, ArrayRef<Range> iterationDomain,
- ArrayRef<OpFoldResult> numThreads, ArrayRef<OpFoldResult> givenTileSizes,
+ ArrayRef<OpFoldResult> numThreads, ArrayRef<OpFoldResult> tileSizes,
const SetVector<unsigned> &reductionDims) {
SmallVector<Value> initTensors;
Location loc = op->getLoc();
@@ -746,7 +626,7 @@ static FailureOr<SmallVector<Value>> createInitialTensorsForTiling(
AffineExpr sizeExpr = ((s0 - s1).ceilDiv(s2));
AffineExpr divExpr = s0.ceilDiv(s1);
for (auto [index, domain, tileSize] :
- llvm::enumerate(iterationDomain, givenTileSizes)) {
+ llvm::enumerate(iterationDomain, tileSizes)) {
if (!numThreads.empty()) {
// Untiled case.
if (isConstantIntValue(numThreads[index], 0)) {
@@ -792,7 +672,7 @@ static SmallVector<OpFoldResult>
getSplitReductionIvs(RewriterBase &rewriter, Location loc,
ReductionTilingStrategy reductionStrategy, ValueRange ivs,
ArrayRef<OpFoldResult> numThreads,
- ArrayRef<OpFoldResult> givenTileSizes,
+ ArrayRef<OpFoldResult> tileSizes,
const SetVector<unsigned> &reductionDims) {
SmallVector<OpFoldResult> splitReductionIvs;
splitReductionIvs.resize(reductionDims.size(), rewriter.getIndexAttr(0));
@@ -809,7 +689,7 @@ getSplitReductionIvs(RewriterBase &rewriter, Location loc,
}
splitReductionIvs[index] = affine::makeComposedFoldedAffineApply(
rewriter, loc, divExpr,
- ArrayRef<OpFoldResult>{ivs[ivIndex++], givenTileSizes[reductionDim]});
+ ArrayRef<OpFoldResult>{ivs[ivIndex++], tileSizes[reductionDim]});
}
}
return splitReductionIvs;
@@ -821,7 +701,7 @@ getTiledImplementation(RewriterBase &rewriter, TilingInterface op,
ValueRange regionIterArg, ArrayRef<OpFoldResult> offsets,
ArrayRef<OpFoldResult> sizes, ValueRange ivs,
ArrayRef<OpFoldResult> numThreads,
- ArrayRef<OpFoldResult> givenTileSizes,
+ ArrayRef<OpFoldResult> tileSizes,
const SetVector<unsigned> &reductionDims) {
if (reductionStrategy == ReductionTilingStrategy::FullReduction) {
return op.getTiledImplementation(rewriter, offsets, sizes);
@@ -837,7 +717,7 @@ getTiledImplementation(RewriterBase &rewriter, TilingInterface op,
SmallVector<OpFoldResult> splitReductionIvs =
getSplitReductionIvs(rewriter, op.getLoc(), reductionStrategy, ivs,
- numThreads, givenTileSizes, reductionDims);
+ numThreads, tileSizes, reductionDims);
return redOp.tileToPartialReduction(rewriter, op.getLoc(), reductionStrategy,
regionIterArg, offsets, sizes,
reductionDims, splitReductionIvs);
@@ -848,8 +728,7 @@ static LogicalResult getResultTilePosition(
int64_t index, Value tiledResult, TilingInterface op,
ArrayRef<OpFoldResult> offsets, ArrayRef<OpFoldResult> sizes,
ValueRange ivs, ArrayRef<OpFoldResult> numThreads,
- ArrayRef<OpFoldResult> givenTileSizes,
- const SetVector<unsigned> &reductionDims,
+ ArrayRef<OpFoldResult> tileSizes, const SetVector<unsigned> &reductionDims,
SmallVector<OpFoldResult> &resultOffset,
SmallVector<OpFoldResult> &resultSize) {
@@ -865,7 +744,7 @@ static LogicalResult getResultTilePosition(
}
SmallVector<OpFoldResult> splitReductionIvs =
getSplitReductionIvs(rewriter, op.getLoc(), reductionStrategy, ivs,
- numThreads, givenTileSizes, reductionDims);
+ numThreads, tileSizes, reductionDims);
return redOp.getPartialResultTilePosition(
rewriter, index, reductionStrategy, offsets, sizes, reductionDims,
splitReductionIvs, resultOffset, resultSize);
@@ -1120,20 +999,20 @@ mlir::scf::tileUsingSCF(RewriterBase &rewriter, TilingInterface op,
SmallVector<Range> iterationDomain = op.getIterationDomain(rewriter);
// 2. Materialize the tile sizes and/or number of threads;
- SmallVector<OpFoldResult> givenTileSizes, numThreads;
- std::tie(givenTileSizes, numThreads) =
+ SmallVector<OpFoldResult> tileSizes, numThreads;
+ std::tie(tileSizes, numThreads) =
getUserTileSizesAndNumThreads(rewriter, op, iterationDomain, options);
// Check if it is safe to tile. This is hold over from previous iterations
// of tile to for-all. Consider dropping it.
if (failed(checkTileSizes(op, options.loopType, options.reductionStrategy,
- givenTileSizes, numThreads))) {
+ tileSizes, numThreads))) {
return failure();
}
// Get the reduction dims
SetVector<unsigned> reductionDims =
- getSanitizedReductionDims(givenTileSizes, options);
+ getSanitizedReductionDims(tileSizes, options);
// 3. If there is an interchange specified, permute the iteration domain and
// the tile sizes.
@@ -1145,7 +1024,7 @@ mlir::scf::tileUsingSCF(RewriterBase &rewriter, TilingInterface op,
"expected interchange vector to be a permutation");
applyPermutationToVector(iterationDomain, interchangeVector);
- applyPermutationToVector(givenTileSizes, interchangeVector);
+ applyPermutationToVector(tileSizes, interchangeVector);
if (!numThreads.empty())
applyPermutationToVector(numThreads, interchangeVector);
}
@@ -1153,21 +1032,24 @@ mlir::scf::tileUsingSCF(RewriterBase &rewriter, TilingInterface op,
FailureOr<TilingResult> tilingResult;
// 4. Define the lambda function used later to generate the body of the
// innermost tiled loop.
- GenerateTiledBodyFn innerYieldTiledValuesFn =
+ YieldTiledValuesFn innerYieldTiledValuesFn =
[&](RewriterBase &rewriter, Location loc, ValueRange ivs,
- ArrayRef<OpFoldResult> tileOffsets, ArrayRef<OpFoldResult> tileSizes,
ValueRange regionIterArgs, SmallVector<Value> &tiledResults,
SmallVector<SmallVector<OpFoldResult>> &resultOffsets,
SmallVector<SmallVector<OpFoldResult>> &resultSizes)
-> LogicalResult {
+ // 4a. Compute the `offsets` and `sizes` to use for tiling.
+ SmallVector<OpFoldResult> offsets, sizes;
+ std::tie(offsets, sizes) = getTileOffsetAndSizes(
+ rewriter, loc, options.reductionStrategy, ivs, iterationDomain,
+ tileSizes, numThreads, reductionDims);
+
// 4b. If interchange was provided, apply inverse of the interchange
// to get back the offsets/sizes in the order to be specified.
- SmallVector<OpFoldResult> tileOffsetsVec = llvm::to_vector(tileOffsets);
- SmallVector<OpFoldResult> tileSizesVec = llvm::to_vector(tileSizes);
if (!interchangeVector.empty()) {
auto inversePermutation = invertPermutationVector(interchangeVector);
- applyPermutationToVector(tileOffsetsVec, inversePermutation);
- applyPermutationToVector(tileSizesVec, inversePermutation);
+ applyPermutationToVector(offsets, inversePermutation);
+ applyPermutationToVector(sizes, inversePermutation);
}
// 5. Generate the tiled implementation within the inner most loop.
@@ -1179,7 +1061,7 @@ mlir::scf::tileUsingSCF(RewriterBase &rewriter, TilingInterface op,
// 5b. Early return cloned op if tiling is not happening. We can not
// return the original op because it could lead to `rewriter.replaceOp(op,
// op->getResults())` and users would get crash.
- if (llvm::all_of(givenTileSizes, isZeroInteger)) {
+ if (llvm::all_of(tileSizes, isZeroInteger)) {
tiledResults.append(clonedOp->result_begin(), clonedOp->result_end());
tilingResult =
TilingResult{/*tiledOps=*/{clonedOp}, clonedOp->getResults(),
@@ -1188,10 +1070,9 @@ mlir::scf::tileUsingSCF(RewriterBase &rewriter, TilingInterface op,
}
// 5c. Tile the cloned operation.
- tilingResult =
- getTiledImplementation(rewriter, clonedOp, options.reductionStrategy,
- regionIterArgs, tileOffsetsVec, tileSizesVec,
- ivs, numThreads, givenTileSizes, reductionDims);
+ tilingResult = getTiledImplementation(
+ rewriter, clonedOp, options.reductionStrategy, regionIterArgs, offsets,
+ sizes, ivs, numThreads, tileSizes, reductionDims);
if (failed(tilingResult)) {
rewriter.eraseOp(clonedOp);
return op.emitOpError("faild to tile operation");
@@ -1208,8 +1089,8 @@ mlir::scf::tileUsingSCF(RewriterBase &rewriter, TilingInterface op,
SmallVector<OpFoldResult> resultOffset, resultSize;
if (failed(getResultTilePosition(
rewriter, options.reductionStrategy, index, tiledValue, op,
- tileOffsetsVec, tileSizesVec, ivs, numThreads, givenTileSizes,
- reductionDims, resultOffset, resultSize))) {
+ offsets, sizes, ivs, numThreads, tileSizes, reductionDims,
+ resultOffset, resultSize))) {
for (auto op : tilingResult->tiledOps) {
rewriter.eraseOp(op);
}
@@ -1226,7 +1107,7 @@ mlir::scf::tileUsingSCF(RewriterBase &rewriter, TilingInterface op,
// 6. Find the destination tensors to use for the operation.
FailureOr<SmallVector<Value>> maybeInits = createInitialTensorsForTiling(
rewriter, op, options.reductionStrategy, iterationDomain, numThreads,
- givenTileSizes, reductionDims);
+ tileSizes, reductionDims);
if (failed(maybeInits)) {
return rewriter.notifyMatchFailure(
op, "unable to create initial tensors for tiling");
@@ -1235,16 +1116,13 @@ mlir::scf::tileUsingSCF(RewriterBase &rewriter, TilingInterface op,
// 7. Generate the tiled loops nest using the callback defined above.
SmallVector<LoopLikeOpInterface> loops;
- {
- FailureOr<SmallVector<LoopLikeOpInterface>> loopsOr = generateLoopNest(
- rewriter, op.getLoc(), options, iterationDomain, givenTileSizes,
- numThreads, initTensors, innerYieldTiledValuesFn);
- if (failed(loopsOr))
- return op.emitOpError("failed to generate tiling loops");
- assert(succeeded(tilingResult) &&
- "expected tiling result to be computed after loop generation");
- std::swap(loops, loopsOr.value());
- }
+ if (failed(generateLoopNest(rewriter, op.getLoc(), options.loopType,
+ iterationDomain, tileSizes, numThreads,
+ initTensors, options.mappingVector,
+ innerYieldTiledValuesFn, loops)))
+ return op.emitOpError("failed to generate tiling loops");
+ assert(succeeded(tilingResult) &&
+ "expected tiling result to be computed after loop generation");
if (loops.empty()) {
// If loops are empty, the tiled op is used as the replacement for the
diff --git a/mlir/test/Interfaces/TilingInterface/tile-using-custom-op.mlir b/mlir/test/Interfaces/TilingInterface/tile-using-custom-op.mlir
deleted file mode 100644
index d335e9c3fb5d0..0000000000000
--- a/mlir/test/Interfaces/TilingInterface/tile-using-custom-op.mlir
+++ /dev/null
@@ -1,60 +0,0 @@
-// RUN: mlir-opt --transform-interpreter --cse --split-input-file --mlir-print-local-scope %s | FileCheck %s
-
-module {
- func.func @generic_parallel(%arg0 : tensor<?x?xf32>, %arg1 : tensor<?xf32>) -> tensor<?x?xf32> {
- %c0 = arith.constant 0 : index
- %c1 = arith.constant 1 : index
- %d0 = tensor.dim %arg0, %c0 : tensor<?x?xf32>
- %d1 = tensor.dim %arg0, %c1 : tensor<?x?xf32>
- %empty = tensor.empty(%d0, %d1) : tensor<?x?xf32>
- %generic = linalg.generic {
- indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
- affine_map<(d0, d1) -> (d1)>,
- affine_map<(d0, d1) -> (d0, d1)>],
- iterator_types = ["parallel", "parallel"]}
- ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?xf32>) outs(%empty : tensor<?x?xf32>) {
- ^bb(%b0 : f32, %b1 : f32, %b2 : f32):
- %add = arith.addf %b0, %b1 : f32
- linalg.yield %add : f32
- } -> tensor<?x?xf32>
- return %generic : tensor<?x?xf32>
- }
-}
-
-module attributes {transform.with_named_sequence} {
- transform.named_sequence @__transform_main(%arg1 : !transform.any_op {transform.readonly}) {
- %op = transform.structured.match ops {["linalg.generic"]} in %arg1
- : (!transform.any_op) -> !transform.any_op
- %tiled_op, %loop = transform.test.tile_using_custom_loop %op tile_sizes = [10, 20]
- : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
- transform.yield
- }
-}
-// CHECK-LABEL: func @generic_parallel
-// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: tensor<?x?xf32>
-// CHECK-SAME: %[[ARG1:.+]]: tensor<?xf32>
-// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
-// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
-// CHECK-DAG: %[[D0:.+]] = tensor.dim %[[ARG0]], %[[C0]]
-// CHECK-DAG: %[[D1:.+]] = tensor.dim %[[ARG0]], %[[C1]]
-// CHECK-DAG: %[[EMPTY:.+]] = tensor.empty(%[[D0]], %[[D1]]) : tensor<?x?xf32>
-// CHECK-DAG: %[[NITERS0:.+]] = affine.apply affine_map<()[s0] -> (s0 ceildiv 10)>()[%[[D0]]]
-// CHECK-DAG: %[[NITERS1:.+]] = affine.apply affine_map<()[s0] -> (s0 ceildiv 20)>()[%[[D1]]]
-// CHECK-DAG: %[[NITERS:.+]] = affine.apply affine_map<()[s0, s1] -> ((s0 ceildiv 10) * (s1 ceildiv 20))>()[%[[D0]], %[[D1]]]
-// CHECK: %[[FOR:.+]] = scf.for %[[IV:[a-zA-Z0-9]+]] = %[[C0]] to %[[NITERS]] step %[[C1]]
-// CHECK-SAME: iter_args(%[[INIT:.+]] = %[[EMPTY]])
-// CHECK: %[[DELINEARIZE:.+]]:2 = affine.delinearize_index %[[IV]] into (%[[NITERS0]], %[[NITERS1]])
-// CHECK-DAG: %[[SIZE0:.+]] = affine.min affine_map<(d0)[s0] -> (d0 * -10 + s0, 10)>(%[[DELINEARIZE]]#0)[%[[D0]]]
-// CHECK-DAG: %[[SIZE1:.+]] = affine.min affine_map<(d0)[s0] -> (d0 * -20 + s0, 20)>(%[[DELINEARIZE]]#1)[%[[D1]]]
-// CHECK-DAG: %[[OFFSET0:.+]] = affine.apply affine_map<(d0) -> (d0 * 10)>(%[[DELINEARIZE]]#0)
-// CHECK-DAG: %[[OFFSET1:.+]] = affine.apply affine_map<(d0) -> (d0 * 20)>(%[[DELINEARIZE]]#1)
-// CHECK-DAG: %[[ARG0_SLICE:.+]] = tensor.extract_slice %[[ARG0]][%[[OFFSET0]], %[[OFFSET1]]] [%[[SIZE0]], %[[SIZE1]]] [1, 1]
-// CHECK-DAG: %[[ARG1_SLICE:.+]] = tensor.extract_slice %[[ARG1]][%[[OFFSET1]]] [%[[SIZE1]]] [1]
-// CHECK-DAG: %[[INIT_SLICE:.+]] = tensor.extract_slice %[[INIT]][%[[OFFSET0]], %[[OFFSET1]]] [%[[SIZE0]], %[[SIZE1]]] [1, 1]
-// CHECK: %[[GENERIC:.+]] = linalg.generic
-// CHECK-SAME: ins(%[[ARG0_SLICE]], %[[ARG1_SLICE]] :
-// CHECK-SAME: outs(%[[INIT_SLICE]] :
-// CHECK: %[[INSERT_SLICE:.+]] = tensor.insert_slice %[[GENERIC]] into %[[INIT]]
-// CHECK-SAME: [%[[OFFSET0]], %[[OFFSET1]]] [%[[SIZE0]], %[[SIZE1]]] [1, 1]
-// CHECK: scf.yield %[[INSERT_SLICE]]
-// CHECK: return %[[FOR]]
diff --git a/mlir/test/lib/Interfaces/TilingInterface/TestTilingInterfaceTransformOps.cpp b/mlir/test/lib/Interfaces/TilingInterface/TestTilingInterfaceTransformOps.cpp
index 1e3d5371f1ea8..3d24d4ecc4d0d 100644
--- a/mlir/test/lib/Interfaces/TilingInterface/TestTilingInterfaceTransformOps.cpp
+++ b/mlir/test/lib/Interfaces/TilingInterface/TestTilingInterfaceTransformOps.cpp
@@ -13,7 +13,6 @@
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Index/IR/IndexDialect.h"
-#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Dialect/SCF/Transforms/TileUsingInterface.h"
#include "mlir/Dialect/Transform/IR/TransformAttrs.h"
#include "mlir/Dialect/Transform/IR/TransformDialect.h"
@@ -469,153 +468,6 @@ transform::TestTileAndFuseOuterParallelPartialReductionOp::apply(
: DiagnosedSilenceableFailure::success();
}
-//===----------------------------------------------------------------------===//
-// TestTileAndFuseOuterParallelPartialReduction
-//===----------------------------------------------------------------------===//
-
-DiagnosedSilenceableFailure transform::TestTileUsingCustomLoopOp::apply(
- TransformRewriter &transformRewriter, TransformResults &transformResults,
- TransformState &state) {
- auto target =
- dyn_cast<TilingInterface>(*state.getPayloadOps(getRootOp()).begin());
- if (!target) {
- emitOpError("expected root operation to implement `TilingInterface`");
- return DiagnosedSilenceableFailure::definiteFailure();
- }
-
- OpFoldResult oneOfr = transformRewriter.getIndexAttr(1);
-
- scf::SCFTilingOptions::GenerateLoopHeaderFn loopHeaderFn =
- [&](RewriterBase &rewriter, Location loc, ArrayRef<Range> loopRanges,
- ArrayRef<OpFoldResult> givenTileSizes,
- ValueRange outerDestinationTensors)
- -> FailureOr<scf::SCFTilingOptions::CustomLoopHeaderInfo> {
- // Check that the strides are all 1 (to make it easier in the test).
- if (llvm::any_of(loopRanges, [](Range r) {
- return !isConstantIntValue(r.stride, 1);
- })) {
- return emitOpError("unable to handle loop ranges with strides != 1");
- }
- // For testing disallow any of the tile sizes being 0.
- if (llvm::any_of(givenTileSizes, isZeroInteger)) {
- return emitOpError("unhandled case of zero tile size");
- }
- // For testing, only handle tensor tiling.
- if (outerDestinationTensors.empty()) {
- return emitOpError("expected destination tensors");
- }
-
- // Compute the number of iterations for each of the loops.
- AffineExpr s0, s1, s2;
- bindSymbols(rewriter.getContext(), s0, s1, s2);
- AffineExpr numItersExpr = (s1 - s0).ceilDiv(s2); // (ub - lb) / tileSize
-
- SmallVector<OpFoldResult> allNumIters;
- allNumIters.reserve(loopRanges.size());
- for (auto [loopRange, tileSize] :
- llvm::zip_equal(loopRanges, givenTileSizes)) {
- OpFoldResult numIters = affine::makeComposedFoldedAffineApply(
- rewriter, loc, numItersExpr,
- {loopRange.offset, loopRange.size, tileSize});
- allNumIters.push_back(numIters);
- }
- if (allNumIters.empty()) {
- return emitOpError("unhandled case where all tile sizes are zero");
- }
-
- AffineExpr mulExpr = s0 * s1;
- OpFoldResult cummulative = oneOfr;
- for (auto numIters : allNumIters) {
- cummulative = affine::makeComposedFoldedAffineApply(
- rewriter, loc, mulExpr, {cummulative, numIters});
- }
-
- Value zeroVal = arith::ConstantIndexOp::create(rewriter, loc, 0);
- Value oneVal = arith::ConstantIndexOp::create(rewriter, loc, 1);
- Value ub = getValueOrCreateConstantIndexOp(rewriter, loc, cummulative);
-
- SmallVector<OpFoldResult> offsets;
- SmallVector<OpFoldResult> sizes;
- SmallVector<Value> innerDestinationTensors;
- offsets.reserve(loopRanges.size());
- sizes.reserve(loopRanges.size());
-
- AffineExpr d0;
- bindDims(rewriter.getContext(), d0);
- AffineExpr offsetExpr = s0 + d0 * s1; // lb + iv * tileSize
- AffineMap minMap =
- AffineMap::get(1, 2, {s0 - d0, s1},
- rewriter.getContext()); // min(ub - offset, tileSize)
- auto forOp = scf::ForOp::create(
- rewriter, loc, zeroVal, ub, oneVal, outerDestinationTensors,
- [&](OpBuilder &b, Location bodyLoc, Value linearizedIv,
- ValueRange destinations) {
- auto delinearizeOp = affine::AffineDelinearizeIndexOp::create(
- b, bodyLoc, linearizedIv, allNumIters);
- for (auto [normalizedIv, range, tileSize] : llvm::zip_equal(
- delinearizeOp.getResults(), loopRanges, givenTileSizes)) {
-
- OpFoldResult normalizedIvOfr = getAsOpFoldResult(normalizedIv);
- OpFoldResult offset = affine::makeComposedFoldedAffineApply(
- b, bodyLoc, offsetExpr,
- {normalizedIvOfr, range.offset, tileSize});
- offsets.push_back(offset);
-
- OpFoldResult size = affine::makeComposedFoldedAffineMin(
- b, bodyLoc, minMap, {offset, range.size, tileSize});
- sizes.push_back(size);
- }
- innerDestinationTensors = llvm::to_vector(destinations);
- });
- rewriter.setInsertionPointToEnd(forOp.getBody());
- return scf::SCFTilingOptions::CustomLoopHeaderInfo{
- {cast<LoopLikeOpInterface>(forOp.getOperation())},
- offsets,
- sizes,
- innerDestinationTensors};
- };
-
- scf::SCFTilingOptions::GenerateLoopTerminatorFn terminatorFn =
- [&](RewriterBase &rewriter, Location loc, ValueRange tiledResults,
- ArrayRef<SmallVector<OpFoldResult>> resultOffsets,
- ArrayRef<SmallVector<OpFoldResult>> resultSizes,
- ValueRange destinationTensors) -> LogicalResult {
- SmallVector<Value> yieldValues;
- yieldValues.reserve(destinationTensors.size());
- for (auto [tiledResult, offsets, sizes, destination] : llvm::zip_equal(
- tiledResults, resultOffsets, resultSizes, destinationTensors)) {
- SmallVector<OpFoldResult> strides(offsets.size(), oneOfr);
- Value insertedVal = tensor::InsertSliceOp::create(
- rewriter, loc, tiledResult, destination, offsets, sizes, strides);
- yieldValues.push_back(insertedVal);
- }
- scf::YieldOp::create(rewriter, loc, yieldValues);
- return success();
- };
-
- scf::SCFTilingOptions tilingOptions;
- SmallVector<int64_t> staticTileSizes =
- extractFromIntegerArrayAttr<int64_t>(getTileSizes());
- SmallVector<OpFoldResult> tileSizes =
- getAsIndexOpFoldResult(transformRewriter.getContext(), staticTileSizes);
- tilingOptions.setTileSizes(tileSizes)
- .setLoopType(scf::SCFTilingOptions::LoopType::CustomOp)
- .setCustomLoopGenerationFns(loopHeaderFn, terminatorFn);
-
- OpBuilder::InsertionGuard g(transformRewriter);
- transformRewriter.setInsertionPoint(target);
- FailureOr<scf::SCFTilingResult> tiledResults =
- scf::tileUsingSCF(transformRewriter, target, tilingOptions);
- if (failed(tiledResults)) {
- return DiagnosedSilenceableFailure::definiteFailure();
- }
- transformRewriter.replaceOp(target, tiledResults->replacements);
- transformResults.set(getOperation()->getResult(0), tiledResults->tiledOps);
- transformResults.set(getOperation()->getResult(1), tiledResults->loops);
-
- return DiagnosedSilenceableFailure::success();
-}
-
#define GET_OP_CLASSES
#include "TestTilingInterfaceTransformOps.cpp.inc"
diff --git a/mlir/test/lib/Interfaces/TilingInterface/TestTilingInterfaceTransformOps.td b/mlir/test/lib/Interfaces/TilingInterface/TestTilingInterfaceTransformOps.td
index 694c4229eef62..58ccd30bb99a2 100644
--- a/mlir/test/lib/Interfaces/TilingInterface/TestTilingInterfaceTransformOps.td
+++ b/mlir/test/lib/Interfaces/TilingInterface/TestTilingInterfaceTransformOps.td
@@ -150,27 +150,4 @@ def TestTileAndFuseOuterParallelPartialReductionOp : Op<
}];
}
-def TestTileUsingCustomLoopOp : Op<
- Transform_Dialect, "test.tile_using_custom_loop",
- [FunctionalStyleTransformOpTrait, MemoryEffectsOpInterface,
- DeclareOpInterfaceMethods<TransformOpInterface>,
- ReportTrackingListenerFailuresOpTrait]> {
- let description = [{
- Test Transform op to tile an operation using custom loops.
-
- The test just folds all the loops and into a single loop and then
- delinearizes the indices.
- }];
-
- let arguments = (ins TransformHandleTypeInterface:$root_op,
- DefaultValuedAttr<I64ArrayAttr, "{}">:$tile_sizes);
- let results = (outs TransformHandleTypeInterface:$tiled_ops,
- Variadic<TransformHandleTypeInterface>:$loops);
-
- let assemblyFormat = [{
- $root_op `tile_sizes` `=` $tile_sizes
- attr-dict `:` functional-type(operands, results)
- }];
-}
-
#endif // TEST_TILINGINTERFACE_TRANSFORM_OPS
More information about the llvm-branch-commits
mailing list