[Mlir-commits] [mlir] [MLIR] Add continuous tiling to TileUsingForOp (PR #82792)

llvmlistbot at llvm.org llvmlistbot at llvm.org
Fri Feb 23 08:57:35 PST 2024


https://github.com/muneebkhan85 created https://github.com/llvm/llvm-project/pull/82792

This patch adds continuous tiling options to TileUsingForOp. In cases where the tensor dimensions are not exactly divisible by the tile size, we are left with leftover tensor chunks that are irregularly tiled. This approach attempts to tile the leftover chunk with a smaller tile size and repeats this process recursively using exponentially diminishing tile sizes. The transform eventually generates a chain of loops that apply tiling using diminishing tile sizes. The transform lowers from the linalg dialect to scf dialect.

>From 4c89934000a76923d4caedcae5734d4aa8fc99d2 Mon Sep 17 00:00:00 2001
From: Muneeb Khan <muneeb.khan at huawei.com>
Date: Sat, 24 Feb 2024 00:49:18 +0800
Subject: [PATCH] [MLIR] Add continuous tiling to TileUsingForOp

This patch adds continuous tiling options to TileUsingForOp.
In cases where the tensor dimensions are not exactly divisible
by the tile size, we are left with leftover tensor chunks that
are irregularly tiled. This approach attempts to tile the leftover
chunk with a smaller tile size and repeats this process recursively
using exponentially diminishing tile sizes. The transform eventually
generates a chain of loops that apply tiling using diminishing tile
sizes. The transform lowers from the linalg dialect to scf dialect.
---
 .../Linalg/TransformOps/LinalgTransformOps.td |   9 +-
 .../SCF/Transforms/TileUsingInterface.h       |  14 +
 .../TransformOps/LinalgTransformOps.cpp       |  40 +-
 .../SCF/Transforms/TileUsingInterface.cpp     | 457 ++++++++++++++++++
 .../mlir/dialects/transform/structured.py     |   4 +
 .../Dialect/Linalg/continuous-tiling.mlir     | 390 +++++++++++++++
 6 files changed, 905 insertions(+), 9 deletions(-)
 create mode 100644 mlir/test/Dialect/Linalg/continuous-tiling.mlir

diff --git a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
index 309573a562872f..4976c8e82db4df 100644
--- a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
+++ b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
@@ -1833,7 +1833,9 @@ def TileUsingForOp : Op<Transform_Dialect, "structured.tile_using_for",
     be as many handles as `ShapedType::kDynamic` values in the
     `static_sizes` attribute. A static size of `0` indicates that the dimension
     should not be tiled. No loop will be generated for such dimensions. If all
-    tile sizes are `0`, this transform is effectively a no-op.
+    tile sizes are `0`, this transform is effectively a no-op. To apply
+    continuous tiling, `continuous_tiles` needs to be supplied with as many
+    boolean values as there are nested loops.
 
     This op returns handles to the tiled op (in the generated loop nest) and the
     generated loops. The number of loops is the number of tile sizes that are
@@ -1859,6 +1861,7 @@ def TileUsingForOp : Op<Transform_Dialect, "structured.tile_using_for",
   let arguments = (ins TransformHandleTypeInterface:$target,
                    Variadic<TransformAnyParamTypeOrAnyHandle>:$dynamic_sizes,
                    DefaultValuedOptionalAttr<DenseI64ArrayAttr, "{}">:$static_sizes,
+                   DefaultValuedOptionalAttr<DenseBoolArrayAttr, "{}">:$continuous_tiles,
                    DefaultValuedOptionalAttr<DenseI64ArrayAttr, "{}">:$interchange,
                    DefaultValuedOptionalAttr<DenseBoolArrayAttr, "{}">:$scalable_sizes);
   let results = (outs TransformHandleTypeInterface:$tiled_linalg_op,
@@ -1867,22 +1870,26 @@ def TileUsingForOp : Op<Transform_Dialect, "structured.tile_using_for",
     OpBuilder<(ins "TypeRange":$loopTypes,
                    "Value":$target,
                    "ArrayRef<int64_t>":$staticTileSizes,
+                   CArg<"ArrayRef<bool>", "{}">:$continuousTiles,
                    CArg<"ArrayRef<int64_t>", "{}">:$interchange,
                    CArg<"std::optional<ArrayRef<bool>>", "std::nullopt">:
                       $scalableSizes)>,
     OpBuilder<(ins "TypeRange":$loopTypes,
                    "Value":$target,
                    "ArrayRef<OpFoldResult>":$mixedTileSizes,
+                   CArg<"ArrayRef<bool>", "{}">:$continuousTiles,
                    CArg<"ArrayRef<int64_t>", "{}">:$interchange,
                    CArg<"std::optional<ArrayRef<bool>>", "std::nullopt">:
                       $scalableSizes)>,
     OpBuilder<(ins "Value":$target,
                    "ArrayRef<int64_t>":$staticTileSizes,
+                   CArg<"ArrayRef<bool>", "{}">:$continuousTiles,
                    CArg<"ArrayRef<int64_t>", "{}">:$interchange,
                    CArg<"std::optional<ArrayRef<bool>>", "std::nullopt">:
                       $scalableSizes)>,
     OpBuilder<(ins "Value":$target,
                    "ArrayRef<OpFoldResult>":$mixedTileSizes,
+                   CArg<"ArrayRef<bool>", "{}">:$continuousTiles,
                    CArg<"ArrayRef<int64_t>", "{}">:$interchange,
                    CArg<"std::optional<ArrayRef<bool>>", "std::nullopt">:
                       $scalableSizes)>,
diff --git a/mlir/include/mlir/Dialect/SCF/Transforms/TileUsingInterface.h b/mlir/include/mlir/Dialect/SCF/Transforms/TileUsingInterface.h
index 965ef9e203be28..b40291b5a80da5 100644
--- a/mlir/include/mlir/Dialect/SCF/Transforms/TileUsingInterface.h
+++ b/mlir/include/mlir/Dialect/SCF/Transforms/TileUsingInterface.h
@@ -71,6 +71,14 @@ struct SCFTilingOptions {
         mapping, [](auto attr) -> Attribute { return attr; });
     return *this;
   }
+
+  /// Specify which loops in the loop nest are to be continuously tiled.
+  SmallVector<bool> continuousTileMappingVector = {};
+  SCFTilingOptions &setCTileMapping(ArrayRef<bool> ctile) {
+    continuousTileMappingVector =
+        llvm::map_to_vector(ctile, [](auto attr) -> bool { return attr; });
+    return *this;
+  }
 };
 
 /// Transformation information returned after tiling.
@@ -92,6 +100,12 @@ FailureOr<SCFTilingResult> tileUsingSCF(RewriterBase &rewriter,
                                         TilingInterface op,
                                         const SCFTilingOptions &options);
 
+/// Method to continuously tile an op that implements the `TilingInterface`
+/// using `scf.for` for iterating over the tiles.
+FailureOr<SCFTilingResult>
+continuousTileUsingSCF(RewriterBase &rewriter, TilingInterface op,
+                            const SCFTilingOptions &options);
+
 /// Options used to control tile + fuse.
 struct SCFTileAndFuseOptions {
   /// The tiling options used to control the tiling of the consumer.
diff --git a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
index 299965bcfc3ab3..75080ea7721beb 100644
--- a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
+++ b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
@@ -2476,39 +2476,41 @@ DiagnosedSilenceableFailure transform::TileReductionUsingForallOp::applyToOne(
 void transform::TileUsingForOp::build(
     OpBuilder &builder, OperationState &result, TypeRange loopTypes,
     Value target, ArrayRef<int64_t> staticTileSizes,
-    ArrayRef<int64_t> interchange,
+    ArrayRef<bool> continuousTiles, ArrayRef<int64_t> interchange,
     std::optional<ArrayRef<bool>> scalableSizes) {
   return build(builder, result, loopTypes,
                /*target=*/target,
                /*mixedTileSizes=*/
                getAsOpFoldResult(builder.getI64ArrayAttr(staticTileSizes)),
-               interchange, scalableSizes);
+               continuousTiles, interchange, scalableSizes);
 }
 
 void transform::TileUsingForOp::build(
     OpBuilder &builder, OperationState &result, Value target,
-    ArrayRef<int64_t> staticTileSizes, ArrayRef<int64_t> interchange,
+    ArrayRef<int64_t> staticTileSizes, ArrayRef<bool> continuousTiles,
+    ArrayRef<int64_t> interchange,
     std::optional<ArrayRef<bool>> scalableSizes) {
   build(builder, result, target,
         getAsOpFoldResult(builder.getI64ArrayAttr(staticTileSizes)),
-        interchange, scalableSizes);
+        builder.getDenseBoolArrayAttr(continuousTiles), interchange, scalableSizes);
 }
 
 void transform::TileUsingForOp::build(
     OpBuilder &builder, OperationState &result, Value target,
-    ArrayRef<OpFoldResult> mixedTileSizes, ArrayRef<int64_t> interchange,
+    ArrayRef<OpFoldResult> mixedTileSizes, ArrayRef<bool> continuousTiles,
+    ArrayRef<int64_t> interchange,
     std::optional<ArrayRef<bool>> scalableSizes) {
   // Loop types are automaticaly splat by the callee, setting up one is
   // enough.
   SmallVector<Type> loopTypes(1, builder.getType<transform::AnyOpType>());
-  build(builder, result, loopTypes, target, mixedTileSizes, interchange,
+  build(builder, result, loopTypes, target, mixedTileSizes, continuousTiles, interchange,
         scalableSizes);
 }
 
 void transform::TileUsingForOp::build(
     OpBuilder &builder, OperationState &result, TypeRange loopTypes,
     Value target, ArrayRef<OpFoldResult> mixedTileSizes,
-    ArrayRef<int64_t> interchange,
+    ArrayRef<bool> continuousTiles, ArrayRef<int64_t> interchange,
     std::optional<ArrayRef<bool>> scalableSizes) {
   SmallVector<int64_t> staticTileSizes;
   SmallVector<Value> dynamicTileSizes;
@@ -2517,6 +2519,7 @@ void transform::TileUsingForOp::build(
   // attributes for multiple variadic operands. In the absence of this,
   // horrible bugs ensue.
   auto staticTileSizesAttr = builder.getDenseI64ArrayAttr(staticTileSizes);
+  auto continuousTilesAttr = builder.getDenseBoolArrayAttr(continuousTiles);
   unsigned numExpectedLoops =
       staticTileSizes.size() - llvm::count(staticTileSizes, 0);
   SmallVector<Type> resultTypes;
@@ -2535,6 +2538,7 @@ void transform::TileUsingForOp::build(
         /*target=*/target,
         /*dynamic_sizes=*/dynamicTileSizes,
         /*static_sizes=*/staticTileSizesAttr,
+        /*continuous_tiles=*/continuousTilesAttr,
         /*interchange=*/builder.getDenseI64ArrayAttr(interchange),
         /*scalable_sizes=*/expandedScalableSizes);
 }
@@ -2675,8 +2679,15 @@ transform::TileUsingForOp::apply(transform::TransformRewriter &rewriter,
     }
 
     tilingOptions.setInterchange(getInterchange());
-    FailureOr<scf::SCFTilingResult> maybeTilingResult =
+    tilingOptions.setCTileMapping(getContinuousTiles());
+
+    FailureOr<scf::SCFTilingResult> maybeTilingResult;
+    if (tilingOptions.continuousTileMappingVector.empty())
+      maybeTilingResult =
         tileUsingSCF(rewriter, tilingInterface, tilingOptions);
+    else
+      maybeTilingResult =
+          continuousTileUsingSCF(rewriter, tilingInterface, tilingOptions);
     if (failed(maybeTilingResult))
       return DiagnosedSilenceableFailure::definiteFailure();
 
@@ -2725,6 +2736,18 @@ ParseResult parseOptionalInterchange(OpAsmParser &parser,
   return success();
 }
 
+ParseResult parseOptionalContinuousTiles(OpAsmParser &parser,
+                                         OperationState &result) {
+  if (failed(parser.parseOptionalKeyword("continuous_tiles")))
+    return success();
+  if (failed(parser.parseEqual()))
+    return failure();
+  result.addAttribute(
+      transform::TileUsingForOp::getContinuousTilesAttrName(result.name),
+      DenseBoolArrayAttr::parse(parser, Type{}));
+  return success();
+}
+
 void printOptionalInterchange(OpAsmPrinter &p,
                               ArrayRef<int64_t> interchangeVals) {
   if (!interchangeVals.empty()) {
@@ -2747,6 +2770,7 @@ ParseResult transform::TileUsingForOp::parse(OpAsmParser &parser,
   if (parser.parseOperand(target) || parser.getCurrentLocation(&operandLoc) ||
       parseDynamicIndexList(parser, dynamicSizes, staticSizes, scalableVals) ||
       parseOptionalInterchange(parser, result) ||
+      parseOptionalContinuousTiles(parser, result) ||
       parser.parseOptionalAttrDict(result.attributes) ||
       parser.parseColonType(functionalType))
     return ParseResult::failure();
diff --git a/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp b/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp
index 1a84a59ddb69df..f81a901c82db99 100644
--- a/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp
+++ b/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp
@@ -31,6 +31,8 @@
 
 using namespace mlir;
 
+static constexpr char kLoopIndexLabel[] = "__loop_index__";
+
 scf::SCFTilingOptions &
 scf::SCFTilingOptions::setTileSizes(ArrayRef<OpFoldResult> ts) {
   assert(!tileSizeComputationFunction && "tile sizes already set");
@@ -309,6 +311,189 @@ static LogicalResult generateLoopNest(RewriterBase &rewriter, Location loc,
   return rewriter.notifyMatchFailure(loc, "unhandled loop type");
 }
 
+static void continuousLoopNestHelper(
+    OpBuilder &builder, Location loc, ArrayRef<Range> loopRanges,
+    SmallVector<LoopLikeOpInterface> &loops, uint64_t loopLevelIdx, uint64_t &loopIdx,
+    ArrayRef<OpFoldResult> tileSizes, SmallVector<bool> &CTileVector,
+    std::map<int, OpFoldResult> &sizesMap,
+    SmallVector<scf::ForOp> &innermostLoops, ValueRange destinationTensors = {},
+    bool isHeadOrInsideHeadLoop = false) {
+
+  Value offset = getValueOrCreateConstantIndexOp(
+      builder, loc, loopRanges[loopLevelIdx].offset);
+  Value size = getValueOrCreateConstantIndexOp(builder, loc,
+                                               loopRanges[loopLevelIdx].size);
+  Value tileSize =
+      getValueOrCreateConstantIndexOp(builder, loc, tileSizes[loopLevelIdx]);
+
+  AffineExpr sym0, sym1, sym2;
+  bindSymbols(builder.getContext(), sym0, sym1, sym2);
+  AffineMap defaultSplitMap =
+      AffineMap::get(0, 3, {sym1 - ((sym1 - sym0) % sym2)});
+  // Simplified map for use when step is power of 2 and lower bound
+  // is exactly divisible by step.
+  AffineMap powerSplitMap = AffineMap::get(0, 3, {sym1 - (sym1 % sym2)});
+
+  uint64_t tileSizeInt = *getConstantIntValue(tileSize);
+
+  // Enforce no tiling when tile size is zero.
+  // No need to create a loop here.
+  if (tileSizeInt == 0) {
+    continuousLoopNestHelper(builder, loc, loopRanges, loops, loopLevelIdx + 1,
+                             loopIdx, tileSizes, CTileVector, sizesMap,
+                             innermostLoops, destinationTensors,
+                             isHeadOrInsideHeadLoop);
+    return;
+  }
+
+  // The head loop is always tiled using the tile size specified
+  // in the size parameters to tile_using_for transform.
+  auto loop = builder.create<scf::ForOp>(
+      loc, offset, size, tileSize, destinationTensors,
+      [&](OpBuilder &bodyBuilder, Location bodyLoc, Value iv,
+          ValueRange /*iterArgs*/) {
+        sizesMap[loopIdx] =
+            getBoundedTileSize(bodyBuilder, bodyLoc, loopRanges[loopLevelIdx],
+                               iv, getAsOpFoldResult(tileSize));
+      });
+
+  loop->setAttr(kLoopIndexLabel, builder.getIndexAttr(loopIdx));
+  ++loopIdx;
+
+  scf::ForOp currentLoop = loop;
+  auto lbInt = getConstantIntValue(currentLoop.getLowerBound());
+  // Use simplified powerSplitMap instead of the default when possible.
+  bool usePowerSplit = (lbInt.has_value()) &&
+                       (*lbInt % tileSizeInt == static_cast<int64_t>(0)) &&
+                       (tileSizeInt == llvm::bit_floor(tileSizeInt));
+
+  AffineMap splitMap = usePowerSplit ? powerSplitMap : defaultSplitMap;
+
+  bool isInnermostLoop = loopLevelIdx == loopRanges.size() - 1;
+  if (isInnermostLoop)
+    innermostLoops.push_back(currentLoop);
+
+  if (isHeadOrInsideHeadLoop)
+    loops.push_back(loop);
+
+  builder.setInsertionPointToEnd(loop.getBody());
+
+  // Create the nested loop inside current loop.
+  if (!isInnermostLoop)
+    continuousLoopNestHelper(builder, loop->getLoc(), loopRanges, loops,
+                             loopLevelIdx + 1, loopIdx, tileSizes, CTileVector,
+                             sizesMap, innermostLoops, loop.getRegionIterArgs(),
+                             isHeadOrInsideHeadLoop);
+
+  // Apply continuous tiling to current loop if continuous_tiles
+  // specifies so.
+  while (CTileVector[loopLevelIdx] && tileSizeInt > 1) {
+
+    uint64_t maxPower = llvm::bit_floor(tileSizeInt);
+    tileSizeInt = maxPower == tileSizeInt ? maxPower >> 1 : maxPower;
+
+    builder.setInsertionPoint(currentLoop);
+
+    auto constStepOp = builder.create<arith::ConstantIndexOp>(loc, tileSizeInt);
+
+    Value splitBound = builder.createOrFold<affine::AffineApplyOp>(
+        loc, splitMap,
+        ValueRange{currentLoop.getLowerBound(), currentLoop.getUpperBound(),
+                   currentLoop.getStep()});
+
+    builder.setInsertionPointAfter(currentLoop);
+    auto additionalLoop =
+        builder.create<scf::ForOp>(currentLoop->getLoc(), splitBound, size,
+                                   constStepOp, destinationTensors);
+
+    additionalLoop.getInitArgsMutable().assign(currentLoop->getResults());
+    currentLoop.getUpperBoundMutable().assign(splitBound);
+
+    builder.setInsertionPointToStart(additionalLoop.getBody());
+    AffineExpr s0, s1, d0;
+    bindDims(builder.getContext(), d0);
+    bindSymbols(builder.getContext(), s0, s1);
+    AffineMap minMap = AffineMap::get(1, 1, {s0}, builder.getContext());
+    auto additionalLoopAffineMin = affine::makeComposedAffineMin(
+        builder, loc, minMap,
+        SmallVector<OpFoldResult>{splitBound, getAsOpFoldResult(constStepOp),
+                                  size});
+
+    currentLoop = additionalLoop;
+
+    sizesMap[loopIdx] = getAsOpFoldResult(additionalLoopAffineMin);
+
+    // Add custom loop-indexing attribute to each loop op.
+    // Continuous tiling ends up generating many loop nestings and
+    // each loop can be identified with its loop-index attribute.
+    // This is needed later to retrieve the sizes from sizesMap.
+    currentLoop->setAttr(kLoopIndexLabel, builder.getIndexAttr(loopIdx));
+
+    ++loopIdx;
+
+    if (isInnermostLoop)
+      innermostLoops.push_back(currentLoop);
+
+    builder.setInsertionPointToEnd(currentLoop.getBody());
+
+    // Create the nested loop inside current loop.
+    if (!isInnermostLoop)
+      continuousLoopNestHelper(builder, currentLoop->getLoc(), loopRanges,
+                               loops, loopLevelIdx + 1, loopIdx, tileSizes,
+                               CTileVector, sizesMap, innermostLoops,
+                               currentLoop.getRegionIterArgs());
+  }
+
+  // Always yield the result of the tail-end loop as this
+  // will have all the processed tiles.
+  if (!isa<func::ReturnOp>(currentLoop->getBlock()->back())) {
+    builder.setInsertionPointToEnd(currentLoop->getBlock());
+    builder.create<scf::YieldOp>(currentLoop.getLoc(),
+                                 currentLoop.getResults());
+  }
+  /// For the outermost loop insert the tail-end loop in front of loops
+  /// structure so that its results can be used for replacements in the
+  /// function return. This is removed from the head of loops later.
+  else
+    loops.insert(loops.begin(), currentLoop);
+
+  destinationTensors = loop.getRegionIterArgs();
+}
+
+/// Generate an empty loop nest that represents the continuous-tiled loop nest
+/// shell.
+/// - `loopRanges` specifies the lb, ub and step of the untiled iteration space.
+/// - `tileSizes` is the tile sizes to use in the first tiling attempt. Zero
+/// represent untiled loops.
+/// - In `sizesMap` return the multi-dimensional size of
+///   the tile processed within the innermost loop.
+/// - `CTileVector` specifies which loop nest should be continuously tiled.
+/// Note that this methods adds `scf.yield` operation for all but the innermost
+/// loop. These yield the value returned by the immediately inner tail-end loop.
+/// The caller is expected to add the scf.yield operation for all innermost
+/// loops.
+static SmallVector<LoopLikeOpInterface> generateContinuousTileLoopNest(
+    OpBuilder &builder, Location loc, ArrayRef<Range> loopRanges,
+    ArrayRef<OpFoldResult> tileSizes, SmallVector<bool> CTileVector,
+    std::map<int, OpFoldResult> &sizesMap,
+    SmallVector<scf::ForOp> &innermostLoops,
+    ValueRange destinationTensors = {}) {
+  if (loopRanges.empty())
+    return {};
+  assert(loopRanges.size() == tileSizes.size() &&
+         "expected as many tile sizes as loop ranges");
+  OpBuilder::InsertionGuard guard(builder);
+  SmallVector<LoopLikeOpInterface> loops;
+
+  uint64_t loopIdx = 0;
+  continuousLoopNestHelper(builder, loc, loopRanges, loops, 0, loopIdx,
+                           tileSizes, CTileVector, sizesMap, innermostLoops,
+                           destinationTensors, true);
+
+  return loops;
+}
+
+
 /// Append the specified additional `newInitOperands` operands to the
 /// loops existing `init` operands (or similar), and replace `loopOp` with
 /// the new loop that has the additional init operands. The loop body of
@@ -679,6 +864,278 @@ mlir::scf::tileUsingSCF(RewriterBase &rewriter, TilingInterface op,
   return scf::SCFTilingResult{tilingResult->tiledOps, loops, replacements};
 }
 
+/// Implementation of continuous-tiling transformation of `op`
+/// that implements the `TilingInterface` using a sequence of
+/// `scf.for` loops to iterate over tiles of exponentially
+/// diminishing sizes.
+///
+/// The generated sequence of `scf.for` loops iterate over tiles of
+/// exponentially diminishing sizes as opposed to vanilla tiling scheme where
+/// only the tile sizes specified as transform parameters are used.
+/// continuous-tiling first applies regular tiling using the specified
+/// tile size, then halves the tile size and uses it to tile the leftover
+/// chunk. This process of halving the tile size and tiling the leftover
+/// chunk is repeated until tile size reaches 1. The transform parameter
+/// continuous_tiles controls which nested loop should be tiled. If all
+/// arguments in continuous_tiles are set to false, the result is identical
+/// to vanilla tiling transform.
+///
+/// When tiling a tensor of size M with tile size 8, it generates
+/// the following loop to tile
+///
+/// for (int i = 0; i < M; i += 8) {
+///   int size = min(8, M-i);
+///   // size is unknown at compile time because M is dynamic
+///   // compute using dynamic size (not optimal)
+/// }
+///
+/// In case of continuous tiling the above loop is converted to a chain of
+/// loops that attempt to tile the cases where (M-i < 8) using smaller
+/// tile sizes, as follows
+///
+/// // use tile size of 8 in the head loop as originally used
+/// for (int i = 0; i != (M - M % 8); i += 8) {
+///   int size = 8;
+///   // size is constant the tail is moved to the next loop
+///   // compute using static size (optimal)
+/// }
+/// // use halved tile size 4 to tile the remaining chunk
+/// for (int i = (M - M % 8); i != (M - M % 4); i += 4)
+///   int size = 4;
+/// // use halved tile size 2 to tile the remaining chunk
+/// for (int i = (M - M % 4); i != (M - M % 2); i += 2)
+///   int size = 2;
+/// // use halved tile size 1 to tile the remaining chunk
+/// for (int i = (M - M % 2); i != M; i += 1)
+///   int size = 1;
+///
+/// All tail-loops can be converted to IFs as they execute at-most once.
+///
+FailureOr<scf::SCFTilingResult>
+mlir::scf::continuousTileUsingSCF(RewriterBase &rewriter,
+                                       TilingInterface op,
+                                       const scf::SCFTilingOptions &options) {
+  OpBuilder::InsertionGuard guard(rewriter);
+  rewriter.setInsertionPointAfter(op);
+
+  std::map<int, OpFoldResult> sizesMap;
+
+  if (!options.tileSizeComputationFunction) {
+    return rewriter.notifyMatchFailure(
+        op, "missing tile size computation function");
+  }
+
+  // 1. Get the range of the loops that are represented by the operation.
+  SmallVector<Range> iterationDomain = op.getIterationDomain(rewriter);
+  size_t numLoops = iterationDomain.size();
+  if (numLoops == 0) {
+    return rewriter.notifyMatchFailure(
+        op, "unable to tile op with no iteration domain");
+  }
+  // 2. Materialize the tile sizes. Enforce the convention that "tiling by zero"
+  // skips tiling a particular dimension. This convention is significantly
+  // simpler to handle instead of adjusting affine maps to account for missing
+  // dimensions.
+  SmallVector<OpFoldResult> tileSizeVector =
+      options.tileSizeComputationFunction(rewriter, op);
+  if (tileSizeVector.size() < iterationDomain.size()) {
+    auto zero = rewriter.getIndexAttr(0);
+    tileSizeVector.append(numLoops - tileSizeVector.size(), zero);
+  }
+
+  // 3. Find the destination tensors to use for the operation.
+  SmallVector<Value> destinationTensors;
+  if (failed(tensor::getOrCreateDestinations(rewriter, op.getLoc(), op,
+                                             destinationTensors))) {
+    return rewriter.notifyMatchFailure(op,
+                                       "unable to create destination tensors");
+  }
+
+  SmallVector<OpFoldResult> offsets, sizes;
+  SmallVector<LoopLikeOpInterface> forLoops;
+  // Several innermost loops are created when applying continuous tiling.
+  // All innermost loops need to be processed in an identical manner.
+  SmallVector<scf::ForOp> innermostLoops;
+  {
+    // If there is an interchange specified, permute the iteration domain and
+    // the tile sizes.
+    SmallVector<int64_t> interchangeVector;
+    if (!options.interchangeVector.empty()) {
+      interchangeVector = fillInterchangeVector(options.interchangeVector,
+                                                iterationDomain.size());
+
+      if (!isPermutationVector(interchangeVector)) {
+        return rewriter.notifyMatchFailure(
+            op, "invalid intechange vector, not a permutation of the entire "
+                "iteration space");
+      }
+
+      applyPermutationToVector(iterationDomain, interchangeVector);
+      applyPermutationToVector(tileSizeVector, interchangeVector);
+    }
+
+    // Specify which loop nesting should be continuously tiled.
+    SmallVector<bool> cTileVector = options.continuousTileMappingVector;
+
+    if (cTileVector.size() < iterationDomain.size())
+      cTileVector.append(numLoops - cTileVector.size(), false);
+
+    offsets.resize(iterationDomain.size());
+    sizes.resize(iterationDomain.size());
+
+    // 4. Materialize an empty loop nest that iterates over the tiles. These
+    // loops for now do not return any values even if the original operation has
+    // results.
+    forLoops = generateContinuousTileLoopNest(
+        rewriter, op.getLoc(), iterationDomain, tileSizeVector, cTileVector,
+        sizesMap, innermostLoops, destinationTensors);
+  }
+
+  LLVM_DEBUG({
+    if (!forLoops.empty()) {
+      llvm::dbgs() << "LoopNest shell :\n";
+      forLoops.front()->getBlock()->dump();
+      llvm::dbgs() << "\n";
+    }
+  });
+
+  TilingResult origTiledImplementation;
+
+  // 5) Generate the tiled implementation within all inner-most loops
+  // and 6) yield results for all inner-most loops.
+  for (scf::ForOp &innermostLoop : innermostLoops) {
+
+    // 5. Generate the tiled implementation within the inner most loop.
+    SmallVector<Value> clonedOpDestination = destinationTensors;
+    if (!forLoops.empty()) {
+      rewriter.setInsertionPointToEnd(innermostLoop.getBody());
+      clonedOpDestination =
+          llvm::map_to_vector(innermostLoop.getRegionIterArgs(),
+                              [](BlockArgument b) -> Value { return b; });
+    }
+
+    // 5a. Clone the operation within the loop body.
+    auto clonedOp = cast<TilingInterface>(
+        cloneOpAndUpdateDestinationArgs(rewriter, op, clonedOpDestination));
+
+    // Compute offsets and sizes for each loop nest combination separately.
+    int loopLevelIdx = iterationDomain.size() - 1;
+    Operation *loopOp = innermostLoop.getOperation();
+
+    // Compute the correct offsets and sizes for each loop nesting (note
+    // that there are possibly several different loop nestings with continuous
+    // tiling enabled) before supplying them to `getTiledImplementation`.
+    while (loopOp) {
+
+      Value tileSize = getValueOrCreateConstantIndexOp(
+          rewriter, op.getLoc(), tileSizeVector[loopLevelIdx]);
+
+      // No need to compute new offsets and sizes when tile size is zero.
+      if (matchPattern(tileSize, m_Zero())) {
+        offsets[loopLevelIdx] = iterationDomain[loopLevelIdx].offset;
+        sizes[loopLevelIdx] = iterationDomain[loopLevelIdx].size;
+        --loopLevelIdx;
+        continue;
+      }
+
+      Value offset = dyn_cast<scf::ForOp>(loopOp).getInductionVar();
+      offsets[loopLevelIdx] = offset;
+
+      // Get the loop index for retrieving sizes info from sizesMap
+      // using the loop attribute `kLoopIndexLabel`.
+      uint64_t currLoopIdxLabel =
+          loopOp->getAttrOfType<IntegerAttr>(kLoopIndexLabel).getInt();
+      sizes[loopLevelIdx] = sizesMap[currLoopIdxLabel];
+
+      loopOp = loopOp->getParentOfType<scf::ForOp>();
+      --loopLevelIdx;
+    }
+
+    // 5b. Early return cloned op if tiling is not happening. We can not return
+    // the original op because it could lead to
+    // `rewriter.replaceOp(op, op->getResults())` and user would get crash.
+    if (llvm::all_of(tileSizeVector, isZeroIndex)) {
+      return scf::SCFTilingResult{/*tiledOps=*/{clonedOp}, /*loops=*/{},
+                                  clonedOp->getResults()};
+    }
+
+    // 5c. Tile the cloned operation.
+    FailureOr<TilingResult> tiledImplementation =
+        clonedOp.getTiledImplementation(rewriter, offsets, sizes);
+    if (failed(tiledImplementation)) {
+      return rewriter.notifyMatchFailure(op, "failed to tile operation");
+    }
+
+    origTiledImplementation.tiledOps.append(tiledImplementation->tiledOps);
+
+    // 5d. Delete the cloned operation.
+    rewriter.eraseOp(clonedOp);
+
+    // If loops are empty, the tiled op is used as the replacement for the
+    // untiled op.
+    if (forLoops.empty()) {
+      return scf::SCFTilingResult{tiledImplementation->tiledOps,
+                                  forLoops,
+                                  tiledImplementation->tiledValues};
+    }
+
+    if (op->getNumResults() == 0) {
+      // The innermost loop does not have a `scf.yield` yet. There is nothing to
+      // return, so generate an empty `scf.yield` operation.
+      rewriter.setInsertionPointToEnd(innermostLoop.getBody());
+      rewriter.create<scf::YieldOp>(op->getLoc());
+    }
+
+    // 6. Yield all the results of the tiled operation.
+    int64_t numResults = op->getNumResults();
+    SmallVector<SmallVector<OpFoldResult>> resultOffsetsList(numResults),
+        resultSizesList(numResults);
+    SmallVector<Value> yieldedValues;
+    for (auto [index, tiledValue] :
+         llvm::enumerate(tiledImplementation->tiledValues)) {
+      SmallVector<OpFoldResult> resultOffsets, resultSizes;
+      if (failed(op.getResultTilePosition(rewriter, index, offsets, sizes,
+                                          resultOffsets, resultSizes))) {
+        return rewriter.notifyMatchFailure(
+            op, "failed to get slice of result produced");
+      }
+      SmallVector<OpFoldResult> resultStrides(resultOffsets.size(),
+                                              rewriter.getIndexAttr(1));
+      auto insertSlice = rewriter.create<tensor::InsertSliceOp>(
+          op->getLoc(), tiledValue, clonedOpDestination[index], resultOffsets,
+          resultSizes, resultStrides);
+      yieldedValues.push_back(insertSlice);
+    }
+    rewriter.create<scf::YieldOp>(op->getLoc(), yieldedValues);
+  }
+
+  SmallVector<Value> replacements = llvm::map_to_vector(
+      forLoops.front()->getResults(), [](OpResult r) -> Value { return r; });
+  LLVM_DEBUG({
+    if (!forLoops.empty()) {
+      llvm::dbgs() << "After tiled implementation :\n";
+      forLoops.front().dump();
+      llvm::dbgs() << "\n";
+    }
+  });
+
+  /// Remove outermost tailend loop as its only use was to compute
+  /// replacements.
+  forLoops.erase(forLoops.begin());
+
+  Block *pBlock = forLoops.front()->getBlock();
+
+  /// Remove custom loop-indexing attribute from all loops.
+  pBlock->walk([](ForOp loopOp) {
+    if (loopOp->hasAttr(kLoopIndexLabel))
+      loopOp->removeAttr(kLoopIndexLabel);
+  });
+
+  return scf::SCFTilingResult{
+      origTiledImplementation.tiledOps, forLoops,
+      op->getNumResults() == 0 ? SmallVector<Value>({}) : replacements};
+}
+
 FailureOr<scf::SCFReductionTilingResult>
 mlir::scf::tileReductionUsingScf(RewriterBase &b,
                                  PartialReductionOpInterface op,
diff --git a/mlir/python/mlir/dialects/transform/structured.py b/mlir/python/mlir/dialects/transform/structured.py
index d7b41c0bd2207d..9a372ee7eaa53e 100644
--- a/mlir/python/mlir/dialects/transform/structured.py
+++ b/mlir/python/mlir/dialects/transform/structured.py
@@ -456,6 +456,7 @@ def __init__(
         target: Union[Operation, Value],
         *,
         sizes: Optional[Union[DynamicIndexList, ArrayAttr]] = None,
+        continuous_tiles: OptionalBoolList = None,
         interchange: OptionalIntList = None,
         loc=None,
         ip=None,
@@ -468,6 +469,7 @@ def __init__(
         target: Union[Operation, Value, OpView],
         *,
         sizes: Optional[Union[DynamicIndexList, ArrayAttr]] = None,
+        continuous_tiles: OptionalBoolList = None,
         interchange: OptionalIntList = None,
         loc=None,
         ip=None,
@@ -480,6 +482,7 @@ def __init__(
         target_or_none: Optional[Union[Operation, Value, OpView]] = None,
         *,
         sizes: Optional[Union[DynamicIndexList, ArrayAttr]] = None,
+        continuous_tiles: OptionalBoolList = None,
         interchange: OptionalIntList = None,
         loc=None,
         ip=None,
@@ -514,6 +517,7 @@ def __init__(
             static_sizes=static_sizes,
             interchange=interchange,
             scalable_sizes=scalable_sizes,
+            continuous_tiles=continuous_tiles,
             loc=loc,
             ip=ip,
         )
diff --git a/mlir/test/Dialect/Linalg/continuous-tiling.mlir b/mlir/test/Dialect/Linalg/continuous-tiling.mlir
new file mode 100644
index 00000000000000..3e8f2afdab0213
--- /dev/null
+++ b/mlir/test/Dialect/Linalg/continuous-tiling.mlir
@@ -0,0 +1,390 @@
+// RUN: mlir-opt --transform-interpreter --split-input-file %s | FileCheck %s
+
+// This applies continuous tiling to the innermost loop in linalg.matmul.
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    %1, %loops:3 = transform.structured.tile_using_for %0 [4, 4, 4] continuous_tiles=[false, false, true] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
+    transform.yield
+  }
+}
+
+// CHECK: #[[$MAP0:.+]] = affine_map<(d0) -> (4, -d0 + 130)>
+// CHECK: #{{.*}} = affine_map<(d0) -> (d0 - 1)>
+// CHECK: #[[$MAP2:.+]] = affine_map<() -> (2)>
+// CHECK: #[[$MAP1:.+]] = affine_map<() -> (1)>
+
+
+// CHECK-LABEL: @tile_linalg_matmul
+// CHECK-SAME: %[[IN0:.+]]: tensor<130x130xf32>, %[[IN1:.+]]: tensor<130x130xf32>, %[[OUT:.+]]: tensor<130x130xf32>
+func.func @tile_linalg_matmul(
+  %arg0: tensor<130x130xf32>, %arg1: tensor<130x130xf32>, %arg2: tensor<130x130xf32>)
+    -> tensor<130x130xf32> {
+  %0 = linalg.matmul  ins(%arg0, %arg1: tensor<130x130xf32>, tensor<130x130xf32>)
+                     outs(%arg2: tensor<130x130xf32>)
+    -> tensor<130x130xf32>
+
+  return %0 : tensor<130x130xf32>
+}
+
+// CHECK:    %[[C0:.+]] = arith.constant 0
+// CHECK:    %[[C130:.+]] = arith.constant 130 : index
+// CHECK:    %[[C4:.+]] = arith.constant 4 : index
+// CHECK:    %[[RES0:.+]] = scf.for %[[IV0:.+]] = %[[C0]] to %[[C130]] step %[[C4]] iter_args(%[[OUTARG:.+]] = %[[OUT]]) -> (tensor<130x130xf32>)
+// CHECK:      %[[AM0:.+]] = affine.min #[[$MAP0]](%[[IV0]])
+// CHECK:      {{.*}} = scf.for %[[IV1:.+]] = %[[C0]]{{.*}} to %[[C130]]{{.*}} step %[[C4]]{{.*}} iter_args(%[[OUTARGI:.+]] = %[[OUTARG]]) -> (tensor<130x130xf32>) {
+// CHECK:        %[[AM1:.+]] = affine.min #[[$MAP0]](%[[IV1]])
+// CHECK:        %[[C2:.+]] = arith.constant 2 : index
+// CHECK:        %[[C128:.+]] = arith.constant 128 : index
+// CHECK:        %[[L2O0:.+]] = scf.for %[[IV2:.+]] = %[[C0]]{{.*}} to %[[C128]] step %[[C4]]{{.*}} iter_args(%[[OUTARG0:.+]] = %[[OUTARGI]]) -> (tensor<130x130xf32>) {
+// CHECK:          %[[AM2:.+]] = affine.min #[[$MAP0]](%[[IV2]])
+// CHECK:          %[[XSIN0:.+]] = tensor.extract_slice %[[IN0]][%[[IV0]], %[[IV2]]] [%[[AM0]], %[[AM2]]] [1, 1] : tensor<130x130xf32> to tensor<?x?xf32>
+// CHECK:          %[[XSIN1:.+]] = tensor.extract_slice %[[IN1]][%[[IV2]], %[[IV1]]] [%[[AM2]], %[[AM1]]] [1, 1] : tensor<130x130xf32> to tensor<?x?xf32>
+// CHECK:          %[[XSOUT:.+]] = tensor.extract_slice %[[OUTARG0]][%[[IV0]], %[[IV1]]] [%[[AM0]], %[[AM1]]] [1, 1] : tensor<130x130xf32> to tensor<?x?xf32>
+// CHECK:          %[[MMRES0:.+]] = linalg.matmul ins(%[[XSIN0]], %[[XSIN1]] : tensor<?x?xf32>, tensor<?x?xf32>) outs(%[[XSOUT]] : tensor<?x?xf32>) -> tensor<?x?xf32>
+// CHECK:          %[[INSL0:.+]] = tensor.insert_slice %[[MMRES0]] into %[[OUTARG0]][%[[IV0]], %[[IV1]]] [%[[AM0]], %[[AM1]]] [1, 1] : tensor<?x?xf32> into tensor<130x130xf32>
+// CHECK:          scf.yield %[[INSL0]] : tensor<130x130xf32>
+// CHECK:        %[[C1:.+]] = arith.constant 1 : index
+// CHECK:        %[[L2O1:.+]] = scf.for %[[IV2]] = %[[C128]] to %[[C130]]{{.*}} step %[[C2]] iter_args(%[[OUTARG1:.+]] = %[[L2O0]]) -> (tensor<130x130xf32>) {
+// CHECK:          %[[AM2C2:.+]] = affine.min #[[$MAP2]]()
+// CHECK:          %[[XSIN0:.+]] = tensor.extract_slice %[[IN0]][%[[IV0]], %[[IV2]]] [%[[AM0]], %[[AM2C2]]] [1, 1] : tensor<130x130xf32> to tensor<?x?xf32>
+// CHECK:          %[[XSIN1:.+]] = tensor.extract_slice %[[IN1]][%[[IV2]], %[[IV1]]] [%[[AM2C2]], %[[AM1]]] [1, 1] : tensor<130x130xf32> to tensor<?x?xf32>
+// CHECK:          %[[XSOUT:.+]] = tensor.extract_slice %[[OUTARG1]][%[[IV0]], %[[IV1]]] [%[[AM0]], %[[AM1]]] [1, 1] : tensor<130x130xf32> to tensor<?x?xf32>
+// CHECK:          %[[MMRES1:.+]] = linalg.matmul ins(%[[XSIN0]], %[[XSIN1]] : tensor<?x?xf32>, tensor<?x?xf32>) outs(%[[XSOUT]] : tensor<?x?xf32>) -> tensor<?x?xf32>
+// CHECK:          %[[INSL1:.+]] = tensor.insert_slice %[[MMRES1]] into %[[OUTARG1]][%[[IV0]], %[[IV1]]] [%[[AM0]], %[[AM1]]] [1, 1] : tensor<?x?xf32> into tensor<130x130xf32>
+// CHECK:          scf.yield %[[INSL1]] : tensor<130x130xf32>
+// CHECK:        %[[RESINMSTTE:.+]] = scf.for %[[IV2]] = %[[C130]]{{.*}} to %[[C130]]{{.*}} step %[[C1]] iter_args(%[[OUTARG2:.+]] = %[[L2O1]]) -> (tensor<130x130xf32>) {
+// CHECK:          %[[AM2C1:.+]] = affine.min #[[$MAP1]]()
+// CHECK:          %[[XSIN0:.+]] = tensor.extract_slice %[[IN0]][%[[IV0]], %[[IV2]]] [%[[AM0]], %[[AM2C1]]] [1, 1] : tensor<130x130xf32> to tensor<?x?xf32>
+// CHECK:          %[[XSIN1:.+]] = tensor.extract_slice %[[IN1]][%[[IV2]], %[[IV1]]] [%[[AM2C1]], %[[AM1]]] [1, 1] : tensor<130x130xf32> to tensor<?x?xf32>
+// CHECK:          %[[XSOUT:.+]] = tensor.extract_slice %[[OUTARG2]][%[[IV0]], %[[IV1]]] [%[[AM0]], %[[AM1]]] [1, 1] : tensor<130x130xf32> to tensor<?x?xf32>
+// CHECK:          %[[MMRES2:.+]] = linalg.matmul ins(%[[XSIN0]], %[[XSIN1]] : tensor<?x?xf32>, tensor<?x?xf32>) outs(%[[XSOUT]] : tensor<?x?xf32>) -> tensor<?x?xf32>
+// CHECK:          %[[INSL2:.+]] = tensor.insert_slice %[[MMRES2]] into %[[OUTARG2]][%[[IV0]], %[[IV1]]] [%[[AM0]], %[[AM1]]] [1, 1] : tensor<?x?xf32> into tensor<130x130xf32>
+// CHECK:          scf.yield %[[INSL2]] : tensor<130x130xf32>
+// CHECK:        scf.yield %[[RESINMSTTE]] : tensor<130x130xf32>
+// CHECK:      scf.yield {{.*}} : tensor<130x130xf32>
+// CHECK:    return %[[RES0]] : tensor<130x130xf32>
+
+
+// -----
+
+// RUN: mlir-opt --transform-interpreter --split-input-file %s | FileCheck %s
+
+// This applies continuous tiling to the two inner nested loops in linalg.matmul
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    %1, %loops:3 = transform.structured.tile_using_for %0 [4, 4, 4] continuous_tiles=[false, true, true] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
+    transform.yield
+  }
+}
+
+// CHECK: #[[$MAP0:.+]] = affine_map<(d0) -> (4, -d0 + 130)>
+// CHECK: #{{.*}} = affine_map<(d0) -> (d0 - 1)>
+// CHECK: #[[$MAP2:.+]] = affine_map<() -> (2)>
+// CHECK: #[[$MAP1:.+]] = affine_map<() -> (1)>
+
+
+// CHECK-LABEL: @tile_linalg_matmul
+// CHECK-SAME: %[[IN0:.+]]: tensor<130x130xf32>, %[[IN1:.+]]: tensor<130x130xf32>, %[[OUT:.+]]: tensor<130x130xf32>
+func.func @tile_linalg_matmul(
+  %arg0: tensor<130x130xf32>, %arg1: tensor<130x130xf32>, %arg2: tensor<130x130xf32>)
+    -> tensor<130x130xf32> {
+  %0 = linalg.matmul  ins(%arg0, %arg1: tensor<130x130xf32>, tensor<130x130xf32>)
+                     outs(%arg2: tensor<130x130xf32>)
+    -> tensor<130x130xf32>
+
+  return %0 : tensor<130x130xf32>
+}
+
+// CHECK:    %[[C0:.+]] = arith.constant 0
+// CHECK:    %[[C130:.+]] = arith.constant 130 : index
+// CHECK:    %[[C4:.+]] = arith.constant 4 : index
+// CHECK:    %{{.*}} = scf.for %[[IV0:.+]] = %[[C0]] to %[[C130]] step %[[C4]] iter_args(%[[OUTARG:.+]] = %[[OUT]]) -> (tensor<130x130xf32>) {
+// CHECK:      %[[AM0:.+]] = affine.min #map(%[[IV0]])
+// CHECK:      %[[C2:.+]] = arith.constant 2 : index
+// CHECK:      %[[C128:.+]] = arith.constant 128 : index
+// CHECK:      %[[L1RES0:.+]] = scf.for %[[IV1:.+]] = %[[C0]]{{.*}} to %[[C128]] step %[[C4]]{{.*}} iter_args(%[[OUTARGI:.+]] = %[[OUTARG]]) -> (tensor<130x130xf32>) {
+// CHECK:        %[[AM1:.+]] = affine.min #map(%[[IV1]])
+// CHECK:        %[[ILRES0:.+]] = scf.for [[IV2:.+]] = %[[C0]]{{.*}} to %[[C128]]{{.*}} step %[[C4]]{{.*}} iter_args(%[[OUTARG0:.+]] = %[[OUTARGI]]) -> (tensor<130x130xf32>) {
+// CHECK:          %[[AM2:.+]] = affine.min #map([[IV2]])
+// CHECK:          %[[XS:.+]] = tensor.extract_slice %[[IN0]][%[[IV0]], [[IV2]]] [%[[AM0]], %[[AM2]]] [1, 1] : tensor<130x130xf32> to tensor<?x?xf32>
+// CHECK:          %[[XS1:.+]] = tensor.extract_slice %[[IN1]][[[IV2]], %[[IV1]]] [%[[AM2]], %[[AM1]]] [1, 1] : tensor<130x130xf32> to tensor<?x?xf32>
+// CHECK:          %[[XS2:.+]] = tensor.extract_slice %[[OUTARG0]][%[[IV0]], %[[IV1]]] [%[[AM0]], %[[AM1]]] [1, 1] : tensor<130x130xf32> to tensor<?x?xf32>
+// CHECK:          %[[MM:.+]] = linalg.matmul ins(%[[XS]], %[[XS1]] : tensor<?x?xf32>, tensor<?x?xf32>) outs(%[[XS2]] : tensor<?x?xf32>) -> tensor<?x?xf32>
+// CHECK:          %[[INS:.+]] = tensor.insert_slice %[[MM]] into %[[OUTARG0]][%[[IV0]], %[[IV1]]] [%[[AM0]], %[[AM1]]] [1, 1] : tensor<?x?xf32> into tensor<130x130xf32>
+// CHECK:          scf.yield %[[INS:.+]] : tensor<130x130xf32>
+// CHECK:        %[[C1:.+]] = arith.constant 1 : index
+// CHECK:        %[[ILRES1:.+]] = scf.for [[IV2:.+]] = %[[C128]]{{.*}} to %[[C130]]{{.*}} step %[[C2]]{{.*}} iter_args(%[[OUTARG0:.+]] = %[[ILRES0]]) -> (tensor<130x130xf32>) {
+// CHECK:          %[[AM2:.+]] = affine.min #map2()
+// CHECK:          %[[XS:.+]] = tensor.extract_slice %[[IN0]][%[[IV0]], [[IV2]]] [%[[AM0]], %[[AM2]]] [1, 1] : tensor<130x130xf32> to tensor<?x?xf32>
+// CHECK:          %[[XS1:.+]] = tensor.extract_slice %[[IN1]][[[IV2]], %[[IV1]]] [%[[AM2]], %[[AM1]]] [1, 1] : tensor<130x130xf32> to tensor<?x?xf32>
+// CHECK:          %[[XS2:.+]] = tensor.extract_slice %[[OUTARG0]][%[[IV0]], %[[IV1]]] [%[[AM0]], %[[AM1]]] [1, 1] : tensor<130x130xf32> to tensor<?x?xf32>
+// CHECK:          %[[MM:.+]] = linalg.matmul ins(%[[XS]], %[[XS1]] : tensor<?x?xf32>, tensor<?x?xf32>) outs(%[[XS2]] : tensor<?x?xf32>) -> tensor<?x?xf32>
+// CHECK:          %[[INS:.+]] = tensor.insert_slice %[[MM]] into %[[OUTARG0]][%[[IV0]], %[[IV1]]] [%[[AM0]], %[[AM1]]] [1, 1] : tensor<?x?xf32> into tensor<130x130xf32>
+// CHECK:          scf.yield %[[INS:.+]] : tensor<130x130xf32>
+// CHECK:        %[[ILRES2:.+]] = scf.for [[IV2:.+]] = %[[C130]]{{.*}} to %[[C130]]{{.*}} step %[[C1]]{{.*}} iter_args(%[[OUTARG0:.+]] = %[[ILRES1]]) -> (tensor<130x130xf32>) {
+// CHECK:          %[[AM2:.+]] = affine.min #map3()
+// CHECK:          %[[XS:.+]] = tensor.extract_slice %[[IN0]][%[[IV0]], [[IV2]]] [%[[AM0]], %[[AM2]]] [1, 1] : tensor<130x130xf32> to tensor<?x?xf32>
+// CHECK:          %[[XS1:.+]] = tensor.extract_slice %[[IN1]][[[IV2]], %[[IV1]]] [%[[AM2]], %[[AM1]]] [1, 1] : tensor<130x130xf32> to tensor<?x?xf32>
+// CHECK:          %[[XS2:.+]] = tensor.extract_slice %[[OUTARG0]][%[[IV0]], %[[IV1]]] [%[[AM0]], %[[AM1]]] [1, 1] : tensor<130x130xf32> to tensor<?x?xf32>
+// CHECK:          %[[MM:.+]] = linalg.matmul ins(%[[XS]], %[[XS1]] : tensor<?x?xf32>, tensor<?x?xf32>) outs(%[[XS2]] : tensor<?x?xf32>) -> tensor<?x?xf32>
+// CHECK:          %[[INS:.+]] = tensor.insert_slice %[[MM]] into %[[OUTARG0]][%[[IV0]], %[[IV1]]] [%[[AM0]], %[[AM1]]] [1, 1] : tensor<?x?xf32> into tensor<130x130xf32>
+// CHECK:          scf.yield %[[INS:.+]] : tensor<130x130xf32>
+// CHECK:        scf.yield %[[ILRES2:.+]] : tensor<130x130xf32>
+// CHECK:      %[[C1:.+]] = arith.constant 1 : index
+// CHECK:      %{{.*}} = scf.for %[[IV1:.+]] = %[[C128]] to %[[C130]]{{.*}} step %[[C2]] iter_args(%[[L1RES0ARG:.+]] = %[[L1RES0]]) -> (tensor<130x130xf32>) {
+// CHECK:        %[[AM1:.+]] = affine.min #map2()
+// CHECK:        %{{.*}} = scf.for [[IV2:.+]] = %[[C0]]{{.*}} to %[[C128]]{{.*}} step %[[C4]]{{.*}} iter_args(%[[OUTARG1:.+]] = %[[L1RES0ARG]]) -> (tensor<130x130xf32>) {
+// CHECK:          %[[AM2:.+]] = affine.min #map([[IV2]])
+// CHECK:          %[[XS:.+]] = tensor.extract_slice %[[IN0]][%[[IV0]], [[IV2]]] [%[[AM0]], %[[AM2]]] [1, 1] : tensor<130x130xf32> to tensor<?x?xf32>
+// CHECK:          %{{.*}} = tensor.extract_slice %[[IN1]][[[IV2]], %[[IV1]]] [%[[AM2]], %[[AM1]]] [1, 1] : tensor<130x130xf32> to tensor<?x?xf32>
+// CHECK:          %{{.*}} = tensor.extract_slice %[[OUTARG1]][%[[IV0]], %[[IV1]]] [%[[AM0]], %[[AM1]]] [1, 1] : tensor<130x130xf32> to tensor<?x?xf32>
+// CHECK:          %[[MM:.+]] = linalg.matmul ins(%[[XS]], %{{.*}} : tensor<?x?xf32>, tensor<?x?xf32>) outs(%{{.*}} : tensor<?x?xf32>) -> tensor<?x?xf32>
+// CHECK:          %{{.*}} = tensor.insert_slice %[[MM]] into %[[OUTARG1]][%[[IV0]], %[[IV1]]] [%[[AM0]], %[[AM1]]] [1, 1] : tensor<?x?xf32> into tensor<130x130xf32>
+// CHECK:          scf.yield %{{.*}} : tensor<130x130xf32>
+// CHECK:        %{{.*}} = scf.for [[IV2:.+]] = %[[C128]]{{.*}} to %[[C130]]{{.*}} step %[[C2]]{{.*}} iter_args(%[[OUTARGX:.+]] = %{{.*}}) -> (tensor<130x130xf32>) {
+// CHECK:          %[[AM2:.+]] = affine.min #map2()
+// CHECK:          %[[XS:.+]] = tensor.extract_slice %[[IN0]][%[[IV0]], [[IV2]]] [%[[AM0]], %[[AM2]]] [1, 1] : tensor<130x130xf32> to tensor<?x?xf32>
+// CHECK:          %{{.*}} = tensor.extract_slice %[[IN1]][[[IV2]], %[[IV1]]] [%[[AM2]], %[[AM1]]] [1, 1] : tensor<130x130xf32> to tensor<?x?xf32>
+// CHECK:          %{{.*}} = tensor.extract_slice %[[OUTARGX]][%[[IV0]], %[[IV1]]] [%[[AM0]], %[[AM1]]] [1, 1] : tensor<130x130xf32> to tensor<?x?xf32>
+// CHECK:          %[[MM:.+]] = linalg.matmul ins(%[[XS]], %{{.*}} : tensor<?x?xf32>, tensor<?x?xf32>) outs(%{{.*}} : tensor<?x?xf32>) -> tensor<?x?xf32>
+// CHECK:          %{{.*}} = tensor.insert_slice %[[MM]] into %[[OUTARGX]][%[[IV0]], %[[IV1]]] [%[[AM0]], %[[AM1]]] [1, 1] : tensor<?x?xf32> into tensor<130x130xf32>
+// CHECK:          scf.yield %{{.*}} : tensor<130x130xf32>
+// CHECK:        %{{.*}} = scf.for [[IV2:.+]] = %[[C130]]{{.*}} to %[[C130]]{{.*}} step %[[C1]]{{.*}} iter_args(%[[OUTARGX:.+]] = {{.*}}) -> (tensor<130x130xf32>) {
+// CHECK:          %[[AM2:.+]] = affine.min #map3()
+// CHECK:          %[[XS:.+]] = tensor.extract_slice %[[IN0]][%[[IV0]], [[IV2]]] [%[[AM0]], %[[AM2]]] [1, 1] : tensor<130x130xf32> to tensor<?x?xf32>
+// CHECK:          %{{.*}} = tensor.extract_slice %[[IN1]][[[IV2]], %[[IV1]]] [%[[AM2]], %[[AM1]]] [1, 1] : tensor<130x130xf32> to tensor<?x?xf32>
+// CHECK:          %{{.*}} = tensor.extract_slice %[[OUTARGX]][%[[IV0]], %[[IV1]]] [%[[AM0]], %[[AM1]]] [1, 1] : tensor<130x130xf32> to tensor<?x?xf32>
+// CHECK:          %[[MM:.+]] = linalg.matmul ins(%[[XS]], %{{.*}} : tensor<?x?xf32>, tensor<?x?xf32>) outs(%{{.*}} : tensor<?x?xf32>) -> tensor<?x?xf32>
+// CHECK:          %{{.*}} = tensor.insert_slice %[[MM]] into %[[OUTARGX]][%[[IV0]], %[[IV1]]] [%[[AM0]], %[[AM1]]] [1, 1] : tensor<?x?xf32> into tensor<130x130xf32>
+// CHECK:          scf.yield %{{.*}} : tensor<130x130xf32>
+// CHECK:        scf.yield %{{.*}} : tensor<130x130xf32>
+// CHECK:      %[[RESL1TE:.+]] = scf.for %[[IV1:.+]] = %[[C130]]{{.*}} to %[[C130]]{{.*}} step %[[C1]] iter_args(%{{.*}} = %{{.*}}) -> (tensor<130x130xf32>) {
+// CHECK:        %[[AM1:.+]] = affine.min #map3()
+// CHECK:        %{{.*}} = scf.for [[IV2:.+]] = %[[C0]]{{.*}} to %[[C128]]{{.*}} step %[[C4]]{{.*}} iter_args(%[[OUTARGX:.+]] = %{{.*}}) -> (tensor<130x130xf32>) {
+// CHECK:          %[[AM2:.+]] = affine.min #map([[IV2]])
+// CHECK:          %[[XS:.+]] = tensor.extract_slice %[[IN0]][%[[IV0]], [[IV2]]] [%[[AM0]], %[[AM2]]] [1, 1] : tensor<130x130xf32> to tensor<?x?xf32>
+// CHECK:          %{{.*}} = tensor.extract_slice %[[IN1]][[[IV2]], %[[IV1]]] [%[[AM2]], %[[AM1]]] [1, 1] : tensor<130x130xf32> to tensor<?x?xf32>
+// CHECK:          %{{.*}} = tensor.extract_slice %[[OUTARGX]][%[[IV0]], %[[IV1]]] [%[[AM0]], %[[AM1]]] [1, 1] : tensor<130x130xf32> to tensor<?x?xf32>
+// CHECK:          %[[MM:.+]] = linalg.matmul ins(%[[XS]], %{{.*}} : tensor<?x?xf32>, tensor<?x?xf32>) outs(%{{.*}} : tensor<?x?xf32>) -> tensor<?x?xf32>
+// CHECK:          %{{.*}} = tensor.insert_slice %[[MM]] into %[[OUTARGX]][%[[IV0]], %[[IV1]]] [%[[AM0]], %[[AM1]]] [1, 1] : tensor<?x?xf32> into tensor<130x130xf32>
+// CHECK:          scf.yield %{{.*}} : tensor<130x130xf32>
+// CHECK:        %{{.*}} = scf.for [[IV2:.+]] = %[[C128]]{{.*}} to %[[C130]]{{.*}} step %[[C2]]{{.*}} iter_args(%[[OUTARGX:.+]] = %{{.*}}) -> (tensor<130x130xf32>) {
+// CHECK:          %[[AM2:.+]] = affine.min #map2()
+// CHECK:          %[[XS:.+]] = tensor.extract_slice %[[IN0]][%[[IV0]], [[IV2]]] [%[[AM0]], %[[AM2]]] [1, 1] : tensor<130x130xf32> to tensor<?x?xf32>
+// CHECK:          %{{.*}} = tensor.extract_slice %[[IN1]][[[IV2]], %[[IV1]]] [%[[AM2]], %[[AM1]]] [1, 1] : tensor<130x130xf32> to tensor<?x?xf32>
+// CHECK:          %{{.*}} = tensor.extract_slice %[[OUTARGX]][%[[IV0]], %[[IV1]]] [%[[AM0]], %[[AM1]]] [1, 1] : tensor<130x130xf32> to tensor<?x?xf32>
+// CHECK:          %[[MM:.+]] = linalg.matmul ins(%[[XS]], %{{.*}} : tensor<?x?xf32>, tensor<?x?xf32>) outs(%{{.*}} : tensor<?x?xf32>) -> tensor<?x?xf32>
+// CHECK:          %{{.*}} = tensor.insert_slice %[[MM]] into %[[OUTARGX]][%[[IV0]], %[[IV1]]] [%[[AM0]], %[[AM1]]] [1, 1] : tensor<?x?xf32> into tensor<130x130xf32>
+// CHECK:          scf.yield %{{.*}} : tensor<130x130xf32>
+// CHECK:        %{{.*}} = scf.for [[IV2:.+]] = %[[C130]]{{.*}} to %[[C130]]{{.*}} step %[[C1]]{{.*}} iter_args(%[[OUTARGX:.+]] = %{{.*}}) -> (tensor<130x130xf32>) {
+// CHECK:          %[[AM2:.+]] = affine.min #map3()
+// CHECK:          %[[XS:.+]] = tensor.extract_slice %[[IN0]][%[[IV0]], [[IV2]]] [%[[AM0]], %[[AM2]]] [1, 1] : tensor<130x130xf32> to tensor<?x?xf32>
+// CHECK:          %{{.*}} = tensor.extract_slice %[[IN1]][[[IV2]], %[[IV1]]] [%[[AM2]], %[[AM1]]] [1, 1] : tensor<130x130xf32> to tensor<?x?xf32>
+// CHECK:          %{{.*}} = tensor.extract_slice %[[OUTARGX]][%[[IV0]], %[[IV1]]] [%[[AM0]], %[[AM1]]] [1, 1] : tensor<130x130xf32> to tensor<?x?xf32>
+// CHECK:          %[[MM:.+]] = linalg.matmul ins(%[[XS]], %{{.*}} : tensor<?x?xf32>, tensor<?x?xf32>) outs(%{{.*}} : tensor<?x?xf32>) -> tensor<?x?xf32>
+// CHECK:          %{{.*}} = tensor.insert_slice %[[MM]] into %[[OUTARGX]][%[[IV0]], %[[IV1]]] [%[[AM0]], %[[AM1]]] [1, 1] : tensor<?x?xf32> into tensor<130x130xf32>
+// CHECK:          scf.yield %{{.*}} : tensor<130x130xf32>
+// CHECK:        scf.yield %{{.*}} : tensor<130x130xf32>
+// CHECK:      scf.yield %[[RESL1TE]] : tensor<130x130xf32>
+
+
+// -----
+
+// RUN: mlir-opt --transform-interpreter --split-input-file %s | FileCheck %s
+
+// This applies continuous tiling to all nested loops in linalg.matmul.
+// This test checks that the function return the result of the tail-end
+// outermost loop.
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    %1, %loops:3 = transform.structured.tile_using_for %0 [4, 4, 4] continuous_tiles=[true, true, true] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
+    transform.yield
+  }
+}
+
+// CHECK-LABEL: @tile_linalg_matmul
+// CHECK-SAME: %[[IN0:.+]]: tensor<130x130xf32>, %[[IN1:.+]]: tensor<130x130xf32>, %[[OUT:.+]]: tensor<130x130xf32>
+func.func @tile_linalg_matmul(
+  %arg0: tensor<130x130xf32>, %arg1: tensor<130x130xf32>, %arg2: tensor<130x130xf32>)
+    -> tensor<130x130xf32> {
+  %0 = linalg.matmul  ins(%arg0, %arg1: tensor<130x130xf32>, tensor<130x130xf32>)
+                     outs(%arg2: tensor<130x130xf32>)
+    -> tensor<130x130xf32>
+
+  return %0 : tensor<130x130xf32>
+}
+
+
+// CHECK:           %[[C0:.+]] = arith.constant 0
+// CHECK:           %[[C130:.+]] = arith.constant 130 : index
+// CHECK:           %[[C4:.+]] = arith.constant 4 : index
+// CHECK:           %[[C2:.+]] = arith.constant 2 : index
+// CHECK:           %[[C128:.+]] = arith.constant 128 : index
+// CHECK:           %[[OLRES0:.+]] = scf.for %{{.*}} = %[[C0]] to %[[C128]] step %[[C4]] iter_args(%{{.*}} = %[[OUT]]) -> (tensor<130x130xf32>)
+// CHECK:           %[[C1:[c][0-9]+]] = arith.constant 1 : index
+// CHECK-NEXT:      %{{.*}} = arith.constant 130 : index
+// CHECK:           %[[OLRES1:.+]] = scf.for %{{.*}} = %[[C128]] to %[[C130]]{{.*}} step %[[C2]] iter_args(%{{.*}} = %[[OLRES0]]) -> (tensor<130x130xf32>)
+// CHECK:           %[[OLRES2:.+]] = scf.for %{{.*}} = %[[C130]]{{.*}} to %[[C130]] step %[[C1]] iter_args(%{{.*}} = %[[OLRES1]]) -> (tensor<130x130xf32>)
+// CHECK:           return %[[OLRES2]] : tensor<130x130xf32>
+
+
+// -----
+
+// RUN: mlir-opt --transform-interpreter --split-input-file %s | FileCheck %s
+
+// This applies no continuous tiling to any loop in linalg.matmul.
+// All values in continuous_tiles are set to false.
+// This test checks that the result is equivalent to regular tiling,
+// i.e. when continuous_tiles is none or not supplied.
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    %1, %loops:3 = transform.structured.tile_using_for %0 [4, 4, 4] continuous_tiles=[false, false, false] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
+    transform.yield
+  }
+}
+
+
+// CHECK: #[[$MAP0:.+]] = affine_map<(d0) -> (4, -d0 + 130)>
+
+// CHECK-LABEL: @tile_linalg_matmul
+// CHECK-SAME: %[[IN0:.+]]: tensor<130x130xf32>, %[[IN1:.+]]: tensor<130x130xf32>, %[[OUT:.+]]: tensor<130x130xf32>
+func.func @tile_linalg_matmul(
+  %arg0: tensor<130x130xf32>, %arg1: tensor<130x130xf32>, %arg2: tensor<130x130xf32>)
+    -> tensor<130x130xf32> {
+  %0 = linalg.matmul  ins(%arg0, %arg1: tensor<130x130xf32>, tensor<130x130xf32>)
+                     outs(%arg2: tensor<130x130xf32>)
+    -> tensor<130x130xf32>
+
+  return %0 : tensor<130x130xf32>
+}
+
+// CHECK:    %[[C0:.+]] = arith.constant 0 : index
+// CHECK:    %[[C130:.+]] = arith.constant 130 : index
+// CHECK:    %[[C4:.+]] = arith.constant 4 : index
+// CHECK:    %[[RL0:.+]] = scf.for %[[IV0:.+]] = %[[C0]] to %[[C130]] step %[[C4]] iter_args(%[[OUTL0:.+]] = %[[OUT]]) -> (tensor<130x130xf32>) {
+// CHECK:      %[[AM0:.+]] = affine.min #[[$MAP0]](%[[IV0]])
+// CHECK:      %[[RL1:.+]] = scf.for %[[IV1:.+]] = %[[C0]]{{.*}} to %[[C130]]{{.*}} step %[[C4]]{{.*}} iter_args(%[[OUTL1:.+]] = %[[OUTL0]]) -> (tensor<130x130xf32>) {
+// CHECK:        %[[AM1:.+]] = affine.min #[[$MAP0]](%[[IV1]])
+// CHECK:        %[[RL2:.+]] = scf.for %[[IV2:.+]] = %[[C0]]{{.*}} to %[[C130]]{{.*}} step %[[C4]]{{.*}} iter_args(%[[OUTL2:.+]] = %[[OUTL1]]) -> (tensor<130x130xf32>) {
+// CHECK:          %[[AM2:.+]] = affine.min #[[$MAP0]](%[[IV2]])
+// CHECK:          %[[XS:.+]] = tensor.extract_slice %[[IN0]][%[[IV0]], %[[IV2]]] [%[[AM0]], %[[AM2]]] [1, 1] : tensor<130x130xf32> to tensor<?x?xf32>
+// CHECK:          %[[XS1:.+]] = tensor.extract_slice %[[IN1]][%[[IV2]], %[[IV1]]] [%[[AM2]], %[[AM1]]] [1, 1] : tensor<130x130xf32> to tensor<?x?xf32>
+// CHECK:          %[[XS2:.+]] = tensor.extract_slice %[[OUTL2]][%[[IV0]], %[[IV1]]] [%[[AM0]], %[[AM1]]] [1, 1] : tensor<130x130xf32> to tensor<?x?xf32>
+// CHECK:          %[[MM:.+]] = linalg.matmul ins(%[[XS]], %[[XS1]] : tensor<?x?xf32>, tensor<?x?xf32>) outs(%[[XS2]] : tensor<?x?xf32>) -> tensor<?x?xf32>
+// CHECK:          %[[INS:.+]] = tensor.insert_slice %[[MM]] into %[[OUTL2]][%[[IV0]], %[[IV1]]] [%[[AM0]], %[[AM1]]] [1, 1] : tensor<?x?xf32> into tensor<130x130xf32>
+// CHECK:          scf.yield %[[INS]] : tensor<130x130xf32>
+// CHECK:        scf.yield %[[RL2]] : tensor<130x130xf32>
+// CHECK:      scf.yield %[[RL1]] : tensor<130x130xf32>
+// CHECK:    return %[[RL0]] : tensor<130x130xf32>
+
+
+// -----
+
+// RUN: mlir-opt --transform-interpreter --split-input-file %s | FileCheck %s
+
+// This tests that continuous tiling works correctly when interchange is applied.
+// We only check for loop ordering and that correct results are yielded.
+// We use different tile sizes to identify that loops are interchanged
+// properly. All loops are moved from their original nesting by supplying [2, 0, 1]
+// for interchange.
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    %1, %loops:3 = transform.structured.tile_using_for %0 [4, 8, 2] interchange=[2, 0, 1] continuous_tiles=[true, true, true] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
+    transform.yield
+  }
+}
+
+
+// CHECK: #[[$MAP0:.+]] = affine_map<(d0) -> (4, -d0 + 130)>
+
+// CHECK-LABEL: @tile_linalg_matmul
+// CHECK-SAME: %[[IN0:.+]]: tensor<130x130xf32>, %[[IN1:.+]]: tensor<130x130xf32>, %[[OUT:.+]]: tensor<130x130xf32>
+func.func @tile_linalg_matmul(
+  %arg0: tensor<130x130xf32>, %arg1: tensor<130x130xf32>, %arg2: tensor<130x130xf32>)
+    -> tensor<130x130xf32> {
+  %0 = linalg.matmul  ins(%arg0, %arg1: tensor<130x130xf32>, tensor<130x130xf32>)
+                     outs(%arg2: tensor<130x130xf32>)
+    -> tensor<130x130xf32>
+
+  return %0 : tensor<130x130xf32>
+}
+
+// CHECK:    %[[C0:.+]] = arith.constant 0 : index
+// CHECK:    %[[C130:.+]] = arith.constant 130 : index
+// CHECK:    %[[C2:.+]] = arith.constant 2 : index
+// CHECK:    %[[C1:.+]] = arith.constant 1 : index
+// CHECK:    %[[L0RES:.+]] = scf.for %{{.*}} = %[[C0]] to %[[C130]]{{.*}} step %[[C2]] iter_args(%{{.*}} = %[[OUT]]) -> (tensor<130x130xf32>) {
+// CHECK:      %[[C4:.+]] = arith.constant 4 : index
+// CHECK:      %[[C128:.+]] = arith.constant 128 : index
+// CHECK:      %{{.*}} = scf.for %{{.*}} = %[[C0]]{{.*}} to %[[C128]] step %[[C4]] iter_args(%{{.*}} = %{{.*}}) -> (tensor<130x130xf32>) {
+// CHECK:        %[[C8:.+]] = arith.constant 8 : index
+// CHECK:        %{{.*}} = scf.for %{{.*}} = %[[C0]]{{.*}} to %[[C128]]{{.*}} step %[[C8]] iter_args(%{{.*}} = %{{.*}}) -> (tensor<130x130xf32>) {
+// CHECK:        %{{.*}} = scf.for %{{.*}} = %[[C128]]{{.*}} to %[[C128]]{{.*}} step %[[C4]]{{.*}} iter_args(%{{.*}} = %{{.*}}) -> (tensor<130x130xf32>) {
+// CHECK:        %{{.*}} = scf.for %{{.*}} = %[[C128]]{{.*}} to %[[C130]]{{.*}} step %[[C2]]{{.*}} iter_args(%{{.*}} = %{{.*}}) -> (tensor<130x130xf32>) {
+// CHECK:        %{{.*}} = scf.for %{{.*}} = %[[C130]]{{.*}} to %[[C130]]{{.*}} step %[[C1]]{{.*}} iter_args(%{{.*}} = %{{.*}}) -> (tensor<130x130xf32>) {
+// CHECK:      %{{.*}} = scf.for %{{.*}} = %[[C128]] to %[[C130]]{{.*}} step %[[C2]]{{.*}} iter_args(%{{.*}} = %{{.*}}) -> (tensor<130x130xf32>) {
+// CHECK:      %{{.*}} = scf.for %{{.*}} = %[[C130]]{{.*}} to %[[C130]]{{.*}} step %[[C1]]{{.*}} iter_args(%{{.*}} = %{{.*}}) -> (tensor<130x130xf32>) {
+// CHECK:    %[[L1RES:.+]] = scf.for %{{.*}} = %[[C130]]{{.*}} to %[[C130]] step %[[C1]] iter_args(%{{.*}} = %[[L0RES]]) -> (tensor<130x130xf32>) {
+// CHECK:    return %[[L1RES]] : tensor<130x130xf32>
+
+
+// -----
+
+// RUN: mlir-opt --transform-interpreter --split-input-file %s | FileCheck %s
+
+// This tests that continuous tiling works correctly when a tile size is zero.
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    %1, %loops:2 = transform.structured.tile_using_for %0 [4, 0, 4] continuous_tiles=[false, false, true] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
+    transform.yield
+  }
+}
+
+
+// CHECK: #[[$MAP0:.+]] = affine_map<(d0) -> (4, -d0 + 130)>
+
+// CHECK-LABEL: @tile_linalg_matmul
+// CHECK-SAME: %[[IN0:.+]]: tensor<130x130xf32>, %[[IN1:.+]]: tensor<130x130xf32>, %[[OUT:.+]]: tensor<130x130xf32>
+func.func @tile_linalg_matmul(
+  %arg0: tensor<130x130xf32>, %arg1: tensor<130x130xf32>, %arg2: tensor<130x130xf32>)
+    -> tensor<130x130xf32> {
+  %0 = linalg.matmul  ins(%arg0, %arg1: tensor<130x130xf32>, tensor<130x130xf32>)
+                     outs(%arg2: tensor<130x130xf32>)
+    -> tensor<130x130xf32>
+
+  return %0 : tensor<130x130xf32>
+}
+
+// CHECK:    %[[C0:.+]] = arith.constant 0 : index
+// CHECK:    %[[C130:.+]] = arith.constant 130 : index
+// CHECK:    %[[C4:.+]] = arith.constant 4 : index
+// CHECK:    %[[OLRES:.+]] = scf.for %[[IV0:.+]] = %[[C0]] to %[[C130]] step %[[C4]] iter_args(%[[OUTARG:.+]] = %[[OUT]]) -> (tensor<130x130xf32>) {
+// CHECK:      %[[AM0:.+]] = affine.min #map(%[[IV0]])
+// CHECK:      %[[C2:.+]] = arith.constant 2 : index
+// CHECK:      %[[C128:.+]] = arith.constant 128 : index
+// CHECK:      %[[IL0R:.+]] = scf.for %[[IV1:.+]] = %[[C0]]{{.*}} to %[[C128]] step %[[C4]]{{.*}} iter_args(%[[OUTARG0:.+]] = %[[OUTARG]]) -> (tensor<130x130xf32>) {
+// CHECK:        %[[AM1:.+]] = affine.min #map(%[[IV1]])
+// CHECK:        %[[XS0:.+]] = tensor.extract_slice %[[IN0]][%[[IV0]], %[[IV1]]] [%[[AM0]], %[[AM1]]] [1, 1] : tensor<130x130xf32> to tensor<?x?xf32>
+// CHECK:        %[[XS1:.+]] = tensor.extract_slice %[[IN1]][%[[IV1]], 0] [%[[AM1]], 130] [1, 1] : tensor<130x130xf32> to tensor<?x130xf32>
+// CHECK:        %[[XS2:.+]] = tensor.extract_slice %[[OUTARG0]][%[[IV0]], 0] [%[[AM0]], 130] [1, 1] : tensor<130x130xf32> to tensor<?x130xf32>
+// CHECK:        %[[MM:.+]] = linalg.matmul ins(%[[XS0]], %[[XS1]] : tensor<?x?xf32>, tensor<?x130xf32>) outs(%[[XS2]] : tensor<?x130xf32>) -> tensor<?x130xf32>
+// CHECK:        %[[INS:.+]] = tensor.insert_slice %[[MM]] into %[[OUTARG0]][%[[IV0]], 0] [%[[AM0]], 130] [1, 1] : tensor<?x130xf32> into tensor<130x130xf32>
+// CHECK:        scf.yield %[[INS]] : tensor<130x130xf32>
+// CHECK:      %[[C1:.+]] = arith.constant 1 : index
+// CHECK:      %[[IL1R:.+]] = scf.for %[[IV1:.+]] = %[[C128]] to %[[C130]]{{.*}} step %[[C2]] iter_args(%[[OUTARG0]] = %[[IL0R]]) -> (tensor<130x130xf32>) {
+// CHECK:        %[[AM1:.+]] = affine.min #map2()
+// CHECK:        %[[XS0:.+]] = tensor.extract_slice %[[IN0]][%[[IV0]], %[[IV1]]] [%[[AM0]], %[[AM1]]] [1, 1] : tensor<130x130xf32> to tensor<?x?xf32>
+// CHECK:        %[[XS1:.+]] = tensor.extract_slice %[[IN1]][%[[IV1]], 0] [%[[AM1]], 130] [1, 1] : tensor<130x130xf32> to tensor<?x130xf32>
+// CHECK:        %[[XS2:.+]] = tensor.extract_slice %[[OUTARG0]][%[[IV0]], 0] [%[[AM0]], 130] [1, 1] : tensor<130x130xf32> to tensor<?x130xf32>
+// CHECK:        %[[MM:.+]] = linalg.matmul ins(%[[XS0]], %[[XS1]] : tensor<?x?xf32>, tensor<?x130xf32>) outs(%[[XS2]] : tensor<?x130xf32>) -> tensor<?x130xf32>
+// CHECK:        %[[INS:.+]] = tensor.insert_slice %[[MM]] into %[[OUTARG0]][%[[IV0]], 0] [%[[AM0]], 130] [1, 1] : tensor<?x130xf32> into tensor<130x130xf32>
+// CHECK:        scf.yield %[[INS]] : tensor<130x130xf32>
+// CHECK:      %[[IL2R:.+]] = scf.for %{{.*}} = %[[C130]]{{.*}} to %[[C130]]{{.*}} step %[[C1]] iter_args(%[[OUTARG0]] = %[[IL1R]]) -> (tensor<130x130xf32>) {
+// CHECK:      scf.yield %[[IL2R]] : tensor<130x130xf32>
+// CHECK:    return %[[OLRES]] : tensor<130x130xf32>
\ No newline at end of file



More information about the Mlir-commits mailing list