[Mlir-commits] [mlir] 1a829d2 - [mlir] Purge linalg.tiled_loop.

Alexander Belyaev llvmlistbot at llvm.org
Mon Feb 28 00:06:50 PST 2022


Author: Alexander Belyaev
Date: 2022-02-28T09:05:18+01:00
New Revision: 1a829d2d06958abf09bb6aff81120959206887f6

URL: https://github.com/llvm/llvm-project/commit/1a829d2d06958abf09bb6aff81120959206887f6
DIFF: https://github.com/llvm/llvm-project/commit/1a829d2d06958abf09bb6aff81120959206887f6.diff

LOG: [mlir] Purge linalg.tiled_loop.

Differential Revision: https://reviews.llvm.org/D119415

Added: 
    

Modified: 
    mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.td
    mlir/include/mlir/Dialect/Linalg/Passes.h
    mlir/include/mlir/Dialect/Linalg/Passes.td
    mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
    mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
    mlir/lib/Dialect/Linalg/Transforms/BufferizableOpInterfaceImpl.cpp
    mlir/lib/Dialect/Linalg/Transforms/CMakeLists.txt
    mlir/lib/Dialect/Linalg/Transforms/Fusion.cpp
    mlir/lib/Dialect/Linalg/Transforms/Loops.cpp
    mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp
    mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp
    mlir/lib/Dialect/Linalg/Utils/Utils.cpp
    mlir/test/Dialect/Linalg/canonicalize.mlir
    mlir/test/Dialect/Linalg/comprehensive-module-bufferize-analysis.mlir
    mlir/test/Dialect/Linalg/comprehensive-module-bufferize.mlir
    mlir/test/Dialect/Linalg/fusion-tensor-pattern.mlir
    mlir/test/Dialect/Linalg/invalid.mlir
    mlir/test/Dialect/Linalg/roundtrip.mlir
    mlir/test/Dialect/Linalg/tile-and-peel-tensors.mlir
    mlir/test/Dialect/Linalg/tile-tensors.mlir
    mlir/test/lib/Dialect/Linalg/CMakeLists.txt
    mlir/test/lib/Dialect/Linalg/TestLinalgTransforms.cpp
    mlir/tools/mlir-opt/mlir-opt.cpp

Removed: 
    mlir/lib/Dialect/Linalg/Transforms/Distribution.cpp
    mlir/test/Dialect/Linalg/distribute-tiled-loop.mlir
    mlir/test/Dialect/Linalg/tiled-loop-peeling.mlir
    mlir/test/Dialect/Linalg/tiled-loop-to-scf.mlir
    mlir/test/lib/Dialect/Linalg/TestLinalgDistribution.cpp


################################################################################
diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.td
index 518a2cfacf2d5..0f896df15119a 100644
--- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.td
+++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.td
@@ -138,290 +138,6 @@ def Linalg_YieldOp : Linalg_Op<"yield", [NoSideEffect, ReturnLike, Terminator]>,
   let hasVerifier = 1;
 }
 
-def Linalg_TiledLoopOp : Linalg_Op<"tiled_loop", [
-     AttrSizedOperandSegments,
-     DeclareOpInterfaceMethods<LoopLikeOpInterface>,
-     RecursiveSideEffects,
-     SingleBlockImplicitTerminator<"linalg::YieldOp">
-    ]> {
-  let summary = "Linalg tiled loop operation";
-  let description = [{
-    This is a loop-like operation with additional properties. Besides the
-    loop bounds, its arguments include the input and output tensors or
-    memrefs and attributes that specify the iterator types.
-
-    When the `iterator_types` attribute is absent from the custom format,
-    parsing TiledLoopOp sets all of its elements to "parallel".
-
-    Tensor-based version:
-
-    The body region of the loop contains `extract_slice` operations applied to
-    every tensor argument of TiledLoopOp.
-
-    The body region must contain exactly one block that terminates with
-    `linalg.yield` with the operands resulting from `insert_slice` operations.
-
-    Example:
-
-    ```mlir
-    %0 = linalg.tiled_loop (%i) = (%c0) to (%c24) step (%c4)
-        ins (%lhs_ = %lhs : tensor<24x64xi8>, %rhs_ = %rhs : tensor<24x64xi8>)
-        outs (%out_ = %out : tensor<24x64xi8>)
-        iterators["parallel"]
-        distribution["block_x"] {
-      %lhs_sub = tensor.extract_slice %lhs_[%i, 0] [%c4, %c64] [1, 1]
-          : tensor<24x64xi8> to tensor<?x?xi8>
-      %rhs_sub = tensor.extract_slice %rhs_[%i, 0] [%c4, %c64] [1, 1]
-          : tensor<24x64xi8> to tensor<?x?xi8>
-      %out_sub = tensor.extract_slice %out_[%i, 0] [%c4, %c64] [1, 1]
-          : tensor<24x64xi8> to tensor<?x?xi8>
-
-      %result_sub = linalg.generic ...
-
-      %result = tensor.insert_slice %result_sub into %out_[%i, 0][%c4, %c64][1, 1]
-        : tensor<?x?xi8> into tensor<24x64xi8>
-      linalg.yield %result : tensor<24x64xi8>
-    }
-    ```
-
-    MemRef-based version:
-
-    The body region of the loop contains `memref.subview` operations applied to
-    every memref argument of TiledLoopOp.
-
-    The body region must contain exactly one block that terminates with
-    `linalg.yield` with no operands.
-
-    Example:
-
-    ```mlir
-    linalg.tiled_loop (%i) = (%c0) to (%c24) step (%c4)
-        ins (%lhs_ = %lhs : memref<24x64xi8>, %rhs_ = %rhs : memref<24x64xi8>)
-        outs (%out_ = %out : memref<24x64xi8>)
-        iterators["parallel"]
-        distribution["block_x"] {
-      %lhs_sub = memref.subview %lhs_[%i, 0] [%c4, %c64] [1, 1]
-          : memref<24x64xi8> to memref<?x?xi8>
-      %rhs_sub = memref.subview %rhs_[%i, 0] [%c4, %c64] [1, 1]
-          : memref<24x64xi8> to memref<?x?xi8>
-      %out_sub = memref.subview %out_[%i, 0] [%c4, %c64] [1, 1]
-          : memref<24x64xi8> to memref<?x?xi8>
-
-      %result_sub = linalg.generic ...
-      linalg.yield
-    }
-    ```
-  }];
-
-  let arguments = (ins Variadic<Index>:$lowerBound,
-                       Variadic<Index>:$upperBound,
-                       Variadic<Index>:$step,
-                       Variadic<AnyType>:$inputs,
-                       Variadic<AnyShaped>:$outputs,
-                       ArrayAttr:$iterator_types,
-                       OptionalAttr<ArrayAttr>:$distribution_types);
-  let results = (outs Variadic<AnyRankedTensor>:$results);
-  let regions = (region SizedRegion<1>:$region);
-
-  let builders = [
-    OpBuilder<(ins "ValueRange":$lowerBounds, "ValueRange":$upperBounds,
-      "ValueRange":$steps, "ValueRange":$inputs, "ValueRange":$outputs,
-      "ArrayAttr":$iteratorTypes, "Optional<ArrayAttr>":$distributionTypes,
-      CArg<"function_ref<void (OpBuilder &, Location, /*ivs=*/ValueRange,"
-        "/*inputs=*/ValueRange, /*outputs=*/ValueRange)>",
-        "nullptr">:$bodyBuilderFn)>,
-    OpBuilder<(ins "ValueRange":$lowerBounds, "ValueRange":$upperBounds,
-      "ValueRange":$steps, "ValueRange":$inputs, "ValueRange":$outputs,
-      "ArrayAttr":$iteratorTypes,
-      CArg<"function_ref<void (OpBuilder &, Location, /*ivs=*/ValueRange,"
-        "/*inputs=*/ValueRange, /*outputs=*/ValueRange)>",
-        "nullptr">:$bodyBuilderFn)>,
-  ];
-
-  let extraClassDeclaration = [{
-    /// Number of loops
-    unsigned getNumLoops() { return step().size(); }
-
-    /// Number of input operands
-    unsigned getNumInputs() { return inputs().size(); }
-
-    /// Number of output operands
-    unsigned getNumOutputs() { return outputs().size(); }
-
-    /// Number of operands controlling the loop: lbs, ubs, steps
-    unsigned getNumControlOperands() { return 3 * getNumLoops(); }
-
-    ValueRange getInductionVars() {
-      return getBody()->getArguments().take_front(getNumLoops());
-    }
-    ValueRange getRegionInputArgs() {
-      return getBody()->getArguments().slice(getNumLoops(), inputs().size());
-    }
-    ValueRange getRegionOutputArgs() {
-      return getBody()->getArguments().take_back(outputs().size());
-    }
-
-    void setDistributionTypes(Builder& b, ArrayRef<StringRef> types) {
-      assert(types.size() == getNumLoops() &&
-             "expected distribution type for every dimension");
-      distribution_typesAttr(b.getStrArrayAttr(types));
-    }
-
-    void setLowerBounds(ValueRange lowerBounds) {
-      unsigned numLoops = getNumLoops();
-      assert(lowerBounds.size() == numLoops &&
-             "expected lower bounds for every loop dimension");
-      for (unsigned i = 0; i < numLoops; ++i)
-        setOperand(i, lowerBounds[i]);
-    }
-
-    void setUpperBounds(ValueRange upperBounds) {
-      unsigned numLoops = getNumLoops();
-      assert(upperBounds.size() == numLoops &&
-             "expected upper bounds for every loop dimension");
-      for (unsigned i = 0, pos = numLoops; i < numLoops; ++i, ++pos)
-        setOperand(pos, upperBounds[i]);
-    }
-
-    void setSteps(ValueRange steps) {
-      unsigned numLoops = getNumLoops();
-      assert(steps.size() == numLoops &&
-             "expected upper bounds for every loop dimension");
-      for (unsigned i = 0, pos = 2 * numLoops; i < numLoops; ++i, ++pos)
-        setOperand(pos, steps[i]);
-    }
-
-    /// Operand that corresponds to the `bbArg` block argument.
-    OpOperand& getTiedOperand(BlockArgument& bbArg) {
-      return getOperation()->getOpOperand(getNumControlOperands() +
-                                          bbArg.getArgNumber() - getNumLoops());
-    }
-
-    /// Block argument that corresponds to the `input` or `output` operand.
-    BlockArgument getTiedBlockArgument(OpOperand& operand) {
-      auto operandIndex = operand.getOperandNumber();
-      assert(
-          operandIndex >= getNumControlOperands() &&
-          operandIndex < getNumOperands() &&
-          "tied block arg is defined only for `input` and `output` arguments");
-      return getBody()->getArgument(operandIndex - 2 * getNumLoops());
-    }
-
-    /// Result that corresponds to the `outputs` argument of tensor type.
-    OpResult getTiedOpResult(OpOperand& opOperand) {
-      // No result can correspond to a memref argument.
-      if (opOperand.get().getType().isa<MemRefType>()) return OpResult();
-
-      // Check whether the operand index is in bounds of `outputs()` arg.
-      int operandIndex = opOperand.getOperandNumber();
-      int outputIndexStart =
-          getNumControlOperands() + inputs().size();
-      int outputIndexEnd = outputIndexStart + outputs().size();
-      if (operandIndex < outputIndexStart || operandIndex >= outputIndexEnd)
-        return OpResult();
-
-      // Count tensor arguments in `outputs` to compute the result index.
-      int tensorId = -1;
-      for (int i = outputIndexStart; i <= operandIndex; ++i)
-        tensorId += getOperand(i).getType().isa<RankedTensorType>();
-      return getOperation()->getResult(tensorId);
-    }
-
-    /// Append `operand` to the `input` arguments.
-    OpOperand& appendInputOperand(OpBuilder& builder, Value operand) {
-      int numLoops = getNumLoops();
-      int numInputs = getNumInputs();
-      int numOutputs = getNumOutputs();
-
-      getOperation()->insertOperands(getNumControlOperands() + numInputs,
-                                     operand);
-      getBody()->insertArgument(numLoops + numInputs, operand.getType(), 
-                                getLoc());
-      getOperation()->setAttr(
-          TiledLoopOp::getOperandSegmentSizeAttr(),
-          builder.getI32VectorAttr(
-              {numLoops, numLoops, numLoops, numInputs + 1, numOutputs}));
-      return getOperation()->getOpOperand(getNumControlOperands() + numInputs);
-    }
-
-    /// Append `operand` to the `output` arguments.
-    OpOperand& appendOutputOperand(OpBuilder& builder, Value operand) {
-      int numLoops = getNumLoops();
-      int numInputs = getNumInputs();
-      int numOutputs = getNumOutputs();
-
-      getOperation()->insertOperands(
-          getNumControlOperands() + numInputs + numOutputs, operand);
-      getBody()->insertArgument(numLoops + numInputs + numOutputs,
-                                operand.getType(), getLoc());
-      getOperation()->setAttr(
-          TiledLoopOp::getOperandSegmentSizeAttr(),
-          builder.getI32VectorAttr(
-              {numLoops, numLoops, numLoops, numInputs, numOutputs + 1}));
-      return getOperation()->getOpOperand(getNumControlOperands() + numInputs +
-                                          numOutputs);
-    }
-
-    /// Erase `operand` from the `input` or `output` arguments.
-    void eraseOperand(OpBuilder& builder, OpOperand& operand) {
-      int numInputs = getNumInputs();
-      int numLoops = getNumLoops();
-      int numOutputs = getNumOutputs();
-      int numControlOperands = getNumControlOperands();
-
-      int operandIndex = operand.getOperandNumber();
-      assert(operandIndex >= numControlOperands &&
-             operandIndex < static_cast<int>(getNumOperands()) &&
-             "Can erase only `input` or `output` operand");
-
-      if (operandIndex >= numControlOperands + numInputs)
-        --numOutputs;
-      else
-        --numInputs;
-
-      getOperation()->eraseOperand(operandIndex);
-      getBody()->eraseArgument(operandIndex - 2 * numLoops);
-      getOperation()->setAttr(
-          TiledLoopOp::getOperandSegmentSizeAttr(),
-          builder.getI32VectorAttr(
-              {numLoops, numLoops, numLoops, numInputs, numOutputs}));
-    }
-
-    OpOperand* findInputOperand(Value value) {
-      OperandRange::iterator it = llvm::find(inputs(), value);
-      if (it == inputs().end()) return nullptr;
-      return it.getBase();
-    }
-
-    OpOperand* findOutputOperand(Value value) {
-      OperandRange::iterator it = llvm::find(outputs(), value);
-      if (it == outputs().end()) return nullptr;
-      return it.getBase();
-    }
-
-    /// Return whether the op has only MemRef input and outputs.
-    bool hasBufferSemantics() {
-      Operation* op = this->getOperation();
-      return op->getNumResults() == 0 &&
-             llvm::all_of(op->getOpOperands(), [&](OpOperand & operand) {
-               return !operand.get().getType().template isa<ShapedType>() ||
-                      operand.get().getType().template isa<MemRefType>();
-             });
-    }
-
-    /// Return whether the loop dimension is parallel or not.
-    bool isParallelDimension(unsigned dim) {
-      StringAttr attr = this->iterator_types()[dim].cast<StringAttr>();
-      return attr.getValue() == getParallelIteratorTypeName();
-    }
-  }];
-
-  let hasCanonicalizer = 1;
-  let hasCustomAssemblyFormat = 1;
-  let hasFolder = 1;
-  let hasVerifier = 1;
-}
-
 def Linalg_IndexOp : Linalg_Op<"index", [NoSideEffect]>,
     Arguments<(ins Confined<I64Attr, [IntMinValue<0>]>:$dim)>,
     Results<(outs Index:$result)> {
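
For context on the accessor arithmetic deleted above: operands were laid out as
[lbs, ubs, steps, inputs, outputs] and block arguments as [ivs, inputs,
outputs], so `getTiedOperand`/`getTiedBlockArgument` simply offset by the
control-operand count. A minimal single-loop sketch (indices given as comments;
names illustrative):

```mlir
// Control operands 0..2: %c0, %c24, %c4 (lb, ub, step).
// Operand 3: input %in, tied to bbArg 1 (%in_, right after the iv %i).
// Operand 4: output %out, tied to bbArg 2 (%out_) and to result %0.
%0 = linalg.tiled_loop (%i) = (%c0) to (%c24) step (%c4)
    ins (%in_ = %in : tensor<24x64xi8>)
    outs (%out_ = %out : tensor<24x64xi8>) {
  ...
  linalg.yield %result : tensor<24x64xi8>
}
```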

diff --git a/mlir/include/mlir/Dialect/Linalg/Passes.h b/mlir/include/mlir/Dialect/Linalg/Passes.h
index 487362c62e60a..3f8719b0782b5 100644
--- a/mlir/include/mlir/Dialect/Linalg/Passes.h
+++ b/mlir/include/mlir/Dialect/Linalg/Passes.h
@@ -31,10 +31,10 @@ std::unique_ptr<Pass> createFoldReshapeOpsByLinearizationPass();
 
 std::unique_ptr<Pass> createLinalgNamedOpConversionPass();
 
-std::unique_ptr<OperationPass<FuncOp>> createLinalgTilingPass(
-    ArrayRef<int64_t> tileSizes = {},
-    linalg::LinalgTilingLoopType loopType = linalg::LinalgTilingLoopType::Loops,
-    ArrayRef<StringRef> distributionTypes = {});
+std::unique_ptr<OperationPass<FuncOp>>
+createLinalgTilingPass(ArrayRef<int64_t> tileSizes = {},
+                       linalg::LinalgTilingLoopType loopType =
+                           linalg::LinalgTilingLoopType::Loops);
 
 std::unique_ptr<OperationPass<FuncOp>>
 createLinalgPromotionPass(bool dynamicBuffers, bool useAlloca);
@@ -42,10 +42,6 @@ std::unique_ptr<OperationPass<FuncOp>> createLinalgPromotionPass();
 
 std::unique_ptr<OperationPass<FuncOp>> createLinalgInlineScalarOperandsPass();
 
-/// Create a pass to convert Linalg tiled loops to `scf.for` and `scf.parallel`
-/// loops and memref.load/memref.store accesses.
-std::unique_ptr<OperationPass<FuncOp>> createConvertLinalgTiledLoopsToSCFPass();
-
 /// Create a pass to convert Linalg operations to scf.for loops and
 /// memref.load/memref.store accesses.
 std::unique_ptr<OperationPass<FuncOp>> createConvertLinalgToLoopsPass();

diff --git a/mlir/include/mlir/Dialect/Linalg/Passes.td b/mlir/include/mlir/Dialect/Linalg/Passes.td
index dc14011c8fd13..22989386f6b95 100644
--- a/mlir/include/mlir/Dialect/Linalg/Passes.td
+++ b/mlir/include/mlir/Dialect/Linalg/Passes.td
@@ -121,17 +121,6 @@ def LinalgNamedOpConversion: Pass<"linalg-named-op-conversion"> {
   let dependentDialects = ["linalg::LinalgDialect", "tensor::TensorDialect"];
 }
 
-def LinalgLowerTiledLoopsToSCF
-    : Pass<"convert-linalg-tiled-loops-to-scf", "FuncOp"> {
-  let summary = "Lower linalg tiled loops to SCF loops and parallel loops";
-  let constructor = "mlir::createConvertLinalgTiledLoopsToSCFPass()";
-  let dependentDialects = [
-    "linalg::LinalgDialect",
-    "scf::SCFDialect",
-    "AffineDialect"
-  ];
-}
-
 def LinalgInlineScalarOperands : Pass<"linalg-inline-scalar-operands", "FuncOp"> {
   let summary = "Inline scalar operands into linalg generic ops";
   let constructor = "mlir::createLinalgInlineScalarOperandsPass()";
@@ -207,12 +196,7 @@ def LinalgTiling : Pass<"linalg-tile", "FuncOp"> {
     ListOption<"tileSizes", "tile-sizes", "int64_t", "Tile sizes",
                "llvm::cl::ZeroOrMore, llvm::cl::MiscFlags::CommaSeparated">,
     Option<"loopType", "loop-type", "std::string", /*default=*/"\"for\"",
-           "Specify the type of loops to generate: for, parallel or "
-           "tiled_loop">,
-    ListOption<"distributionTypes", "distribution-types", "std::string",
-               "DistributionTypes (if loop-type=tiled_loop)",
-               "llvm::cl::ZeroOrMore, llvm::cl::MiscFlags::CommaSeparated">
-
+           "Specify the type of loops to generate: for, parallel">
   ];
 }
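
With `distribution-types` gone, a hypothetical lit-style invocation of the
remaining options might look as follows (file contents and tile sizes are
placeholders; `loop-type` now only accepts "for" or "parallel"):

```mlir
// RUN: mlir-opt %s -linalg-tile="tile-sizes=32,64 loop-type=parallel"
func @fill(%value : f32, %out : memref<128x128xf32>) {
  linalg.fill(%value, %out) : f32, memref<128x128xf32>
  return
}
```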
 

diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
index 50e6191db5e8a..4c0146056aa72 100644
--- a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
@@ -131,9 +131,6 @@ void populateFoldUnitExtentDimsPatterns(RewritePatternSet &patterns);
 /// Patterns that are used to inline constant operands into linalg generic ops.
 void populateInlineConstantOperandsPatterns(RewritePatternSet &patterns);
 
-/// Pattern to convert TiledLoopOp to SCF loops.
-void populateTiledLoopToSCFPattern(RewritePatternSet &patterns);
-
 /// Options that control fusion of elementwise operations.
 struct LinalgElementwiseFusionOptions {
   /// Enable fusion of reshapes into the shape with elementwise operations. By
@@ -1248,13 +1245,6 @@ void populateDecomposeConvolutionPatterns(
     const LinalgTransformationFilter &filter = LinalgTransformationFilter(),
     PatternBenefit benefit = 1);
 
-/// Linalg distribution patterns
-//
-/// Populates `patterns` with patterns to distribute linalg.tiled_loop.
-void populateLinalgDistributeTiledLoopPattern(
-    RewritePatternSet &patterns, const LinalgLoopDistributionOptions &opts,
-    const LinalgTransformationFilter &marker);
-
 //===----------------------------------------------------------------------===//
 // Op-specific patterns.
 //===----------------------------------------------------------------------===//
@@ -1368,31 +1358,6 @@ struct LinalgCopyVTWForwardingPattern
                                 PatternRewriter &rewriter) const override;
 };
 
-/// Rewrite a TiledLoopOp whose bounds/step potentially do not divide the
-/// iteration space evenly into a TiledLoopOp where the step divides it evenly,
-/// followed by another TiledLoopOp for the last (partial) iteration (if any).
-/// This transformation is called "loop peeling".
-///
-/// This function peels the `idx`-th loop of the TiledLoopOp. To peel all loops
-/// in the loop nest, this function must be called multiple times.
-///
-/// After loop peeling, this function tries to simplify/canonicalize affine.min
-/// and affine.max ops in the body of the two TiledLoopOps. For more details,
-/// refer to `mlir::scf::peelAndCanonicalizeForLoop`.
-///
-/// The return value indicates whether the loop was rewritten or not. Loops are
-/// not rewritten if:
-/// * Loop step size is 1 or
-/// * Loop bounds and step size are static, and step already divides the
-///   iteration space evenly.
-///
-/// Note: This function rewrites the given TiledLoopOp in-place and clones the
-/// TiledLoopOp operation for the last iteration. It replaces all uses of the
-/// unpeeled TiledLoopOp with the results of the newly generated TiledLoopOp.
-LogicalResult peelAndCanonicalizeTiledLoop(RewriterBase &rewriter,
-                                           TiledLoopOp loopOp, int64_t idx,
-                                           TiledLoopOp &result);
-
 //===----------------------------------------------------------------------===//
 // Support for staged pattern application.
 //===----------------------------------------------------------------------===//
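
To make the removed peeling contract concrete: for bounds (0, 10) with step 4,
peeling loop 0 splits at 8, the largest multiple of the step below the upper
bound. A hand-written sketch (operands elided):

```mlir
// Before: step 4 does not divide the range [0, 10) evenly.
linalg.tiled_loop (%i) = (%c0) to (%c10) step (%c4) ... { ... }

// After peeling loop 0: a main loop over [0, 8), which the step divides
// evenly, plus a clone of the loop for the partial iteration [8, 10);
// affine.min/max ops in both bodies are then canonicalized.
linalg.tiled_loop (%i) = (%c0) to (%c8) step (%c4) ... { ... }
linalg.tiled_loop (%i) = (%c8) to (%c10) step (%c4) ... { ... }
```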

diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
index 44179ebe60757..0a556b3d99eb6 100644
--- a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
+++ b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
@@ -105,44 +105,6 @@ static LogicalResult foldMemRefCast(Operation *op) {
   return success(folded);
 }
 
-/// This is a specialization of `foldMemRefCast` used for patterns of the form
-/// ```
-///    tiled_loop(memrefcast(%src)) -> tiled_loop(%src)
-/// ```
-/// It folds the source of the memref.cast into the root operation directly.
-static LogicalResult foldMemRefCastInTiledLoopOp(TiledLoopOp op) {
-  bool folded = false;
-  Location loc = op->getLoc();
-
-  Block *body = op.getBody();
-  OpBuilder b = OpBuilder::atBlockBegin(body);
-
-  // Update `input` and `output` operands and block arguments if necessary.
-  // Operands list: [lbs, ubs, steps, inputs, outputs].
-  // Block args list: [ivs, inputs, outputs].
-  for (size_t operandIndex = op.getNumControlOperands(),
-              bbArgIndex = op.getNumLoops(), e = op.getNumOperands();
-       operandIndex < e; ++operandIndex, ++bbArgIndex) {
-    OpOperand &operand = op->getOpOperand(operandIndex);
-
-    auto castOp = operand.get().getDefiningOp<memref::CastOp>();
-    if (castOp && memref::CastOp::canFoldIntoConsumerOp(castOp)) {
-      operand.set(castOp.getOperand());
-      BlockArgument newBbArg = body->insertArgument(
-          bbArgIndex, castOp.getOperand().getType(), op.getLoc());
-      BlockArgument oldBbArg = body->getArgument(newBbArg.getArgNumber() + 1);
-
-      // Insert memref.cast back to the original type.
-      oldBbArg.replaceAllUsesWith(
-          b.create<memref::CastOp>(loc, oldBbArg.getType(), newBbArg));
-      body->eraseArgument(oldBbArg.getArgNumber());
-
-      folded = true;
-    }
-  }
-  return success(folded);
-}
-
 //===----------------------------------------------------------------------===//
 // Region builder helper.
 // TODO: Move this to a utility library.
@@ -1247,630 +1209,9 @@ LogicalResult linalg::YieldOp::verify() {
   if (auto linalgOp = dyn_cast<LinalgOp>(parentOp))
     return verifyYield(*this, cast<LinalgOp>(parentOp));
 
-  if (auto tiledLoopOp = dyn_cast<linalg::TiledLoopOp>(parentOp)) {
-    // Check if output args with tensor types match results types.
-    SmallVector<Value, 2> tensorOuts;
-    llvm::copy_if(
-        tiledLoopOp.outputs(), std::back_inserter(tensorOuts),
-        [&](Value out) { return out.getType().isa<RankedTensorType>(); });
-    if (tensorOuts.size() != values().size())
-      return emitOpError("expected number of tensor output args = ")
-             << tensorOuts.size()
-             << " to match the number of yield operands = " << values().size();
-
-    TypeRange tensorTypes(llvm::makeArrayRef(tensorOuts));
-    for (auto &item :
-         llvm::enumerate(llvm::zip(tensorTypes, getOperandTypes()))) {
-      Type outType, resultType;
-      unsigned index = item.index();
-      std::tie(outType, resultType) = item.value();
-      if (outType != resultType)
-        return emitOpError("expected yield operand ")
-               << index << " with type = " << resultType
-               << " to match output arg type = " << outType;
-    }
-    return success();
-  }
   return emitOpError("expected parent op with LinalgOp interface");
 }
 
-//===----------------------------------------------------------------------===//
-// TiledLoopOp
-//===----------------------------------------------------------------------===//
-
-void TiledLoopOp::build(OpBuilder &builder, OperationState &result,
-                        ValueRange lowerBounds, ValueRange upperBounds,
-                        ValueRange steps, ValueRange inputs, ValueRange outputs,
-                        ArrayAttr iteratorTypes,
-                        function_ref<void(OpBuilder &, Location, ValueRange,
-                                          ValueRange, ValueRange)>
-                            bodyBuilderFn) {
-  build(builder, result, lowerBounds, upperBounds, steps, inputs, outputs,
-        iteratorTypes, llvm::None, bodyBuilderFn);
-}
-
-void TiledLoopOp::build(OpBuilder &builder, OperationState &result,
-                        ValueRange lowerBounds, ValueRange upperBounds,
-                        ValueRange steps, ValueRange inputs, ValueRange outputs,
-                        ArrayAttr iteratorTypes,
-                        Optional<ArrayAttr> distributionTypes,
-                        function_ref<void(OpBuilder &, Location, ValueRange,
-                                          ValueRange, ValueRange)>
-                            bodyBuilderFn) {
-  result.addOperands(lowerBounds);
-  result.addOperands(upperBounds);
-  result.addOperands(steps);
-  result.addOperands(inputs);
-  result.addOperands(outputs);
-  result.addAttribute(
-      TiledLoopOp::getOperandSegmentSizeAttr(),
-      builder.getI32VectorAttr({static_cast<int32_t>(lowerBounds.size()),
-                                static_cast<int32_t>(upperBounds.size()),
-                                static_cast<int32_t>(steps.size()),
-                                static_cast<int32_t>(inputs.size()),
-                                static_cast<int32_t>(outputs.size())}));
-  result.addAttribute(getIteratorTypesAttrName(), iteratorTypes);
-
-  if (distributionTypes.hasValue())
-    result.addAttribute(getDistributionTypesAttrName(),
-                        distributionTypes.getValue());
-
-  // Add output types for `RankedTensorType` output arguments.
-  for (Value output : outputs) {
-    Type outputType = output.getType();
-    if (outputType.isa<RankedTensorType>())
-      result.addTypes(outputType);
-  }
-
-  OpBuilder::InsertionGuard guard(builder);
-  unsigned numIVs = steps.size();
-  SmallVector<Type, 8> argTypes(numIVs, builder.getIndexType());
-  SmallVector<Location, 8> argLocs(numIVs, result.location);
-  for (Value input : inputs) {
-    argTypes.push_back(input.getType());
-    argLocs.push_back(input.getLoc());
-  }
-  for (Value output : outputs) {
-    argTypes.push_back(output.getType());
-    argLocs.push_back(output.getLoc());
-  }
-  Region *bodyRegion = result.addRegion();
-  Block *bodyBlock = builder.createBlock(bodyRegion, {}, argTypes, argLocs);
-
-  if (bodyBuilderFn) {
-    builder.setInsertionPointToStart(bodyBlock);
-    bodyBuilderFn(builder, result.location,
-                  bodyBlock->getArguments().take_front(numIVs),
-                  bodyBlock->getArguments().slice(numIVs, inputs.size()),
-                  bodyBlock->getArguments().take_back(outputs.size()));
-    TiledLoopOp::ensureTerminator(*bodyRegion, builder, result.location);
-  }
-}
-
-void TiledLoopOp::print(OpAsmPrinter &p) {
-  p << " (" << getInductionVars() << ") = (" << lowerBound() << ") to ("
-    << upperBound() << ") step (" << step() << ")";
-
-  if (!inputs().empty()) {
-    p << " ins (";
-    llvm::interleaveComma(llvm::zip(getRegionInputArgs(), inputs()), p,
-                          [&](auto it) {
-                            p << std::get<0>(it) << " = " << std::get<1>(it)
-                              << ": " << std::get<1>(it).getType();
-                          });
-    p << ")";
-  }
-  if (!outputs().empty()) {
-    p << " outs (";
-    llvm::interleaveComma(llvm::zip(getRegionOutputArgs(), outputs()), p,
-                          [&](auto it) {
-                            p << std::get<0>(it) << " = " << std::get<1>(it)
-                              << ": " << std::get<1>(it).getType();
-                          });
-    p << ")";
-  }
-
-  if (llvm::any_of(iterator_types(), [](Attribute attr) {
-        return attr.cast<StringAttr>().getValue() !=
-               getParallelIteratorTypeName();
-      }))
-    p << " iterators" << iterator_types();
-
-  if (distribution_types().hasValue())
-    p << " distribution" << distribution_types().getValue();
-
-  p << ' ';
-  p.printRegion(region(), /*printEntryBlockArgs=*/false);
-  p.printOptionalAttrDict((*this)->getAttrs(), /*elidedAttrs=*/{
-                              TiledLoopOp::getOperandSegmentSizeAttr(),
-                              getIteratorTypesAttrName(),
-                              getDistributionTypesAttrName()});
-}
-
-ParseResult TiledLoopOp::parse(OpAsmParser &parser, OperationState &result) {
-  auto &builder = parser.getBuilder();
-  // Parse an opening `(` followed by induction variables followed by `)`
-  SmallVector<OpAsmParser::OperandType, 4> ivs;
-  if (parser.parseRegionArgumentList(ivs, /*requiredOperandCount=*/-1,
-                                     OpAsmParser::Delimiter::Paren))
-    return failure();
-
-  // Parse loop bounds.
-  SmallVector<OpAsmParser::OperandType, 4> lower;
-  if (parser.parseEqual() ||
-      parser.parseOperandList(lower, ivs.size(),
-                              OpAsmParser::Delimiter::Paren) ||
-      parser.resolveOperands(lower, builder.getIndexType(), result.operands))
-    return failure();
-
-  SmallVector<OpAsmParser::OperandType, 4> upper;
-  if (parser.parseKeyword("to") ||
-      parser.parseOperandList(upper, ivs.size(),
-                              OpAsmParser::Delimiter::Paren) ||
-      parser.resolveOperands(upper, builder.getIndexType(), result.operands))
-    return failure();
-
-  // Parse step values.
-  SmallVector<OpAsmParser::OperandType, 4> steps;
-  if (parser.parseKeyword("step") ||
-      parser.parseOperandList(steps, ivs.size(),
-                              OpAsmParser::Delimiter::Paren) ||
-      parser.resolveOperands(steps, builder.getIndexType(), result.operands))
-    return failure();
-
-  // Parse input tensors.
-  SmallVector<OpAsmParser::OperandType, 4> inputs, inputRegionArgs;
-  SmallVector<Type, 4> inputTypes;
-  if (succeeded(parser.parseOptionalKeyword("ins"))) {
-    SMLoc inputsOperandsLoc = parser.getCurrentLocation();
-
-    if (parser.parseAssignmentListWithTypes(inputRegionArgs, inputs,
-                                            inputTypes))
-      return failure();
-
-    if (parser.resolveOperands(inputs, inputTypes, inputsOperandsLoc,
-                               result.operands))
-      return failure();
-  }
-
-  // Parse output tensors.
-  SmallVector<OpAsmParser::OperandType, 4> outputs, outputRegionArgs;
-  SmallVector<Type, 4> outputTypes;
-  if (succeeded(parser.parseOptionalKeyword("outs"))) {
-    SMLoc outputsOperandsLoc = parser.getCurrentLocation();
-
-    if (parser.parseAssignmentListWithTypes(outputRegionArgs, outputs,
-                                            outputTypes))
-      return failure();
-
-    if (parser.resolveOperands(outputs, outputTypes, outputsOperandsLoc,
-                               result.operands))
-      return failure();
-    for (Type outputType : outputTypes)
-      if (outputType.isa<RankedTensorType>())
-        result.addTypes(outputType);
-  }
-
-  // Parse attributes.
-  SmallVector<Attribute, 4> iterTypes, distributionTypes;
-  auto parseAttr = [&](StringRef keyword, SmallVector<Attribute, 4> *attrs) {
-    if (succeeded(parser.parseOptionalKeyword(keyword))) {
-      StringAttr attr;
-
-      if (parser.parseLSquare() || parser.parseAttribute(attr))
-        return failure();
-      attrs->push_back(attr);
-      for (int i = 1, e = ivs.size(); i < e; ++i) {
-        if (parser.parseComma() || parser.parseAttribute(attr))
-          return failure();
-        attrs->push_back(attr);
-      }
-      if (parser.parseRSquare())
-        return failure();
-    }
-    return success();
-  };
-  if (failed(parseAttr("iterators", &iterTypes)) ||
-      failed(parseAttr("distribution", &distributionTypes)))
-    return failure();
-
-  // Set all loop iterator types to "parallel" if they are not printed in IR.
-  if (iterTypes.empty()) {
-    auto parallelIter = builder.getStringAttr(getParallelIteratorTypeName());
-    iterTypes = SmallVector<Attribute, 4>(ivs.size(), parallelIter);
-  }
-  result.addAttribute(getIteratorTypesAttrName(),
-                      builder.getArrayAttr(iterTypes));
-  if (!distributionTypes.empty())
-    result.addAttribute(getDistributionTypesAttrName(),
-                        builder.getArrayAttr(distributionTypes));
-  result.addAttribute(
-      TiledLoopOp::getOperandSegmentSizeAttr(),
-      builder.getI32VectorAttr({static_cast<int32_t>(lower.size()),
-                                static_cast<int32_t>(upper.size()),
-                                static_cast<int32_t>(steps.size()),
-                                static_cast<int32_t>(inputs.size()),
-                                static_cast<int32_t>(outputs.size())}));
-
-  // Parse the body.
-  Region *body = result.addRegion();
-
-  SmallVector<Type, 4> regionTypes(ivs.size(), builder.getIndexType());
-  regionTypes.append(inputTypes);
-  regionTypes.append(outputTypes);
-
-  SmallVector<OpAsmParser::OperandType, 4> regionArgs(ivs);
-  regionArgs.append(inputRegionArgs);
-  regionArgs.append(outputRegionArgs);
-
-  if (parser.parseRegion(*body, regionArgs, regionTypes))
-    return failure();
-
-  // Parse optional attributes.
-  parser.parseOptionalAttrDict(result.attributes);
-
-  return success();
-}
-
-Region &TiledLoopOp::getLoopBody() { return region(); }
-
-LogicalResult TiledLoopOp::moveOutOfLoop(ArrayRef<Operation *> ops) {
-  for (auto *op : ops)
-    op->moveBefore(*this);
-  return success();
-}
-
-bool TiledLoopOp::isDefinedOutsideOfLoop(Value value) {
-  return !region().isAncestor(value.getParentRegion());
-}
-
-LogicalResult TiledLoopOp::verify() {
-  // Check if iterator types are provided for every loop dimension.
-  if (iterator_types().size() != getNumLoops())
-    return emitOpError("expected iterator types array attribute size = ")
-           << iterator_types().size()
-           << " to match the number of loops = " << getNumLoops();
-
-  // Check if types of input arguments match region args types.
-  for (auto &item :
-       llvm::enumerate(llvm::zip(inputs(), getRegionInputArgs()))) {
-    Value input, inputRegionArg;
-    unsigned index = item.index();
-    std::tie(input, inputRegionArg) = item.value();
-    if (input.getType() != inputRegionArg.getType())
-      return emitOpError("expected input arg ")
-             << index << " with type = " << input.getType()
-             << " to match region arg " << index + getNumLoops()
-             << " type = " << inputRegionArg.getType();
-  }
-
-  // Check if types of output arguments match region args types.
-  for (auto &item :
-       llvm::enumerate(llvm::zip(outputs(), getRegionOutputArgs()))) {
-    Value output, outputRegionArg;
-    unsigned index = item.index();
-    std::tie(output, outputRegionArg) = item.value();
-    if (output.getType() != outputRegionArg.getType())
-      return emitOpError("expected output arg ")
-             << index << " with type = " << output.getType()
-             << " to match region arg "
-             << index + getNumLoops() + inputs().size()
-             << " type = " << outputRegionArg.getType();
-  }
-  return success();
-}
-
-namespace {
-
-static constexpr int64_t kNoMatch = -1;
-
-// Folds away TiledLoopOp inputs if they have no uses within the body.
-//
-// Example:
-//
-// %0 = linalg.tiled_loop ...  ins (%in_ = %in: tensor<...>,
-//                                  %in_buf_ = %in_buf: memref<...>) {...}
-// Becomes
-//
-// linalg.tiled_loop ...  ins (%in_buf_ = %in_buf: memref<...>) {...}
-struct TiledLoopInputsFolder : public OpRewritePattern<linalg::TiledLoopOp> {
-  using OpRewritePattern<linalg::TiledLoopOp>::OpRewritePattern;
-
-  LogicalResult matchAndRewrite(linalg::TiledLoopOp tiledLoop,
-                                PatternRewriter &rewriter) const final {
-    SmallVector<Value, 2> newInputs, regionInputTensorArgs;
-    // Store ids of the corresponding old and new input operands.
-    SmallVector<int64_t, 2> oldInputIdToNew(tiledLoop.inputs().size(),
-                                            kNoMatch);
-    for (const auto &en : llvm::enumerate(
-             llvm::zip(tiledLoop.inputs(), tiledLoop.getRegionInputArgs()))) {
-      Value in, bbArg;
-      size_t index = en.index();
-      std::tie(in, bbArg) = en.value();
-      if (!bbArg.use_empty()) {
-        oldInputIdToNew[index] = newInputs.size();
-        newInputs.push_back(in);
-      }
-    }
-    if (newInputs.size() == tiledLoop.inputs().size())
-      return failure();
-    Location loc = tiledLoop.getLoc();
-    auto newTiledLoop = rewriter.create<TiledLoopOp>(
-        loc, tiledLoop.lowerBound(), tiledLoop.upperBound(), tiledLoop.step(),
-        newInputs, tiledLoop.outputs(), tiledLoop.iterator_types(),
-        tiledLoop.distribution_types());
-
-    // Clone the region.
-    BlockAndValueMapping bvm;
-    bvm.map(tiledLoop.getInductionVars(), newTiledLoop.getInductionVars());
-    bvm.map(tiledLoop.getRegionOutputArgs(),
-            newTiledLoop.getRegionOutputArgs());
-    for (const auto &en : llvm::enumerate(oldInputIdToNew))
-      if (en.value() != kNoMatch)
-        bvm.map(tiledLoop.getRegionInputArgs()[en.index()],
-                newTiledLoop.getRegionInputArgs()[en.value()]);
-    OpBuilder innerBuilder =
-        OpBuilder::atBlockEnd(newTiledLoop.getBody(), rewriter.getListener());
-    for (auto &op : *tiledLoop.getBody())
-      innerBuilder.clone(op, bvm);
-    rewriter.replaceOp(tiledLoop, newTiledLoop.getResults());
-
-    return success();
-  }
-};
-
-} // namespace
-
-/// A simple, conservative analysis to determine if the loop is shape
-/// preserving, i.e., the type of the `arg`-th yielded value is the same as the
-/// type of the corresponding basic block argument of the loop.
-/// Note: This function handles only simple cases. Expand as needed.
-static bool isShapePreserving(TiledLoopOp loopOp, int64_t arg) {
-  auto yieldOp = cast<YieldOp>(loopOp.getLoopBody().front().getTerminator());
-  if (yieldOp.values().empty())
-    // Tiled loop either has no outputs or is a "memref-based version". In
-    // either case, the loop is shape preserving.
-    return true;
-  assert(arg < static_cast<int64_t>(yieldOp.values().size()) &&
-         "arg is out of bounds");
-  Value value = yieldOp.values()[arg];
-  while (value) {
-    if (value == loopOp.getRegionOutputArgs()[arg])
-      return true;
-    OpResult opResult = value.dyn_cast<OpResult>();
-    if (!opResult)
-      return false;
-
-    using tensor::InsertSliceOp;
-    value = llvm::TypeSwitch<Operation *, Value>(opResult.getOwner())
-                .template Case<InsertSliceOp>(
-                    [&](InsertSliceOp op) { return op.dest(); })
-                .template Case<TiledLoopOp>([&](TiledLoopOp loopOp) {
-                  return isShapePreserving(loopOp, opResult.getResultNumber())
-                             ? loopOp.outputs()[opResult.getResultNumber()]
-                             : Value();
-                })
-                .Default([&](auto op) { return Value(); });
-  }
-  return false;
-}
-
-namespace {
-
-/// Fold dim(x) where `x` is an input/output argument of a TiledLoopOp block
-/// to dim(y) where `y` is the initial input/output value of the argument.
-///
-/// E.g.:
-/// %y = ... : tensor<...>
-/// linalg.tiled_loop ... ins(%x = %y : tensor<...>) {
-///   tensor.dim %x, %c0 : tensor<...>
-/// }
-///
-/// is folded to:
-/// %y = ... : tensor<...>
-/// linalg.tiled_loop ... ins(%x = %y : tensor<...>) {
-///   tensor.dim %y, %c0 : tensor<...>
-/// }
-///
-/// Note: Dim ops are folded only if it can be proven that the runtime type of
-/// the yielded value (in case of outputs) does not change with loop iterations.
-template <typename OpTy>
-struct DimOfTiledLoopInsOutsFolder : public OpRewritePattern<OpTy> {
-  using OpRewritePattern<OpTy>::OpRewritePattern;
-
-  LogicalResult matchAndRewrite(OpTy dimOp,
-                                PatternRewriter &rewriter) const final {
-    auto src = dimOp.source().template dyn_cast<BlockArgument>();
-    if (!src)
-      return failure();
-    auto loopOp =
-        dyn_cast<TiledLoopOp>(src.getOwner()->getParent()->getParentOp());
-    if (!loopOp)
-      return failure();
-    unsigned numLoops = loopOp.getNumLoops();
-    unsigned numInputArgs = loopOp.getRegionInputArgs().size();
-    if (src.getArgNumber() >= numInputArgs + numLoops &&
-        !isShapePreserving(loopOp,
-                           src.getArgNumber() - numInputArgs - numLoops))
-      return failure();
-
-    auto inputArgs = loopOp.getRegionInputArgs();
-    auto it1 = llvm::find(inputArgs, src);
-    if (it1 != inputArgs.end()) {
-      rewriter.updateRootInPlace(dimOp, [&] {
-        dimOp.sourceMutable().assign(loopOp.inputs()[it1 - inputArgs.begin()]);
-      });
-      return success();
-    }
-
-    auto outputArgs = loopOp.getRegionOutputArgs();
-    auto it2 = llvm::find(outputArgs, src);
-    if (it2 != outputArgs.end()) {
-      rewriter.updateRootInPlace(dimOp, [&] {
-        dimOp.sourceMutable().assign(
-            loopOp.outputs()[it2 - outputArgs.begin()]);
-      });
-      return success();
-    }
-
-    return failure();
-  }
-};
-
-/// Fold dim(r) where `r` is the result of a TiledLoopOp to dim(y) where `y`
-/// is the initial output value of the loop.
-///
-/// E.g.:
-/// %y = ... : tensor<...>
-/// %r = linalg.tiled_loop ... outs(%i = %y : tensor<...>) {
-///   ...
-/// }
-/// %0 = tensor.dim %r, %c0 : tensor<...>
-///
-/// is folded to:
-/// %y = ... : tensor<...>
-/// linalg.tiled_loop ... outs(%i = %y : tensor<...>) {
-///   ...
-/// }
-/// %0 = tensor.dim %y, %c0 : tensor<...>
-///
-/// Note: Dim ops are folded only if it can be proven that the runtime type of
-/// the yielded value (in case of outputs) does not change with loop iterations.
-template <typename OpTy>
-struct DimOfTiledLoopResultFolder : public OpRewritePattern<OpTy> {
-  using OpRewritePattern<OpTy>::OpRewritePattern;
-
-  LogicalResult matchAndRewrite(OpTy dimOp,
-                                PatternRewriter &rewriter) const final {
-    auto loopOp = dimOp.source().template getDefiningOp<TiledLoopOp>();
-    if (!loopOp)
-      return failure();
-    auto opResult = dimOp.source().template cast<OpResult>();
-    unsigned resultNumber = opResult.getResultNumber();
-    if (!isShapePreserving(loopOp, resultNumber))
-      return failure();
-    rewriter.updateRootInPlace(dimOp, [&]() {
-      dimOp.sourceMutable().assign(loopOp.outputs()[resultNumber]);
-    });
-    return success();
-  }
-};
-
-// Folds away TiledLoopOp output tensors when the following conditions are met:
-// * result of `linalg.tiled_loop` has no uses
-// * output tensor is the argument of `linalg.yield`
-//
-// Example:
-//
-// %0 = linalg.tiled_loop ...  outs (%o_ = %out: tensor<...>,
-//                                   %obuf_ = %out_buf: memref<...>) {
-//   ...
-//   linalg.yield %o_ : tensor ...
-// }
-//
-// Becomes
-//
-// linalg.tiled_loop ...  outs (%obuf_ = %out_buf: memref<...>) {
-//   ...
-//   linalg.yield
-// }
-struct TiledLoopResultsFolder : public OpRewritePattern<linalg::TiledLoopOp> {
-  using OpRewritePattern<linalg::TiledLoopOp>::OpRewritePattern;
-
-  LogicalResult matchAndRewrite(linalg::TiledLoopOp tiledLoop,
-                                PatternRewriter &rewriter) const final {
-    if (tiledLoop.getNumResults() == 0)
-      return failure();
-
-    Block *block = tiledLoop.getBody();
-    auto yieldOp = cast<linalg::YieldOp>(block->getTerminator());
-
-    // Match the pattern and collect output buffers that will replace the output
-    // tensors and also the ops that will be ignored when cloning the body.
-    SmallVector<Value, 2> newOutputOperands, newYieldArgs;
-    int resultId = 0;
-    // Store ids of the corresponding old and new output operands.
-    SmallVector<int64_t, 2> oldOutputIdToNew(tiledLoop.outputs().size(),
-                                             kNoMatch);
-    // Store ids of the corresponding old and new results.
-    SmallVector<int64_t, 2> oldResultIdToNew(tiledLoop.getNumResults(),
-                                             kNoMatch);
-    SmallVector<Value, 2> resultReplacement(tiledLoop.getNumResults());
-    for (const auto &en : llvm::enumerate(
-             llvm::zip(tiledLoop.outputs(), tiledLoop.getRegionOutputArgs()))) {
-      size_t index = en.index();
-      Value out = std::get<0>(en.value());
-      Value outRegionArg = std::get<1>(en.value());
-
-      if (!out.getType().isa<RankedTensorType>()) {
-        oldOutputIdToNew[index] = newOutputOperands.size();
-        newOutputOperands.push_back(out);
-        continue;
-      }
-      Value result = tiledLoop.getResult(resultId);
-      Value yieldArg = yieldOp.getOperand(resultId);
-      if (yieldArg != outRegionArg || !result.use_empty()) {
-        oldOutputIdToNew[index] = newOutputOperands.size();
-        oldResultIdToNew[resultId] = newYieldArgs.size();
-        resultReplacement[resultId] = out;
-        newOutputOperands.push_back(out);
-        newYieldArgs.push_back(yieldArg);
-      }
-      ++resultId;
-    }
-    if (newOutputOperands.size() == tiledLoop.outputs().size())
-      return failure();
-
-    Location loc = tiledLoop.getLoc();
-    auto newTiledLoop = rewriter.create<TiledLoopOp>(
-        loc, tiledLoop.lowerBound(), tiledLoop.upperBound(), tiledLoop.step(),
-        tiledLoop.inputs(), newOutputOperands, tiledLoop.iterator_types(),
-        tiledLoop.distribution_types());
-
-    // Clone the region.
-    BlockAndValueMapping bvm;
-    bvm.map(tiledLoop.getInductionVars(), newTiledLoop.getInductionVars());
-    bvm.map(tiledLoop.getRegionInputArgs(), newTiledLoop.getRegionInputArgs());
-    for (const auto &en : llvm::enumerate(oldOutputIdToNew)) {
-      if (en.value() != kNoMatch)
-        bvm.map(tiledLoop.getRegionOutputArgs()[en.index()],
-                newTiledLoop.getRegionOutputArgs()[en.value()]);
-      else
-        bvm.map(tiledLoop.getRegionOutputArgs()[en.index()],
-                tiledLoop.outputs()[en.index()]);
-    }
-    OpBuilder innerBuilder =
-        OpBuilder::atBlockEnd(newTiledLoop.getBody(), rewriter.getListener());
-    for (auto &op : tiledLoop.getBody()->without_terminator())
-      innerBuilder.clone(op, bvm);
-    innerBuilder.create<linalg::YieldOp>(
-        loc, llvm::to_vector<2>(llvm::map_range(
-                 newYieldArgs, [&](Value arg) { return bvm.lookup(arg); })));
-
-    for (const auto &en : llvm::enumerate(oldResultIdToNew))
-      if (en.value() != kNoMatch)
-        resultReplacement[en.index()] = newTiledLoop.getResult(en.value());
-    rewriter.replaceOp(tiledLoop, resultReplacement);
-
-    return success();
-  }
-};
-} // namespace
-
-void TiledLoopOp::getCanonicalizationPatterns(RewritePatternSet &results,
-                                              MLIRContext *context) {
-  results.add<TiledLoopInputsFolder, TiledLoopResultsFolder,
-              DimOfTiledLoopInsOutsFolder<tensor::DimOp>,
-              DimOfTiledLoopInsOutsFolder<memref::DimOp>,
-              DimOfTiledLoopResultFolder<tensor::DimOp>,
-              DimOfTiledLoopResultFolder<memref::DimOp>>(context);
-}
-
-LogicalResult TiledLoopOp::fold(ArrayRef<Attribute>,
-                                SmallVectorImpl<OpFoldResult> &) {
-  return foldMemRefCastInTiledLoopOp(*this);
-}
-
 //===----------------------------------------------------------------------===//
 // IndexOp
 //===----------------------------------------------------------------------===//
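
A before/after sketch of the `memref.cast` fold deleted above (shapes
illustrative): the cast's more-static source replaces the loop operand, and a
cast back to the original type is rematerialized at the top of the body:

```mlir
// Before: the loop consumes a static-to-dynamic memref.cast.
%d = memref.cast %buf : memref<24x64xi8> to memref<?x?xi8>
linalg.tiled_loop ... ins (%in_ = %d : memref<?x?xi8>) { ... }

// After folding: %buf is used directly; uses of the old bbArg now go
// through a new memref.cast of the statically shaped bbArg.
linalg.tiled_loop ... ins (%in_ = %buf : memref<24x64xi8>) {
  %in_dyn = memref.cast %in_ : memref<24x64xi8> to memref<?x?xi8>
  ...
}
```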

diff --git a/mlir/lib/Dialect/Linalg/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Dialect/Linalg/Transforms/BufferizableOpInterfaceImpl.cpp
index 799c13726091c..b9d1fc2c29f71 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/BufferizableOpInterfaceImpl.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/BufferizableOpInterfaceImpl.cpp
@@ -246,203 +246,6 @@ struct InitTensorOpInterface
   }
 };
 
-/// Bufferization of linalg.tiled_loop. Replace with a new linalg.tiled_loop
-/// that operates entirely on memrefs.
-struct TiledLoopOpInterface
-    : public BufferizableOpInterface::ExternalModel<TiledLoopOpInterface,
-                                                    linalg::TiledLoopOp> {
-  bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand,
-                              const BufferizationState &state) const {
-    auto tiledLoopOp = cast<linalg::TiledLoopOp>(op);
-
-    // linalg.tiled_loop operands alone do not bufferize to a memory read, but
-    // one of the uses of their matching bbArgs may.
-    return state.isValueRead(tiledLoopOp.getTiedBlockArgument(opOperand));
-  }
-
-  bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand,
-                               const BufferizationState &state) const {
-    auto bufferizableOp = cast<BufferizableOpInterface>(op);
-
-    // Only operands with an aliasing OpResult (i.e., output operands) bufferize
-    // to a memory write.
-    return !bufferizableOp.getAliasingOpResult(opOperand, state).empty();
-  }
-
-  SmallVector<OpResult>
-  getAliasingOpResult(Operation *op, OpOperand &opOperand,
-                      const BufferizationState &state) const {
-    auto tiledLoopOp = cast<linalg::TiledLoopOp>(op);
-
-    // Output operands are tied to their corresponding OpResults.
-    OpResult opResult = tiledLoopOp.getTiedOpResult(opOperand);
-    if (!opResult)
-      return {};
-    return {opResult};
-  }
-
-  BufferRelation bufferRelation(Operation *op, OpResult opResult,
-                                const BufferizationState &state) const {
-    return BufferRelation::Equivalent;
-  }
-
-  bool isWritable(Operation *op, Value value,
-                  const BufferizationState &state) const {
-    // Interestingly, linalg::TiledLoopOp's bbArgs can **always** be viewed
-    // inplace from the perspective of nested ops:
-    //   1. Either the matching iter operand is not bufferized inplace and an
-    //      alloc + optional copy makes the bbArg itself inplaceable.
-    //   2. Or the matching iter operand is bufferized inplace and bbArg just
-    //      bufferizes to that too.
-    return true;
-  }
-
-  bool isAllocationHoistingBarrier(Operation *op) const { return true; }
-
-  LogicalResult bufferize(Operation *op, RewriterBase &rewriter,
-                          const BufferizationState &state) const {
-    auto tiledLoopOp = cast<linalg::TiledLoopOp>(op);
-
-    // Compute new inputs, outputs and results.
-    SmallVector<Value> newInputs, newOutputs, newResults;
-    for (unsigned i = tiledLoopOp.getNumControlOperands();
-         i < tiledLoopOp->getNumOperands(); ++i) {
-      OpOperand &operand = tiledLoopOp->getOpOperand(i);
-      Value rewrittenValue = operand.get();
-      if (rewrittenValue.getType().isa<TensorType>()) {
-        FailureOr<Value> bufferOrFailure = state.getBuffer(rewriter, operand);
-        if (failed(bufferOrFailure))
-          return failure();
-        rewrittenValue = *bufferOrFailure;
-      }
-      if (i <
-          tiledLoopOp.getNumControlOperands() + tiledLoopOp.getNumInputs()) {
-        newInputs.push_back(rewrittenValue);
-      } else {
-        newOutputs.push_back(rewrittenValue);
-        if (operand.get().getType().isa<TensorType>())
-          newResults.push_back(rewrittenValue);
-      }
-    }
-
-    // Create new TiledLoopOp.
-    auto newTiledLoopOp = rewriter.create<TiledLoopOp>(
-        tiledLoopOp.getLoc(), tiledLoopOp.lowerBound(),
-        tiledLoopOp.upperBound(), tiledLoopOp.step(), newInputs, newOutputs,
-        tiledLoopOp.iterator_types(), tiledLoopOp.distribution_types());
-
-    // Remove terminator.
-    if (!newTiledLoopOp.getBody()->empty())
-      rewriter.eraseOp(tiledLoopOp.getBody()->getTerminator());
-
-    // Compute new loop body arguments.
-    SmallVector<Value> newBlockArgs, newRegionInOutArgs, oldRegionInOutArgs;
-    ValueRange newInductionVars = newTiledLoopOp.getInductionVars();
-    newBlockArgs.append(newInductionVars.begin(), newInductionVars.end());
-
-    ValueRange newRegionInArgs = newTiledLoopOp.getRegionInputArgs();
-    ValueRange newRegionOutArgs = newTiledLoopOp.getRegionOutputArgs();
-    newRegionInOutArgs.append(newRegionInArgs.begin(), newRegionInArgs.end());
-    newRegionInOutArgs.append(newRegionOutArgs.begin(), newRegionOutArgs.end());
-
-    ValueRange oldRegionInArgs = tiledLoopOp.getRegionInputArgs();
-    ValueRange oldRegionOutArgs = tiledLoopOp.getRegionOutputArgs();
-    oldRegionInOutArgs.append(oldRegionInArgs.begin(), oldRegionInArgs.end());
-    oldRegionInOutArgs.append(oldRegionOutArgs.begin(), oldRegionOutArgs.end());
-    assert(newRegionInArgs.size() == oldRegionInArgs.size() &&
-           "expected same number of input args");
-    assert(newRegionOutArgs.size() == oldRegionOutArgs.size() &&
-           "expected same number of output args");
-
-    for (auto it : llvm::zip(oldRegionInOutArgs, newRegionInOutArgs)) {
-      Value oldArg = std::get<0>(it);
-      Value newArg = std::get<1>(it);
-      rewriter.setInsertionPointToStart(newTiledLoopOp.getBody());
-      if (oldArg.getType().isa<TensorType>()) {
-        newBlockArgs.push_back(rewriter.create<bufferization::ToTensorOp>(
-            oldArg.getLoc(), newArg));
-      } else {
-        newBlockArgs.push_back(newArg);
-      }
-    }
-
-    // Move old body into new loop.
-    rewriter.mergeBlocks(tiledLoopOp.getBody(), newTiledLoopOp.getBody(),
-                         newBlockArgs);
-
-    // Replace previous terminator with a new one that does not yield anything.
-    auto oldTerminator =
-        cast<linalg::YieldOp>(newTiledLoopOp.getBody()->getTerminator());
-    rewriter.setInsertionPointToEnd(newTiledLoopOp.getBody());
-    auto newTerminator =
-        rewriter.create<linalg::YieldOp>(oldTerminator->getLoc());
-
-    // Copy buffer of yielded tensor to output buffer. If everything bufferized
-    // inplace, this copy will fold away.
-    rewriter.setInsertionPoint(newTerminator);
-    for (auto it : llvm::zip(oldTerminator.values(), newOutputs)) {
-      Value output = std::get<1>(it);
-      Value toMemrefOp = rewriter.create<bufferization::ToMemrefOp>(
-          newTerminator.getLoc(), output.getType(), std::get<0>(it));
-      if (failed(createMemCpy(rewriter, newTerminator.getLoc(), toMemrefOp,
-                              output, state.getOptions())))
-        return failure();
-    }
-
-    // Erase old terminator.
-    rewriter.eraseOp(oldTerminator);
-
-    // Replace results and delete old op.
-    replaceOpWithBufferizedValues(rewriter, op, newResults);
-
-    return success();
-  }
-};
-
-/// Bufferization of linalg.yield. Bufferized as part of linalg.tiled_loop's
-/// bufferization.
-struct YieldOpInterface
-    : public BufferizableOpInterface::ExternalModel<YieldOpInterface,
-                                                    linalg::YieldOp> {
-  bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand,
-                              const BufferizationState &state) const {
-    return true;
-  }
-
-  bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand,
-                               const BufferizationState &state) const {
-    return false;
-  }
-
-  SmallVector<OpResult>
-  getAliasingOpResult(Operation *op, OpOperand &opOperand,
-                      const BufferizationState &state) const {
-    return {};
-  }
-
-  bool mustBufferizeInPlace(Operation *op, OpOperand &opOperand,
-                            const BufferizationState &state) const {
-    // Yield operands always bufferize inplace. Otherwise, an alloc + copy
-    // may be generated inside the block. We should not return/yield allocations
-    // when possible.
-    return true;
-  }
-
-  LogicalResult bufferize(Operation *op, RewriterBase &rewriter,
-                          const BufferizationState &state) const {
-    auto yieldOp = cast<linalg::YieldOp>(op);
-
-    if (!yieldOp->getParentOfType<TiledLoopOp>())
-      return yieldOp->emitError(
-          "expected that linalg.yield terminates a tiled_loop");
-
-    assert(yieldOp->getOpOperands().empty() &&
-           "expected that linalg.yield was bufferized together with"
-           " tiled_loop");
-    return success();
-  }
-};
-
 /// Helper structure that iterates over all LinalgOps in `OpTys` and registers
 /// the `BufferizableOpInterface` with each of them.
 template <typename... OpTys>
@@ -701,8 +504,6 @@ LogicalResult mlir::linalg::insertSliceAnchoredInitTensorEliminationStep(
 void mlir::linalg::registerBufferizableOpInterfaceExternalModels(
     DialectRegistry &registry) {
   registry.addOpInterface<linalg::InitTensorOp, InitTensorOpInterface>();
-  registry.addOpInterface<linalg::TiledLoopOp, TiledLoopOpInterface>();
-  registry.addOpInterface<linalg::YieldOp, YieldOpInterface>();
 
   // Register all Linalg structured ops. `LinalgOp` is an interface and it is
   // not possible to attach an external interface to an existing interface.

diff  --git a/mlir/lib/Dialect/Linalg/Transforms/CMakeLists.txt b/mlir/lib/Dialect/Linalg/Transforms/CMakeLists.txt
index ec8c8c438635e..a688eb59a6f12 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/Linalg/Transforms/CMakeLists.txt
@@ -4,7 +4,6 @@ add_mlir_dialect_library(MLIRLinalgTransforms
   CodegenStrategy.cpp
   ComprehensiveBufferizePass.cpp
   Detensorize.cpp
-  Distribution.cpp
   DropUnitDims.cpp
   ElementwiseOpFusion.cpp
   ElementwiseToLinalg.cpp

diff  --git a/mlir/lib/Dialect/Linalg/Transforms/Distribution.cpp b/mlir/lib/Dialect/Linalg/Transforms/Distribution.cpp
deleted file mode 100644
index 692df291b2f66..0000000000000
--- a/mlir/lib/Dialect/Linalg/Transforms/Distribution.cpp
+++ /dev/null
@@ -1,87 +0,0 @@
-//===- Distribution.cpp - Distribution of linalg.tiled_loop ops ----------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the Linalg distribution pass. It updates `tiled_loop`
-// control variables depending on the distribution type.
-//
-//===----------------------------------------------------------------------===//
-//
-#include <utility>
-
-#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
-#include "mlir/Dialect/Linalg/Utils/Utils.h"
-#include "mlir/IR/MLIRContext.h"
-#include "mlir/Pass/Pass.h"
-#include "mlir/Transforms/DialectConversion.h"
-
-#define DEBUG_TYPE "linalg-distribution"
-
-#define DBGS() (llvm::dbgs() << '[' << DEBUG_TYPE << "] ")
-
-using namespace mlir;
-using namespace mlir::linalg;
-
-namespace {
-
-struct DistributeTiledLoopPattern
-    : public OpRewritePattern<linalg::TiledLoopOp> {
-  DistributeTiledLoopPattern(MLIRContext *context,
-                             LinalgLoopDistributionOptions options,
-                             LinalgTransformationFilter marker)
-      : OpRewritePattern<linalg::TiledLoopOp>(context),
-        options(std::move(options)), marker(std::move(marker)) {}
-  LogicalResult matchAndRewrite(linalg::TiledLoopOp op,
-                                PatternRewriter &rewriter) const override {
-    if (failed(marker.checkAndNotify(rewriter, op)))
-      return failure();
-    if (!op.distribution_types().hasValue())
-      return failure();
-
-    Location loc = op.getLoc();
-    SmallVector<Value, 2> newLowerBounds = op.lowerBound();
-    SmallVector<Value, 2> newUpperBounds = op.upperBound();
-    SmallVector<Value, 2> newSteps = op.step();
-
-    // Update bounds and steps.
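-    // As an illustrative sketch (not in the original source): for a cyclic
-    // distribution, updateBoundsForCyclicDistribution computes
-    //   lb' = lb + procId * step,  step' = nprocs * step,
-    // so each processor visits every nprocs-th tile of the dimension.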
-    auto distributionTypes = op.distribution_types().getValue();
-    for (int i = 0, e = op.getNumLoops(); i < e; ++i) {
-      StringRef type = distributionTypes[i].cast<StringAttr>().getValue();
-      auto procInfoCallback = options.procInfoMap.find(type);
-      if (procInfoCallback == options.procInfoMap.end())
-        continue;
-
-      if (!isParallelIterator(op.iterator_types()[i])) {
-        op.emitOpError("only support for parallel loops is implemented");
-        return failure();
-      }
-      ProcInfo info = procInfoCallback->second(rewriter, loc);
-      updateBoundsForCyclicDistribution(rewriter, loc, info.procId, info.nprocs,
-                                        newLowerBounds[i], newUpperBounds[i],
-                                        newSteps[i]);
-    }
-    rewriter.updateRootInPlace(op, [&] {
-      op.setLowerBounds(newLowerBounds);
-      op.setUpperBounds(newUpperBounds);
-      op.setSteps(newSteps);
-    });
-    marker.replaceLinalgTransformationFilter(rewriter, op);
-    return success();
-  }
-
-private:
-  LinalgLoopDistributionOptions options;
-  LinalgTransformationFilter marker;
-};
-
-} // namespace
-
-void mlir::linalg::populateLinalgDistributeTiledLoopPattern(
-    RewritePatternSet &patterns, const LinalgLoopDistributionOptions &opts,
-    const LinalgTransformationFilter &marker) {
-  patterns.add<DistributeTiledLoopPattern>(patterns.getContext(), opts, marker);
-}

diff  --git a/mlir/lib/Dialect/Linalg/Transforms/Fusion.cpp b/mlir/lib/Dialect/Linalg/Transforms/Fusion.cpp
index 32d8ee098bcec..94edb8b630876 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Fusion.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Fusion.cpp
@@ -104,63 +104,8 @@ getShapeDefiningLoopRange(LinalgOp op, unsigned loopDepth,
   llvm_unreachable("Expect to be able to extract a shape defining loop range");
 }
 
-// Return tiled operands for the fused producer op. When fusing into
-// `linalg.tiled_loop` one has to update `input` and `output` arguments of the
-// loop correspondingly.
-// Each input tensor of the producer op has to be added to `inputs` of the
-// `tiled_loop` if it is not present there already. Each output tensor has to
-// be added either to `inputs` or to `outputs` of `linalg.tiled_loop` depending
-// on whether the corresponding result is an input or an output to the loop.
-//
-// NOTE: This way of updating the arguments of the `tiled_loop` assumes that
-// the intermediate result is not used by any operation other than the
-// consumer. A more generic approach would append all missing output tensors of
-// the producer to the tiled loop outputs and hence change the number of
-// results, since the intermediate results would have to be added to
-// `linalg.yield`. After that, a canonicalization pass would move the unused
-// output args of the `tiled_loop` to the `input` section.
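-//
-// Illustrative example (names hypothetical, not from the original tests):
-// fusing a producer that reads an extra tensor %P into
-//   linalg.tiled_loop ... ins (%a_ = %A : tensor<48xf32>) outs (...)
-// appends the missing operand, giving
-//   linalg.tiled_loop ... ins (%a_ = %A : tensor<48xf32>,
-//                              %p_ = %P : tensor<48xf32>) outs (...)
-// and the cloned producer inside the loop then uses the block argument %p_.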
-static SmallVector<Value> getTiledOperands(OpBuilder &b, LinalgOp producer) {
-  auto tiledLoop = dyn_cast<TiledLoopOp>(b.getBlock()->getParentOp());
-  if (!tiledLoop)
-    return producer.getInputAndOutputOperands();
-
-  SmallVector<Value> tiledOperands;
-  assert(producer.hasTensorSemantics() &&
-         "only fusion on tensors is currently supported for TiledLinalgOp");
-
-  for (OpOperand *producerInput : producer.getInputOperands()) {
-    OpOperand *addedInput = tiledLoop.findInputOperand(producerInput->get());
-    if (addedInput == nullptr)
-      addedInput = &tiledLoop.appendInputOperand(b, producerInput->get());
-    BlockArgument addedBlockArg = tiledLoop.getTiedBlockArgument(*addedInput);
-    tiledOperands.push_back(addedBlockArg);
-  }
-  for (OpOperand *producerOutput : producer.getOutputOperands()) {
-    OpResult result = producer.getTiedOpResult(producerOutput);
-    OpOperand *resultInputOperand = tiledLoop.findInputOperand(result);
-    OpOperand *resultOutputOperand = tiledLoop.findOutputOperand(result);
-    assert((resultInputOperand != nullptr) ^ (resultOutputOperand != nullptr) &&
-           "The result should be present in `input` or `output` args of "
-           "`tiled_loop");
-
-    bool isInput = resultInputOperand;
-    int opNumber = isInput ? resultInputOperand->getOperandNumber()
-                           : resultOutputOperand->getOperandNumber();
-
-    OpOperand *addedOutput = tiledLoop.findOutputOperand(producerOutput->get());
-    if (addedOutput == nullptr)
-      addedOutput =
-          isInput ? &tiledLoop.appendInputOperand(b, producerOutput->get())
-                  : &tiledLoop.appendOutputOperand(b, producerOutput->get());
-
-    OpOperand &resultOperand = tiledLoop->getOpOperand(opNumber);
-    auto addedBlockArg = tiledLoop.getTiedBlockArgument(*addedOutput);
-    auto resultOperandBlockArg = tiledLoop.getTiedBlockArgument(resultOperand);
-    resultOperandBlockArg.replaceAllUsesWith(addedBlockArg);
-    tiledLoop.eraseOperand(b, resultOperand);
-    tiledOperands.push_back(addedBlockArg);
-  }
-  return tiledOperands;
+static SmallVector<Value> getTiledOperands(LinalgOp producer) {
+  return producer.getInputAndOutputOperands();
 }
 
 /// Fuses the producer by cloning the `producer`. The `fusedLoopsAndRanges`
@@ -198,7 +143,7 @@ static LinalgOp fuse(OpBuilder &b, LinalgOp producer,
 
   // Compute subranges for all tensor input/output operands.
   clonedShapes.append(makeTiledShapes(b, loc, producer,
-                                      getTiledOperands(b, producer), ivs,
+                                      getTiledOperands(producer), ivs,
                                       tileSizes, sizeBounds));
 
   // Iterate over the results in order.

diff  --git a/mlir/lib/Dialect/Linalg/Transforms/Loops.cpp b/mlir/lib/Dialect/Linalg/Transforms/Loops.cpp
index c9e3c6c955703..5a5554992341f 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Loops.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Loops.cpp
@@ -260,72 +260,6 @@ class LinalgRewritePattern : public RewritePattern {
   }
 };
 
-/// Converts tiled_loop to SCF loop nests. All parallel dimensions are
-/// collected into a single scf.parallel loop, and all sequential dimensions
-/// result in a nested scf.for loop nest. The pattern assumes that a tiled loop
-/// with iterator_types ["reduction", "parallel", "reduction"] can be
-/// reordered. This holds for the tiling currently supported by Linalg.
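-///
-/// Illustrative sketch (hypothetical bounds) of the generated structure for
-/// iterator_types ["parallel", "reduction"]:
-///   scf.parallel (%i) = (%lb0) to (%ub0) step (%st0) {
-///     scf.for %j = %lb1 to %ub1 step %st1 {
-///       // cloned tiled_loop body, with %i and %j mapped to the old ivs
-///     }
-///   }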
-struct TiledLoopToSCFPattern : public OpRewritePattern<TiledLoopOp> {
-  using OpRewritePattern<TiledLoopOp>::OpRewritePattern;
-
-  LogicalResult matchAndRewrite(TiledLoopOp tiledLoop,
-                                PatternRewriter &rewriter) const override {
-    // Fail conversion if the `tiled_loop` has not been bufferized.
-    if (!tiledLoop.hasBufferSemantics())
-      return failure();
-
-    // Collect loop control parameters for parallel and sequential dimensions.
-    SmallVector<Value, 3> seqLBs, seqUBs, seqSteps, seqIVs;
-    SmallVector<Value, 3> parLBs, parUBs, parSteps, parIVs;
-    for (const auto &en : llvm::enumerate(
-             llvm::zip(tiledLoop.lowerBound(), tiledLoop.upperBound(),
-                       tiledLoop.step(), tiledLoop.getInductionVars()))) {
-      Value lb, ub, step, iv;
-      std::tie(lb, ub, step, iv) = en.value();
-      if (tiledLoop.isParallelDimension(en.index())) {
-        parLBs.push_back(lb);
-        parUBs.push_back(ub);
-        parSteps.push_back(step);
-        parIVs.push_back(iv);
-      } else {
-        seqLBs.push_back(lb);
-        seqUBs.push_back(ub);
-        seqSteps.push_back(step);
-        seqIVs.push_back(iv);
-      }
-    }
-
-    Location loc = tiledLoop.getLoc();
-    auto generateForLoopNestAndCloneBody = [&](OpBuilder &builder, Location loc,
-                                               ValueRange ivs) {
-      BlockAndValueMapping bvm;
-      bvm.map(parIVs, ivs);
-      bvm.map(tiledLoop.getRegionInputArgs(), tiledLoop.inputs());
-      bvm.map(tiledLoop.getRegionOutputArgs(), tiledLoop.outputs());
-
-      // If not all dimensions of the tiled loop are parallel, an scf.for loop
-      // nest is generated.
-      if (!seqIVs.empty()) {
-        scf::LoopNest nest =
-            scf::buildLoopNest(builder, loc, seqLBs, seqUBs, seqSteps,
-                               [&](OpBuilder &builder, Location loc,
-                                   ValueRange ivs) { bvm.map(seqIVs, ivs); });
-        builder.setInsertionPointToStart(nest.loops.back().getBody());
-      }
-      for (auto &op : tiledLoop.getBody()->without_terminator())
-        builder.clone(op, bvm);
-    };
-
-    if (parIVs.empty())
-      generateForLoopNestAndCloneBody(rewriter, loc, llvm::None);
-    else
-      rewriter.create<scf::ParallelOp>(loc, parLBs, parUBs, parSteps,
-                                       generateForLoopNestAndCloneBody);
-    rewriter.eraseOp(tiledLoop);
-    return success();
-  }
-};
-
 /// Local folding pattern for AffineApplyOp that we can apply greedily.
 /// This replaces AffineApplyOp by the proper value in cases where the
 /// associated map is trivial.
@@ -402,136 +336,8 @@ struct LowerToParallelLoops
   }
 };
 
-struct LowerTiledLoopsToSCF
-    : public LinalgLowerTiledLoopsToSCFBase<LowerTiledLoopsToSCF> {
-  void runOnOperation() override {
-    MLIRContext *context = &getContext();
-    RewritePatternSet patterns(context);
-    populateTiledLoopToSCFPattern(patterns);
-    (void)applyPatternsAndFoldGreedily(getOperation(), std::move(patterns));
-  }
-};
 } // namespace
 
-/// Rewrite a TiledLoopOp whose bounds/step potentially do not divide evenly
-/// into two TiledLoopOps: one where the step divides the iteration space
-/// evenly, followed by another one for the last (partial) iteration (if any).
-/// This
-/// function only rewrites the `idx`-th loop of the loop nest represented by
-/// the TiledLoopOp. To peel the entire loop nest, this function must be called
-/// multiple times.
-///
-/// This function rewrites the given TiledLoopOp in-place and creates a new
-/// TiledLoopOp for the last iteration. It replaces all uses of the original
-/// TiledLoopOp with the results of the newly generated one.
-///
-/// The newly generated TiledLoopOp is returned via `result`. The boundary
-/// at which the loop is split (new upper bound) is returned via `splitBound`.
-/// The return value indicates whether the TiledLoopOp was rewritten or not.
-static LogicalResult peelTiledLoop(RewriterBase &b, TiledLoopOp loopOp,
-                                   int64_t idx, TiledLoopOp &result,
-                                   Value &splitBound) {
-  Value lb = loopOp.lowerBound()[idx], ub = loopOp.upperBound()[idx],
-        step = loopOp.step()[idx];
-  auto ubInt = getConstantIntValue(ub);
-
-  auto loc = loopOp.getLoc();
-  AffineExpr exprLb, exprUb, exprStep;
-  bindSymbols(b.getContext(), exprLb, exprUb, exprStep);
-  // New upper bound: %ub - (%ub - %lb) mod %step
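-  // E.g. (illustrative numbers): %lb = 0, %ub = 100, %step = 24 yields a
-  // split bound of 100 - ((100 - 0) mod 24) = 96, so the main loop covers
-  // [0, 96) and the remainder loop covers [96, 100).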
-  auto modMap = AffineMap::get(0, 3, {exprUb - ((exprUb - exprLb) % exprStep)});
-  SmallVector<Value> operands{lb, ub, step};
-  mlir::canonicalizeMapAndOperands(&modMap, &operands);
-  modMap = mlir::simplifyAffineMap(modMap);
-  RewriterBase::InsertionGuard guard(b);
-  b.setInsertionPoint(loopOp);
-  splitBound = b.createOrFold<AffineApplyOp>(loc, modMap, operands);
-  // No specialization necessary if step already divides upper bound evenly.
-  if (splitBound == ub || (ubInt && ubInt == getConstantIntValue(splitBound)))
-    return failure();
-
-  // Create remainder loop.
-  b.setInsertionPointAfter(loopOp);
-  auto remainderLoop = cast<TiledLoopOp>(b.clone(*loopOp.getOperation()));
-  loopOp.replaceAllUsesWith(remainderLoop->getResults());
-  // Outputs: Take tensors from main loop's results. Take memrefs from main
-  // loop's outputs.
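-  // E.g. (illustrative): with outputs (%t = %T : tensor<...>, %m = %M :
-  // memref<...>), the remainder loop takes the main loop's tensor result for
-  // %t but reuses %M directly, since memref outputs yield no SSA results.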
-  SmallVector<Value> remainderOutputs;
-  for (unsigned o = 0, t = 0; o < loopOp.getNumOutputs(); ++o) {
-    remainderOutputs.push_back(loopOp.outputs()[o].getType().isa<MemRefType>()
-                                   ? loopOp.outputs()[o]
-                                   : loopOp->getResult(t++));
-  }
-  remainderLoop.outputsMutable().assign(remainderOutputs);
-
-  // Set new loop bounds.
-  b.updateRootInPlace(loopOp, [&]() {
-    SmallVector<Value> ubs = loopOp.upperBound();
-    ubs[idx] = splitBound;
-    loopOp.upperBoundMutable().assign(ubs);
-  });
-  SmallVector<Value> lbs = remainderLoop.lowerBound();
-  lbs[idx] = splitBound;
-  remainderLoop.lowerBoundMutable().assign(lbs);
-
-  result = remainderLoop;
-  return success();
-}
-
-template <typename OpTy, bool IsMin>
-static void
-rewriteAffineOpAfterPeeling(RewriterBase &rewriter, TiledLoopOp mainLoop,
-                            TiledLoopOp remainderLoop, Value mainIv,
-                            Value remainderIv, Value ub, Value step) {
-  mainLoop.walk([&](OpTy affineOp) {
-    AffineMap map = affineOp.getAffineMap();
-    (void)scf::rewritePeeledMinMaxOp(rewriter, affineOp, map,
-                                     affineOp.operands(), IsMin, mainIv, ub,
-                                     step, /*insideLoop=*/true);
-  });
-  remainderLoop.walk([&](OpTy affineOp) {
-    AffineMap map = affineOp.getAffineMap();
-    (void)scf::rewritePeeledMinMaxOp(rewriter, affineOp, map,
-                                     affineOp.operands(), IsMin, remainderIv,
-                                     ub, step, /*insideLoop=*/false);
-  });
-}
-
-LogicalResult mlir::linalg::peelAndCanonicalizeTiledLoop(RewriterBase &rewriter,
-                                                         TiledLoopOp loopOp,
-                                                         int64_t idx,
-                                                         TiledLoopOp &result) {
-  int64_t numLoops = loopOp.iterator_types().size();
-  if (idx < 0 || numLoops <= idx)
-    return failure();
-
-  Value ub = loopOp.upperBound()[idx];
-  TiledLoopOp remainderLoop;
-  Value splitBound;
-  if (failed(peelTiledLoop(rewriter, loopOp, idx, remainderLoop, splitBound)))
-    return failure();
-
-  // Rewrite affine.min and affine.max ops.
-  Value mainIv = loopOp.getInductionVars()[idx], step = loopOp.step()[idx],
-        remainderIv = remainderLoop.getInductionVars()[idx];
-
-  rewriteAffineOpAfterPeeling<AffineMinOp, /*IsMin=*/true>(
-      rewriter, loopOp, remainderLoop, mainIv, remainderIv, ub, step);
-  rewriteAffineOpAfterPeeling<AffineMaxOp, /*IsMin=*/false>(
-      rewriter, loopOp, remainderLoop, mainIv, remainderIv, ub, step);
-
-  result = remainderLoop;
-  return success();
-}
-
-void mlir::linalg::populateTiledLoopToSCFPattern(RewritePatternSet &patterns) {
-  patterns.add<TiledLoopToSCFPattern>(patterns.getContext());
-}
-
-std::unique_ptr<OperationPass<FuncOp>>
-mlir::createConvertLinalgTiledLoopsToSCFPass() {
-  return std::make_unique<LowerTiledLoopsToSCF>();
-}
-
 std::unique_ptr<OperationPass<FuncOp>> mlir::createConvertLinalgToLoopsPass() {
   return std::make_unique<LowerToLoops>();
 }

diff  --git a/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp b/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp
index 0271857383746..2e1418c529a25 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp
@@ -271,8 +271,6 @@ mlir::linalg::tileLinalgOp(RewriterBase &b, LinalgOp op,
     return tileLinalgOpImpl<scf::ForOp>(b, op, options);
   case LinalgTilingLoopType::ParallelLoops:
     return tileLinalgOpImpl<scf::ParallelOp>(b, op, options);
-  case LinalgTilingLoopType::TiledLoops:
-    return tileLinalgOpImpl<linalg::TiledLoopOp>(b, op, options);
   default:;
   }
   return failure();
@@ -453,13 +451,10 @@ static void applyExtractSliceOfPadTensorSwapPattern(FuncOp funcOp) {
 namespace {
 struct LinalgTilingPass : public LinalgTilingBase<LinalgTilingPass> {
   LinalgTilingPass() = default;
-  LinalgTilingPass(ArrayRef<int64_t> tileSizes, LinalgTilingLoopType loopType,
-                   ArrayRef<StringRef> distributionTypes) {
+  LinalgTilingPass(ArrayRef<int64_t> tileSizes, LinalgTilingLoopType loopType) {
     this->tileSizes = tileSizes;
     this->loopType = "";
     this->loopTypeEnum = loopType;
-    this->distributionTypes = llvm::to_vector<2>(llvm::map_range(
-        distributionTypes, [](StringRef ref) { return ref.str(); }));
   }
 
   void runOnOperation() override {
@@ -469,14 +464,9 @@ struct LinalgTilingPass : public LinalgTilingBase<LinalgTilingPass> {
             .Case("for", LinalgTilingLoopType::Loops)
             .Case("affine", LinalgTilingLoopType::AffineLoops)
             .Case("parallel", LinalgTilingLoopType::ParallelLoops)
-            .Case("tiled_loop", LinalgTilingLoopType::TiledLoops)
             .Default(loopTypeEnum);
-    auto distTypes = llvm::to_vector<2>(llvm::map_range(
-        distributionTypes, [](std::string &str) { return StringRef(str); }));
-    auto options = LinalgTilingOptions()
-                       .setTileSizes(tileSizes)
-                       .setLoopType(type)
-                       .setDistributionTypes(distTypes);
+    auto options =
+        LinalgTilingOptions().setTileSizes(tileSizes).setLoopType(type);
     MLIRContext *ctx = funcOp.getContext();
     RewritePatternSet patterns(ctx);
     insertTilingPatterns(patterns, options);
@@ -501,8 +491,6 @@ struct LinalgTilingPass : public LinalgTilingBase<LinalgTilingPass> {
 
 std::unique_ptr<OperationPass<FuncOp>>
 mlir::createLinalgTilingPass(ArrayRef<int64_t> tileSizes,
-                             linalg::LinalgTilingLoopType loopType,
-                             ArrayRef<StringRef> distributionTypes) {
-  return std::make_unique<LinalgTilingPass>(tileSizes, loopType,
-                                            distributionTypes);
+                             linalg::LinalgTilingLoopType loopType) {
+  return std::make_unique<LinalgTilingPass>(tileSizes, loopType);
 }

diff  --git a/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp b/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp
index 9f177f0a1b92b..f6a5304e1cff9 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp
@@ -299,18 +299,6 @@ static SmallVector<Value, 4> peelLoop(RewriterBase &rewriter, Operation *op) {
       .Default([&](Operation *op) { return op->getResults(); });
 }
 
-/// Try to peel a TiledLoopOp and return the new result.
-static SmallVector<Value, 4> peelLoop(RewriterBase &rewriter,
-                                      TiledLoopOp tiledLoop, int64_t idx) {
-  assert(idx < static_cast<int64_t>(tiledLoop.iterator_types().size()) &&
-         "requested peeling of non-existing loop");
-  TiledLoopOp result;
-  if (succeeded(peelAndCanonicalizeTiledLoop(rewriter, tiledLoop, idx, result)))
-    return result->getResults();
-  assert(!result && "expected that loop was not peeled");
-  return tiledLoop->getResults();
-}
-
 /// Peel loops after tiling.
 void mlir::linalg::peelTiledLinalgOp(RewriterBase &rewriter, TiledLinalgOp &res,
                                      ArrayRef<int64_t> peeledLoops,
@@ -320,17 +308,7 @@ void mlir::linalg::peelTiledLinalgOp(RewriterBase &rewriter, TiledLinalgOp &res,
            "requested peeling of non-existing loop");
     SmallVector<Value, 4> loopResults;
     Operation *loopOp = res.loops[loop];
-    if (loopType == LinalgTilingLoopType::TiledLoops) {
-      assert(llvm::all_of(
-                 res.loops,
-                 [&](Operation *op) { return op == res.loops.front(); }) &&
-             "expected that all loop ops are the same TiledLoopOp");
-      auto tiledLoopOp = dyn_cast<TiledLoopOp>(loopOp);
-      assert(tiledLoopOp && "expected TiledLoopOp");
-      loopResults = peelLoop(rewriter, tiledLoopOp, loop);
-    } else {
-      loopResults = peelLoop(rewriter, loopOp);
-    }
+    loopResults = peelLoop(rewriter, loopOp);
 
     // The result of the loop nest may change with peeling.
     if (res.tensorResults.size() == loopOp->getNumResults() &&

diff  --git a/mlir/lib/Dialect/Linalg/Utils/Utils.cpp b/mlir/lib/Dialect/Linalg/Utils/Utils.cpp
index 98a62a0f3cd6f..3dfb336a9bf17 100644
--- a/mlir/lib/Dialect/Linalg/Utils/Utils.cpp
+++ b/mlir/lib/Dialect/Linalg/Utils/Utils.cpp
@@ -125,7 +125,6 @@ RegionMatcher::matchAsScalarBinaryOp(GenericOp op) {
 template struct mlir::linalg::GenerateLoopNest<scf::ForOp>;
 template struct mlir::linalg::GenerateLoopNest<scf::ParallelOp>;
 template struct mlir::linalg::GenerateLoopNest<AffineForOp>;
-template struct mlir::linalg::GenerateLoopNest<TiledLoopOp>;
 
 /// Given a list of subview ranges, extract individual values for lower, upper
 /// bounds and steps and put them into the corresponding vectors.
@@ -537,39 +536,6 @@ void GenerateLoopNest<AffineForOp>::doit(
                             });
 }
 
-/// Specialization to build a linalg.tiled_loop.
-template <>
-void GenerateLoopNest<TiledLoopOp>::doit(
-    OpBuilder &b, Location loc, ArrayRef<Range> loopRanges, LinalgOp linalgOp,
-    ArrayRef<Attribute> iteratorTypes,
-    function_ref<scf::ValueVector(OpBuilder &, Location, ValueRange,
-                                  ValueRange)>
-        bodyBuilderFn,
-    Optional<LinalgLoopDistributionOptions> distributionOptions,
-    ArrayRef<StringRef> distributionTypes) {
-  SmallVector<ProcInfo, 2> procInfo;
-  SmallVector<Value, 4> lbs, ubs, steps;
-  unpackRanges(loopRanges, lbs, ubs, steps);
-
-  auto wrappedBuilderFn = [&](OpBuilder &nestedBuilder, Location nestedLoc,
-                              ValueRange ivs, ValueRange inputs,
-                              ValueRange outputs) {
-    SmallVector<Value> operandValuesToUse = inputs;
-    operandValuesToUse.append(outputs.begin(), outputs.end());
-    scf::ValueVector results =
-        bodyBuilderFn(nestedBuilder, nestedLoc, ivs, operandValuesToUse);
-    nestedBuilder.create<linalg::YieldOp>(nestedLoc, results);
-  };
-
-  SmallVector<Value> inputOperands = linalgOp.getInputOperands();
-  SmallVector<Value> outputOperands = linalgOp.getOutputOperands();
-  auto tiledLoop =
-      b.create<TiledLoopOp>(loc, lbs, ubs, steps, inputOperands, outputOperands,
-                            b.getArrayAttr(iteratorTypes), wrappedBuilderFn);
-  if (!distributionTypes.empty())
-    tiledLoop.setDistributionTypes(b, distributionTypes);
-}
-
 /// Update the `lb`, `ub` and `step` to get per processor `lb`, `ub` and `step`.
 void updateBoundsForCyclicDistribution(OpBuilder &b, Location loc, Value procId,
                                        Value nprocs, Value &lb, Value &ub,

diff  --git a/mlir/test/Dialect/Linalg/canonicalize.mlir b/mlir/test/Dialect/Linalg/canonicalize.mlir
index c3405887431ff..e3f213f8cd6ef 100644
--- a/mlir/test/Dialect/Linalg/canonicalize.mlir
+++ b/mlir/test/Dialect/Linalg/canonicalize.mlir
@@ -18,31 +18,6 @@ func @memref_cast(%a: index, %b: index) -> memref<?x?xf32> {
 
 // -----
 
-#map = affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)>
-
-// CHECK-LABEL: func @memref_cast_into_tiled_loop(
-func @memref_cast_into_tiled_loop(%arg0: memref<192xf32>)  {
-  %0 = memref.cast %arg0
-    : memref<192xf32> to memref<192xf32, #map>
-  %cst = arith.constant 0.000000e+00 : f32
-  %c24 = arith.constant 24 : index
-  %c0 = arith.constant 0 : index
-  %c192 = arith.constant 192 : index
-  // CHECK: linalg.tiled_loop
-  // CHECK-SAME: outs (%{{.*}} = %{{.*}}: memref<192xf32>)
-  linalg.tiled_loop (%arg3) = (%c0) to (%c192) step (%c24)
-    outs (%out = %0: memref<192xf32, #map>) {
-    %14 = affine.min affine_map<(d0) -> (-d0 + 192, 24)>(%arg3)
-    %16 = memref.subview %out[%arg3] [%14] [1]
-      : memref<192xf32, #map> to memref<?xf32, #map>
-    linalg.fill(%cst, %16) : f32, memref<?xf32, #map>
-    linalg.yield
-  }
-  return
-}
-
-// -----
-
 #accesses = [
   affine_map<(i) -> (i)>
 ]
@@ -368,70 +343,6 @@ func @fold_fill_reshape_dynamic(%arg0 : tensor<?x?x?x?x?xf32>) -> tensor<?x?xf32
 }
 
 
-// -----
-
-func private @foo(%A: memref<48xf32>, %B: tensor<48xf32>,
-                  %C: memref<48xf32>) -> (tensor<48xf32>)
-
-func @fold_tiled_loop_results(%A: memref<48xf32>, %B: tensor<48xf32>,
-    %C: memref<48xf32>, %C_tensor: tensor<48xf32>) -> tensor<48xf32> {
-  %c0 = arith.constant 0 : index
-  %c24 = arith.constant 24 : index
-  %c48 = arith.constant 48 : index
-  %useful, %useless = linalg.tiled_loop (%i) = (%c0) to (%c48) step (%c24)
-      ins (%A_ = %A: memref<48xf32>)
-      outs (%B_ = %B: tensor<48xf32>,
-            %CT_ = %C_tensor: tensor<48xf32>,
-            %C_ = %C: memref<48xf32>) {
-        %result = call @foo(%A_, %B_, %C_)
-          : (memref<48xf32>, tensor<48xf32>, memref<48xf32>)-> (tensor<48xf32>)
-    linalg.yield %result, %CT_ : tensor<48xf32>, tensor<48xf32>
-  }
-  return %useful : tensor<48xf32>
-}
-
-// CHECK-LABEL: func @fold_tiled_loop_results(
-// CHECK-SAME:   %[[A:.*]]: [[BUF_TY:memref<48xf32>]], %[[B:.*]]: [[TY:tensor<48xf32>]],
-// CHECK-SAME:   %[[C:.*]]: [[BUF_TY]],  %[[C_TENSOR:.*]]: [[TY]]) -> [[TY]] {
-
-// CHECK-DAG:  %[[C0:.*]] = arith.constant 0 : index
-// CHECK-DAG:  %[[C24:.*]] = arith.constant 24 : index
-// CHECK-DAG:  %[[C48:.*]] = arith.constant 48 : index
-
-// CHECK-NOT: %{{.*}} = linalg.tiled_loop
-// CHECK:  %[[RESULT:.*]] = linalg.tiled_loop (%{{.*}}) = (%[[C0]])
-// CHECK-SAME: to (%[[C48]]) step (%[[C24]])
-// CHECK-SAME: ins (%[[A_:.*]] = %[[A]]: [[BUF_TY]])
-// CHECK-SAME: outs (%[[B_:.*]] = %[[B]]: [[TY]], %[[C_:.*]] = %[[C]]: [[BUF_TY]]) {
-// CHECK-NEXT:   %[[RES:.*]] = call @foo(%[[A_]], %[[B_]], %[[C_]])
-// CHECK-NEXT:   linalg.yield %[[RES]] :
-
-// CHECK: return %[[RESULT]]
-
-// -----
-
-func private @foo(%A: memref<192xf32>, %B: tensor<192xf32>) -> tensor<192xf32>
-
-func @fold_tiled_loop_inputs(%A: memref<192xf32>, %A_tensor: tensor<192xf32>,
-                             %B_tensor: tensor<192xf32>) -> tensor<192xf32> {
-  %c0 = arith.constant 0 : index
-  %c24 = arith.constant 24 : index
-  %c192 = arith.constant 192 : index
-  %result = linalg.tiled_loop (%i) = (%c0) to (%c192) step (%c24)
-      ins (%A_ = %A: memref<192xf32>, %AT_ = %A_tensor: tensor<192xf32>)
-      outs (%BT_ = %B_tensor: tensor<192xf32>) {
-    %0 = call @foo(%A_, %BT_) : (memref<192xf32>, tensor<192xf32>) -> tensor<192xf32>
-    linalg.yield %0 : tensor<192xf32>
-  }
-  return %result : tensor<192xf32>
-}
-
-// CHECK-LABEL: func @fold_tiled_loop_inputs
-// CHECK: %[[RESULT:.*]] = linalg.tiled_loop
-// CHECK-SAME: ins (%{{.*}} = %{{.*}}: memref<192xf32>)
-
-// CHECK: return %[[RESULT]]
-
 // -----
 
 func private @some_use(%i : index, %j : index)
@@ -470,108 +381,6 @@ func @rank_reducing_init_extract(%sz : index, %idx : index) -> tensor<2xf32> {
 
 // -----
 
-// CHECK-LABEL: func @dim_of_tiled_loop_input_no_canonicalize(
-//  CHECK-SAME:     %[[arg0:.*]]: tensor<?x?xf32>, %[[arg1:.*]]: tensor<?x?xf32>, %[[arg2:.*]]: tensor<?x?xf32>
-//       CHECK:   %[[c0:.*]] = arith.constant 0 : index
-//       CHECK:   linalg.tiled_loop {{.*}} outs (%[[o:.*]] =
-//       CHECK:     %[[dim:.*]] = tensor.dim %[[o]], %[[c0]]
-//       CHECK:     arith.index_cast %[[dim]]
-func @dim_of_tiled_loop_input_no_canonicalize(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>, %arg2: tensor<?x?xf32>, %s: index)
-    -> tensor<?x?xf32> {
-  %c0 = arith.constant 0 : index
-  %c1 = arith.constant 1 : index
-  %d0 = tensor.dim %arg0, %c0 : tensor<?x?xf32>
-  %d1 = tensor.dim %arg0, %c1 : tensor<?x?xf32>
-  %r = linalg.tiled_loop (%iv0, %iv1) = (%c0, %c0)
-      to (%d0, %d1) step (%c1, %c1)
-      ins (%in0 = %arg0 : tensor<?x?xf32>, %in1 = %arg1 : tensor<?x?xf32>)
-      outs (%out1 = %arg2 : tensor<?x?xf32>) {
-    %inner_dim = tensor.dim %out1, %c0 : tensor<?x?xf32>
-    %cast1 = arith.index_cast %inner_dim : index to i32
-    %cast2 = arith.sitofp %cast1 : i32 to f32
-    %fill = linalg.fill(%cast2, %out1) : f32, tensor<?x?xf32> -> tensor<?x?xf32>
-    %slice = tensor.extract_slice %fill[0, 0][%s, %s][1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
-    linalg.yield %slice : tensor<?x?xf32>
-  }
-  return %r : tensor<?x?xf32>
-}
-
-// -----
-
-// CHECK-LABEL: func @dim_of_tiled_loop_input(
-//  CHECK-SAME:     %[[arg0:.*]]: tensor<?x?xf32>, %[[arg1:.*]]: tensor<?x?xf32>, %[[arg2:.*]]: tensor<?x?xf32>
-//       CHECK:   %[[c0:.*]] = arith.constant 0 : index
-//       CHECK:   linalg.tiled_loop
-//       CHECK:     %[[dim:.*]] = tensor.dim %[[arg1]], %[[c0]]
-//       CHECK:     arith.index_cast %[[dim]]
-func @dim_of_tiled_loop_input(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>, %arg2: tensor<?x?xf32>)
-    -> tensor<?x?xf32> {
-  %c0 = arith.constant 0 : index
-  %c1 = arith.constant 1 : index
-  %d0 = tensor.dim %arg0, %c0 : tensor<?x?xf32>
-  %d1 = tensor.dim %arg0, %c1 : tensor<?x?xf32>
-  %r = linalg.tiled_loop (%iv0, %iv1) = (%c0, %c0)
-      to (%d0, %d1) step (%c1, %c1)
-      ins (%in0 = %arg0 : tensor<?x?xf32>, %in1 = %arg1 : tensor<?x?xf32>)
-      outs (%out1 = %arg2 : tensor<?x?xf32>) {
-    %inner_dim = tensor.dim %in1, %c0 : tensor<?x?xf32>
-    %cast1 = arith.index_cast %inner_dim : index to i32
-    %cast2 = arith.sitofp %cast1 : i32 to f32
-    %fill = linalg.fill(%cast2, %out1) : f32, tensor<?x?xf32> -> tensor<?x?xf32>
-    linalg.yield %fill : tensor<?x?xf32>
-  }
-  return %r : tensor<?x?xf32>
-}
-
-// -----
-
-// CHECK-LABEL: func @dim_of_tiled_loop_result(
-//  CHECK-SAME:     %[[arg0:.*]]: tensor<?x?xf32>, %[[arg1:.*]]: tensor<?x?xf32>, %[[arg2:.*]]: tensor<?x?xf32>
-//       CHECK:   %[[c0:.*]] = arith.constant 0 : index
-//       CHECK:   tensor.dim %[[arg2]], %[[c0]]
-func @dim_of_tiled_loop_result(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>, %arg2: tensor<?x?xf32>, %s: index)
-    -> index {
-  %c0 = arith.constant 0 : index
-  %c1 = arith.constant 1 : index
-  %d0 = tensor.dim %arg0, %c0 : tensor<?x?xf32>
-  %d1 = tensor.dim %arg0, %c1 : tensor<?x?xf32>
-  %r = linalg.tiled_loop (%iv0, %iv1) = (%c0, %c0)
-      to (%d0, %d1) step (%c1, %c1)
-      ins (%in0 = %arg0 : tensor<?x?xf32>, %in1 = %arg1 : tensor<?x?xf32>)
-      outs (%out1 = %arg2 : tensor<?x?xf32>) {
-    %1 = tensor.insert_slice %arg0 into %out1 [0, 0] [%s, %s] [1, 1] : tensor<?x?xf32> into tensor<?x?xf32>
-    linalg.yield %1 : tensor<?x?xf32>
-  }
-  %r2 = tensor.dim %r, %c0 : tensor<?x?xf32>
-  return %r2 : index
-}
-
-// -----
-
-// CHECK-LABEL: func @dim_of_tiled_loop_result_no_canonicalize(
-//  CHECK-SAME:     %[[arg0:.*]]: tensor<?x?xf32>, %[[arg1:.*]]: tensor<?x?xf32>, %[[arg2:.*]]: tensor<?x?xf32>
-//       CHECK:   %[[c0:.*]] = arith.constant 0 : index
-//       CHECK:   %[[r:.*]] = linalg.tiled_loop
-//       CHECK:   tensor.dim %[[r]], %[[c0]]
-func @dim_of_tiled_loop_result_no_canonicalize(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>, %arg2: tensor<?x?xf32>, %s: index)
-    -> index {
-  %c0 = arith.constant 0 : index
-  %c1 = arith.constant 1 : index
-  %d0 = tensor.dim %arg0, %c0 : tensor<?x?xf32>
-  %d1 = tensor.dim %arg0, %c1 : tensor<?x?xf32>
-  %r = linalg.tiled_loop (%iv0, %iv1) = (%c0, %c0)
-      to (%d0, %d1) step (%c1, %c1)
-      ins (%in0 = %arg0 : tensor<?x?xf32>, %in1 = %arg1 : tensor<?x?xf32>)
-      outs (%out1 = %arg2 : tensor<?x?xf32>) {
-    %1 = tensor.insert_slice %arg0 into %arg1 [0, 0] [%s, %s] [1, 1] : tensor<?x?xf32> into tensor<?x?xf32>
-    linalg.yield %1 : tensor<?x?xf32>
-  }
-  %r2 = tensor.dim %r, %c0 : tensor<?x?xf32>
-  return %r2 : index
-}
-
-// -----
-
 // CHECK: func @fold_self_copy
 func @fold_self_copy(%0 : memref<4x16xf32>) {
 // CHECK-NEXT: return

diff  --git a/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-analysis.mlir b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-analysis.mlir
index 248a966a7a624..2d15bffbb0d64 100644
--- a/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-analysis.mlir
+++ b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-analysis.mlir
@@ -639,7 +639,7 @@ func @scf_for_deps(
     %lb : index,
     %ub : index,
     %step : index)
-  -> (tensor<?xf32>, tensor<?xf32>)
+  -> (tensor<?xf32>)
 {
   // %r0 must be out of place because one use of %t in the subsequent production
   // of %r1 is read.
@@ -666,38 +666,9 @@ func @scf_for_deps(
     scf.yield %t : tensor<?xf32>
   }
 
-  // %r2 must be out of place because one use of %t in the subsequent production
-  // of %r3 is read.
-  //      CHECK: linalg.tiled_loop
-  // CHECK-NEXT: call
-  // CHECK-SAME: {__inplace_operands_attr__ = ["false"]}
-  // CHECK-NEXT: linalg.yield
-  // CHECK-SAME: {__inplace_operands_attr__ = ["true"]}
-  //      CHECK: } {__inplace_operands_attr__ = ["none", "none", "none", "false"]}
-  %r2 = linalg.tiled_loop (%i) = (%lb) to (%ub) step (%step)
-        ins()
-        outs(%t = %B: tensor<?xf32>) {
-    call @some_use(%t) : (tensor<?xf32>) -> ()
-    linalg.yield %t : tensor<?xf32>
-  }
-
-  // %r3 bufferizes inplace fine.
-  //      CHECK: linalg.tiled_loop
-  // CHECK-NEXT: call
-  // CHECK-SAME: {__inplace_operands_attr__ = ["false"]}
-  // CHECK-NEXT: linalg.yield
-  // CHECK-SAME: {__inplace_operands_attr__ = ["true"]}
-  //      CHECK: } {__inplace_operands_attr__ = ["none", "none", "none", "true"]}
-  %r3 = linalg.tiled_loop (%i) = (%lb) to (%ub) step (%step)
-        ins()
-        outs(%t = %B: tensor<?xf32>) {
-    call @some_use(%t) : (tensor<?xf32>) -> ()
-    linalg.yield %t : tensor<?xf32>
-  }
-
   //      CHECK: return
-  // CHECK-SAME: __equivalent_func_args__ = [0, 1]
-  return %r1, %r3: tensor<?xf32>, tensor<?xf32>
+  // CHECK-SAME: __equivalent_func_args__ = [0]
+  return %r1: tensor<?xf32>
 }
 
 // -----

diff  --git a/mlir/test/Dialect/Linalg/comprehensive-module-bufferize.mlir b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize.mlir
index b76ece00c3123..b1e086c9e7035 100644
--- a/mlir/test/Dialect/Linalg/comprehensive-module-bufferize.mlir
+++ b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize.mlir
@@ -640,146 +640,6 @@ func private @print_memref_f32(tensor<*xf32>)
 
 // -----
 
-func private @some_use(memref<?xf32>)
-
-#TILE_MAP = affine_map<(d0)[s0] -> (3, -d0 + s0)>
-
-//  CHECK-DAG: #[[$DYN_0D_MAP:.*]] = affine_map<()[s0] -> (s0)>
-//  CHECK-DAG: #[[$DYN_1D_MAP:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)>
-//  CHECK-DAG: #[[$TILE_MAP:.*]] = affine_map<(d0)[s0] -> (3, -d0 + s0)>
-
-//      CHECK:  func @tiled_dot(
-// CHECK-SAME:    %[[A:[a-zA-Z0-9]*]]: memref<?xf32, #[[$DYN_1D_MAP]]>
-// CHECK-SAME:    %[[B:[a-zA-Z0-9]*]]: memref<?xf32, #[[$DYN_1D_MAP]]>
-// CHECK-SAME:    %[[c:[a-zA-Z0-9]*]]: memref<f32, #[[$DYN_0D_MAP]]>
-func @tiled_dot(
-    %A: tensor<?xf32> {linalg.inplaceable = false},
-    %B: tensor<?xf32> {linalg.inplaceable = false},
-    %c: tensor<f32> {linalg.inplaceable = true},
-    %effecting: memref<?xf32>)
-  -> tensor<f32>
-{
-  %c3 = arith.constant 3 : index
-  %c0 = arith.constant 0 : index
-
-  //     CHECK: %[[M:.*]] = memref.dim %[[A]], {{.*}} : memref<?xf32, #[[$DYN_1D_MAP:.*]]>
-  %0 = tensor.dim %A, %c0 : tensor<?xf32>
-
-  //     CHECK: linalg.tiled_loop {{.*}} to (%[[M]]) {{.*}} %[[A]]{{.*}}%[[B]]{{.*}}outs{{.*}}%[[c]]
-  // CHECK-NOT: copy
-  %1 = linalg.tiled_loop (%arg3) = (%c0) to (%0) step (%c3)
-       ins (%arg4 = %A: tensor<?xf32>, %use = %effecting : memref<?xf32>, %arg5 = %B: tensor<?xf32>)
-      outs (%arg6 = %c: tensor<f32>)
-      iterators["reduction"]
-  {
-    // CHECK-NOT:   alloc
-
-    %2 = tensor.dim %arg4, %c0 : tensor<?xf32>
-    %3 = affine.min #TILE_MAP(%arg3)[%2]
-
-    //     CHECK:   %[[SV_A:.*]] = memref.subview {{.*}}
-    %4 = tensor.extract_slice %arg4[%arg3] [%3] [1] : tensor<?xf32> to tensor<?xf32>
-    %5 = tensor.dim %arg5, %c0 : tensor<?xf32>
-    %6 = affine.min #TILE_MAP(%arg3)[%5]
-
-    //     CHECK:   %[[SV_B:.*]] = memref.subview {{.*}}
-    %7 = tensor.extract_slice %arg5[%arg3] [%6] [1] : tensor<?xf32> to tensor<?xf32>
-
-    //     CHECK:   linalg.dot ins(%[[SV_A]], %[[SV_B]] : memref<?xf32, #[[$DYN_1D_MAP:.*]]>, memref<?xf32, #[[$DYN_1D_MAP:.*]]>) outs(%{{.*}} : memref<f32, #[[$DYN_0D_MAP]]>)
-    %8 = linalg.dot ins(%4, %7 : tensor<?xf32>, tensor<?xf32>) outs(%arg6 : tensor<f32>) -> tensor<f32>
-
-    //     CHECK:   call @some_use(%{{.*}}) : (memref<?xf32>) -> ()
-    call @some_use(%use) : (memref<?xf32>) -> ()
-
-    linalg.yield %8 : tensor<f32>
-    //     CHECK:   linalg.yield
-    // CHECK-NOT:   tensor
-  }
-
-  //     CHECK: return
-  // CHECK-NOT: tensor
-  return %1 : tensor<f32>
-}
-
-// -----
-
-#TILE_MAP = affine_map<(d0)[s0] -> (3, -d0 + s0)>
-
-//  CHECK-DAG: #[[$DYN_MAP:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)>
-
-//      CHECK:  func @tiled_fill(
-// CHECK-SAME:    %[[A:[a-zA-Z0-9]*]]: memref<?xf32, #[[$DYN_MAP]]>
-func @tiled_fill(%A: tensor<?xf32> {linalg.inplaceable = true}) -> tensor<?xf32> {
-  %c3 = arith.constant 3 : index
-  %c0 = arith.constant 0 : index
-  %f0 = arith.constant 0.0 : f32
-
-  //     CHECK: %[[M:.*]] = memref.dim %[[A]], {{.*}} : memref<?xf32, #[[$DYN_MAP:.*]]>
-  %0 = tensor.dim %A, %c0 : tensor<?xf32>
-
-  //     CHECK: linalg.tiled_loop {{.*}} to (%[[M]]) {{.*}} outs{{.*}}%[[A]]
-  %1 = linalg.tiled_loop (%arg3) = (%c0) to (%0) step (%c3)
-      outs (%arg1 = %A: tensor<?xf32>)
-      iterators["parallel"]
-  {
-    // CHECK-NOT:   alloc
-
-    %2 = tensor.dim %arg1, %c0 : tensor<?xf32>
-    %3 = affine.min #TILE_MAP(%arg3)[%2]
-
-    //     CHECK:   %[[SV_A:.*]] = memref.subview {{.*}}
-    %4 = tensor.extract_slice %arg1[%arg3] [%3] [1] : tensor<?xf32> to tensor<?xf32>
-
-    //     CHECK:   linalg.fill(%{{.*}}, %[[SV_A]]) : f32, memref<?xf32, #[[$DYN_MAP:.*]]>
-    %5 = linalg.fill(%f0, %4) : f32, tensor<?xf32> -> tensor<?xf32>
-    %6 = tensor.insert_slice %5 into %arg1[%arg3] [%3] [1] : tensor<?xf32> into tensor<?xf32>
-
-    linalg.yield %6 : tensor<?xf32>
-    //     CHECK:   linalg.yield
-    // CHECK-NOT:   tensor
-  }
-
-  //     CHECK: return
-  // CHECK-NOT: tensor
-  return %1 : tensor<?xf32>
-}
-
-// -----
-
-//      CHECK:  func @tiled_loop_yield_out_of_place(
-// CHECK-SAME:    %[[A:[a-zA-Z0-9]*]]: memref<?xf32, #{{.*}}>,
-// CHECK-SAME:    %[[B:[a-zA-Z0-9]*]]: memref<?xf32, #{{.*}}>
-func @tiled_loop_yield_out_of_place(
-    %A: tensor<?xf32> {linalg.inplaceable = true},
-    %B: tensor<?xf32> {linalg.inplaceable = true})
-  -> tensor<?xf32>
-{
-  %c3 = arith.constant 3 : index
-  %c0 = arith.constant 0 : index
-  %f0 = arith.constant 0.0 : f32
-
-  //     CHECK: %[[M:.*]] = memref.dim %[[A]], {{.*}} : memref<?xf32, #[[$DYN_MAP:.*]]>
-  %0 = tensor.dim %A, %c0 : tensor<?xf32>
-
-  //     CHECK: linalg.tiled_loop {{.*}} to (%[[M]]) {{.*}} outs{{.*}}%[[A]]
-  %1 = linalg.tiled_loop (%arg3) = (%c0) to (%0) step (%c3)
-      outs (%arg1 = %A: tensor<?xf32>)
-      iterators["parallel"]
-  {
-    // CHECK-NOT:   alloc
-    //     CHECK:   memref.copy %[[B]], %[[A]]
-    linalg.yield %B : tensor<?xf32>
-    //     CHECK:   linalg.yield
-    // CHECK-NOT:   tensor
-  }
-
-  //     CHECK: return
-  // CHECK-NOT: tensor
-  return %1 : tensor<?xf32>
-}
-
-// -----
-
 // CHECK: #[[$DYNAMIC:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)>
 
 // CHECK: func private @external_func(memref<?xf32, #[[$DYNAMIC]]>)

diff  --git a/mlir/test/Dialect/Linalg/distribute-tiled-loop.mlir b/mlir/test/Dialect/Linalg/distribute-tiled-loop.mlir
deleted file mode 100644
index e7689ac8b339f..0000000000000
--- a/mlir/test/Dialect/Linalg/distribute-tiled-loop.mlir
+++ /dev/null
@@ -1,39 +0,0 @@
-// RUN: mlir-opt -test-linalg-distribution %s | FileCheck %s
-
-func private @foo(%A: tensor<64x64xf32>,
-                  %B: tensor<64x64xf32>) -> tensor<64x64xf32>
-
-func @distribute_for_gpu(%A: tensor<64x64xf32>,
-                         %B: tensor<64x64xf32>) -> tensor<64x64xf32> {
-  %c0 = arith.constant 0 : index
-  %c16 = arith.constant 16 : index
-  %c64 = arith.constant 64 : index
-  %c24 = arith.constant 24 : index
-  %0 = linalg.tiled_loop (%i, %j) = (%c0, %c0) to (%c64, %c64) step (%c24, %c16)
-      ins (%A_ = %A: tensor<64x64xf32>) outs (%B_ = %B:tensor<64x64xf32>)
-      distribution ["block_x", "block_y"] {
-    %0 = call @foo(%A_, %B_)
-      : (tensor<64x64xf32>, tensor<64x64xf32>) -> tensor<64x64xf32>
-    linalg.yield %0 : tensor<64x64xf32>
-  }
-  return %0 : tensor<64x64xf32>
-}
-
-// CHECK-DAG: #[[$MAP0:.+]] = affine_map<()[s0] -> (s0 * 24)>
-// CHECK-DAG: #[[$MAP1:.+]] = affine_map<()[s0] -> (s0 * 16)>
-
-// CHECK-LABEL: func @distribute_for_gpu
-// CHECK:  %[[C64:.*]] = arith.constant 64 : index
-
-// CHECK-DAG:  %[[GPU_BLOCK_X:.*]] = gpu.block_id x
-// CHECK-DAG:  %[[GPU_GRID_DIM_X:.*]] = gpu.grid_dim x
-// CHECK-DAG:  %[[LB_I:.*]] = affine.apply #[[$MAP0]](){{\[}}%[[GPU_BLOCK_X]]]
-// CHECK-DAG:  %[[STEP_I:.*]] = affine.apply #[[$MAP0]](){{\[}}%[[GPU_GRID_DIM_X]]]
-
-// CHECK-DAG:  %[[GPU_BLOCK_Y:.*]] = gpu.block_id y
-// CHECK-DAG:  %[[GPU_GRID_DIM_Y:.*]] = gpu.grid_dim y
-// CHECK-DAG:  %[[LB_J:.*]] = affine.apply #[[$MAP1]](){{\[}}%[[GPU_BLOCK_Y]]]
-// CHECK-DAG:  %[[STEP_J:.*]] = affine.apply #[[$MAP1]](){{\[}}%[[GPU_GRID_DIM_Y]]]
-
-// CHECK:  linalg.tiled_loop (%[[I:.*]], %[[J:.*]]) = (%[[LB_I]], %[[LB_J]])
-// CHECK-SAME: to (%[[C64]], %[[C64]]) step (%[[STEP_I]], %[[STEP_J]])

diff  --git a/mlir/test/Dialect/Linalg/fusion-tensor-pattern.mlir b/mlir/test/Dialect/Linalg/fusion-tensor-pattern.mlir
index 9eb0e35860f8f..ef315c96f16b4 100644
--- a/mlir/test/Dialect/Linalg/fusion-tensor-pattern.mlir
+++ b/mlir/test/Dialect/Linalg/fusion-tensor-pattern.mlir
@@ -1,5 +1,4 @@
 // RUN: mlir-opt %s -test-linalg-tensor-fusion-transform-patterns -resolve-shaped-type-result-dims -canonicalize -cse --split-input-file | FileCheck %s
-// RUN: mlir-opt %s -test-linalg-tiled-loop-fusion-transform-patterns -resolve-shaped-type-result-dims -canonicalize -cse  --split-input-file | FileCheck %s --check-prefix=TLOOP
 
 module {
   func @matmul_fusion(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>,
@@ -83,64 +82,6 @@ module {
 //      CHECK:   }
 //      CHECK:   return %[[RESULT]]
 
-// TLOOP-LABEL:  func @matmul_fusion(
-// TLOOP-SAME: %[[A:[a-zA-Z0-9_]+]]: tensor<?x?xf32>,
-// TLOOP-SAME: %[[B:[a-zA-Z0-9_]+]]: tensor<?x?xf32>,
-// TLOOP-SAME: %[[AB_INIT:[a-zA-Z0-9_]+]]: tensor<?x?xf32>,
-// TLOOP-SAME: %[[C:[a-zA-Z0-9_]+]]: tensor<?x?xf32>,
-// TLOOP-SAME: %[[ABC_INIT:[a-zA-Z0-9_]+]]: tensor<?x?xf32>) -> tensor<?x?xf32> {
-
-// TLOOP-DAG:  %[[C32:.*]] = arith.constant 32 : index
-// TLOOP-DAG:  %[[C64:.*]] = arith.constant 64 : index
-// TLOOP-DAG:  %[[C16:.*]] = arith.constant 16 : index
-// TLOOP-DAG:  %[[C0:.*]] = arith.constant 0 : index
-// TLOOP-DAG:  %[[C1:.*]] = arith.constant 1 : index
-
-// TLOOP:  %[[DIM_A0:.*]] = tensor.dim %[[A]], %[[C0]] : [[TY:.*]]
-
-// TLOOP:  %[[ABC:.*]] = linalg.tiled_loop (%[[IV0:.*]]) = (%[[C0]]) 
-// TLOOP-SAME: to (%[[DIM_A0]]) step (%[[C32]]) 
-// TLOOP-SAME: ins (%[[C_:.*]] = %[[C]]: tensor<?x?xf32>,
-// TLOOP-SAME:      %[[A_:.*]] = %[[A]]: tensor<?x?xf32>,
-// TLOOP-SAME:      %[[B_:.*]] = %[[B]]: tensor<?x?xf32>,
-// TLOOP-SAME:      %[[AB_INIT_:.*]] = %[[AB_INIT]]: tensor<?x?xf32>)
-// TLOOP-SAME: outs (%[[ABC_INIT_:.*]] = %[[ABC_INIT]]: tensor<?x?xf32>) {
-
-// TLOOP:    %[[ABC_INIT_SUB:.*]] = tensor.extract_slice %[[ABC_INIT_]][%[[IV0]], 0]
-// TLOOP:    %[[A_SUB:.*]] = tensor.extract_slice %[[A_]][%[[IV0]], 0]
-// TLOOP:    %[[AB_INIT_SUB:.*]] = tensor.extract_slice %[[AB_INIT_]][%[[IV0]], 0]
-
-// TLOOP:    %[[AB_SUB:.*]] = linalg.matmul
-// TLOOP-SAME:  ins(%[[A_SUB]], %[[B_]] : {{.*}}) outs(%[[AB_INIT_SUB]]
-
-// TLOOP:    %[[DIM_B_1:.*]] = tensor.dim %[[B]], %[[C1]] : [[TY]]
-// TLOOP:    %[[DIM_C_1:.*]] = tensor.dim %[[C]], %[[C1]] : [[TY]]
-
-// TLOOP:    %[[ABC_SUB_:.*]] = linalg.tiled_loop (%[[IV1:.*]], %[[IV2:.*]]) = 
-// TLOOP-SAME: (%[[C0]], %[[C0]]) to (%[[DIM_C_1]], %[[DIM_B_1]])
-// TLOOP-SAME: step (%[[C64]], %[[C16]]) 
-// TLOOP-SAME: ins (%[[AB_SUB_:.*]] = %[[AB_SUB]]: [[TY]],
-// TLOOP-SAME:      %[[C__:.*]] = %[[C_]]: [[TY]])
-// TLOOP-SAME: outs (%[[ABC_INIT_SUB_:.*]] = %[[ABC_INIT_SUB]]: [[TY]])
-// TLOOP-SAME: iterators["parallel", "reduction"] {
-
-// TLOOP:      %[[AB_SUB_SUB:.*]] = tensor.extract_slice %[[AB_SUB_]][0, %[[IV2]]]
-// TLOOP:      %[[C__SUB:.*]] = tensor.extract_slice %[[C__]][%[[IV2]], %[[IV1]]]
-// TLOOP:      %[[ABS_INIT_SUB_SUB:.*]] = tensor.extract_slice %[[ABC_INIT_SUB_]][0, %[[IV1]]]
-
-// TLOOP:      %[[ABC_SUB_SUB:.*]] = linalg.matmul
-// TLOOP-SAME:  ins(%[[AB_SUB_SUB]], %[[C__SUB]] : [[TY]], [[TY]])
-// TLOOP-SAME:  outs(%[[ABS_INIT_SUB_SUB]] : [[TY]]) -> [[TY]]
-
-// TLOOP:      %[[RES0:.*]] = tensor.insert_slice %[[ABC_SUB_SUB]]
-// TLOOP-SAME:   into %[[ABC_INIT_SUB_]][0, %[[IV1]]]
-// TLOOP:      linalg.yield %[[RES0]] : [[TY]]
-// TLOOP:    }
-// TLOOP:    %[[RES1:.*]] = tensor.insert_slice %[[ABC_SUB_]] into %[[ABC_INIT_]][%[[IV0]], 0]
-// TLOOP:    linalg.yield %[[RES1]] : [[TY]]
-// TLOOP:  }
-// TLOOP:  return %[[ABC]] : [[TY]]
-
 // -----
 
 module {
@@ -195,48 +136,6 @@ module {
 //       CHECK:     scf.yield %[[YIELD]]
 //       CHECK:   return %[[RESULT]]
 
-// TLOOP-LABEL: func @matmul_plus_matmul
-// TLOOP-SAME:    %[[A:[a-zA-Z0-9_]+]]: tensor<?x?xf32>,
-// TLOOP-SAME:    %[[B:[a-zA-Z0-9_]+]]: tensor<?x?xf32>,
-// TLOOP-SAME:    %[[AB:[a-zA-Z0-9_]+]]: tensor<?x?xf32>
-
-// TLOOP-DAG:  %[[C32:.*]] = arith.constant 32 : index
-// TLOOP-DAG:  %[[C64:.*]] = arith.constant 64 : index
-// TLOOP-DAG:  %[[C0:.*]] = arith.constant 0 : index
-// TLOOP-DAG:  %[[C1:.*]] = arith.constant 1 : index
-
-// TLOOP:  %[[DIM_A_0:.*]] = tensor.dim %[[A]], %[[C0]] : [[TY:.*]]
-// TLOOP:  %[[DIM_B_1:.*]] = tensor.dim %[[B]], %[[C1]] : [[TY]]
-
-// TLOOP:  %[[INIT:.*]] = linalg.init_tensor [%[[DIM_A_0]], %[[DIM_B_1]]]
-
-// TLOOP:  %[[RESULT:.*]] = linalg.tiled_loop (%[[IV0:.*]], %[[IV1:.*]]) =
-// TLOOP-SAME: (%[[C0]], %[[C0]]) to (%[[DIM_A_0]], %[[DIM_B_1]])
-// TLOOP-SAME: step (%[[C32]], %[[C64]])
-// TLOOP-SAME: ins (%[[A_:.*]] = %[[A]]: [[TY]],
-// TLOOP-SAME:      %[[B_:.*]] = %[[B]]: [[TY]],
-// TLOOP-SAME:      %[[AB_:.*]] = %[[AB]]: [[TY]])
-// TLOOP-SAME: outs (%[[INIT_:.*]] = %[[INIT]]: [[TY]]) {
-
-// TLOOP:    %[[INIT_SUB:.*]] = tensor.extract_slice %[[INIT_]][%[[IV0]], %[[IV1]]]
-// TLOOP:    %[[A_SUB:.*]] = tensor.extract_slice %[[A_]][%[[IV0]], 0]
-// TLOOP:    %[[B_SUB:.*]] = tensor.extract_slice %[[B_]][0, %[[IV1]]]
-// TLOOP:    %[[AB_SUB_INIT:.*]] = tensor.extract_slice %[[AB_]][%[[IV0]], %[[IV1]]]
-
-// TLOOP:    %[[AB_SUB:.*]] = linalg.matmul
-// TLOOP-SAME:  ins(%[[A_SUB]], %[[B_SUB]] : [[TY]], [[TY]])
-// TLOOP-SAME:  outs(%[[AB_SUB_INIT]] : [[TY]])
-
-// TLOOP:    %[[DOUBLE_AB:.*]] = linalg.generic
-// TLOOP-SAME:  ins(%[[AB_SUB]] : [[TY]]) outs(%[[INIT_SUB]] : [[TY]])
-
-// TLOOP:    %[[RESULT_SUB:.*]] = tensor.insert_slice
-// TLOOP-SAME:  %[[DOUBLE_AB:.*]] into %[[INIT_]][%[[IV0]], %[[IV1]]]
-
-// TLOOP:    linalg.yield %[[RESULT_SUB]] : [[TY]]
-// TLOOP:  }
-// TLOOP:  return %[[RESULT]] : [[TY]]
-
 // -----
 
 module {
@@ -270,59 +169,6 @@ module {
 //       CHECK:     %[[MM:.*]] = tensor.insert_slice %[[ST_MM_RES]] into {{.*}}
 //       CHECK:     scf.yield %[[MM]] : tensor<?x?xf32>
 
-
-// TLOOP-LABEL: func @matmul_out_fusion(
-// TLOOP-SAME:    %[[OUT:[a-zA-Z0-9_]+]]: tensor<?x?xf32>
-// TLOOP-SAME:    %[[A:[a-zA-Z0-9_]+]]: tensor<?x?xf32>
-// TLOOP-SAME:    %[[B:[a-zA-Z0-9_]+]]: tensor<?x?xf32>
-
-// TLOOP-DAG:  %[[C0_F32:.*]] = arith.constant 0.0
-// TLOOP-DAG:  %[[C32:.*]] = arith.constant 32 : index
-// TLOOP-DAG:  %[[C64:.*]] = arith.constant 64 : index
-// TLOOP-DAG:  %[[C16:.*]] = arith.constant 16 : index
-// TLOOP-DAG:  %[[C0:.*]] = arith.constant 0 : index
-// TLOOP-DAG:  %[[C1:.*]] = arith.constant 1 : index
-
-// TLOOP:  %[[DIM_A_0:.*]] = tensor.dim %[[A]], %[[C0]] : [[TY:.*]]
-// TLOOP:  %[[DIM_B_1:.*]] = tensor.dim %[[B]], %[[C1]] : [[TY]]
-
-// TLOOP:  %[[AB:.*]] = linalg.tiled_loop (%[[I:.*]], %[[J:.*]]) =
-// TLOOP-SAME: (%[[C0]], %[[C0]]) to (%[[DIM_A_0]], %[[DIM_B_1]])
-// TLOOP-SAME: step (%[[C32]], %[[C64]])
-// TLOOP-SAME: ins (%[[A_:.*]] = %[[A]]: [[TY]],
-// TLOOP-SAME:      %[[B_:.*]] = %[[B]]: [[TY]],
-// TLOOP-SAME:      %[[C0_F32_:.*]] = %[[C0_F32]]
-// TLOOP-SAME: outs (%[[OUT_:.*]] = %[[OUT]]: [[TY]]) {
-
-// TLOOP:    %[[DIM_A__1:.*]] = tensor.dim %[[A]], %[[C1]] : [[TY]]
-// TLOOP:    %[[A_SUB:.*]] = tensor.extract_slice %[[A_]][%[[I]], 0]
-// TLOOP:    %[[B_SUB:.*]] = tensor.extract_slice %[[B_]][0, %[[J]]]
-// TLOOP:    %[[OUT_SUB:.*]] = tensor.extract_slice %[[OUT_]][%[[I]], %[[J]]]
-// TLOOP:    %[[INIT_SUB:.*]] = linalg.fill(%[[C0_F32_]], %[[OUT_SUB]])
-
-// TLOOP:    %[[AB_SUB:.*]] = linalg.tiled_loop (%[[K:.*]]) = (%[[C0]])
-// TLOOP-SAME: to (%[[DIM_A__1]]) step (%[[C16]])
-// TLOOP-SAME: ins (%[[A_SUB_:.*]] = %[[A_SUB]]: [[TY]],
-// TLOOP-SAME:      %[[B_SUB_:.*]] = %[[B_SUB]]: [[TY]])
-// TLOOP-SAME: outs (%[[INIT_SUB_:.*]] = %[[INIT_SUB]]: [[TY]])
-// TLOOP-SAME: iterators["reduction"] {
-
-// TLOOP:      %[[A_SUB_SUB:.*]] = tensor.extract_slice %[[A_SUB_]][0, %[[K]]]
-// TLOOP:      %[[B_SUB_SUB:.*]] = tensor.extract_slice %[[B_SUB_]][%[[K]], 0]
-// TLOOP:      %[[INIT_SUB_SUB:.*]] = tensor.extract_slice %[[INIT_SUB_]][0, 0]
-
-// TLOOP:      %[[AB_SUB_SUB:.*]] = linalg.matmul
-// TLOOP-SAME:   ins(%[[A_SUB_SUB]], %[[B_SUB_SUB]] : [[TY]], [[TY]])
-// TLOOP-SAME:   outs(%[[INIT_SUB_SUB]] : [[TY]]) -> [[TY]]
-// TLOOP:      %[[AB_SUB_:.*]] = tensor.insert_slice %[[AB_SUB_SUB]] into %[[INIT_SUB_]]
-// TLOOP:      linalg.yield %[[AB_SUB_]] : [[TY]]
-// TLOOP:    }
-// TLOOP:    %[[SUB_RESULT:.*]] = tensor.insert_slice %[[AB_SUB]]
-// TLOOP-SAME:  into %[[OUT_]][%[[I]], %[[J]]]
-// TLOOP:    linalg.yield %[[SUB_RESULT]] : [[TY]]
-// TLOOP:  }
-// TLOOP:  return %[[AB]] : [[TY]]
-
 // -----
 
 module {
@@ -343,58 +189,3 @@ module {
     return %1 : tensor<?x?xf32>
   }
 }
-
-// TLOOP-LABEL: func @generic_plus_matmul(
-// TLOOP-SAME:    %[[OUT:[a-zA-Z0-9_]+]]: tensor<?x?xf32>
-// TLOOP-SAME:    %[[A:[a-zA-Z0-9_]+]]: tensor<?x?xf32>
-// TLOOP-SAME:    %[[B:[a-zA-Z0-9_]+]]: tensor<?x?xf32>
-
-// TLOOP-DAG:  %[[C0_F32:.*]] = arith.constant 0.0
-// TLOOP-DAG:  %[[C32:.*]] = arith.constant 32 : index
-// TLOOP-DAG:  %[[C64:.*]] = arith.constant 64 : index
-// TLOOP-DAG:  %[[C16:.*]] = arith.constant 16 : index
-// TLOOP-DAG:  %[[C0:.*]] = arith.constant 0 : index
-// TLOOP-DAG:  %[[C1:.*]] = arith.constant 1 : index
-
-// TLOOP:  %[[DIM_A_0:.*]] = tensor.dim %[[A]], %[[C0]] : [[TY:.*]]
-// TLOOP:  %[[DIM_B_1:.*]] = tensor.dim %[[B]], %[[C1]] : [[TY]]
-
-// TLOOP:  %[[AB:.*]] = linalg.tiled_loop (%[[I:.*]], %[[J:.*]]) =
-// TLOOP-SAME: (%[[C0]], %[[C0]]) to (%[[DIM_A_0]], %[[DIM_B_1]])
-// TLOOP-SAME: step (%[[C32]], %[[C64]])
-// TLOOP-SAME: ins (%[[A_:.*]] = %[[A]]: [[TY]],
-// TLOOP-SAME:      %[[B_:.*]] = %[[B]]: [[TY]],
-// TLOOP-SAME:      %[[C0_F32_:.*]] = %[[C0_F32]]
-// TLOOP-SAME: outs (%[[OUT_:.*]] = %[[OUT]]: [[TY]]) {
-
-// TLOOP:    %[[DIM_A__1:.*]] = tensor.dim %[[A]], %[[C1]] : [[TY]]
-// TLOOP:    %[[A_SUB:.*]] = tensor.extract_slice %[[A_]][%[[I]], 0]
-// TLOOP:    %[[B_SUB:.*]] = tensor.extract_slice %[[B_]][0, %[[J]]]
-// TLOOP:    %[[OUT_SUB:.*]] = tensor.extract_slice %[[OUT_]][%[[I]], %[[J]]]
-// TLOOP:    %[[INIT_SUB:.*]] = linalg.generic
-// TLOOP-SAME: ins(%[[C0_F32_]]
-// TLOOP-SAME: outs(%[[OUT_SUB]]
-
-// TLOOP:    %[[AB_SUB:.*]] = linalg.tiled_loop (%[[K:.*]]) = (%[[C0]])
-// TLOOP-SAME: to (%[[DIM_A__1]]) step (%[[C16]])
-// TLOOP-SAME: ins (%[[A_SUB_:.*]] = %[[A_SUB]]: [[TY]],
-// TLOOP-SAME:      %[[B_SUB_:.*]] = %[[B_SUB]]: [[TY]])
-// TLOOP-SAME: outs (%[[INIT_SUB_:.*]] = %[[INIT_SUB]]: [[TY]])
-// TLOOP-SAME: iterators["reduction"] {
-
-// TLOOP:      %[[A_SUB_SUB:.*]] = tensor.extract_slice %[[A_SUB_]][0, %[[K]]]
-// TLOOP:      %[[B_SUB_SUB:.*]] = tensor.extract_slice %[[B_SUB_]][%[[K]], 0]
-// TLOOP:      %[[INIT_SUB_SUB:.*]] = tensor.extract_slice %[[INIT_SUB_]][0, 0]
-
-// TLOOP:      %[[AB_SUB_SUB:.*]] = linalg.matmul
-// TLOOP-SAME:   ins(%[[A_SUB_SUB]], %[[B_SUB_SUB]] : [[TY]], [[TY]])
-// TLOOP-SAME:   outs(%[[INIT_SUB_SUB]] : [[TY]]) -> [[TY]]
-// TLOOP:      %[[AB_SUB_:.*]] = tensor.insert_slice %[[AB_SUB_SUB]] into %[[INIT_SUB_]]
-// TLOOP:      linalg.yield %[[AB_SUB_]] : [[TY]]
-// TLOOP:    }
-// TLOOP:    %[[SUB_RESULT:.*]] = tensor.insert_slice %[[AB_SUB]]
-// TLOOP-SAME:  into %[[OUT_]][%[[I]], %[[J]]]
-// TLOOP:    linalg.yield %[[SUB_RESULT]] : [[TY]]
-// TLOOP:  }
-// TLOOP:  return %[[AB]] : [[TY]]
-

diff  --git a/mlir/test/Dialect/Linalg/invalid.mlir b/mlir/test/Dialect/Linalg/invalid.mlir
index c45c58abf91c8..081df97b7a0fc 100644
--- a/mlir/test/Dialect/Linalg/invalid.mlir
+++ b/mlir/test/Dialect/Linalg/invalid.mlir
@@ -411,110 +411,6 @@ func @invalid_static_2d_conv(%input : memref<1x3x4x2xf32>, %filter: memref<3x2x2
 
 // -----
 
-#map0 = affine_map<(d0) -> (24, -d0 + 192)>
-#map1 = affine_map<(d0, d1)[s0] -> (d0 * 192 + s0 + d1)>
-#map2 = affine_map<(d0) -> (16, -d0 + 192)>
-
-func private @foo(%A: memref<192x192xf32>, %B: memref<192x192xf32>,
-                  %C: memref<192x192xf32>) -> ()
-
-func @tiled_loop_incorrect_num_yield_operands(%A: memref<192x192xf32>,
-    %B: memref<192x192xf32>, %C: memref<192x192xf32>,
-    %C_tensor: tensor<192x192xf32>) {
-  %c24 = arith.constant 24 : index
-  %c0 = arith.constant 0 : index
-  %c192 = arith.constant 192 : index
-  %0 = linalg.tiled_loop (%i, %j) = (%c0, %c0) to (%c192, %c192)
-      step (%c24, %c24)
-      ins (%A_ = %A: memref<192x192xf32>, %B_ = %B: memref<192x192xf32>)
-      outs (%CT_ = %C_tensor: tensor<192x192xf32>,
-            %C_ = %C: memref<192x192xf32>) {
-        call @foo(%A_, %B_, %C_)
-          : (memref<192x192xf32>, memref<192x192xf32>, memref<192x192xf32>)-> ()
-    // expected-error @+1 {{expected number of tensor output args = 1 to match the number of yield operands = 0}}
-    linalg.yield
-  }
-  return
-}
-
-// -----
-
-#map0 = affine_map<(d0) -> (24, -d0 + 192)>
-#map1 = affine_map<(d0, d1)[s0] -> (d0 * 192 + s0 + d1)>
-#map2 = affine_map<(d0) -> (16, -d0 + 192)>
-
-func private @foo(%A: memref<192x192xf32>, %B: memref<192x192xf32>,
-                  %C: memref<192x192xf32>) -> tensor<f32>
-
-func @tiled_loop_incorrent_yield_operand_type(%A: memref<192x192xf32>,
-    %B: memref<192x192xf32>, %C: memref<192x192xf32>,
-    %C_tensor: tensor<192x192xf32>) {
-  %c24 = arith.constant 24 : index
-  %c0 = arith.constant 0 : index
-  %c192 = arith.constant 192 : index
-  %0 = linalg.tiled_loop (%i, %j) = (%c0, %c0) to (%c192, %c192)
-      step (%c24, %c24)
-      ins (%A_ = %A: memref<192x192xf32>, %B_ = %B: memref<192x192xf32>)
-      outs (%CT_ = %C_tensor: tensor<192x192xf32>,
-            %C_ = %C: memref<192x192xf32>) {
-        %1 = call @foo(%A_, %B_, %C_)
-          : (memref<192x192xf32>, memref<192x192xf32>, memref<192x192xf32>)-> tensor<f32>
-    // expected-error @+1 {{expected yield operand 0 with type = 'tensor<f32>' to match output arg type = 'tensor<192x192xf32>}}
-    linalg.yield %1 : tensor<f32>
-  }
-  return
-}
-
-// -----
-
-func private @foo(%A: memref<192x192xf32>, %B: memref<192x192xf32>,
-                  %C: memref<192x192xf32>) -> ()
-
-func @tiled_loop_incorrent_iterator_types_count(%A: memref<192x192xf32>,
-    %B: memref<192x192xf32>, %C: memref<192x192xf32>,
-    %C_tensor: tensor<192x192xf32>) {
-  %c24 = arith.constant 24 : index
-  %c0 = arith.constant 0 : index
-  %c192 = arith.constant 192 : index
-  // expected-error @+1 {{expected iterator types array attribute size = 1 to match the number of loops = 2}}
-  %0 = "linalg.tiled_loop"(%c0, %c0, %c192, %c192, %c24, %c24, %A, %B, %C_tensor, %C) ( {
-    ^bb0(%arg4: index, %arg5: index, %A_: memref<192x192xf32>,
-         %B_: memref<192x192xf32>, %CT_: tensor<192x192xf32>,
-         %C_: memref<192x192xf32>):
-      call @foo(%A_, %B_, %C_)
-          : (memref<192x192xf32>, memref<192x192xf32>, memref<192x192xf32>)-> ()
-      linalg.yield %CT_ : tensor<192x192xf32>
-    }) {
-      iterator_types = ["parallel"],
-      operand_segment_sizes = dense<2> : vector<5xi32>
-    } : (index, index, index, index, index, index, memref<192x192xf32>,
-      memref<192x192xf32>, tensor<192x192xf32>, memref<192x192xf32>
-    ) -> tensor<192x192xf32>
-  return
-}
-
-// -----
-
-func private @foo(%A: memref<100xf32>) -> ()
-
-func @tiled_loop_incorrent_block_arg_type(%A: memref<192xf32>) {
-  %c0 = arith.constant 0 : index
-  %c192 = arith.constant 192 : index
-  %c24 = arith.constant 24 : index
-  // expected-error @+1 {{expected output arg 0 with type = 'memref<192xf32>' to match region arg 1 type = 'memref<100xf32>'}}
-  "linalg.tiled_loop"(%c0, %c192, %c24, %A) ( {
-    ^bb0(%arg4: index, %A_: memref<100xf32>):
-      call @foo(%A_) : (memref<100xf32>)-> ()
-      linalg.yield
-    }) {
-      iterator_types = ["parallel"],
-      operand_segment_sizes = dense<[1, 1, 1, 0, 1]> : vector<5xi32>
-    } : (index, index, index, memref<192xf32>) -> ()
-  return
-}
-
-// -----
-
 #attrs = {
         indexing_maps = [
                 affine_map<(i) -> (3 - i)>,

diff --git a/mlir/test/Dialect/Linalg/roundtrip.mlir b/mlir/test/Dialect/Linalg/roundtrip.mlir
index 0c70868cbc32c..f2957f73e6221 100644
--- a/mlir/test/Dialect/Linalg/roundtrip.mlir
+++ b/mlir/test/Dialect/Linalg/roundtrip.mlir
@@ -6,8 +6,6 @@
 // Test that we can lower all the way to LLVM without crashing, don't check results here.
 // DISABLED: mlir-opt %s --convert-linalg-to-llvm -o=/dev/null 2>&1
 
-// CHECK-DAG: #[[$id_2d:.*]] = affine_map<(d0, d1, d2) -> (d0, d2)>
-// CHECK-DAG: #[[$id_1d:.*]] = affine_map<(d0, d1, d2) -> (d1)>
 // CHECK-DAG: #[[$strided1D:.*]] = affine_map<(d0)[s0] -> (d0 + s0)>
 // CHECK-DAG: #[[$strided2D:.*]] = affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>
 // CHECK-DAG: #[[$strided3D:.*]] = affine_map<(d0, d1, d2)[s0, s1, s2] -> (d0 * s1 + s0 + d1 * s2 + d2)>
@@ -155,7 +153,7 @@ func @generic_without_inputs(%arg0 : memref<?x?x?xf32>) {
   linalg.generic  {indexing_maps = [#map0],
                    iterator_types = ["parallel", "parallel", "parallel"]}
                   outs(%arg0 : memref<?x?x?xf32>) {
-   ^bb0(%arg3: f32):  
+   ^bb0(%arg3: f32):
       %cst = arith.constant 0.000000e+00 : f32
       linalg.yield %cst : f32
     }
@@ -218,7 +216,7 @@ func @generic_with_multiple_tensor_outputs(
     iterator_types = ["reduction"]}
     ins(%arg0, %arg1 : tensor<?xi32>, tensor<?xi32>)
     outs(%1, %3 : tensor<i32>, tensor<i32>) {
-  ^bb0(%arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32):  
+  ^bb0(%arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32):
     %5 = arith.cmpi sge, %arg3, %arg5 : i32
     %6 = arith.select %5, %arg3, %arg5 : i32
     %7 = arith.cmpi eq, %arg3, %arg5 : i32
@@ -352,173 +350,3 @@ func @fill_tensor(%arg0 : index, %arg1 : index, %arg2 : f32) -> tensor<?x?xf32>
   return %1 : tensor<?x?xf32>
 }
 // CHECK: %{{.+}} = linalg.fill(%{{.+}}, %{{.+}}) : f32, tensor<?x?xf32> -> tensor<?x?xf32>
-
-// -----
-
-#accesses_4 = [
-  affine_map<(i, j) -> (i, j)>,
-  affine_map<(i, j) -> (i, j)>,
-  affine_map<(i, j) -> (i, j)>
-]
-
-#trait_4 = {
-  indexing_maps = #accesses_4,
-  iterator_types = ["parallel", "parallel"]
-}
-
-func @tiled_loop(%lhs: tensor<24x64xi8>, %rhs: tensor<24x64xi8>,
-                 %out: tensor<24x64xi8>) -> tensor<24x64xi8> {
- %c0 = arith.constant 0 : index
- %c1 = arith.constant 1 : index
- %c4 = arith.constant 4 : index
- %c24 = arith.constant 24 : index
- %c64 = arith.constant 64 : index
- %prod = linalg.tiled_loop (%i) = (%c0) to (%c24) step (%c4)
-      ins(%lhs_ = %lhs: tensor<24x64xi8>, %rhs_ = %rhs: tensor<24x64xi8>)
-      outs(%out_ = %out: tensor<24x64xi8>) {
-    %lhs_sub = tensor.extract_slice %lhs_[%i, 0] [%c4, %c64] [1, 1]
-        : tensor<24x64xi8> to tensor<?x?xi8>
-    %rhs_sub = tensor.extract_slice %rhs_[%i, 0] [%c4, %c64] [1, 1]
-        : tensor<24x64xi8> to tensor<?x?xi8>
-    %out_sub = tensor.extract_slice %out_[%i, 0] [%c4, %c64] [1, 1]
-        : tensor<24x64xi8> to tensor<?x?xi8>
-
-    %sum = linalg.generic #trait_4
-        ins(%lhs_sub, %rhs_sub : tensor<?x?xi8>, tensor<?x?xi8>)
-        outs(%out_sub : tensor<?x?xi8>) {
-      ^bb(%l: i8, %r: i8, %o: i8) :
-        %s = arith.addi %l, %r : i8
-        linalg.yield %s : i8
-      } -> tensor<?x?xi8>
-
-    %sum_sub = tensor.insert_slice %sum into %out_[%i, 0][%c4, %c64][1, 1]
-      : tensor<?x?xi8> into tensor<24x64xi8>
-    linalg.yield %sum_sub : tensor<24x64xi8>
-  }
-  return %prod : tensor<24x64xi8>
-}
-// CHECK-LABEL: func @tiled_loop
-// CHECK-NOT: iterators[
-
-// -----
-
-#id_3d = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
-#id_2d = affine_map<(d0, d1, d2) -> (d0, d2)>
-#id_1d = affine_map<(d0, d1, d2) -> (d1)>
-
-#trait_5 = {
-  indexing_maps = [
-    #id_3d,
-    #id_2d,
-    #id_1d,
-    #id_1d
-  ],
-  iterator_types = ["reduction", "parallel", "reduction"]
-}
-
-func @tiled_loop_reduction(%input_3d: tensor<16x24x32xf32>,
-                           %input_2d: tensor<16x32xf32>,
-                           %input_1d: tensor<24xf32>,
-                           %output: tensor<24xf32>) -> tensor<24xf32> {
-  %c0 = arith.constant 0 : index
-  %c1 = arith.constant 1 : index
-  %c2 = arith.constant 2 : index
-  %c4 = arith.constant 4 : index
-  %c8 = arith.constant 8 : index
-  %X = tensor.dim %input_3d, %c0 : tensor<16x24x32xf32>
-  %Y = tensor.dim %input_3d, %c1 : tensor<16x24x32xf32>
-  %Z = tensor.dim %input_3d, %c2 : tensor<16x24x32xf32>
-  %result = linalg.tiled_loop (%i, %j, %k)
-      = (%c0, %c0, %c0) to (%X, %Y, %Z) step (%c2, %c4, %c8)
-      ins(%i3d_ = %input_3d: tensor<16x24x32xf32>,
-          %i2d_ = %input_2d: tensor<16x32xf32>,
-          %i1d_ = %input_1d: tensor<24xf32>)
-      outs(%o_ =  %output: tensor<24xf32>)
-      iterators["reduction", "parallel", "reduction"]
-      distribution["block_x", "block_y", "none"] {
-    %sub_3d = tensor.extract_slice %i3d_[%i, %j, %k][2, 4, 8][1, 1, 1]
-      : tensor<16x24x32xf32> to tensor<2x4x8xf32>
-    %sub_2d = tensor.extract_slice %i2d_[%i, %k][2, 8][1, 1]
-      : tensor<16x32xf32> to tensor<2x8xf32>
-    %sub_1d = tensor.extract_slice %i1d_[%j] [4] [1]
-      : tensor<24xf32> to tensor<4xf32>
-    %sub_out = tensor.extract_slice %o_[%j] [4] [1]
-      : tensor<24xf32> to tensor<4xf32>
-    %acc = linalg.generic #trait_5
-      ins(%sub_3d, %sub_2d, %sub_1d
-        : tensor<2x4x8xf32>, tensor<2x8xf32>, tensor<4xf32>)
-      outs(%sub_out : tensor<4xf32>)  {
-    ^bb0(%i3d: f32, %i2d: f32, %i1d: f32, %o: f32):
-      %0 = arith.addf %i3d, %i2d : f32
-      %1 = arith.addf %0, %i1d : f32
-      linalg.yield %1 : f32
-    } -> tensor<4xf32>
-
-    %sum_sub = tensor.insert_slice %acc into %o_[%j][4][1]
-      : tensor<4xf32> into tensor<24xf32>
-    linalg.yield %sum_sub : tensor<24xf32>
-  }
-  return %result : tensor<24xf32>
-}
-// CHECK-LABEL: func @tiled_loop_reduction
-// CHECK: iterators[
-
-// -----
-
-#trait_6 = {
-  indexing_maps = [
-    #id_3d,
-    #id_2d,
-    #id_1d,
-    #id_1d
-  ],
-  iterator_types = ["reduction", "parallel", "reduction"]
-}
-#map_1 = affine_map<(d0, d1, d2)[s0] -> (d0 * 768 + s0 + d1 * 32 + d2)>
-#map_2 = affine_map<(d0, d1)[s0] -> (d0 * 32 + s0 + d1)>
-#map_3 = affine_map<(d0)[s0] -> (d0 + s0)>
-
-func @tiled_loop_on_buffers(%input_3d: memref<16x24x32xf32>,
-                            %input_2d: memref<16x32xf32>,
-                            %input_1d: memref<24xf32>,
-                            %output: memref<24xf32>) {
-  %c0 = arith.constant 0 : index
-  %c1 = arith.constant 1 : index
-  %c2 = arith.constant 2 : index
-  %c4 = arith.constant 4 : index
-  %c8 = arith.constant 8 : index
-  %X = memref.dim %input_3d, %c0 : memref<16x24x32xf32>
-  %Y = memref.dim %input_3d, %c1 : memref<16x24x32xf32>
-  %Z = memref.dim %input_3d, %c2 : memref<16x24x32xf32>
-  linalg.tiled_loop (%i, %j, %k) = (%c0, %c0, %c0)
-      to (%X, %Y, %Z) step (%c2, %c4, %c8)
-      ins(%i3d_ = %input_3d: memref<16x24x32xf32>,
-          %i2d_ = %input_2d: memref<16x32xf32>,
-          %i1d_ = %input_1d: memref<24xf32>)
-      outs(%o_ =  %output: memref<24xf32>)
-      iterators["reduction", "parallel", "reduction"] {
-    %sub_3d = memref.subview %i3d_[%i, %j, %k][2, 4, 8][1, 1, 1]
-      : memref<16x24x32xf32> to memref<2x4x8xf32, #map_1>
-    %sub_2d = memref.subview %i2d_[%i, %k][2, 8][1, 1]
-      : memref<16x32xf32> to memref<2x8xf32, #map_2>
-    %sub_1d = memref.subview %i1d_[%j] [4] [1]
-      : memref<24xf32> to memref<4xf32, #map_3>
-    %sub_out = memref.subview %o_[%j] [4] [1]
-      : memref<24xf32> to memref<4xf32, #map_3>
-    linalg.generic #trait_6
-      ins(%sub_3d, %sub_2d, %sub_1d
-        : memref<2x4x8xf32, #map_1>,
-          memref<2x8xf32, #map_2>,
-          memref<4xf32, #map_3>)
-      outs(%sub_out : memref<4xf32, #map_3>)  {
-    ^bb0(%i3d: f32, %i2d: f32, %i1d: f32, %o: f32):
-      %0 = arith.addf %i3d, %i2d : f32
-      %1 = arith.addf %0, %i1d : f32
-      linalg.yield %1 : f32
-    }
-    linalg.yield
-  }
-  return
-}
-// CHECK-LABEL: func @tiled_loop_on_buffers
-// CHECK: iterators[
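
For migration reference: the removed round-trip tests covered both the tensor and the buffer forms of the op. The tensor-based `@tiled_loop` example above corresponds roughly to the following `scf.for` with explicit `iter_args`; this is a hedged sketch (the elementwise `linalg.generic` body is abbreviated to an `arith.addi` on whole slices), not IR produced by this commit:

```mlir
func @tiled_add(%lhs: tensor<24x64xi8>, %rhs: tensor<24x64xi8>,
                %out: tensor<24x64xi8>) -> tensor<24x64xi8> {
  %c0 = arith.constant 0 : index
  %c4 = arith.constant 4 : index
  %c24 = arith.constant 24 : index
  // The output tensor becomes a loop-carried value instead of an outs arg.
  %result = scf.for %i = %c0 to %c24 step %c4
      iter_args(%acc = %out) -> (tensor<24x64xi8>) {
    %lhs_sub = tensor.extract_slice %lhs[%i, 0] [4, 64] [1, 1]
        : tensor<24x64xi8> to tensor<4x64xi8>
    %rhs_sub = tensor.extract_slice %rhs[%i, 0] [4, 64] [1, 1]
        : tensor<24x64xi8> to tensor<4x64xi8>
    // Elementwise add on the tile (stands in for the linalg.generic body).
    %sum = arith.addi %lhs_sub, %rhs_sub : tensor<4x64xi8>
    %updated = tensor.insert_slice %sum into %acc[%i, 0] [4, 64] [1, 1]
        : tensor<4x64xi8> into tensor<24x64xi8>
    scf.yield %updated : tensor<24x64xi8>
  }
  return %result : tensor<24x64xi8>
}
```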

diff --git a/mlir/test/Dialect/Linalg/tile-and-peel-tensors.mlir b/mlir/test/Dialect/Linalg/tile-and-peel-tensors.mlir
index b18b5044246a3..e1b2def4aadee 100644
--- a/mlir/test/Dialect/Linalg/tile-and-peel-tensors.mlir
+++ b/mlir/test/Dialect/Linalg/tile-and-peel-tensors.mlir
@@ -4,12 +4,6 @@
 // RUN: mlir-opt %s -test-linalg-transform-patterns="test-tile-pattern tile-sizes=256,128,512 peeled-loops=1,2" -canonicalize | \
 // RUN:     FileCheck %s -check-prefix=CHECK-PEEL-12
 
-// RUN: mlir-opt %s -test-linalg-transform-patterns="test-tile-pattern tile-sizes=256,128,512 loop-type=tiled_loop peeled-loops=0" -canonicalize | \
-// RUN:     FileCheck %s -check-prefix=CHECK-TILED-LOOP-PEEL-0
-
-// RUN: mlir-opt %s -test-linalg-transform-patterns="test-tile-pattern tile-sizes=256,128,512 loop-type=tiled_loop peeled-loops=0,1" -canonicalize | \
-// RUN:     FileCheck %s -check-prefix=CHECK-TILED-LOOP-PEEL-01
-
 //     CHECK-PEEL-0: func @matmul_static_tensor
 // CHECK-PEEL-0-DAG:   %[[c0:.*]] = arith.constant 0 : index
 // CHECK-PEEL-0-DAG:   %[[c128:.*]] = arith.constant 128 : index
@@ -51,42 +45,6 @@
 //     CHECK-PEEL-12:       linalg.matmul ins({{.*}} : tensor<?x?xf32>, tensor<?x36xf32>) outs({{.*}} : tensor<?x36xf32>)
 //     CHECK-PEEL-12:     }
 //     CHECK-PEEL-12:   }
-
-//     CHECK-TILED-LOOP-PEEL-0: func @matmul_static_tensor
-// CHECK-TILED-LOOP-PEEL-0-DAG:   %[[c0:.*]] = arith.constant 0 : index
-// CHECK-TILED-LOOP-PEEL-0-DAG:   %[[c128:.*]] = arith.constant 128 : index
-// CHECK-TILED-LOOP-PEEL-0-DAG:   %[[c256:.*]] = arith.constant 256 : index
-// CHECK-TILED-LOOP-PEEL-0-DAG:   %[[c512:.*]] = arith.constant 512 : index
-// CHECK-TILED-LOOP-PEEL-0-DAG:   %[[c1280:.*]] = arith.constant 1280 : index
-// CHECK-TILED-LOOP-PEEL-0-DAG:   %[[c1500:.*]] = arith.constant 1500 : index
-// CHECK-TILED-LOOP-PEEL-0-DAG:   %[[c1600:.*]] = arith.constant 1600 : index
-// CHECK-TILED-LOOP-PEEL-0-DAG:   %[[c1700:.*]] = arith.constant 1700 : index
-//     CHECK-TILED-LOOP-PEEL-0:   linalg.tiled_loop (%{{.*}}, %{{.*}}, %{{.*}}) = (%[[c0]], %[[c0]], %[[c0]]) to (%[[c1280]], %[[c1700]], %[[c1600]]) step (%[[c256]], %[[c128]], %[[c512]])
-//     CHECK-TILED-LOOP-PEEL-0:     linalg.matmul ins({{.*}} : tensor<256x?xf32>, tensor<?x?xf32>) outs({{.*}} : tensor<256x?xf32>)
-//     CHECK-TILED-LOOP-PEEL-0:   }
-//     CHECK-TILED-LOOP-PEEL-0:   linalg.tiled_loop (%{{.*}}, %{{.*}}, %{{.*}}) = (%[[c1280]], %[[c0]], %[[c0]]) to (%[[c1500]], %[[c1700]], %[[c1600]]) step (%[[c256]], %[[c128]], %[[c512]])
-//     CHECK-TILED-LOOP-PEEL-0:     linalg.matmul ins({{.*}} : tensor<?x?xf32>, tensor<?x?xf32>) outs({{.*}} : tensor<?x?xf32>)
-//     CHECK-TILED-LOOP-PEEL-0:   }
-
-//     CHECK-TILED-LOOP-PEEL-01: func @matmul_static_tensor
-// CHECK-TILED-LOOP-PEEL-01-DAG:   %[[c0:.*]] = arith.constant 0 : index
-// CHECK-TILED-LOOP-PEEL-01-DAG:   %[[c128:.*]] = arith.constant 128 : index
-// CHECK-TILED-LOOP-PEEL-01-DAG:   %[[c256:.*]] = arith.constant 256 : index
-// CHECK-TILED-LOOP-PEEL-01-DAG:   %[[c512:.*]] = arith.constant 512 : index
-// CHECK-TILED-LOOP-PEEL-01-DAG:   %[[c1280:.*]] = arith.constant 1280 : index
-// CHECK-TILED-LOOP-PEEL-01-DAG:   %[[c1500:.*]] = arith.constant 1500 : index
-// CHECK-TILED-LOOP-PEEL-01-DAG:   %[[c1600:.*]] = arith.constant 1600 : index
-// CHECK-TILED-LOOP-PEEL-01-DAG:   %[[c1664:.*]] = arith.constant 1664 : index
-// CHECK-TILED-LOOP-PEEL-01-DAG:   %[[c1700:.*]] = arith.constant 1700 : index
-//     CHECK-TILED-LOOP-PEEL-01:   linalg.tiled_loop (%{{.*}}, %{{.*}}, %{{.*}}) = (%[[c0]], %[[c0]], %[[c0]]) to (%[[c1280]], %[[c1664]], %[[c1600]]) step (%[[c256]], %[[c128]], %[[c512]])
-//     CHECK-TILED-LOOP-PEEL-01:     linalg.matmul ins({{.*}} : tensor<256x?xf32>, tensor<?x128xf32>) outs({{.*}} : tensor<256x128xf32>)
-//     CHECK-TILED-LOOP-PEEL-01:   }
-//     CHECK-TILED-LOOP-PEEL-01:   linalg.tiled_loop (%{{.*}}, %{{.*}}, %{{.*}}) = (%[[c0]], %[[c1664]], %[[c0]]) to (%[[c1280]], %[[c1700]], %[[c1600]]) step (%[[c256]], %[[c128]], %[[c512]])
-//     CHECK-TILED-LOOP-PEEL-01:     linalg.matmul ins({{.*}} : tensor<256x?xf32>, tensor<?x?xf32>) outs({{.*}} : tensor<256x?xf32>)
-//     CHECK-TILED-LOOP-PEEL-01:   }
-//     CHECK-TILED-LOOP-PEEL-01:   linalg.tiled_loop (%{{.*}}, %{{.*}}, %{{.*}}) = (%[[c1280]], %[[c0]], %[[c0]]) to (%[[c1500]], %[[c1700]], %[[c1600]]) step (%[[c256]], %[[c128]], %[[c512]])
-//     CHECK-TILED-LOOP-PEEL-01:     linalg.matmul ins({{.*}} : tensor<?x?xf32>, tensor<?x?xf32>) outs({{.*}} : tensor<?x?xf32>)
-//     CHECK-TILED-LOOP-PEEL-01:   }
 func @matmul_static_tensor(%arg0: tensor<1500x1600xf32>, %arg1: tensor<1600x1700xf32>)
     -> tensor<1500x1700xf32> {
   %out = linalg.init_tensor [1500, 1700] : tensor<1500x1700xf32>
@@ -138,33 +96,6 @@ func @matmul_static_tensor(%arg0: tensor<1500x1600xf32>, %arg1: tensor<1600x1700
 //     CHECK-PEEL-12:       }
 //     CHECK-PEEL-12:     }
 //     CHECK-PEEL-12:   }
-
-//     CHECK-TILED-LOOP-PEEL-0: func @matmul_dynamic_tensor
-// CHECK-TILED-LOOP-PEEL-0-DAG:   %[[c0:.*]] = arith.constant 0 : index
-// CHECK-TILED-LOOP-PEEL-0-DAG:   %[[c128:.*]] = arith.constant 128 : index
-// CHECK-TILED-LOOP-PEEL-0-DAG:   %[[c256:.*]] = arith.constant 256 : index
-// CHECK-TILED-LOOP-PEEL-0-DAG:   %[[c512:.*]] = arith.constant 512 : index
-//     CHECK-TILED-LOOP-PEEL-0:   linalg.tiled_loop (%{{.*}}, %{{.*}}, %{{.*}}) = (%[[c0]], %[[c0]], %[[c0]]) to (%{{.*}}, %{{.*}}, %{{.*}}) step (%[[c256]], %[[c128]], %[[c512]])
-//     CHECK-TILED-LOOP-PEEL-0:     linalg.matmul ins({{.*}} : tensor<256x?xf32>, tensor<?x?xf32>) outs({{.*}} : tensor<256x?xf32>)
-//     CHECK-TILED-LOOP-PEEL-0:   }
-//     CHECK-TILED-LOOP-PEEL-0:   linalg.tiled_loop (%{{.*}}, %{{.*}}, %{{.*}}) = (%{{.*}}, %[[c0]], %[[c0]]) to (%{{.*}}, %{{.*}}, %{{.*}}) step (%[[c256]], %[[c128]], %[[c512]])
-//     CHECK-TILED-LOOP-PEEL-0:     linalg.matmul ins({{.*}} : tensor<?x?xf32>, tensor<?x?xf32>) outs({{.*}} : tensor<?x?xf32>)
-//     CHECK-TILED-LOOP-PEEL-0:   }
-
-//     CHECK-TILED-LOOP-PEEL-01: func @matmul_dynamic_tensor
-// CHECK-TILED-LOOP-PEEL-01-DAG:   %[[c0:.*]] = arith.constant 0 : index
-// CHECK-TILED-LOOP-PEEL-01-DAG:   %[[c128:.*]] = arith.constant 128 : index
-// CHECK-TILED-LOOP-PEEL-01-DAG:   %[[c256:.*]] = arith.constant 256 : index
-// CHECK-TILED-LOOP-PEEL-01-DAG:   %[[c512:.*]] = arith.constant 512 : index
-//     CHECK-TILED-LOOP-PEEL-01:   linalg.tiled_loop (%{{.*}}, %{{.*}}, %{{.*}}) = (%[[c0]], %[[c0]], %[[c0]]) to (%{{.*}}, %{{.*}}, %{{.*}}) step (%[[c256]], %[[c128]], %[[c512]])
-//     CHECK-TILED-LOOP-PEEL-01:     linalg.matmul ins({{.*}} : tensor<256x?xf32>, tensor<?x128xf32>) outs({{.*}} : tensor<256x128xf32>)
-//     CHECK-TILED-LOOP-PEEL-01:   }
-//     CHECK-TILED-LOOP-PEEL-01:   linalg.tiled_loop (%{{.*}}, %{{.*}}, %{{.*}}) = (%[[c0]], %{{.*}}, %[[c0]]) to (%{{.*}}, %{{.*}}, %{{.*}}) step (%[[c256]], %[[c128]], %[[c512]])
-//     CHECK-TILED-LOOP-PEEL-01:     linalg.matmul ins({{.*}} : tensor<256x?xf32>, tensor<?x?xf32>) outs({{.*}} : tensor<256x?xf32>)
-//     CHECK-TILED-LOOP-PEEL-01:   }
-//     CHECK-TILED-LOOP-PEEL-01:   linalg.tiled_loop (%{{.*}}, %{{.*}}, %{{.*}}) = (%{{.*}}, %[[c0]], %[[c0]]) to (%{{.*}}, %{{.*}}, %{{.*}}) step (%[[c256]], %[[c128]], %[[c512]])
-//     CHECK-TILED-LOOP-PEEL-01:     linalg.matmul ins({{.*}} : tensor<?x?xf32>, tensor<?x?xf32>) outs({{.*}} : tensor<?x?xf32>)
-//     CHECK-TILED-LOOP-PEEL-01:   }
 func @matmul_dynamic_tensor(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>)
     -> tensor<?x?xf32> {
   %c0 = arith.constant 0 : index

diff --git a/mlir/test/Dialect/Linalg/tile-tensors.mlir b/mlir/test/Dialect/Linalg/tile-tensors.mlir
index 3e56c21b049c3..a1a65fa289104 100644
--- a/mlir/test/Dialect/Linalg/tile-tensors.mlir
+++ b/mlir/test/Dialect/Linalg/tile-tensors.mlir
@@ -1,5 +1,4 @@
 // RUN: mlir-opt %s -linalg-tile="tile-sizes=2,3,4" -split-input-file | FileCheck %s
-// RUN: mlir-opt %s -linalg-tile="tile-sizes=2,3,4 loop-type=tiled_loop distribution-types=block_x,block_y,none" -split-input-file | FileCheck %s -check-prefix=TLOOP
 
 // CHECK-LABEL: func @matmul_tensors(
 // CHECK-SAME:    %[[TA:[0-9a-z]+]]: tensor<?x?xf32>
@@ -28,39 +27,6 @@ func @matmul_tensors(
   return %0 : tensor<?x?xf32>
 }
 
-// TLOOP-LABEL: func @matmul_tensors
-// TLOOP-SAME: (%[[ARG_0:.*]]: [[TY:.*]], %[[ARG_1:.*]]: [[TY]],
-// TLOOP-SAME: %[[ARG_2:.*]]: [[TY]]) -> [[TY]] {
-
-// TLOOP-DAG: %[[C0:.*]] = arith.constant 0 : index
-// TLOOP-DAG: %[[C1:.*]] = arith.constant 1 : index
-// TLOOP-DAG: %[[C2:.*]] = arith.constant 2 : index
-// TLOOP-DAG: %[[C3:.*]] = arith.constant 3 : index
-// TLOOP-DAG: %[[C4:.*]] = arith.constant 4 : index
-
-// TLOOP: %[[ARG_0_X:.*]] = tensor.dim %[[ARG_0]], %[[C0]] : [[TY]]
-// TLOOP: %[[ARG_0_Y:.*]] = tensor.dim %[[ARG_0]], %[[C1]] : [[TY]]
-// TLOOP: %[[ARG_1_Y:.*]] = tensor.dim %[[ARG_1]], %[[C1]] : [[TY]]
-
-// TLOOP: %{{.*}} = linalg.tiled_loop (%[[I:.*]], %[[J:.*]], %[[K:.*]]) =
-// TLOOP-SAME: (%[[C0]], %[[C0]], %[[C0]])
-// TLOOP-SAME: to (%[[ARG_0_X]], %[[ARG_1_Y]], %[[ARG_0_Y]])
-// TLOOP-SAME: step (%[[C2]], %[[C3]], %[[C4]])
-// TLOOP-SAME: ins (%[[A0:.*]] = %[[ARG_0]]: [[TY]], %[[A1:.*]] = %[[ARG_1]]: [[TY]])
-// TLOOP-SAME: outs (%[[A2:.*]] = %[[ARG_2]]: [[TY]])
-// TLOOP-SAME: iterators["parallel", "parallel", "reduction"]
-// TLOOP-SAME: distribution["block_x", "block_y", "none"] {
-
-// TLOOP: %[[SUB_ARG_0:.*]] = tensor.extract_slice %[[A0]][%[[I]], %[[K]]]
-// TLOOP: %[[SUB_ARG_1:.*]] = tensor.extract_slice %[[A1]][%[[K]], %[[J]]]
-// TLOOP: %[[SUB_ARG_2:.*]] = tensor.extract_slice %[[A2]][%[[I]], %[[J]]]
-
-// TLOOP: %[[PROD:.*]] = linalg.matmul ins(%[[SUB_ARG_0]], %[[SUB_ARG_1]]
-// TLOOP-SE: outs(%[[SUB_ARG_2]] : [[TY]]) -> [[TY]]
-
-// TLOOP: %[[O:.*]] = tensor.insert_slice %[[PROD]] into %[[A2]][%[[I]], %[[J]]]
-// TLOOP: linalg.yield %[[O]] : [[TY]]
-
 // -----
 
 func @generic_op_tensors(
@@ -108,29 +74,6 @@ func @generic_op_tensors(
 //       CHECK: }
 //       CHECK: return %[[TD0]]
 
-// TLOOP-LABEL: func @generic_op_tensors(
-// TLOOP-SAME:    %[[ARG_0:.*]]: [[TY:.*]],
-// TLOOP-SAME:    %[[ARG_1:.*]]: [[TY]]) -> [[TY]] {
-
-// TLOOP-DAG: %[[C0:.*]] = arith.constant 0 : index
-// TLOOP-DAG: %[[C1:.*]] = arith.constant 1 : index
-// TLOOP-DAG: %[[C2:.*]] = arith.constant 2 : index
-// TLOOP-DAG: %[[C3:.*]] = arith.constant 3 : index
-// TLOOP-DAG: %[[C4:.*]] = arith.constant 4 : index
-
-// TLOOP:     %[[INIT:.*]] = linalg.init_tensor
-// TLOOP:     %[[ARG_0_X:.*]] = tensor.dim %[[ARG_0]], %[[C0]] : [[TY]]
-// TLOOP:     %[[ARG_0_Y:.*]] = tensor.dim %[[ARG_0]], %[[C1]] : [[TY]]
-// TLOOP:     %[[ARG_0_Z:.*]] = tensor.dim %[[ARG_0]], %[[C2]] : [[TY]]
-
-// TLOOP:     %{{.*}} = linalg.tiled_loop (%{{.*}}, %{{.*}}, %{{.*}}) =
-// TLOOP-SAME: (%[[C0]], %[[C0]], %[[C0]])
-// TLOOP-SAME: to (%[[ARG_0_X]], %[[ARG_0_Y]], %[[ARG_0_Z]])
-// TLOOP-SAME: step (%[[C2]], %[[C3]], %[[C4]])
-// TLOOP-SAME: ins (%{{.*}} = %[[ARG_0]]: [[TY]], %{{.*}} = %[[ARG_1]]: [[TY]])
-// TLOOP-SAME: outs (%{{.*}} = %[[INIT]]: [[TY]])
-// TLOOP-SAME: distribution["block_x", "block_y", "none"] {
-
 // -----
 
 //  CHECK-DAG:  #[[MAP0:.*]] = affine_map<(d0)[s0] -> (2, -d0 + s0)>

diff --git a/mlir/test/Dialect/Linalg/tiled-loop-peeling.mlir b/mlir/test/Dialect/Linalg/tiled-loop-peeling.mlir
deleted file mode 100644
index 106fcee1b130e..0000000000000
--- a/mlir/test/Dialect/Linalg/tiled-loop-peeling.mlir
+++ /dev/null
@@ -1,231 +0,0 @@
-// RUN: mlir-opt %s -allow-unregistered-dialect -test-linalg-transform-patterns=test-tiled-loop-peeling=2 -split-input-file | FileCheck %s -check-prefix=CHECK-TILE-2
-// RUN: mlir-opt %s -allow-unregistered-dialect -test-linalg-transform-patterns=test-tiled-loop-peeling=0,1,2 -split-input-file | FileCheck %s -check-prefix=CHECK-TILE-012
-// RUN: mlir-opt %s -allow-unregistered-dialect -test-linalg-transform-patterns="test-tiled-loop-peeling=0,1,2 skip-partial" -split-input-file | FileCheck %s -check-prefix=CHECK-TILE-012-SKIP-PARTIAL
-
-// CHECK-TILE-2-LABEL: func @tiled_loop_3d_tensor(
-//  CHECK-TILE-2-SAME:     %[[input:.*]]: tensor<?x?x?xf32>, %[[s0:.*]]: index, %[[s1:.*]]: index, %[[s2:.*]]: index
-//   CHECK-TILE-2-DAG:   %[[c0:.*]] = arith.constant 0 : index
-//   CHECK-TILE-2-DAG:   %[[c1:.*]] = arith.constant 1 : index
-//   CHECK-TILE-2-DAG:   %[[c2:.*]] = arith.constant 2 : index
-//       CHECK-TILE-2:   %[[dim0:.*]] = tensor.dim %[[input]], %[[c0]]
-//       CHECK-TILE-2:   %[[dim1:.*]] = tensor.dim %[[input]], %[[c1]]
-//       CHECK-TILE-2:   %[[dim2:.*]] = tensor.dim %[[input]], %[[c2]]
-//       CHECK-TILE-2:   %[[init_tensor:.*]] = linalg.init_tensor
-//       CHECK-TILE-2:   %[[split_bound:.*]] = affine.apply
-//       CHECK-TILE-2:   %[[r1:.*]] = linalg.tiled_loop (%[[iv0:.*]], %[[iv1:.*]], %[[iv2:.*]]) = (%[[c0]], %[[c0]], %[[c0]])
-//  CHECK-TILE-2-SAME:       to (%[[dim0]], %[[dim1]], %[[split_bound]])
-//  CHECK-TILE-2-SAME:       step (%[[s0]], %[[s1]], %[[s2]])
-//  CHECK-TILE-2-SAME:       ins (%[[loop_in1:.*]] = %[[input]]: tensor<?x?x?xf32>)
-//  CHECK-TILE-2-SAME:       outs (%[[loop_out1:.*]] = %[[init_tensor]]: tensor<?x?x?xf32>) {
-//       CHECK-TILE-2:     %[[min0_1:.*]] = affine.min
-//       CHECK-TILE-2:     %[[min1_1:.*]] = affine.min
-//       CHECK-TILE-2:     %[[in_slice1:.*]] = tensor.extract_slice %[[loop_in1]][%[[iv0]], %[[iv1]], %[[iv2]]] [%[[min0_1]], %[[min1_1]], %[[s2]]]
-//       CHECK-TILE-2:     %[[out_slice1:.*]] = tensor.extract_slice %[[loop_out1]][%[[iv0]], %[[iv1]], %[[iv2]]] [%[[min0_1]], %[[min1_1]], %[[s2]]]
-//       CHECK-TILE-2:     %[[mod_slice1:.*]] = tensor.insert_slice %{{.*}} into %[[loop_out1]][%[[iv0]], %[[iv1]], %[[iv2]]] [%[[min0_1]], %[[min1_1]], %[[s2]]]
-//       CHECK-TILE-2:     linalg.yield %[[mod_slice1]]
-//       CHECK-TILE-2:   %[[r2:.*]] = linalg.tiled_loop (%[[iv0:.*]], %[[iv1:.*]], %[[iv2:.*]]) = (%[[c0]], %[[c0]], %[[split_bound]])
-//  CHECK-TILE-2-SAME:       to (%[[dim0]], %[[dim1]], %[[dim2]])
-//  CHECK-TILE-2-SAME:       step (%[[s0]], %[[s1]], %[[s2]])
-//  CHECK-TILE-2-SAME:       ins (%[[loop_in2:.*]] = %[[input]]: tensor<?x?x?xf32>)
-//  CHECK-TILE-2-SAME:       outs (%[[loop_out2:.*]] = %[[r1]]: tensor<?x?x?xf32>) {
-//       CHECK-TILE-2:     %[[min0_2:.*]] = affine.min
-//       CHECK-TILE-2:     %[[min1_2:.*]] = affine.min
-//       CHECK-TILE-2:     %[[apply2:.*]] = affine.apply
-//       CHECK-TILE-2:     %[[in_slice2:.*]] = tensor.extract_slice %[[loop_in1]][%[[iv0]], %[[iv1]], %[[iv2]]] [%[[min0_2]], %[[min1_2]], %[[apply2]]]
-//       CHECK-TILE-2:     %[[out_slice2:.*]] = tensor.extract_slice %[[loop_out1]][%[[iv0]], %[[iv1]], %[[iv2]]] [%[[min0_2]], %[[min1_2]], %[[apply2]]]
-//       CHECK-TILE-2:     %[[mod_slice2:.*]] = tensor.insert_slice %{{.*}} into %[[loop_out1]][%[[iv0]], %[[iv1]], %[[iv2]]] [%[[min0_2]], %[[min1_2]], %[[apply2]]]
-//       CHECK-TILE-2:     linalg.yield %[[mod_slice2]]
-//       CHECK-TILE-2:   return %[[r2]]
-
-// CHECK-TILE-012-LABEL: func @tiled_loop_3d_tensor
-//       CHECK-TILE-012:   linalg.tiled_loop {{.*}} {
-//       CHECK-TILE-012:     linalg.yield
-//       CHECK-TILE-012:   }
-//       CHECK-TILE-012:   linalg.tiled_loop {{.*}} {
-//       CHECK-TILE-012:     linalg.yield
-//       CHECK-TILE-012:   }
-//       CHECK-TILE-012:   linalg.tiled_loop {{.*}} {
-//       CHECK-TILE-012:     linalg.yield
-//       CHECK-TILE-012:   }
-//       CHECK-TILE-012:   linalg.tiled_loop {{.*}} {
-//       CHECK-TILE-012:     linalg.yield
-//       CHECK-TILE-012:   }
-//       CHECK-TILE-012:   linalg.tiled_loop {{.*}} {
-//       CHECK-TILE-012:     linalg.yield
-//       CHECK-TILE-012:   }
-//       CHECK-TILE-012:   linalg.tiled_loop {{.*}} {
-//       CHECK-TILE-012:     linalg.yield
-//       CHECK-TILE-012:   }
-//       CHECK-TILE-012:   linalg.tiled_loop {{.*}} {
-//       CHECK-TILE-012:     linalg.yield
-//       CHECK-TILE-012:   }
-//       CHECK-TILE-012:   linalg.tiled_loop {{.*}} {
-//       CHECK-TILE-012:     linalg.yield
-//       CHECK-TILE-012:   }
-//   CHECK-TILE-012-NOT: linalg.tiled_loop
-
-//      CHECK-TILE-012-SKIP-PARTIAL: func @tiled_loop_3d_tensor(
-// CHECK-TILE-012-SKIP-PARTIAL-SAME:     %[[input:.*]]: tensor<?x?x?xf32>
-//  CHECK-TILE-012-SKIP-PARTIAL-DAG:   %[[c0:.*]] = arith.constant 0 : index
-//  CHECK-TILE-012-SKIP-PARTIAL-DAG:   %[[c1:.*]] = arith.constant 1 : index
-//  CHECK-TILE-012-SKIP-PARTIAL-DAG:   %[[c2:.*]] = arith.constant 2 : index
-//  CHECK-TILE-012-SKIP-PARTIAL-DAG:   %[[dim0:.*]] = tensor.dim %[[input]], %[[c0]]
-//  CHECK-TILE-012-SKIP-PARTIAL-DAG:   %[[dim1:.*]] = tensor.dim %[[input]], %[[c1]]
-//  CHECK-TILE-012-SKIP-PARTIAL-DAG:   %[[dim2:.*]] = tensor.dim %[[input]], %[[c2]]
-//      CHECK-TILE-012-SKIP-PARTIAL:   %[[p0:.*]] = affine.apply #{{.*}}()[%[[dim0]]
-//      CHECK-TILE-012-SKIP-PARTIAL:   %[[p1:.*]] = affine.apply #{{.*}}()[%[[dim1]]
-//      CHECK-TILE-012-SKIP-PARTIAL:   %[[p2:.*]] = affine.apply #{{.*}}()[%[[dim2]]
-//      CHECK-TILE-012-SKIP-PARTIAL:   linalg.tiled_loop {{.*}} = (%[[c0]], %[[c0]], %[[c0]]) to (%[[p0]], %[[p1]], %[[p2]])
-//      CHECK-TILE-012-SKIP-PARTIAL:   linalg.tiled_loop {{.*}} = (%[[c0]], %[[c0]], %[[p2]]) to (%[[p0]], %[[p1]], %[[dim2]])
-//      CHECK-TILE-012-SKIP-PARTIAL:   linalg.tiled_loop {{.*}} = (%[[c0]], %[[p1]], %[[c0]]) to (%[[p0]], %[[dim1]], %[[dim2]])
-//      CHECK-TILE-012-SKIP-PARTIAL:   linalg.tiled_loop {{.*}} = (%[[p0]], %[[c0]], %[[c0]]) to (%[[dim0]], %[[dim1]], %[[dim2]])
-func @tiled_loop_3d_tensor(%arg0: tensor<?x?x?xf32>, %s0: index, %s1: index,
-                           %s2: index) -> tensor<?x?x?xf32> {
-  %cst = arith.constant 0.000000e+00 : f32
-  %c0 = arith.constant 0 : index
-  %c1 = arith.constant 1 : index
-  %c2 = arith.constant 2 : index
-  %c8 = arith.constant 8 : index
-  %dim0 = tensor.dim %arg0, %c0 : tensor<?x?x?xf32>
-  %dim1 = tensor.dim %arg0, %c1 : tensor<?x?x?xf32>
-  %dim2 = tensor.dim %arg0, %c2 : tensor<?x?x?xf32>
-  %output = linalg.init_tensor [%dim0, %dim1, %dim2] : tensor<?x?x?xf32>
-  %result = linalg.tiled_loop
-           (%arg1, %arg2, %arg3) = (%c0, %c0, %c0) to (%dim0, %dim1, %dim2)
-           step (%s0, %s1, %s2) ins (%arg4 = %arg0: tensor<?x?x?xf32>)
-           outs (%arg5 = %output: tensor<?x?x?xf32>) {
-    %min0 = affine.min affine_map<(d0, d1)[s0] -> (d1, -d0 + s0)>(%arg1, %s0)[%dim0]
-    %min1 = affine.min affine_map<(d0, d1)[s0] -> (d1, -d0 + s0)>(%arg2, %s1)[%dim1]
-    %min2 = affine.min affine_map<(d0, d1)[s0] -> (d1, -d0 + s0)>(%arg3, %s2)[%dim2]
-    %in_slice = tensor.extract_slice %arg4[%arg1, %arg2, %arg3] [%min0, %min1, %min2] [1, 1, 1]: tensor<?x?x?xf32> to tensor<?x?x?xf32>
-    %out_slice = tensor.extract_slice %arg5[%arg1, %arg2, %arg3] [%min0, %min1, %min2] [1, 1, 1] : tensor<?x?x?xf32> to tensor<?x?x?xf32>
-    %comp = "computation"(%in_slice, %out_slice) : (tensor<?x?x?xf32>, tensor<?x?x?xf32>) -> tensor<?x?x?xf32>
-    %updated_slice = tensor.insert_slice %comp into %arg5[%arg1, %arg2, %arg3] [%min0, %min1, %min2] [1, 1, 1] : tensor<?x?x?xf32> into tensor<?x?x?xf32>
-    linalg.yield %updated_slice : tensor<?x?x?xf32>
-  }
-  return %result : tensor<?x?x?xf32>
-}
-
-// -----
-
-// CHECK-TILE-2-LABEL: func @tiled_loop_3d_memref(
-//  CHECK-TILE-2-SAME:     %[[input:.*]]: memref<?x?x?xf32>, %[[output:.*]]: memref<?x?x?xf32>, %[[s0:.*]]: index, %[[s1:.*]]: index, %[[s2:.*]]: index
-//   CHECK-TILE-2-DAG:   %[[c0:.*]] = arith.constant 0 : index
-//   CHECK-TILE-2-DAG:   %[[c1:.*]] = arith.constant 1 : index
-//   CHECK-TILE-2-DAG:   %[[c2:.*]] = arith.constant 2 : index
-//       CHECK-TILE-2:   %[[dim0:.*]] = memref.dim %[[input]], %[[c0]]
-//       CHECK-TILE-2:   %[[dim1:.*]] = memref.dim %[[input]], %[[c1]]
-//       CHECK-TILE-2:   %[[dim2:.*]] = memref.dim %[[input]], %[[c2]]
-//       CHECK-TILE-2:   %[[split_bound:.*]] = affine.apply
-//       CHECK-TILE-2:   linalg.tiled_loop (%[[iv0:.*]], %[[iv1:.*]], %[[iv2:.*]]) = (%[[c0]], %[[c0]], %[[c0]])
-//  CHECK-TILE-2-SAME:       to (%[[dim0]], %[[dim1]], %[[split_bound]])
-//  CHECK-TILE-2-SAME:       step (%[[s0]], %[[s1]], %[[s2]])
-//  CHECK-TILE-2-SAME:       ins (%[[loop_in1:.*]] = %[[input]]: memref<?x?x?xf32>)
-//  CHECK-TILE-2-SAME:       outs (%[[loop_out1:.*]] = %[[output]]: memref<?x?x?xf32>) {
-//       CHECK-TILE-2:     %[[min0_1:.*]] = affine.min
-//       CHECK-TILE-2:     %[[min1_1:.*]] = affine.min
-//       CHECK-TILE-2:     memref.subview %[[loop_in1]][%[[iv0]], %[[iv1]], %[[iv2]]] [%[[min0_1]], %[[min1_1]], %[[s2]]]
-//       CHECK-TILE-2:     linalg.yield
-//       CHECK-TILE-2:   linalg.tiled_loop (%[[iv0:.*]], %[[iv1:.*]], %[[iv2:.*]]) = (%[[c0]], %[[c0]], %[[split_bound]])
-//  CHECK-TILE-2-SAME:       to (%[[dim0]], %[[dim1]], %[[dim2]])
-//  CHECK-TILE-2-SAME:       step (%[[s0]], %[[s1]], %[[s2]])
-//  CHECK-TILE-2-SAME:       ins (%[[loop_in2:.*]] = %[[input]]: memref<?x?x?xf32>)
-//  CHECK-TILE-2-SAME:       outs (%[[loop_out2:.*]] = %[[output]]: memref<?x?x?xf32>) {
-//       CHECK-TILE-2:     %[[min0_2:.*]] = affine.min
-//       CHECK-TILE-2:     %[[min1_2:.*]] = affine.min
-//       CHECK-TILE-2:     %[[apply2:.*]] = affine.apply
-//       CHECK-TILE-2:     memref.subview %[[loop_in1]][%[[iv0]], %[[iv1]], %[[iv2]]] [%[[min0_2]], %[[min1_2]], %[[apply2]]]
-//       CHECK-TILE-2:     linalg.yield
-//       CHECK-TILE-2:   return
-
-// CHECK-TILE-012-LABEL: func @tiled_loop_3d_memref
-
-!memref_subview_type = type memref<?x?x?xf32, affine_map<(d0, d1, d2)[s0, s1, s2] -> (d0 * s1 + s0 + d1 * s2 + d2)>>
-
-func @tiled_loop_3d_memref(%arg0: memref<?x?x?xf32>, %output: memref<?x?x?xf32>,
-                           %s0: index, %s1: index, %s2: index) {
-  %cst = arith.constant 0.000000e+00 : f32
-  %c0 = arith.constant 0 : index
-  %c1 = arith.constant 1 : index
-  %c2 = arith.constant 2 : index
-  %c8 = arith.constant 8 : index
-  %dim0 = memref.dim %arg0, %c0 : memref<?x?x?xf32>
-  %dim1 = memref.dim %arg0, %c1 : memref<?x?x?xf32>
-  %dim2 = memref.dim %arg0, %c2 : memref<?x?x?xf32>
-  linalg.tiled_loop
-           (%arg1, %arg2, %arg3) = (%c0, %c0, %c0) to (%dim0, %dim1, %dim2)
-           step (%s0, %s1, %s2) ins (%arg4 = %arg0: memref<?x?x?xf32>)
-           outs (%arg5 = %output : memref<?x?x?xf32>) {
-    %min0 = affine.min affine_map<(d0, d1)[s0] -> (d1, -d0 + s0)>(%arg1, %s0)[%dim0]
-    %min1 = affine.min affine_map<(d0, d1)[s0] -> (d1, -d0 + s0)>(%arg2, %s1)[%dim1]
-    %min2 = affine.min affine_map<(d0, d1)[s0] -> (d1, -d0 + s0)>(%arg3, %s2)[%dim2]
-    %in_slice = memref.subview %arg4[%arg1, %arg2, %arg3] [%min0, %min1, %min2] [1, 1, 1]: memref<?x?x?xf32> to !memref_subview_type
-    "computation"(%in_slice) : (!memref_subview_type) -> memref<?x?x?xf32>
-    linalg.yield
-  }
-  return
-}
-
-// -----
-
-// CHECK-TILE-2-LABEL: func @step_1_do_not_peel
-//       CHECK-TILE-2:   linalg.tiled_loop
-//   CHECK-TILE-2-NOT:   linalg.tiled_loop
-
-// CHECK-TILE-012-LABEL: func @step_1_do_not_peel
-
-func @step_1_do_not_peel(%arg0: tensor<?x?x?xf32>) -> tensor<?x?x?xf32> {
-  %cst = arith.constant 0.000000e+00 : f32
-  %c0 = arith.constant 0 : index
-  %c1 = arith.constant 1 : index
-  %c2 = arith.constant 2 : index
-  %c8 = arith.constant 8 : index
-  %dim0 = tensor.dim %arg0, %c0 : tensor<?x?x?xf32>
-  %dim1 = tensor.dim %arg0, %c1 : tensor<?x?x?xf32>
-  %dim2 = tensor.dim %arg0, %c2 : tensor<?x?x?xf32>
-  %output = linalg.init_tensor [%dim0, %dim1, %dim2] : tensor<?x?x?xf32>
-  %result = linalg.tiled_loop
-           (%arg1, %arg2, %arg3) = (%c0, %c0, %c0) to (%dim0, %dim1, %dim2)
-           step (%c1, %c1, %c1) ins (%arg4 = %arg0: tensor<?x?x?xf32>)
-           outs (%arg5 = %output: tensor<?x?x?xf32>) {
-    %in_slice = tensor.extract_slice %arg4[%arg1, %arg2, %arg3] [%c1, %c1, %c1] [1, 1, 1]: tensor<?x?x?xf32> to tensor<?x?x?xf32>
-    %out_slice = tensor.extract_slice %arg5[%arg1, %arg2, %arg3] [%c1, %c1, %c1] [1, 1, 1] : tensor<?x?x?xf32> to tensor<?x?x?xf32>
-    %comp = "computation"(%in_slice, %out_slice) : (tensor<?x?x?xf32>, tensor<?x?x?xf32>) -> tensor<?x?x?xf32>
-    %updated_slice = tensor.insert_slice %comp into %arg5[%arg1, %arg2, %arg3] [%c1, %c1, %c1] [1, 1, 1] : tensor<?x?x?xf32> into tensor<?x?x?xf32>
-    linalg.yield %updated_slice : tensor<?x?x?xf32>
-  }
-  return %result : tensor<?x?x?xf32>
-}
-
-// -----
-
-// CHECK-TILE-2-LABEL: func @divides_evenly_do_not_peel
-//       CHECK-TILE-2:   linalg.tiled_loop
-//   CHECK-TILE-2-NOT:   linalg.tiled_loop
-
-// CHECK-TILE-012-LABEL: func @divides_evenly_do_not_peel
-
-func @divides_evenly_do_not_peel(%arg0: tensor<?x?x?xf32>, %s: index)
-    -> tensor<?x?x?xf32> {
-  %cst = arith.constant 0.000000e+00 : f32
-  %c0 = arith.constant 0 : index
-  %c1 = arith.constant 1 : index
-  %c2 = arith.constant 2 : index
-  %c8 = arith.constant 8 : index
-  %c64 = arith.constant 64 : index
-  %dim0 = tensor.dim %arg0, %c0 : tensor<?x?x?xf32>
-  %dim1 = tensor.dim %arg0, %c1 : tensor<?x?x?xf32>
-  %dim2 = tensor.dim %arg0, %c2 : tensor<?x?x?xf32>
-  %output = linalg.init_tensor [%dim0, %dim1, %dim2] : tensor<?x?x?xf32>
-  %result = linalg.tiled_loop
-           (%arg1, %arg2, %arg3) = (%c0, %c0, %c0) to (%dim0, %dim1, %c64)
-           step (%s, %s, %c8) ins (%arg4 = %arg0: tensor<?x?x?xf32>)
-           outs (%arg5 = %output: tensor<?x?x?xf32>) {
-    %in_slice = tensor.extract_slice %arg4[%arg1, %arg2, %arg3] [%c1, %c1, %c1] [1, 1, 1]: tensor<?x?x?xf32> to tensor<?x?x?xf32>
-    %out_slice = tensor.extract_slice %arg5[%arg1, %arg2, %arg3] [%c1, %c1, %c1] [1, 1, 1] : tensor<?x?x?xf32> to tensor<?x?x?xf32>
-    %comp = "computation"(%in_slice, %out_slice) : (tensor<?x?x?xf32>, tensor<?x?x?xf32>) -> tensor<?x?x?xf32>
-    %updated_slice = tensor.insert_slice %comp into %arg5[%arg1, %arg2, %arg3] [%c1, %c1, %c1] [1, 1, 1] : tensor<?x?x?xf32> into tensor<?x?x?xf32>
-    linalg.yield %updated_slice : tensor<?x?x?xf32>
-  }
-  return %result : tensor<?x?x?xf32>
-}
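
Background on the deleted file: it tested peeling, i.e. splitting a loop into a main loop containing only full iterations and a remainder loop for the final partial iteration. A sketch of that rewrite on a plain `scf.for`, with `%lb`, `%ub` and `%step` standing in for the loop bounds and the split bound spelled out with `arith` ops rather than the `affine.apply` the pattern emitted (the `"computation"` op is unregistered, as in the deleted tests):

```mlir
// %split is the largest bound below %ub reachable in full steps from %lb:
// %split = %ub - ((%ub - %lb) mod %step).
%diff  = arith.subi %ub, %lb : index
%rem   = arith.remui %diff, %step : index
%split = arith.subi %ub, %rem : index
// Main loop: every iteration processes a full tile of size %step.
scf.for %i = %lb to %split step %step {
  "computation"(%i, %step) : (index, index) -> ()
}
// Remainder loop: runs at most once, on the leftover partial tile.
scf.for %i = %split to %ub step %step {
  %tile = arith.subi %ub, %i : index
  "computation"(%i, %tile) : (index, index) -> ()
}
```

When the step is 1 or the step divides the trip count evenly, the remainder is empty, which is exactly what the `@step_1_do_not_peel` and `@divides_evenly_do_not_peel` tests above checked.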

diff --git a/mlir/test/Dialect/Linalg/tiled-loop-to-scf.mlir b/mlir/test/Dialect/Linalg/tiled-loop-to-scf.mlir
deleted file mode 100644
index 08d81b4f96411..0000000000000
--- a/mlir/test/Dialect/Linalg/tiled-loop-to-scf.mlir
+++ /dev/null
@@ -1,184 +0,0 @@
-// RUN: mlir-opt %s -convert-linalg-tiled-loops-to-scf --split-input-file | FileCheck %s
-
-
-#map0 = affine_map<(d0) -> (24, -d0 + 192)>
-#map1 = affine_map<(d0, d1)[s0] -> (d0 * 192 + s0 + d1)>
-#map2 = affine_map<(d0) -> (16, -d0 + 192)>
-
-func @tiled_loop(%A: memref<192x192xf32>,
-                 %B: memref<192x192xf32>,
-                 %C: memref<192x192xf32>) {
-  %cst = arith.constant 0.000000e+00 : f32
-  %c24 = arith.constant 24 : index
-  %c16 = arith.constant 16 : index
-  %c0 = arith.constant 0 : index
-  %c192 = arith.constant 192 : index
-
-  linalg.tiled_loop (%i, %j) = (%c0, %c0) to (%c192, %c192) step (%c24, %c16)
-      ins (%A_ = %A: memref<192x192xf32>, %B_ = %B:  memref<192x192xf32>)
-      outs (%C_ = %C: memref<192x192xf32>) {
-    %0 = affine.min #map0(%i)
-    %1 = memref.subview %A_[%i, 0] [%0, 192] [1, 1]
-      : memref<192x192xf32> to memref<?x192xf32, #map1>
-    %2 = affine.min #map2(%j)
-    %3 = memref.subview %B_[0, %j] [192, %2] [1, 1]
-      : memref<192x192xf32> to memref<192x?xf32, #map1>
-    %4 = memref.subview %C_[%i, %j] [%0, %2] [1, 1]
-      : memref<192x192xf32> to memref<?x?xf32, #map1>
-    linalg.fill(%cst, %4) : f32, memref<?x?xf32, #map1>
-    linalg.matmul ins(%1, %3 : memref<?x192xf32, #map1>,
-                               memref<192x?xf32, #map1>)
-                  outs(%4 : memref<?x?xf32, #map1>)
-    linalg.yield
-  }
-  return
-}
-
-// CHECK-LABEL: @tiled_loop
-// CHECK-SAME:  %[[A:.*]]: memref<192x192xf32>, %[[B:.*]]: memref<192x192xf32>,
-// CHECK-SAME:  %[[C:.*]]: memref<192x192xf32>) {
-// CHECK:       %[[C24:.*]] = arith.constant 24 : index
-// CHECK:       %[[C16:.*]] = arith.constant 16 : index
-// CHECK:       %[[C0:.*]] = arith.constant 0 : index
-// CHECK:       %[[C192:.*]] = arith.constant 192 : index
-// CHECK:       scf.parallel (%[[I:.*]], %[[J:.*]]) = (%[[C0]], %[[C0]])
-// CHECK-SAME:      to (%[[C192]], %[[C192]]) step (%[[C24]], %[[C16]]) {
-// CHECK:         %[[A_sub:.*]] = memref.subview %[[A]][%[[I]]
-// CHECK:         %[[B_sub:.*]] = memref.subview %[[B]][0, %[[J]]]
-// CHECK:         %[[C_sub:.*]] = memref.subview %[[C]][%[[I]]
-// CHECK:         linalg.fill
-// CHECK:         linalg.matmul
-
-// -----
-
-func @tiled_loop_reduction(%A: memref<192x192xf32>,
-                           %B: memref<192x192xf32>,
-                           %C: memref<f32>) {
-   %c24 = arith.constant 24 : index
-   %c16 = arith.constant 16 : index
-   %c0 = arith.constant 0 : index
-   %c192 = arith.constant 192 : index
-   %cst = arith.constant 0.000000e+00 : f32
-
-  linalg.tiled_loop (%i, %j) = (%c0, %c0) to (%c192, %c192) step (%c24, %c16)
-      ins (%A_ = %A: memref<192x192xf32>, %B_ = %B:  memref<192x192xf32>)
-      outs (%C_ = %C: memref<f32>)
-      iterators["reduction", "reduction"] {
-    linalg.fill(%cst, %A_) : f32, memref<192x192xf32>
-    linalg.yield
-  }
-  return
-}
-
-// CHECK-LABEL: @tiled_loop_reduction
-// CHECK:       %[[C24:.*]] = arith.constant 24 : index
-// CHECK:       %[[C16:.*]] = arith.constant 16 : index
-// CHECK:       %[[C0:.*]] = arith.constant 0 : index
-// CHECK:       %[[C192:.*]] = arith.constant 192 : index
-// CHECK:       scf.for %{{.*}} = %[[C0]] to %[[C192]] step %[[C24]]
-// CHECK:         scf.for %{{.*}} = %[[C0]] to %[[C192]] step %[[C16]]
-// CHECK:           linalg.fill
-
-// -----
-
-#strided_1d = affine_map<(d0)[s0] -> (d0 + s0)>
-#strided_2d = affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>
-
-func @tiled_loop_row_reduction(%A: memref<10x8xf32>,
-                               %B: memref<8xf32>) {
-   %c0 = arith.constant 0 : index
-   %c2 = arith.constant 2 : index
-   %c4 = arith.constant 4 : index
-   %c8 = arith.constant 8 : index
-   %c10 = arith.constant 10 : index
-   %cst = arith.constant 0.000000e+00 : f32
-
-  linalg.tiled_loop (%i, %j) = (%c0, %c0) to (%c10, %c8) step (%c2, %c4)
-      ins (%A_ = %A: memref<10x8xf32>)
-      outs (%B_ = %B: memref<8xf32>)
-      iterators["reduction", "parallel"] {
-    %A_sub = memref.subview %A_[%i, %j][2, 4][1, 1]
-      : memref<10x8xf32> to memref<2x4xf32, #strided_2d>
-    %B_sub = memref.subview %B_[%j][4][1]
-      : memref<8xf32> to memref<4xf32, #strided_1d>
-    linalg.generic {
-        indexing_maps = [affine_map<(i, j) -> (i, j)>,
-                         affine_map<(i, j) -> (j)>],
-        iterator_types = ["reduction", "parallel"]}
-        ins(%A_sub : memref<2x4xf32, #strided_2d>)
-        outs(%B_sub : memref<4xf32, #strided_1d>) {
-      ^bb(%a: f32, %b: f32) :
-        %0 = arith.addf %a, %b: f32
-        linalg.yield %0 : f32
-      }
-    linalg.yield
-  }
-  return
-}
-
-// CHECK-LABEL: @tiled_loop_row_reduction
-
-// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
-// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
-// CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index
-// CHECK-DAG: %[[C8:.*]] = arith.constant 8 : index
-// CHECK-DAG: %[[C10:.*]] = arith.constant 10 : index
-
-// CHECK:     scf.parallel (%[[J:.*]]) = (%[[C0]]) to (%[[C8]]) step (%[[C4]])
-// CHECK-NEXT:  scf.for %[[I:.*]] = %[[C0]] to %[[C10]] step %[[C2]]
-// CHECK-NEXT:    memref.subview %arg{{[0-9]+}}[%[[I]], %[[J]]] [2, 4] [1, 1]
-// CHECK-SAME:      : memref<10x8xf32> to memref<2x4xf32, #map{{[0-9]+}}>
-// CHECK-NEXT:    memref.subview %arg{{[0-9]+}}[%[[J]]] [4] [1]
-// CHECK-SAME:      : memref<8xf32> to memref<4xf32, #map{{[0-9]+}}>
-
-// -----
-
-#strided_1d = affine_map<(d0)[s0] -> (d0 + s0)>
-#strided_2d = affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>
-
-func @tiled_loop_col_reduction(%A: memref<10x8xf32>,
-                               %B: memref<10xf32>) {
-   %c0 = arith.constant 0 : index
-   %c2 = arith.constant 2 : index
-   %c4 = arith.constant 4 : index
-   %c8 = arith.constant 8 : index
-   %c10 = arith.constant 10 : index
-   %cst = arith.constant 0.000000e+00 : f32
-
-  linalg.tiled_loop (%i, %j) = (%c0, %c0) to (%c10, %c8) step (%c2, %c4)
-      ins (%A_ = %A: memref<10x8xf32>)
-      outs (%B_ = %B: memref<10xf32>)
-      iterators["parallel", "reduction"] {
-    %A_sub = memref.subview %A_[%i, %j][2, 4][1, 1]
-      : memref<10x8xf32> to memref<2x4xf32, #strided_2d>
-    %B_sub = memref.subview %B_[%i][2][1]
-      : memref<10xf32> to memref<2xf32, #strided_1d>
-    linalg.generic {
-        indexing_maps = [affine_map<(i, j) -> (i, j)>,
-                         affine_map<(i, j) -> (i)>],
-        iterator_types = ["parallel", "reduction"]}
-        ins(%A_sub : memref<2x4xf32, #strided_2d>)
-        outs(%B_sub : memref<2xf32, #strided_1d>) {
-      ^bb(%a: f32, %b: f32) :
-        %0 = arith.addf %a, %b: f32
-        linalg.yield %0 : f32
-      }
-    linalg.yield
-  }
-  return
-}
-
-// CHECK-LABEL: @tiled_loop_col_reduction
-
-// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
-// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
-// CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index
-// CHECK-DAG: %[[C8:.*]] = arith.constant 8 : index
-// CHECK-DAG: %[[C10:.*]] = arith.constant 10 : index
-
-// CHECK:     scf.parallel (%[[I:.*]]) = (%[[C0]]) to (%[[C10]]) step (%[[C2]])
-// CHECK-NEXT:  scf.for %[[J:.*]] = %[[C0]] to %[[C8]] step %[[C4]]
-// CHECK-NEXT:    memref.subview %arg{{[0-9]+}}[%[[I]], %[[J]]] [2, 4] [1, 1]
-// CHECK-SAME:      : memref<10x8xf32> to memref<2x4xf32, #map{{[0-9]+}}>
-// CHECK-NEXT:    memref.subview %arg{{[0-9]+}}[%[[I]]] [2] [1]
-// CHECK-SAME:      : memref<10xf32> to memref<2xf32, #map{{[0-9]+}}>
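
As the CHECK lines above record, `-convert-linalg-tiled-loops-to-scf` lowered parallel dimensions to `scf.parallel` and reduction dimensions to sequential `scf.for` loops. A condensed sketch of the mixed case, mirroring the deleted `@tiled_loop_row_reduction` test after lowering (a reconstruction from the CHECK lines, not output captured from the pass):

```mlir
#strided_1d = affine_map<(d0)[s0] -> (d0 + s0)>
#strided_2d = affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>

func @row_reduction_lowered(%A: memref<10x8xf32>, %B: memref<8xf32>) {
  %c0 = arith.constant 0 : index
  %c2 = arith.constant 2 : index
  %c4 = arith.constant 4 : index
  %c8 = arith.constant 8 : index
  %c10 = arith.constant 10 : index
  // Parallel dim %j becomes scf.parallel; reduction dim %i becomes scf.for.
  scf.parallel (%j) = (%c0) to (%c8) step (%c4) {
    scf.for %i = %c0 to %c10 step %c2 {
      %A_sub = memref.subview %A[%i, %j] [2, 4] [1, 1]
          : memref<10x8xf32> to memref<2x4xf32, #strided_2d>
      %B_sub = memref.subview %B[%j] [4] [1]
          : memref<8xf32> to memref<4xf32, #strided_1d>
      linalg.generic {
          indexing_maps = [affine_map<(i, j) -> (i, j)>,
                           affine_map<(i, j) -> (j)>],
          iterator_types = ["reduction", "parallel"]}
          ins(%A_sub : memref<2x4xf32, #strided_2d>)
          outs(%B_sub : memref<4xf32, #strided_1d>) {
        ^bb(%a: f32, %b: f32):
          %0 = arith.addf %a, %b : f32
          linalg.yield %0 : f32
      }
    }
  }
  return
}
```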

diff --git a/mlir/test/lib/Dialect/Linalg/CMakeLists.txt b/mlir/test/lib/Dialect/Linalg/CMakeLists.txt
index c74fb756b785f..1fe3db2e9e676 100644
--- a/mlir/test/lib/Dialect/Linalg/CMakeLists.txt
+++ b/mlir/test/lib/Dialect/Linalg/CMakeLists.txt
@@ -1,7 +1,6 @@
 # Exclude tests from libMLIR.so
 add_mlir_library(MLIRLinalgTestPasses
   TestLinalgCodegenStrategy.cpp
-  TestLinalgDistribution.cpp
   TestLinalgElementwiseFusion.cpp
   TestLinalgFusionTransforms.cpp
   TestLinalgHoisting.cpp

diff --git a/mlir/test/lib/Dialect/Linalg/TestLinalgDistribution.cpp b/mlir/test/lib/Dialect/Linalg/TestLinalgDistribution.cpp
deleted file mode 100644
index 342fed37ad600..0000000000000
--- a/mlir/test/lib/Dialect/Linalg/TestLinalgDistribution.cpp
+++ /dev/null
@@ -1,79 +0,0 @@
-//===- TestLinalgDistribution.cpp - Test Linalg hoisting functions --------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements logic for testing Linalg hoisting functions.
-//
-//===----------------------------------------------------------------------===//
-
-#include "mlir/Dialect/GPU/GPUDialect.h"
-#include "mlir/Dialect/Linalg/IR/Linalg.h"
-#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
-#include "mlir/Pass/Pass.h"
-#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
-
-using namespace mlir;
-using namespace mlir::linalg;
-
-template <gpu::Dimension Dim>
-static linalg::ProcInfo getGpuBlockInfo(OpBuilder &b, Location loc) {
-  Type indexType = b.getIndexType();
-  ProcInfo procInfo = {b.create<gpu::BlockIdOp>(loc, indexType, Dim),
-                       b.create<gpu::GridDimOp>(loc, indexType, Dim)};
-  return procInfo;
-}
-
-static LinalgLoopDistributionOptions getDistributionOptions() {
-  LinalgLoopDistributionOptions opts;
-  opts.procInfoMap.insert(
-      std::make_pair("block_x", getGpuBlockInfo<gpu::Dimension::x>));
-  opts.procInfoMap.insert(
-      std::make_pair("block_y", getGpuBlockInfo<gpu::Dimension::y>));
-  return opts;
-}
-
-namespace {
-struct TestLinalgDistribution
-    : public PassWrapper<TestLinalgDistribution, OperationPass<FuncOp>> {
-  StringRef getArgument() const final { return "test-linalg-distribution"; }
-  StringRef getDescription() const final { return "Test Linalg distribution."; }
-  TestLinalgDistribution() = default;
-  TestLinalgDistribution(const TestLinalgDistribution &pass) = default;
-  void getDependentDialects(DialectRegistry &registry) const override {
-    registry.insert<AffineDialect, gpu::GPUDialect>();
-  }
-
-  void runOnOperation() override;
-};
-} // namespace
-
-void TestLinalgDistribution::runOnOperation() {
-  auto funcOp = getOperation();
-  RewritePatternSet distributeTiledLoopsPatterns(&getContext());
-  populateLinalgDistributeTiledLoopPattern(
-      distributeTiledLoopsPatterns, getDistributionOptions(),
-      LinalgTransformationFilter(
-          ArrayRef<StringAttr>{},
-          {StringAttr::get(funcOp.getContext(), "distributed")})
-          .addFilter([](Operation *op) {
-            return success(!op->getParentOfType<linalg::TiledLoopOp>());
-          }));
-  (void)applyPatternsAndFoldGreedily(funcOp,
-                                     std::move(distributeTiledLoopsPatterns));
-  // Ensure we drop the marker in the end.
-  funcOp.walk([](LinalgOp op) {
-    op->removeAttr(LinalgTransforms::kLinalgTransformMarker);
-  });
-}
-
-namespace mlir {
-namespace test {
-void registerTestLinalgDistribution() {
-  PassRegistration<TestLinalgDistribution>();
-}
-} // namespace test
-} // namespace mlir
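
The deleted pass distributed `linalg.tiled_loop` dimensions marked `block_x`/`block_y` onto GPU processor IDs via the `ProcInfo` pairs above (processor id = `gpu.block_id`, processor count = `gpu.grid_dim`). Conceptually this is a cyclic distribution: each block starts at its own offset and strides by the total block count. A sketch on a plain loop, with `%lb`, `%ub` and `%step` as stand-ins and an unregistered `"computation"` op for the body (illustrative only; this commit removes the pattern without a replacement):

```mlir
// Cyclic distribution of one loop dimension over gpu blocks in x.
%bid    = gpu.block_id x
%nprocs = gpu.grid_dim x
// Each block begins %bid tiles into the iteration space ...
%offset = arith.muli %bid, %step : index
%new_lb = arith.addi %lb, %offset : index
// ... and advances by the combined stride of all blocks.
%stride = arith.muli %nprocs, %step : index
scf.for %i = %new_lb to %ub step %stride {
  "computation"(%i) : (index) -> ()
}
```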

diff --git a/mlir/test/lib/Dialect/Linalg/TestLinalgTransforms.cpp b/mlir/test/lib/Dialect/Linalg/TestLinalgTransforms.cpp
index 8af5c46433631..c9c44bfc812ba 100644
--- a/mlir/test/lib/Dialect/Linalg/TestLinalgTransforms.cpp
+++ b/mlir/test/lib/Dialect/Linalg/TestLinalgTransforms.cpp
@@ -120,10 +120,6 @@ struct TestLinalgTransforms
       *this, "tile-sizes",
       llvm::cl::desc("Linalg tile sizes for test-tile-pattern"),
       llvm::cl::ZeroOrMore, llvm::cl::MiscFlags::CommaSeparated};
-  ListOption<unsigned> testTiledLoopPeeling{
-      *this, "test-tiled-loop-peeling",
-      llvm::cl::desc("Test peeling of linalg.tiled_loop ops"),
-      llvm::cl::OneOrMore, llvm::cl::MiscFlags::CommaSeparated};
   Option<bool> skipPartial{
       *this, "skip-partial",
       llvm::cl::desc("Skip loops inside partial iterations during peeling"),
@@ -605,8 +601,7 @@ static void applyTilePattern(FuncOp funcOp, const std::string &loopType,
       llvm::StringSwitch<LinalgTilingLoopType>(loopType)
           .Case("for", LinalgTilingLoopType::Loops)
           .Case("affine", LinalgTilingLoopType::AffineLoops)
-          .Case("parallel", LinalgTilingLoopType::ParallelLoops)
-          .Case("tiled_loop", LinalgTilingLoopType::TiledLoops);
+          .Case("parallel", LinalgTilingLoopType::ParallelLoops);
   auto linalgTilingOptions = linalg::LinalgTilingOptions()
                                  .setPeeledLoops(peeledLoops)
                                  .setLoopType(type);
@@ -626,76 +621,6 @@ static void applyTilePattern(FuncOp funcOp, const std::string &loopType,
 static constexpr char kPeeledLoopsLabel[] = "__peeled_loops__";
 static constexpr char kPartialIterationLabel[] = "__partial_iteration__";
 
-namespace {
-/// Peel TiledLoopOps, i.e., split them into two loops: One loop where the
-/// `idx`-th loop contains only "full" iterations and a second loop for the
-/// remaining partial iteration (if any).
-struct TiledLoopPeelingPattern : public OpRewritePattern<TiledLoopOp> {
-  TiledLoopPeelingPattern(MLIRContext *ctx, int64_t idx, bool skipPartial)
-      : OpRewritePattern<TiledLoopOp>(ctx), idx(idx), skipPartial(skipPartial) {
-  }
-
-  LogicalResult matchAndRewrite(TiledLoopOp loopOp,
-                                PatternRewriter &rewriter) const override {
-    SmallVector<int64_t> peeledLoops;
-    if (loopOp->hasAttr(kPeeledLoopsLabel)) {
-      auto attr = loopOp->getAttr(kPeeledLoopsLabel).cast<ArrayAttr>();
-      peeledLoops =
-          llvm::to_vector<4>(llvm::map_range(attr, [](Attribute attr) {
-            return attr.cast<IntegerAttr>().getInt();
-          }));
-      // Check if the loop was already peeled.
-      if (llvm::find(peeledLoops, idx) != peeledLoops.end())
-        return failure();
-    }
-    if (skipPartial && loopOp->hasAttr(kPartialIterationLabel))
-      // No peeling of loop nests with a partial iteration.
-      return failure();
-
-    if (static_cast<int64_t>(loopOp.iterator_types().size()) <= idx)
-      return failure();
-
-    // Peel loop and canonicalize.
-    TiledLoopOp result;
-    if (failed(linalg::peelAndCanonicalizeTiledLoop(rewriter, loopOp, idx,
-                                                    result)))
-      return failure();
-
-    // Apply label, so that the same loop is not rewritten a second time.
-    peeledLoops.push_back(idx);
-    rewriter.updateRootInPlace(loopOp, [&]() {
-      loopOp->setAttr(kPeeledLoopsLabel, rewriter.getI64ArrayAttr(peeledLoops));
-    });
-    result->setAttr(kPeeledLoopsLabel, rewriter.getI64ArrayAttr(peeledLoops));
-    result->setAttr(kPartialIterationLabel, rewriter.getUnitAttr());
-
-    return success();
-  }
-
-  /// Index of loop to peel.
-  int64_t idx;
-
-  /// If set to true, do not peel TiledLoopOps with a partial iteration.
-  bool skipPartial;
-};
-} // namespace
-
-static void applyTiledLoopPeelingPattern(FuncOp funcOp,
-                                         ArrayRef<unsigned> loops,
-                                         bool skipPartial) {
-  MLIRContext *ctx = funcOp.getContext();
-  RewritePatternSet patterns(ctx);
-  for (unsigned idx : loops)
-    patterns.add<TiledLoopPeelingPattern>(ctx, idx, skipPartial);
-  (void)applyPatternsAndFoldGreedily(funcOp, std::move(patterns));
-
-  // Drop the markers.
-  funcOp.walk([](TiledLoopOp op) {
-    op->removeAttr(kPeeledLoopsLabel);
-    op->removeAttr(kPartialIterationLabel);
-  });
-}
-
 /// Apply transformations specified as patterns.
 void TestLinalgTransforms::runOnOperation() {
   auto lambda = [&](void *) {
@@ -739,9 +664,6 @@ void TestLinalgTransforms::runOnOperation() {
     return applyGeneralizePadTensorPatterns(getOperation());
   if (testSwapSubTensorPadTensor)
     return applyExtractSliceOfPadTensorSwapPattern(getOperation());
-  if (testTiledLoopPeeling.hasValue())
-    return applyTiledLoopPeelingPattern(getOperation(), testTiledLoopPeeling,
-                                        skipPartial);
   if (testTilePattern)
     return applyTilePattern(getOperation(), loopType, tileSizes, peeledLoops,
                             /*scalarizeDynamicDims=*/false);

diff --git a/mlir/tools/mlir-opt/mlir-opt.cpp b/mlir/tools/mlir-opt/mlir-opt.cpp
index 704aab507883e..2bb690bede084 100644
--- a/mlir/tools/mlir-opt/mlir-opt.cpp
+++ b/mlir/tools/mlir-opt/mlir-opt.cpp
@@ -81,7 +81,6 @@ void registerTestGenericIRVisitorsPass();
 void registerTestGenericIRVisitorsInterruptPass();
 void registerTestInterfaces();
 void registerTestLinalgCodegenStrategy();
-void registerTestLinalgDistribution();
 void registerTestLinalgElementwiseFusion();
 void registerTestLinalgFusionTransforms();
 void registerTestLinalgTensorFusionTransforms();
@@ -171,7 +170,6 @@ void registerTestPasses() {
   mlir::test::registerTestGenericIRVisitorsPass();
   mlir::test::registerTestInterfaces();
   mlir::test::registerTestLinalgCodegenStrategy();
-  mlir::test::registerTestLinalgDistribution();
   mlir::test::registerTestLinalgElementwiseFusion();
   mlir::test::registerTestLinalgFusionTransforms();
   mlir::test::registerTestLinalgTensorFusionTransforms();


        

