[Mlir-commits] [mlir] 0602e8f - [MLIR][Affine] Add parametric tile size support for affine.for tiling

Uday Bondhugula llvmlistbot at llvm.org
Thu Sep 17 11:13:27 PDT 2020


Author: Navdeep Kumar
Date: 2020-09-17T23:39:14+05:30
New Revision: 0602e8f77f8662c85155b8cf02937a2e71c01e12

URL: https://github.com/llvm/llvm-project/commit/0602e8f77f8662c85155b8cf02937a2e71c01e12
DIFF: https://github.com/llvm/llvm-project/commit/0602e8f77f8662c85155b8cf02937a2e71c01e12.diff

LOG: [MLIR][Affine] Add parametric tile size support for affine.for tiling

Add support to tile affine.for ops with parametric sizes (i.e., SSA
values). Currently supports hyper-rectangular loop nests with constant
lower bounds only. Move methods

  - moveLoopBody(*)
  - getTileableBands(*)
  - checkTilingLegality(*)
  - tilePerfectlyNested(*)
  - constructTiledIndexSetHyperRect(*)

to allow reuse with constant tile size API. Add a test pass
-test-affine-parametric-tile to test parametric tiling.

Differential Revision: https://reviews.llvm.org/D87353

Added: 
    mlir/test/Dialect/Affine/loop-tiling-parametric.mlir
    mlir/test/lib/Transforms/TestAffineLoopParametricTiling.cpp

Modified: 
    mlir/include/mlir/Transforms/LoopUtils.h
    mlir/lib/Dialect/Affine/Transforms/LoopTiling.cpp
    mlir/lib/Transforms/Utils/LoopUtils.cpp
    mlir/test/lib/Transforms/CMakeLists.txt
    mlir/tools/mlir-opt/mlir-opt.cpp

Removed: 
    


################################################################################
diff  --git a/mlir/include/mlir/Transforms/LoopUtils.h b/mlir/include/mlir/Transforms/LoopUtils.h
index 5a0d46f5ba57..aaff786fbe2f 100644
--- a/mlir/include/mlir/Transforms/LoopUtils.h
+++ b/mlir/include/mlir/Transforms/LoopUtils.h
@@ -88,16 +88,28 @@ LLVM_NODISCARD
 LogicalResult affineForOpBodySkew(AffineForOp forOp, ArrayRef<uint64_t> shifts,
                                   bool unrollPrologueEpilogue = false);
 
+/// Identify valid and profitable bands of loops to tile. This is currently just
+/// a temporary placeholder to test the mechanics of tiled code generation.
+/// Returns all maximal outermost perfect loop nests to tile.
+void getTileableBands(FuncOp f,
+                      std::vector<SmallVector<AffineForOp, 6>> *bands);
+
 /// Tiles the specified band of perfectly nested loops creating tile-space loops
-/// and intra-tile loops. A band is a contiguous set of loops. `tiledNest` when
-/// non-null is set to the loops of the tiled nest from outermost to innermost.
-/// Loops in `input` are erased when the tiling is successful.
+/// and intra-tile loops. A band is a contiguous set of loops.
 LLVM_NODISCARD
 LogicalResult
 tilePerfectlyNested(MutableArrayRef<AffineForOp> input,
                     ArrayRef<unsigned> tileSizes,
                     SmallVectorImpl<AffineForOp> *tiledNest = nullptr);
 
+/// Tiles the specified band of perfectly nested loops creating tile-space
+/// loops and intra-tile loops, using SSA values as tiling parameters. A band
+/// is a contiguous set of loops.
+LLVM_NODISCARD
+LogicalResult tilePerfectlyNestedParametric(
+    MutableArrayRef<AffineForOp> input, ArrayRef<Value> tileSizes,
+    SmallVectorImpl<AffineForOp> *tiledNest = nullptr);
+
 /// Performs loop interchange on 'forOpA' and 'forOpB'. Requires that 'forOpA'
 /// and 'forOpB' are part of a perfectly nested sequence of loops.
 void interchangeLoops(AffineForOp forOpA, AffineForOp forOpB);

diff  --git a/mlir/lib/Dialect/Affine/Transforms/LoopTiling.cpp b/mlir/lib/Dialect/Affine/Transforms/LoopTiling.cpp
index 5bded917978a..56469482c763 100644
--- a/mlir/lib/Dialect/Affine/Transforms/LoopTiling.cpp
+++ b/mlir/lib/Dialect/Affine/Transforms/LoopTiling.cpp
@@ -61,278 +61,6 @@ std::unique_ptr<OperationPass<FuncOp>> mlir::createLoopTilingPass() {
   return std::make_unique<LoopTiling>();
 }
 
-// Move the loop body of AffineForOp 'src' from 'src' into the specified
-// location in destination's body, ignoring the terminator.
-static inline void moveLoopBody(AffineForOp src, AffineForOp dest,
-                                Block::iterator loc) {
-  auto &insts = src.getBody()->getOperations();
-  dest.getBody()->getOperations().splice(loc, insts, insts.begin(),
-                                         std::prev(insts.end()));
-}
-
-// Move the loop body of AffineForOp 'src' from 'src' to the start of dest's
-// body.
-static inline void moveLoopBody(AffineForOp src, AffineForOp dest) {
-  moveLoopBody(src, dest, dest.getBody()->begin());
-}
-
-/// Constructs and sets new loop bounds after tiling for the case of
-/// hyper-rectangular index sets, where the bounds of one dimension do not
-/// depend on other dimensions. Bounds of each dimension can thus be treated
-/// independently, and deriving the new bounds is much simpler and faster
-/// than for the case of tiling arbitrary polyhedral shapes.
-static void
-constructTiledIndexSetHyperRect(MutableArrayRef<AffineForOp> origLoops,
-                                MutableArrayRef<AffineForOp> newLoops,
-                                ArrayRef<unsigned> tileSizes) {
-  assert(!origLoops.empty());
-  assert(origLoops.size() == tileSizes.size());
-
-  OpBuilder b(origLoops[0].getOperation());
-  unsigned width = origLoops.size();
-
-  // Bounds for tile space loops.
-  for (unsigned i = 0; i < width; i++) {
-    OperandRange newLbOperands = origLoops[i].getLowerBoundOperands();
-    OperandRange newUbOperands = origLoops[i].getUpperBoundOperands();
-    newLoops[i].setLowerBound(newLbOperands, origLoops[i].getLowerBoundMap());
-    newLoops[i].setUpperBound(newUbOperands, origLoops[i].getUpperBoundMap());
-    newLoops[i].setStep(tileSizes[i]);
-  }
-  // Bounds for intra-tile loops.
-  for (unsigned i = 0; i < width; i++) {
-    int64_t largestDiv = getLargestDivisorOfTripCount(origLoops[i]);
-    auto mayBeConstantCount = getConstantTripCount(origLoops[i]);
-    // The lower bound is just the tile-space loop.
-    AffineMap lbMap = b.getDimIdentityMap();
-    newLoops[width + i].setLowerBound(
-        /*operands=*/newLoops[i].getInductionVar(), lbMap);
-
-    // Set the upper bound.
-    if (mayBeConstantCount && mayBeConstantCount.getValue() < tileSizes[i]) {
-      // Trip count is less than the tile size: upper bound is lower bound +
-      // trip count.
-      auto ubMap = b.getSingleDimShiftAffineMap(mayBeConstantCount.getValue());
-      newLoops[width + i].setUpperBound(
-          /*operands=*/newLoops[i].getInductionVar(), ubMap);
-    } else if (largestDiv % tileSizes[i] != 0) {
-      // Intra-tile loop ii goes from i to min(i + tileSize, ub_i).
-      // Construct the upper bound map; the operands are the original operands
-      // with 'i' (tile-space loop) appended to it. The new upper bound map is
-      // the original one with an additional expression i + tileSize appended.
-
-      // Add dim operands from original upper bound.
-      SmallVector<Value, 4> ubOperands;
-      auto ub = origLoops[i].getUpperBound();
-      ubOperands.reserve(ub.getNumOperands() + 1);
-      auto origUbMap = ub.getMap();
-      for (unsigned j = 0, e = origUbMap.getNumDims(); j < e; ++j)
-        ubOperands.push_back(ub.getOperand(j));
-
-      // Add dim operand for new loop upper bound.
-      ubOperands.push_back(newLoops[i].getInductionVar());
-
-      // Add symbol operands from original upper bound.
-      for (unsigned j = 0, e = origUbMap.getNumSymbols(); j < e; ++j)
-        ubOperands.push_back(ub.getOperand(origUbMap.getNumDims() + j));
-
-      SmallVector<AffineExpr, 4> boundExprs;
-      boundExprs.reserve(1 + origUbMap.getNumResults());
-      auto dim = b.getAffineDimExpr(origUbMap.getNumDims());
-      // The new upper bound map is the original one with an additional
-      // expression i + tileSize appended.
-      boundExprs.push_back(dim + tileSizes[i]);
-      boundExprs.append(origUbMap.getResults().begin(),
-                        origUbMap.getResults().end());
-      auto ubMap =
-          AffineMap::get(origUbMap.getNumDims() + 1, origUbMap.getNumSymbols(),
-                         boundExprs, b.getContext());
-      newLoops[width + i].setUpperBound(/*operands=*/ubOperands, ubMap);
-    } else {
-      // No need of the min expression.
-      auto dim = b.getAffineDimExpr(0);
-      auto ubMap = AffineMap::get(1, 0, dim + tileSizes[i]);
-      newLoops[width + i].setUpperBound(newLoops[i].getInductionVar(), ubMap);
-    }
-  }
-}
-
-/// This function checks whether hyper-rectangular loop tiling of the nest
-/// represented by `origLoops` is valid. The validity condition is from Irigoin
-/// and Triolet, which states that two tiles cannot depend on each other. We
-/// simplify such condition to just checking whether there is any negative
-/// dependence direction, since we have the prior knowledge that the tiling
-/// results will be hyper-rectangles, which are scheduled in the
-/// lexicographically increasing order on the vector of loop indices. This
-/// function will return failure when any dependence component is negative along
-/// any of `origLoops`.
-static LogicalResult
-checkTilingLegality(MutableArrayRef<mlir::AffineForOp> origLoops) {
-  assert(!origLoops.empty() && "no original loops provided");
-
-  // We first find out all dependences we intend to check.
-  SmallVector<Operation *, 8> loadAndStoreOps;
-  origLoops[0].getOperation()->walk([&](Operation *op) {
-    if (isa<AffineReadOpInterface, AffineWriteOpInterface>(op))
-      loadAndStoreOps.push_back(op);
-  });
-
-  unsigned numOps = loadAndStoreOps.size();
-  unsigned numLoops = origLoops.size();
-  FlatAffineConstraints dependenceConstraints;
-  for (unsigned d = 1; d <= numLoops + 1; ++d) {
-    for (unsigned i = 0; i < numOps; ++i) {
-      Operation *srcOp = loadAndStoreOps[i];
-      MemRefAccess srcAccess(srcOp);
-      for (unsigned j = 0; j < numOps; ++j) {
-        Operation *dstOp = loadAndStoreOps[j];
-        MemRefAccess dstAccess(dstOp);
-
-        SmallVector<DependenceComponent, 2> depComps;
-        dependenceConstraints.reset();
-        DependenceResult result = checkMemrefAccessDependence(
-            srcAccess, dstAccess, d, &dependenceConstraints, &depComps);
-
-        // Skip if there is no dependence in this case.
-        if (!hasDependence(result))
-          continue;
-
-        // Check whether there is any negative direction vector in the
-        // dependence components found above, which means that dependence is
-        // violated by the default hyper-rect tiling method.
-        LLVM_DEBUG(llvm::dbgs() << "Checking whether tiling legality violated "
-                                   "for dependence at depth: "
-                                << Twine(d) << " between:\n";);
-        LLVM_DEBUG(srcAccess.opInst->dump(););
-        LLVM_DEBUG(dstAccess.opInst->dump(););
-        for (unsigned k = 0, e = depComps.size(); k < e; k++) {
-          DependenceComponent depComp = depComps[k];
-          if (depComp.lb.hasValue() && depComp.ub.hasValue() &&
-              depComp.lb.getValue() < depComp.ub.getValue() &&
-              depComp.ub.getValue() < 0) {
-            LLVM_DEBUG(llvm::dbgs()
-                       << "Dependence component lb = "
-                       << Twine(depComp.lb.getValue())
-                       << " ub = " << Twine(depComp.ub.getValue())
-                       << " is negative  at depth: " << Twine(d)
-                       << " and thus violates the legality rule.\n");
-            return failure();
-          }
-        }
-      }
-    }
-  }
-
-  return success();
-}
-/// Tiles the specified band of perfectly nested loops creating tile-space loops
-/// and intra-tile loops. A band is a contiguous set of loops.
-//  TODO: handle non hyper-rectangular spaces.
-LogicalResult
-mlir::tilePerfectlyNested(MutableArrayRef<AffineForOp> input,
-                          ArrayRef<unsigned> tileSizes,
-                          SmallVectorImpl<AffineForOp> *tiledNest) {
-  // Check if the supplied for op's are all successively nested.
-  assert(!input.empty() && "no loops in input band");
-  assert(input.size() == tileSizes.size() && "Too few/many tile sizes");
-
-  assert(isPerfectlyNested(input) && "input loops not perfectly nested");
-
-  auto origLoops = input;
-
-  // Perform tiling legality test.
-  if (failed(checkTilingLegality(origLoops)))
-    origLoops[0].emitRemark("tiled code is illegal due to dependences");
-
-  AffineForOp rootAffineForOp = origLoops[0];
-  auto loc = rootAffineForOp.getLoc();
-  // Note that width is at least one since band isn't empty.
-  unsigned width = input.size();
-
-  SmallVector<AffineForOp, 6> tiledLoops(2 * width);
-
-  // The outermost among the loops as we add more..
-  auto *topLoop = rootAffineForOp.getOperation();
-  AffineForOp innermostPointLoop;
-
-  // Add intra-tile (or point) loops.
-  for (unsigned i = 0; i < width; i++) {
-    OpBuilder b(topLoop);
-    // Loop bounds will be set later.
-    auto pointLoop = b.create<AffineForOp>(loc, 0, 0);
-    pointLoop.getBody()->getOperations().splice(
-        pointLoop.getBody()->begin(), topLoop->getBlock()->getOperations(),
-        topLoop);
-    tiledLoops[2 * width - 1 - i] = pointLoop;
-    topLoop = pointLoop.getOperation();
-    if (i == 0)
-      innermostPointLoop = pointLoop;
-  }
-
-  // Add tile space loops;
-  for (unsigned i = width; i < 2 * width; i++) {
-    OpBuilder b(topLoop);
-    // Loop bounds will be set later.
-    auto tileSpaceLoop = b.create<AffineForOp>(loc, 0, 0);
-    tileSpaceLoop.getBody()->getOperations().splice(
-        tileSpaceLoop.getBody()->begin(), topLoop->getBlock()->getOperations(),
-        topLoop);
-    tiledLoops[2 * width - i - 1] = tileSpaceLoop;
-    topLoop = tileSpaceLoop.getOperation();
-  }
-
-  // Move the loop body of the original nest to the new one.
-  moveLoopBody(origLoops.back(), innermostPointLoop);
-
-  SmallVector<Value, 8> origLoopIVs;
-  extractForInductionVars(input, &origLoopIVs);
-
-  FlatAffineConstraints cst;
-  SmallVector<Operation *, 8> ops;
-  ops.reserve(input.size());
-  for (AffineForOp forOp : input)
-    ops.push_back(forOp);
-  getIndexSet(ops, &cst);
-  if (!cst.isHyperRectangular(0, width)) {
-    rootAffineForOp.emitError("tiled code generation unimplemented for the "
-                              "non-hyperrectangular case");
-    return failure();
-  }
-
-  constructTiledIndexSetHyperRect(origLoops, tiledLoops, tileSizes);
-
-  // Replace original IVs with intra-tile loop IVs.
-  for (unsigned i = 0; i < width; i++)
-    origLoopIVs[i].replaceAllUsesWith(tiledLoops[i + width].getInductionVar());
-
-  // Erase the old loop nest.
-  rootAffineForOp.erase();
-
-  if (tiledNest)
-    *tiledNest = std::move(tiledLoops);
-
-  return success();
-}
-
-// Identify valid and profitable bands of loops to tile. This is currently just
-// a temporary placeholder to test the mechanics of tiled code generation.
-// Returns all maximal outermost perfect loop nests to tile.
-static void getTileableBands(FuncOp f,
-                             std::vector<SmallVector<AffineForOp, 6>> *bands) {
-  // Get maximal perfect nest of 'affine.for' insts starting from root
-  // (inclusive).
-  auto getMaximalPerfectLoopNest = [&](AffineForOp root) {
-    SmallVector<AffineForOp, 6> band;
-    getPerfectlyNestedLoops(band, root);
-    bands->push_back(band);
-  };
-
-  for (auto &block : f)
-    for (auto &op : block)
-      if (auto forOp = dyn_cast<AffineForOp>(op))
-        getMaximalPerfectLoopNest(forOp);
-}
-
 /// Reduces each tile size to the largest divisor of the corresponding trip
 /// count (if the trip count is known).
 static void adjustToDivisorsOfTripCounts(ArrayRef<AffineForOp> band,
@@ -340,7 +68,7 @@ static void adjustToDivisorsOfTripCounts(ArrayRef<AffineForOp> band,
   assert(band.size() == tileSizes->size() && "invalid tile size count");
   for (unsigned i = 0, e = band.size(); i < e; i++) {
     unsigned &tSizeAdjusted = (*tileSizes)[i];
-    auto mayConst = getConstantTripCount(band[i]);
+    Optional<uint64_t> mayConst = getConstantTripCount(band[i]);
     if (!mayConst)
       continue;
     // Adjust the tile size to largest factor of the trip count less than
@@ -379,14 +107,14 @@ void LoopTiling::getTileSizes(ArrayRef<AffineForOp> band,
   tileSizes->resize(band.size());
 
   // The first loop in the band.
-  auto rootForOp = band[0];
+  AffineForOp rootForOp = band[0];
   (void)rootForOp;
 
   // Obtain memory footprint and set tile sizes so that a tile fits in
   // the cache size. This is an approximation with the assumption that the
   // footprint increases with the tile size linearly in that dimension (i.e.,
   // assumes one-to-one access function).
-  auto fp = getMemoryFootprintBytes(band[0], 0);
+  Optional<int64_t> fp = getMemoryFootprintBytes(band[0], 0);
   if (!fp) {
     // Fill with default tile sizes if footprint is unknown.
     std::fill(tileSizes->begin(), tileSizes->end(),
@@ -445,7 +173,7 @@ void LoopTiling::runOnFunction() {
     getTileSizes(band, &tileSizes);
     if (llvm::DebugFlag) {
       auto diag = band[0].emitRemark("using tile sizes [");
-      for (auto tSize : tileSizes)
+      for (unsigned tSize : tileSizes)
         diag << tSize << ' ';
       diag << "]\n";
     }

diff  --git a/mlir/lib/Transforms/Utils/LoopUtils.cpp b/mlir/lib/Transforms/Utils/LoopUtils.cpp
index 7ae45171ddbd..cf79e267fb8a 100644
--- a/mlir/lib/Transforms/Utils/LoopUtils.cpp
+++ b/mlir/lib/Transforms/Utils/LoopUtils.cpp
@@ -418,10 +418,559 @@ LogicalResult mlir::affineForOpBodySkew(AffineForOp forOp,
   return success();
 }
 
-// Collect perfectly nested loops starting from `rootForOps`.  Loops are
-// perfectly nested if each loop is the first and only non-terminator operation
-// in the parent loop.  Collect at most `maxLoops` loops and append them to
-// `forOps`.
+/// Checks the legality of tiling of a hyper-rectangular loop nest by simply
+/// checking if there is a 'negative' dependence in the memrefs present in
+/// the loop nest. If yes then tiling is invalid.
+static bool
+checkTilingLegalityImpl(MutableArrayRef<mlir::AffineForOp> origLoops) {
+  assert(!origLoops.empty() && "no original loops provided");
+
+  // We first find out all dependences we intend to check.
+  SmallVector<Operation *, 8> loadAndStoreOps;
+  origLoops[0].getOperation()->walk([&](Operation *op) {
+    if (isa<AffineReadOpInterface, AffineWriteOpInterface>(op))
+      loadAndStoreOps.push_back(op);
+  });
+
+  unsigned numOps = loadAndStoreOps.size();
+  unsigned numLoops = origLoops.size();
+  FlatAffineConstraints dependenceConstraints;
+  for (unsigned d = 1; d <= numLoops + 1; ++d) {
+    for (unsigned i = 0; i < numOps; ++i) {
+      Operation *srcOp = loadAndStoreOps[i];
+      MemRefAccess srcAccess(srcOp);
+      for (unsigned j = 0; j < numOps; ++j) {
+        Operation *dstOp = loadAndStoreOps[j];
+        MemRefAccess dstAccess(dstOp);
+
+        SmallVector<DependenceComponent, 2> depComps;
+        dependenceConstraints.reset();
+        DependenceResult result = checkMemrefAccessDependence(
+            srcAccess, dstAccess, d, &dependenceConstraints, &depComps);
+
+        // Skip if there is no dependence in this case.
+        if (!hasDependence(result))
+          continue;
+
+        // Check whether there is any negative direction vector in the
+        // dependence components found above, which means that dependence is
+        // violated by the default hyper-rect tiling method.
+        LLVM_DEBUG(llvm::dbgs() << "Checking whether tiling legality violated "
+                                   "for dependence at depth: "
+                                << Twine(d) << " between:\n";);
+        LLVM_DEBUG(srcAccess.opInst->dump(););
+        LLVM_DEBUG(dstAccess.opInst->dump(););
+        for (unsigned k = 0, e = depComps.size(); k < e; k++) {
+          DependenceComponent depComp = depComps[k];
+          if (depComp.lb.hasValue() && depComp.ub.hasValue() &&
+              depComp.lb.getValue() < depComp.ub.getValue() &&
+              depComp.ub.getValue() < 0) {
+            LLVM_DEBUG(llvm::dbgs()
+                       << "Dependence component lb = "
+                       << Twine(depComp.lb.getValue())
+                       << " ub = " << Twine(depComp.ub.getValue())
+                       << " is negative  at depth: " << Twine(d)
+                       << " and thus violates the legality rule.\n");
+            return false;
+          }
+        }
+      }
+    }
+  }
+
+  return true;
+}
+
+/// Checks whether hyper-rectangular loop tiling of the nest
+/// represented by `origLoops` is valid. The validity condition is from Irigoin
+/// and Triolet, which states that two tiles cannot depend on each other. We
+/// simplify such condition to just checking whether there is any negative
+/// dependence direction, since we have the prior knowledge that the tiling
+/// results will be hyper-rectangles, which are scheduled in the
+/// lexicographically increasing order on the vector of loop indices. This
+/// function will return failure when any dependence component is negative along
+/// any of `origLoops`.
+LogicalResult
+checkTilingLegality(MutableArrayRef<mlir::AffineForOp> origLoops) {
+  return success(checkTilingLegalityImpl(origLoops));
+}
+
+/// Check if the input data is valid and whether tiled code will be legal or not.
+template <typename t>
+void performPreTilingChecks(MutableArrayRef<AffineForOp> input,
+                            ArrayRef<t> tileSizes) {
+  // Check if the supplied for op's are all successively nested.
+  assert(!input.empty() && "no loops in input band");
+  assert(input.size() == tileSizes.size() && "Too few/many tile sizes");
+
+  assert(isPerfectlyNested(input) && "input loops not perfectly nested");
+
+  // Perform tiling legality test.
+  if (failed(checkTilingLegality(input)))
+    input[0].emitRemark("tiled code is illegal due to dependences");
+}
+
+/// Move the loop body of AffineForOp 'src' from 'src' into the specified
+/// location in destination's body, ignoring the terminator.
+static void moveLoopBodyImpl(AffineForOp src, AffineForOp dest,
+                             Block::iterator loc) {
+  auto &ops = src.getBody()->getOperations();
+  dest.getBody()->getOperations().splice(loc, ops, ops.begin(),
+                                         std::prev(ops.end()));
+}
+
+/// Move the loop body of AffineForOp 'src' from 'src' to the start of dest
+/// body.
+void moveLoopBody(AffineForOp src, AffineForOp dest) {
+  moveLoopBodyImpl(src, dest, dest.getBody()->begin());
+}
+
+/// Constructs tiled loop nest, without setting the loop bounds and move the
+/// body of the original loop nest to the tiled loop nest.
+void constructTiledLoopNest(MutableArrayRef<AffineForOp> origLoops,
+                            AffineForOp rootAffineForOp, unsigned width,
+                            MutableArrayRef<AffineForOp> tiledLoops) {
+  Location loc = rootAffineForOp.getLoc();
+
+  // The outermost among the loops as we add more..
+  Operation *topLoop = rootAffineForOp.getOperation();
+  AffineForOp innermostPointLoop;
+
+  // Add intra-tile (or point) loops.
+  for (unsigned i = 0; i < width; i++) {
+    OpBuilder b(topLoop);
+    // Loop bounds will be set later.
+    AffineForOp pointLoop = b.create<AffineForOp>(loc, 0, 0);
+    pointLoop.getBody()->getOperations().splice(
+        pointLoop.getBody()->begin(), topLoop->getBlock()->getOperations(),
+        topLoop);
+    tiledLoops[2 * width - 1 - i] = pointLoop;
+    topLoop = pointLoop.getOperation();
+    if (i == 0)
+      innermostPointLoop = pointLoop;
+  }
+
+  // Add tile space loops;
+  for (unsigned i = width; i < 2 * width; i++) {
+    OpBuilder b(topLoop);
+    // Loop bounds will be set later.
+    AffineForOp tileSpaceLoop = b.create<AffineForOp>(loc, 0, 0);
+    tileSpaceLoop.getBody()->getOperations().splice(
+        tileSpaceLoop.getBody()->begin(), topLoop->getBlock()->getOperations(),
+        topLoop);
+    tiledLoops[2 * width - i - 1] = tileSpaceLoop;
+    topLoop = tileSpaceLoop.getOperation();
+  }
+
+  // Move the loop body of the original nest to the new one.
+  moveLoopBody(origLoops.back(), innermostPointLoop);
+}
+
+/// Checks whether a loop nest is hyper-rectangular or not.
+LogicalResult checkIfHyperRectangular(MutableArrayRef<AffineForOp> input,
+                                      AffineForOp rootAffineForOp,
+                                      unsigned width) {
+  FlatAffineConstraints cst;
+  SmallVector<Operation *, 8> ops(input.begin(), input.end());
+  getIndexSet(ops, &cst);
+  if (!cst.isHyperRectangular(0, width)) {
+    rootAffineForOp.emitError("tiled code generation unimplemented for the "
+                              "non-hyperrectangular case");
+    return failure();
+  }
+  return success();
+}
+
+/// Set lower and upper bounds of intra-tile loops for parametric tiling.
+//  TODO: Handle non-constant lower bounds.
+static void setIntraTileBoundsParametric(OpBuilder &b, AffineForOp origLoop,
+                                         AffineForOp newInterTileLoop,
+                                         AffineForOp newIntraTileLoop,
+                                         Value tileSize) {
+  // The lower bound for the intra-tile loop is represented by an affine map
+  // as (%i, %t0)->((%i - %origlb) * %t0 + %origlb). Similarly, the upper bound
+  // for the intra-tile loop is represented by an affine map as (%i, %t0)->((%i
+  // - %origlb) * %t0) + (%t0 * %origLoopStep) + %origlb), where %i is loop IV
+  // of the corresponding inter-tile loop, %t0 is the corresponding tiling
+  // parameter, %origlb is lower bound and %origLoopStep is the loop step of the
+  // corresponding inter-tile loop.
+
+  assert(origLoop.hasConstantLowerBound() &&
+         "expected input loops to have constant lower bound.");
+
+  // Get lower bound of original loop as an affine expression.
+  AffineExpr origLowerBoundExpr;
+  origLowerBoundExpr =
+      b.getAffineConstantExpr(origLoop.getConstantLowerBound());
+
+  // Add dim operands from original lower/upper bound.
+  SmallVector<Value, 4> lbOperands, ubOperands;
+  AffineBound lb = origLoop.getLowerBound();
+  AffineBound ub = origLoop.getUpperBound();
+  lbOperands.reserve(lb.getNumOperands() + 2);
+  ubOperands.reserve(ub.getNumOperands() + 2);
+  AffineMap origLbMap = lb.getMap();
+  AffineMap origUbMap = ub.getMap();
+  for (unsigned j = 0, e = origLbMap.getNumDims(); j < e; ++j)
+    lbOperands.push_back(lb.getOperand(j));
+  for (unsigned j = 0, e = origUbMap.getNumDims(); j < e; ++j)
+    ubOperands.push_back(ub.getOperand(j));
+
+  // Add a new dim operand in lb/ubOperands corresponding to the origLoop
+  // IV.
+  lbOperands.push_back(newInterTileLoop.getInductionVar());
+  ubOperands.push_back(newInterTileLoop.getInductionVar());
+
+  // Get loop IV as an affine expression for lower/upper bound. Size of
+  // lb/ubOperands is guaranteed to be at least one.
+  AffineExpr lbLoopIvExpr = b.getAffineDimExpr(lbOperands.size() - 1);
+  AffineExpr ubLoopIvExpr = b.getAffineDimExpr(ubOperands.size() - 1);
+
+  // Add symbol operands from original lower/upper bound.
+  for (unsigned j = 0, e = origLbMap.getNumSymbols(); j < e; ++j)
+    lbOperands.push_back(lb.getOperand(origLbMap.getNumDims() + j));
+  for (unsigned j = 0, e = origUbMap.getNumSymbols(); j < e; ++j)
+    ubOperands.push_back(ub.getOperand(origUbMap.getNumDims() + j));
+
+  // Add a new symbol operand which is the tile size for this loop.
+  lbOperands.push_back(tileSize);
+  ubOperands.push_back(tileSize);
+
+  SmallVector<AffineExpr, 4> lbBoundExprs;
+  SmallVector<AffineExpr, 4> ubBoundExprs;
+  lbBoundExprs.reserve(origLbMap.getNumResults());
+  ubBoundExprs.reserve(origUbMap.getNumResults());
+
+  // Get tiling parameter as an affine expression for lb/ub.
+  AffineExpr lbTileParameter = b.getAffineSymbolExpr(origLbMap.getNumSymbols());
+  AffineExpr ubTileParameter = b.getAffineSymbolExpr(origUbMap.getNumSymbols());
+
+  // Insert lb as inter-tile ((loop IV - origlb) * tilingParameter) + origlb.
+  lbBoundExprs.push_back(
+      ((lbLoopIvExpr - origLowerBoundExpr) * lbTileParameter) +
+      origLowerBoundExpr);
+
+  // Get the origLoopStep as an affine expression.
+  AffineExpr origLoopStep = b.getAffineConstantExpr(origLoop.getStep());
+
+  // Insert ub as inter-tile ((loop IV - origlb) * tilingParameter) +
+  // (tilingParameter * origLoopStep) + origlb.
+  ubBoundExprs.push_back(
+      ((ubLoopIvExpr - origLowerBoundExpr) * ubTileParameter) +
+      (ubTileParameter * origLoopStep) + origLowerBoundExpr);
+
+  ubBoundExprs.append(origUbMap.getResults().begin(),
+                      origUbMap.getResults().end());
+
+  AffineMap lbMap =
+      AffineMap::get(origLbMap.getNumDims() + 1, origLbMap.getNumSymbols() + 1,
+                     lbBoundExprs, b.getContext());
+  newIntraTileLoop.setLowerBound(lbOperands, lbMap);
+
+  AffineMap ubMap =
+      AffineMap::get(origUbMap.getNumDims() + 1, origUbMap.getNumSymbols() + 1,
+                     ubBoundExprs, b.getContext());
+  newIntraTileLoop.setUpperBound(ubOperands, ubMap);
+
+  // Original loop step must be preserved.
+  newIntraTileLoop.setStep(origLoop.getStep());
+}
+
+/// Set lower and upper bounds of inter-tile loops for parametric tiling.
+//  TODO: Handle non-constant lower bounds.
+static void setInterTileBoundsParametric(OpBuilder &b, AffineForOp origLoop,
+                                         AffineForOp newLoop, Value tileSize) {
+  OperandRange newLbOperands = origLoop.getLowerBoundOperands();
+
+  // The lower bounds for inter-tile loops are the same as the corresponding lower
+  // bounds of original loops.
+  newLoop.setLowerBound(newLbOperands, origLoop.getLowerBoundMap());
+
+  // The new upper bound maps for inter-tile loops, assuming constant lower
+  // bounds, are now originalLowerBound + ceildiv((originalUpperBound -
+  // originalLowerBound), tilingParameter), where the tiling parameter is the
+  // respective tile size for that loop. For e.g., if the original ubmap was
+  // ()->(1024), the new map will be
+  // ()[s0]->(lb + (1024 - lb) ceildiv s0), where s0 is the tiling parameter.
+  // Therefore a new symbol operand is inserted in the map and the result
+  // expression is overwritten.
+
+  assert(origLoop.hasConstantLowerBound() &&
+         "expected input loops to have constant lower bound.");
+
+  // Get lower bound of original loop as an affine expression.
+  AffineExpr origLowerBoundExpr;
+  origLowerBoundExpr =
+      b.getAffineConstantExpr(origLoop.getConstantLowerBound());
+
+  // Add dim operands from original upper bound.
+  SmallVector<Value, 4> ubOperands;
+  AffineBound ub = origLoop.getUpperBound();
+  ubOperands.reserve(ub.getNumOperands() + 1);
+  AffineMap origUbMap = ub.getMap();
+  for (unsigned j = 0, e = origUbMap.getNumDims(); j < e; ++j)
+    ubOperands.push_back(ub.getOperand(j));
+
+  // Add symbol operands from original upper bound.
+  for (unsigned j = 0, e = origUbMap.getNumSymbols(); j < e; ++j)
+    ubOperands.push_back(ub.getOperand(origUbMap.getNumDims() + j));
+
+  // Add a new symbol operand which is the tile size for this loop.
+  ubOperands.push_back(tileSize);
+
+  // Get tiling parameter as an affine expression.
+  AffineExpr tileParameter = b.getAffineSymbolExpr(origUbMap.getNumSymbols());
+
+  SmallVector<AffineExpr, 4> boundExprs;
+  boundExprs.reserve(origUbMap.getNumResults());
+  int64_t origUpperBound;
+  AffineExpr origUpperBoundExpr;
+
+  // If upper bound for the original loop is constant, then the constant can
+  // be obtained as an affine expression straight away.
+  if (origLoop.hasConstantUpperBound()) {
+    origUpperBound = origLoop.getConstantUpperBound();
+
+    // Get original constant upper bound as an affine expression.
+    origUpperBoundExpr = b.getAffineConstantExpr(origUpperBound);
+
+    // Insert the bound as originalLowerBound + ceildiv((originalUpperBound -
+    // originalLowerBound), tilingParameter).
+    boundExprs.push_back(
+        origLowerBoundExpr +
+        (origUpperBoundExpr - origLowerBoundExpr).ceilDiv(tileParameter));
+  } else {
+    // If upper bound for the original loop is not constant then two cases
+    // are possible, although their handling is the same: 1.) The result of
+    // ubmap has only one result expression. For e.g.
+    //    affine.for %i = 5 to %ub
+    //
+    // A symbol operand is added which represents the tiling parameter. The
+    // new loop bounds here will be like ()[s0, s1] -> ((s0 - 5) ceildiv s1 + 5)
+    // where 's0' is the original upper bound and 's1' is the tiling
+    // parameter. 2.) When ubMap has more than one result expression. For e.g.
+    //    #map0 = affine_map<()[s0, s1] -> (s0, s1)
+    //    affine.for %i = 5 to min #map0()[%s0, %s1]
+    //
+    // A symbol operand is added which represents the tiling parameter. The
+    // new loop bounds will be like ()[s0, s1, s2] -> ((s0 - 5) ceildiv s2 + 5,
+    // (s1 - 5) ceildiv s2 + 5), where s2 is the tiling parameter.
+
+    // Insert the bounds as originalLowerBound + ceildiv((originalUpperBound -
+    // originalLowerBound), tilingParameter).
+    for (AffineExpr origUpperBoundExpr : origUbMap.getResults())
+      boundExprs.push_back(
+          origLowerBoundExpr +
+          (origUpperBoundExpr - origLowerBoundExpr).ceilDiv(tileParameter));
+  }
+
+  AffineMap ubMap =
+      AffineMap::get(origUbMap.getNumDims(), origUbMap.getNumSymbols() + 1,
+                     boundExprs, b.getContext());
+  newLoop.setUpperBound(ubOperands, ubMap);
+
+  // Original loop step must be preserved.
+  newLoop.setStep(origLoop.getStep());
+}
+
+/// Constructs and sets new loop bounds after tiling for the case of
+/// hyper-rectangular index sets, where the bounds of one dimension do not
+/// depend on other dimensions and tiling parameters are captured from SSA
+/// values. Bounds of each dimension can thus be treated independently,
+/// and deriving the new bounds is much simpler and faster than for the case of
+/// tiling arbitrary polyhedral shapes.
+static void constructParametricallyTiledIndexSetHyperRect(
+    MutableArrayRef<AffineForOp> origLoops,
+    MutableArrayRef<AffineForOp> newLoops, ArrayRef<Value> tileSizes) {
+  assert(!origLoops.empty() && "expected atleast one loop in band");
+  assert(origLoops.size() == tileSizes.size() &&
+         "expected tiling parameter for each loop in band.");
+
+  OpBuilder b(origLoops[0].getOperation());
+  unsigned width = origLoops.size();
+
+  // Set bounds for tile space loops.
+  for (unsigned i = 0; i < width; ++i) {
+    setInterTileBoundsParametric(b, origLoops[i], newLoops[i], tileSizes[i]);
+  }
+
+  // Set bounds for intra-tile loops.
+  for (unsigned i = 0; i < width; ++i) {
+    setIntraTileBoundsParametric(b, origLoops[i], newLoops[i],
+                                 newLoops[i + width], tileSizes[i]);
+  }
+}
+
+/// Constructs and sets new loop bounds after tiling for the case of
+/// hyper-rectangular index sets, where the bounds of one dimension do not
+/// depend on other dimensions. Bounds of each dimension can thus be treated
+/// independently, and deriving the new bounds is much simpler and faster
+/// than for the case of tiling arbitrary polyhedral shapes.
+static void
+constructTiledIndexSetHyperRect(MutableArrayRef<AffineForOp> origLoops,
+                                MutableArrayRef<AffineForOp> newLoops,
+                                ArrayRef<unsigned> tileSizes) {
+  assert(!origLoops.empty());
+  assert(origLoops.size() == tileSizes.size());
+
+  OpBuilder b(origLoops[0].getOperation());
+  unsigned width = origLoops.size();
+
+  // Bounds for tile space loops.
+  for (unsigned i = 0; i < width; i++) {
+    OperandRange newLbOperands = origLoops[i].getLowerBoundOperands();
+    OperandRange newUbOperands = origLoops[i].getUpperBoundOperands();
+    newLoops[i].setLowerBound(newLbOperands, origLoops[i].getLowerBoundMap());
+    newLoops[i].setUpperBound(newUbOperands, origLoops[i].getUpperBoundMap());
+    newLoops[i].setStep(tileSizes[i]);
+  }
+  // Bounds for intra-tile loops.
+  for (unsigned i = 0; i < width; i++) {
+    int64_t largestDiv = getLargestDivisorOfTripCount(origLoops[i]);
+    Optional<uint64_t> mayBeConstantCount = getConstantTripCount(origLoops[i]);
+    // The lower bound is just the tile-space loop.
+    AffineMap lbMap = b.getDimIdentityMap();
+    newLoops[width + i].setLowerBound(
+        /*operands=*/newLoops[i].getInductionVar(), lbMap);
+
+    // Set the upper bound.
+    if (mayBeConstantCount && mayBeConstantCount.getValue() < tileSizes[i]) {
+      // Trip count is less than the tile size: upper bound is lower bound +
+      // trip count.
+      AffineMap ubMap =
+          b.getSingleDimShiftAffineMap(mayBeConstantCount.getValue());
+      newLoops[width + i].setUpperBound(
+          /*operands=*/newLoops[i].getInductionVar(), ubMap);
+    } else if (largestDiv % tileSizes[i] != 0) {
+      // Intra-tile loop ii goes from i to min(i + tileSize, ub_i).
+      // Construct the upper bound map; the operands are the original operands
+      // with 'i' (tile-space loop) appended to it. The new upper bound map is
+      // the original one with an additional expression i + tileSize appended.
+
+      // Add dim operands from original upper bound.
+      SmallVector<Value, 4> ubOperands;
+      AffineBound ub = origLoops[i].getUpperBound();
+      ubOperands.reserve(ub.getNumOperands() + 1);
+      AffineMap origUbMap = ub.getMap();
+      for (unsigned j = 0, e = origUbMap.getNumDims(); j < e; ++j)
+        ubOperands.push_back(ub.getOperand(j));
+
+      // Add dim operand for new loop upper bound.
+      ubOperands.push_back(newLoops[i].getInductionVar());
+
+      // Add symbol operands from original upper bound.
+      for (unsigned j = 0, e = origUbMap.getNumSymbols(); j < e; ++j)
+        ubOperands.push_back(ub.getOperand(origUbMap.getNumDims() + j));
+
+      SmallVector<AffineExpr, 4> boundExprs;
+      boundExprs.reserve(1 + origUbMap.getNumResults());
+      AffineExpr dim = b.getAffineDimExpr(origUbMap.getNumDims());
+      // The new upper bound map is the original one with an additional
+      // expression i + tileSize appended.
+      boundExprs.push_back(dim + tileSizes[i]);
+      boundExprs.append(origUbMap.getResults().begin(),
+                        origUbMap.getResults().end());
+      AffineMap ubMap =
+          AffineMap::get(origUbMap.getNumDims() + 1, origUbMap.getNumSymbols(),
+                         boundExprs, b.getContext());
+      newLoops[width + i].setUpperBound(/*operands=*/ubOperands, ubMap);
+    } else {
+      // No need of the min expression.
+      AffineExpr dim = b.getAffineDimExpr(0);
+      AffineMap ubMap = AffineMap::get(1, 0, dim + tileSizes[i]);
+      newLoops[width + i].setUpperBound(newLoops[i].getInductionVar(), ubMap);
+    }
+  }
+}
+
+/// Tiles the specified band of perfectly nested loops creating tile-space loops
+/// and intra-tile loops. A band is a contiguous set of loops.
+//  TODO: handle non hyper-rectangular spaces.
+LogicalResult
+mlir::tilePerfectlyNested(MutableArrayRef<AffineForOp> input,
+                          ArrayRef<unsigned> tileSizes,
+                          SmallVectorImpl<AffineForOp> *tiledNest) {
+  performPreTilingChecks(input, tileSizes);
+
+  MutableArrayRef<AffineForOp> origLoops = input;
+  AffineForOp rootAffineForOp = origLoops[0];
+  // Note that width is at least one since band isn't empty.
+  unsigned width = input.size();
+  SmallVector<AffineForOp, 6> tiledLoops(2 * width);
+
+  // Construct a tiled loop nest without setting their bounds. Bounds are
+  // set later.
+  constructTiledLoopNest(origLoops, rootAffineForOp, width, tiledLoops);
+
+  SmallVector<Value, 8> origLoopIVs;
+  extractForInductionVars(input, &origLoopIVs);
+
+  if (failed(checkIfHyperRectangular(input, rootAffineForOp, width)))
+    return failure();
+
+  // Set loop bounds for the tiled loop nest.
+  constructTiledIndexSetHyperRect(origLoops, tiledLoops, tileSizes);
+
+  // Replace original IVs with intra-tile loop IVs.
+  for (unsigned i = 0; i < width; i++)
+    origLoopIVs[i].replaceAllUsesWith(tiledLoops[i + width].getInductionVar());
+
+  // Erase the old loop nest.
+  rootAffineForOp.erase();
+
+  if (tiledNest)
+    *tiledNest = std::move(tiledLoops);
+
+  return success();
+}
+
+/// Tiles the specified band of perfectly nested loops creating tile-space
+/// loops and intra-tile loops, using SSA values as tiling parameters. A band
+/// is a contiguous set of loops.
+//  TODO: handle non hyper-rectangular spaces.
+LogicalResult
+mlir::tilePerfectlyNestedParametric(MutableArrayRef<AffineForOp> input,
+                                    ArrayRef<Value> tileSizes,
+                                    SmallVectorImpl<AffineForOp> *tiledNest) {
+  performPreTilingChecks(input, tileSizes);
+
+  MutableArrayRef<AffineForOp> origLoops = input;
+  AffineForOp rootAffineForOp = origLoops[0];
+  // Note that width is at least one since band isn't empty.
+  unsigned width = input.size();
+  SmallVector<AffineForOp, 6> tiledLoops(2 * width);
+
+  // Construct a tiled loop nest without setting their bounds. Bounds are
+  // set later.
+  constructTiledLoopNest(origLoops, rootAffineForOp, width, tiledLoops);
+
+  SmallVector<Value, 8> origLoopIVs;
+  extractForInductionVars(input, &origLoopIVs);
+
+  if (failed(checkIfHyperRectangular(input, rootAffineForOp, width)))
+    return failure();
+
+  // Set loop bounds for the tiled loop nest.
+  constructParametricallyTiledIndexSetHyperRect(origLoops, tiledLoops,
+                                                tileSizes);
+
+  // Replace original IVs with intra-tile loop IVs.
+  for (unsigned i = 0; i < width; i++)
+    origLoopIVs[i].replaceAllUsesWith(tiledLoops[i + width].getInductionVar());
+
+  // Erase the old loop nest.
+  rootAffineForOp.erase();
+
+  if (tiledNest)
+    *tiledNest = std::move(tiledLoops);
+
+  return success();
+}
+
+/// Collect perfectly nested loops starting from `rootForOps`.  Loops are
+/// perfectly nested if each loop is the first and only non-terminator operation
+/// in the parent loop.  Collect at most `maxLoops` loops and append them to
+/// `forOps`.
 template <typename T>
 static void getPerfectlyNestedLoopsImpl(
     SmallVectorImpl<T> &forOps, T rootForOp,
@@ -452,6 +1001,20 @@ void mlir::getPerfectlyNestedLoops(SmallVectorImpl<scf::ForOp> &nestedLoops,
   getPerfectlyNestedLoopsImpl(nestedLoops, root);
 }
 
+/// Identify valid and profitable bands of loops to tile. This is currently just
+/// a temporary placeholder to test the mechanics of tiled code generation.
+/// Returns all maximal outermost perfect loop nests to tile.
+void mlir::getTileableBands(FuncOp f,
+                            std::vector<SmallVector<AffineForOp, 6>> *bands) {
+  // Get maximal perfect nest of 'affine.for' insts starting from root
+  // (inclusive).
+  for (AffineForOp forOp : f.getOps<AffineForOp>()) {
+    SmallVector<AffineForOp, 6> band;
+    getPerfectlyNestedLoops(band, forOp);
+    bands->push_back(band);
+  }
+}
+
 /// Unrolls this loop completely.
 LogicalResult mlir::loopUnrollFull(AffineForOp forOp) {
   Optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);

diff  --git a/mlir/test/Dialect/Affine/loop-tiling-parametric.mlir b/mlir/test/Dialect/Affine/loop-tiling-parametric.mlir
new file mode 100644
index 000000000000..5e9bc4a884c2
--- /dev/null
+++ b/mlir/test/Dialect/Affine/loop-tiling-parametric.mlir
@@ -0,0 +1,275 @@
+// RUN: mlir-opt %s -split-input-file -test-affine-parametric-tile | FileCheck %s
+// Test cases to test the utility introduced to tile affine for loops using
+// SSA values as tiling parameters (tile sizes). The tile sizes are expected
+// to be passed as input arguments (before any other argument) to the function
+// enclosing the loop nest. Currently, hyper-rectangular loop nests with
+// constant lower bounds are supported.
+
+// -----
+
+// CHECK-DAG: [[LBI:#map[0-9]+]] = affine_map<(d0)[s0] -> (d0 * s0)>
+// CHECK-DAG: [[UBI0:#map[0-9]+]] = affine_map<(d0)[s0] -> (d0 * s0 + s0, 256)>
+// CHECK-DAG: [[UBI1:#map[0-9]+]] = affine_map<(d0)[s0] -> (d0 * s0 + s0, 512)>
+// CHECK-DAG: [[UBI2:#map[0-9]+]] = affine_map<(d0)[s0] -> (d0 * s0 + s0, 1024)>
+// CHECK-DAG: [[UBO0:#map[0-9]+]] = affine_map<()[s0] -> (256 ceildiv s0)>
+// CHECK-DAG: [[UBO1:#map[0-9]+]] = affine_map<()[s0] -> (512 ceildiv s0)>
+// CHECK-DAG: [[UBO2:#map[0-9]+]] = affine_map<()[s0] -> (1024 ceildiv s0)>
+
+// CHECK: func @loop_tiling_3d([[ARG0:%arg[0-9]+]]: index, [[ARG1:%arg[0-9]+]]: index, [[ARG2:%arg[0-9]+]]: index)
+// CHECK-NEXT:   affine.for [[ARG3:%arg[0-9]+]] = 0 to [[UBO0]](){{.*}}[[ARG0]]
+// CHECK-NEXT:     affine.for [[ARG4:%arg[0-9]+]] = 0 to [[UBO1]](){{.*}}[[ARG1]]
+// CHECK-NEXT:       affine.for [[ARG5:%arg[0-9]+]] = 0 to [[UBO2]](){{.*}}[[ARG2]]
+// CHECK-NEXT:         affine.for %[[I:.*]] = [[LBI]]{{.*}}[[ARG3]]{{.*}}[[ARG0]]{{.*}} to min [[UBI0]]{{.*}}[[ARG3]]{{.*}}[[ARG0]]
+// CHECK-NEXT:          affine.for %[[J:.*]] = [[LBI]]{{.*}}[[ARG4]]{{.*}}[[ARG1]]{{.*}} to min [[UBI1]]{{.*}}[[ARG4]]{{.*}}[[ARG1]]
+// CHECK-NEXT:            affine.for %[[K:.*]] = [[LBI]]{{.*}}[[ARG5]]{{.*}}[[ARG2]]{{.*}} to min [[UBI2]]{{.*}}[[ARG5]]{{.*}}[[ARG2]]
+// CHECK-NEXT:              "test.foo"(%[[I]], %[[J]], %[[K]])
+func @loop_tiling_3d(%t0 : index, %t1 : index, %t2 : index) {
+  affine.for %i = 0 to 256 {
+    affine.for %j = 0 to 512 {
+      affine.for %k = 0 to 1024 {
+        "test.foo"(%i, %j, %k) : (index, index, index) -> ()
+      }
+    }
+  }
+  return
+}
+
+// -----
+
+// CHECK-DAG: [[LBI:#map[0-9]+]] = affine_map<(d0)[s0] -> (d0 * s0)>
+// CHECK-DAG: [[UBI0:#map[0-9]+]] = affine_map<(d0)[s0] -> (d0 * s0 + s0 * 4, 256)>
+// CHECK-DAG: [[UBI1:#map[0-9]+]] = affine_map<(d0)[s0] -> (d0 * s0 + s0 * 3, 512)>
+// CHECK-DAG: [[UBI2:#map[0-9]+]] = affine_map<(d0)[s0] -> (d0 * s0 + s0 * 2, 1024)>
+// CHECK-DAG: [[UBO0:#map[0-9]+]] = affine_map<()[s0] -> (256 ceildiv s0)>
+// CHECK-DAG: [[UBO1:#map[0-9]+]] = affine_map<()[s0] -> (512 ceildiv s0)>
+// CHECK-DAG: [[UBO2:#map[0-9]+]] = affine_map<()[s0] -> (1024 ceildiv s0)>
+
+// CHECK: func @loop_tiling_non_unit_step([[ARG0:%arg[0-9]+]]: index, [[ARG1:%arg[0-9]+]]: index, [[ARG2:%arg[0-9]+]]: index)
+// CHECK-NEXT:  affine.for [[ARG3:%arg[0-9]+]] = 0 to [[UBO0]](){{.*}}[[ARG0]]{{.*}}step 4
+// CHECK-NEXT:    affine.for [[ARG4:%arg[0-9]+]] = 0 to [[UBO1]](){{.*}}[[ARG1]]{{.*}} step 3
+// CHECK-NEXT:       affine.for [[ARG5:%arg[0-9]+]] = 0 to [[UBO2]](){{.*}}[[ARG2]]{{.*}} step 2
+// CHECK-NEXT:         affine.for %[[I:.*]] = [[LBI]]{{.*}}[[ARG3]]{{.*}}[[ARG0]]{{.*}} to min [[UBI0]]{{.*}}[[ARG3]]{{.*}}[[ARG0]]{{.*}} step 4
+// CHECK-NEXT:          affine.for %[[J:.*]] = [[LBI]]{{.*}}[[ARG4]]{{.*}}[[ARG1]]{{.*}} to min [[UBI1]]{{.*}}[[ARG4]]{{.*}}[[ARG1]]{{.*}} step 3
+// CHECK-NEXT:            affine.for %[[K:.*]] = [[LBI]]{{.*}}[[ARG5]]{{.*}}[[ARG2]]{{.*}} to min [[UBI2]]{{.*}}[[ARG5]]{{.*}}[[ARG2]]{{.*}} step 2
+// CHECK-NEXT:              "test.foo"(%[[I]], %[[J]], %[[K]])
+func @loop_tiling_non_unit_step(%t0: index, %t1: index, %t2: index){
+  affine.for %i = 0 to 256 step 4 {
+    affine.for %j = 0 to 512  step 3 {
+      affine.for %k = 0 to 1024 step 2 {
+        "test.foo"(%i, %j, %k) : (index, index, index) -> ()
+      }
+    }
+  }
+  return
+}
+
+// -----
+
+// CHECK-DAG: [[LBI0:#map[0-9]+]] = affine_map<(d0)[s0] -> (d0 * s0)>
+// CHECK-DAG: [[UBI0:#map[0-9]+]] = affine_map<(d0)[s0, s1, s2] -> (d0 * s2 + s2, s0, 4096 floordiv s1)>
+// CHECK-DAG: [[UBO0:#map[0-9]+]] = affine_map<()[s0, s1, s2] -> (s0 ceildiv s2, (4096 floordiv s1) ceildiv s2)>
+
+// CHECK: func @tile_loop_with_div_in_upper_bound([[ARG0:%arg[0-9]+]]: index, %{{.*}}: memref<?xi32>, %{{.*}}: index, %{{.*}}: index)
+#ub = affine_map<()[s0, s1] -> (s0, 4096 floordiv s1)>
+func @tile_loop_with_div_in_upper_bound(%t5 : index, %A : memref<? x i32>, %L : index, %U : index) {
+  %c0 = constant 0 : index
+  %M = dim %A, %c0 : memref<? x i32>
+  affine.for %i = 0 to min #ub()[%M, %U] {
+    addi %i, %i : index
+  }
+  // CHECK:  affine.for [[ARG1:%arg[0-9]+]] = 0 to min [[UBO0]]()[%{{.*}}, %{{.*}}, [[ARG0]]]
+  // CHECK-NEXT:    affine.for %[[I:.*]] = [[LBI0]]([[ARG1]]){{.*}}[[ARG0]]{{.*}} to min [[UBI0]]({{.*}})[{{.*}}, {{.*}}, [[ARG0]]]
+  // CHECK-NEXT:      addi %[[I]], %[[I]]
+  return
+}
+
+// -----
+
+// CHECK-DAG: [[LBI0:#map[0-9]+]] = affine_map<(d0)[s0] -> (d0 * s0)>
+// CHECK-DAG: [[UBI0:#map[0-9]+]] = affine_map<(d0)[s0, s1, s2] -> (d0 * s2 + s2 * 4, s0, 4096 floordiv s1)>
+// CHECK-DAG: [[UBO0:#map[0-9]+]] = affine_map<()[s0, s1, s2] -> (s0 ceildiv s2, (4096 floordiv s1) ceildiv s2)>
+
+// CHECK: func @tile_loop_with_div_in_upper_bound_non_unit_step([[ARG0:%arg[0-9]+]]: index, %{{.*}}: memref<?xi32>, %{{.*}}: index, %{{.*}}: index)
+#ub = affine_map<()[s0, s1] -> (s0, 4096 floordiv s1)>
+func @tile_loop_with_div_in_upper_bound_non_unit_step(%t5 : index, %A : memref<? x i32>, %L : index, %U : index) {
+  %c0 = constant 0 : index
+  %M = dim %A, %c0 : memref<? x i32>
+  affine.for %i = 0 to min #ub()[%M, %U] step 4 {
+    addi %i, %i : index
+  }
+  // CHECK: affine.for [[ARG1:%arg[0-9]+]] = 0 to min [[UBO0]]()[%{{.*}}, %{{.*}}, [[ARG0]]]{{.*}} step 4{{.*}}
+  // CHECK-NEXT:    affine.for %[[I:.*]] = [[LBI0]]([[ARG1]]){{.*}}[[ARG0]]{{.*}} to min [[UBI0]]({{.*}})[{{.*}}, {{.*}}, [[ARG0]]]{{.*}} step 4{{.*}}
+  // CHECK-NEXT:      addi %[[I]], %[[I]]
+  return
+}
+
+// -----
+
+// CHECK-DAG: [[LBI0:#map[0-9]+]] = affine_map<(d0)[s0] -> ((d0 - 8) * s0 + 8)>
+// CHECK-DAG: [[UBI2:#map[0-9]+]] = affine_map<(d0)[s0, s1] -> ((d0 - 8) * s1 + s1 * 4 + 8, s0 + 16)>
+// CHECK-DAG: [[UBI1:#map[0-9]+]] = affine_map<(d0)[s0, s1] -> ((d0 - 8) * s1 + s1 + 8, s0 + 16)>
+// CHECK-DAG: [[UBI0:#map[0-9]+]] = affine_map<(d0)[s0] -> ((d0 - 8) * s0 + s0 + 8, 256)>
+// CHECK-DAG: [[UBO1:#map[0-9]+]] = affine_map<()[s0, s1] -> ((s0 + 8) ceildiv s1 + 8)>
+// CHECK-DAG: [[UBO0:#map[0-9]+]] = affine_map<()[s0] -> (248 ceildiv s0 + 8)>
+
+// CHECK: func @tile_loop_with_non_zero_lb([[ARG0:%arg[0-9]+]]: index, [[ARG1:%arg[0-9]+]]: index, [[ARG2:%arg[0-9]+]]: index, %{{.*}}: index)
+// CHECK-NEXT:  affine.for [[ARG3:%arg[0-9+]]] = 8 to [[UBO0]]{{.*}}[[ARG0]]{{.*}}
+// CHECK-NEXT:    affine.for [[ARG4:%arg[0-9+]]] = 8 to [[UBO1]]{{.*}}[[ARG1]]{{.*}}
+// CHECK-NEXT:      affine.for [[ARG5:%arg[0-9+]]] = 8 to [[UBO1]]{{.*}}[[ARG2]]{{.*}} step 4
+// CHECK-NEXT:        affine.for %[[I:.*]] = [[LBI0]]([[ARG3]]){{.*}}[[ARG0]]{{.*}} to min [[UBI0]]([[ARG3]]){{.*}}[[ARG0]]{{.*}}
+// CHECK-NEXT:          affine.for %[[J:.*]] = [[LBI0]]([[ARG4]]){{.*}}[[ARG1]]{{.*}} to min [[UBI1]]([[ARG4]]){{.*}}[[ARG1]]{{.*}}
+// CHECK-NEXT:            affine.for %[[K:.*]] = [[LBI0]]([[ARG5]]){{.*}}[[ARG2]]{{.*}} to min [[UBI2]]([[ARG5]]){{.*}}[[ARG2]]{{.*}}step 4{{.*}}
+// CHECK-NEXT:              "test.foo"(%[[I]], %[[J]], %[[K]]) : (index, index, index) -> ()
+#ubi = affine_map<()[s0] -> (s0 + 16)>
+func @tile_loop_with_non_zero_lb(%t0: index, %t1: index, %t2: index, %U: index){
+  affine.for %i = 8 to 256 {
+    affine.for %j = 8 to #ubi()[%U] {
+      affine.for %k = 8 to #ubi()[%U] step 4 {
+        "test.foo"(%i, %j, %k) : (index, index, index) -> ()
+      }
+    }
+  }
+  return
+}
+
+// -----
+
+// CHECK-DAG: [[LBI:#map[0-9]+]] = affine_map<(d0)[s0] -> (d0 * s0)>
+// CHECK-DAG: [[UBI0:#map[0-9]+]] = affine_map<(d0)[s0] -> (d0 * s0 + s0, 256)>
+// CHECK-DAG: [[UBI1:#map[0-9]+]] = affine_map<(d0)[s0] -> (d0 * s0 + s0, 250)>
+// CHECK-DAG: [[UBO0:#map[0-9]+]] = affine_map<()[s0] -> (256 ceildiv s0)>
+// CHECK-DAG: [[UBO1:#map[0-9]+]] = affine_map<()[s0] -> (250 ceildiv s0)>
+
+// CHECK: func @simple_matmul([[ARG0:%arg[0-9]+]]: index, [[ARG1:%arg[0-9]+]]: index, [[ARG2:%arg[0-9]+]]: index{{.*}})
+// CHECK-NEXT:   affine.for [[ARG3:%arg[0-9]+]] = 0 to [[UBO0]](){{.*}}[[ARG0]]{{.*}}
+// CHECK-NEXT:     affine.for [[ARG4:%arg[0-9]+]] = 0 to [[UBO0]](){{.*}}[[ARG1]]{{.*}}
+// CHECK-NEXT:       affine.for [[ARG5:%arg[0-9]+]] = 0 to [[UBO1]](){{.*}}[[ARG2]]{{.*}}
+// CHECK-NEXT:         affine.for %[[I:.*]] = [[LBI]]{{.*}}[[ARG3]]{{.*}}[[ARG0]]{{.*}} to min [[UBI0]]{{.*}}[[ARG3]]{{.*}}[[ARG0]]{{.*}}
+// CHECK-NEXT:          affine.for %[[J:.*]] = [[LBI]]{{.*}}[[ARG4]]{{.*}}[[ARG1]]{{.*}} to min [[UBI0]]{{.*}}[[ARG4]]{{.*}}[[ARG1]]{{.*}}
+// CHECK-NEXT:            affine.for %[[K:.*]] = [[LBI]]{{.*}}[[ARG5]]{{.*}}[[ARG2]]{{.*}} to min [[UBI1]]{{.*}}[[ARG5]]{{.*}}[[ARG2]]{{.*}}
+// CHECK-NEXT:                 affine.load %{{.*}}[%[[I]], %[[K]]]
+// CHECK-NEXT:                 affine.load %{{.*}}[%[[K]], %[[J]]]
+// CHECK-NEXT:                 affine.load %{{.*}}[%[[I]], %[[J]]]
+// CHECK-NEXT:                 mulf %{{.*}}
+// CHECK-NEXT:                 addf %{{.*}}
+// CHECK-NEXT:                 affine.store %{{.*}}[%[[I]], %[[J]]]
+func @simple_matmul(%t6 : index, %t7 : index, %t8 : index, %arg0: memref<256x256xvector<64xf32>>, %arg1: memref<256x256xvector<64xf32>>, %arg2: memref<256x256xvector<64xf32>>) -> memref<256x256xvector<64xf32>> {
+  affine.for %i = 0 to 256 {
+    affine.for %j = 0 to 256 {
+      affine.for %k = 0 to 250 {
+        %l = affine.load %arg0[%i, %k] : memref<256x256xvector<64xf32>>
+        %r = affine.load %arg1[%k, %j] : memref<256x256xvector<64xf32>>
+        %o = affine.load %arg2[%i, %j] : memref<256x256xvector<64xf32>>
+        %m = mulf %l, %r : vector<64xf32>
+        %a = addf %o, %m : vector<64xf32>
+        affine.store %a, %arg2[%i, %j] : memref<256x256xvector<64xf32>>
+      }
+    }
+  }
+  return %arg2 : memref<256x256xvector<64xf32>>
+}
+
+// -----
+
+// CHECK-DAG: [[LBI0:#map[0-9]+]] = affine_map<(d0)[s0] -> (d0 * s0)>
+// CHECK-DAG: [[UBI0:#map[0-9]+]] = affine_map<(d0)[s0, s1] -> (d0 * s1 + s1, s0)>
+// CHECK-DAG: [[UBO0:#map[0-9]+]] = affine_map<()[s0, s1] -> (s0 ceildiv s1)>
+
+// CHECK: func @tile_with_symbolic_loop_upper_bounds([[ARG0:%arg[0-9]+]]: index, [[ARG1:%arg[0-9]+]]: index{{.*}}){{.*}}
+// CHECK:        affine.for [[ARG2:%arg[0-9]+]] = 0 to [[UBO0]](){{.*}}[[ARG0]]{{.*}}
+// CHECK-NEXT:     affine.for [[ARG3:%arg[0-9]+]] = 0 to [[UBO0]](){{.*}}[[ARG1]]{{.*}}
+// CHECK-NEXT:       affine.for %[[I0:.*]] = [[LBI0]]{{.*}}[[ARG2]]{{.*}}[[ARG0]]{{.*}} to min [[UBI0]]{{.*}}[[ARG2]]{{.*}}[[ARG0]]{{.*}}
+// CHECK-NEXT:         affine.for %[[I1:.*]] = [[LBI0]]{{.*}}[[ARG3]]{{.*}}[[ARG1]]{{.*}} to min [[UBI0]]{{.*}}[[ARG3]]{{.*}}[[ARG1]]{{.*}}
+// CHECK-NEXT:           affine.store %{{.*}}, %{{.*}}[%[[I0]], %[[I1]]] : memref<?x?xf32>
+// CHECK-NEXT:           affine.for %[[I2:.*]] = 0 to %{{.*}} {
+// CHECK-NEXT:             affine.load %{{.*}}%[[I0]], %[[I2]]
+// CHECK-NEXT:             affine.load %{{.*}}%[[I2]], %[[I1]]
+// CHECK-NEXT:             mulf
+// CHECK-NEXT:             affine.load %{{.*}}%[[I0]], %[[I1]]
+// CHECK-NEXT:             addf
+// CHECK-NEXT:             affine.store %{{.*}}%[[I0]], %[[I1]]
+func @tile_with_symbolic_loop_upper_bounds(%t9 : index, %t10: index, %arg0: memref<?x?xf32>, %arg1: memref<?x?xf32>, %arg2: memref<?x?xf32>) {
+  %cst = constant 0.000000e+00 : f32
+  %c0 = constant 0 : index
+  %0 = dim %arg0, %c0 : memref<?x?xf32>
+  affine.for %i0 = 0 to %0 {
+    affine.for %i1 = 0 to %0 {
+      affine.store %cst, %arg2[%i0, %i1] : memref<?x?xf32>
+      affine.for %i2 = 0 to %0 {
+        %1 = affine.load %arg0[%i0, %i2] : memref<?x?xf32>
+        %2 = affine.load %arg1[%i2, %i1] : memref<?x?xf32>
+        %3 = mulf %1, %2 : f32
+        %4 = affine.load %arg2[%i0, %i1] : memref<?x?xf32>
+        %5 = addf %4, %3 : f32
+        affine.store %5, %arg2[%i0, %i1] : memref<?x?xf32>
+      }
+    }
+  }
+  return
+}
+
+// -----
+
+// CHECK-DAG: [[LBI0:#map[0-9]+]] = affine_map<(d0)[s0] -> (d0 * s0)>
+// CHECK-DAG: [[UBI0:#map[0-9]+]] = affine_map<(d0)[s0, s1, s2] -> (d0 * s2 + s2, s0 + s1)>
+// CHECK-DAG: [[UBO0:#map[0-9]+]] = affine_map<()[s0, s1, s2] -> ((s0 + s1) ceildiv s2)>
+
+// CHECK: func @tile_with_loop_upper_bounds_in_two_symbols([[ARG0:%arg[0-9]+]]: index{{.*}}){{.*}}
+func @tile_with_loop_upper_bounds_in_two_symbols(%t11 : index, %arg0: memref<?xf32>, %limit: index) {
+  %c0 = constant 0 : index
+  %dim0 = dim %arg0, %c0 : memref<?xf32>
+  affine.for %i0 = 0 to affine_map<()[s0, s1] -> (s0 + s1)> ()[%dim0, %limit] {
+    %v0 = affine.load %arg0[%i0] : memref<?xf32>
+  }
+  // CHECK:  affine.for [[ARG1:%arg[0-9]+]] = 0 to [[UBO0]]()[%{{.*}}, %{{.*}}, [[ARG0]]]
+  // CHECK-NEXT:    affine.for %[[I:.*]] = [[LBI0]]([[ARG1]]){{.*}}[[ARG0]]{{.*}} to min [[UBI0]]([[ARG1]])[{{.*}}, {{.*}}, [[ARG0]]]
+  // CHECK-NEXT:      affine.load %{{.*}}[%[[I]]]
+  return
+}
+
+// -----
+
+// CHECK-DAG: [[LBI0:#map[0-9]+]] = affine_map<(d0)[s0] -> (d0 * s0)>
+// CHECK-DAG: [[UBI1:#map[0-9]+]] = affine_map<(d0, d1)[s0, s1] -> (d1 * s1 + s1, d0 + s0 + 4)>
+// CHECK-DAG: [[UBI0:#map[0-9]+]] = affine_map<(d0, d1)[s0, s1] -> (d1 * s1 + s1, d0 + s0 + 2)>
+// CHECK-DAG: [[LBO0:#map[0-9]+]] = affine_map<() -> (0)>
+// CHECK-DAG: [[UBO1:#map[0-9]+]] = affine_map<(d0)[s0, s1] -> ((d0 + s0 + 4) ceildiv s1)>
+// CHECK-DAG: [[UBO0:#map[0-9]+]] = affine_map<(d0)[s0, s1] -> ((d0 + s0 + 2) ceildiv s1)>
+
+// CHECK: func @tile_with_upper_bounds_in_dimensions_and_symbols([[ARG0:%arg[0-9]+]]: index, [[ARG1:%arg[0-9]+]]: index, [[ARG2:%arg[0-9]+]]: index, [[ARG3:%arg[0-9]+]]: index{{.*}}){{.*}}
+// CHECK-NEXT: affine.for [[ARG4:%arg[0-9]+]] = 0 to [[UBO0]]({{.*}}){{.*}}[[ARG0]]
+// CHECK-NEXT:   affine.for [[ARG5:%arg[0-9]+]] = 0 to [[UBO1]]({{.*}}){{.*}}[[ARG1]]
+// CHECK-NEXT:     affine.for {{.*}} = [[LBI0]]([[ARG4]]){{.*}}[[ARG0]]{{.*}} to min [[UBI0]]({{.*}}, [[ARG4]]){{.*}}[[ARG0]]{{.*}}
+// CHECK-NEXT:       affine.for {{.*}} = [[LBI0]]([[ARG5]]){{.*}}[[ARG1]]{{.*}} to min [[UBI1]]({{.*}}, [[ARG5]]){{.*}}[[ARG1]]{{.*}}
+func @tile_with_upper_bounds_in_dimensions_and_symbols(%t12 : index, %t13 :index, %M: index, %N:  index, %K: index) {
+  affine.for %i = 0 to affine_map<(d0)[s0] -> (d0 + s0 + 2)>(%M)[%K] {
+    affine.for %j = 0 to affine_map<(d0)[s0] -> (d0 + s0 + 4)>(%N)[%K] {
+      "test.foo" () : () -> ()
+    }
+  }
+  return
+}
+
+// -----
+
+// CHECK-DAG: [[LBI0:#map[0-9]+]] = affine_map<(d0)[s0] -> (d0 * s0)>
+// CHECK-DAG: [[UBI1:#map[0-9]+]] = affine_map<(d0, d1)[s0, s1] -> (d1 * s1 + s1 * 4, d0 + s0 + 4)>
+// CHECK-DAG: [[UBI0:#map[0-9]+]] = affine_map<(d0, d1)[s0, s1] -> (d1 * s1 + s1 * 2, d0 + s0 + 2)>
+// CHECK-DAG: [[LBO0:#map[0-9]+]] = affine_map<() -> (0)>
+// CHECK-DAG: [[UBO1:#map[0-9]+]] = affine_map<(d0)[s0, s1] -> ((d0 + s0 + 4) ceildiv s1)>
+// CHECK-DAG: [[UBO0:#map[0-9]+]] = affine_map<(d0)[s0, s1] -> ((d0 + s0 + 2) ceildiv s1)>
+
+// CHECK: func @tile_with_upper_bounds_in_dimensions_and_symbols_non_unit_steps
+// CHECK-SAME: ([[ARG0:%arg[0-9]+]]: index, [[ARG1:%arg[0-9]+]]: index, [[ARG2:%arg[0-9]+]]: index, [[ARG3:%arg[0-9]+]]: index{{.*}}){{.*}}
+// CHECK-NEXT: affine.for [[ARG4:%arg[0-9]+]] = 0 to [[UBO0]]({{.*}}){{.*}}[[ARG0]]{{.*}} step 2{{.*}}
+// CHECK-NEXT:   affine.for [[ARG5:%arg[0-9]+]] = 0 to [[UBO1]]({{.*}}){{.*}}[[ARG1]]{{.*}} step 4{{.*}}
+// CHECK-NEXT:     affine.for {{.*}} = [[LBI0]]([[ARG4]]){{.*}}[[ARG0]]{{.*}} to min [[UBI0]]({{.*}}, [[ARG4]]){{.*}}[[ARG0]]{{.*}} step 2{{.*}}
+// CHECK-NEXT:       affine.for {{.*}} = [[LBI0]]([[ARG5]]){{.*}}[[ARG1]]{{.*}} to min [[UBI1]]({{.*}}, [[ARG5]]){{.*}}[[ARG1]]{{.*}} step 4{{.*}}
+func @tile_with_upper_bounds_in_dimensions_and_symbols_non_unit_steps(%t12 : index, %t13 :index, %M: index, %N :  index, %K: index) {
+  affine.for %i = 0 to affine_map<(d0)[s0] -> (d0 + s0 + 2)>(%M)[%K] step 2 {
+    affine.for %j = 0 to affine_map<(d0)[s0] -> (d0 + s0 + 4)>(%N)[%K] step 4 {
+      "test.foo" () : () -> ()
+    }
+  }
+  return
+}

diff  --git a/mlir/test/lib/Transforms/CMakeLists.txt b/mlir/test/lib/Transforms/CMakeLists.txt
index 3ac1e7c55235..99424f1c9c06 100644
--- a/mlir/test/lib/Transforms/CMakeLists.txt
+++ b/mlir/test/lib/Transforms/CMakeLists.txt
@@ -1,6 +1,7 @@
 # Exclude tests from libMLIR.so
 add_mlir_library(MLIRTestTransforms
   TestAllReduceLowering.cpp
+  TestAffineLoopParametricTiling.cpp
   TestBufferPlacement.cpp
   TestExpandTanh.cpp
   TestCallGraph.cpp

diff  --git a/mlir/test/lib/Transforms/TestAffineLoopParametricTiling.cpp b/mlir/test/lib/Transforms/TestAffineLoopParametricTiling.cpp
new file mode 100644
index 000000000000..5d369e62ae43
--- /dev/null
+++ b/mlir/test/lib/Transforms/TestAffineLoopParametricTiling.cpp
@@ -0,0 +1,90 @@
+//= TestAffineLoopParametricTiling.cpp -- Parametric Affine loop tiling pass =//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a test pass to test parametric tiling of perfectly
+// nested affine for loops.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Affine/Passes.h"
+#include "mlir/Transforms/LoopUtils.h"
+
+using namespace mlir;
+
+#define DEBUG_TYPE "test-affine-parametric-tile"
+
+namespace {
+
+struct TestAffineLoopParametricTiling
+    : public PassWrapper<TestAffineLoopParametricTiling, FunctionPass> {
+  void runOnFunction() override;
+};
+} // end anonymous namespace
+
+/// Checks if the function enclosing the loop nest has any arguments passed to
+/// it, which can be used as tiling parameters. Assumes that at least 'n'
+/// arguments are passed, where 'n' is the number of loops in the loop nest.
+static void checkIfTilingParametersExist(ArrayRef<AffineForOp> band) {
+  assert(!band.empty() && "no loops in input band");
+  AffineForOp topLoop = band[0];
+
+  if (FuncOp funcOp = dyn_cast<FuncOp>(topLoop.getParentOp()))
+    assert(funcOp.getNumArguments() >= band.size() && "Too few tile sizes");
+}
+
+/// Captures tiling parameters, which are expected to be passed as arguments
+/// to the function enclosing the loop nest. Also checks if the required
+/// parameters are of index type. This approach is temporary for testing
+/// purposes.
+static void getTilingParameters(ArrayRef<AffineForOp> band,
+                                SmallVectorImpl<Value> &tilingParameters) {
+  AffineForOp topLoop = band[0];
+  Region *funcOpRegion = topLoop.getParentRegion();
+  unsigned nestDepth = band.size();
+
+  for (BlockArgument blockArgument :
+       funcOpRegion->getArguments().take_front(nestDepth)) {
+    if (blockArgument.getArgNumber() < nestDepth) {
+      assert(blockArgument.getType().isIndex() &&
+             "expected tiling parameters to be of index type.");
+      tilingParameters.push_back(blockArgument);
+    }
+  }
+}
+
+void TestAffineLoopParametricTiling::runOnFunction() {
+  // Bands of loops to tile.
+  std::vector<SmallVector<AffineForOp, 6>> bands;
+  getTileableBands(getFunction(), &bands);
+
+  // Tile each band.
+  for (SmallVectorImpl<AffineForOp> &band : bands) {
+    // Capture the tiling parameters from the arguments to the function
+    // enclosing this loop nest.
+    SmallVector<AffineForOp, 6> tiledNest;
+    SmallVector<Value, 6> tilingParameters;
+    // Check if tiling parameters are present.
+    checkIfTilingParametersExist(band);
+
+    // Get function arguments as tiling parameters.
+    getTilingParameters(band, tilingParameters);
+
+    if (failed(
+            tilePerfectlyNestedParametric(band, tilingParameters, &tiledNest)))
+      return signalPassFailure();
+  }
+}
+
+namespace mlir {
+void registerTestAffineLoopParametricTilingPass() {
+  PassRegistration<TestAffineLoopParametricTiling>(
+      "test-affine-parametric-tile",
+      "Tile affine loops using SSA values as tile sizes");
+}
+} // namespace mlir

diff  --git a/mlir/tools/mlir-opt/mlir-opt.cpp b/mlir/tools/mlir-opt/mlir-opt.cpp
index e46327aa6399..93934d40fe59 100644
--- a/mlir/tools/mlir-opt/mlir-opt.cpp
+++ b/mlir/tools/mlir-opt/mlir-opt.cpp
@@ -41,6 +41,7 @@ void registerSimpleParametricTilingPass();
 void registerSliceAnalysisTestPass();
 void registerSymbolTestPasses();
 void registerTestAffineDataCopyPass();
+void registerTestAffineLoopParametricTilingPass();
 void registerTestAffineLoopUnswitchingPass();
 void registerTestAllReduceLoweringPass();
 void registerTestBufferPlacementPreparationPass();
@@ -104,6 +105,7 @@ void registerTestPasses() {
 #if MLIR_ROCM_CONVERSIONS_ENABLED
   registerTestConvertGPUKernelToHsacoPass();
 #endif
+  registerTestAffineLoopParametricTilingPass();
   registerTestBufferPlacementPreparationPass();
   registerTestDominancePass();
   registerTestFunc();


        


More information about the Mlir-commits mailing list