[Mlir-commits] [mlir] [mlir][SCF] Use Affine ops for indexing math. (PR #108450)

Thu Sep 12 23:27:14 PDT 2024

https://github.com/MaheshRavishankar updated https://github.com/llvm/llvm-project/pull/108450

>From 12811be4db909fe4c040405343fb089c9594175e Mon Sep 17 00:00:00 2001
From: MaheshRavishankar <mahesh.ravishankar at gmail.com>
Date: Thu, 12 Sep 2024 13:37:02 -0700
Subject: [PATCH] [mlir][SCF] Use Affine ops for indexing math.

For induction variables of index type, the indexing math is better
represented using affine ops such as `affine.delinearize_index`.

This also further demonstrates that some of these `affine` ops might
need to move to a different dialect. For one, these ops only support
`IndexType` when they should be able to work with any integer type.

Signed-off-by: MaheshRavishankar <mahesh.ravishankar at gmail.com>
---
 mlir/include/mlir/Dialect/Affine/Passes.td    |   2 +-
 .../mlir/Dialect/SCF/Transforms/Passes.td     |   1 +
 .../SCF/Transforms/ParallelLoopCollapsing.cpp |   1 +
 mlir/lib/Dialect/SCF/Utils/Utils.cpp          |  75 ++++-
 mlir/test/Dialect/Affine/loop-coalescing.mlir | 262 ++++++++----------
 .../Dialect/SCF/transform-op-coalesce.mlir    |  73 ++---
 .../Transforms/parallel-loop-collapsing.mlir  |   7 +-
 .../single-parallel-loop-collapsing.mlir      |  15 +-
 8 files changed, 230 insertions(+), 206 deletions(-)

diff --git a/mlir/include/mlir/Dialect/Affine/Passes.td b/mlir/include/mlir/Dialect/Affine/Passes.td
index 1036e93a039240..b08e803345f76e 100644
--- a/mlir/include/mlir/Dialect/Affine/Passes.td
+++ b/mlir/include/mlir/Dialect/Affine/Passes.td
@@ -394,7 +394,7 @@ def LoopCoalescing : Pass<"affine-loop-coalescing", "func::FuncOp"> {
   let summary = "Coalesce nested loops with independent bounds into a single "
                 "loop";
   let constructor = "mlir::affine::createLoopCoalescingPass()";
-  let dependentDialects = ["arith::ArithDialect"];
+  let dependentDialects = ["affine::AffineDialect","arith::ArithDialect"];
 }
 
 def SimplifyAffineStructures : Pass<"affine-simplify-structures", "func::FuncOp"> {
diff --git a/mlir/include/mlir/Dialect/SCF/Transforms/Passes.td b/mlir/include/mlir/Dialect/SCF/Transforms/Passes.td
index 9b29affb97c432..53d1ae10dc87d8 100644
--- a/mlir/include/mlir/Dialect/SCF/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/SCF/Transforms/Passes.td
@@ -56,6 +56,7 @@ def SCFParallelLoopFusion : Pass<"scf-parallel-loop-fusion"> {
 def TestSCFParallelLoopCollapsing : Pass<"test-scf-parallel-loop-collapsing"> {
   let summary = "Test parallel loops collapsing transformation";
   let constructor = "mlir::createTestSCFParallelLoopCollapsingPass()";
+  let dependentDialects = ["affine::AffineDialect"];
   let description = [{
       This pass is purely for testing the scf::collapseParallelLoops
       transformation. The transformation does not have opinions on how a
diff --git a/mlir/lib/Dialect/SCF/Transforms/ParallelLoopCollapsing.cpp b/mlir/lib/Dialect/SCF/Transforms/ParallelLoopCollapsing.cpp
index 6ba7020e86fa67..358a3b38a4cd32 100644
--- a/mlir/lib/Dialect/SCF/Transforms/ParallelLoopCollapsing.cpp
+++ b/mlir/lib/Dialect/SCF/Transforms/ParallelLoopCollapsing.cpp
@@ -8,6 +8,7 @@
 
 #include "mlir/Dialect/SCF/Transforms/Passes.h"
 
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/Dialect/SCF/Utils/Utils.h"
 #include "mlir/Transforms/RegionUtils.h"
diff --git a/mlir/lib/Dialect/SCF/Utils/Utils.cpp b/mlir/lib/Dialect/SCF/Utils/Utils.cpp
index a794a121d6267b..2b643893ef46d2 100644
--- a/mlir/lib/Dialect/SCF/Utils/Utils.cpp
+++ b/mlir/lib/Dialect/SCF/Utils/Utils.cpp
@@ -12,6 +12,7 @@
 
 #include "mlir/Dialect/SCF/Utils/Utils.h"
 #include "mlir/Analysis/SliceAnalysis.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/Arith/Utils/Utils.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
@@ -671,9 +672,26 @@ LogicalResult mlir::loopUnrollJamByFactor(scf::ForOp forOp,
   return success();
 }
 
+/// Normalizes an index-typed loop to offset 0, stride 1, and trip-count size.
+static Range emitNormalizedLoopBoundsForIndexType(RewriterBase &rewriter,
+                                                  Location loc, OpFoldResult lb,
+                                                  OpFoldResult ub,
+                                                  OpFoldResult step) {
+  AffineExpr s0, s1, s2;
+  bindSymbols(rewriter.getContext(), s0, s1, s2);
+  // Trip count: ceildiv(ub - lb, step), with s0 = lb, s1 = ub, s2 = step.
+  AffineExpr e = (s1 - s0).ceilDiv(s2);
+  OpFoldResult size =
+      affine::makeComposedFoldedAffineApply(rewriter, loc, e, {lb, ub, step});
+  return {rewriter.getIndexAttr(0), size, rewriter.getIndexAttr(1)};
+}
+
 Range mlir::emitNormalizedLoopBounds(RewriterBase &rewriter, Location loc,
                                      OpFoldResult lb, OpFoldResult ub,
                                      OpFoldResult step) {
+  if (getType(lb) == rewriter.getIndexType()) {
+    return emitNormalizedLoopBoundsForIndexType(rewriter, loc, lb, ub, step);
+  }
   // For non-index types, generate `arith` instructions
   // Check if the loop is already known to have a constant zero lower bound or
   // a constant one step.
@@ -714,9 +732,35 @@ Range mlir::emitNormalizedLoopBounds(RewriterBase &rewriter, Location loc,
   return {newLowerBound, newUpperBound, newStep};
 }
 
+/// Index-type variant: redirects uses of `normalizedIv` to `iv * step + lb`.
+static void denormalizeInductionVariableForIndexType(RewriterBase &rewriter,
+                                                     Location loc,
+                                                     Value normalizedIv,
+                                                     OpFoldResult origLb,
+                                                     OpFoldResult origStep) {
+  AffineExpr iv, lb, step;
+  bindDims(rewriter.getContext(), iv);
+  bindSymbols(rewriter.getContext(), lb, step);
+  OpFoldResult folded = affine::makeComposedFoldedAffineApply(
+      rewriter, loc, iv * step + lb,
+      ArrayRef<OpFoldResult>{normalizedIv, origLb, origStep});
+  Value newIv = getValueOrCreateConstantIndexOp(rewriter, loc, folded);
+  // Exclude the op defining the new IV from the replacement below.
+  SmallPtrSet<Operation *, 1> keep;
+  bool isIdentity =
+      isConstantIntValue(origLb, 0) && isConstantIntValue(origStep, 1);
+  if (Operation *def = isIdentity ? nullptr : newIv.getDefiningOp())
+    keep.insert(def);
+  rewriter.replaceAllUsesExcept(normalizedIv, newIv, keep);
+}
+
 void mlir::denormalizeInductionVariable(RewriterBase &rewriter, Location loc,
                                         Value normalizedIv, OpFoldResult origLb,
                                         OpFoldResult origStep) {
+  if (getType(origLb) == rewriter.getIndexType()) {
+    return denormalizeInductionVariableForIndexType(rewriter, loc, normalizedIv,
+                                                    origLb, origStep);
+  }
   Value denormalizedIv;
   SmallPtrSet<Operation *, 2> preserve;
   bool isStepOne = isConstantIntValue(origStep, 1);
@@ -739,10 +783,29 @@ void mlir::denormalizeInductionVariable(RewriterBase &rewriter, Location loc,
   rewriter.replaceAllUsesExcept(normalizedIv, denormalizedIv, preserve);
 }
 
+/// Multiplies `values` together via folded affine.apply ops, starting from 1.
+static OpFoldResult getProductOfIndexes(RewriterBase &rewriter, Location loc,
+                                        ArrayRef<OpFoldResult> values) {
+  assert(!values.empty() && "unexpected empty array");
+  AffineExpr s0, s1;
+  bindSymbols(rewriter.getContext(), s0, s1);
+  AffineExpr mul = s0 * s1;
+  OpFoldResult products = rewriter.getIndexAttr(1);
+  for (OpFoldResult v : values)
+    products = affine::makeComposedFoldedAffineApply(
+        rewriter, loc, mul, ArrayRef<OpFoldResult>{products, v});
+  return products;
+}
+
 /// Helper function to multiply a sequence of values.
 static Value getProductOfIntsOrIndexes(RewriterBase &rewriter, Location loc,
                                        ArrayRef<Value> values) {
   assert(!values.empty() && "unexpected empty list");
+  if (getType(values.front()) == rewriter.getIndexType()) {
+    SmallVector<OpFoldResult> ofrs = getAsOpFoldResult(values);
+    OpFoldResult product = getProductOfIndexes(rewriter, loc, ofrs);
+    return getValueOrCreateConstantIndexOp(rewriter, loc, product);
+  }
   std::optional<Value> productOf;
   for (auto v : values) {
     auto vOne = getConstantIntValue(v);
@@ -757,7 +820,7 @@ static Value getProductOfIntsOrIndexes(RewriterBase &rewriter, Location loc,
   if (!productOf) {
     productOf = rewriter
                     .create<arith::ConstantOp>(
-                        loc, rewriter.getOneAttr(values.front().getType()))
+                        loc, rewriter.getOneAttr(getType(values.front())))
                     .getResult();
   }
   return productOf.value();
@@ -774,6 +837,16 @@ static Value getProductOfIntsOrIndexes(RewriterBase &rewriter, Location loc,
 static std::pair<SmallVector<Value>, SmallPtrSet<Operation *, 2>>
 delinearizeInductionVariable(RewriterBase &rewriter, Location loc,
                              Value linearizedIv, ArrayRef<Value> ubs) {
+
+  if (linearizedIv.getType() == rewriter.getIndexType()) {
+    Operation *delinearizedOp =
+        rewriter.create<affine::AffineDelinearizeIndexOp>(loc, linearizedIv,
+                                                          ubs);
+    auto resultVals = llvm::map_to_vector(
+        delinearizedOp->getResults(), [](OpResult r) -> Value { return r; });
+    return {resultVals, SmallPtrSet<Operation *, 2>{delinearizedOp}};
+  }
+
   SmallVector<Value> delinearizedIvs(ubs.size());
   SmallPtrSet<Operation *, 2> preservedUsers;
 
diff --git a/mlir/test/Dialect/Affine/loop-coalescing.mlir b/mlir/test/Dialect/Affine/loop-coalescing.mlir
index 45dd299295f640..f6e7b21bc66aba 100644
--- a/mlir/test/Dialect/Affine/loop-coalescing.mlir
+++ b/mlir/test/Dialect/Affine/loop-coalescing.mlir
@@ -1,14 +1,15 @@
-// RUN: mlir-opt -split-input-file -allow-unregistered-dialect -affine-loop-coalescing --cse %s | FileCheck %s
+// RUN: mlir-opt -split-input-file -allow-unregistered-dialect -affine-loop-coalescing --cse --mlir-print-local-scope %s | FileCheck %s
 
 // CHECK-LABEL: @one_3d_nest
 func.func @one_3d_nest() {
   // Capture original bounds.  Note that for zero-based step-one loops, the
   // upper bound is also the number of iterations.
-  // CHECK: %[[orig_lb:.*]] = arith.constant 0
-  // CHECK: %[[orig_step:.*]] = arith.constant 1
-  // CHECK: %[[orig_ub_k:.*]] = arith.constant 3
-  // CHECK: %[[orig_ub_i:.*]] = arith.constant 42
-  // CHECK: %[[orig_ub_j:.*]] = arith.constant 56
+  // CHECK-DAG: %[[orig_lb:.*]] = arith.constant 0
+  // CHECK-DAG: %[[orig_step:.*]] = arith.constant 1
+  // CHECK-DAG: %[[orig_ub_k:.*]] = arith.constant 3
+  // CHECK-DAG: %[[orig_ub_i:.*]] = arith.constant 42
+  // CHECK-DAG: %[[orig_ub_j:.*]] = arith.constant 56
+  // CHECK-DAG: %[[range:.*]] = arith.constant 7056
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %c2 = arith.constant 2 : index
@@ -16,9 +17,6 @@ func.func @one_3d_nest() {
   %c42 = arith.constant 42 : index
   %c56 = arith.constant 56 : index
   // The range of the new scf.
-  // CHECK:     %[[partial_range:.*]] = arith.muli %[[orig_ub_i]], %[[orig_ub_j]]
-  // CHECK-NEXT:%[[range:.*]] = arith.muli %[[partial_range]], %[[orig_ub_k]]
-
   // Updated loop bounds.
   // CHECK: scf.for %[[i:.*]] = %[[orig_lb]] to %[[range]] step %[[orig_step]]
   scf.for %i = %c0 to %c42 step %c1 {
@@ -26,13 +24,11 @@ func.func @one_3d_nest() {
     // CHECK-NOT: scf.for
 
     // Reconstruct original IVs from the linearized one.
-    // CHECK: %[[orig_k:.*]] = arith.remsi %[[i]], %[[orig_ub_k]]
-    // CHECK: %[[div:.*]] = arith.divsi %[[i]], %[[orig_ub_k]]
-    // CHECK: %[[orig_j:.*]] = arith.remsi %[[div]], %[[orig_ub_j]]
-    // CHECK: %[[orig_i:.*]] = arith.divsi %[[div]], %[[orig_ub_j]]
+    // CHECK: %[[delinearize:.+]]:3 = affine.delinearize_index %[[i]]
+    // CHECK-SAME: into (%[[orig_ub_i]], %[[orig_ub_j]], %[[orig_ub_k]])
     scf.for %j = %c0 to %c56 step %c1 {
       scf.for %k = %c0 to %c3 step %c1 {
-        // CHECK: "use"(%[[orig_i]], %[[orig_j]], %[[orig_k]])
+        // CHECK: "use"(%[[delinearize]]#0, %[[delinearize]]#1, %[[delinearize]]#2)
         "use"(%i, %j, %k) : (index, index, index) -> ()
       }
     }
@@ -40,6 +36,8 @@ func.func @one_3d_nest() {
   return
 }
 
+// -----
+
 // Check that there is no chasing the replacement of value uses by ensuring
 // multiple uses of loop induction variables get rewritten to the same values.
 
@@ -52,13 +50,10 @@ func.func @multi_use() {
   scf.for %i = %c1 to %c10 step %c1 {
     scf.for %j = %c1 to %c10 step %c1 {
       scf.for %k = %c1 to %c10 step %c1 {
-        // CHECK: %[[k_unshifted:.*]] = arith.remsi %[[iv]], %[[k_extent:.*]]
-        // CHECK: %[[ij:.*]] = arith.divsi %[[iv]], %[[k_extent]]
-        // CHECK: %[[j_unshifted:.*]] = arith.remsi %[[ij]], %[[j_extent:.*]]
-        // CHECK: %[[i_unshifted:.*]] = arith.divsi %[[ij]], %[[j_extent]]
-        // CHECK: %[[k:.*]] = arith.addi %[[k_unshifted]]
-        // CHECK: %[[j:.*]] = arith.addi %[[j_unshifted]]
-        // CHECK: %[[i:.*]] = arith.addi %[[i_unshifted]]
+        // CHECK: %[[delinearize:.+]]:3 = affine.delinearize_index %[[iv]]
+        // CHECK: %[[k:.*]] = affine.apply affine_map<(d0) -> (d0 + 1)>(%[[delinearize]]#2)
+        // CHECK: %[[j:.*]] = affine.apply affine_map<(d0) -> (d0 + 1)>(%[[delinearize]]#1)
+        // CHECK: %[[i:.*]] = affine.apply affine_map<(d0) -> (d0 + 1)>(%[[delinearize]]#0)
 
         // CHECK: "use1"(%[[i]], %[[j]], %[[k]])
         "use1"(%i,%j,%k) : (index,index,index) -> ()
@@ -72,12 +67,20 @@ func.func @multi_use() {
   return
 }
 
+// -----
+
 func.func @unnormalized_loops() {
-  // CHECK: %[[orig_step_i:.*]] = arith.constant 2
+  // Normalized lower bound and step for the outer scf.
+  // CHECK-DAG: %[[lb_i:.*]] = arith.constant 0
+  // CHECK-DAG: %[[step_i:.*]] = arith.constant 1
+  // CHECK-DAG: %[[orig_step_j_and_numiter_i:.*]] = arith.constant 3
+
+  // Number of iterations in the inner loop, the pattern is the same as above,
+  // only capture the final result.
+  // CHECK-DAG: %[[numiter_j:.*]] = arith.constant 4
+
+  // CHECK-DAG: %[[range:.*]] = arith.constant 12
 
-  // CHECK: %[[orig_step_j_and_numiter_i:.*]] = arith.constant 3
-  // CHECK: %[[orig_lb_i:.*]] = arith.constant 5
-  // CHECK: %[[orig_lb_j:.*]] = arith.constant 7
   %c2 = arith.constant 2 : index
   %c3 = arith.constant 3 : index
   %c5 = arith.constant 5 : index
@@ -85,28 +88,18 @@ func.func @unnormalized_loops() {
   %c10 = arith.constant 10 : index
   %c17 = arith.constant 17 : index
 
-  // Normalized lower bound and step for the outer scf.
-  // CHECK: %[[lb_i:.*]] = arith.constant 0
-  // CHECK: %[[step_i:.*]] = arith.constant 1
-
-  // Number of iterations in the inner loop, the pattern is the same as above,
-  // only capture the final result.
-  // CHECK: %[[numiter_j:.*]] = arith.constant 4
 
   // New bounds of the outer scf.
-  // CHECK: %[[range:.*]] = arith.muli %[[orig_step_j_and_numiter_i:.*]], %[[numiter_j]]
   // CHECK: scf.for %[[i:.*]] = %[[lb_i]] to %[[range]] step %[[step_i]]
   scf.for %i = %c5 to %c10 step %c2 {
     // The inner loop has been removed.
     // CHECK-NOT: scf.for
     scf.for %j = %c7 to %c17 step %c3 {
       // The IVs are rewritten.
-      // CHECK: %[[normalized_j:.*]] = arith.remsi %[[i]], %[[numiter_j]]
-      // CHECK: %[[normalized_i:.*]] = arith.divsi %[[i]], %[[numiter_j]]
-      // CHECK: %[[scaled_j:.*]] = arith.muli %[[normalized_j]], %[[orig_step_j_and_numiter_i]]
-      // CHECK: %[[orig_j:.*]] = arith.addi %[[scaled_j]], %[[orig_lb_j]]
-      // CHECK: %[[scaled_i:.*]] = arith.muli %[[normalized_i]], %[[orig_step_i]]
-      // CHECK: %[[orig_i:.*]] = arith.addi %[[scaled_i]], %[[orig_lb_i]]
+      // CHECK: %[[delinearize:.+]]:2 = affine.delinearize_index %[[i]]
+      // CHECK-SAME: into (%[[orig_step_j_and_numiter_i]], %[[numiter_j]])
+      // CHECK: %[[orig_j:.*]] = affine.apply affine_map<(d0) -> (d0 * 3 + 7)>(%[[delinearize]]#1)
+      // CHECK: %[[orig_i:.*]] = affine.apply affine_map<(d0) -> (d0 * 2 + 5)>(%[[delinearize]]#0)
       // CHECK: "use"(%[[orig_i]], %[[orig_j]])
       "use"(%i, %j) : (index, index) -> ()
     }
@@ -114,20 +107,21 @@ func.func @unnormalized_loops() {
   return
 }
 
+// -----
+
 func.func @noramalized_loops_with_yielded_iter_args() {
-  // CHECK: %[[orig_lb:.*]] = arith.constant 0
-  // CHECK: %[[orig_step:.*]] = arith.constant 1
-  // CHECK: %[[orig_ub_k:.*]] = arith.constant 3
-  // CHECK: %[[orig_ub_i:.*]] = arith.constant 42
-  // CHECK: %[[orig_ub_j:.*]] = arith.constant 56
+  // CHECK-DAG: %[[orig_lb:.*]] = arith.constant 0
+  // CHECK-DAG: %[[orig_ub_i:.*]] = arith.constant 42
+  // CHECK-DAG: %[[orig_step:.*]] = arith.constant 1
+  // CHECK-DAG: %[[orig_ub_j:.*]] = arith.constant 56
+  // CHECK-DAG: %[[orig_ub_k:.*]] = arith.constant 3
+  // CHECK-DAG: %[[range:.*]] = arith.constant 7056
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %c3 = arith.constant 3 : index
   %c42 = arith.constant 42 : index
   %c56 = arith.constant 56 : index
   // The range of the new scf.
-  // CHECK:     %[[partial_range:.*]] = arith.muli %[[orig_ub_i]], %[[orig_ub_j]]
-  // CHECK-NEXT:%[[range:.*]] = arith.muli %[[partial_range]], %[[orig_ub_k]]
 
   // Updated loop bounds.
   // CHECK: scf.for %[[i:.*]] = %[[orig_lb]] to %[[range]] step %[[orig_step]] iter_args(%[[VAL_1:.*]] = %[[orig_lb]]) -> (index) {
@@ -136,13 +130,10 @@ func.func @noramalized_loops_with_yielded_iter_args() {
     // CHECK-NOT: scf.for
 
     // Reconstruct original IVs from the linearized one.
-    // CHECK: %[[orig_k:.*]] = arith.remsi %[[i]], %[[orig_ub_k]]
-    // CHECK: %[[div:.*]] = arith.divsi %[[i]], %[[orig_ub_k]]
-    // CHECK: %[[orig_j:.*]] = arith.remsi %[[div]], %[[orig_ub_j]]
-    // CHECK: %[[orig_i:.*]] = arith.divsi %[[div]], %[[orig_ub_j]]
+    // CHECK: %[[delinearize:.+]]:3 = affine.delinearize_index %[[i]] into (%[[orig_ub_i]], %[[orig_ub_j]], %[[orig_ub_k]])
     %1:1 = scf.for %j = %c0 to %c56 step %c1 iter_args(%arg1 = %arg0) -> (index){
       %0:1 = scf.for %k = %c0 to %c3 step %c1 iter_args(%arg2 = %arg1) -> (index) {
-        // CHECK: "use"(%[[orig_i]], %[[orig_j]], %[[orig_k]])
+        // CHECK: "use"(%[[delinearize]]#0, %[[delinearize]]#1, %[[delinearize]]#2)
         "use"(%i, %j, %k) : (index, index, index) -> ()
         // CHECK: scf.yield %[[VAL_1]] : index
         scf.yield %arg2 : index
@@ -154,20 +145,21 @@ func.func @noramalized_loops_with_yielded_iter_args() {
   return
 }
 
+// -----
+
 func.func @noramalized_loops_with_shuffled_yielded_iter_args() {
-  // CHECK: %[[orig_lb:.*]] = arith.constant 0
-  // CHECK: %[[orig_step:.*]] = arith.constant 1
-  // CHECK: %[[orig_ub_k:.*]] = arith.constant 3
-  // CHECK: %[[orig_ub_i:.*]] = arith.constant 42
-  // CHECK: %[[orig_ub_j:.*]] = arith.constant 56
+  // CHECK-DAG: %[[orig_lb:.*]] = arith.constant 0
+  // CHECK-DAG: %[[orig_step:.*]] = arith.constant 1
+  // CHECK-DAG: %[[orig_ub_k:.*]] = arith.constant 3
+  // CHECK-DAG: %[[orig_ub_i:.*]] = arith.constant 42
+  // CHECK-DAG: %[[orig_ub_j:.*]] = arith.constant 56
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %c3 = arith.constant 3 : index
   %c42 = arith.constant 42 : index
   %c56 = arith.constant 56 : index
   // The range of the new scf.
-  // CHECK:     %[[partial_range:.*]] = arith.muli %[[orig_ub_i]], %[[orig_ub_j]]
-  // CHECK-NEXT:%[[range:.*]] = arith.muli %[[partial_range]], %[[orig_ub_k]]
+  // CHECK-DAG:%[[range:.*]] = arith.constant 7056
 
   // Updated loop bounds.
   // CHECK: scf.for %[[i:.*]] = %[[orig_lb]] to %[[range]] step %[[orig_step]] iter_args(%[[VAL_1:.*]] = %[[orig_lb]], %[[VAL_2:.*]] = %[[orig_lb]]) -> (index, index) {
@@ -176,13 +168,11 @@ func.func @noramalized_loops_with_shuffled_yielded_iter_args() {
     // CHECK-NOT: scf.for
 
     // Reconstruct original IVs from the linearized one.
-    // CHECK: %[[orig_k:.*]] = arith.remsi %[[i]], %[[orig_ub_k]]
-    // CHECK: %[[div:.*]] = arith.divsi %[[i]], %[[orig_ub_k]]
-    // CHECK: %[[orig_j:.*]] = arith.remsi %[[div]], %[[orig_ub_j]]
-    // CHECK: %[[orig_i:.*]] = arith.divsi %[[div]], %[[orig_ub_j]]
+    // CHECK: %[[delinearize:.+]]:3 = affine.delinearize_index %[[i]]
+    // CHECK-SAME: into (%[[orig_ub_i]], %[[orig_ub_j]], %[[orig_ub_k]])
     %1:2 = scf.for %j = %c0 to %c56 step %c1 iter_args(%arg2 = %arg0, %arg3 = %arg1) -> (index, index){
       %0:2 = scf.for %k = %c0 to %c3 step %c1 iter_args(%arg4 = %arg2, %arg5 = %arg3) -> (index, index) {
-        // CHECK: "use"(%[[orig_i]], %[[orig_j]], %[[orig_k]])
+        // CHECK: "use"(%[[delinearize]]#0, %[[delinearize]]#1, %[[delinearize]]#2)
         "use"(%i, %j, %k) : (index, index, index) -> ()
         // CHECK: scf.yield %[[VAL_2]], %[[VAL_1]] : index, index
         scf.yield %arg5, %arg4 : index, index
@@ -194,20 +184,21 @@ func.func @noramalized_loops_with_shuffled_yielded_iter_args() {
   return
 }
 
+// -----
+
 func.func @noramalized_loops_with_yielded_non_iter_args() {
-  // CHECK: %[[orig_lb:.*]] = arith.constant 0
-  // CHECK: %[[orig_step:.*]] = arith.constant 1
-  // CHECK: %[[orig_ub_k:.*]] = arith.constant 3
-  // CHECK: %[[orig_ub_i:.*]] = arith.constant 42
-  // CHECK: %[[orig_ub_j:.*]] = arith.constant 56
+  // CHECK-DAG: %[[orig_lb:.*]] = arith.constant 0
+  // CHECK-DAG: %[[orig_step:.*]] = arith.constant 1
+  // CHECK-DAG: %[[orig_ub_k:.*]] = arith.constant 3
+  // CHECK-DAG: %[[orig_ub_i:.*]] = arith.constant 42
+  // CHECK-DAG: %[[orig_ub_j:.*]] = arith.constant 56
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %c3 = arith.constant 3 : index
   %c42 = arith.constant 42 : index
   %c56 = arith.constant 56 : index
   // The range of the new scf.
-  // CHECK:     %[[partial_range:.*]] = arith.muli %[[orig_ub_i]], %[[orig_ub_j]]
-  // CHECK-NEXT:%[[range:.*]] = arith.muli %[[partial_range]], %[[orig_ub_k]]
+  // CHECK-DAG: %[[range:.*]] = arith.constant 7056
 
   // Updated loop bounds.
   // CHECK: scf.for %[[i:.*]] = %[[orig_lb]] to %[[range]] step %[[orig_step]] iter_args(%[[VAL_1:.*]] = %[[orig_lb]]) -> (index) {
@@ -216,13 +207,11 @@ func.func @noramalized_loops_with_yielded_non_iter_args() {
     // CHECK-NOT: scf.for
 
     // Reconstruct original IVs from the linearized one.
-    // CHECK: %[[orig_k:.*]] = arith.remsi %[[i]], %[[orig_ub_k]]
-    // CHECK: %[[div:.*]] = arith.divsi %[[i]], %[[orig_ub_k]]
-    // CHECK: %[[orig_j:.*]] = arith.remsi %[[div]], %[[orig_ub_j]]
-    // CHECK: %[[orig_i:.*]] = arith.divsi %[[div]], %[[orig_ub_j]]
+    // CHECK: %[[delinearize:.+]]:3 = affine.delinearize_index %[[i]]
+    // CHECK-SAME: into (%[[orig_ub_i]], %[[orig_ub_j]], %[[orig_ub_k]])
     %1:1 = scf.for %j = %c0 to %c56 step %c1 iter_args(%arg1 = %arg0) -> (index){
       %0:1 = scf.for %k = %c0 to %c3 step %c1 iter_args(%arg2 = %arg1) -> (index) {
-        // CHECK: %[[res:.*]] = "use"(%[[orig_i]], %[[orig_j]], %[[orig_k]])
+        // CHECK: %[[res:.*]] = "use"(%[[delinearize]]#0, %[[delinearize]]#1, %[[delinearize]]#2)
         %res = "use"(%i, %j, %k) : (index, index, index) -> (index)
         // CHECK: scf.yield %[[res]] : index
         scf.yield %res : index
@@ -234,6 +223,8 @@ func.func @noramalized_loops_with_yielded_non_iter_args() {
   return
 }
 
+// -----
+
 // Check with parametric loop bounds and steps, capture the bounds here.
 // CHECK-LABEL: @parametric
 // CHECK-SAME: %[[orig_lb1:[A-Za-z0-9]+]]:
@@ -246,25 +237,28 @@ func.func @parametric(%lb1 : index, %ub1 : index, %step1 : index,
                  %lb2 : index, %ub2 : index, %step2 : index) {
   // Compute the number of iterations for each of the loops and the total
   // number of iterations.
-  // CHECK: %[[range1:.*]] = arith.subi %[[orig_ub1]], %[[orig_lb1]]
-  // CHECK: %[[numiter1:.*]] = arith.ceildivsi %[[range1]], %[[orig_step1]]
-  // CHECK: %[[range2:.*]] = arith.subi %[[orig_ub2]], %[[orig_lb2]]
-  // CHECK: %[[numiter2:.*]] = arith.ceildivsi %[[range2]], %[[orig_step2]]
-  // CHECK: %[[range:.*]] = arith.muli %[[numiter1]], %[[numiter2]] : index
+  // CHECK: %[[normalized_i:.*]] = affine.apply
+  // CHECK-SAME: affine_map<()[s0, s1, s2] -> ((-s0 + s1) ceildiv s2)>()[%[[orig_lb1]], %[[orig_ub1]], %[[orig_step1]]]
+  // CHECK: %[[c0:.+]] = arith.constant 0
+  // CHECK: %[[c1:.+]] = arith.constant 1
+  // CHECK: %[[normalized_j:.*]] = affine.apply
+  // CHECK-SAME: affine_map<()[s0, s1, s2] -> ((-s0 + s1) ceildiv s2)>()[%[[orig_lb2]], %[[orig_ub2]], %[[orig_step2]]]
+  // CHECK: %[[range:.+]] = affine.apply
+  // CHECK-SAME: affine_map<()[s0, s1, s2, s3, s4, s5] -> (((-s0 + s1) ceildiv s2) * ((-s3 + s4) ceildiv s5))>()
+  // CHECK-SAME: [%[[orig_lb1]], %[[orig_ub1]], %[[orig_step1]], %[[orig_lb2]], %[[orig_ub2]], %[[orig_step2]]]
 
   // Check that the outer loop is updated.
-  // CHECK: scf.for %[[i:.*]] = %c0{{.*}} to %[[range]] step %c1
+  // CHECK: scf.for %[[i:.*]] = %[[c0]] to %[[range]] step %[[c1]]
   scf.for %i = %lb1 to %ub1 step %step1 {
     // Check that the inner loop is removed.
     // CHECK-NOT: scf.for
     scf.for %j = %lb2 to %ub2 step %step2 {
       // Remapping of the induction variables.
-      // CHECK: %[[normalized_j:.*]] = arith.remsi %[[i]], %[[numiter2]] : index
-      // CHECK: %[[normalized_i:.*]] = arith.divsi %[[i]], %[[numiter2]] : index
-      // CHECK: %[[scaled_j:.*]] = arith.muli %[[normalized_j]], %[[orig_step2]]
-      // CHECK: %[[orig_j:.*]] = arith.addi %[[scaled_j]], %[[orig_lb2]]
-      // CHECK: %[[scaled_i:.*]] = arith.muli %[[normalized_i]], %[[orig_step1]]
-      // CHECK: %[[orig_i:.*]] = arith.addi %[[scaled_i]], %[[orig_lb1]]
+      // CHECK: %[[delinearize:.+]]:2 = affine.delinearize_index %[[i]] into (%[[normalized_i]], %[[normalized_j]])
+      // CHECK: %[[orig_j:.*]] = affine.apply affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)>
+      // CHECK-SAME: (%[[delinearize]]#1)[%[[orig_lb2]], %[[orig_step2]]]
+      // CHECK: %[[orig_i:.*]] = affine.apply affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)>
+      // CHECK-SAME: (%[[delinearize]]#0)[%[[orig_lb1]], %[[orig_step1]]]
 
       // CHECK: "foo"(%[[orig_i]], %[[orig_j]])
       "foo"(%i, %j) : (index, index) -> ()
@@ -273,19 +267,21 @@ func.func @parametric(%lb1 : index, %ub1 : index, %step1 : index,
   return
 }
 
+// -----
+
 // CHECK-LABEL: @two_bands
 func.func @two_bands() {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %c10 = arith.constant 10 : index
-  // CHECK: %[[outer_range:.*]] = arith.muli
+  // CHECK: %[[outer_range:.*]] = arith.constant 100
   // CHECK: scf.for %{{.*}} = %{{.*}} to %[[outer_range]]
   scf.for %i = %c0 to %c10 step %c1 {
     // Check that the "j" loop was removed and that the inner loops were
     // coalesced as well.  The preparation step for coalescing will inject the
     // subtraction operation unlike the IV remapping.
     // CHECK-NOT: scf.for
-    // CHECK: arith.subi
+    // CHECK: affine.delinearize_index
     scf.for %j = %c0 to %c10 step %c1 {
       // The inner pair of loops is coalesced separately.
       // CHECK: scf.for
@@ -303,12 +299,6 @@ func.func @two_bands() {
 // -----
 
 // Check coalescing of affine.for loops when all the loops have constant upper bound.
-// CHECK-DAG: #[[SIXTEEN:.*]] = affine_map<() -> (16)>
-// CHECK-DAG: #[[SIXTY_FOUR:.*]] = affine_map<() -> (64)>
-// CHECK-DAG: #[[PRODUCT:.*]] = affine_map<(d0)[s0] -> (d0 * s0)>
-// CHECK-DAG: #[[EIGHT:.*]] = affine_map<() -> (8)>
-// CHECK-DAG: #[[MOD:.*]] = affine_map<(d0)[s0] -> (d0 mod s0)>
-// CHECK-DAG: #[[DIV:.*]] = affine_map<(d0)[s0] -> (d0 floordiv s0)>
 func.func @coalesce_affine_for() {
   affine.for %i = 0 to 16 {
     affine.for %j = 0 to 64 {
@@ -319,16 +309,16 @@ func.func @coalesce_affine_for() {
   }
   return
 }
-// CHECK-DAG: %[[T0:.*]] = affine.apply #[[SIXTEEN]]()
-// CHECK-DAG: %[[T1:.*]] = affine.apply #[[SIXTY_FOUR]]()
-// CHECK-DAG: %[[T2:.*]] = affine.apply #[[PRODUCT]](%[[T0]])[%[[T1]]]
-// CHECK-DAG: %[[T3:.*]] = affine.apply #[[EIGHT]]()
-// CHECK-DAG: %[[T4:.*]] = affine.apply #[[PRODUCT]](%[[T2]])[%[[T3]]]
+// CHECK-DAG: %[[T0:.*]] = affine.apply affine_map<() -> (16)>()
+// CHECK-DAG: %[[T1:.*]] = affine.apply affine_map<() -> (64)>()
+// CHECK-DAG: %[[T2:.*]] = affine.apply affine_map<(d0)[s0] -> (d0 * s0)>(%[[T0]])[%[[T1]]]
+// CHECK-DAG: %[[T3:.*]] = affine.apply affine_map<() -> (8)>()
+// CHECK-DAG: %[[T4:.*]] = affine.apply affine_map<(d0)[s0] -> (d0 * s0)>(%[[T2]])[%[[T3]]]
 // CHECK:       affine.for %[[IV:.*]] = 0 to %[[T4]]
-// CHECK-DAG:    %[[K:.*]] =  affine.apply #[[MOD]](%[[IV]])[%[[T3]]]
-// CHECK-DAG:    %[[T6:.*]] = affine.apply #[[DIV]](%[[IV]])[%[[T3]]]
-// CHECK-DAG:    %[[J:.*]] =  affine.apply #[[MOD]](%[[T6]])[%[[T1]]]
-// CHECK-DAG:    %[[I:.*]] =  affine.apply #[[DIV]](%[[T6]])[%[[T1]]]
+// CHECK-DAG:    %[[K:.*]] =  affine.apply affine_map<(d0)[s0] -> (d0 mod s0)>(%[[IV]])[%[[T3]]]
+// CHECK-DAG:    %[[T6:.*]] = affine.apply affine_map<(d0)[s0] -> (d0 floordiv s0)>(%[[IV]])[%[[T3]]]
+// CHECK-DAG:    %[[J:.*]] =  affine.apply affine_map<(d0)[s0] -> (d0 mod s0)>(%[[T6]])[%[[T1]]]
+// CHECK-DAG:    %[[I:.*]] =  affine.apply affine_map<(d0)[s0] -> (d0 floordiv s0)>(%[[T6]])[%[[T1]]]
 // CHECK-NEXT:    "test.foo"(%[[I]], %[[J]], %[[K]])
 // CHECK-NEXT:  }
 // CHECK-NEXT:  return
@@ -336,10 +326,6 @@ func.func @coalesce_affine_for() {
 // -----
 
 // Check coalescing of affine.for loops when all the loops have non constant upper bounds.
-// CHECK-DAG: #[[IDENTITY:.*]] = affine_map<()[s0] -> (s0)>
-// CHECK-DAG: #[[PRODUCT:.*]] = affine_map<(d0)[s0] -> (d0 * s0)>
-// CHECK-DAG: #[[MOD:.*]] = affine_map<(d0)[s0] -> (d0 mod s0)>
-// CHECK-DAG: #[[FLOOR:.*]] = affine_map<(d0)[s0] -> (d0 floordiv s0)>
 func.func @coalesce_affine_for(%arg0: memref<?x?xf32>) {
   %c0 = arith.constant 0 : index
   %M = memref.dim %arg0, %c0 : memref<?x?xf32>
@@ -355,14 +341,14 @@ func.func @coalesce_affine_for(%arg0: memref<?x?xf32>) {
   return
 }
 // CHECK: %[[DIM:.*]] = memref.dim %arg{{.*}}, %c{{.*}} : memref<?x?xf32>
-// CHECK-DAG: %[[T0:.*]] = affine.apply #[[IDENTITY]]()[%[[DIM]]]
-// CHECK-DAG: %[[T1:.*]] = affine.apply #[[PRODUCT]](%[[T0]])[%[[T0]]]
-// CHECK-DAG: %[[T2:.*]] = affine.apply #[[PRODUCT]](%[[T1]])[%[[T0]]]
+// CHECK-DAG: %[[T0:.*]] = affine.apply affine_map<()[s0] -> (s0)>()[%[[DIM]]]
+// CHECK-DAG: %[[T1:.*]] = affine.apply affine_map<(d0)[s0] -> (d0 * s0)>(%[[T0]])[%[[T0]]]
+// CHECK-DAG: %[[T2:.*]] = affine.apply affine_map<(d0)[s0] -> (d0 * s0)>(%[[T1]])[%[[T0]]]
 // CHECK: affine.for %[[IV:.*]] = 0 to %[[T2]]
-// CHECK-DAG:    %[[K:.*]] = affine.apply #[[MOD]](%[[IV]])[%[[T0]]]
-// CHECK-DAG:    %[[T9:.*]] = affine.apply #[[FLOOR]](%[[IV]])[%[[T0]]]
-// CHECK-DAG:    %[[J:.*]] = affine.apply #[[MOD]](%[[T9]])[%[[T0]]]
-// CHECK-DAG:    %[[I:.*]] = affine.apply #[[FLOOR]](%[[T9]])[%[[T0]]]
+// CHECK-DAG:    %[[K:.*]] = affine.apply affine_map<(d0)[s0] -> (d0 mod s0)>(%[[IV]])[%[[T0]]]
+// CHECK-DAG:    %[[T9:.*]] = affine.apply affine_map<(d0)[s0] -> (d0 floordiv s0)>(%[[IV]])[%[[T0]]]
+// CHECK-DAG:    %[[J:.*]] = affine.apply affine_map<(d0)[s0] -> (d0 mod s0)>(%[[T9]])[%[[T0]]]
+// CHECK-DAG:    %[[I:.*]] = affine.apply affine_map<(d0)[s0] -> (d0 floordiv s0)>(%[[T9]])[%[[T0]]]
 // CHECK-NEXT:    "test.foo"(%[[I]], %[[J]], %[[K]])
 // CHECK-NEXT:  }
 // CHECK-NEXT:  return
@@ -370,11 +356,6 @@ func.func @coalesce_affine_for(%arg0: memref<?x?xf32>) {
 // -----
 
 // Check coalescing of affine.for loops when some of the loop has constant upper bounds while others have nin constant upper bounds.
-// CHECK-DAG: #[[IDENTITY:.*]] = affine_map<()[s0] -> (s0)>
-// CHECK-DAG: #[[PRODUCT:.*]] = affine_map<(d0)[s0] -> (d0 * s0)>
-// CHECK-DAG: #[[SIXTY_FOUR:.*]] = affine_map<() -> (64)>
-// CHECK-DAG: #[[MOD:.*]] = affine_map<(d0)[s0] -> (d0 mod s0)>
-// CHECK-DAG: #[[DIV:.*]] = affine_map<(d0)[s0] -> (d0 floordiv s0)>
 func.func @coalesce_affine_for(%arg0: memref<?x?xf32>) {
   %c0 = arith.constant 0 : index
   %M = memref.dim %arg0, %c0 : memref<?x?xf32>
@@ -389,15 +370,15 @@ func.func @coalesce_affine_for(%arg0: memref<?x?xf32>) {
   return
 }
 // CHECK: %[[DIM:.*]] = memref.dim %arg{{.*}}, %c{{.*}} : memref<?x?xf32>
-// CHECK-DAG: %[[T0:.*]] = affine.apply #[[IDENTITY]]()[%[[DIM]]]
-// CHECK-DAG: %[[T1:.*]] = affine.apply #[[PRODUCT]](%[[T0]])[%[[T0]]]
-// CHECK-DAG: %[[T2:.*]] = affine.apply #[[SIXTY_FOUR]]()
-// CHECK-DAG: %[[T3:.*]] = affine.apply #[[PRODUCT]](%[[T1]])[%[[T2]]]
+// CHECK-DAG: %[[T0:.*]] = affine.apply affine_map<()[s0] -> (s0)>()[%[[DIM]]]
+// CHECK-DAG: %[[T1:.*]] = affine.apply affine_map<(d0)[s0] -> (d0 * s0)>(%[[T0]])[%[[T0]]]
+// CHECK-DAG: %[[T2:.*]] = affine.apply affine_map<() -> (64)>()
+// CHECK-DAG: %[[T3:.*]] = affine.apply affine_map<(d0)[s0] -> (d0 * s0)>(%[[T1]])[%[[T2]]]
 // CHECK: affine.for %[[IV:.*]] = 0 to %[[T3]]
-// CHECK-DAG:    %[[K:.*]] = affine.apply #[[MOD]](%[[IV]])[%[[T2]]]
-// CHECK-DAG:    %[[T5:.*]] = affine.apply #[[DIV]](%[[IV]])[%[[T2]]]
-// CHECK-DAG:    %[[J:.*]] = affine.apply #[[MOD]](%[[T5]])[%[[T0]]]
-// CHECK-DAG:    %[[I:.*]] = affine.apply #[[DIV]](%[[T5]])[%[[T0]]]
+// CHECK-DAG:    %[[K:.*]] = affine.apply affine_map<(d0)[s0] -> (d0 mod s0)>(%[[IV]])[%[[T2]]]
+// CHECK-DAG:    %[[T5:.*]] = affine.apply affine_map<(d0)[s0] -> (d0 floordiv s0)>(%[[IV]])[%[[T2]]]
+// CHECK-DAG:    %[[J:.*]] = affine.apply affine_map<(d0)[s0] -> (d0 mod s0)>(%[[T5]])[%[[T0]]]
+// CHECK-DAG:    %[[I:.*]] = affine.apply affine_map<(d0)[s0] -> (d0 floordiv s0)>(%[[T5]])[%[[T0]]]
 // CHECK-NEXT:    "test.foo"(%[[I]], %[[J]], %[[K]])
 // CHECK-NEXT:  }
 // CHECK-NEXT:  return
@@ -405,11 +386,6 @@ func.func @coalesce_affine_for(%arg0: memref<?x?xf32>) {
 // -----
 
 // Check coalescing of affine.for loops when upper bound contains multi result upper bound map.
-// CHECK-DAG: #[[MAP0:.*]] = affine_map<()[s0] -> (s0, -s0)>
-// CHECK-DAG: #[[IDENTITY:.*]] = affine_map<()[s0] -> (s0)>
-// CHECK-DAG: #[[PRODUCT:.*]] = affine_map<(d0)[s0] -> (d0 * s0)>
-// CHECK-DAG: #[[MOD:.*]] = affine_map<(d0)[s0] -> (d0 mod s0)>
-// CHECK-DAG: #[[DIV:.*]] = affine_map<(d0)[s0] -> (d0 floordiv s0)>
 #myMap = affine_map<()[s1] -> (s1, -s1)>
 func.func @coalesce_affine_for(%arg0: memref<?x?xf32>) {
  %c0 = arith.constant 0 : index
@@ -426,23 +402,21 @@ func.func @coalesce_affine_for(%arg0: memref<?x?xf32>) {
  return
 }
 // CHECK: %[[DIM:.*]] = memref.dim %arg{{.*}}, %c{{.*}} : memref<?x?xf32>
-// CHECK-DAG: %[[T0:.*]] = affine.min #[[MAP0]]()[%[[DIM]]]
-// CHECK-DAG: %[[T1:.*]] = affine.apply #[[IDENTITY]]()[%[[DIM]]]
-// CHECK-DAG: %[[T2:.*]] = affine.apply #[[PRODUCT]](%[[T0]])[%[[T1]]]
-// CHECK-DAG: %[[T3:.*]] = affine.apply #[[PRODUCT]](%[[T2]])[%[[T1]]]
+// CHECK-DAG: %[[T0:.*]] = affine.min affine_map<()[s0] -> (s0, -s0)>()[%[[DIM]]]
+// CHECK-DAG: %[[T1:.*]] = affine.apply affine_map<()[s0] -> (s0)>()[%[[DIM]]]
+// CHECK-DAG: %[[T2:.*]] = affine.apply affine_map<(d0)[s0] -> (d0 * s0)>(%[[T0]])[%[[T1]]]
+// CHECK-DAG: %[[T3:.*]] = affine.apply affine_map<(d0)[s0] -> (d0 * s0)>(%[[T2]])[%[[T1]]]
 // CHECK: affine.for %[[IV:.*]] = 0 to %[[T3]]
-// CHECK-DAG:    %[[K:.*]] = affine.apply #[[MOD]](%[[IV]])[%[[T1]]]
-// CHECK-DAG:    %[[T5:.*]] = affine.apply #[[DIV]](%[[IV]])[%[[T1]]]
-// CHECK-DAG:    %[[J:.*]] = affine.apply #[[MOD]](%[[T5]])[%[[T1]]]
-// CHECK-DAG:    %[[I:.*]] = affine.apply #[[DIV]](%[[T5]])[%[[T1]]]
+// CHECK-DAG:    %[[K:.*]] = affine.apply affine_map<(d0)[s0] -> (d0 mod s0)>(%[[IV]])[%[[T1]]]
+// CHECK-DAG:    %[[T5:.*]] = affine.apply affine_map<(d0)[s0] -> (d0 floordiv s0)>(%[[IV]])[%[[T1]]]
+// CHECK-DAG:    %[[J:.*]] = affine.apply affine_map<(d0)[s0] -> (d0 mod s0)>(%[[T5]])[%[[T1]]]
+// CHECK-DAG:    %[[I:.*]] = affine.apply affine_map<(d0)[s0] -> (d0 floordiv s0)>(%[[T5]])[%[[T1]]]
 // CHECK-NEXT:    "test.foo"(%[[I]], %[[J]], %[[K]])
 // CHECK-NEXT:  }
 // CHECK-NEXT:  return
 
 // -----
 
-// CHECK-DAG: #[[MAP0:.*]] = affine_map<(d0) -> (d0 * 110)>
-// CHECK-DAG: #[[MAP1:.*]] = affine_map<(d0) -> (696, d0 * 110 + 110)>
 #map0 = affine_map<(d0) -> (d0 * 110)>
 #map1 = affine_map<(d0) -> (696, d0 * 110 + 110)>
 func.func @test_loops_do_not_get_coalesced() {
@@ -454,7 +428,7 @@ func.func @test_loops_do_not_get_coalesced() {
   return
 }
 // CHECK: affine.for %[[IV0:.*]] = 0 to 7
-// CHECK-NEXT: affine.for %[[IV1:.*]] = #[[MAP0]](%[[IV0]]) to min #[[MAP1]](%[[IV0]])
+// CHECK-NEXT: affine.for %[[IV1:.*]] = affine_map<(d0) -> (d0 * 110)>(%[[IV0]]) to min affine_map<(d0) -> (696, d0 * 110 + 110)>(%[[IV0]])
 // CHECK-NEXT:   "use"(%[[IV0]], %[[IV1]])
 // CHECK-NEXT: }
 // CHECK-NEXT: }
diff --git a/mlir/test/Dialect/SCF/transform-op-coalesce.mlir b/mlir/test/Dialect/SCF/transform-op-coalesce.mlir
index 6fcd727621bae9..1c405a47950fcc 100644
--- a/mlir/test/Dialect/SCF/transform-op-coalesce.mlir
+++ b/mlir/test/Dialect/SCF/transform-op-coalesce.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -transform-interpreter -split-input-file -verify-diagnostics -allow-unregistered-dialect --cse | FileCheck %s
+// RUN: mlir-opt %s -transform-interpreter -split-input-file -verify-diagnostics -allow-unregistered-dialect --cse --mlir-print-local-scope | FileCheck %s
 
 func.func @coalesce_inner() {
   %c0 = arith.constant 0 : index
@@ -33,19 +33,15 @@ module attributes {transform.with_named_sequence} {
 
 // -----
 
-// CHECK-DAG: #[[MAP:.+]] = affine_map<() -> (64)>
-// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0)[s0] -> (d0 * s0)>
-// CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0)[s0] -> (d0 mod s0)>
-// CHECK-DAG: #[[MAP3:.+]] = affine_map<(d0)[s0] -> (d0 floordiv s0)>
 func.func @coalesce_outer(%arg1: memref<64x64xf32, 1>, %arg2: memref<64x64xf32, 1>, %arg3: memref<64x64xf32, 1>) attributes {} {
-  // CHECK: %[[T0:.+]] = affine.apply #[[MAP]]()
-  // CHECK: %[[UB:.+]] = affine.apply #[[MAP1]](%[[T0]])[%[[T0]]]
+  // CHECK: %[[T0:.+]] = affine.apply affine_map<() -> (64)>()
+  // CHECK: %[[UB:.+]] = affine.apply affine_map<(d0)[s0] -> (d0 * s0)>(%[[T0]])[%[[T0]]]
   // CHECK: affine.for %[[IV1:.+]] = 0 to %[[UB:.+]] {
   // CHECK-NOT: affine.for %[[IV2:.+]]
   affine.for %arg4 = 0 to 64 {
     affine.for %arg5 = 0 to 64 {
-      // CHECK: %[[IDX0:.+]] = affine.apply #[[MAP2]](%[[IV1]])[%{{.+}}]
-      // CHECK: %[[IDX1:.+]] = affine.apply #[[MAP3]](%[[IV1]])[%{{.+}}]
+      // CHECK: %[[IDX0:.+]] = affine.apply affine_map<(d0)[s0] -> (d0 mod s0)>(%[[IV1]])[%{{.+}}]
+      // CHECK: %[[IDX1:.+]] = affine.apply affine_map<(d0)[s0] -> (d0 floordiv s0)>(%[[IV1]])[%{{.+}}]
       // CHECK-NEXT: %{{.+}} = affine.load %{{.+}}[%[[IDX1]], %[[IDX0]]] : memref<64x64xf32, 1>
       %0 = affine.load %arg1[%arg4, %arg5] : memref<64x64xf32, 1>
       %1 = affine.load %arg2[%arg4, %arg5] : memref<64x64xf32, 1>
@@ -76,9 +72,8 @@ func.func @coalesce_and_unroll(%arg1: memref<64x64xf32, 1>, %arg2: memref<64x64x
   scf.for %arg4 = %c0 to %c64 step %c1 {
     // CHECK-NOT: scf.for
     scf.for %arg5 = %c0 to %c64 step %c1 {
-      // CHECK: %[[IDX0:.+]] = arith.remsi %[[IV1]]
-      // CHECK: %[[IDX1:.+]] = arith.divsi %[[IV1]]
-      // CHECK-NEXT: %{{.+}} = memref.load %{{.+}}[%[[IDX1]], %[[IDX0]]] : memref<64x64xf32, 1>
+      // CHECK: %[[IDX:.+]]:2 = affine.delinearize_index
+      // CHECK-NEXT: %{{.+}} = memref.load %{{.+}}[%[[IDX]]#0, %[[IDX]]#1] : memref<64x64xf32, 1>
       %0 = memref.load %arg1[%arg4, %arg5] : memref<64x64xf32, 1>
       %1 = memref.load %arg2[%arg4, %arg5] : memref<64x64xf32, 1>
       %2 = arith.addf %0, %1 : f32
@@ -138,27 +133,22 @@ module attributes {transform.with_named_sequence} {
 // CHECK-SAME:     %[[LB2:[a-zA-Z0-9_]+]]: index
 // CHECK-SAME:     %[[UB2:[a-zA-Z0-9_]+]]: index
 // CHECK-SAME:     %[[STEP2:[a-zA-Z0-9_]+]]: index
-//      CHECK:   %[[NEWUB0_DIFF:.+]] = arith.subi %[[UB0]], %[[LB0]]
-//  CHECK-DAG:   %[[NEWUB0:.+]] = arith.ceildivsi %[[NEWUB0_DIFF]], %[[STEP0]]
-//  CHECK-DAG:   %[[C0:.+]] = arith.constant 0
-//  CHECK-DAG:   %[[C1:.+]] = arith.constant 1
-//      CHECK:   %[[NEWUB1_DIFF:.+]] = arith.subi %[[UB1]], %[[LB1]]
-//  CHECK-DAG:   %[[NEWUB1:.+]] = arith.ceildivsi %[[NEWUB1_DIFF]], %[[STEP1]]
-//      CHECK:   %[[NEWUB2_DIFF:.+]] = arith.subi %[[UB2]], %[[LB2]]
-//  CHECK-DAG:   %[[NEWUB2:.+]] = arith.ceildivsi %[[NEWUB2_DIFF]], %[[STEP2]]
-//      CHECK:   %[[PROD1:.+]] = arith.muli %[[NEWUB0]], %[[NEWUB1]]
-//      CHECK:   %[[NEWUB:.+]] = arith.muli %[[PROD1]], %[[NEWUB2]]
+//      CHECK:   %[[NITERS0:.+]] = affine.apply
+// CHECK-SAME:       affine_map<()[s0, s1, s2] -> ((-s0 + s1) ceildiv s2)>()[%[[LB0]], %[[UB0]], %[[STEP0]]]
+//      CHECK:   %[[C0:.+]] = arith.constant 0 : index
+//      CHECK:   %[[C1:.+]] = arith.constant 1 : index
+//      CHECK:   %[[NITERS1:.+]] = affine.apply
+// CHECK-SAME:       affine_map<()[s0, s1, s2] -> ((-s0 + s1) ceildiv s2)>()[%[[LB1]], %[[UB1]], %[[STEP1]]]
+//      CHECK:   %[[NITERS2:.+]] = affine.apply
+// CHECK-SAME:        affine_map<()[s0, s1, s2] -> ((-s0 + s1) ceildiv s2)>()[%[[LB2]], %[[UB2]], %[[STEP2]]]
+//      CHECK:   %[[NEWUB:.+]] = affine.apply affine_map<()[s0, s1, s2, s3, s4, s5, s6, s7, s8] ->
+// CHECK-SAME:       ((((-s0 + s1) ceildiv s2) * ((-s3 + s4) ceildiv s5)) * ((-s6 + s7) ceildiv s8))
+// CHECK-SAME:       [%[[LB0]], %[[UB0]], %[[STEP0]], %[[LB1]], %[[UB1]], %[[STEP1]], %[[LB2]], %[[UB2]], %[[STEP2]]]
 //      CHECK:   %[[RESULT:.+]] = scf.for %[[IV:[a-zA-Z0-9]+]] = %[[C0]] to %[[NEWUB]] step %[[C1]] iter_args(%[[ITER_ARG:.+]] = %[[ARG0]])
-//      CHECK:     %[[IV2:.+]] = arith.remsi %[[IV]], %[[NEWUB2]]
-//      CHECK:     %[[PREVIOUS:.+]] = arith.divsi %[[IV]], %[[NEWUB2]]
-//      CHECK:     %[[IV1:.+]] = arith.remsi %[[PREVIOUS]], %[[NEWUB1]]
-//      CHECK:     %[[IV0:.+]] = arith.divsi %[[PREVIOUS]], %[[NEWUB1]]
-//      CHECK:     %[[K_STEP:.+]] = arith.muli %[[IV2]], %[[STEP2]]
-//      CHECK:     %[[K:.+]] = arith.addi %[[K_STEP]], %[[LB2]]
-//      CHECK:     %[[J_STEP:.+]] = arith.muli %[[IV1]], %[[STEP1]]
-//      CHECK:     %[[J:.+]] = arith.addi %[[J_STEP]], %[[LB1]]
-//      CHECK:     %[[I_STEP:.+]] = arith.muli %[[IV0]], %[[STEP0]]
-//      CHECK:     %[[I:.+]] = arith.addi %[[I_STEP]], %[[LB0]]
+//      CHECK:     %[[DELINEARIZE:.+]]:3 = affine.delinearize_index %[[IV]] into (%[[NITERS0]], %[[NITERS1]], %[[NITERS2]])
+//  CHECK-DAG:     %[[K:.+]] = affine.apply affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)>(%[[DELINEARIZE]]#2)[%[[LB2]], %[[STEP2]]]
+//  CHECK-DAG:     %[[J:.+]] = affine.apply affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)>(%[[DELINEARIZE]]#1)[%[[LB1]], %[[STEP1]]]
+//  CHECK-DAG:     %[[I:.+]] = affine.apply affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)>(%[[DELINEARIZE]]#0)[%[[LB0]], %[[STEP0]]]
 //      CHECK:     %[[USE:.+]] = "use"(%[[ITER_ARG]], %[[I]], %[[J]], %[[K]])
 //      CHECK:     scf.yield %[[USE]]
 //      CHECK:   return %[[RESULT]]
@@ -201,8 +191,7 @@ module attributes {transform.with_named_sequence} {
 // CHECK-SAME:     %[[UB2:[a-zA-Z0-9_]+]]: index
 // CHECK-SAME:     %[[STEP2:[a-zA-Z0-9_]+]]: index
 //      CHECK:   scf.for
-//      CHECK:     arith.remsi
-//      CHECK:     arith.divsi
+//      CHECK:     affine.delinearize_index
 //      CHECK:     scf.for %{{[a-zA-Z0-9]+}} = %[[LB2]] to %[[UB2]] step %[[STEP2]]
 //  CHECK-NOT:       scf.for
 //      CHECK:   transform.named_sequence
@@ -245,8 +234,7 @@ module attributes {transform.with_named_sequence} {
 // CHECK-SAME:     %[[UB2:[a-zA-Z0-9_]+]]: index
 // CHECK-SAME:     %[[STEP2:[a-zA-Z0-9_]+]]: index
 //      CHECK:   scf.for
-//      CHECK:     arith.remsi
-//      CHECK:     arith.divsi
+//      CHECK:     affine.delinearize_index
 //      CHECK:     scf.for %{{[a-zA-Z0-9]+}} = %[[LB2]] to %[[UB2]] step %[[STEP2]]
 //  CHECK-NOT:       scf.for
 //      CHECK:   transform.named_sequence
@@ -289,13 +277,9 @@ module attributes {transform.with_named_sequence} {
 // CHECK-SAME:     %[[UB2:[a-zA-Z0-9_]+]]: index
 // CHECK-SAME:     %[[STEP2:[a-zA-Z0-9_]+]]: index
 //      CHECK:   scf.for %{{[a-zA-Z0-9]+}} = %[[LB0]] to %[[UB0]] step %[[STEP0]]
-//      CHECK:     arith.subi
-//      CHECK:     arith.ceildivsi
-//      CHECK:     arith.subi
-//      CHECK:     arith.ceildivsi
+//  CHECK-NOT:     affine.delinearize_index
 //      CHECK:     scf.for
-//      CHECK:       arith.remsi
-//      CHECK:       arith.divsi
+//      CHECK:       affine.delinearize_index
 //  CHECK-NOT:       scf.for
 //      CHECK:   transform.named_sequence
 
@@ -337,10 +321,9 @@ module attributes {transform.with_named_sequence} {
 //  CHECK-SAME:     %[[ARG2:.+]]: index)
 //   CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
 //   CHECK-DAG:   %[[C1:.+]] = arith.constant 1 : index
-//       CHECK:   %[[UB:.+]] = arith.muli %[[ARG1]], %[[ARG2]]
+//       CHECK:   %[[UB:.+]] = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%[[ARG1]], %[[ARG2]]]
 //       CHECK:   scf.for %[[IV:.+]] = %[[C0]] to %[[UB]] step %[[C1]]
-//       CHECK:     %[[IV1:.+]] = arith.remsi %[[IV]], %[[ARG2]]
-//       CHECK:     %[[IV2:.+]] = arith.divsi %[[IV]], %[[ARG2]]
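+//       CHECK:     %[[DELINEARIZE:.+]]:2 = affine.delinearize_index %[[IV]] into (%[[ARG1]], %[[ARG2]])
-//       CHECK:     "some_use"(%{{[a-zA-Z0-9]+}}, %[[C0]], %[[C0]], %[[IV2]], %[[C0]], %[[IV1]])
+//       CHECK:     "some_use"(%{{[a-zA-Z0-9]+}}, %[[C0]], %[[C0]], %[[DELINEARIZE]]#0, %[[C0]], %[[DELINEARIZE]]#1)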
 
 // -----
diff --git a/mlir/test/Transforms/parallel-loop-collapsing.mlir b/mlir/test/Transforms/parallel-loop-collapsing.mlir
index d1c23d584f92b7..dc4e042a3c4f56 100644
--- a/mlir/test/Transforms/parallel-loop-collapsing.mlir
+++ b/mlir/test/Transforms/parallel-loop-collapsing.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline='builtin.module(func.func(test-scf-parallel-loop-collapsing{collapsed-indices-0=0,3 collapsed-indices-1=1,4 collapsed-indices-2=2}, canonicalize))' | FileCheck %s
+// RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline='builtin.module(func.func(test-scf-parallel-loop-collapsing{collapsed-indices-0=0,3 collapsed-indices-1=1,4 collapsed-indices-2=2}, canonicalize))' --mlir-print-local-scope | FileCheck %s
 
 // CHECK: func @parallel_many_dims() {
 func.func @parallel_many_dims() {
@@ -33,14 +33,11 @@ func.func @parallel_many_dims() {
 // CHECK-DAG: %[[C12:.*]] = arith.constant 12 : index
 // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
 // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
-// CHECK-DAG: %[[C9:.*]] = arith.constant 9 : index
-// CHECK-DAG: %[[C10:.*]] = arith.constant 10 : index
 // CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
 // CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index
 // CHECK: scf.parallel (%[[NEW_I0:.*]]) = (%[[C0]]) to (%[[C4]]) step (%[[C1]]) {
 // CHECK:   %[[V0:.*]] = arith.remsi %[[NEW_I0]], %[[C2]] : index
 // CHECK:   %[[I0:.*]] = arith.divsi %[[NEW_I0]], %[[C2]] : index
-// CHECK:   %[[V2:.*]] = arith.muli %[[V0]], %[[C10]]
-// CHECK:   %[[I3:.*]] = arith.addi %[[V2]], %[[C9]]
+// CHECK:   %[[I3:.*]] = affine.apply affine_map<(d0) -> (d0 * 10 + 9)>(%[[V0]])
 // CHECK:   "magic.op"(%[[I0]], %[[C3]], %[[C6]], %[[I3]], %[[C12]]) : (index, index, index, index, index) -> index
 // CHECK:   scf.reduce
diff --git a/mlir/test/Transforms/single-parallel-loop-collapsing.mlir b/mlir/test/Transforms/single-parallel-loop-collapsing.mlir
index 4eed61a65aa475..1ef787bec1bb37 100644
--- a/mlir/test/Transforms/single-parallel-loop-collapsing.mlir
+++ b/mlir/test/Transforms/single-parallel-loop-collapsing.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline='builtin.module(func.func(test-scf-parallel-loop-collapsing{collapsed-indices-0=0,1}, canonicalize))' | FileCheck %s
+// RUN: mlir-opt -allow-unregistered-dialect -pass-pipeline='builtin.module(func.func(test-scf-parallel-loop-collapsing{collapsed-indices-0=0,1}, canonicalize))' --mlir-print-local-scope %s | FileCheck %s
 
 func.func @collapse_to_single() {
   %c0 = arith.constant 3 : index
@@ -14,20 +14,15 @@ func.func @collapse_to_single() {
 }
 
 // CHECK: func @collapse_to_single() {
-// CHECK-DAG:         %[[C1:.*]] = arith.constant 1 : index
-// CHECK-DAG:         %[[C0:.*]] = arith.constant 0 : index
-// CHECK-DAG:         %[[C3:.*]] = arith.constant 3 : index
-// CHECK-DAG:         %[[C7:.*]] = arith.constant 7 : index
-// CHECK-DAG:         %[[C4:.*]] = arith.constant 4 : index
 // CHECK-DAG:         %[[C6:.*]] = arith.constant 6 : index
+// CHECK-DAG:         %[[C0:.*]] = arith.constant 0 : index
+// CHECK-DAG:         %[[C1:.*]] = arith.constant 1 : index
 // CHECK-DAG:         %[[C18:.*]] = arith.constant 18 : index
 // CHECK:         scf.parallel (%[[NEW_I:.*]]) = (%[[C0]]) to (%[[C18]]) step (%[[C1]]) {
 // CHECK:           %[[I0_COUNT:.*]] = arith.remsi %[[NEW_I]], %[[C6]] : index
 // CHECK:           %[[I1_COUNT:.*]] = arith.divsi %[[NEW_I]], %[[C6]] : index
-// CHECK:            %[[V0:.*]] = arith.muli %[[I0_COUNT]], %[[C4]]
-// CHECK:           %[[I1:.*]] = arith.addi %[[V0]], %[[C7]]
-// CHECK:            %[[V1:.*]] = arith.muli %[[I1_COUNT]], %[[C3]]
-// CHECK:           %[[I0:.*]] = arith.addi %[[V1]], %[[C3]]
+// CHECK:           %[[I1:.*]] = affine.apply affine_map<(d0) -> (d0 * 4 + 7)>(%[[I0_COUNT]])
+// CHECK:           %[[I0:.*]] = affine.apply affine_map<(d0) -> (d0 * 3 + 3)>(%[[I1_COUNT]])
 // CHECK:           "magic.op"(%[[I0]], %[[I1]]) : (index, index) -> index
 // CHECK:           scf.reduce
 // CHECK-NEXT:    }