[Mlir-commits] [mlir] 1524b01 - [MLIR] Add loop coalesce utility for affine.for

Wed Sep 8 05:33:14 PDT 2021

Author: Arnab Dutta
Date: 2021-09-08T18:02:23+05:30
New Revision: 1524b0154116e2a404dba12c7f9f64c5c8533955

URL: https://github.com/llvm/llvm-project/commit/1524b0154116e2a404dba12c7f9f64c5c8533955
DIFF: https://github.com/llvm/llvm-project/commit/1524b0154116e2a404dba12c7f9f64c5c8533955.diff

LOG: [MLIR] Add loop coalesce utility for affine.for

Add loop coalesce utility for affine.for. This expects loops to have
been normalized a-priori. This works for both constant as well non
constant upper bounds having single/multiple result upper bound affine
map.

With contributions from Arnab Dutta and Uday Bondhugula.

Reviewed By: bondhugula, ayzhuang

Differential Revision: https://reviews.llvm.org/D108126

Added: 
    

Modified: 
    mlir/include/mlir/Transforms/LoopUtils.h
    mlir/lib/Transforms/LoopCoalescing.cpp
    mlir/lib/Transforms/Utils/LoopUtils.cpp
    mlir/test/Transforms/loop-coalescing.mlir

Removed: 
    


################################################################################
diff  --git a/mlir/include/mlir/Transforms/LoopUtils.h b/mlir/include/mlir/Transforms/LoopUtils.h
index f6aa469e42530..8c851215e3c08 100644

--- a/mlir/include/mlir/Transforms/LoopUtils.h
+++ b/mlir/include/mlir/Transforms/LoopUtils.h
@@ -243,6 +243,14 @@ TileLoops extractFixedOuterLoops(scf::ForOp rootFOrOp, ArrayRef<int64_t> sizes);
 /// independent of any loop induction variable involved in the nest.
 void coalesceLoops(MutableArrayRef<scf::ForOp> loops);
 
+/// Replace a perfect nest of "for" loops with a single linearized loop. Assumes
+/// `loops` contains a list of perfectly nested loops outermost to innermost
+/// that are normalized (step one and lower bound of zero) and with bounds and
+/// steps independent of any loop induction variable involved in the nest.
+/// Coalescing affine.for loops is not always possible, i.e., the result may not
+/// be representable using affine.for.
+LogicalResult coalesceLoops(MutableArrayRef<AffineForOp> loops);
+
 /// Take the ParallelLoop and for each set of dimension indices, combine them
 /// into a single dimension. combinedDimensions must contain each index into
 /// loops exactly once.

diff  --git a/mlir/lib/Transforms/LoopCoalescing.cpp b/mlir/lib/Transforms/LoopCoalescing.cpp
index ed7bff36321e1..c18c6dcd20578 100644
--- a/mlir/lib/Transforms/LoopCoalescing.cpp
+++ b/mlir/lib/Transforms/LoopCoalescing.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "PassDetail.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/SCF/SCF.h"
 #include "mlir/Transforms/LoopUtils.h"
 #include "mlir/Transforms/Passes.h"
@@ -20,65 +21,73 @@ using namespace mlir;
 
 namespace {
 struct LoopCoalescingPass : public LoopCoalescingBase<LoopCoalescingPass> {
-  void runOnFunction() override {
-    FuncOp func = getFunction();
 
-    func.walk([](scf::ForOp op) {
-      // Ignore nested loops.
-      if (op->getParentOfType<scf::ForOp>())
-        return;
+  /// Walk either an scf.for or an affine.for to find a band to coalesce.
+  template <typename LoopOpTy>
+  static void walkLoop(LoopOpTy op) {
+    // Ignore nested loops.
+    if (op->template getParentOfType<LoopOpTy>())
+      return;
 
-      SmallVector<scf::ForOp, 4> loops;
-      getPerfectlyNestedLoops(loops, op);
-      LLVM_DEBUG(llvm::dbgs()
-                 << "found a perfect nest of depth " << loops.size() << '\n');
+    SmallVector<LoopOpTy, 4> loops;
+    getPerfectlyNestedLoops(loops, op);
+    LLVM_DEBUG(llvm::dbgs()
+               << "found a perfect nest of depth " << loops.size() << '\n');
 
-      // Look for a band of loops that can be coalesced, i.e. perfectly nested
-      // loops with bounds defined above some loop.
-      // 1. For each loop, find above which parent loop its operands are
-      // defined.
-      SmallVector<unsigned, 4> operandsDefinedAbove(loops.size());
-      for (unsigned i = 0, e = loops.size(); i < e; ++i) {
-        operandsDefinedAbove[i] = i;
-        for (unsigned j = 0; j < i; ++j) {
-          if (areValuesDefinedAbove(loops[i].getOperands(),
-                                    loops[j].region())) {
-            operandsDefinedAbove[i] = j;
-            break;
-          }
+    // Look for a band of loops that can be coalesced, i.e. perfectly nested
+    // loops with bounds defined above some loop.
+    // 1. For each loop, find above which parent loop its operands are
+    // defined.
+    SmallVector<unsigned, 4> operandsDefinedAbove(loops.size());
+    for (unsigned i = 0, e = loops.size(); i < e; ++i) {
+      operandsDefinedAbove[i] = i;
+      for (unsigned j = 0; j < i; ++j) {
+        if (areValuesDefinedAbove(loops[i].getOperands(), loops[j].region())) {
+          operandsDefinedAbove[i] = j;
+          break;
         }
-        LLVM_DEBUG(llvm::dbgs()
-                   << "  bounds of loop " << i << " are known above depth "
-                   << operandsDefinedAbove[i] << '\n');
       }
+      LLVM_DEBUG(llvm::dbgs()
+                 << "  bounds of loop " << i << " are known above depth "
+                 << operandsDefinedAbove[i] << '\n');
+    }
 
-      // 2. Identify bands of loops such that the operands of all of them are
-      // defined above the first loop in the band.  Traverse the nest bottom-up
-      // so that modifications don't invalidate the inner loops.
-      for (unsigned end = loops.size(); end > 0; --end) {
-        unsigned start = 0;
-        for (; start < end - 1; ++start) {
-          auto maxPos =
-              *std::max_element(std::next(operandsDefinedAbove.begin(), start),
-                                std::next(operandsDefinedAbove.begin(), end));
-          if (maxPos > start)
-            continue;
+    // 2. Identify bands of loops such that the operands of all of them are
+    // defined above the first loop in the band.  Traverse the nest bottom-up
+    // so that modifications don't invalidate the inner loops.
+    for (unsigned end = loops.size(); end > 0; --end) {
+      unsigned start = 0;
+      for (; start < end - 1; ++start) {
+        auto maxPos =
+            *std::max_element(std::next(operandsDefinedAbove.begin(), start),
+                              std::next(operandsDefinedAbove.begin(), end));
+        if (maxPos > start)
+          continue;
 
-          assert(maxPos == start &&
-                 "expected loop bounds to be known at the start of the band");
-          LLVM_DEBUG(llvm::dbgs() << "  found coalesceable band from " << start
-                                  << " to " << end << '\n');
+        assert(maxPos == start &&
+               "expected loop bounds to be known at the start of the band");
+        LLVM_DEBUG(llvm::dbgs() << "  found coalesceable band from " << start
+                                << " to " << end << '\n');
 
-          auto band =
-              llvm::makeMutableArrayRef(loops.data() + start, end - start);
-          coalesceLoops(band);
-          break;
-        }
-        // If a band was found and transformed, keep looking at the loops above
-        // the outermost transformed loop.
-        if (start != end - 1)
-          end = start + 1;
+        auto band =
+            llvm::makeMutableArrayRef(loops.data() + start, end - start);
+        (void)coalesceLoops(band);
+        break;
       }
+      // If a band was found and transformed, keep looking at the loops above
+      // the outermost transformed loop.
+      if (start != end - 1)
+        end = start + 1;
+    }
+  }
+
+  void runOnFunction() override {
+    FuncOp func = getFunction();
+    func.walk([&](Operation *op) {
+      if (auto scfForOp = dyn_cast<scf::ForOp>(op))
+        walkLoop(scfForOp);
+      else if (auto affineForOp = dyn_cast<AffineForOp>(op))
+        walkLoop(affineForOp);
     });
   }
 };

diff  --git a/mlir/lib/Transforms/Utils/LoopUtils.cpp b/mlir/lib/Transforms/Utils/LoopUtils.cpp
index 05b292ab72ed3..4427001be2204 100644
--- a/mlir/lib/Transforms/Utils/LoopUtils.cpp
+++ b/mlir/lib/Transforms/Utils/LoopUtils.cpp
@@ -2045,7 +2045,7 @@ void mlir::coalesceLoops(MutableArrayRef<scf::ForOp> loops) {
 
   builder.setInsertionPointToStart(outermost.getBody());
 
-  // 3. Remap induction variables.  For each original loop, the value of the
+  // 3. Remap induction variables. For each original loop, the value of the
   // induction variable can be obtained by dividing the induction variable of
   // the linearized loop by the total number of iterations of the loops nested
   // in it modulo the number of iterations in this loop (remove the values
@@ -2077,6 +2077,120 @@ void mlir::coalesceLoops(MutableArrayRef<scf::ForOp> loops) {
   second.erase();
 }
 
+LogicalResult mlir::coalesceLoops(MutableArrayRef<AffineForOp> loops) {
+  if (loops.size() < 2)
+    return success();
+
+  AffineForOp innermost = loops.back();
+  AffineForOp outermost = loops.front();
+  AffineBound ub = outermost.getUpperBound();
+  AffineMap origUbMap = ub.getMap();
+  Location loc = outermost.getLoc();
+  OpBuilder builder(outermost);
+  for (AffineForOp loop : loops) {
+    // We only work on normalized loops.
+    if (loop.getStep() != 1 || !loop.hasConstantLowerBound() ||
+        loop.getConstantLowerBound() != 0)
+      return failure();
+  }
+  SmallVector<Value, 4> upperBoundSymbols;
+  SmallVector<Value, 4> ubOperands(ub.getOperands().begin(),
+                                   ub.getOperands().end());
+
+  // 1. Store the upper bound of the outermost loop in a variable.
+  Value prev;
+  if (!llvm::hasSingleElement(origUbMap.getResults()))
+    prev = builder.create<AffineMinOp>(loc, origUbMap, ubOperands);
+  else
+    prev = builder.create<AffineApplyOp>(loc, origUbMap, ubOperands);
+  upperBoundSymbols.push_back(prev);
+
+  // 2. Emit code computing the upper bound of the coalesced loop as product of
+  // the number of iterations of all loops.
+  for (AffineForOp loop : loops.drop_front()) {
+    ub = loop.getUpperBound();
+    origUbMap = ub.getMap();
+    ubOperands = ub.getOperands();
+    Value upperBound;
+    // If upper bound map has more than one result, take their minimum.
+    if (!llvm::hasSingleElement(origUbMap.getResults()))
+      upperBound = builder.create<AffineMinOp>(loc, origUbMap, ubOperands);
+    else
+      upperBound = builder.create<AffineApplyOp>(loc, origUbMap, ubOperands);
+    upperBoundSymbols.push_back(upperBound);
+    SmallVector<Value, 4> operands;
+    operands.push_back(prev);
+    operands.push_back(upperBound);
+    // Maintain running product of loop upper bounds.
+    prev = builder.create<AffineApplyOp>(
+        loc,
+        AffineMap::get(/*numDims=*/1,
+                       /*numSymbols=*/1,
+                       builder.getAffineDimExpr(0) *
+                           builder.getAffineSymbolExpr(0)),
+        operands);
+  }
+  // Set upper bound of the coalesced loop.
+  AffineMap newUbMap = AffineMap::get(
+      /*numDims=*/0,
+      /*numSymbols=*/1, builder.getAffineSymbolExpr(0), builder.getContext());
+  outermost.setUpperBound(prev, newUbMap);
+
+  builder.setInsertionPointToStart(outermost.getBody());
+
+  // 3. Remap induction variables. For each original loop, the value of the
+  // induction variable can be obtained by dividing the induction variable of
+  // the linearized loop by the total number of iterations of the loops nested
+  // in it modulo the number of iterations in this loop (remove the values
+  // related to the outer loops):
+  //   iv_i = floordiv(iv_linear, product-of-loop-ranges-until-i) mod range_i.
+  // Compute these iteratively from the innermost loop by creating a "running
+  // quotient" of division by the range.
+  Value previous = outermost.getInductionVar();
+  for (unsigned idx = loops.size(); idx > 0; --idx) {
+    if (idx != loops.size()) {
+      SmallVector<Value, 4> operands;
+      operands.push_back(previous);
+      operands.push_back(upperBoundSymbols[idx]);
+      previous = builder.create<AffineApplyOp>(
+          loc,
+          AffineMap::get(
+              /*numDims=*/1, /*numSymbols=*/1,
+              builder.getAffineDimExpr(0).floorDiv(
+                  builder.getAffineSymbolExpr(0))),
+          operands);
+    }
+    // Modified value of the induction variables of the nested loops after
+    // coalescing.
+    Value inductionVariable;
+    if (idx == 1) {
+      inductionVariable = previous;
+    } else {
+      SmallVector<Value, 4> applyOperands;
+      applyOperands.push_back(previous);
+      applyOperands.push_back(upperBoundSymbols[idx - 1]);
+      inductionVariable = builder.create<AffineApplyOp>(
+          loc,
+          AffineMap::get(
+              /*numDims=*/1, /*numSymbols=*/1,
+              builder.getAffineDimExpr(0) % builder.getAffineSymbolExpr(0)),
+          applyOperands);
+    }
+    replaceAllUsesInRegionWith(loops[idx - 1].getInductionVar(),
+                               inductionVariable, loops.back().region());
+  }
+
+  // 4. Move the operations from the innermost just above the second-outermost
+  // loop, delete the extra terminator and the second-outermost loop.
+  AffineForOp secondOutermostLoop = loops[1];
+  innermost.getBody()->back().erase();
+  outermost.getBody()->getOperations().splice(
+      Block::iterator(secondOutermostLoop.getOperation()),
+      innermost.getBody()->getOperations());
+  secondOutermostLoop.erase();
+  return success();
+}
+
 void mlir::collapseParallelLoops(
     scf::ParallelOp loops, ArrayRef<std::vector<unsigned>> combinedDimensions) {
   OpBuilder outsideBuilder(loops);

diff  --git a/mlir/test/Transforms/loop-coalescing.mlir b/mlir/test/Transforms/loop-coalescing.mlir
index fc219d4e186a4..074ca201c2312 100644
--- a/mlir/test/Transforms/loop-coalescing.mlir
+++ b/mlir/test/Transforms/loop-coalescing.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt -allow-unregistered-dialect -loop-coalescing %s | FileCheck %s
+// RUN: mlir-opt -split-input-file -allow-unregistered-dialect -loop-coalescing %s | FileCheck %s
 
 // CHECK-LABEL: @one_3d_nest
 func @one_3d_nest() {
@@ -191,3 +191,170 @@ func @two_bands() {
   }
   return
 }
+
+// -----
+
+// Check coalescing of affine.for loops when all the loops have constant upper bound.
+// CHECK-DAG: #[[SIXTEEN:.*]] = affine_map<() -> (16)>
+// CHECK-DAG: #[[SIXTY_FOUR:.*]] = affine_map<() -> (64)>
+// CHECK-DAG: #[[PRODUCT:.*]] = affine_map<(d0)[s0] -> (d0 * s0)>
+// CHECK-DAG: #[[EIGHT:.*]] = affine_map<() -> (8)>
+// CHECK-DAG: #[[MOD:.*]] = affine_map<(d0)[s0] -> (d0 mod s0)>
+// CHECK-DAG: #[[DIV:.*]] = affine_map<(d0)[s0] -> (d0 floordiv s0)>
+func @coalesce_affine_for() {
+  affine.for %i = 0 to 16 {
+    affine.for %j = 0 to 64 {
+      affine.for %k = 0 to 8 {
+        "test.foo"(%i, %j, %k) : (index, index, index) -> ()
+      }
+    }
+  }
+  return
+}
+// CHECK-DAG: %[[T0:.*]] = affine.apply #[[SIXTEEN]]()
+// CHECK-DAG: %[[T1:.*]] = affine.apply #[[SIXTY_FOUR]]()
+// CHECK-DAG: %[[T2:.*]] = affine.apply #[[PRODUCT]](%[[T0]])[%[[T1]]]
+// CHECK-DAG: %[[T3:.*]] = affine.apply #[[EIGHT]]()
+// CHECK-DAG: %[[T4:.*]] = affine.apply #[[PRODUCT]](%[[T2]])[%[[T3]]]
+// CHECK:       affine.for %[[IV:.*]] = 0 to %[[T4]]
+// CHECK-DAG:    %[[K:.*]] =  affine.apply #[[MOD]](%[[IV]])[%[[T3]]]
+// CHECK-DAG:    %[[T6:.*]] = affine.apply #[[DIV]](%[[IV]])[%[[T3]]]
+// CHECK-DAG:    %[[J:.*]] =  affine.apply #[[MOD]](%[[T6]])[%[[T1]]]
+// CHECK-DAG:    %[[I:.*]] =  affine.apply #[[DIV]](%[[T6]])[%[[T1]]]
+// CHECK-NEXT:    "test.foo"(%[[I]], %[[J]], %[[K]])
+// CHECK-NEXT:  }
+// CHECK-NEXT:  return
+
+// -----
+
+// Check coalescing of affine.for loops when all the loops have non constant upper bounds.
+// CHECK-DAG: #[[IDENTITY:.*]] = affine_map<()[s0] -> (s0)>
+// CHECK-DAG: #[[PRODUCT:.*]] = affine_map<(d0)[s0] -> (d0 * s0)>
+// CHECK-DAG: #[[MOD:.*]] = affine_map<(d0)[s0] -> (d0 mod s0)>
+// CHECK-DAG: #[[FLOOR:.*]] = affine_map<(d0)[s0] -> (d0 floordiv s0)>
+func @coalesce_affine_for(%arg0: memref<?x?xf32>) {
+  %c0 = constant 0 : index
+  %M = memref.dim %arg0, %c0 : memref<?x?xf32>
+  %N = memref.dim %arg0, %c0 : memref<?x?xf32>
+  %K = memref.dim %arg0, %c0 : memref<?x?xf32>
+  affine.for %i = 0 to %M {
+    affine.for %j = 0 to %N {
+      affine.for %k = 0 to %K {
+      "test.foo"(%i, %j, %k) : (index, index, index) -> ()
+      }
+    }
+  }
+  return
+}
+// CHECK: %[[T0:.*]] = memref.dim %arg{{.*}}, %c{{.*}} : memref<?x?xf32>
+// CHECK: %[[T1:.*]] = memref.dim %arg{{.*}}, %c{{.*}} : memref<?x?xf32>
+// CHECK: %[[T2:.*]] = memref.dim %arg{{.*}}, %c{{.*}} : memref<?x?xf32>
+// CHECK-DAG: %[[T3:.*]] = affine.apply #[[IDENTITY]]()[%[[T0]]]
+// CHECK-DAG: %[[T4:.*]] = affine.apply #[[IDENTITY]]()[%[[T1]]]
+// CHECK-DAG: %[[T5:.*]] = affine.apply #[[PRODUCT]](%[[T3]])[%[[T4]]]
+// CHECK-DAG: %[[T6:.*]] = affine.apply #[[IDENTITY]]()[%[[T2]]]
+// CHECK-DAG: %[[T7:.*]] = affine.apply #[[PRODUCT]](%[[T5]])[%[[T6]]]
+// CHECK: affine.for %[[IV:.*]] = 0 to %[[T7]]
+// CHECK-DAG:    %[[K:.*]] = affine.apply #[[MOD]](%[[IV]])[%[[T6]]]
+// CHECK-DAG:    %[[T9:.*]] = affine.apply #[[FLOOR]](%[[IV]])[%[[T6]]]
+// CHECK-DAG:    %[[J:.*]] = affine.apply #[[MOD]](%[[T9]])[%[[T4]]]
+// CHECK-DAG:    %[[I:.*]] = affine.apply #[[FLOOR]](%[[T9]])[%[[T4]]]
+// CHECK-NEXT:    "test.foo"(%[[I]], %[[J]], %[[K]])
+// CHECK-NEXT:  }
+// CHECK-NEXT:  return
+
+// -----
+
+// Check coalescing of affine.for loops when some of the loop has constant upper bounds while others have nin constant upper bounds.
+// CHECK-DAG: #[[IDENTITY:.*]] = affine_map<()[s0] -> (s0)>
+// CHECK-DAG: #[[PRODUCT:.*]] = affine_map<(d0)[s0] -> (d0 * s0)>
+// CHECK-DAG: #[[SIXTY_FOUR:.*]] = affine_map<() -> (64)>
+// CHECK-DAG: #[[MOD:.*]] = affine_map<(d0)[s0] -> (d0 mod s0)>
+// CHECK-DAG: #[[DIV:.*]] = affine_map<(d0)[s0] -> (d0 floordiv s0)>
+func @coalesce_affine_for(%arg0: memref<?x?xf32>) {
+  %c0 = constant 0 : index
+  %M = memref.dim %arg0, %c0 : memref<?x?xf32>
+  %N = memref.dim %arg0, %c0 : memref<?x?xf32>
+  affine.for %i = 0 to %M {
+    affine.for %j = 0 to %N {
+      affine.for %k = 0 to 64 {
+      "test.foo"(%i, %j, %k) : (index, index, index) -> ()
+      }
+    }
+  }
+  return
+}
+// CHECK: %[[T0:.*]] = memref.dim %arg{{.*}}, %c{{.*}} : memref<?x?xf32>
+// CHECK: %[[T1:.*]] = memref.dim %arg{{.*}}, %c{{.*}} : memref<?x?xf32>
+// CHECK-DAG: %[[T2:.*]] = affine.apply #[[IDENTITY]]()[%[[T0]]]
+// CHECK-DAG: %[[T3:.*]] = affine.apply #[[IDENTITY]]()[%[[T1]]]
+// CHECK-DAG: %[[T4:.*]] = affine.apply #[[PRODUCT]](%[[T2]])[%[[T3]]]
+// CHECK-DAG: %[[T5:.*]] = affine.apply #[[SIXTY_FOUR]]()
+// CHECK-DAG: %[[T6:.*]] = affine.apply #[[PRODUCT]](%[[T4]])[%[[T5]]]
+// CHECK: affine.for %[[IV:.*]] = 0 to %[[T6]]
+// CHECK-DAG:    %[[K:.*]] = affine.apply #[[MOD]](%[[IV]])[%[[T5]]]
+// CHECK-DAG:    %[[T8:.*]] = affine.apply #[[DIV]](%[[IV]])[%[[T5]]]
+// CHECK-DAG:    %[[J:.*]] = affine.apply #[[MOD]](%[[T8]])[%[[T3]]]
+// CHECK-DAG:    %[[I:.*]] = affine.apply #[[DIV]](%[[T8]])[%[[T3]]]
+// CHECK-NEXT:    "test.foo"(%[[I]], %[[J]], %[[K]])
+// CHECK-NEXT:  }
+// CHECK-NEXT:  return
+
+// -----
+
+// Check coalescing of affine.for loops when upper bound contains multi result upper bound map.
+// CHECK-DAG: #[[MAP0:.*]] = affine_map<()[s0] -> (s0, -s0)>
+// CHECK-DAG: #[[IDENTITY:.*]] = affine_map<()[s0] -> (s0)>
+// CHECK-DAG: #[[PRODUCT:.*]] = affine_map<(d0)[s0] -> (d0 * s0)>
+// CHECK-DAG: #[[MOD:.*]] = affine_map<(d0)[s0] -> (d0 mod s0)>
+// CHECK-DAG: #[[DIV:.*]] = affine_map<(d0)[s0] -> (d0 floordiv s0)>
+#myMap = affine_map<()[s1] -> (s1, -s1)>
+func @coalesce_affine_for(%arg0: memref<?x?xf32>) {
+ %c0 = constant 0 : index
+ %M = memref.dim %arg0, %c0 : memref<?x?xf32>
+ %N = memref.dim %arg0, %c0 : memref<?x?xf32>
+ %K = memref.dim %arg0, %c0 : memref<?x?xf32>
+ affine.for %i = 0 to min #myMap()[%M] {
+   affine.for %j = 0 to %N {
+     affine.for %k = 0 to %K {
+     "test.foo"(%i, %j, %k) : (index, index, index) -> ()
+     }
+   }
+ }
+ return
+}
+// CHECK: %[[T0:.*]] = memref.dim %arg{{.*}}, %c{{.*}} : memref<?x?xf32>
+// CHECK: %[[T1:.*]] = memref.dim %arg{{.*}}, %c{{.*}} : memref<?x?xf32>
+// CHECK: %[[T2:.*]] = memref.dim %arg{{.*}}, %c{{.*}} : memref<?x?xf32>
+// CHECK-DAG: %[[T3:.*]] = affine.min #[[MAP0]]()[%[[T0]]]
+// CHECK-DAG: %[[T4:.*]] = affine.apply #[[IDENTITY]]()[%[[T1]]]
+// CHECK-DAG: %[[T5:.*]] = affine.apply #[[PRODUCT]](%[[T3]])[%[[T4]]]
+// CHECK-DAG: %[[T6:.*]] = affine.apply #[[IDENTITY]]()[%[[T2]]]
+// CHECK-DAG: %[[T7:.*]] = affine.apply #[[PRODUCT]](%[[T5]])[%[[T6]]]
+// CHECK: affine.for %[[IV:.*]] = 0 to %[[T7]]
+// CHECK-DAG:    %[[K:.*]] = affine.apply #[[MOD]](%[[IV]])[%[[T6]]]
+// CHECK-DAG:    %[[T9:.*]] = affine.apply #[[DIV]](%[[IV]])[%[[T6]]]
+// CHECK-DAG:    %[[J:.*]] = affine.apply #[[MOD]](%[[T9]])[%[[T4]]]
+// CHECK-DAG:    %[[I:.*]] = affine.apply #[[DIV]](%[[T9]])[%[[T4]]]
+// CHECK-NEXT:    "test.foo"(%[[I]], %[[J]], %[[K]])
+// CHECK-NEXT:  }
+// CHECK-NEXT:  return
+
+// -----
+
+// CHECK-DAG: #[[MAP0:.*]] = affine_map<(d0) -> (d0 * 110)>
+// CHECK-DAG: #[[MAP1:.*]] = affine_map<(d0) -> (696, d0 * 110 + 110)>
+#map0 = affine_map<(d0) -> (d0 * 110)>
+#map1 = affine_map<(d0) -> (696, d0 * 110 + 110)>
+func @test_loops_do_not_get_coalesced() {
+  affine.for %i = 0 to 7 {
+    affine.for %j = #map0(%i) to min #map1(%i) {
+    }
+  }
+  return
+}
+// CHECK: affine.for %[[IV0:.*]] = 0 to 7
+// CHECK-NEXT: affine.for %[[IV1:.*]] = #[[MAP0]](%[[IV0]]) to min #[[MAP1]](%[[IV0]])
+// CHECK-NEXT: }
+// CHECK-NEXT: }
+// CHECK-NEXT: return