[Mlir-commits] [mlir] e4dd964 - [mlir] Loop bounds inference in linalg.generic op improved to support bounds for convolution
Alex Zinenko
llvmlistbot at llvm.org
Thu Jul 23 02:02:03 PDT 2020
Author: Jakub Lichman
Date: 2020-07-23T11:01:54+02:00
New Revision: e4dd964df0164651f1804612ad41582fb801607f
URL: https://github.com/llvm/llvm-project/commit/e4dd964df0164651f1804612ad41582fb801607f
DIFF: https://github.com/llvm/llvm-project/commit/e4dd964df0164651f1804612ad41582fb801607f.diff
LOG: [mlir] Loop bounds inference in linalg.generic op improved to support bounds for convolution
Loop bound inference is currently very limited as it supports only permutation maps, and thus
it is impossible to implement convolution with linalg.generic as it requires more advanced
loop bound inference. This commit solves it for the convolution case.
Depends On D83158
Differential Revision: https://reviews.llvm.org/D83191
Added:
Modified:
mlir/include/mlir/Dialect/Linalg/Utils/Utils.h
mlir/lib/Dialect/Linalg/Transforms/Loops.cpp
mlir/lib/Dialect/Linalg/Utils/Utils.cpp
mlir/test/Dialect/Linalg/loops.mlir
Removed:
################################################################################
diff --git a/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h b/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h
index 769f4894a2b7..0dbf863c2f69 100644
--- a/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h
+++ b/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h
@@ -116,15 +116,16 @@ SmallVector<Value, 8> getViewSizes(OpBuilder &builder, ConcreteOp linalgOp) {
for (unsigned idx = 0; idx < attr.getInt(); idx++)
symbolsPos += ranks[idx];
- // Append or rewrite the end of the value list that corresponds to the
+ // Append the end of the value list that corresponds to the
// values mapping to symbols. Since inside concatenated map symbols are
// repeated we have to repeat the sizes as well.
- for (unsigned idx = 0, s = ranks.size(); idx < s; ++idx) {
- for (unsigned idx2 = 0; idx2 < numSymb; ++idx2) {
- Value viewSize = res[symbolsPos + idx2];
- res.push_back(viewSize);
- }
- }
+
+ // Reserve is mandatory to avoid a potential undefined behavior with
+ // pushing back to smallvector from itself.
+ res.reserve(res.size() + ranks.size() * numSymb);
+ for (unsigned idx = 0, s = ranks.size(); idx < s; ++idx)
+ for (unsigned idx2 = 0; idx2 < numSymb; ++idx2)
+ res.push_back(res[symbolsPos + idx2]);
}
return res;
}
@@ -134,7 +135,7 @@ SmallVector<Value, 8> getViewSizes(OpBuilder &builder, ConcreteOp linalgOp) {
/// `createAndFold` builder method. If `folder` is null, the regular `create`
/// method is called.
SmallVector<Value, 4> applyMapToValues(OpBuilder &b, Location loc,
- AffineMap map, ArrayRef<Value> values,
+ AffineMap map, ValueRange values,
OperationFolder *folder = nullptr);
/// Returns all the operands of `linalgOp` that are not views.
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Loops.cpp b/mlir/lib/Dialect/Linalg/Transforms/Loops.cpp
index b5643e997da5..b256852ced26 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Loops.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Loops.cpp
@@ -76,6 +76,74 @@ emitLoopRanges(OpBuilder &b, Location loc, AffineMap map,
return res;
}
+/// Creates a number of ranges equal to the number of dimensions in the `map`.
+/// For now, the function supports only a limited number of expressions inside
+/// map results. It expects a non-inverted, concatenated map; the last values in
+/// viewSizes will be applied to the symbols in the map.
+static SmallVector<SubViewOp::Range, 4>
+emitLoopRangesWithSymbols(OpBuilder &b, Location loc, AffineMap map,
+ ValueRange viewSizes) {
+ unsigned numDims = map.getNumDims(), numRes = map.getNumResults();
+ unsigned numSym = map.getNumSymbols();
+ assert(viewSizes.size() == numRes + numSym &&
+ "viewSizes must contain sizes of all views and values for symbols");
+ SmallVector<SubViewOp::Range, 4> res(numDims);
+ for (unsigned idx = 0; idx < numRes; ++idx) {
+ auto result = map.getResult(idx);
+ if (auto d = result.dyn_cast<AffineDimExpr>()) {
+ if (res[d.getPosition()].offset)
+ continue;
+ res[d.getPosition()] = SubViewOp::Range{
+ std_constant_index(0), viewSizes[idx], std_constant_index(1)};
+ }
+
+ // If the access pattern is of form (m, n)[s] -> (m + n - s floordiv 2),
+ // then the bounds are:
+ // (s floordiv 2) <= m <= (size(m) + s floordiv 2 - s + 1).
+ // where size(n) is applied to the symbol s.
+ // This is done statically now.
+ if (auto binOp = result.dyn_cast<AffineBinaryOpExpr>()) {
+ auto lhs = binOp.getLHS().dyn_cast<AffineBinaryOpExpr>();
+ auto rhs = binOp.getRHS().dyn_cast<AffineBinaryOpExpr>();
+ if (!lhs || !rhs || binOp.getKind() != AffineExprKind::Add ||
+ lhs.getKind() != AffineExprKind::Add ||
+ rhs.getKind() != mlir::AffineExprKind::Mul)
+ continue;
+
+ auto m = lhs.getLHS().dyn_cast<AffineDimExpr>();
+ auto n = lhs.getRHS().dyn_cast<AffineDimExpr>();
+ auto fDiv = rhs.getLHS().dyn_cast<AffineBinaryOpExpr>();
+ auto minusOne = rhs.getRHS().dyn_cast<AffineConstantExpr>();
+ if (!m || !n || !fDiv || !minusOne ||
+ fDiv.getKind() != AffineExprKind::FloorDiv ||
+ fDiv.getLHS().getKind() != AffineExprKind::SymbolId ||
+ fDiv.getRHS().getKind() != AffineExprKind::Constant)
+ continue;
+
+ auto s = fDiv.getLHS().dyn_cast<AffineSymbolExpr>();
+ if (minusOne.getValue() != -1)
+ continue;
+
+ int mPos = m.getPosition();
+ AffineExpr one = getAffineConstantExpr(1, s.getContext());
+ AffineExpr sizeOfM = getAffineSymbolExpr(numSym, s.getContext());
+ // Construction of upper bound (size(m) + s floordiv 2 - s + 1).
+ AffineExpr upperOffsetExpr = sizeOfM + fDiv + one - s;
+ AffineMap fromMap = AffineMap::get(numDims, numSym + 1, fDiv);
+ AffineMap toMap = AffineMap::get(numDims, numSym + 1, upperOffsetExpr);
+ SmallVector<Value, 8> values(viewSizes.begin(),
+ viewSizes.begin() + numDims);
+ values.insert(values.end(), viewSizes.begin() + numRes, viewSizes.end());
+ values.push_back(viewSizes[mPos]);
+ // Construction of the lower bound (s floordiv 2).
+ Value from = applyMapToValues(b, loc, fromMap, values).front();
+ Value to = applyMapToValues(b, loc, toMap, values).front();
+ res[mPos] = SubViewOp::Range{from, to, std_constant_index(1)};
+ }
+ }
+ return res;
+}
+
template <typename IndexedValueType, typename OpType>
static void inlineRegionAndEmitStore(OpType op, ArrayRef<Value> indexedValues,
ArrayRef<SmallVector<Value, 8>> indexing,
@@ -469,32 +537,24 @@ Optional<LinalgLoops> linalgOpToLoopsImpl(Operation *op, OpBuilder &builder) {
llvm::map_range(mapsRange, [](AffineMapAttr a) { return a.getValue(); }));
SmallVector<Value, 8> sizes = getViewSizes(builder, linalgOp);
AffineMap map = concatAffineMaps(maps);
+ SmallVector<SubViewOp::Range, 4> loopRanges;
+
if (map.getNumSymbols()) {
- // Ignore symbols for now as they are not supported by inversePermutation.
- unsigned dims = map.getNumDims();
- SmallVector<AffineExpr, 8> zeros(
- map.getNumSymbols(), getAffineConstantExpr(0, map.getContext()));
- SmallVector<AffineExpr, 8> res;
- for (auto result : map.getResults())
- res.push_back(result.replaceDimsAndSymbols({}, zeros));
-
- map = AffineMap::get(dims, 0, res, map.getContext());
-
- // Cut off values that would have been applied to symbols
- sizes.resize(res.size());
- }
+ loopRanges = emitLoopRangesWithSymbols(scope.getBuilderRef(),
+ scope.getLocation(), map, sizes);
+ } else {
+ AffineMap invertedMap = inversePermutation(map);
+ if (!invertedMap)
+ return {};
+ if (invertedMap.isEmpty()) {
+ emitScalarImplementation<IndexedValueTy>({}, linalgOp);
+ return LinalgLoops();
+ }
- AffineMap invertedMap = inversePermutation(map);
- if (!invertedMap)
- return {};
- if (invertedMap.isEmpty()) {
- emitScalarImplementation<IndexedValueTy>({}, linalgOp);
- return LinalgLoops();
+ loopRanges = emitLoopRanges(scope.getBuilderRef(), scope.getLocation(),
+ invertedMap, sizes);
}
-
SmallVector<Value, 4> allIvs;
- auto loopRanges = emitLoopRanges(scope.getBuilderRef(), scope.getLocation(),
- invertedMap, sizes);
GenerateLoopNest<LoopTy>::doit(
loopRanges, linalgOp.iterator_types().getValue(), [&](ValueRange ivs) {
allIvs.append(ivs.begin(), ivs.end());
diff --git a/mlir/lib/Dialect/Linalg/Utils/Utils.cpp b/mlir/lib/Dialect/Linalg/Utils/Utils.cpp
index a9d5e2028c22..968f5b1d82c5 100644
--- a/mlir/lib/Dialect/Linalg/Utils/Utils.cpp
+++ b/mlir/lib/Dialect/Linalg/Utils/Utils.cpp
@@ -57,7 +57,7 @@ RegionMatcher::matchAsScalarBinaryOp(GenericOp op) {
static Value emitOrFoldComposedAffineApply(OpBuilder &b, Location loc,
AffineMap map,
- ArrayRef<Value> operandsRef,
+ ValueRange operandsRef,
OperationFolder *folder) {
SmallVector<Value, 4> operands(operandsRef.begin(), operandsRef.end());
fullyComposeAffineMapAndOperands(&map, &operands);
@@ -68,16 +68,16 @@ static Value emitOrFoldComposedAffineApply(OpBuilder &b, Location loc,
SmallVector<Value, 4> mlir::linalg::applyMapToValues(OpBuilder &b, Location loc,
AffineMap map,
- ArrayRef<Value> values,
+ ValueRange values,
OperationFolder *folder) {
SmallVector<Value, 4> res;
res.reserve(map.getNumResults());
- unsigned numDims = map.getNumDims();
+ unsigned numDims = map.getNumDims(), numSym = map.getNumSymbols();
// For each `expr` in `map`, applies the `expr` to the values extracted from
// ranges. If the resulting application can be folded into a Value, the
// folding occurs eagerly. Otherwise, an affine.apply operation is emitted.
for (auto expr : map.getResults()) {
- AffineMap map = AffineMap::get(numDims, 0, expr);
+ AffineMap map = AffineMap::get(numDims, numSym, expr);
res.push_back(emitOrFoldComposedAffineApply(b, loc, map, values, folder));
}
return res;
diff --git a/mlir/test/Dialect/Linalg/loops.mlir b/mlir/test/Dialect/Linalg/loops.mlir
index b3f6160b17ed..30bb90bdd43a 100644
--- a/mlir/test/Dialect/Linalg/loops.mlir
+++ b/mlir/test/Dialect/Linalg/loops.mlir
@@ -14,6 +14,8 @@
// CHECKLOOP-DAG: #[[$stride2Dilation1:.*]] = affine_map<(d0, d1) -> (d0 * 2 + d1)>
// CHECKLOOP-DAG: #[[$stride2Dilation4:.*]] = affine_map<(d0, d1) -> (d0 * 2 + d1 * 4)>
// CHECKLOOP-DAG: #[[$stride3Dilation5:.*]] = affine_map<(d0, d1) -> (d0 * 3 + d1 * 5)>
+// CHECKLOOP-DAG: #[[$convLowerBound:.*]] = affine_map<()[s0] -> (s0 floordiv 2)>
+// CHECKLOOP-DAG: #[[$convUpperBound:.*]] = affine_map<()[s0, s1] -> (s1 + s0 floordiv 2 - s0 + 1)>
// CHECKLOOP-DAG: #[[$convMap:.*]] = affine_map<(d0, d1)[s0] -> (d0 + d1 - s0 floordiv 2)>
// CHECKPARALLEL-DAG: #[[$strided1D:.*]] = affine_map<(d0)[s0] -> (d0 + s0)>
@@ -26,6 +28,8 @@
// CHECKPARALLEL-DAG: #[[$stride2Dilation1:.*]] = affine_map<(d0, d1) -> (d0 * 2 + d1)>
// CHECKPARALLEL-DAG: #[[$stride2Dilation4:.*]] = affine_map<(d0, d1) -> (d0 * 2 + d1 * 4)>
// CHECKPARALLEL-DAG: #[[$stride3Dilation5:.*]] = affine_map<(d0, d1) -> (d0 * 3 + d1 * 5)>
+// CHECKPARALLEL-DAG: #[[$convLowerBound:.*]] = affine_map<()[s0] -> (s0 floordiv 2)>
+// CHECKPARALLEL-DAG: #[[$convUpperBound:.*]] = affine_map<()[s0, s1] -> (s1 + s0 floordiv 2 - s0 + 1)>
// CHECKPARALLEL-DAG: #[[$convMap:.*]] = affine_map<(d0, d1)[s0] -> (d0 + d1 - s0 floordiv 2)>
@@ -947,10 +951,12 @@ func @conv1d(%in : memref<?xf32>, %filter : memref<?xf32>, %out : memref<?xf32>
// CHECKLOOP-SAME: %[[arg1:[a-zA-Z0-9]+]]: memref<?xf32>
// CHECKLOOP-SAME: %[[arg2:[a-zA-Z0-9]+]]: memref<?xf32>
// CHECKLOOP: %[[c0:.*]] = constant 0 : index
-// CHECKLOOP: %[[dim0:.*]] = dim %[[arg1]], %[[c0]] : memref<?xf32>
-// CHECKLOOP: %[[dim1:.*]] = dim %[[arg2]], %[[c0]] : memref<?xf32>
-// CHECKLOOP: scf.for %[[b:.*]] = %{{.*}} to %[[dim1]] step %{{.*}} {
-// CHECKLOOP: scf.for %[[m:.*]] = %{{.*}} to %[[dim0]] step %{{.*}} {
+// CHECKLOOP: %[[dim0:.*]] = dim %[[arg0]], %[[c0]] : memref<?xf32>
+// CHECKLOOP: %[[dim1:.*]] = dim %[[arg1]], %[[c0]] : memref<?xf32>
+// CHECKLOOP: %[[lowerBound:.*]] = affine.apply #[[$convLowerBound]]()[%[[dim1]]]
+// CHECKLOOP: %[[upperBound:.*]] = affine.apply #[[$convUpperBound]]()[%[[dim1]], %[[dim0]]]
+// CHECKLOOP: scf.for %[[b:.*]] = %[[lowerBound]] to %[[upperBound]] step %{{.*}} {
+// CHECKLOOP: scf.for %[[m:.*]] = %{{.*}} to %[[dim1]] step %{{.*}} {
// CHECKLOOP: %[[dim2:.*]] = dim %[[arg1]], %[[c0]] : memref<?xf32>
// CHECKLOOP: %[[aff:.*]] = affine.apply #[[$convMap]](%{{.*}}, %{{.*}})[%[[dim2]]]
// CHECKLOOP: %[[va:.*]] = load %[[arg0]][%[[aff]]] : memref<?xf32>
@@ -965,9 +971,11 @@ func @conv1d(%in : memref<?xf32>, %filter : memref<?xf32>, %out : memref<?xf32>
// CHECKPARALLEL-SAME: %[[arg1:[a-zA-Z0-9]+]]: memref<?xf32>
// CHECKPARALLEL-SAME: %[[arg2:[a-zA-Z0-9]+]]: memref<?xf32>
// CHECKPARALLEL: %[[c0:.*]] = constant 0 : index
-// CHECKPARALLEL: %[[dim0:.*]] = dim %[[arg1]], %[[c0]] : memref<?xf32>
-// CHECKPARALLEL: %[[dim1:.*]] = dim %[[arg2]], %[[c0]] : memref<?xf32>
-// CHECKPARALLEL: scf.parallel (%[[b:.*]], %[[m:.*]]) = (%{{.*}}, %{{.*}}) to (%[[dim1]], %[[dim0]]) step ({{.*}}) {
+// CHECKPARALLEL: %[[dim0:.*]] = dim %[[arg0]], %[[c0]] : memref<?xf32>
+// CHECKPARALLEL: %[[dim1:.*]] = dim %[[arg1]], %[[c0]] : memref<?xf32>
+// CHECKPARALLEL: %[[lowerBound:.*]] = affine.apply #[[$convLowerBound]]()[%[[dim1]]]
+// CHECKPARALLEL: %[[upperBound:.*]] = affine.apply #[[$convUpperBound]]()[%[[dim1]], %[[dim0]]]
+// CHECKPARALLEL: scf.parallel (%[[b:.*]], %[[m:.*]]) = (%[[lowerBound]], %{{.*}}) to (%[[upperBound]], %[[dim1]]) step ({{.*}}) {
// CHECKPARALLEL: %[[dim2:.*]] = dim %[[arg1]], %[[c0]] : memref<?xf32>
// CHECKPARALLEL: %[[aff:.*]] = affine.apply #[[$convMap]](%{{.*}}, %{{.*}})[%[[dim2]]]
// CHECKPARALLEL: %[[va:.*]] = load %[[arg0]][%[[aff]]] : memref<?xf32>
@@ -1012,14 +1020,18 @@ func @conv2d(%in : memref<?x?xf32>, %filter : memref<?x?xf32>, %out : memref<?x
// CHECKLOOP-SAME: %[[arg2:[a-zA-Z0-9]+]]: memref<?x?xf32>
// CHECKLOOP: %[[c0:.*]] = constant 0 : index
// CHECKLOOP: %[[c1:.*]] = constant 1 : index
-// CHECKLOOP: %[[dim0:.*]] = dim %[[arg1]], %[[c0]] : memref<?x?xf32>
-// CHECKLOOP: %[[dim1:.*]] = dim %[[arg1]], %[[c1]] : memref<?x?xf32>
-// CHECKLOOP: %[[dim2:.*]] = dim %[[arg2]], %[[c0]] : memref<?x?xf32>
-// CHECKLOOP: %[[dim3:.*]] = dim %[[arg2]], %[[c1]] : memref<?x?xf32>
-// CHECKLOOP: scf.for %[[i0:.*]] = %{{.*}} to %[[dim2]] step %{{.*}} {
-// CHECKLOOP: scf.for %[[i1:.*]] = %{{.*}} to %[[dim3]] step %{{.*}} {
-// CHECKLOOP: scf.for %[[i2:.*]] = %{{.*}} to %[[dim0]] step %{{.*}} {
-// CHECKLOOP: scf.for %[[i3:.*]] = %{{.*}} to %[[dim1]] step %{{.*}} {
+// CHECKLOOP: %[[dim0:.*]] = dim %[[arg0]], %[[c0]] : memref<?x?xf32>
+// CHECKLOOP: %[[dim1:.*]] = dim %[[arg0]], %[[c1]] : memref<?x?xf32>
+// CHECKLOOP: %[[dim2:.*]] = dim %[[arg1]], %[[c0]] : memref<?x?xf32>
+// CHECKLOOP: %[[dim3:.*]] = dim %[[arg1]], %[[c1]] : memref<?x?xf32>
+// CHECKLOOP: %[[lowerBound1:.*]] = affine.apply #[[$convLowerBound]]()[%[[dim2]]]
+// CHECKLOOP: %[[upperBound1:.*]] = affine.apply #[[$convUpperBound]]()[%[[dim2]], %[[dim0]]]
+// CHECKLOOP: %[[lowerBound2:.*]] = affine.apply #[[$convLowerBound]]()[%[[dim3]]]
+// CHECKLOOP: %[[upperBound2:.*]] = affine.apply #[[$convUpperBound]]()[%[[dim3]], %[[dim1]]]
+// CHECKLOOP: scf.for %[[i0:.*]] = %[[lowerBound1]] to %[[upperBound1]] step %{{.*}} {
+// CHECKLOOP: scf.for %[[i1:.*]] = %[[lowerBound2]] to %[[upperBound2]] step %{{.*}} {
+// CHECKLOOP: scf.for %[[i2:.*]] = %{{.*}} to %[[dim2]] step %{{.*}} {
+// CHECKLOOP: scf.for %[[i3:.*]] = %{{.*}} to %[[dim3]] step %{{.*}} {
// CHECKLOOP: %[[dim4:.*]] = dim %[[arg1]], %[[c0]] : memref<?x?xf32>
// CHECKLOOP: %[[dim5:.*]] = dim %[[arg1]], %[[c1]] : memref<?x?xf32>
// CHECKLOOP: %[[aff1:.*]] = affine.apply #[[$convMap]](%{{.*}}, %{{.*}})[%[[dim4]]]
@@ -1037,11 +1049,15 @@ func @conv2d(%in : memref<?x?xf32>, %filter : memref<?x?xf32>, %out : memref<?x
// CHECKPARALLEL-SAME: %[[arg2:[a-zA-Z0-9]+]]: memref<?x?xf32>
// CHECKPARALLEL: %[[c0:.*]] = constant 0 : index
// CHECKPARALLEL: %[[c1:.*]] = constant 1 : index
-// CHECKPARALLEL: %[[dim0:.*]] = dim %[[arg1]], %[[c0]] : memref<?x?xf32>
-// CHECKPARALLEL: %[[dim1:.*]] = dim %[[arg1]], %[[c1]] : memref<?x?xf32>
-// CHECKPARALLEL: %[[dim2:.*]] = dim %[[arg2]], %[[c0]] : memref<?x?xf32>
-// CHECKPARALLEL: %[[dim3:.*]] = dim %[[arg2]], %[[c1]] : memref<?x?xf32>
-// CHECKPARALLEL: scf.parallel (%[[i0:.*]], %[[i1:.*]], %[[i2:.*]], %[[i3:.*]]) = (%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) to (%[[dim2]], %[[dim3]], %[[dim0]], %[[dim1]]) step ({{.*}}) {
+// CHECKPARALLEL: %[[dim0:.*]] = dim %[[arg0]], %[[c0]] : memref<?x?xf32>
+// CHECKPARALLEL: %[[dim1:.*]] = dim %[[arg0]], %[[c1]] : memref<?x?xf32>
+// CHECKPARALLEL: %[[dim2:.*]] = dim %[[arg1]], %[[c0]] : memref<?x?xf32>
+// CHECKPARALLEL: %[[dim3:.*]] = dim %[[arg1]], %[[c1]] : memref<?x?xf32>
+// CHECKPARALLEL: %[[lowerBound1:.*]] = affine.apply #[[$convLowerBound]]()[%[[dim2]]]
+// CHECKPARALLEL: %[[upperBound1:.*]] = affine.apply #[[$convUpperBound]]()[%[[dim2]], %[[dim0]]]
+// CHECKPARALLEL: %[[lowerBound2:.*]] = affine.apply #[[$convLowerBound]]()[%[[dim3]]]
+// CHECKPARALLEL: %[[upperBound2:.*]] = affine.apply #[[$convUpperBound]]()[%[[dim3]], %[[dim1]]]
+// CHECKPARALLEL: scf.parallel (%[[i0:.*]], %[[i1:.*]], %[[i2:.*]], %[[i3:.*]]) = (%[[lowerBound1]], %[[lowerBound2]], %{{.*}}, %{{.*}}) to (%[[upperBound1]], %[[upperBound2]], %[[dim2]], %[[dim3]]) step ({{.*}}) {
// CHECKPARALLEL: %[[dim4:.*]] = dim %[[arg1]], %[[c0]] : memref<?x?xf32>
// CHECKPARALLEL: %[[dim5:.*]] = dim %[[arg1]], %[[c1]] : memref<?x?xf32>
// CHECKPARALLEL: %[[aff1:.*]] = affine.apply #[[$convMap]](%{{.*}}, %{{.*}})[%[[dim4]]]
@@ -1089,18 +1105,24 @@ func @conv3d(%in : memref<?x?x?xf32>, %filter : memref<?x?x?xf32>, %out : memre
// CHECKLOOP: %[[c0:.*]] = constant 0 : index
// CHECKLOOP: %[[c1:.*]] = constant 1 : index
// CHECKLOOP: %[[c2:.*]] = constant 2 : index
-// CHECKLOOP: %[[dim0:.*]] = dim %[[arg1]], %[[c0]] : memref<?x?x?xf32>
-// CHECKLOOP: %[[dim1:.*]] = dim %[[arg1]], %[[c1]] : memref<?x?x?xf32>
-// CHECKLOOP: %[[dim2:.*]] = dim %[[arg1]], %[[c2]] : memref<?x?x?xf32>
-// CHECKLOOP: %[[dim3:.*]] = dim %[[arg2]], %[[c0]] : memref<?x?x?xf32>
-// CHECKLOOP: %[[dim4:.*]] = dim %[[arg2]], %[[c1]] : memref<?x?x?xf32>
-// CHECKLOOP: %[[dim5:.*]] = dim %[[arg2]], %[[c2]] : memref<?x?x?xf32>
-// CHECKLOOP: scf.for %[[i0:.*]] = %{{.*}} to %[[dim3]] step %{{.*}} {
-// CHECKLOOP: scf.for %[[i1:.*]] = %{{.*}} to %[[dim4]] step %{{.*}} {
-// CHECKLOOP: scf.for %[[i2:.*]] = %{{.*}} to %[[dim5]] step %{{.*}} {
-// CHECKLOOP: scf.for %[[i3:.*]] = %{{.*}} to %[[dim0]] step %{{.*}} {
-// CHECKLOOP: scf.for %[[i4:.*]] = %{{.*}} to %[[dim1]] step %{{.*}} {
-// CHECKLOOP: scf.for %[[i5:.*]] = %{{.*}} to %[[dim2]] step %{{.*}} {
+// CHECKLOOP: %[[dim0:.*]] = dim %[[arg0]], %[[c0]] : memref<?x?x?xf32>
+// CHECKLOOP: %[[dim1:.*]] = dim %[[arg0]], %[[c1]] : memref<?x?x?xf32>
+// CHECKLOOP: %[[dim2:.*]] = dim %[[arg0]], %[[c2]] : memref<?x?x?xf32>
+// CHECKLOOP: %[[dim3:.*]] = dim %[[arg1]], %[[c0]] : memref<?x?x?xf32>
+// CHECKLOOP: %[[dim4:.*]] = dim %[[arg1]], %[[c1]] : memref<?x?x?xf32>
+// CHECKLOOP: %[[dim5:.*]] = dim %[[arg1]], %[[c2]] : memref<?x?x?xf32>
+// CHECKLOOP: %[[lowerBound1:.*]] = affine.apply #[[$convLowerBound]]()[%[[dim3]]]
+// CHECKLOOP: %[[upperBound1:.*]] = affine.apply #[[$convUpperBound]]()[%[[dim3]], %[[dim0]]]
+// CHECKLOOP: %[[lowerBound2:.*]] = affine.apply #[[$convLowerBound]]()[%[[dim4]]]
+// CHECKLOOP: %[[upperBound2:.*]] = affine.apply #[[$convUpperBound]]()[%[[dim4]], %[[dim1]]]
+// CHECKLOOP: %[[lowerBound3:.*]] = affine.apply #[[$convLowerBound]]()[%[[dim5]]]
+// CHECKLOOP: %[[upperBound3:.*]] = affine.apply #[[$convUpperBound]]()[%[[dim5]], %[[dim2]]]
+// CHECKLOOP: scf.for %[[i0:.*]] = %[[lowerBound1]] to %[[upperBound1]] step %{{.*}} {
+// CHECKLOOP: scf.for %[[i1:.*]] = %[[lowerBound2]] to %[[upperBound2]] step %{{.*}} {
+// CHECKLOOP: scf.for %[[i2:.*]] = %[[lowerBound3]] to %[[upperBound3]] step %{{.*}} {
+// CHECKLOOP: scf.for %[[i3:.*]] = %{{.*}} to %[[dim3]] step %{{.*}} {
+// CHECKLOOP: scf.for %[[i4:.*]] = %{{.*}} to %[[dim4]] step %{{.*}} {
+// CHECKLOOP: scf.for %[[i5:.*]] = %{{.*}} to %[[dim5]] step %{{.*}} {
// CHECKLOOP: %[[dim6:.*]] = dim %[[arg1]], %[[c0]] : memref<?x?x?xf32>
// CHECKLOOP: %[[dim7:.*]] = dim %[[arg1]], %[[c1]] : memref<?x?x?xf32>
// CHECKLOOP: %[[dim8:.*]] = dim %[[arg1]], %[[c2]] : memref<?x?x?xf32>
@@ -1121,13 +1143,19 @@ func @conv3d(%in : memref<?x?x?xf32>, %filter : memref<?x?x?xf32>, %out : memre
// CHECKPARALLEL: %[[c0:.*]] = constant 0 : index
// CHECKPARALLEL: %[[c1:.*]] = constant 1 : index
// CHECKPARALLEL: %[[c2:.*]] = constant 2 : index
-// CHECKPARALLEL: %[[dim0:.*]] = dim %[[arg1]], %[[c0]] : memref<?x?x?xf32>
-// CHECKPARALLEL: %[[dim1:.*]] = dim %[[arg1]], %[[c1]] : memref<?x?x?xf32>
-// CHECKPARALLEL: %[[dim2:.*]] = dim %[[arg1]], %[[c2]] : memref<?x?x?xf32>
-// CHECKPARALLEL: %[[dim3:.*]] = dim %[[arg2]], %[[c0]] : memref<?x?x?xf32>
-// CHECKPARALLEL: %[[dim4:.*]] = dim %[[arg2]], %[[c1]] : memref<?x?x?xf32>
-// CHECKPARALLEL: %[[dim5:.*]] = dim %[[arg2]], %[[c2]] : memref<?x?x?xf32>
-// CHECKPARALLEL: scf.parallel (%[[i0:.*]], %[[i1:.*]], %[[i2:.*]], %[[i3:.*]], %[[i4:.*]], %[[i5:.*]]) = (%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) to (%[[dim3]], %[[dim4]], %[[dim5]], %[[dim0]], %[[dim1]], %[[dim2]]) step ({{.*}}) {
+// CHECKPARALLEL: %[[dim0:.*]] = dim %[[arg0]], %[[c0]] : memref<?x?x?xf32>
+// CHECKPARALLEL: %[[dim1:.*]] = dim %[[arg0]], %[[c1]] : memref<?x?x?xf32>
+// CHECKPARALLEL: %[[dim2:.*]] = dim %[[arg0]], %[[c2]] : memref<?x?x?xf32>
+// CHECKPARALLEL: %[[dim3:.*]] = dim %[[arg1]], %[[c0]] : memref<?x?x?xf32>
+// CHECKPARALLEL: %[[dim4:.*]] = dim %[[arg1]], %[[c1]] : memref<?x?x?xf32>
+// CHECKPARALLEL: %[[dim5:.*]] = dim %[[arg1]], %[[c2]] : memref<?x?x?xf32>
+// CHECKPARALLEL: %[[lowerBound1:.*]] = affine.apply #[[$convLowerBound]]()[%[[dim3]]]
+// CHECKPARALLEL: %[[upperBound1:.*]] = affine.apply #[[$convUpperBound]]()[%[[dim3]], %[[dim0]]]
+// CHECKPARALLEL: %[[lowerBound2:.*]] = affine.apply #[[$convLowerBound]]()[%[[dim4]]]
+// CHECKPARALLEL: %[[upperBound2:.*]] = affine.apply #[[$convUpperBound]]()[%[[dim4]], %[[dim1]]]
+// CHECKPARALLEL: %[[lowerBound3:.*]] = affine.apply #[[$convLowerBound]]()[%[[dim5]]]
+// CHECKPARALLEL: %[[upperBound3:.*]] = affine.apply #[[$convUpperBound]]()[%[[dim5]], %[[dim2]]]
+// CHECKPARALLEL: scf.parallel (%[[i0:.*]], %[[i1:.*]], %[[i2:.*]], %[[i3:.*]], %[[i4:.*]], %[[i5:.*]]) = (%[[lowerBound1]], %[[lowerBound2]], %[[lowerBound3]], %{{.*}}, %{{.*}}, %{{.*}}) to (%[[upperBound1]], %[[upperBound2]], %[[upperBound3]], %[[dim3]], %[[dim4]], %[[dim5]]) step ({{.*}}) {
// CHECKPARALLEL: %[[dim6:.*]] = dim %[[arg1]], %[[c0]] : memref<?x?x?xf32>
// CHECKPARALLEL: %[[dim7:.*]] = dim %[[arg1]], %[[c1]] : memref<?x?x?xf32>
// CHECKPARALLEL: %[[dim8:.*]] = dim %[[arg1]], %[[c2]] : memref<?x?x?xf32>
@@ -1178,22 +1206,30 @@ func @conv4d(%in : memref<?x?x?x?xf32>, %filter : memref<?x?x?x?xf32>, %out : m
// CHECKLOOP: %[[c1:.*]] = constant 1 : index
// CHECKLOOP: %[[c2:.*]] = constant 2 : index
// CHECKLOOP: %[[c3:.*]] = constant 3 : index
-// CHECKLOOP: %[[dim0:.*]] = dim %[[arg1]], %[[c0]] : memref<?x?x?x?xf32>
-// CHECKLOOP: %[[dim1:.*]] = dim %[[arg1]], %[[c1]] : memref<?x?x?x?xf32>
-// CHECKLOOP: %[[dim2:.*]] = dim %[[arg1]], %[[c2]] : memref<?x?x?x?xf32>
-// CHECKLOOP: %[[dim3:.*]] = dim %[[arg1]], %[[c3]] : memref<?x?x?x?xf32>
-// CHECKLOOP: %[[dim4:.*]] = dim %[[arg2]], %[[c0]] : memref<?x?x?x?xf32>
-// CHECKLOOP: %[[dim5:.*]] = dim %[[arg2]], %[[c1]] : memref<?x?x?x?xf32>
-// CHECKLOOP: %[[dim6:.*]] = dim %[[arg2]], %[[c2]] : memref<?x?x?x?xf32>
-// CHECKLOOP: %[[dim7:.*]] = dim %[[arg2]], %[[c3]] : memref<?x?x?x?xf32>
-// CHECKLOOP: scf.for %[[i0:.*]] = %{{.*}} to %[[dim4]] step %{{.*}} {
-// CHECKLOOP: scf.for %[[i1:.*]] = %{{.*}} to %[[dim5]] step %{{.*}} {
-// CHECKLOOP: scf.for %[[i2:.*]] = %{{.*}} to %[[dim6]] step %{{.*}} {
-// CHECKLOOP: scf.for %[[i3:.*]] = %{{.*}} to %[[dim7]] step %{{.*}} {
-// CHECKLOOP: scf.for %[[i4:.*]] = %{{.*}} to %[[dim0]] step %{{.*}} {
-// CHECKLOOP: scf.for %[[i5:.*]] = %{{.*}} to %[[dim1]] step %{{.*}} {
-// CHECKLOOP: scf.for %[[i6:.*]] = %{{.*}} to %[[dim2]] step %{{.*}} {
-// CHECKLOOP: scf.for %[[i7:.*]] = %{{.*}} to %[[dim3]] step %{{.*}} {
+// CHECKLOOP: %[[dim0:.*]] = dim %[[arg0]], %[[c0]] : memref<?x?x?x?xf32>
+// CHECKLOOP: %[[dim1:.*]] = dim %[[arg0]], %[[c1]] : memref<?x?x?x?xf32>
+// CHECKLOOP: %[[dim2:.*]] = dim %[[arg0]], %[[c2]] : memref<?x?x?x?xf32>
+// CHECKLOOP: %[[dim3:.*]] = dim %[[arg0]], %[[c3]] : memref<?x?x?x?xf32>
+// CHECKLOOP: %[[dim4:.*]] = dim %[[arg1]], %[[c0]] : memref<?x?x?x?xf32>
+// CHECKLOOP: %[[dim5:.*]] = dim %[[arg1]], %[[c1]] : memref<?x?x?x?xf32>
+// CHECKLOOP: %[[dim6:.*]] = dim %[[arg1]], %[[c2]] : memref<?x?x?x?xf32>
+// CHECKLOOP: %[[dim7:.*]] = dim %[[arg1]], %[[c3]] : memref<?x?x?x?xf32>
+// CHECKLOOP: %[[lowerBound1:.*]] = affine.apply #[[$convLowerBound]]()[%[[dim4]]]
+// CHECKLOOP: %[[upperBound1:.*]] = affine.apply #[[$convUpperBound]]()[%[[dim4]], %[[dim0]]]
+// CHECKLOOP: %[[lowerBound2:.*]] = affine.apply #[[$convLowerBound]]()[%[[dim5]]]
+// CHECKLOOP: %[[upperBound2:.*]] = affine.apply #[[$convUpperBound]]()[%[[dim5]], %[[dim1]]]
+// CHECKLOOP: %[[lowerBound3:.*]] = affine.apply #[[$convLowerBound]]()[%[[dim6]]]
+// CHECKLOOP: %[[upperBound3:.*]] = affine.apply #[[$convUpperBound]]()[%[[dim6]], %[[dim2]]]
+// CHECKLOOP: %[[lowerBound4:.*]] = affine.apply #[[$convLowerBound]]()[%[[dim7]]]
+// CHECKLOOP: %[[upperBound4:.*]] = affine.apply #[[$convUpperBound]]()[%[[dim7]], %[[dim3]]]
+// CHECKLOOP: scf.for %[[i0:.*]] = %[[lowerBound1]] to %[[upperBound1]] step %{{.*}} {
+// CHECKLOOP: scf.for %[[i1:.*]] = %[[lowerBound2]] to %[[upperBound2]] step %{{.*}} {
+// CHECKLOOP: scf.for %[[i2:.*]] = %[[lowerBound3]] to %[[upperBound3]] step %{{.*}} {
+// CHECKLOOP: scf.for %[[i3:.*]] = %[[lowerBound4]] to %[[upperBound4]] step %{{.*}} {
+// CHECKLOOP: scf.for %[[i4:.*]] = %{{.*}} to %[[dim4]] step %{{.*}} {
+// CHECKLOOP: scf.for %[[i5:.*]] = %{{.*}} to %[[dim5]] step %{{.*}} {
+// CHECKLOOP: scf.for %[[i6:.*]] = %{{.*}} to %[[dim6]] step %{{.*}} {
+// CHECKLOOP: scf.for %[[i7:.*]] = %{{.*}} to %[[dim7]] step %{{.*}} {
// CHECKLOOP: %[[dim8:.*]] = dim %[[arg1]], %[[c0]] : memref<?x?x?x?xf32>
// CHECKLOOP: %[[dim9:.*]] = dim %[[arg1]], %[[c1]] : memref<?x?x?x?xf32>
// CHECKLOOP: %[[dim10:.*]] = dim %[[arg1]], %[[c2]] : memref<?x?x?x?xf32>
@@ -1217,15 +1253,23 @@ func @conv4d(%in : memref<?x?x?x?xf32>, %filter : memref<?x?x?x?xf32>, %out : m
// CHECKPARALLEL: %[[c1:.*]] = constant 1 : index
// CHECKPARALLEL: %[[c2:.*]] = constant 2 : index
// CHECKPARALLEL: %[[c3:.*]] = constant 3 : index
-// CHECKPARALLEL: %[[dim0:.*]] = dim %[[arg1]], %[[c0]] : memref<?x?x?x?xf32>
-// CHECKPARALLEL: %[[dim1:.*]] = dim %[[arg1]], %[[c1]] : memref<?x?x?x?xf32>
-// CHECKPARALLEL: %[[dim2:.*]] = dim %[[arg1]], %[[c2]] : memref<?x?x?x?xf32>
-// CHECKPARALLEL: %[[dim3:.*]] = dim %[[arg1]], %[[c3]] : memref<?x?x?x?xf32>
-// CHECKPARALLEL: %[[dim4:.*]] = dim %[[arg2]], %[[c0]] : memref<?x?x?x?xf32>
-// CHECKPARALLEL: %[[dim5:.*]] = dim %[[arg2]], %[[c1]] : memref<?x?x?x?xf32>
-// CHECKPARALLEL: %[[dim6:.*]] = dim %[[arg2]], %[[c2]] : memref<?x?x?x?xf32>
-// CHECKPARALLEL: %[[dim7:.*]] = dim %[[arg2]], %[[c3]] : memref<?x?x?x?xf32>
-// CHECKPARALLEL: scf.parallel (%[[i0:.*]], %[[i1:.*]], %[[i2:.*]], %[[i3:.*]], %[[i4:.*]], %[[i5:.*]], %[[i6:.*]], %[[i7:.*]]) = (%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) to (%[[dim4]], %[[dim5]], %[[dim6]], %[[dim7]], %[[dim0]], %[[dim1]], %[[dim2]], %[[dim3]]) step ({{.*}}) {
+// CHECKPARALLEL: %[[dim0:.*]] = dim %[[arg0]], %[[c0]] : memref<?x?x?x?xf32>
+// CHECKPARALLEL: %[[dim1:.*]] = dim %[[arg0]], %[[c1]] : memref<?x?x?x?xf32>
+// CHECKPARALLEL: %[[dim2:.*]] = dim %[[arg0]], %[[c2]] : memref<?x?x?x?xf32>
+// CHECKPARALLEL: %[[dim3:.*]] = dim %[[arg0]], %[[c3]] : memref<?x?x?x?xf32>
+// CHECKPARALLEL: %[[dim4:.*]] = dim %[[arg1]], %[[c0]] : memref<?x?x?x?xf32>
+// CHECKPARALLEL: %[[dim5:.*]] = dim %[[arg1]], %[[c1]] : memref<?x?x?x?xf32>
+// CHECKPARALLEL: %[[dim6:.*]] = dim %[[arg1]], %[[c2]] : memref<?x?x?x?xf32>
+// CHECKPARALLEL: %[[dim7:.*]] = dim %[[arg1]], %[[c3]] : memref<?x?x?x?xf32>
+// CHECKPARALLEL: %[[lowerBound1:.*]] = affine.apply #[[$convLowerBound]]()[%[[dim4]]]
+// CHECKPARALLEL: %[[upperBound1:.*]] = affine.apply #[[$convUpperBound]]()[%[[dim4]], %[[dim0]]]
+// CHECKPARALLEL: %[[lowerBound2:.*]] = affine.apply #[[$convLowerBound]]()[%[[dim5]]]
+// CHECKPARALLEL: %[[upperBound2:.*]] = affine.apply #[[$convUpperBound]]()[%[[dim5]], %[[dim1]]]
+// CHECKPARALLEL: %[[lowerBound3:.*]] = affine.apply #[[$convLowerBound]]()[%[[dim6]]]
+// CHECKPARALLEL: %[[upperBound3:.*]] = affine.apply #[[$convUpperBound]]()[%[[dim6]], %[[dim2]]]
+// CHECKPARALLEL: %[[lowerBound4:.*]] = affine.apply #[[$convLowerBound]]()[%[[dim7]]]
+// CHECKPARALLEL: %[[upperBound4:.*]] = affine.apply #[[$convUpperBound]]()[%[[dim7]], %[[dim3]]]
+// CHECKPARALLEL: scf.parallel (%[[i0:.*]], %[[i1:.*]], %[[i2:.*]], %[[i3:.*]], %[[i4:.*]], %[[i5:.*]], %[[i6:.*]], %[[i7:.*]]) = (%[[lowerBound1]], %[[lowerBound2]], %[[lowerBound3]], %[[lowerBound4]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) to (%[[upperBound1]], %[[upperBound2]], %[[upperBound3]], %[[upperBound4]], %[[dim4]], %[[dim5]], %[[dim6]], %[[dim7]]) step ({{.*}}) {
// CHECKPARALLEL: %[[dim8:.*]] = dim %[[arg1]], %[[c0]] : memref<?x?x?x?xf32>
// CHECKPARALLEL: %[[dim9:.*]] = dim %[[arg1]], %[[c1]] : memref<?x?x?x?xf32>
// CHECKPARALLEL: %[[dim10:.*]] = dim %[[arg1]], %[[c2]] : memref<?x?x?x?xf32>
More information about the Mlir-commits
mailing list