[Mlir-commits] [mlir] 2108ed0 - [MLIR] Fix affine.for unroll for multi-result upper bound maps

Fri Dec 3 17:51:13 PST 2021

Author: Uday Bondhugula
Date: 2021-12-04T07:20:26+05:30
New Revision: 2108ed0671c0581efed8de929c862dbbc3e31c24

URL: https://github.com/llvm/llvm-project/commit/2108ed0671c0581efed8de929c862dbbc3e31c24
DIFF: https://github.com/llvm/llvm-project/commit/2108ed0671c0581efed8de929c862dbbc3e31c24.diff

LOG: [MLIR] Fix affine.for unroll for multi-result upper bound maps

Fix affine.for unroll for multi-result upper bound maps: these can't be
unrolled/unroll-and-jammed in cases where the trip count isn't known to
be a multiple of the unroll factor.

Fix and clean up repeated/unnecessary checks/comments at helper callees.

Also, fix clang-tidy variable naming warnings and redundant includes.

Differential Revision: https://reviews.llvm.org/D114662

Added: 
    

Modified: 
    mlir/lib/Transforms/Utils/LoopUtils.cpp
    mlir/test/Dialect/Affine/unroll-jam.mlir
    mlir/test/Dialect/Affine/unroll.mlir

Removed: 
    


################################################################################
diff  --git a/mlir/lib/Transforms/Utils/LoopUtils.cpp b/mlir/lib/Transforms/Utils/LoopUtils.cpp
index c43ee3f70ea63..78df3d130f4a8 100644

--- a/mlir/lib/Transforms/Utils/LoopUtils.cpp
+++ b/mlir/lib/Transforms/Utils/LoopUtils.cpp
@@ -18,20 +18,15 @@
 #include "mlir/Analysis/Utils.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Affine/IR/AffineValueMap.h"
-#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/SCF/SCF.h"
-#include "mlir/IR/AffineMap.h"
 #include "mlir/IR/BlockAndValueMapping.h"
-#include "mlir/IR/BuiltinOps.h"
 #include "mlir/IR/IntegerSet.h"
 #include "mlir/Support/MathExtras.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 #include "mlir/Transforms/RegionUtils.h"
 #include "mlir/Transforms/Utils.h"
-#include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/MapVector.h"
-#include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
@@ -56,28 +51,21 @@ struct LoopParams {
 /// part of the unrolled loop. Computes the bound as an AffineMap with its
 /// operands or a null map when the trip count can't be expressed as an affine
 /// expression.
-static void getCleanupLoopLowerBound(AffineForOp forOp, unsigned unrollFactor,
-                                     AffineMap &map,
-                                     SmallVectorImpl<Value> &operands) {
-  auto lbMap = forOp.getLowerBoundMap();
-
-  // Single result lower bound map only.
-  if (lbMap.getNumResults() != 1) {
-    map = AffineMap();
-    return;
-  }
-
+static void
+getCleanupLoopLowerBound(AffineForOp forOp, unsigned unrollFactor,
+                         AffineMap &cleanupLbMap,
+                         SmallVectorImpl<Value> &cleanupLbOperands) {
   AffineMap tripCountMap;
   SmallVector<Value, 4> tripCountOperands;
   getTripCountMapAndOperands(forOp, &tripCountMap, &tripCountOperands);
-
-  // Sometimes the trip count cannot be expressed as an affine expression.
+  // Trip count can't be computed.
   if (!tripCountMap) {
-    map = AffineMap();
+    cleanupLbMap = AffineMap();
     return;
   }
 
   OpBuilder b(forOp);
+  auto lbMap = forOp.getLowerBoundMap();
   auto lb = b.create<AffineApplyOp>(forOp.getLoc(), lbMap,
                                     forOp.getLowerBoundOperands());
 
@@ -102,15 +90,15 @@ static void getCleanupLoopLowerBound(AffineForOp forOp, unsigned unrollFactor,
   for (unsigned i = 0, e = bumpExprs.size(); i < e; i++)
     newUbExprs[i] = b.getAffineDimExpr(0) + b.getAffineDimExpr(i + 1);
 
-  operands.clear();
-  operands.push_back(lb);
-  operands.append(bumpValues.begin(), bumpValues.end());
-  map = AffineMap::get(1 + tripCountMap.getNumResults(), 0, newUbExprs,
-                       b.getContext());
-  // Simplify the map + operands.
-  fullyComposeAffineMapAndOperands(&map, &operands);
-  map = simplifyAffineMap(map);
-  canonicalizeMapAndOperands(&map, &operands);
+  cleanupLbOperands.clear();
+  cleanupLbOperands.push_back(lb);
+  cleanupLbOperands.append(bumpValues.begin(), bumpValues.end());
+  cleanupLbMap = AffineMap::get(1 + tripCountMap.getNumResults(), 0, newUbExprs,
+                                b.getContext());
+  // Simplify the cleanupLbMap + cleanupLbOperands.
+  fullyComposeAffineMapAndOperands(&cleanupLbMap, &cleanupLbOperands);
+  cleanupLbMap = simplifyAffineMap(cleanupLbMap);
+  canonicalizeMapAndOperands(&cleanupLbMap, &cleanupLbOperands);
   // Remove any affine.apply's that became dead from the simplification above.
   for (auto v : bumpValues)
     if (v.use_empty())
@@ -192,7 +180,7 @@ LogicalResult mlir::promoteIfSingleIteration(AffineForOp forOp) {
     } else {
       auto lbOperands = forOp.getLowerBoundOperands();
       auto lbMap = forOp.getLowerBoundMap();
-      OpBuilder builder(parentBlock, Block::iterator(forOp));
+      OpBuilder builder(forOp);
       if (lbMap == builder.getDimIdentityMap()) {
         // No need of generating an affine.apply.
         iv.replaceAllUsesWith(lbOperands[0]);
@@ -1132,8 +1120,8 @@ static void generateUnrolledLoop(
 
 /// Helper to generate cleanup loop for unroll or unroll-and-jam when the trip
 /// count is not a multiple of `unrollFactor`.
-static void generateCleanupLoopForUnroll(AffineForOp forOp,
-                                         uint64_t unrollFactor) {
+static LogicalResult generateCleanupLoopForUnroll(AffineForOp forOp,
+                                                  uint64_t unrollFactor) {
   // Insert the cleanup loop right after 'forOp'.
   OpBuilder builder(forOp->getBlock(), std::next(Block::iterator(forOp)));
   auto cleanupForOp = cast<AffineForOp>(builder.clone(*forOp));
@@ -1152,9 +1140,9 @@ static void generateCleanupLoopForUnroll(AffineForOp forOp,
   AffineMap cleanupMap;
   SmallVector<Value, 4> cleanupOperands;
   getCleanupLoopLowerBound(forOp, unrollFactor, cleanupMap, cleanupOperands);
-  assert(cleanupMap &&
-         "cleanup loop lower bound map for single result lower bound maps "
-         "can always be determined");
+  if (!cleanupMap)
+    return failure();
+
   cleanupForOp.setLowerBound(cleanupOperands, cleanupMap);
   // Promote the loop body up if this has turned into a single iteration loop.
   (void)promoteIfSingleIteration(cleanupForOp);
@@ -1162,6 +1150,7 @@ static void generateCleanupLoopForUnroll(AffineForOp forOp,
   // Adjust upper bound of the original loop; this is the same as the lower
   // bound of the cleanup loop.
   forOp.setUpperBound(cleanupOperands, cleanupMap);
+  return success();
 }
 
 /// Unrolls this loop by the specified factor. Returns success if the loop
@@ -1178,14 +1167,6 @@ LogicalResult mlir::loopUnrollByFactor(
   if (llvm::hasSingleElement(forOp.getBody()->getOperations()))
     return success();
 
-  // Loops where the lower bound is a max expression isn't supported for
-  // unrolling since the trip count can be expressed as an affine function when
-  // both the lower bound and the upper bound are multi-result maps. However,
-  // one meaningful way to do such unrolling would be to specialize the loop for
-  // the 'hotspot' case and unroll that hotspot.
-  if (forOp.getLowerBoundMap().getNumResults() != 1)
-    return failure();
-
   // If the trip count is lower than the unroll factor, no unrolled body.
   // TODO: option to specify cleanup loop unrolling.
   Optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
@@ -1194,8 +1175,18 @@ LogicalResult mlir::loopUnrollByFactor(
     return failure();
 
   // Generate the cleanup loop if trip count isn't a multiple of unrollFactor.
-  if (getLargestDivisorOfTripCount(forOp) % unrollFactor != 0)
-    generateCleanupLoopForUnroll(forOp, unrollFactor);
+  if (getLargestDivisorOfTripCount(forOp) % unrollFactor != 0) {
+    // Loops where the lower bound is a max expression or the upper bound is
+    // a min expression and the trip count doesn't divide the unroll factor
+    // can't be unrolled since the lower bound of the cleanup loop in such cases
+    // cannot be expressed as an affine function or a max over affine functions.
+    if (forOp.getLowerBoundMap().getNumResults() != 1 ||
+        forOp.getUpperBoundMap().getNumResults() != 1)
+      return failure();
+    if (failed(generateCleanupLoopForUnroll(forOp, unrollFactor)))
+      assert(false && "cleanup loop lower bound map for single result lower "
+                      "and upper bound maps can always be determined");
+  }
 
   ValueRange iterArgs(forOp.getRegionIterArgs());
   auto yieldedValues = forOp.getBody()->getTerminator()->getOperands();
@@ -1399,15 +1390,6 @@ LogicalResult mlir::loopUnrollJamByFactor(AffineForOp forOp,
   if (llvm::hasSingleElement(forOp.getBody()->getOperations()))
     return success();
 
-  // Loops where both lower and upper bounds are multi-result maps won't be
-  // unrolled (since the trip can't be expressed as an affine function in
-  // general).
-  // TODO: this may not be common, but we could support the case
-  // where the lower bound is a multi-result map and the ub is a single result
-  // one.
-  if (forOp.getLowerBoundMap().getNumResults() != 1)
-    return failure();
-
   Optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
   // If the trip count is lower than the unroll jam factor, no unroll jam.
   if (mayBeConstantTripCount.hasValue() &&
@@ -1440,8 +1422,18 @@ LogicalResult mlir::loopUnrollJamByFactor(AffineForOp forOp,
 
   // Generate the cleanup loop if trip count isn't a multiple of
   // unrollJamFactor.
-  if (getLargestDivisorOfTripCount(forOp) % unrollJamFactor != 0)
-    generateCleanupLoopForUnroll(forOp, unrollJamFactor);
+  if (getLargestDivisorOfTripCount(forOp) % unrollJamFactor != 0) {
+    // Loops where the lower bound is a max expression or the upper bound is
+    // a min expression and the trip count doesn't divide the unroll factor
+    // can't be unrolled since the lower bound of the cleanup loop in such cases
+    // cannot be expressed as an affine function or a max over affine functions.
+    if (forOp.getLowerBoundMap().getNumResults() != 1 ||
+        forOp.getUpperBoundMap().getNumResults() != 1)
+      return failure();
+    if (failed(generateCleanupLoopForUnroll(forOp, unrollJamFactor)))
+      assert(false && "cleanup loop lower bound map for single result lower "
+                      "and upper bound maps can always be determined");
+  }
 
   // `operandMaps[i - 1]` carries old->new operand mapping for the ith unrolled
   // iteration. There are (`unrollJamFactor` - 1) iterations.

diff  --git a/mlir/test/Dialect/Affine/unroll-jam.mlir b/mlir/test/Dialect/Affine/unroll-jam.mlir
index 4f208cda50747..36c2bc483f4e7 100644
--- a/mlir/test/Dialect/Affine/unroll-jam.mlir
+++ b/mlir/test/Dialect/Affine/unroll-jam.mlir
@@ -3,7 +3,6 @@
 
 // CHECK-DAG: [[$MAP_PLUS_1:#map[0-9]+]] = affine_map<(d0) -> (d0 + 1)>
 // CHECK-DAG: [[$MAP_DIV_OFFSET:#map[0-9]+]] = affine_map<()[s0] -> (((s0 - 1) floordiv 2) * 2 + 1)>
-// CHECK-DAG: [[$MAP_MULTI_RES:#map[0-9]+]] = affine_map<()[s0, s1] -> ((s0 floordiv 2) * 2, (s1 floordiv 2) * 2, 1024)>
 // CHECK-DAG: [[$MAP_SYM_UB:#map[0-9]+]] = affine_map<()[s0, s1] -> (s0, s1, 1024)>
 
 // UJAM-FOUR-DAG: [[$UBMAP:#map[0-9]+]] = affine_map<()[s0] -> (s0 + 8)>
@@ -104,21 +103,16 @@ func @loop_nest_unknown_count_2(%N : index) {
 func @loop_nest_symbolic_and_min_upper_bound(%M : index, %N : index, %K : index) {
   affine.for %i = 0 to min affine_map<()[s0, s1] -> (s0, s1, 1024)>()[%M, %N] {
     affine.for %j = 0 to %K {
-      "foo"(%i, %j) : (index, index) -> ()
+      "test.foo"(%i, %j) : (index, index) -> ()
     }
   }
   return
 }
-// CHECK-NEXT:  affine.for [[IV0:%arg[0-9]+]] = 0 to min [[$MAP_MULTI_RES]]()[%[[M]], %[[N]]] step 2 {
+// No unroll-and-jam possible here as the lower bound for the cleanup loop won't
+// be representable.
+// CHECK-NEXT:  affine.for [[IV0:%arg[0-9]+]] = 0 to min #map{{.*}}()[%[[M]], %[[N]]] {
 // CHECK-NEXT:    affine.for [[IV1:%arg[0-9]+]] = 0 to %[[K]] {
-// CHECK-NEXT:      "foo"([[IV0]], [[IV1]])
-// CHECK-NEXT:      [[RES:%[0-9]+]] = affine.apply [[$MAP_PLUS_1]]([[IV0]])
-// CHECK-NEXT:      "foo"([[RES]], [[IV1]])
-// CHECK-NEXT:    }
-// CHECK-NEXT:  }
-// CHECK-NEXT:  affine.for [[IV0]] = max [[$MAP_MULTI_RES]]()[%[[M]], %[[N]]] to min [[$MAP_SYM_UB]]()[%[[M]], %[[N]]] {
-// CHECK-NEXT:    affine.for [[IV1]] = 0 to %[[K]] {
-// CHECK-NEXT:      "foo"([[IV0]], [[IV1]])
+// CHECK-NEXT:      "test.foo"([[IV0]], [[IV1]])
 // CHECK-NEXT:    }
 // CHECK-NEXT:  }
 // CHECK-NEXT:  return

diff  --git a/mlir/test/Dialect/Affine/unroll.mlir b/mlir/test/Dialect/Affine/unroll.mlir
index fe7b8e4c10add..336e3313abab8 100644
--- a/mlir/test/Dialect/Affine/unroll.mlir
+++ b/mlir/test/Dialect/Affine/unroll.mlir
@@ -21,7 +21,6 @@
 // UNROLL-BY-4-DAG: [[$MAP5:#map[0-9]+]] = affine_map<(d0)[s0] -> (d0 + s0 + 1)>
 // UNROLL-BY-4-DAG: [[$MAP6:#map[0-9]+]] = affine_map<(d0, d1) -> (d0 * 16 + d1)>
 // UNROLL-BY-4-DAG: [[$MAP11:#map[0-9]+]] = affine_map<(d0) -> (d0)>
-// UNROLL-BY-4-DAG: [[$MAP_TRIP_COUNT_MULTIPLE_FOUR:#map[0-9]+]] = affine_map<()[s0, s1, s2] -> (s0 + ((-s0 + s1) floordiv 4) * 4, s0 + ((-s0 + s2) floordiv 4) * 4, s0 + ((-s0) floordiv 4) * 4 + 1024)>
 
 // UNROLL-FULL-LABEL: func @loop_nest_simplest() {
 func @loop_nest_simplest() {
@@ -558,20 +557,15 @@ func @loop_nest_symbolic_bound_with_step(%N : index) {
 // UNROLL-BY-4-LABEL: func @loop_nest_symbolic_and_min_upper_bound
 func @loop_nest_symbolic_and_min_upper_bound(%M : index, %N : index, %K : index) {
   affine.for %i = %M to min affine_map<()[s0, s1] -> (s0, s1, 1024)>()[%N, %K] {
-    "foo"() : () -> ()
+    "test.foo"() : () -> ()
   }
   return
 }
-// CHECK-NEXT:  affine.for %arg0 = %arg0 to min [[$MAP_TRIP_COUNT_MULTIPLE_FOUR]]()[%arg0, %arg1, %arg2] step 4 {
-// CHECK-NEXT:    "foo"() : () -> ()
-// CHECK-NEXT:    "foo"() : () -> ()
-// CHECK-NEXT:    "foo"() : () -> ()
-// CHECK-NEXT:    "foo"() : () -> ()
-// CHECK-NEXT:  }
-// CHECK-NEXT:  affine.for %arg1 = max [[$MAP_TRIP_COUNT_MULTIPLE_FOUR]]()[%arg0, %arg1, %arg2] to min #map28()[%arg1, %arg2] {
-// CHECK-NEXT:    "foo"() : () -> ()
-// CHECK-NEXT:  }
-// CHECK-NEXT:  return
+// No unrolling here.
+// UNROLL-BY-4:      affine.for %{{.*}} = %{{.*}} to min #map{{.*}}()[%{{.*}}, %{{.*}}] {
+// UNROLL-BY-4-NEXT:   "test.foo"() : () -> ()
+// UNROLL-BY-4-NEXT: }
+// UNROLL-BY-4-NEXT: return
 
 // The trip count here is a multiple of four, but this can be inferred only
 // through composition. Check for no cleanup scf.
@@ -588,6 +582,28 @@ func @loop_nest_non_trivial_multiple_upper_bound(%M : index, %N : index) {
 // UNROLL-BY-4-NOT: for
 // UNROLL-BY-4: return
 
+// UNROLL-BY-4-LABEL: func @multi_upper_bound
+func @multi_upper_bound(%arg0: index) {
+  affine.for %i = 0 to min affine_map<()[s0] -> (8 * s0, 12 * s0)>()[%arg0] {
+    "test.foo"() : () -> ()
+  }
+  // No unrolling possible here.
+  // UNROLL-BY-4: affine.for %{{.*}} = 0 to min #map{{.*}}()[%{{.*}}]
+  return
+}
+
+// UNROLL-BY-4-LABEL: func @multi_lower_bound
+func @multi_lower_bound(%arg0: index) {
+  affine.for %i = max affine_map<()[s0] -> (8 * s0, 12 * s0)>()[%arg0] to 100 {
+    "test.foo"() : () -> ()
+  }
+  // TODO: Extend getTripCountMapAndOperands to handle multi-result lower bound
+  // maps.
+  // UNROLL-BY-4: affine.for %{{.*}} = max #map{{.*}}()[%{{.*}}] to 100
+  // UNROLL-BY-4-NOT: affine.for
+  return
+}
+
 // UNROLL-BY-4-LABEL: func @loop_nest_non_trivial_multiple_upper_bound_alt
 func @loop_nest_non_trivial_multiple_upper_bound_alt(%M : index, %N : index) {
   %K = affine.apply affine_map<(d0) -> (4*d0)> (%M)