[Mlir-commits] [mlir] computeSliceParameters: use full slice if affine exprs are non-monotonic (PR #118502)

Tue Dec 3 07:49:01 PST 2024

https://github.com/mgehre-amd created https://github.com/llvm/llvm-project/pull/118502

`computeSliceParameters` maps the loop range (given by lower bounds `lbs` and upper bounds `ubs`) into the operand tensor to compute the input slices that contains all indexed values.

The computation currently assumes that the affine expressions to map from loop iteration domain to tensor domain are monotonically increasing, and uses that information by just evaluating those at `lbs` and `ubs` to obtain the slice of the operand tensor. For non-monotonic expressions, the maximum and minimum values might not be at the boundaries. Here, we detect that case, and then use the full slice to be safe.

Fixes #111830 and is related to #113021.

If shapes are static, we could be cleverer: sometimes we can prove that even though the affine expression is not monotonic in itself, it is monotonic on the range `(lbs, ubs)`. For example when the affine expression contains a `d0 mod 8`, and the range is `(0, 4)`. This can be a follow up PR.

>From fd8b15dfdc51fa66950e1a6b594854bac458c8c6 Mon Sep 17 00:00:00 2001
From: Matthias Gehre <matthias.gehre at amd.com>
Date: Tue, 3 Dec 2024 13:31:31 +0100
Subject: [PATCH] computeSliceParameters: use full slice if affine exprs are
 non-monotonic

`computeSliceParameters` maps the loop range (given by lower bounds `lbs` and upper bounds `ubs`) into the operand tensor to compute the input slices that contains all indexed values.

The computation currently assumes that the affine expressions to map from loop iteration domain to tensor domain are monotonically increasing, and use that information by just evaluating those at `lbs` and `ubs` to obtain the slice of the operand tensor.
For non-monotonic expressions, the maximum and minimum values might not be at the boundaries.
Here, we detect that case, and then use the full slice to be safe.

Fixes #111830 and is related to #113021.

If shapes are static, we could be cleverer: sometimes we can prove that even though the affine expression is not monotonic in itself, it is monotonic on the range `(lbs, ubs)`. For example when the affine expression contains a `d0 mod 8`, and the range is `(0, 4)`. This can be a follow up PR.
---
 mlir/include/mlir/IR/AffineExpr.h          |  5 +++
 mlir/include/mlir/IR/AffineMap.h           |  4 +++
 mlir/lib/Dialect/Linalg/Utils/Utils.cpp    |  7 +++--
 mlir/lib/IR/AffineExpr.cpp                 | 36 ++++++++++++++++++++++
 mlir/lib/IR/AffineMap.cpp                  |  5 +++
 mlir/test/Dialect/Linalg/tile-tensors.mlir | 32 +++++++++++++++++++
 6 files changed, 87 insertions(+), 2 deletions(-)

diff --git a/mlir/include/mlir/IR/AffineExpr.h b/mlir/include/mlir/IR/AffineExpr.h
index a93e74b449ceed..28d00f1299f2f9 100644
--- a/mlir/include/mlir/IR/AffineExpr.h
+++ b/mlir/include/mlir/IR/AffineExpr.h
@@ -110,6 +110,11 @@ class AffineExpr {
   /// floordiv, ceildiv, and mod is only allowed w.r.t constants.
   bool isPureAffine() const;
 
+  /// Returns true if this expression is monotonicically increasing with respect
+  /// to the AffineDimExprs, i.e. increasing the value of any AffineDimExpr will
+  /// never decrease the value of the result.
+  bool isMonotonicallyIncreasing() const;
+
   /// Returns the greatest known integral divisor of this affine expression. The
   /// result is always positive.
   int64_t getLargestKnownDivisor() const;
diff --git a/mlir/include/mlir/IR/AffineMap.h b/mlir/include/mlir/IR/AffineMap.h
index 4bc40a7d4091a9..6a4d9f690f8bb7 100644
--- a/mlir/include/mlir/IR/AffineMap.h
+++ b/mlir/include/mlir/IR/AffineMap.h
@@ -382,6 +382,10 @@ class AffineMap {
   /// Returns true if the AffineMap represents a symbol-less permutation map.
   bool isPermutation() const;
 
+  // Returns true if every result is monotonically increasing.
+  // See AffineExpr::isMonotonicallyIncreasing().
+  bool isComponentWiseMonotonicallyIncreasing() const;
+
   /// Returns the map consisting of the `resultPos` subset.
   AffineMap getSubMap(ArrayRef<unsigned> resultPos) const;
 
diff --git a/mlir/lib/Dialect/Linalg/Utils/Utils.cpp b/mlir/lib/Dialect/Linalg/Utils/Utils.cpp
index 38e427af1c4846..8712aadcfad62e 100644
--- a/mlir/lib/Dialect/Linalg/Utils/Utils.cpp
+++ b/mlir/lib/Dialect/Linalg/Utils/Utils.cpp
@@ -581,7 +581,11 @@ computeSliceParameters(OpBuilder &builder, Location loc, Value valueToTile,
   sliceParams.strides.reserve(rank);
   for (unsigned r = 0; r < rank; ++r) {
     LLVM_DEBUG(llvm::dbgs() << "computeSliceParameters: for dim#" << r);
-    if (!isTiled(map.getSubMap({r}), tileSizes)) {
+    auto m = map.getSubMap({r});
+    // The offset & size computation below only handles the case when
+    // the map is monotonically increasing, i.e. the min and max values are
+    // attained at the lower and upper bounds of the iteration domain.
+    if (!isTiled(m, tileSizes) || !m.isComponentWiseMonotonicallyIncreasing()) {
       sliceParams.offsets.push_back(builder.getIndexAttr(0));
       OpFoldResult dim = createFoldedDimOp(builder, loc, valueToTile, r);
       sliceParams.sizes.push_back(dim);
@@ -593,7 +597,6 @@ computeSliceParameters(OpBuilder &builder, Location loc, Value valueToTile,
 
     // Tiling creates a new slice at the proper index, the slice step is 1
     // (i.e. the op does not subsample, stepping occurs in the loop).
-    auto m = map.getSubMap({r});
     LLVM_DEBUG(llvm::dbgs() << "computeSliceParameters: submap: " << m << "\n");
     IRRewriter rewriter(builder);
     OpFoldResult offset = makeComposedFoldedAffineApply(rewriter, loc, m, lbs);
diff --git a/mlir/lib/IR/AffineExpr.cpp b/mlir/lib/IR/AffineExpr.cpp
index 2291d64c50a560..d1ec15048758cb 100644
--- a/mlir/lib/IR/AffineExpr.cpp
+++ b/mlir/lib/IR/AffineExpr.cpp
@@ -239,6 +239,42 @@ bool AffineExpr::isPureAffine() const {
   llvm_unreachable("Unknown AffineExpr");
 }
 
+static bool isNonNegativeConstant(AffineExpr expr) {
+  auto constant = dyn_cast<AffineConstantExpr>(expr);
+  return constant && constant.getValue() >= 0;
+}
+
+bool AffineExpr::isMonotonicallyIncreasing() const {
+  switch (getKind()) {
+  case AffineExprKind::SymbolId:
+  case AffineExprKind::DimId:
+  case AffineExprKind::Constant:
+    return true;
+  case AffineExprKind::Add: {
+    auto op = llvm::cast<AffineBinaryOpExpr>(*this);
+    return op.getLHS().isMonotonicallyIncreasing() &&
+           op.getRHS().isMonotonicallyIncreasing();
+  }
+  case AffineExprKind::Mul: {
+    // One operand must be a non-negative constant.
+    auto op = llvm::cast<AffineBinaryOpExpr>(*this);
+    return op.getLHS().isMonotonicallyIncreasing() &&
+           op.getRHS().isMonotonicallyIncreasing() &&
+           (isNonNegativeConstant(op.getLHS()) ||
+            isNonNegativeConstant(op.getRHS()));
+  }
+  case AffineExprKind::FloorDiv:
+  case AffineExprKind::CeilDiv: {
+    auto op = llvm::cast<AffineBinaryOpExpr>(*this);
+    return op.getLHS().isMonotonicallyIncreasing() &&
+           isNonNegativeConstant(op.getRHS());
+  }
+  case AffineExprKind::Mod:
+    return false;
+  }
+  llvm_unreachable("Unknown AffineExpr");
+}
+
 // Returns the greatest known integral divisor of this affine expression.
 int64_t AffineExpr::getLargestKnownDivisor() const {
   AffineBinaryOpExpr binExpr(nullptr);
diff --git a/mlir/lib/IR/AffineMap.cpp b/mlir/lib/IR/AffineMap.cpp
index 719a81ec057f9f..5174e038461630 100644
--- a/mlir/lib/IR/AffineMap.cpp
+++ b/mlir/lib/IR/AffineMap.cpp
@@ -651,6 +651,11 @@ bool AffineMap::isPermutation() const {
   return isProjectedPermutation();
 }
 
+bool AffineMap::isComponentWiseMonotonicallyIncreasing() const {
+  return all_of(getResults(),
+                [](auto expr) { return expr.isMonotonicallyIncreasing(); });
+}
+
 AffineMap AffineMap::getSubMap(ArrayRef<unsigned> resultPos) const {
   SmallVector<AffineExpr, 4> exprs;
   exprs.reserve(resultPos.size());
diff --git a/mlir/test/Dialect/Linalg/tile-tensors.mlir b/mlir/test/Dialect/Linalg/tile-tensors.mlir
index 8f13c690704572..cacca8a637ab77 100644
--- a/mlir/test/Dialect/Linalg/tile-tensors.mlir
+++ b/mlir/test/Dialect/Linalg/tile-tensors.mlir
@@ -167,3 +167,35 @@ module attributes {transform.with_named_sequence} {
     transform.yield
   }
 }
+
+// -----
+
+// CHECK-LABEL: func @non_monotonic_affine_expr
+//  CHECK-SAME:   %[[ARG0:[a-zA-Z0-9_]+]]: tensor<?xf32>
+func.func @non_monotonic_affine_expr(%arg0 : tensor<?xf32>) -> tensor<?xf32> {
+  %c0 = arith.constant 0 : index
+  %0 = tensor.dim %arg0, %c0 : tensor<?xf32>
+  %empty = tensor.empty(%0) : tensor<?xf32>
+
+  // CHECK: scf.for
+  // CHECK: %[[SIZE:[a-zA-Z0-9_]+]] = tensor.dim %[[ARG0]],
+  // CHECK: tensor.extract_slice %[[ARG0]][0] [%[[SIZE]]] [1] : tensor<?xf32> to tensor<?xf32>
+  %generic = linalg.generic
+    {indexing_maps = [affine_map<(d0) -> (d0 mod 3)>,
+                      affine_map<(d0) -> (d0)>],
+     iterator_types = ["parallel"]}
+    ins(%arg0: tensor<?xf32>)
+    outs(%empty : tensor<?xf32>) {
+    ^bb0(%in : f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<?xf32>
+  return %generic : tensor<?xf32>
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    %1, %loop = transform.structured.tile_using_for %0 tile_sizes [100] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+    transform.yield
+  }
+}