[Mlir-commits] [mlir] [MLIR][Affine] Fix/complete access index invariance, add isInvariantAccess (PR #84602)

Fri Mar 8 20:42:32 PST 2024

https://github.com/bondhugula updated https://github.com/llvm/llvm-project/pull/84602

>From 2bdd36fe05ca6f09726b4b725478d8bf5a37ba64 Mon Sep 17 00:00:00 2001
From: Uday Bondhugula <uday at polymagelabs.com>
Date: Sat, 9 Mar 2024 08:52:03 +0530
Subject: [PATCH] [MLIR][Affine] Fix/complete access index invariance, add
 isInvariantAccess

isAccessIndexInvariant had outdated code and didn't handle IR with multiple
affine.apply ops, which is inconvenient when it is used as a utility. This is
addressed by switching to the proper API on AffineValueMap. Add
mlir::affine::isInvariantAccess, which is exposed for outside use and tested
via the test pass. Add a method on AffineValueMap,
composeSimplifyAndCanonicalize, which both invariance checks rely on. Add test
cases to exercise simplification and composition for invariant access
analysis.

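A TODO/FIXME has been added to the tiled test case in access-analysis.mlir: an
access is reported as invariant along tile-space IVs even when the intra-tile
IVs' bounds depend on them. This issue existed before this change.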
---
 .../Dialect/Affine/Analysis/LoopAnalysis.h    |  5 ++
 .../mlir/Dialect/Affine/IR/AffineValueMap.h   |  5 ++
 .../Dialect/Affine/Analysis/LoopAnalysis.cpp  | 59 ++++++++-----------
 mlir/lib/Dialect/Affine/IR/AffineValueMap.cpp |  9 +++
 mlir/test/Dialect/Affine/access-analysis.mlir | 33 ++++++++++-
 .../lib/Dialect/Affine/TestAccessAnalysis.cpp | 19 ++++--
 6 files changed, 88 insertions(+), 42 deletions(-)

diff --git a/mlir/include/mlir/Dialect/Affine/Analysis/LoopAnalysis.h b/mlir/include/mlir/Dialect/Affine/Analysis/LoopAnalysis.h
index 1f64b57cac5782..7b92b930fb5f57 100644
--- a/mlir/include/mlir/Dialect/Affine/Analysis/LoopAnalysis.h
+++ b/mlir/include/mlir/Dialect/Affine/Analysis/LoopAnalysis.h
@@ -48,6 +48,11 @@ std::optional<uint64_t> getConstantTripCount(AffineForOp forOp);
 /// this method is thus able to determine non-trivial divisors.
 uint64_t getLargestDivisorOfTripCount(AffineForOp forOp);
 
+/// Checks if an affine read or write operation is independent of `forOp`'s IV,
+/// i.e., if the memory access is invariant on `forOp`.
+template <typename LoadOrStoreOp>
+bool isInvariantAccess(LoadOrStoreOp memOp, AffineForOp forOp);
+
 /// Given an induction variable `iv` of type AffineForOp and `indices` of type
 /// IndexType, returns the set of `indices` that are independent of `iv`.
 ///
diff --git a/mlir/include/mlir/Dialect/Affine/IR/AffineValueMap.h b/mlir/include/mlir/Dialect/Affine/IR/AffineValueMap.h
index 8439930a87467c..7ad0e4a1e5ea04 100644
--- a/mlir/include/mlir/Dialect/Affine/IR/AffineValueMap.h
+++ b/mlir/include/mlir/Dialect/Affine/IR/AffineValueMap.h
@@ -44,6 +44,11 @@ class AffineValueMap {
   // Resets this AffineValueMap with 'map', 'operands', and 'results'.
   void reset(AffineMap map, ValueRange operands, ValueRange results = {});
 
+  /// Composes all incoming affine.apply ops and then simplifies and
+  /// canonicalizes the map and operands. This can change the number of
+  /// operands, but the result count remains the same.
+  void composeSimplifyAndCanonicalize();
+
   /// Return the value map that is the difference of value maps 'a' and 'b',
   /// represented as an affine map and its operands. The output map + operands
   /// are canonicalized and simplified.
diff --git a/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp b/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp
index fc0515ba95f4fe..1c28d6b00b3c81 100644
--- a/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp
+++ b/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp
@@ -145,45 +145,36 @@ uint64_t mlir::affine::getLargestDivisorOfTripCount(AffineForOp forOp) {
   return *gcd;
 }
 
-/// Given an induction variable `iv` of type AffineForOp and an access `index`
-/// of type index, returns `true` if `index` is independent of `iv` and
-/// false otherwise. The determination supports composition with at most one
-/// AffineApplyOp. The 'at most one AffineApplyOp' comes from the fact that
-/// the composition of AffineApplyOp needs to be canonicalized by construction
-/// to avoid writing code that composes arbitrary numbers of AffineApplyOps
-/// everywhere. To achieve this, at the very least, the compose-affine-apply
-/// pass must have been run.
+/// Given an affine.for `iv` and an access `index` of type index, returns `true`
+/// if `index` is independent of `iv` and false otherwise.
 ///
-/// Prerequisites:
-///   1. `iv` and `index` of the proper type;
-///   2. at most one reachable AffineApplyOp from index;
-///
-/// Returns false in cases with more than one AffineApplyOp, this is
-/// conservative.
+/// Prerequisites: `iv` and `index` of the proper type;
 static bool isAccessIndexInvariant(Value iv, Value index) {
-  assert(isAffineForInductionVar(iv) && "iv must be a AffineForOp");
-  assert(isa<IndexType>(index.getType()) && "index must be of IndexType");
-  SmallVector<Operation *, 4> affineApplyOps;
-  getReachableAffineApplyOps({index}, affineApplyOps);
-
-  if (affineApplyOps.empty()) {
-    // Pointer equality test because of Value pointer semantics.
-    return index != iv;
-  }
-
-  if (affineApplyOps.size() > 1) {
-    affineApplyOps[0]->emitRemark(
-        "CompositionAffineMapsPass must have been run: there should be at most "
-        "one AffineApplyOp, returning false conservatively.");
-    return false;
-  }
+  assert(isAffineForInductionVar(iv) && "iv must be an affine.for iv");
+  assert(isa<IndexType>(index.getType()) && "index must be of 'index' type");
+  auto map = AffineMap::getMultiDimIdentityMap(/*numDims=*/1, iv.getContext());
+  SmallVector<Value> operands = {index};
+  AffineValueMap avm(map, operands);
+  avm.composeSimplifyAndCanonicalize();
+  return !avm.isFunctionOf(0, iv);
+}
 
-  auto composeOp = cast<AffineApplyOp>(affineApplyOps[0]);
-  // We need yet another level of indirection because the `dim` index of the
-  // access may not correspond to the `dim` index of composeOp.
-  return !composeOp.getAffineValueMap().isFunctionOf(0, iv);
+// Pre-requisite: Loop bounds should be in canonical form.
+template <typename LoadOrStoreOp>
+bool mlir::affine::isInvariantAccess(LoadOrStoreOp memOp, AffineForOp forOp) {
+  AffineValueMap avm(memOp.getAffineMap(), memOp.getMapOperands());
+  avm.composeSimplifyAndCanonicalize();
+  return !llvm::is_contained(avm.getOperands(), forOp.getInductionVar());
 }
 
+// Explicitly instantiate the templates so that the compiler knows we need them.
+template bool mlir::affine::isInvariantAccess(AffineReadOpInterface,
+                                              AffineForOp);
+template bool mlir::affine::isInvariantAccess(AffineWriteOpInterface,
+                                              AffineForOp);
+template bool mlir::affine::isInvariantAccess(AffineLoadOp, AffineForOp);
+template bool mlir::affine::isInvariantAccess(AffineStoreOp, AffineForOp);
+
 DenseSet<Value> mlir::affine::getInvariantAccesses(Value iv,
                                                    ArrayRef<Value> indices) {
   DenseSet<Value> res;
diff --git a/mlir/lib/Dialect/Affine/IR/AffineValueMap.cpp b/mlir/lib/Dialect/Affine/IR/AffineValueMap.cpp
index 2800237fd05ac6..6a52849186872e 100644
--- a/mlir/lib/Dialect/Affine/IR/AffineValueMap.cpp
+++ b/mlir/lib/Dialect/Affine/IR/AffineValueMap.cpp
@@ -24,6 +24,15 @@ void AffineValueMap::reset(AffineMap map, ValueRange operands,
   this->results.assign(results.begin(), results.end());
 }
 
+void AffineValueMap::composeSimplifyAndCanonicalize() {
+  AffineMap sMap = getAffineMap();
+  fullyComposeAffineMapAndOperands(&sMap, &operands);
+  // Full composition also canonicalizes and simplifies before returning. We
+  // need to canonicalize once more to drop unused operands.
+  canonicalizeMapAndOperands(&sMap, &operands);
+  this->map.reset(sMap);
+}
+
 void AffineValueMap::difference(const AffineValueMap &a,
                                 const AffineValueMap &b, AffineValueMap *res) {
   assert(a.getNumResults() == b.getNumResults() && "invalid inputs");
diff --git a/mlir/test/Dialect/Affine/access-analysis.mlir b/mlir/test/Dialect/Affine/access-analysis.mlir
index 68310b9323535a..789de646a8f9e2 100644
--- a/mlir/test/Dialect/Affine/access-analysis.mlir
+++ b/mlir/test/Dialect/Affine/access-analysis.mlir
@@ -1,13 +1,14 @@
 // RUN: mlir-opt %s -split-input-file -test-affine-access-analysis -verify-diagnostics | FileCheck %s
 
-// CHECK-LABEL: func @loop_1d
-func.func @loop_1d(%A : memref<?x?xf32>, %B : memref<?x?x?xf32>) {
+// CHECK-LABEL: func @loop_simple
+func.func @loop_simple(%A : memref<?x?xf32>, %B : memref<?x?x?xf32>) {
    %c0 = arith.constant 0 : index
    %M = memref.dim %A, %c0 : memref<?x?xf32>
    affine.for %i = 0 to %M {
      affine.for %j = 0 to %M {
        affine.load %A[%c0, %i] : memref<?x?xf32>
        // expected-remark at above {{contiguous along loop 0}}
+       // expected-remark at above {{invariant along loop 1}}
        affine.load %A[%c0, 8 * %i + %j] : memref<?x?xf32>
        // expected-remark at above {{contiguous along loop 1}}
        // Note/FIXME: access stride isn't being checked.
@@ -15,6 +16,7 @@ func.func @loop_1d(%A : memref<?x?xf32>, %B : memref<?x?x?xf32>) {
 
        // These are all non-contiguous along both loops. Nothing is emitted.
        affine.load %A[%i, %c0] : memref<?x?xf32>
+       // expected-remark at above {{invariant along loop 1}}
        // Note/FIXME: access stride isn't being checked.
        affine.load %A[%i, 8 * %j] : memref<?x?xf32>
        // expected-remark at above {{contiguous along loop 1}}
@@ -27,6 +29,22 @@ func.func @loop_1d(%A : memref<?x?xf32>, %B : memref<?x?x?xf32>) {
 
 // -----
 
+// CHECK-LABEL: func @loop_unsimplified
+func.func @loop_unsimplified(%A : memref<100xf32>) {
+   affine.for %i = 0 to 100 {
+     affine.load %A[2 * %i - %i - %i] : memref<100xf32>
+     // expected-remark at above {{invariant along loop 0}}
+
+     %m = affine.apply affine_map<(d0) -> (-2 * d0)>(%i)
+     %n = affine.apply affine_map<(d0) -> (2 * d0)>(%i)
+     affine.load %A[(%m + %n) floordiv 2] : memref<100xf32>
+     // expected-remark at above {{invariant along loop 0}}
+   }
+   return
+}
+
+// -----
+
 #map = affine_map<(d0) -> (d0 * 16)>
 #map1 = affine_map<(d0) -> (d0 * 16 + 16)>
 #map2 = affine_map<(d0) -> (d0)>
@@ -41,11 +59,19 @@ func.func @tiled(%arg0: memref<*xf32>) {
         %alloc_0 = memref.alloc() : memref<1x16x1x16xf32>
         affine.for %arg4 = #map(%arg1) to #map1(%arg1) {
           affine.for %arg5 = #map(%arg3) to #map1(%arg3) {
+            // TODO: here and below, the access isn't really invariant
+            // along tile-space IVs where the intra-tile IVs' bounds
+            // depend on them.
             %0 = affine.load %cast[%arg4] : memref<64xf32>
             // expected-remark at above {{contiguous along loop 3}}
+            // expected-remark at above {{invariant along loop 0}}
+            // expected-remark at above {{invariant along loop 1}}
+            // expected-remark at above {{invariant along loop 2}}
+            // expected-remark at above {{invariant along loop 4}}
             affine.store %0, %alloc_0[0, %arg1 * -16 + %arg4, 0, %arg3 * -16 + %arg5] : memref<1x16x1x16xf32>
             // expected-remark at above {{contiguous along loop 4}}
             // expected-remark at above {{contiguous along loop 2}}
+            // expected-remark at above {{invariant along loop 1}}
           }
         }
         affine.for %arg4 = #map(%arg1) to #map1(%arg1) {
@@ -56,6 +82,9 @@ func.func @tiled(%arg0: memref<*xf32>) {
               // expected-remark at above {{contiguous along loop 2}}
               affine.store %0, %alloc[0, %arg5, %arg6, %arg4] : memref<1x224x224x64xf32>
               // expected-remark at above {{contiguous along loop 3}}
+              // expected-remark at above {{invariant along loop 0}}
+              // expected-remark at above {{invariant along loop 1}}
+              // expected-remark at above {{invariant along loop 2}}
             }
           }
         }
diff --git a/mlir/test/lib/Dialect/Affine/TestAccessAnalysis.cpp b/mlir/test/lib/Dialect/Affine/TestAccessAnalysis.cpp
index b38046299d504a..751302550092d7 100644
--- a/mlir/test/lib/Dialect/Affine/TestAccessAnalysis.cpp
+++ b/mlir/test/lib/Dialect/Affine/TestAccessAnalysis.cpp
@@ -59,18 +59,25 @@ void TestAccessAnalysis::runOnOperation() {
       enclosingOps.clear();
       getAffineForIVs(*memOp, &enclosingOps);
       for (unsigned d = 0, e = enclosingOps.size(); d < e; d++) {
+        AffineForOp loop = enclosingOps[d];
         int memRefDim;
-        bool isContiguous;
+        bool isContiguous, isInvariant;
         if (auto read = dyn_cast<AffineReadOpInterface>(memOp)) {
-          isContiguous = isContiguousAccess(enclosingOps[d].getInductionVar(),
-                                            read, &memRefDim);
+          isContiguous =
+              isContiguousAccess(loop.getInductionVar(), read, &memRefDim);
+          isInvariant = isInvariantAccess(read, loop);
         } else {
-          isContiguous = isContiguousAccess(enclosingOps[d].getInductionVar(),
-                                            cast<AffineWriteOpInterface>(memOp),
-                                            &memRefDim);
+          auto write = cast<AffineWriteOpInterface>(memOp);
+          isContiguous =
+              isContiguousAccess(loop.getInductionVar(), write, &memRefDim);
+          isInvariant = isInvariantAccess(write, loop);
         }
+        // Check for contiguity for the innermost memref dimension to avoid
+        // emitting too many diagnostics.
         if (isContiguous && memRefDim == 0)
           memOp->emitRemark("contiguous along loop ") << d << '\n';
+        if (isInvariant)
+          memOp->emitRemark("invariant along loop ") << d << '\n';
       }
     }
   }