[llvm] b75530f - [LoopInterchange] Consider forward/backward dependency in vectorize heuristic (#133672)

Fri Jul 25 06:37:24 PDT 2025

Author: Ryotaro Kasuga
Date: 2025-07-25T22:37:20+09:00
New Revision: b75530ff034a131da8ca1f05a00f3655c13839ff

URL: https://github.com/llvm/llvm-project/commit/b75530ff034a131da8ca1f05a00f3655c13839ff
DIFF: https://github.com/llvm/llvm-project/commit/b75530ff034a131da8ca1f05a00f3655c13839ff.diff

LOG: [LoopInterchange] Consider forward/backward dependency in vectorize heuristic (#133672)

The vectorization heuristic of LoopInterchange attempts to move a
vectorizable loop to the innermost position. Before this patch, a loop
was deemed vectorizable if there are no loop-carried dependencies
induced by the loop.
This patch extends the vectorization heuristic by introducing the
concept of forward and backward dependencies, inspired by
LoopAccessAnalysis. Specifically, an additional element is appended to
each direction vector to indicate whether it represents a forward
dependency (`<`) or not (`*`). Among these, only the forward
dependencies (i.e., those whose last element is `<`) affect the
vectorization heuristic. Accordingly, the check is conservative, and
dependencies are considered forward only when this can be proven.
Currently, we only support perfectly nested loops whose body consists of
a single basic block. For other cases, dependencies are pessimistically
treated as non-forward.

Added: 
    

Modified: 
    llvm/lib/Transforms/Scalar/LoopInterchange.cpp
    llvm/test/Transforms/LoopInterchange/profitability-vectorization-heuristic.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
index 70e9eee5339a7..08446ccaa9fca 100644

--- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -17,8 +17,8 @@
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/StringSet.h"
 #include "llvm/Analysis/DependenceAnalysis.h"
 #include "llvm/Analysis/LoopCacheAnalysis.h"
 #include "llvm/Analysis/LoopInfo.h"
@@ -70,6 +70,13 @@ namespace {
 
 using LoopVector = SmallVector<Loop *, 8>;
 
+/// A list of direction vectors. Each entry represents a direction vector
+/// corresponding to one or more dependencies existing in the loop nest. The
+/// length of all direction vectors is equal and is N + 1, where N is the depth
+/// of the loop nest. The first N elements correspond to the dependency
+/// direction of each N loops. The last one indicates whether this entry is
+/// forward dependency ('<') or not ('*'). The term "forward" aligns with what
+/// is defined in LoopAccessAnalysis.
 // TODO: Check if we can use a sparse matrix here.
 using CharMatrix = std::vector<std::vector<char>>;
 
@@ -126,11 +133,33 @@ static bool noDuplicateRulesAndIgnore(ArrayRef<RuleTy> Rules) {
 
 static void printDepMatrix(CharMatrix &DepMatrix) {
   for (auto &Row : DepMatrix) {
-    for (auto D : Row)
+    // Drop the last element because it is a flag indicating whether this is
+    // forward dependency or not, which doesn't affect the legality check.
+    for (char D : drop_end(Row))
       LLVM_DEBUG(dbgs() << D << " ");
     LLVM_DEBUG(dbgs() << "\n");
   }
 }
+
+/// Return true if \p Src appears before \p Dst in the same basic block.
+/// Precondition: \p Src and \Dst are distinct instructions within the same
+/// basic block.
+static bool inThisOrder(const Instruction *Src, const Instruction *Dst) {
+  assert(Src->getParent() == Dst->getParent() && Src != Dst &&
+         "Expected Src and Dst to be 
diff erent instructions in the same BB");
+
+  bool FoundSrc = false;
+  for (const Instruction &I : *(Src->getParent())) {
+    if (&I == Src) {
+      FoundSrc = true;
+      continue;
+    }
+    if (&I == Dst)
+      return FoundSrc;
+  }
+
+  llvm_unreachable("Dst not found");
+}
 #endif
 
 static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
@@ -174,7 +203,10 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
     return false;
   }
   ValueVector::iterator I, IE, J, JE;
-  StringSet<> Seen;
+
+  // Manage direction vectors that are already seen. Map each direction vector
+  // to an index of DepMatrix at which it is stored.
+  StringMap<unsigned> Seen;
 
   for (I = MemInstr.begin(), IE = MemInstr.end(); I != IE; ++I) {
     for (J = I, JE = MemInstr.end(); J != JE; ++J) {
@@ -228,9 +260,49 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
           Dep.push_back('I');
         }
 
+        // Test whether the dependency is forward or not.
+        bool IsKnownForward = true;
+        if (Src->getParent() != Dst->getParent()) {
+          // In general, when Src and Dst are in 
diff erent BBs, the execution
+          // order of them within a single iteration is not guaranteed. Treat
+          // conservatively as not-forward dependency in this case.
+          IsKnownForward = false;
+        } else {
+          // Src and Dst are in the same BB. If they are the 
diff erent
+          // instructions, Src should appear before Dst in the BB as they are
+          // stored to MemInstr in that order.
+          assert((Src == Dst || inThisOrder(Src, Dst)) &&
+                 "Unexpected instructions");
+
+          // If the Dependence object is reversed (due to normalization), it
+          // represents the dependency from Dst to Src, meaning it is a backward
+          // dependency. Otherwise it should be a forward dependency.
+          bool IsReversed = D->getSrc() != Src;
+          if (IsReversed)
+            IsKnownForward = false;
+        }
+
+        // Initialize the last element. Assume forward dependencies only; it
+        // will be updated later if there is any non-forward dependency.
+        Dep.push_back('<');
+
+        // The last element should express the "summary" among one or more
+        // direction vectors whose first N elements are the same (where N is
+        // the depth of the loop nest). Hence we exclude the last element from
+        // the Seen map.
+        auto [Ite, Inserted] = Seen.try_emplace(
+            StringRef(Dep.data(), Dep.size() - 1), DepMatrix.size());
+
         // Make sure we only add unique entries to the dependency matrix.
-        if (Seen.insert(StringRef(Dep.data(), Dep.size())).second)
+        if (Inserted)
           DepMatrix.push_back(Dep);
+
+        // If we cannot prove that this dependency is forward, change the last
+        // element of the corresponding entry. Since a `[... *]` dependency
+        // includes a `[... <]` dependency, we do not need to keep both and
+        // change the existing entry instead.
+        if (!IsKnownForward)
+          DepMatrix[Ite->second].back() = '*';
       }
     }
   }
@@ -281,11 +353,12 @@ static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix,
       continue;
 
     // Check if the direction vector is lexicographically positive (or zero)
-    // for both before/after exchanged.
-    if (isLexicographicallyPositive(Cur, OuterLoopId, Cur.size()) == false)
+    // for both before/after exchanged. Ignore the last element because it
+    // doesn't affect the legality.
+    if (isLexicographicallyPositive(Cur, OuterLoopId, Cur.size() - 1) == false)
       return false;
     std::swap(Cur[InnerLoopId], Cur[OuterLoopId]);
-    if (isLexicographicallyPositive(Cur, OuterLoopId, Cur.size()) == false)
+    if (isLexicographicallyPositive(Cur, OuterLoopId, Cur.size() - 1) == false)
       return false;
   }
   return true;
@@ -1334,22 +1407,35 @@ LoopInterchangeProfitability::isProfitablePerInstrOrderCost() {
 static bool canVectorize(const CharMatrix &DepMatrix, unsigned LoopId) {
   for (const auto &Dep : DepMatrix) {
     char Dir = Dep[LoopId];
-    if (Dir != 'I' && Dir != '=')
-      return false;
+    char DepType = Dep.back();
+    assert((DepType == '<' || DepType == '*') &&
+           "Unexpected element in dependency vector");
+
+    // There are no loop-carried dependencies.
+    if (Dir == '=' || Dir == 'I')
+      continue;
+
+    // DepType being '<' means that this direction vector represents a forward
+    // dependency. In principle, a loop with '<' direction can be vectorized in
+    // this case.
+    if (Dir == '<' && DepType == '<')
+      continue;
+
+    // We cannot prove that the loop is vectorizable.
+    return false;
   }
   return true;
 }
 
 std::optional<bool> LoopInterchangeProfitability::isProfitableForVectorization(
     unsigned InnerLoopId, unsigned OuterLoopId, CharMatrix &DepMatrix) {
-  // If the outer loop is not loop independent it is not profitable to move
-  // this to inner position, since doing so would not enable inner loop
-  // parallelism.
+  // If the outer loop cannot be vectorized, it is not profitable to move this
+  // to inner position.
   if (!canVectorize(DepMatrix, OuterLoopId))
     return false;
 
-  // If inner loop has dependence and outer loop is loop independent then it is
-  // profitable to interchange to enable inner loop parallelism.
+  // If the inner loop cannot be vectorized but the outer loop can be, then it
+  // is profitable to interchange to enable inner loop parallelism.
   if (!canVectorize(DepMatrix, InnerLoopId))
     return true;
 

diff  --git a/llvm/test/Transforms/LoopInterchange/profitability-vectorization-heuristic.ll b/llvm/test/Transforms/LoopInterchange/profitability-vectorization-heuristic.ll
index 9c113d4570e4d..4194849784054 100644
--- a/llvm/test/Transforms/LoopInterchange/profitability-vectorization-heuristic.ll
+++ b/llvm/test/Transforms/LoopInterchange/profitability-vectorization-heuristic.ll
@@ -5,6 +5,8 @@
 @A = dso_local global [256 x [256 x float]] zeroinitializer
 @B = dso_local global [256 x [256 x float]] zeroinitializer
 @C = dso_local global [256 x [256 x float]] zeroinitializer
+ at D = global [256 x [256 x [256 x float]]] zeroinitializer
+ at E = global [256 x [256 x [256 x float]]] zeroinitializer
 
 ; Check that the below loops are exchanged for vectorization.
 ;
@@ -64,15 +66,13 @@ exit:
 ;   for (int j = 1; j < 256; j++)
 ;     A[i][j-1] = A[i][j] + B[i][j];
 ;
-; FIXME: These loops are exchanged at this time due to the problem in
-; profitability heuristic calculation for vectorization.
 
-; CHECK:      --- !Passed
+; CHECK:      --- !Missed
 ; CHECK-NEXT: Pass:            loop-interchange
-; CHECK-NEXT: Name:            Interchanged
+; CHECK-NEXT: Name:            InterchangeNotProfitable
 ; CHECK-NEXT: Function:        interchange_unnecesasry_for_vectorization
 ; CHECK-NEXT: Args:
-; CHECK-NEXT:   - String:          Loop interchanged with enclosing loop.
+; CHECK-NEXT:   - String:          Insufficient information to calculate the cost of loop for interchange.
 define void @interchange_unnecesasry_for_vectorization() {
 entry:
   br label %for.i.header
@@ -103,3 +103,134 @@ for.i.inc:
 exit:
   ret void
 }
+
+; Check that the below loops are exchanged to allow innermost loop
+; vectorization. We cannot vectorize the j-loop because it has a lexically
+; backward dependency, but the i-loop can be vectorized because all the
+; loop-carried dependencies are lexically forward. LoopVectorize currently only
+; vectorizes innermost loop, hence move the i-loop to that position.
+;
+; for (int i = 0; i < 255; i++) {
+;   for (int j = 1; j < 256; j++) {
+;     A[i][j] = A[i][j-1] + B[i][j];
+;     C[i][j] += C[i+1][j];
+;   }
+; }
+;
+
+; CHECK:      --- !Passed
+; CHECK-NEXT: Pass:            loop-interchange
+; CHECK-NEXT: Name:            Interchanged
+; CHECK-NEXT: Function:        interchange_necessary_for_vectorization2
+; CHECK-NEXT: Args:
+; CHECK-NEXT:   - String:          Loop interchanged with enclosing loop.
+define void @interchange_necessary_for_vectorization2() {
+entry:
+  br label %for.i.header
+
+for.i.header:
+  %i = phi i64 [ 1, %entry ], [ %i.next, %for.i.inc ]
+  %i.inc = add i64 %i, 1
+  br label %for.j.body
+
+for.j.body:
+  %j = phi i64 [ 1, %for.i.header ], [ %j.next, %for.j.body ]
+  %j.dec = add i64 %j, -1
+  %a.load.index = getelementptr [256 x [256 x float]], ptr @A, i64 0, i64 %i, i64 %j.dec
+  %b.index = getelementptr [256 x [256 x float]], ptr @B, i64 0, i64 %i, i64 %j
+  %c.load.index = getelementptr [256 x [256 x float]], ptr @C, i64 0, i64 %i.inc, i64 %j
+  %c.store.index = getelementptr [256 x [256 x float]], ptr @C, i64 0, i64 %i, i64 %j
+  %a = load float, ptr %a.load.index
+  %b = load float, ptr %b.index
+  %c0 = load float, ptr %c.load.index
+  %c1 = load float, ptr %c.store.index
+  %add.0 = fadd float %a, %b
+  %a.store.index = getelementptr [256 x [256 x float]], ptr @A, i64 0, i64 %i, i64 %j
+  store float %add.0, ptr %a.store.index
+  %add.1 = fadd float %c0, %c1
+  store float %add.1, ptr %c.store.index
+  %j.next = add i64 %j, 1
+  %cmp.j = icmp eq i64 %j.next, 256
+  br i1 %cmp.j, label %for.i.inc, label %for.j.body
+
+for.i.inc:
+  %i.next = add i64 %i, 1
+  %cmp.i = icmp eq i64 %i.next, 255
+  br i1 %cmp.i, label %exit, label %for.i.header
+
+exit:
+  ret void
+}
+
+; Check that no interchange is performed for the following loop. Interchanging
+; the j-loop and k-loop makes the innermost loop vectorizble, since the j-loop
+; has only forward dependencies. However, at the moment, a loop body consisting
+; of multiple BBs is handled pesimistically. Hence the j-loop isn't moved to
+; the innermost place.
+;
+; for (int i = 0; i < 255; i++) {
+;   for (int j = 0; j < 255; j++) {
+;     for (int k = 0; k < 128; k++) {
+;       E[i][j][k] = D[i+1][j+1][2*k];
+;       if (cond)
+;         D[i][j][k+1] = 1.0;
+;   }
+; }
+
+; CHECK:      --- !Missed
+; CHECK-NEXT: Pass:            loop-interchange
+; CHECK-NEXT: Name:            InterchangeNotProfitable
+; CHECK-NEXT: Function:        multiple_BBs_in_loop
+; CHECK-NEXT: Args:
+; CHECK-NEXT:   - String:          Interchanging loops is not considered to improve cache locality nor vectorization.
+; CHECK:      --- !Missed
+; CHECK-NEXT: Pass:            loop-interchange
+; CHECK-NEXT: Name:            InterchangeNotProfitable
+; CHECK-NEXT: Function:        multiple_BBs_in_loop
+; CHECK-NEXT: Args:
+; CHECK-NEXT:   - String:          Interchanging loops is not considered to improve cache locality nor vectorization.
+define void @multiple_BBs_in_loop() {
+entry:
+  br label %for.i.header
+
+for.i.header:
+  %i = phi i64 [ 0, %entry ], [ %i.inc, %for.i.inc ]
+  %i.inc = add i64 %i, 1
+  br label %for.j.header
+
+for.j.header:
+  %j = phi i64 [ 0, %for.i.header ], [ %j.inc, %for.j.inc ]
+  %j.inc = add i64 %j, 1
+  br label %for.k.body
+
+for.k.body:
+  %k = phi i64 [ 0, %for.j.header ], [ %k.inc, %for.k.inc ]
+  %k.inc = add i64 %k, 1
+  %k.2 = mul i64 %k, 2
+  %d.index = getelementptr [256 x [256 x [256 x float]]], ptr @D, i64 0, i64 %i.inc, i64 %j.inc, i64 %k.2
+  %e.index = getelementptr [256 x [256 x [256 x float]]], ptr @E, i64 0, i64 %i, i64 %j, i64 %k
+  %d.load = load float, ptr %d.index
+  store float %d.load, ptr %e.index
+  %cond = freeze i1 undef
+  br i1 %cond, label %if.then, label %for.k.inc
+
+if.then:
+  %d.index2 = getelementptr [256 x [256 x [256 x float]]], ptr @D, i64 0, i64 %i, i64 %j, i64 %k.inc
+  store float 1.0, ptr %d.index2
+  br label %for.k.inc
+
+for.k.inc:
+  %cmp.k = icmp eq i64 %k.inc, 128
+  br i1 %cmp.k, label %for.j.inc, label %for.k.body
+
+for.j.inc:
+  %cmp.j = icmp eq i64 %j.inc, 255
+  br i1 %cmp.j, label %for.i.inc, label %for.j.header
+
+for.i.inc:
+  %cmp.i = icmp eq i64 %i.inc, 255
+  br i1 %cmp.i, label %exit, label %for.i.header
+
+exit:
+  ret void
+}