[llvm] [LoopInterchange] Improve profitability check for vectorization (PR #133672)

Ryotaro Kasuga via llvm-commits llvm-commits at lists.llvm.org
Thu Jul 24 02:48:31 PDT 2025


https://github.com/kasuga-fj updated https://github.com/llvm/llvm-project/pull/133672

>From b1c4248c4a242698feedb6d69d61d07dbcca407c Mon Sep 17 00:00:00 2001
From: Ryotaro Kasuga <kasuga.ryotaro at fujitsu.com>
Date: Thu, 27 Mar 2025 10:45:26 +0000
Subject: [PATCH 1/7] [LoopInterchange] Improve profitability check for
 vectorization

The vectorization profitability heuristic checks whether a given loop can
be vectorized. Because this check is conservative, a loop that can in fact
be vectorized may be deemed non-vectorizable. This can trigger unnecessary
interchanges.
This patch improves the profitability decision by mitigating such
misjudgments. Before this patch, we considered a loop to be vectorizable
only when there are no loop carried dependencies with the IV of the
loop. However, a loop carried dependency doesn't prevent vectorization
if the distance is positive. This patch makes the vectorization check
more accurate by also allowing loops whose loop carried dependencies are
positive. Note that it is still difficult to decide completely whether a
loop can be vectorized or not. To make a complete decision, we would also
have to check the vector width and the distance of the dependency.
---
 .../lib/Transforms/Scalar/LoopInterchange.cpp | 128 ++++++++++++++----
 .../profitability-vectorization-heuristic.ll  |   8 +-
 2 files changed, 106 insertions(+), 30 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
index 1dccba4cfa7b8..078da53c52b52 100644
--- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -17,8 +17,8 @@
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/StringSet.h"
 #include "llvm/Analysis/DependenceAnalysis.h"
 #include "llvm/Analysis/LoopCacheAnalysis.h"
 #include "llvm/Analysis/LoopInfo.h"
@@ -80,6 +80,21 @@ enum class RuleTy {
   ForVectorization,
 };
 
+/// Store the information about if corresponding direction vector was negated
+/// by normalization or not. This is necessary to restore the original one from
+/// a row of a dependency matrix, because we only manage normalized direction
+/// vectors and duplicate vectors are eliminated. So there may be both original
+/// and negated vectors for a single entry (a row of dependency matrix). E.g.,
+/// if there are two direction vectors `[< =]` and `[> =]`, the later one will
+/// be converted to the same as former one by normalization, so only `[< =]`
+/// would be retained in the final result.
+struct NegatedStatus {
+  bool Original = false;
+  bool Negated = false;
+
+  bool isNonNegativeDir(char Dir) const;
+};
+
 } // end anonymous namespace
 
 // Minimum loop depth supported.
@@ -126,9 +141,10 @@ static void printDepMatrix(CharMatrix &DepMatrix) {
 }
 #endif
 
-static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
-                                     Loop *L, DependenceInfo *DI,
-                                     ScalarEvolution *SE,
+static bool populateDependencyMatrix(CharMatrix &DepMatrix,
+                                     std::vector<NegatedStatus> &NegStatusVec,
+                                     unsigned Level, Loop *L,
+                                     DependenceInfo *DI, ScalarEvolution *SE,
                                      OptimizationRemarkEmitter *ORE) {
   using ValueVector = SmallVector<Value *, 16>;
 
@@ -167,7 +183,9 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
     return false;
   }
   ValueVector::iterator I, IE, J, JE;
-  StringSet<> Seen;
+
+  // Manage all found direction vectors. and map it to the index of DepMatrix.
+  StringMap<unsigned> Seen;
 
   for (I = MemInstr.begin(), IE = MemInstr.end(); I != IE; ++I) {
     for (J = I, JE = MemInstr.end(); J != JE; ++J) {
@@ -182,7 +200,8 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
         assert(D->isOrdered() && "Expected an output, flow or anti dep.");
         // If the direction vector is negative, normalize it to
         // make it non-negative.
-        if (D->normalize(SE))
+        bool Normalized = D->normalize(SE);
+        if (Normalized)
           LLVM_DEBUG(dbgs() << "Negative dependence vector normalized.\n");
         LLVM_DEBUG(StringRef DepType =
                        D->isFlow() ? "flow" : D->isAnti() ? "anti" : "output";
@@ -214,8 +233,17 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
         }
 
         // Make sure we only add unique entries to the dependency matrix.
-        if (Seen.insert(StringRef(Dep.data(), Dep.size())).second)
+        unsigned Index = DepMatrix.size();
+        auto [Ite, Inserted] =
+            Seen.try_emplace(StringRef(Dep.data(), Dep.size()), Index);
+        if (Inserted) {
           DepMatrix.push_back(Dep);
+          NegStatusVec.push_back(NegatedStatus{});
+        } else
+          Index = Ite->second;
+
+        NegatedStatus &Status = NegStatusVec[Index];
+        (Normalized ? Status.Negated : Status.Original) = true;
       }
     }
   }
@@ -400,6 +428,7 @@ class LoopInterchangeProfitability {
   bool isProfitable(const Loop *InnerLoop, const Loop *OuterLoop,
                     unsigned InnerLoopId, unsigned OuterLoopId,
                     CharMatrix &DepMatrix,
+                    const std::vector<NegatedStatus> &NegStatusVec,
                     const DenseMap<const Loop *, unsigned> &CostMap,
                     std::unique_ptr<CacheCost> &CC);
 
@@ -409,9 +438,10 @@ class LoopInterchangeProfitability {
       const DenseMap<const Loop *, unsigned> &CostMap,
       std::unique_ptr<CacheCost> &CC);
   std::optional<bool> isProfitablePerInstrOrderCost();
-  std::optional<bool> isProfitableForVectorization(unsigned InnerLoopId,
-                                                   unsigned OuterLoopId,
-                                                   CharMatrix &DepMatrix);
+  std::optional<bool>
+  isProfitableForVectorization(unsigned InnerLoopId, unsigned OuterLoopId,
+                               CharMatrix &DepMatrix,
+                               const std::vector<NegatedStatus> &NegStatusVec);
   Loop *OuterLoop;
   Loop *InnerLoop;
 
@@ -503,8 +533,9 @@ struct LoopInterchange {
                       << "\n");
 
     CharMatrix DependencyMatrix;
+    std::vector<NegatedStatus> NegStatusVec;
     Loop *OuterMostLoop = *(LoopList.begin());
-    if (!populateDependencyMatrix(DependencyMatrix, LoopNestDepth,
+    if (!populateDependencyMatrix(DependencyMatrix, NegStatusVec, LoopNestDepth,
                                   OuterMostLoop, DI, SE, ORE)) {
       LLVM_DEBUG(dbgs() << "Populating dependency matrix failed\n");
       return false;
@@ -543,8 +574,8 @@ struct LoopInterchange {
     for (unsigned j = SelecLoopId; j > 0; j--) {
       bool ChangedPerIter = false;
       for (unsigned i = SelecLoopId; i > SelecLoopId - j; i--) {
-        bool Interchanged =
-            processLoop(LoopList, i, i - 1, DependencyMatrix, CostMap);
+        bool Interchanged = processLoop(LoopList, i, i - 1, DependencyMatrix,
+                                        NegStatusVec, CostMap);
         ChangedPerIter |= Interchanged;
         Changed |= Interchanged;
       }
@@ -559,6 +590,8 @@ struct LoopInterchange {
   bool processLoop(SmallVectorImpl<Loop *> &LoopList, unsigned InnerLoopId,
                    unsigned OuterLoopId,
                    std::vector<std::vector<char>> &DependencyMatrix,
+
+                   const std::vector<NegatedStatus> &NegStatusVec,
                    const DenseMap<const Loop *, unsigned> &CostMap) {
     Loop *OuterLoop = LoopList[OuterLoopId];
     Loop *InnerLoop = LoopList[InnerLoopId];
@@ -572,7 +605,7 @@ struct LoopInterchange {
     LLVM_DEBUG(dbgs() << "Loops are legal to interchange\n");
     LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE);
     if (!LIP.isProfitable(InnerLoop, OuterLoop, InnerLoopId, OuterLoopId,
-                          DependencyMatrix, CostMap, CC)) {
+                          DependencyMatrix, NegStatusVec, CostMap, CC)) {
       LLVM_DEBUG(dbgs() << "Interchanging loops not profitable.\n");
       return false;
     }
@@ -1197,27 +1230,71 @@ LoopInterchangeProfitability::isProfitablePerInstrOrderCost() {
   return std::nullopt;
 }
 
+static char flipDirection(char Dir) {
+  switch (Dir) {
+  case '<':
+    return '>';
+  case '>':
+    return '<';
+  case '=':
+  case 'I':
+  case '*':
+    return Dir;
+  default:
+    llvm_unreachable("Unknown direction");
+  }
+}
+
+/// Ensure that there are no negative direction dependencies corresponding to \p
+/// Dir.
+bool NegatedStatus::isNonNegativeDir(char Dir) const {
+  assert((Original || Negated) && "Cannot restore the original direction");
+
+  // If both flag is true, it means that there is both as-is and negated
+  // direction. In this case only `=` or `I` don't have negative direction
+  // dependency.
+  if (Original && Negated)
+    return Dir == '=' || Dir == 'I';
+
+  char Restored = Negated ? flipDirection(Dir) : Dir;
+  return Restored == '=' || Restored == 'I' || Restored == '<';
+}
+
 /// Return true if we can vectorize the loop specified by \p LoopId.
-static bool canVectorize(const CharMatrix &DepMatrix, unsigned LoopId) {
+static bool canVectorize(const CharMatrix &DepMatrix,
+                         const std::vector<NegatedStatus> &NegStatusVec,
+                         unsigned LoopId) {
+  // The loop can be vectorized if there are no negative dependencies. Consider
+  // the dependency of `j` in the following example.
+  //
+  //   Positive: ... = A[i][j]       Negative: ... = A[i][j-1]
+  //             A[i][j-1] = ...               A[i][j] = ...
+  //
+  // In the right case, vectorizing the loop can change the loaded value from
+  // `A[i][j-1]`. At the moment we don't take into account the distance of the
+  // dependency and vector width.
+  // TODO: Considering the dependency distance and the vector width can give a
+  // more accurate result. For example, the following loop can be vectorized if
+  // the vector width is less than or equal to 4 x sizeof(A[0][0]).
   for (unsigned I = 0; I != DepMatrix.size(); I++) {
     char Dir = DepMatrix[I][LoopId];
-    if (Dir != 'I' && Dir != '=')
+    if (!NegStatusVec[I].isNonNegativeDir(Dir))
       return false;
   }
   return true;
 }
 
 std::optional<bool> LoopInterchangeProfitability::isProfitableForVectorization(
-    unsigned InnerLoopId, unsigned OuterLoopId, CharMatrix &DepMatrix) {
-  // If the outer loop is not loop independent it is not profitable to move
-  // this to inner position, since doing so would not enable inner loop
-  // parallelism.
-  if (!canVectorize(DepMatrix, OuterLoopId))
+    unsigned InnerLoopId, unsigned OuterLoopId, CharMatrix &DepMatrix,
+    const std::vector<NegatedStatus> &NegStatusVec) {
+  // If the outer loop cannot be vectorized, it is not profitable to move this
+  // to inner position.
+  if (!canVectorize(DepMatrix, NegStatusVec, OuterLoopId))
     return false;
 
-  // If inner loop has dependence and outer loop is loop independent then it is
+  // If inner loop cannot be vectorized and outer loop can be then it is
   // profitable to interchange to enable inner loop parallelism.
-  if (!canVectorize(DepMatrix, InnerLoopId))
+  if (!canVectorize(DepMatrix, NegStatusVec, InnerLoopId))
     return true;
 
   // If both the inner and the outer loop can be vectorized, it is necessary to
@@ -1231,6 +1308,7 @@ std::optional<bool> LoopInterchangeProfitability::isProfitableForVectorization(
 bool LoopInterchangeProfitability::isProfitable(
     const Loop *InnerLoop, const Loop *OuterLoop, unsigned InnerLoopId,
     unsigned OuterLoopId, CharMatrix &DepMatrix,
+    const std::vector<NegatedStatus> &NegStatusVec,
     const DenseMap<const Loop *, unsigned> &CostMap,
     std::unique_ptr<CacheCost> &CC) {
   // isProfitable() is structured to avoid endless loop interchange. If the
@@ -1252,8 +1330,8 @@ bool LoopInterchangeProfitability::isProfitable(
       shouldInterchange = isProfitablePerInstrOrderCost();
       break;
     case RuleTy::ForVectorization:
-      shouldInterchange =
-          isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix);
+      shouldInterchange = isProfitableForVectorization(InnerLoopId, OuterLoopId,
+                                                       DepMatrix, NegStatusVec);
       break;
     }
 
diff --git a/llvm/test/Transforms/LoopInterchange/profitability-vectorization-heuristic.ll b/llvm/test/Transforms/LoopInterchange/profitability-vectorization-heuristic.ll
index 0f5aee582373d..14c2046eebbb4 100644
--- a/llvm/test/Transforms/LoopInterchange/profitability-vectorization-heuristic.ll
+++ b/llvm/test/Transforms/LoopInterchange/profitability-vectorization-heuristic.ll
@@ -64,15 +64,13 @@ exit:
 ;   for (int j = 1; j < 256; j++)
 ;     A[i][j-1] = A[i][j] + B[i][j];
 ;
-; FIXME: These loops are exchanged at this time due to the problem in
-; profitability heuristic calculation for vectorization.
 
-; CHECK:      --- !Passed
+; CHECK:      --- !Missed
 ; CHECK-NEXT: Pass:            loop-interchange
-; CHECK-NEXT: Name:            Interchanged
+; CHECK-NEXT: Name:            InterchangeNotProfitable
 ; CHECK-NEXT: Function:        interchange_unnecesasry_for_vectorization
 ; CHECK-NEXT: Args:
-; CHECK-NEXT:   - String:          Loop interchanged with enclosing loop.
+; CHECK-NEXT:   - String:          Insufficient information to calculate the cost of loop for interchange.
 define void @interchange_unnecesasry_for_vectorization() {
 entry:
   br label %for.i.header

>From 8f4f814b01d2ad5cab1962513adc8bf7deeec012 Mon Sep 17 00:00:00 2001
From: Ryotaro Kasuga <kasuga.ryotaro at fujitsu.com>
Date: Thu, 3 Apr 2025 09:55:13 +0000
Subject: [PATCH 2/7] Handle negated and non negated direction vectors
 separately.

---
 .../lib/Transforms/Scalar/LoopInterchange.cpp | 90 ++++++-------------
 1 file changed, 27 insertions(+), 63 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
index 078da53c52b52..fe33ee33258f1 100644
--- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -19,6 +19,7 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/StringSet.h"
 #include "llvm/Analysis/DependenceAnalysis.h"
 #include "llvm/Analysis/LoopCacheAnalysis.h"
 #include "llvm/Analysis/LoopInfo.h"
@@ -80,21 +81,6 @@ enum class RuleTy {
   ForVectorization,
 };
 
-/// Store the information about if corresponding direction vector was negated
-/// by normalization or not. This is necessary to restore the original one from
-/// a row of a dependency matrix, because we only manage normalized direction
-/// vectors and duplicate vectors are eliminated. So there may be both original
-/// and negated vectors for a single entry (a row of dependency matrix). E.g.,
-/// if there are two direction vectors `[< =]` and `[> =]`, the later one will
-/// be converted to the same as former one by normalization, so only `[< =]`
-/// would be retained in the final result.
-struct NegatedStatus {
-  bool Original = false;
-  bool Negated = false;
-
-  bool isNonNegativeDir(char Dir) const;
-};
-
 } // end anonymous namespace
 
 // Minimum loop depth supported.
@@ -142,9 +128,9 @@ static void printDepMatrix(CharMatrix &DepMatrix) {
 #endif
 
 static bool populateDependencyMatrix(CharMatrix &DepMatrix,
-                                     std::vector<NegatedStatus> &NegStatusVec,
-                                     unsigned Level, Loop *L,
-                                     DependenceInfo *DI, ScalarEvolution *SE,
+                                     BitVector &IsNegatedVec, unsigned Level,
+                                     Loop *L, DependenceInfo *DI,
+                                     ScalarEvolution *SE,
                                      OptimizationRemarkEmitter *ORE) {
   using ValueVector = SmallVector<Value *, 16>;
 
@@ -184,8 +170,8 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix,
   }
   ValueVector::iterator I, IE, J, JE;
 
-  // Manage all found direction vectors. and map it to the index of DepMatrix.
-  StringMap<unsigned> Seen;
+  // Manage all found direction vectors, negated and not negated, separately.
+  StringSet<> Seen[2];
 
   for (I = MemInstr.begin(), IE = MemInstr.end(); I != IE; ++I) {
     for (J = I, JE = MemInstr.end(); J != JE; ++J) {
@@ -233,17 +219,12 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix,
         }
 
         // Make sure we only add unique entries to the dependency matrix.
-        unsigned Index = DepMatrix.size();
-        auto [Ite, Inserted] =
-            Seen.try_emplace(StringRef(Dep.data(), Dep.size()), Index);
-        if (Inserted) {
+        // Negated vectors (due to normalization) are treated as separate from
+        // non negated ones.
+        if (Seen[Normalized].insert(StringRef(Dep.data(), Dep.size())).second) {
           DepMatrix.push_back(Dep);
-          NegStatusVec.push_back(NegatedStatus{});
-        } else
-          Index = Ite->second;
-
-        NegatedStatus &Status = NegStatusVec[Index];
-        (Normalized ? Status.Negated : Status.Original) = true;
+          IsNegatedVec.push_back(Normalized);
+        }
       }
     }
   }
@@ -427,8 +408,7 @@ class LoopInterchangeProfitability {
   /// Check if the loop interchange is profitable.
   bool isProfitable(const Loop *InnerLoop, const Loop *OuterLoop,
                     unsigned InnerLoopId, unsigned OuterLoopId,
-                    CharMatrix &DepMatrix,
-                    const std::vector<NegatedStatus> &NegStatusVec,
+                    CharMatrix &DepMatrix, const BitVector &IsNegatedVec,
                     const DenseMap<const Loop *, unsigned> &CostMap,
                     std::unique_ptr<CacheCost> &CC);
 
@@ -441,7 +421,7 @@ class LoopInterchangeProfitability {
   std::optional<bool>
   isProfitableForVectorization(unsigned InnerLoopId, unsigned OuterLoopId,
                                CharMatrix &DepMatrix,
-                               const std::vector<NegatedStatus> &NegStatusVec);
+                               const BitVector &IsNegatedVec);
   Loop *OuterLoop;
   Loop *InnerLoop;
 
@@ -533,9 +513,9 @@ struct LoopInterchange {
                       << "\n");
 
     CharMatrix DependencyMatrix;
-    std::vector<NegatedStatus> NegStatusVec;
+    BitVector IsNegatedVec;
     Loop *OuterMostLoop = *(LoopList.begin());
-    if (!populateDependencyMatrix(DependencyMatrix, NegStatusVec, LoopNestDepth,
+    if (!populateDependencyMatrix(DependencyMatrix, IsNegatedVec, LoopNestDepth,
                                   OuterMostLoop, DI, SE, ORE)) {
       LLVM_DEBUG(dbgs() << "Populating dependency matrix failed\n");
       return false;
@@ -575,7 +555,7 @@ struct LoopInterchange {
       bool ChangedPerIter = false;
       for (unsigned i = SelecLoopId; i > SelecLoopId - j; i--) {
         bool Interchanged = processLoop(LoopList, i, i - 1, DependencyMatrix,
-                                        NegStatusVec, CostMap);
+                                        IsNegatedVec, CostMap);
         ChangedPerIter |= Interchanged;
         Changed |= Interchanged;
       }
@@ -590,8 +570,7 @@ struct LoopInterchange {
   bool processLoop(SmallVectorImpl<Loop *> &LoopList, unsigned InnerLoopId,
                    unsigned OuterLoopId,
                    std::vector<std::vector<char>> &DependencyMatrix,
-
-                   const std::vector<NegatedStatus> &NegStatusVec,
+                   BitVector &IsNegatedVec,
                    const DenseMap<const Loop *, unsigned> &CostMap) {
     Loop *OuterLoop = LoopList[OuterLoopId];
     Loop *InnerLoop = LoopList[InnerLoopId];
@@ -605,7 +584,7 @@ struct LoopInterchange {
     LLVM_DEBUG(dbgs() << "Loops are legal to interchange\n");
     LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE);
     if (!LIP.isProfitable(InnerLoop, OuterLoop, InnerLoopId, OuterLoopId,
-                          DependencyMatrix, NegStatusVec, CostMap, CC)) {
+                          DependencyMatrix, IsNegatedVec, CostMap, CC)) {
       LLVM_DEBUG(dbgs() << "Interchanging loops not profitable.\n");
       return false;
     }
@@ -1245,25 +1224,9 @@ static char flipDirection(char Dir) {
   }
 }
 
-/// Ensure that there are no negative direction dependencies corresponding to \p
-/// Dir.
-bool NegatedStatus::isNonNegativeDir(char Dir) const {
-  assert((Original || Negated) && "Cannot restore the original direction");
-
-  // If both flag is true, it means that there is both as-is and negated
-  // direction. In this case only `=` or `I` don't have negative direction
-  // dependency.
-  if (Original && Negated)
-    return Dir == '=' || Dir == 'I';
-
-  char Restored = Negated ? flipDirection(Dir) : Dir;
-  return Restored == '=' || Restored == 'I' || Restored == '<';
-}
-
 /// Return true if we can vectorize the loop specified by \p LoopId.
 static bool canVectorize(const CharMatrix &DepMatrix,
-                         const std::vector<NegatedStatus> &NegStatusVec,
-                         unsigned LoopId) {
+                         const BitVector &IsNegatedVec, unsigned LoopId) {
   // The loop can be vectorized if there are no negative dependencies. Consider
   // the dependency of `j` in the following example.
   //
@@ -1278,7 +1241,9 @@ static bool canVectorize(const CharMatrix &DepMatrix,
   // the vector width is less than or equal to 4 x sizeof(A[0][0]).
   for (unsigned I = 0; I != DepMatrix.size(); I++) {
     char Dir = DepMatrix[I][LoopId];
-    if (!NegStatusVec[I].isNonNegativeDir(Dir))
+    if (IsNegatedVec[I])
+      Dir = flipDirection(Dir);
+    if (Dir != '=' && Dir != 'I' && Dir != '<')
       return false;
   }
   return true;
@@ -1286,15 +1251,15 @@ static bool canVectorize(const CharMatrix &DepMatrix,
 
 std::optional<bool> LoopInterchangeProfitability::isProfitableForVectorization(
     unsigned InnerLoopId, unsigned OuterLoopId, CharMatrix &DepMatrix,
-    const std::vector<NegatedStatus> &NegStatusVec) {
+    const BitVector &IsNegatedVec) {
   // If the outer loop cannot be vectorized, it is not profitable to move this
   // to inner position.
-  if (!canVectorize(DepMatrix, NegStatusVec, OuterLoopId))
+  if (!canVectorize(DepMatrix, IsNegatedVec, OuterLoopId))
     return false;
 
   // If inner loop cannot be vectorized and outer loop can be then it is
   // profitable to interchange to enable inner loop parallelism.
-  if (!canVectorize(DepMatrix, NegStatusVec, InnerLoopId))
+  if (!canVectorize(DepMatrix, IsNegatedVec, InnerLoopId))
     return true;
 
   // If both the inner and the outer loop can be vectorized, it is necessary to
@@ -1307,8 +1272,7 @@ std::optional<bool> LoopInterchangeProfitability::isProfitableForVectorization(
 
 bool LoopInterchangeProfitability::isProfitable(
     const Loop *InnerLoop, const Loop *OuterLoop, unsigned InnerLoopId,
-    unsigned OuterLoopId, CharMatrix &DepMatrix,
-    const std::vector<NegatedStatus> &NegStatusVec,
+    unsigned OuterLoopId, CharMatrix &DepMatrix, const BitVector &IsNegatedVec,
     const DenseMap<const Loop *, unsigned> &CostMap,
     std::unique_ptr<CacheCost> &CC) {
   // isProfitable() is structured to avoid endless loop interchange. If the
@@ -1331,7 +1295,7 @@ bool LoopInterchangeProfitability::isProfitable(
       break;
     case RuleTy::ForVectorization:
       shouldInterchange = isProfitableForVectorization(InnerLoopId, OuterLoopId,
-                                                       DepMatrix, NegStatusVec);
+                                                       DepMatrix, IsNegatedVec);
       break;
     }
 

>From cad4db91a1c86941a4eabf17a9accc0df3ec65f2 Mon Sep 17 00:00:00 2001
From: Ryotaro Kasuga <kasuga.ryotaro at fujitsu.com>
Date: Tue, 8 Apr 2025 14:58:57 +0000
Subject: [PATCH 3/7] Add test that has positive dependencies

---
 .../profitability-vectorization-heuristic.ll  | 56 +++++++++++++++++++
 1 file changed, 56 insertions(+)

diff --git a/llvm/test/Transforms/LoopInterchange/profitability-vectorization-heuristic.ll b/llvm/test/Transforms/LoopInterchange/profitability-vectorization-heuristic.ll
index 14c2046eebbb4..7108d3adf5d79 100644
--- a/llvm/test/Transforms/LoopInterchange/profitability-vectorization-heuristic.ll
+++ b/llvm/test/Transforms/LoopInterchange/profitability-vectorization-heuristic.ll
@@ -101,3 +101,59 @@ for.i.inc:
 exit:
   ret void
 }
+
+; Check that the below loops are exchanged to allow innermost loop
+; vectorization. We cannot vectorize the j-loop because it has negative
+; distance dependency, but the i-loop can be vectorized.
+;
+; for (int i = 0; i < 255; i++) {
+;   for (int j = 1; j < 256; j++) {
+;     A[i][j] = A[i][j-1] + B[i][j];
+;     C[i][j] += C[i+1][j];
+;   }
+; }
+;
+
+; CHECK:      --- !Passed
+; CHECK-NEXT: Pass:            loop-interchange
+; CHECK-NEXT: Name:            Interchanged
+; CHECK-NEXT: Function:        interchange_necessary_for_vectorization2
+; CHECK-NEXT: Args:
+; CHECK-NEXT:   - String:          Loop interchanged with enclosing loop.
+define void @interchange_necessary_for_vectorization2() {
+entry:
+  br label %for.i.header
+
+for.i.header:
+  %i = phi i64 [ 1, %entry ], [ %i.next, %for.i.inc ]
+  %i.inc = add nsw i64 %i, 1
+  br label %for.j.body
+
+for.j.body:
+  %j = phi i64 [ 1, %for.i.header ], [ %j.next, %for.j.body ]
+  %j.dec = add nsw i64 %j, -1
+  %a.load.index = getelementptr nuw inbounds [256 x [256 x float]], ptr @A, i64 %i, i64 %j.dec
+  %b.index = getelementptr nuw inbounds [256 x [256 x float]], ptr @B, i64 %i, i64 %j
+  %c.load.index = getelementptr nuw inbounds [256 x [256 x float]], ptr @C, i64 %i.inc, i64 %j
+  %c.store.index = getelementptr nuw inbounds [256 x [256 x float]], ptr @C, i64 %i, i64 %j
+  %a = load float, ptr %a.load.index, align 4
+  %b = load float, ptr %b.index, align 4
+  %c0 = load float, ptr %c.load.index, align 4
+  %c1 = load float, ptr %c.store.index, align 4
+  %add.0 = fadd float %a, %b
+  %a.store.index = getelementptr nuw inbounds [256 x [256 x float]], ptr @A, i64 %i, i64 %j
+  store float %add.0, ptr %a.store.index, align 4
+  %add.1 = fadd float %c0, %c1
+  store float %add.1, ptr %c.store.index, align 4
+  %j.next = add nuw nsw i64 %j, 1
+  %cmp.j = icmp eq i64 %j.next, 256
+  br i1 %cmp.j, label %for.i.inc, label %for.j.body
+
+for.i.inc:
+  %i.next = add nuw nsw i64 %i, 1
+  %cmp.i = icmp eq i64 %i.next, 255
+  br i1 %cmp.i, label %exit, label %for.i.header
+
+exit:
+  ret void
+}

>From 6a0a86839357ab9e46e7152cff9c246244625f12 Mon Sep 17 00:00:00 2001
From: Ryotaro Kasuga <kasuga.ryotaro at fujitsu.com>
Date: Fri, 6 Jun 2025 08:41:40 +0000
Subject: [PATCH 4/7] Add "lexically forward" flag for vectorization
 profitability check

---
 .../lib/Transforms/Scalar/LoopInterchange.cpp | 145 ++++++++++--------
 .../profitability-vectorization-heuristic.ll  |  82 +++++++++-
 2 files changed, 160 insertions(+), 67 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
index da49da0bcc29d..63eef0eafb99d 100644
--- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -19,7 +19,6 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/StringSet.h"
 #include "llvm/Analysis/DependenceAnalysis.h"
 #include "llvm/Analysis/LoopCacheAnalysis.h"
 #include "llvm/Analysis/LoopInfo.h"
@@ -120,15 +119,18 @@ static bool noDuplicateRules(ArrayRef<RuleTy> Rules) {
 
 static void printDepMatrix(CharMatrix &DepMatrix) {
   for (auto &Row : DepMatrix) {
-    for (auto D : Row)
+    ArrayRef<char> RowRef(Row);
+
+    // Drop the last element because it is a flag indicating whether the row is
+    // "lexically forward", which doesn't affect the legality check.
+    for (auto D : RowRef.drop_back())
       LLVM_DEBUG(dbgs() << D << " ");
     LLVM_DEBUG(dbgs() << "\n");
   }
 }
 #endif
 
-static bool populateDependencyMatrix(CharMatrix &DepMatrix,
-                                     BitVector &IsNegatedVec, unsigned Level,
+static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
                                      Loop *L, DependenceInfo *DI,
                                      ScalarEvolution *SE,
                                      OptimizationRemarkEmitter *ORE) {
@@ -170,8 +172,19 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix,
   }
   ValueVector::iterator I, IE, J, JE;
 
-  // Manage all found direction vectors, negated and not negated, separately.
-  StringSet<> Seen[2];
+  // Manage direction vectors that are already seen. Map each direction vector
+  // to an index of DepMatrix at which it is stored.
+  StringMap<unsigned> Seen;
+
+  // The i-th element is set iff all dependencies corresponding to the i-th
+  // direction vector in DepMatrix are "lexically forward". The notion
+  // "lexically forward" aligns with what is defined in LAA
+  // (LoopAccessAnalysis).
+  //
+  // We deem a dependence lexically forward if we can prove that the
+  // destination instruction is always executed after the source instruction
+  // within each iteration.
+  BitVector IsForwardFlags;
 
   for (I = MemInstr.begin(), IE = MemInstr.end(); I != IE; ++I) {
     for (J = I, JE = MemInstr.end(); J != JE; ++J) {
@@ -184,11 +197,22 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix,
       // Track Output, Flow, and Anti dependencies.
       if (auto D = DI->depends(Src, Dst)) {
         assert(D->isOrdered() && "Expected an output, flow or anti dep.");
+        bool IsForward = true;
+
+        // If Src and Dst are in the same BB, Src is always executed before Dst
+        // in the same loop iteration. If not, we must check whether one BB
+        // dominates the other to determine if Src and Dst are executed in this
+        // order. At the moment, we don't perform such check.
+        if (Src->getParent() != Dst->getParent())
+          IsForward = false;
+
         // If the direction vector is negative, normalize it to
         // make it non-negative.
         bool Normalized = D->normalize(SE);
-        if (Normalized)
+        if (Normalized) {
           LLVM_DEBUG(dbgs() << "Negative dependence vector normalized.\n");
+          IsForward = false;
+        }
         LLVM_DEBUG(StringRef DepType =
                        D->isFlow() ? "flow" : D->isAnti() ? "anti" : "output";
                    dbgs() << "Found " << DepType
@@ -226,17 +250,28 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix,
           Dep.push_back('I');
         }
 
+        auto [Ite, Inserted] = Seen.try_emplace(
+            StringRef(Dep.data(), Dep.size()), DepMatrix.size());
+
         // Make sure we only add unique entries to the dependency matrix.
-        // Negated vectors (due to normalization) are treated as separate from
-        // non negated ones.
-        if (Seen[Normalized].insert(StringRef(Dep.data(), Dep.size())).second) {
+        if (Inserted) {
           DepMatrix.push_back(Dep);
-          IsNegatedVec.push_back(Normalized);
+          IsForwardFlags.push_back(true);
         }
+        if (!IsForward)
+          IsForwardFlags.reset(Ite->second);
       }
     }
   }
 
+  assert(DepMatrix.size() == IsForwardFlags.size() &&
+         "Dependency matrix and IsForwardVec should have the same size.");
+
+  // If all dependencies corresponding to a direction vector are forward, encode
+  // it to '<', otherwise to '*'.
+  for (unsigned I = 0; I != DepMatrix.size(); I++)
+    DepMatrix[I].push_back(IsForwardFlags[I] ? '<' : '*');
+
   return true;
 }
 
@@ -285,11 +320,12 @@ static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix,
       continue;
 
     // Check if the direction vector is lexicographically positive (or zero)
-    // for both before/after exchanged.
-    if (isLexicographicallyPositive(Cur, OuterLoopId, Cur.size()) == false)
+    // for both before/after exchanged. Ignore the last element because it
+    // doesn't affect the legality.
+    if (isLexicographicallyPositive(Cur, OuterLoopId, Cur.size() - 1) == false)
       return false;
     std::swap(Cur[InnerLoopId], Cur[OuterLoopId]);
-    if (isLexicographicallyPositive(Cur, OuterLoopId, Cur.size()) == false)
+    if (isLexicographicallyPositive(Cur, OuterLoopId, Cur.size() - 1) == false)
       return false;
   }
   return true;
@@ -429,7 +465,7 @@ class LoopInterchangeProfitability {
   /// Check if the loop interchange is profitable.
   bool isProfitable(const Loop *InnerLoop, const Loop *OuterLoop,
                     unsigned InnerLoopId, unsigned OuterLoopId,
-                    CharMatrix &DepMatrix, const BitVector &IsNegatedVec,
+                    CharMatrix &DepMatrix,
                     const DenseMap<const Loop *, unsigned> &CostMap,
                     std::unique_ptr<CacheCost> &CC);
 
@@ -439,10 +475,9 @@ class LoopInterchangeProfitability {
       const DenseMap<const Loop *, unsigned> &CostMap,
       std::unique_ptr<CacheCost> &CC);
   std::optional<bool> isProfitablePerInstrOrderCost();
-  std::optional<bool>
-  isProfitableForVectorization(unsigned InnerLoopId, unsigned OuterLoopId,
-                               CharMatrix &DepMatrix,
-                               const BitVector &IsNegatedVec);
+  std::optional<bool> isProfitableForVectorization(unsigned InnerLoopId,
+                                                   unsigned OuterLoopId,
+                                                   CharMatrix &DepMatrix);
   Loop *OuterLoop;
   Loop *InnerLoop;
 
@@ -534,9 +569,8 @@ struct LoopInterchange {
                       << "\n");
 
     CharMatrix DependencyMatrix;
-    BitVector IsNegatedVec;
     Loop *OuterMostLoop = *(LoopList.begin());
-    if (!populateDependencyMatrix(DependencyMatrix, IsNegatedVec, LoopNestDepth,
+    if (!populateDependencyMatrix(DependencyMatrix, LoopNestDepth,
                                   OuterMostLoop, DI, SE, ORE)) {
       LLVM_DEBUG(dbgs() << "Populating dependency matrix failed\n");
       return false;
@@ -575,8 +609,8 @@ struct LoopInterchange {
     for (unsigned j = SelecLoopId; j > 0; j--) {
       bool ChangedPerIter = false;
       for (unsigned i = SelecLoopId; i > SelecLoopId - j; i--) {
-        bool Interchanged = processLoop(LoopList, i, i - 1, DependencyMatrix,
-                                        IsNegatedVec, CostMap);
+        bool Interchanged =
+            processLoop(LoopList, i, i - 1, DependencyMatrix, CostMap);
         ChangedPerIter |= Interchanged;
         Changed |= Interchanged;
       }
@@ -591,7 +625,6 @@ struct LoopInterchange {
   bool processLoop(SmallVectorImpl<Loop *> &LoopList, unsigned InnerLoopId,
                    unsigned OuterLoopId,
                    std::vector<std::vector<char>> &DependencyMatrix,
-                   BitVector &IsNegatedVec,
                    const DenseMap<const Loop *, unsigned> &CostMap) {
     Loop *OuterLoop = LoopList[OuterLoopId];
     Loop *InnerLoop = LoopList[InnerLoopId];
@@ -605,7 +638,7 @@ struct LoopInterchange {
     LLVM_DEBUG(dbgs() << "Loops are legal to interchange\n");
     LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE);
     if (!LIP.isProfitable(InnerLoop, OuterLoop, InnerLoopId, OuterLoopId,
-                          DependencyMatrix, IsNegatedVec, CostMap, CC)) {
+                          DependencyMatrix, CostMap, CC)) {
       LLVM_DEBUG(dbgs() << "Interchanging loops not profitable.\n");
       return false;
     }
@@ -1230,57 +1263,39 @@ LoopInterchangeProfitability::isProfitablePerInstrOrderCost() {
   return std::nullopt;
 }
 
-static char flipDirection(char Dir) {
-  switch (Dir) {
-  case '<':
-    return '>';
-  case '>':
-    return '<';
-  case '=':
-  case 'I':
-  case '*':
-    return Dir;
-  default:
-    llvm_unreachable("Unknown direction");
-  }
-}
-
 /// Return true if we can vectorize the loop specified by \p LoopId.
-static bool canVectorize(const CharMatrix &DepMatrix,
-                         const BitVector &IsNegatedVec, unsigned LoopId) {
-  // The loop can be vectorized if there are no negative dependencies. Consider
-  // the dependency of `j` in the following example.
-  //
-  //   Positive: ... = A[i][j]       Negative: ... = A[i][j-1]
-  //             A[i][j-1] = ...               A[i][j] = ...
-  //
-  // In the right case, vectorizing the loop can change the loaded value from
-  // `A[i][j-1]`. At the moment we don't take into account the distance of the
-  // dependency and vector width.
-  // TODO: Considering the dependency distance and the vector width can give a
-  // more accurate result. For example, the following loop can be vectorized if
-  // the vector width is less than or equal to 4 x sizeof(A[0][0]).
+static bool canVectorize(const CharMatrix &DepMatrix, unsigned LoopId) {
   for (unsigned I = 0; I != DepMatrix.size(); I++) {
     char Dir = DepMatrix[I][LoopId];
-    if (IsNegatedVec[I])
-      Dir = flipDirection(Dir);
-    if (Dir != '=' && Dir != 'I' && Dir != '<')
-      return false;
+    char DepType = DepMatrix[I].back();
+    assert((DepType == '<' || DepType == '*') &&
+           "Unexpected element in dependency vector");
+
+    // There are no loop-carried dependencies.
+    if (Dir == '=' || Dir == 'I')
+      continue;
+
+    // If both Dir and DepType are '<', it means that the all dependencies are
+    // lexically forward. Such dependencies don't prevent vectorization.
+    if (Dir == '<' && DepType == '<')
+      continue;
+
+    // We cannot prove that the loop is vectorizable.
+    return false;
   }
   return true;
 }
 
 std::optional<bool> LoopInterchangeProfitability::isProfitableForVectorization(
-    unsigned InnerLoopId, unsigned OuterLoopId, CharMatrix &DepMatrix,
-    const BitVector &IsNegatedVec) {
+    unsigned InnerLoopId, unsigned OuterLoopId, CharMatrix &DepMatrix) {
   // If the outer loop cannot be vectorized, it is not profitable to move this
   // to inner position.
-  if (!canVectorize(DepMatrix, IsNegatedVec, OuterLoopId))
+  if (!canVectorize(DepMatrix, OuterLoopId))
     return false;
 
   // If inner loop cannot be vectorized and outer loop can be then it is
   // profitable to interchange to enable inner loop parallelism.
-  if (!canVectorize(DepMatrix, IsNegatedVec, InnerLoopId))
+  if (!canVectorize(DepMatrix, InnerLoopId))
     return true;
 
   // If both the inner and the outer loop can be vectorized, it is necessary to
@@ -1293,7 +1308,7 @@ std::optional<bool> LoopInterchangeProfitability::isProfitableForVectorization(
 
 bool LoopInterchangeProfitability::isProfitable(
     const Loop *InnerLoop, const Loop *OuterLoop, unsigned InnerLoopId,
-    unsigned OuterLoopId, CharMatrix &DepMatrix, const BitVector &IsNegatedVec,
+    unsigned OuterLoopId, CharMatrix &DepMatrix,
     const DenseMap<const Loop *, unsigned> &CostMap,
     std::unique_ptr<CacheCost> &CC) {
   // isProfitable() is structured to avoid endless loop interchange. If the
@@ -1315,8 +1330,8 @@ bool LoopInterchangeProfitability::isProfitable(
       shouldInterchange = isProfitablePerInstrOrderCost();
       break;
     case RuleTy::ForVectorization:
-      shouldInterchange = isProfitableForVectorization(InnerLoopId, OuterLoopId,
-                                                       DepMatrix, IsNegatedVec);
+      shouldInterchange =
+          isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix);
       break;
     }
 
diff --git a/llvm/test/Transforms/LoopInterchange/profitability-vectorization-heuristic.ll b/llvm/test/Transforms/LoopInterchange/profitability-vectorization-heuristic.ll
index 7108d3adf5d79..210675075bdc7 100644
--- a/llvm/test/Transforms/LoopInterchange/profitability-vectorization-heuristic.ll
+++ b/llvm/test/Transforms/LoopInterchange/profitability-vectorization-heuristic.ll
@@ -5,6 +5,8 @@
 @A = dso_local global [256 x [256 x float]] zeroinitializer
 @B = dso_local global [256 x [256 x float]] zeroinitializer
 @C = dso_local global [256 x [256 x float]] zeroinitializer
+@D = dso_local global [256 x [256 x [256 x float]]] zeroinitializer
+@E = dso_local global [256 x [256 x [256 x float]]] zeroinitializer
 
 ; Check that the below loops are exchanged for vectorization.
 ;
@@ -103,8 +105,9 @@ exit:
 }
 
 ; Check that the below loops are exchanged to allow innermost loop
-; vectorization. We cannot vectorize the j-loop because it has negative
-; distance dependency, but the i-loop can be vectorized.
+; vectorization. We cannot vectorize the j-loop because it has a lexically
+; backward dependency, but the i-loop can be vectorized because all the
+; loop-carried dependencies are lexically forward.
 ;
 ; for (int i = 0; i < 255; i++) {
 ;   for (int j = 1; j < 256; j++) {
@@ -157,3 +160,78 @@ for.i.inc:
 exit:
   ret void
 }
+
+; Check that no interchange is performed for the following loop. The j-loop is
+; vectorizable because all the dependencies are lexically forward. However, at
+; the moment, we don't analyze an execution order between instructions in
+; different BBs, so fail to determine that the j-loop is vectorizable.
+; Therefore, no exchange is performed.
+;
+; for (int i = 0; i < 255; i++) {
+;   for (int j = 0; j < 255; j++) {
+;     for (int k = 0; k < 128; k++) {
+;       E[i][j][k] = D[i+1][j+1][2*k];
+;       if (cond)
+;         D[i][j][k+1] += 1.0;
+;   }
+; }
+
+; CHECK:      --- !Missed
+; CHECK-NEXT: Pass:            loop-interchange
+; CHECK-NEXT: Name:            InterchangeNotProfitable
+; CHECK-NEXT: Function:        multiple_BBs_in_loop
+; CHECK-NEXT: Args:
+; CHECK-NEXT:   - String:          Interchanging loops is not considered to improve cache locality nor vectorization.
+; CHECK:      --- !Missed
+; CHECK-NEXT: Pass:            loop-interchange
+; CHECK-NEXT: Name:            InterchangeNotProfitable
+; CHECK-NEXT: Function:        multiple_BBs_in_loop
+; CHECK-NEXT: Args:
+; CHECK-NEXT:   - String:          Interchanging loops is not considered to improve cache locality nor vectorization.
+define void @multiple_BBs_in_loop() {
+entry:
+  br label %for.i.header
+
+for.i.header:
+  %i = phi i64 [ 0, %entry ], [ %i.inc, %for.i.inc ]
+  %i.inc = add nsw i64 %i, 1
+  br label %for.j.header
+
+for.j.header:
+  %j = phi i64 [ 0, %for.i.header ], [ %j.inc, %for.j.inc ]
+  %j.inc = add nsw i64 %j, 1
+  br label %for.k.body
+
+for.k.body:
+  %k = phi i64 [ 0, %for.j.header ], [ %k.inc, %for.k.inc ]
+  %k.inc = add nsw i64 %k, 1
+  %k.2 = mul nsw i64 %k, 2
+  %d.index = getelementptr nuw inbounds [256 x [256 x [256 x float]]], ptr @D, i64 %i.inc, i64 %j.inc, i64 %k.2
+  %e.index = getelementptr nuw inbounds [256 x [256 x [256 x float]]], ptr @E, i64 %i, i64 %j, i64 %k
+  %d.load = load float, ptr %d.index, align 4
+  store float %d.load, ptr %e.index, align 4
+  %cond = freeze i1 undef
+  br i1 %cond, label %if.then, label %for.k.inc
+
+if.then:
+  %d.index2 = getelementptr nuw inbounds [256 x [256 x [256 x float]]], ptr @D, i64 %i, i64 %j, i64 %k.inc
+  %d.load2 = load float, ptr %d.index2, align 4
+  %add = fadd float %d.load2, 1.0
+  store float %add, ptr %d.index2, align 4
+  br label %for.k.inc
+
+for.k.inc:
+  %cmp.k = icmp eq i64 %k.inc, 128
+  br i1 %cmp.k, label %for.j.inc, label %for.k.body
+
+for.j.inc:
+  %cmp.j = icmp eq i64 %j.inc, 255
+  br i1 %cmp.j, label %for.i.inc, label %for.j.header
+
+for.i.inc:
+  %cmp.i = icmp eq i64 %i.inc, 255
+  br i1 %cmp.i, label %exit, label %for.i.header
+
+exit:
+  ret void
+}

>From ced443bef010736414ce76c26f6cd6386e107574 Mon Sep 17 00:00:00 2001
From: Ryotaro Kasuga <kasuga.ryotaro at fujitsu.com>
Date: Thu, 24 Jul 2025 09:58:24 +0900
Subject: [PATCH 5/7] Fix comments

---
 llvm/lib/Transforms/Scalar/LoopInterchange.cpp | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
index 3452c4ee8f4f8..76c2870dafbee 100644
--- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -70,6 +70,13 @@ namespace {
 
 using LoopVector = SmallVector<Loop *, 8>;
 
+/// A list of direction vectors. Each entry represents a direction vector
+/// corresponding to one or more dependencies existing in the loop nest. The
+/// length of all direction vectors is equal and is N + 1, where N is the depth
+/// of the loop nest. The first N elements give the dependency direction for
+/// each of the N loops. The last one indicates whether this entry is a
+/// forward dependency ('<') or not ('*'). The term "forward" aligns with what
+/// is defined in LoopAccessAnalysis.
 // TODO: Check if we can use a sparse matrix here.
 using CharMatrix = std::vector<std::vector<char>>;
 
@@ -128,8 +135,8 @@ static void printDepMatrix(CharMatrix &DepMatrix) {
   for (auto &Row : DepMatrix) {
     ArrayRef<char> RowRef(Row);
 
-    // Drop the last element because it is a flag indicating whether the row is
-    // "lexically forward", which doesn't affect the legality check.
+    // Drop the last element because it is a flag indicating whether this is a
+    // forward dependency or not, which doesn't affect the legality check.
     for (auto D : RowRef.drop_back())
       LLVM_DEBUG(dbgs() << D << " ");
     LLVM_DEBUG(dbgs() << "\n");
@@ -1387,8 +1394,9 @@ static bool canVectorize(const CharMatrix &DepMatrix, unsigned LoopId) {
     if (Dir == '=' || Dir == 'I')
       continue;
 
-    // If both Dir and DepType are '<', it means that the all dependencies are
-    // lexically forward. Such dependencies don't prevent vectorization.
+    // DepType being '<' means that this direction vector represents a forward
+    // dependency. In principle, a loop with '<' direction can be vectorized in
+    // this case.
     if (Dir == '<' && DepType == '<')
       continue;
 

>From 211be9ec9d71af55e6ad396107c7b493c0453968 Mon Sep 17 00:00:00 2001
From: Ryotaro Kasuga <kasuga.ryotaro at fujitsu.com>
Date: Thu, 24 Jul 2025 09:59:11 +0900
Subject: [PATCH 6/7] Modify forward dependency check

---
 .../lib/Transforms/Scalar/LoopInterchange.cpp | 91 +++++++++++--------
 1 file changed, 55 insertions(+), 36 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
index 76c2870dafbee..c250f1c4ad458 100644
--- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -142,6 +142,23 @@ static void printDepMatrix(CharMatrix &DepMatrix) {
     LLVM_DEBUG(dbgs() << "\n");
   }
 }
+
+static bool inThisOrder(const Instruction *Src, const Instruction *Dst) {
+  assert(Src->getParent() == Dst->getParent() && Src != Dst &&
+         "Expected Src and Dst to be different instructions in the same BB");
+
+  bool FoundSrc = false;
+  for (const Instruction &I : *(Src->getParent())) {
+    if (&I == Src) {
+      FoundSrc = true;
+      continue;
+    }
+    if (&I == Dst)
+      return FoundSrc;
+  }
+
+  llvm_unreachable("Dst not found");
+}
 #endif
 
 static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
@@ -190,16 +207,6 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
   // to an index of DepMatrix at which it is stored.
   StringMap<unsigned> Seen;
 
-  // The i-th element is set iff all dependencies corresponding to the i-th
-  // direction vector in DepMatrix are "lexically forward". The notion
-  // "lexically forward" aligns with what is defined in LAA
-  // (LoopAccessAnalysis).
-  //
-  // We deem a dependence lexically forward if we can prove that the
-  // destination instruction is always executed after the source instruction
-  // within each iteration.
-  BitVector IsForwardFlags;
-
   for (I = MemInstr.begin(), IE = MemInstr.end(); I != IE; ++I) {
     for (J = I, JE = MemInstr.end(); J != JE; ++J) {
       std::vector<char> Dep;
@@ -211,22 +218,11 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
       // Track Output, Flow, and Anti dependencies.
       if (auto D = DI->depends(Src, Dst)) {
         assert(D->isOrdered() && "Expected an output, flow or anti dep.");
-        bool IsForward = true;
-
-        // If Src and Dst are in the same BB, Src is always executed before Dst
-        // in the same loop iteration. If not, we must check whether one BB
-        // dominates the other to determine if Src and Dst are executed in this
-        // order. At the moment, we don't perform such check.
-        if (Src->getParent() != Dst->getParent())
-          IsForward = false;
 
         // If the direction vector is negative, normalize it to
         // make it non-negative.
-        bool Normalized = D->normalize(SE);
-        if (Normalized) {
+        if (D->normalize(SE))
           LLVM_DEBUG(dbgs() << "Negative dependence vector normalized.\n");
-          IsForward = false;
-        }
         LLVM_DEBUG(StringRef DepType =
                        D->isFlow() ? "flow" : D->isAnti() ? "anti" : "output";
                    dbgs() << "Found " << DepType
@@ -264,28 +260,51 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
           Dep.push_back('I');
         }
 
+        // Test whether the dependency is forward or not.
+        bool IsKnownForward = true;
+        if (Src->getParent() != Dst->getParent()) {
+          // In general, when Src and Dst are in different BBs, their execution
+          // order within a single iteration is not guaranteed. Conservatively
+          // treat this case as a non-forward dependency.
+          IsKnownForward = false;
+        } else {
+          // Src and Dst are in the same BB. If they are different
+          // instructions, Src should appear before Dst in the BB as they are
+          // stored to MemInstr in that order.
+          assert((Src == Dst || inThisOrder(Src, Dst)) &&
+                 "Unexpected instructions");
+
+          // If the Dependence object is reversed (due to normalization), it
+          // represents the dependency from Dst to Src, meaning it is a backward
+          // dependency. Otherwise it should be a forward dependency.
+          bool IsReversed = D->getSrc() != Src;
+          if (IsReversed)
+            IsKnownForward = false;
+        }
+
+        // Initialize the last element.
+        Dep.push_back('<');
+
+        // The last element should express the "summary" among one or more
+        // direction vectors whose first N elements are the same (where N is
+        // the depth of the loop nest). Hence we exclude the last element from
+        // the Seen map.
         auto [Ite, Inserted] = Seen.try_emplace(
-            StringRef(Dep.data(), Dep.size()), DepMatrix.size());
+            StringRef(Dep.data(), Dep.size() - 1), DepMatrix.size());
 
         // Make sure we only add unique entries to the dependency matrix.
-        if (Inserted) {
+        if (Inserted)
           DepMatrix.push_back(Dep);
-          IsForwardFlags.push_back(true);
-        }
-        if (!IsForward)
-          IsForwardFlags.reset(Ite->second);
+
+        // If we cannot prove that this dependency is forward, change the last
+        // element of the corresponding entry. Note that the existing entry in
+        // DepMatrix can be modified.
+        if (!IsKnownForward)
+          DepMatrix[Ite->second].back() = '*';
       }
     }
   }
 
-  assert(DepMatrix.size() == IsForwardFlags.size() &&
-         "Dependency matrix and IsForwardVec should have the same size.");
-
-  // If all dependencies corresponding to a direction vector are forward, encode
-  // it to '<', otherwise to '*'.
-  for (unsigned I = 0; I != DepMatrix.size(); I++)
-    DepMatrix[I].push_back(IsForwardFlags[I] ? '<' : '*');
-
   return true;
 }
 

>From c21efda93562d419bec3e40e27901b5b71f1dc17 Mon Sep 17 00:00:00 2001
From: Ryotaro Kasuga <kasuga.ryotaro at fujitsu.com>
Date: Thu, 24 Jul 2025 09:59:22 +0900
Subject: [PATCH 7/7] Revise tests

---
 .../profitability-vectorization-heuristic.ll  | 71 +++++++++----------
 1 file changed, 35 insertions(+), 36 deletions(-)

diff --git a/llvm/test/Transforms/LoopInterchange/profitability-vectorization-heuristic.ll b/llvm/test/Transforms/LoopInterchange/profitability-vectorization-heuristic.ll
index 827bee7607a9f..4194849784054 100644
--- a/llvm/test/Transforms/LoopInterchange/profitability-vectorization-heuristic.ll
+++ b/llvm/test/Transforms/LoopInterchange/profitability-vectorization-heuristic.ll
@@ -5,8 +5,8 @@
 @A = dso_local global [256 x [256 x float]] zeroinitializer
 @B = dso_local global [256 x [256 x float]] zeroinitializer
 @C = dso_local global [256 x [256 x float]] zeroinitializer
-@D = dso_local global [256 x [256 x [256 x float]]] zeroinitializer
-@E = dso_local global [256 x [256 x [256 x float]]] zeroinitializer
+@D = global [256 x [256 x [256 x float]]] zeroinitializer
+@E = global [256 x [256 x [256 x float]]] zeroinitializer
 
 ; Check that the below loops are exchanged for vectorization.
 ;
@@ -107,7 +107,8 @@ exit:
 ; Check that the below loops are exchanged to allow innermost loop
 ; vectorization. We cannot vectorize the j-loop because it has a lexically
 ; backward dependency, but the i-loop can be vectorized because all the
-; loop-carried dependencies are lexically forward.
+; loop-carried dependencies are lexically forward. LoopVectorize currently only
+; vectorizes innermost loops, hence the i-loop is moved to that position.
 ;
 ; for (int i = 0; i < 255; i++) {
 ;   for (int j = 1; j < 256; j++) {
@@ -129,31 +130,31 @@ entry:
 
 for.i.header:
   %i = phi i64 [ 1, %entry ], [ %i.next, %for.i.inc ]
-  %i.inc = add nsw i64 %i, 1
+  %i.inc = add i64 %i, 1
   br label %for.j.body
 
 for.j.body:
   %j = phi i64 [ 1, %for.i.header ], [ %j.next, %for.j.body ]
-  %j.dec = add nsw i64 %j, -1
-  %a.load.index = getelementptr nuw inbounds [256 x [256 x float]], ptr @A, i64 %i, i64 %j.dec
-  %b.index = getelementptr nuw inbounds [256 x [256 x float]], ptr @B, i64 %i, i64 %j
-  %c.load.index = getelementptr nuw inbounds [256 x [256 x float]], ptr @C, i64 %i.inc, i64 %j
-  %c.store.index = getelementptr nuw inbounds [256 x [256 x float]], ptr @C, i64 %i, i64 %j
-  %a = load float, ptr %a.load.index, align 4
-  %b = load float, ptr %b.index, align 4
-  %c0 = load float, ptr %c.load.index, align 4
-  %c1 = load float, ptr %c.store.index, align 4
+  %j.dec = add i64 %j, -1
+  %a.load.index = getelementptr [256 x [256 x float]], ptr @A, i64 0, i64 %i, i64 %j.dec
+  %b.index = getelementptr [256 x [256 x float]], ptr @B, i64 0, i64 %i, i64 %j
+  %c.load.index = getelementptr [256 x [256 x float]], ptr @C, i64 0, i64 %i.inc, i64 %j
+  %c.store.index = getelementptr [256 x [256 x float]], ptr @C, i64 0, i64 %i, i64 %j
+  %a = load float, ptr %a.load.index
+  %b = load float, ptr %b.index
+  %c0 = load float, ptr %c.load.index
+  %c1 = load float, ptr %c.store.index
   %add.0 = fadd float %a, %b
-  %a.store.index = getelementptr nuw inbounds [256 x [256 x float]], ptr @A, i64 %i, i64 %j
-  store float %add.0, ptr %a.store.index, align 4
+  %a.store.index = getelementptr [256 x [256 x float]], ptr @A, i64 0, i64 %i, i64 %j
+  store float %add.0, ptr %a.store.index
   %add.1 = fadd float %c0, %c1
-  store float %add.1, ptr %c.store.index, align 4
-  %j.next = add nuw nsw i64 %j, 1
+  store float %add.1, ptr %c.store.index
+  %j.next = add i64 %j, 1
   %cmp.j = icmp eq i64 %j.next, 256
   br i1 %cmp.j, label %for.i.inc, label %for.j.body
 
 for.i.inc:
-  %i.next = add nuw nsw i64 %i, 1
+  %i.next = add i64 %i, 1
   %cmp.i = icmp eq i64 %i.next, 255
   br i1 %cmp.i, label %exit, label %for.i.header
 
@@ -161,18 +162,18 @@ exit:
   ret void
 }
 
-; Check that no interchange is performed for the following loop. The j-loop is
-; vectorizable because all the dependencies are lexically forward. However, at
-; the moment, we don't analyze an execution order between instructions in
-; different BBs, so fail to determine that the j-loop is vectorizable.
-; Therefore, no exchange is performed.
+; Check that no interchange is performed for the following loop. Interchanging
+; the j-loop and k-loop makes the innermost loop vectorizable, since the j-loop
+; has only forward dependencies. However, at the moment, a loop body consisting
+; of multiple BBs is handled pessimistically. Hence the j-loop isn't moved to
+; the innermost place.
 ;
 ; for (int i = 0; i < 255; i++) {
 ;   for (int j = 0; j < 255; j++) {
 ;     for (int k = 0; k < 128; k++) {
 ;       E[i][j][k] = D[i+1][j+1][2*k];
 ;       if (cond)
-;         D[i][j][k+1] += 1.0;
+;         D[i][j][k+1] = 1.0;
 ;   }
 ; }
 
@@ -194,30 +195,28 @@ entry:
 
 for.i.header:
   %i = phi i64 [ 0, %entry ], [ %i.inc, %for.i.inc ]
-  %i.inc = add nsw i64 %i, 1
+  %i.inc = add i64 %i, 1
   br label %for.j.header
 
 for.j.header:
   %j = phi i64 [ 0, %for.i.header ], [ %j.inc, %for.j.inc ]
-  %j.inc = add nsw i64 %j, 1
+  %j.inc = add i64 %j, 1
   br label %for.k.body
 
 for.k.body:
   %k = phi i64 [ 0, %for.j.header ], [ %k.inc, %for.k.inc ]
-  %k.inc = add nsw i64 %k, 1
-  %k.2 = mul nsw i64 %k, 2
-  %d.index = getelementptr nuw inbounds [256 x [256 x [256 x float]]], ptr @D, i64 %i.inc, i64 %j.inc, i64 %k.2
-  %e.index = getelementptr nuw inbounds [256 x [256 x [256 x float]]], ptr @E, i64 %i, i64 %j, i64 %k
-  %d.load = load float, ptr %d.index, align 4
-  store float %d.load, ptr %e.index, align 4
+  %k.inc = add i64 %k, 1
+  %k.2 = mul i64 %k, 2
+  %d.index = getelementptr [256 x [256 x [256 x float]]], ptr @D, i64 0, i64 %i.inc, i64 %j.inc, i64 %k.2
+  %e.index = getelementptr [256 x [256 x [256 x float]]], ptr @E, i64 0, i64 %i, i64 %j, i64 %k
+  %d.load = load float, ptr %d.index
+  store float %d.load, ptr %e.index
   %cond = freeze i1 undef
   br i1 %cond, label %if.then, label %for.k.inc
 
 if.then:
-  %d.index2 = getelementptr nuw inbounds [256 x [256 x [256 x float]]], ptr @D, i64 %i, i64 %j, i64 %k.inc
-  %d.load2 = load float, ptr %d.index2, align 4
-  %add = fadd float %d.load2, 1.0
-  store float %add, ptr %d.index2, align 4
+  %d.index2 = getelementptr [256 x [256 x [256 x float]]], ptr @D, i64 0, i64 %i, i64 %j, i64 %k.inc
+  store float 1.0, ptr %d.index2
   br label %for.k.inc
 
 for.k.inc:



More information about the llvm-commits mailing list