[llvm] [LoopInterchange] Improve profitability check for vectorization (PR #133672)

Ryotaro Kasuga via llvm-commits llvm-commits at lists.llvm.org
Fri Jun 6 04:23:47 PDT 2025


https://github.com/kasuga-fj updated https://github.com/llvm/llvm-project/pull/133672

>From b1c4248c4a242698feedb6d69d61d07dbcca407c Mon Sep 17 00:00:00 2001
From: Ryotaro Kasuga <kasuga.ryotaro at fujitsu.com>
Date: Thu, 27 Mar 2025 10:45:26 +0000
Subject: [PATCH 1/4] [LoopInterchange] Improve profitability check for
 vectorization

The vectorization profitability has a process to check whether a given
loop can be vectorized or not. Since the process is conservative, a loop
that can be vectorized may be deemed not to be possible. This can
trigger unnecessary exchanges.
This patch improves the profitability decision by mitigating such
misjudgments. Before this patch, we considered a loop to be vectorizable
only when there are no loop carried dependencies with the IV of the
loop. However, a loop carried dependency doesn't prevent vectorization
if the distance is positive. This patch makes the vectorization check
more accurate by allowing a loop with the positive dependency. Note that
it is difficult to make a complete decision whether a loop can be
vectorized or not. To achieve this, we must check the vector width and
the distance of dependency.
---
 .../lib/Transforms/Scalar/LoopInterchange.cpp | 128 ++++++++++++++----
 .../profitability-vectorization-heuristic.ll  |   8 +-
 2 files changed, 106 insertions(+), 30 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
index 1dccba4cfa7b8..078da53c52b52 100644
--- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -17,8 +17,8 @@
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/StringSet.h"
 #include "llvm/Analysis/DependenceAnalysis.h"
 #include "llvm/Analysis/LoopCacheAnalysis.h"
 #include "llvm/Analysis/LoopInfo.h"
@@ -80,6 +80,21 @@ enum class RuleTy {
   ForVectorization,
 };
 
+/// Store the information about if corresponding direction vector was negated
+/// by normalization or not. This is necessary to restore the original one from
+/// a row of a dependency matrix, because we only manage normalized direction
+/// vectors and duplicate vectors are eliminated. So there may be both original
+/// and negated vectors for a single entry (a row of dependency matrix). E.g.,
+/// if there are two direction vectors `[< =]` and `[> =]`, the later one will
+/// be converted to the same as former one by normalization, so only `[< =]`
+/// would be retained in the final result.
+struct NegatedStatus {
+  bool Original = false;
+  bool Negated = false;
+
+  bool isNonNegativeDir(char Dir) const;
+};
+
 } // end anonymous namespace
 
 // Minimum loop depth supported.
@@ -126,9 +141,10 @@ static void printDepMatrix(CharMatrix &DepMatrix) {
 }
 #endif
 
-static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
-                                     Loop *L, DependenceInfo *DI,
-                                     ScalarEvolution *SE,
+static bool populateDependencyMatrix(CharMatrix &DepMatrix,
+                                     std::vector<NegatedStatus> &NegStatusVec,
+                                     unsigned Level, Loop *L,
+                                     DependenceInfo *DI, ScalarEvolution *SE,
                                      OptimizationRemarkEmitter *ORE) {
   using ValueVector = SmallVector<Value *, 16>;
 
@@ -167,7 +183,9 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
     return false;
   }
   ValueVector::iterator I, IE, J, JE;
-  StringSet<> Seen;
+
+  // Manage all found direction vectors. and map it to the index of DepMatrix.
+  StringMap<unsigned> Seen;
 
   for (I = MemInstr.begin(), IE = MemInstr.end(); I != IE; ++I) {
     for (J = I, JE = MemInstr.end(); J != JE; ++J) {
@@ -182,7 +200,8 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
         assert(D->isOrdered() && "Expected an output, flow or anti dep.");
         // If the direction vector is negative, normalize it to
         // make it non-negative.
-        if (D->normalize(SE))
+        bool Normalized = D->normalize(SE);
+        if (Normalized)
           LLVM_DEBUG(dbgs() << "Negative dependence vector normalized.\n");
         LLVM_DEBUG(StringRef DepType =
                        D->isFlow() ? "flow" : D->isAnti() ? "anti" : "output";
@@ -214,8 +233,17 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
         }
 
         // Make sure we only add unique entries to the dependency matrix.
-        if (Seen.insert(StringRef(Dep.data(), Dep.size())).second)
+        unsigned Index = DepMatrix.size();
+        auto [Ite, Inserted] =
+            Seen.try_emplace(StringRef(Dep.data(), Dep.size()), Index);
+        if (Inserted) {
           DepMatrix.push_back(Dep);
+          NegStatusVec.push_back(NegatedStatus{});
+        } else
+          Index = Ite->second;
+
+        NegatedStatus &Status = NegStatusVec[Index];
+        (Normalized ? Status.Negated : Status.Original) = true;
       }
     }
   }
@@ -400,6 +428,7 @@ class LoopInterchangeProfitability {
   bool isProfitable(const Loop *InnerLoop, const Loop *OuterLoop,
                     unsigned InnerLoopId, unsigned OuterLoopId,
                     CharMatrix &DepMatrix,
+                    const std::vector<NegatedStatus> &NegStatusVec,
                     const DenseMap<const Loop *, unsigned> &CostMap,
                     std::unique_ptr<CacheCost> &CC);
 
@@ -409,9 +438,10 @@ class LoopInterchangeProfitability {
       const DenseMap<const Loop *, unsigned> &CostMap,
       std::unique_ptr<CacheCost> &CC);
   std::optional<bool> isProfitablePerInstrOrderCost();
-  std::optional<bool> isProfitableForVectorization(unsigned InnerLoopId,
-                                                   unsigned OuterLoopId,
-                                                   CharMatrix &DepMatrix);
+  std::optional<bool>
+  isProfitableForVectorization(unsigned InnerLoopId, unsigned OuterLoopId,
+                               CharMatrix &DepMatrix,
+                               const std::vector<NegatedStatus> &NegStatusVec);
   Loop *OuterLoop;
   Loop *InnerLoop;
 
@@ -503,8 +533,9 @@ struct LoopInterchange {
                       << "\n");
 
     CharMatrix DependencyMatrix;
+    std::vector<NegatedStatus> NegStatusVec;
     Loop *OuterMostLoop = *(LoopList.begin());
-    if (!populateDependencyMatrix(DependencyMatrix, LoopNestDepth,
+    if (!populateDependencyMatrix(DependencyMatrix, NegStatusVec, LoopNestDepth,
                                   OuterMostLoop, DI, SE, ORE)) {
       LLVM_DEBUG(dbgs() << "Populating dependency matrix failed\n");
       return false;
@@ -543,8 +574,8 @@ struct LoopInterchange {
     for (unsigned j = SelecLoopId; j > 0; j--) {
       bool ChangedPerIter = false;
       for (unsigned i = SelecLoopId; i > SelecLoopId - j; i--) {
-        bool Interchanged =
-            processLoop(LoopList, i, i - 1, DependencyMatrix, CostMap);
+        bool Interchanged = processLoop(LoopList, i, i - 1, DependencyMatrix,
+                                        NegStatusVec, CostMap);
         ChangedPerIter |= Interchanged;
         Changed |= Interchanged;
       }
@@ -559,6 +590,8 @@ struct LoopInterchange {
   bool processLoop(SmallVectorImpl<Loop *> &LoopList, unsigned InnerLoopId,
                    unsigned OuterLoopId,
                    std::vector<std::vector<char>> &DependencyMatrix,
+
+                   const std::vector<NegatedStatus> &NegStatusVec,
                    const DenseMap<const Loop *, unsigned> &CostMap) {
     Loop *OuterLoop = LoopList[OuterLoopId];
     Loop *InnerLoop = LoopList[InnerLoopId];
@@ -572,7 +605,7 @@ struct LoopInterchange {
     LLVM_DEBUG(dbgs() << "Loops are legal to interchange\n");
     LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE);
     if (!LIP.isProfitable(InnerLoop, OuterLoop, InnerLoopId, OuterLoopId,
-                          DependencyMatrix, CostMap, CC)) {
+                          DependencyMatrix, NegStatusVec, CostMap, CC)) {
       LLVM_DEBUG(dbgs() << "Interchanging loops not profitable.\n");
       return false;
     }
@@ -1197,27 +1230,71 @@ LoopInterchangeProfitability::isProfitablePerInstrOrderCost() {
   return std::nullopt;
 }
 
+static char flipDirection(char Dir) {
+  switch (Dir) {
+  case '<':
+    return '>';
+  case '>':
+    return '<';
+  case '=':
+  case 'I':
+  case '*':
+    return Dir;
+  default:
+    llvm_unreachable("Unknown direction");
+  }
+}
+
+/// Ensure that there are no negative direction dependencies corresponding to \p
+/// Dir.
+bool NegatedStatus::isNonNegativeDir(char Dir) const {
+  assert((Original || Negated) && "Cannot restore the original direction");
+
+  // If both flag is true, it means that there is both as-is and negated
+  // direction. In this case only `=` or `I` don't have negative direction
+  // dependency.
+  if (Original && Negated)
+    return Dir == '=' || Dir == 'I';
+
+  char Restored = Negated ? flipDirection(Dir) : Dir;
+  return Restored == '=' || Restored == 'I' || Restored == '<';
+}
+
 /// Return true if we can vectorize the loop specified by \p LoopId.
-static bool canVectorize(const CharMatrix &DepMatrix, unsigned LoopId) {
+static bool canVectorize(const CharMatrix &DepMatrix,
+                         const std::vector<NegatedStatus> &NegStatusVec,
+                         unsigned LoopId) {
+  // The loop can be vectorized if there are no negative dependencies. Consider
+  // the dependency of `j` in the following example.
+  //
+  //   Positive: ... = A[i][j]       Negative: ... = A[i][j-1]
+  //             A[i][j-1] = ...               A[i][j] = ...
+  //
+  // In the right case, vectorizing the loop can change the loaded value from
+  // `A[i][j-1]`. At the moment we don't take into account the distance of the
+  // dependency and vector width.
+  // TODO: Considering the dependency distance and the vector width can give a
+  // more accurate result. For example, the following loop can be vectorized if
+  // the vector width is less than or equal to 4 x sizeof(A[0][0]).
   for (unsigned I = 0; I != DepMatrix.size(); I++) {
     char Dir = DepMatrix[I][LoopId];
-    if (Dir != 'I' && Dir != '=')
+    if (!NegStatusVec[I].isNonNegativeDir(Dir))
       return false;
   }
   return true;
 }
 
 std::optional<bool> LoopInterchangeProfitability::isProfitableForVectorization(
-    unsigned InnerLoopId, unsigned OuterLoopId, CharMatrix &DepMatrix) {
-  // If the outer loop is not loop independent it is not profitable to move
-  // this to inner position, since doing so would not enable inner loop
-  // parallelism.
-  if (!canVectorize(DepMatrix, OuterLoopId))
+    unsigned InnerLoopId, unsigned OuterLoopId, CharMatrix &DepMatrix,
+    const std::vector<NegatedStatus> &NegStatusVec) {
+  // If the outer loop cannot be vectorized, it is not profitable to move this
+  // to inner position.
+  if (!canVectorize(DepMatrix, NegStatusVec, OuterLoopId))
     return false;
 
-  // If inner loop has dependence and outer loop is loop independent then it is
+  // If inner loop cannot be vectorized and outer loop can be then it is
   // profitable to interchange to enable inner loop parallelism.
-  if (!canVectorize(DepMatrix, InnerLoopId))
+  if (!canVectorize(DepMatrix, NegStatusVec, InnerLoopId))
     return true;
 
   // If both the inner and the outer loop can be vectorized, it is necessary to
@@ -1231,6 +1308,7 @@ std::optional<bool> LoopInterchangeProfitability::isProfitableForVectorization(
 bool LoopInterchangeProfitability::isProfitable(
     const Loop *InnerLoop, const Loop *OuterLoop, unsigned InnerLoopId,
     unsigned OuterLoopId, CharMatrix &DepMatrix,
+    const std::vector<NegatedStatus> &NegStatusVec,
     const DenseMap<const Loop *, unsigned> &CostMap,
     std::unique_ptr<CacheCost> &CC) {
   // isProfitable() is structured to avoid endless loop interchange. If the
@@ -1252,8 +1330,8 @@ bool LoopInterchangeProfitability::isProfitable(
       shouldInterchange = isProfitablePerInstrOrderCost();
       break;
     case RuleTy::ForVectorization:
-      shouldInterchange =
-          isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix);
+      shouldInterchange = isProfitableForVectorization(InnerLoopId, OuterLoopId,
+                                                       DepMatrix, NegStatusVec);
       break;
     }
 
diff --git a/llvm/test/Transforms/LoopInterchange/profitability-vectorization-heuristic.ll b/llvm/test/Transforms/LoopInterchange/profitability-vectorization-heuristic.ll
index 0f5aee582373d..14c2046eebbb4 100644
--- a/llvm/test/Transforms/LoopInterchange/profitability-vectorization-heuristic.ll
+++ b/llvm/test/Transforms/LoopInterchange/profitability-vectorization-heuristic.ll
@@ -64,15 +64,13 @@ exit:
 ;   for (int j = 1; j < 256; j++)
 ;     A[i][j-1] = A[i][j] + B[i][j];
 ;
-; FIXME: These loops are exchanged at this time due to the problem in
-; profitability heuristic calculation for vectorization.
 
-; CHECK:      --- !Passed
+; CHECK:      --- !Missed
 ; CHECK-NEXT: Pass:            loop-interchange
-; CHECK-NEXT: Name:            Interchanged
+; CHECK-NEXT: Name:            InterchangeNotProfitable
 ; CHECK-NEXT: Function:        interchange_unnecesasry_for_vectorization
 ; CHECK-NEXT: Args:
-; CHECK-NEXT:   - String:          Loop interchanged with enclosing loop.
+; CHECK-NEXT:   - String:          Insufficient information to calculate the cost of loop for interchange.
 define void @interchange_unnecesasry_for_vectorization() {
 entry:
   br label %for.i.header

>From 8f4f814b01d2ad5cab1962513adc8bf7deeec012 Mon Sep 17 00:00:00 2001
From: Ryotaro Kasuga <kasuga.ryotaro at fujitsu.com>
Date: Thu, 3 Apr 2025 09:55:13 +0000
Subject: [PATCH 2/4] Handle negated and non negated direction vectors
 separately.

---
 .../lib/Transforms/Scalar/LoopInterchange.cpp | 90 ++++++-------------
 1 file changed, 27 insertions(+), 63 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
index 078da53c52b52..fe33ee33258f1 100644
--- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -19,6 +19,7 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/StringSet.h"
 #include "llvm/Analysis/DependenceAnalysis.h"
 #include "llvm/Analysis/LoopCacheAnalysis.h"
 #include "llvm/Analysis/LoopInfo.h"
@@ -80,21 +81,6 @@ enum class RuleTy {
   ForVectorization,
 };
 
-/// Store the information about if corresponding direction vector was negated
-/// by normalization or not. This is necessary to restore the original one from
-/// a row of a dependency matrix, because we only manage normalized direction
-/// vectors and duplicate vectors are eliminated. So there may be both original
-/// and negated vectors for a single entry (a row of dependency matrix). E.g.,
-/// if there are two direction vectors `[< =]` and `[> =]`, the later one will
-/// be converted to the same as former one by normalization, so only `[< =]`
-/// would be retained in the final result.
-struct NegatedStatus {
-  bool Original = false;
-  bool Negated = false;
-
-  bool isNonNegativeDir(char Dir) const;
-};
-
 } // end anonymous namespace
 
 // Minimum loop depth supported.
@@ -142,9 +128,9 @@ static void printDepMatrix(CharMatrix &DepMatrix) {
 #endif
 
 static bool populateDependencyMatrix(CharMatrix &DepMatrix,
-                                     std::vector<NegatedStatus> &NegStatusVec,
-                                     unsigned Level, Loop *L,
-                                     DependenceInfo *DI, ScalarEvolution *SE,
+                                     BitVector &IsNegatedVec, unsigned Level,
+                                     Loop *L, DependenceInfo *DI,
+                                     ScalarEvolution *SE,
                                      OptimizationRemarkEmitter *ORE) {
   using ValueVector = SmallVector<Value *, 16>;
 
@@ -184,8 +170,8 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix,
   }
   ValueVector::iterator I, IE, J, JE;
 
-  // Manage all found direction vectors. and map it to the index of DepMatrix.
-  StringMap<unsigned> Seen;
+  // Manage all found direction vectors, negated and not negated, separately.
+  StringSet<> Seen[2];
 
   for (I = MemInstr.begin(), IE = MemInstr.end(); I != IE; ++I) {
     for (J = I, JE = MemInstr.end(); J != JE; ++J) {
@@ -233,17 +219,12 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix,
         }
 
         // Make sure we only add unique entries to the dependency matrix.
-        unsigned Index = DepMatrix.size();
-        auto [Ite, Inserted] =
-            Seen.try_emplace(StringRef(Dep.data(), Dep.size()), Index);
-        if (Inserted) {
+        // Negated vectors (due to normalization) are treated as separate from
+        // non negated ones.
+        if (Seen[Normalized].insert(StringRef(Dep.data(), Dep.size())).second) {
           DepMatrix.push_back(Dep);
-          NegStatusVec.push_back(NegatedStatus{});
-        } else
-          Index = Ite->second;
-
-        NegatedStatus &Status = NegStatusVec[Index];
-        (Normalized ? Status.Negated : Status.Original) = true;
+          IsNegatedVec.push_back(Normalized);
+        }
       }
     }
   }
@@ -427,8 +408,7 @@ class LoopInterchangeProfitability {
   /// Check if the loop interchange is profitable.
   bool isProfitable(const Loop *InnerLoop, const Loop *OuterLoop,
                     unsigned InnerLoopId, unsigned OuterLoopId,
-                    CharMatrix &DepMatrix,
-                    const std::vector<NegatedStatus> &NegStatusVec,
+                    CharMatrix &DepMatrix, const BitVector &IsNegatedVec,
                     const DenseMap<const Loop *, unsigned> &CostMap,
                     std::unique_ptr<CacheCost> &CC);
 
@@ -441,7 +421,7 @@ class LoopInterchangeProfitability {
   std::optional<bool>
   isProfitableForVectorization(unsigned InnerLoopId, unsigned OuterLoopId,
                                CharMatrix &DepMatrix,
-                               const std::vector<NegatedStatus> &NegStatusVec);
+                               const BitVector &IsNegatedVec);
   Loop *OuterLoop;
   Loop *InnerLoop;
 
@@ -533,9 +513,9 @@ struct LoopInterchange {
                       << "\n");
 
     CharMatrix DependencyMatrix;
-    std::vector<NegatedStatus> NegStatusVec;
+    BitVector IsNegatedVec;
     Loop *OuterMostLoop = *(LoopList.begin());
-    if (!populateDependencyMatrix(DependencyMatrix, NegStatusVec, LoopNestDepth,
+    if (!populateDependencyMatrix(DependencyMatrix, IsNegatedVec, LoopNestDepth,
                                   OuterMostLoop, DI, SE, ORE)) {
       LLVM_DEBUG(dbgs() << "Populating dependency matrix failed\n");
       return false;
@@ -575,7 +555,7 @@ struct LoopInterchange {
       bool ChangedPerIter = false;
       for (unsigned i = SelecLoopId; i > SelecLoopId - j; i--) {
         bool Interchanged = processLoop(LoopList, i, i - 1, DependencyMatrix,
-                                        NegStatusVec, CostMap);
+                                        IsNegatedVec, CostMap);
         ChangedPerIter |= Interchanged;
         Changed |= Interchanged;
       }
@@ -590,8 +570,7 @@ struct LoopInterchange {
   bool processLoop(SmallVectorImpl<Loop *> &LoopList, unsigned InnerLoopId,
                    unsigned OuterLoopId,
                    std::vector<std::vector<char>> &DependencyMatrix,
-
-                   const std::vector<NegatedStatus> &NegStatusVec,
+                   BitVector &IsNegatedVec,
                    const DenseMap<const Loop *, unsigned> &CostMap) {
     Loop *OuterLoop = LoopList[OuterLoopId];
     Loop *InnerLoop = LoopList[InnerLoopId];
@@ -605,7 +584,7 @@ struct LoopInterchange {
     LLVM_DEBUG(dbgs() << "Loops are legal to interchange\n");
     LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE);
     if (!LIP.isProfitable(InnerLoop, OuterLoop, InnerLoopId, OuterLoopId,
-                          DependencyMatrix, NegStatusVec, CostMap, CC)) {
+                          DependencyMatrix, IsNegatedVec, CostMap, CC)) {
       LLVM_DEBUG(dbgs() << "Interchanging loops not profitable.\n");
       return false;
     }
@@ -1245,25 +1224,9 @@ static char flipDirection(char Dir) {
   }
 }
 
-/// Ensure that there are no negative direction dependencies corresponding to \p
-/// Dir.
-bool NegatedStatus::isNonNegativeDir(char Dir) const {
-  assert((Original || Negated) && "Cannot restore the original direction");
-
-  // If both flag is true, it means that there is both as-is and negated
-  // direction. In this case only `=` or `I` don't have negative direction
-  // dependency.
-  if (Original && Negated)
-    return Dir == '=' || Dir == 'I';
-
-  char Restored = Negated ? flipDirection(Dir) : Dir;
-  return Restored == '=' || Restored == 'I' || Restored == '<';
-}
-
 /// Return true if we can vectorize the loop specified by \p LoopId.
 static bool canVectorize(const CharMatrix &DepMatrix,
-                         const std::vector<NegatedStatus> &NegStatusVec,
-                         unsigned LoopId) {
+                         const BitVector &IsNegatedVec, unsigned LoopId) {
   // The loop can be vectorized if there are no negative dependencies. Consider
   // the dependency of `j` in the following example.
   //
@@ -1278,7 +1241,9 @@ static bool canVectorize(const CharMatrix &DepMatrix,
   // the vector width is less than or equal to 4 x sizeof(A[0][0]).
   for (unsigned I = 0; I != DepMatrix.size(); I++) {
     char Dir = DepMatrix[I][LoopId];
-    if (!NegStatusVec[I].isNonNegativeDir(Dir))
+    if (IsNegatedVec[I])
+      Dir = flipDirection(Dir);
+    if (Dir != '=' && Dir != 'I' && Dir != '<')
       return false;
   }
   return true;
@@ -1286,15 +1251,15 @@ static bool canVectorize(const CharMatrix &DepMatrix,
 
 std::optional<bool> LoopInterchangeProfitability::isProfitableForVectorization(
     unsigned InnerLoopId, unsigned OuterLoopId, CharMatrix &DepMatrix,
-    const std::vector<NegatedStatus> &NegStatusVec) {
+    const BitVector &IsNegatedVec) {
   // If the outer loop cannot be vectorized, it is not profitable to move this
   // to inner position.
-  if (!canVectorize(DepMatrix, NegStatusVec, OuterLoopId))
+  if (!canVectorize(DepMatrix, IsNegatedVec, OuterLoopId))
     return false;
 
   // If inner loop cannot be vectorized and outer loop can be then it is
   // profitable to interchange to enable inner loop parallelism.
-  if (!canVectorize(DepMatrix, NegStatusVec, InnerLoopId))
+  if (!canVectorize(DepMatrix, IsNegatedVec, InnerLoopId))
     return true;
 
   // If both the inner and the outer loop can be vectorized, it is necessary to
@@ -1307,8 +1272,7 @@ std::optional<bool> LoopInterchangeProfitability::isProfitableForVectorization(
 
 bool LoopInterchangeProfitability::isProfitable(
     const Loop *InnerLoop, const Loop *OuterLoop, unsigned InnerLoopId,
-    unsigned OuterLoopId, CharMatrix &DepMatrix,
-    const std::vector<NegatedStatus> &NegStatusVec,
+    unsigned OuterLoopId, CharMatrix &DepMatrix, const BitVector &IsNegatedVec,
     const DenseMap<const Loop *, unsigned> &CostMap,
     std::unique_ptr<CacheCost> &CC) {
   // isProfitable() is structured to avoid endless loop interchange. If the
@@ -1331,7 +1295,7 @@ bool LoopInterchangeProfitability::isProfitable(
       break;
     case RuleTy::ForVectorization:
       shouldInterchange = isProfitableForVectorization(InnerLoopId, OuterLoopId,
-                                                       DepMatrix, NegStatusVec);
+                                                       DepMatrix, IsNegatedVec);
       break;
     }
 

>From cad4db91a1c86941a4eabf17a9accc0df3ec65f2 Mon Sep 17 00:00:00 2001
From: Ryotaro Kasuga <kasuga.ryotaro at fujitsu.com>
Date: Tue, 8 Apr 2025 14:58:57 +0000
Subject: [PATCH 3/4] Add test that has positive dependencies

---
 .../profitability-vectorization-heuristic.ll  | 56 +++++++++++++++++++
 1 file changed, 56 insertions(+)

diff --git a/llvm/test/Transforms/LoopInterchange/profitability-vectorization-heuristic.ll b/llvm/test/Transforms/LoopInterchange/profitability-vectorization-heuristic.ll
index 14c2046eebbb4..7108d3adf5d79 100644
--- a/llvm/test/Transforms/LoopInterchange/profitability-vectorization-heuristic.ll
+++ b/llvm/test/Transforms/LoopInterchange/profitability-vectorization-heuristic.ll
@@ -101,3 +101,59 @@ for.i.inc:
 exit:
   ret void
 }
+
+; Check that the below loops are exchanged to allow innermost loop
+; vectorization. We cannot vectorize the j-loop because it has negative
+; distance dependency, but the i-loop can be vectorized.
+;
+; for (int i = 0; i < 255; i++) {
+;   for (int j = 1; j < 256; j++) {
+;     A[i][j] = A[i][j-1] + B[i][j];
+;     C[i][j] += C[i+1][j];
+;   }
+; }
+;
+
+; CHECK:      --- !Passed
+; CHECK-NEXT: Pass:            loop-interchange
+; CHECK-NEXT: Name:            Interchanged
+; CHECK-NEXT: Function:        interchange_necessary_for_vectorization2
+; CHECK-NEXT: Args:
+; CHECK-NEXT:   - String:          Loop interchanged with enclosing loop.
+define void @interchange_necessary_for_vectorization2() {
+entry:
+  br label %for.i.header
+
+for.i.header:
+  %i = phi i64 [ 1, %entry ], [ %i.next, %for.i.inc ]
+  %i.inc = add nsw i64 %i, 1
+  br label %for.j.body
+
+for.j.body:
+  %j = phi i64 [ 1, %for.i.header ], [ %j.next, %for.j.body ]
+  %j.dec = add nsw i64 %j, -1
+  %a.load.index = getelementptr nuw inbounds [256 x [256 x float]], ptr @A, i64 %i, i64 %j.dec
+  %b.index = getelementptr nuw inbounds [256 x [256 x float]], ptr @B, i64 %i, i64 %j
+  %c.load.index = getelementptr nuw inbounds [256 x [256 x float]], ptr @C, i64 %i.inc, i64 %j
+  %c.store.index = getelementptr nuw inbounds [256 x [256 x float]], ptr @C, i64 %i, i64 %j
+  %a = load float, ptr %a.load.index, align 4
+  %b = load float, ptr %b.index, align 4
+  %c0 = load float, ptr %c.load.index, align 4
+  %c1 = load float, ptr %c.store.index, align 4
+  %add.0 = fadd float %a, %b
+  %a.store.index = getelementptr nuw inbounds [256 x [256 x float]], ptr @A, i64 %i, i64 %j
+  store float %add.0, ptr %a.store.index, align 4
+  %add.1 = fadd float %c0, %c1
+  store float %add.1, ptr %c.store.index, align 4
+  %j.next = add nuw nsw i64 %j, 1
+  %cmp.j = icmp eq i64 %j.next, 256
+  br i1 %cmp.j, label %for.i.inc, label %for.j.body
+
+for.i.inc:
+  %i.next = add nuw nsw i64 %i, 1
+  %cmp.i = icmp eq i64 %i.next, 255
+  br i1 %cmp.i, label %exit, label %for.i.header
+
+exit:
+  ret void
+}

>From 6a0a86839357ab9e46e7152cff9c246244625f12 Mon Sep 17 00:00:00 2001
From: Ryotaro Kasuga <kasuga.ryotaro at fujitsu.com>
Date: Fri, 6 Jun 2025 08:41:40 +0000
Subject: [PATCH 4/4] Add "lexically forward" flag for vectorization
 profitability check

---
 .../lib/Transforms/Scalar/LoopInterchange.cpp | 145 ++++++++++--------
 .../profitability-vectorization-heuristic.ll  |  82 +++++++++-
 2 files changed, 160 insertions(+), 67 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
index da49da0bcc29d..63eef0eafb99d 100644
--- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -19,7 +19,6 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/StringSet.h"
 #include "llvm/Analysis/DependenceAnalysis.h"
 #include "llvm/Analysis/LoopCacheAnalysis.h"
 #include "llvm/Analysis/LoopInfo.h"
@@ -120,15 +119,18 @@ static bool noDuplicateRules(ArrayRef<RuleTy> Rules) {
 
 static void printDepMatrix(CharMatrix &DepMatrix) {
   for (auto &Row : DepMatrix) {
-    for (auto D : Row)
+    ArrayRef<char> RowRef(Row);
+
+    // Drop the last element because it is a flag indicating whether the row is
+    // "lexically forward", which doesn't affect the legality check.
+    for (auto D : RowRef.drop_back())
       LLVM_DEBUG(dbgs() << D << " ");
     LLVM_DEBUG(dbgs() << "\n");
   }
 }
 #endif
 
-static bool populateDependencyMatrix(CharMatrix &DepMatrix,
-                                     BitVector &IsNegatedVec, unsigned Level,
+static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
                                      Loop *L, DependenceInfo *DI,
                                      ScalarEvolution *SE,
                                      OptimizationRemarkEmitter *ORE) {
@@ -170,8 +172,19 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix,
   }
   ValueVector::iterator I, IE, J, JE;
 
-  // Manage all found direction vectors, negated and not negated, separately.
-  StringSet<> Seen[2];
+  // Manage direction vectors that are already seen. Map each direction vector
+  // to an index of DepMatrix at which it is stored.
+  StringMap<unsigned> Seen;
+
+  // The i-th element is set iff all dependencies corresponding to the i-th
+  // direction vector in DepMatrix are "lexically forward". The notion
+  // "lexically forward" aligns with what is defined in LAA
+  // (LoopAccessAnalysis).
+  //
+  // We deem a dependence lexically forward if we can prove that the
+  // destination instruction is always executed after the source instruction
+  // within each iteration.
+  BitVector IsForwardFlags;
 
   for (I = MemInstr.begin(), IE = MemInstr.end(); I != IE; ++I) {
     for (J = I, JE = MemInstr.end(); J != JE; ++J) {
@@ -184,11 +197,22 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix,
       // Track Output, Flow, and Anti dependencies.
       if (auto D = DI->depends(Src, Dst)) {
         assert(D->isOrdered() && "Expected an output, flow or anti dep.");
+        bool IsForward = true;
+
+        // If Src and Dst are in the same BB, Src is always executed before Dst
+        // in the same loop iteration. If not, we must check whether one BB
+        // dominates the other to determine if Src and Dst are executed in this
+        // order. At the moment, we don't perform such check.
+        if (Src->getParent() != Dst->getParent())
+          IsForward = false;
+
         // If the direction vector is negative, normalize it to
         // make it non-negative.
         bool Normalized = D->normalize(SE);
-        if (Normalized)
+        if (Normalized) {
           LLVM_DEBUG(dbgs() << "Negative dependence vector normalized.\n");
+          IsForward = false;
+        }
         LLVM_DEBUG(StringRef DepType =
                        D->isFlow() ? "flow" : D->isAnti() ? "anti" : "output";
                    dbgs() << "Found " << DepType
@@ -226,17 +250,28 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix,
           Dep.push_back('I');
         }
 
+        auto [Ite, Inserted] = Seen.try_emplace(
+            StringRef(Dep.data(), Dep.size()), DepMatrix.size());
+
         // Make sure we only add unique entries to the dependency matrix.
-        // Negated vectors (due to normalization) are treated as separate from
-        // non negated ones.
-        if (Seen[Normalized].insert(StringRef(Dep.data(), Dep.size())).second) {
+        if (Inserted) {
           DepMatrix.push_back(Dep);
-          IsNegatedVec.push_back(Normalized);
+          IsForwardFlags.push_back(true);
         }
+        if (!IsForward)
+          IsForwardFlags.reset(Ite->second);
       }
     }
   }
 
+  assert(DepMatrix.size() == IsForwardFlags.size() &&
+         "Dependency matrix and IsForwardVec should have the same size.");
+
+  // If all dependencies corresponding to a direction vector are forward, encode
+  // it to '<', otherwise to '*'.
+  for (unsigned I = 0; I != DepMatrix.size(); I++)
+    DepMatrix[I].push_back(IsForwardFlags[I] ? '<' : '*');
+
   return true;
 }
 
@@ -285,11 +320,12 @@ static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix,
       continue;
 
     // Check if the direction vector is lexicographically positive (or zero)
-    // for both before/after exchanged.
-    if (isLexicographicallyPositive(Cur, OuterLoopId, Cur.size()) == false)
+    // for both before/after exchanged. Ignore the last element because it
+    // doesn't affect the legality.
+    if (isLexicographicallyPositive(Cur, OuterLoopId, Cur.size() - 1) == false)
       return false;
     std::swap(Cur[InnerLoopId], Cur[OuterLoopId]);
-    if (isLexicographicallyPositive(Cur, OuterLoopId, Cur.size()) == false)
+    if (isLexicographicallyPositive(Cur, OuterLoopId, Cur.size() - 1) == false)
       return false;
   }
   return true;
@@ -429,7 +465,7 @@ class LoopInterchangeProfitability {
   /// Check if the loop interchange is profitable.
   bool isProfitable(const Loop *InnerLoop, const Loop *OuterLoop,
                     unsigned InnerLoopId, unsigned OuterLoopId,
-                    CharMatrix &DepMatrix, const BitVector &IsNegatedVec,
+                    CharMatrix &DepMatrix,
                     const DenseMap<const Loop *, unsigned> &CostMap,
                     std::unique_ptr<CacheCost> &CC);
 
@@ -439,10 +475,9 @@ class LoopInterchangeProfitability {
       const DenseMap<const Loop *, unsigned> &CostMap,
       std::unique_ptr<CacheCost> &CC);
   std::optional<bool> isProfitablePerInstrOrderCost();
-  std::optional<bool>
-  isProfitableForVectorization(unsigned InnerLoopId, unsigned OuterLoopId,
-                               CharMatrix &DepMatrix,
-                               const BitVector &IsNegatedVec);
+  std::optional<bool> isProfitableForVectorization(unsigned InnerLoopId,
+                                                   unsigned OuterLoopId,
+                                                   CharMatrix &DepMatrix);
   Loop *OuterLoop;
   Loop *InnerLoop;
 
@@ -534,9 +569,8 @@ struct LoopInterchange {
                       << "\n");
 
     CharMatrix DependencyMatrix;
-    BitVector IsNegatedVec;
     Loop *OuterMostLoop = *(LoopList.begin());
-    if (!populateDependencyMatrix(DependencyMatrix, IsNegatedVec, LoopNestDepth,
+    if (!populateDependencyMatrix(DependencyMatrix, LoopNestDepth,
                                   OuterMostLoop, DI, SE, ORE)) {
       LLVM_DEBUG(dbgs() << "Populating dependency matrix failed\n");
       return false;
@@ -575,8 +609,8 @@ struct LoopInterchange {
     for (unsigned j = SelecLoopId; j > 0; j--) {
       bool ChangedPerIter = false;
       for (unsigned i = SelecLoopId; i > SelecLoopId - j; i--) {
-        bool Interchanged = processLoop(LoopList, i, i - 1, DependencyMatrix,
-                                        IsNegatedVec, CostMap);
+        bool Interchanged =
+            processLoop(LoopList, i, i - 1, DependencyMatrix, CostMap);
         ChangedPerIter |= Interchanged;
         Changed |= Interchanged;
       }
@@ -591,7 +625,6 @@ struct LoopInterchange {
   bool processLoop(SmallVectorImpl<Loop *> &LoopList, unsigned InnerLoopId,
                    unsigned OuterLoopId,
                    std::vector<std::vector<char>> &DependencyMatrix,
-                   BitVector &IsNegatedVec,
                    const DenseMap<const Loop *, unsigned> &CostMap) {
     Loop *OuterLoop = LoopList[OuterLoopId];
     Loop *InnerLoop = LoopList[InnerLoopId];
@@ -605,7 +638,7 @@ struct LoopInterchange {
     LLVM_DEBUG(dbgs() << "Loops are legal to interchange\n");
     LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE);
     if (!LIP.isProfitable(InnerLoop, OuterLoop, InnerLoopId, OuterLoopId,
-                          DependencyMatrix, IsNegatedVec, CostMap, CC)) {
+                          DependencyMatrix, CostMap, CC)) {
       LLVM_DEBUG(dbgs() << "Interchanging loops not profitable.\n");
       return false;
     }
@@ -1230,57 +1263,39 @@ LoopInterchangeProfitability::isProfitablePerInstrOrderCost() {
   return std::nullopt;
 }
 
-static char flipDirection(char Dir) {
-  switch (Dir) {
-  case '<':
-    return '>';
-  case '>':
-    return '<';
-  case '=':
-  case 'I':
-  case '*':
-    return Dir;
-  default:
-    llvm_unreachable("Unknown direction");
-  }
-}
-
 /// Return true if we can vectorize the loop specified by \p LoopId.
-static bool canVectorize(const CharMatrix &DepMatrix,
-                         const BitVector &IsNegatedVec, unsigned LoopId) {
-  // The loop can be vectorized if there are no negative dependencies. Consider
-  // the dependency of `j` in the following example.
-  //
-  //   Positive: ... = A[i][j]       Negative: ... = A[i][j-1]
-  //             A[i][j-1] = ...               A[i][j] = ...
-  //
-  // In the right case, vectorizing the loop can change the loaded value from
-  // `A[i][j-1]`. At the moment we don't take into account the distance of the
-  // dependency and vector width.
-  // TODO: Considering the dependency distance and the vector width can give a
-  // more accurate result. For example, the following loop can be vectorized if
-  // the vector width is less than or equal to 4 x sizeof(A[0][0]).
+static bool canVectorize(const CharMatrix &DepMatrix, unsigned LoopId) {
   for (unsigned I = 0; I != DepMatrix.size(); I++) {
     char Dir = DepMatrix[I][LoopId];
-    if (IsNegatedVec[I])
-      Dir = flipDirection(Dir);
-    if (Dir != '=' && Dir != 'I' && Dir != '<')
-      return false;
+    char DepType = DepMatrix[I].back();
+    assert((DepType == '<' || DepType == '*') &&
+           "Unexpected element in dependency vector");
+
+    // There are no loop-carried dependencies.
+    if (Dir == '=' || Dir == 'I')
+      continue;
+
+    // If both Dir and DepType are '<', it means that the all dependencies are
+    // lexically forward. Such dependencies don't prevent vectorization.
+    if (Dir == '<' && DepType == '<')
+      continue;
+
+    // We cannot prove that the loop is vectorizable.
+    return false;
   }
   return true;
 }
 
 std::optional<bool> LoopInterchangeProfitability::isProfitableForVectorization(
-    unsigned InnerLoopId, unsigned OuterLoopId, CharMatrix &DepMatrix,
-    const BitVector &IsNegatedVec) {
+    unsigned InnerLoopId, unsigned OuterLoopId, CharMatrix &DepMatrix) {
   // If the outer loop cannot be vectorized, it is not profitable to move this
   // to inner position.
-  if (!canVectorize(DepMatrix, IsNegatedVec, OuterLoopId))
+  if (!canVectorize(DepMatrix, OuterLoopId))
     return false;
 
   // If inner loop cannot be vectorized and outer loop can be then it is
   // profitable to interchange to enable inner loop parallelism.
-  if (!canVectorize(DepMatrix, IsNegatedVec, InnerLoopId))
+  if (!canVectorize(DepMatrix, InnerLoopId))
     return true;
 
   // If both the inner and the outer loop can be vectorized, it is necessary to
@@ -1293,7 +1308,7 @@ std::optional<bool> LoopInterchangeProfitability::isProfitableForVectorization(
 
 bool LoopInterchangeProfitability::isProfitable(
     const Loop *InnerLoop, const Loop *OuterLoop, unsigned InnerLoopId,
-    unsigned OuterLoopId, CharMatrix &DepMatrix, const BitVector &IsNegatedVec,
+    unsigned OuterLoopId, CharMatrix &DepMatrix,
     const DenseMap<const Loop *, unsigned> &CostMap,
     std::unique_ptr<CacheCost> &CC) {
   // isProfitable() is structured to avoid endless loop interchange. If the
@@ -1315,8 +1330,8 @@ bool LoopInterchangeProfitability::isProfitable(
       shouldInterchange = isProfitablePerInstrOrderCost();
       break;
     case RuleTy::ForVectorization:
-      shouldInterchange = isProfitableForVectorization(InnerLoopId, OuterLoopId,
-                                                       DepMatrix, IsNegatedVec);
+      shouldInterchange =
+          isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix);
       break;
     }
 
diff --git a/llvm/test/Transforms/LoopInterchange/profitability-vectorization-heuristic.ll b/llvm/test/Transforms/LoopInterchange/profitability-vectorization-heuristic.ll
index 7108d3adf5d79..210675075bdc7 100644
--- a/llvm/test/Transforms/LoopInterchange/profitability-vectorization-heuristic.ll
+++ b/llvm/test/Transforms/LoopInterchange/profitability-vectorization-heuristic.ll
@@ -5,6 +5,8 @@
 @A = dso_local global [256 x [256 x float]] zeroinitializer
 @B = dso_local global [256 x [256 x float]] zeroinitializer
 @C = dso_local global [256 x [256 x float]] zeroinitializer
+ at D = dso_local global [256 x [256 x [256 x float]]] zeroinitializer
+ at E = dso_local global [256 x [256 x [256 x float]]] zeroinitializer
 
 ; Check that the below loops are exchanged for vectorization.
 ;
@@ -103,8 +105,9 @@ exit:
 }
 
 ; Check that the below loops are exchanged to allow innermost loop
-; vectorization. We cannot vectorize the j-loop because it has negative
-; distance dependency, but the i-loop can be vectorized.
+; vectorization. We cannot vectorize the j-loop because it has a lexically
+; backward dependency, but the i-loop can be vectorized because all the
+; loop-carried dependencies are lexically forward.
 ;
 ; for (int i = 0; i < 255; i++) {
 ;   for (int j = 1; j < 256; j++) {
@@ -157,3 +160,78 @@ for.i.inc:
 exit:
   ret void
 }
+
+; Check that no interchange is performed for the following loop. The j-loop is
+; vectorizable because all the dependencies are lexically forward. However, at
+; the moment, we don't analyze an execution order between instructions in
+; different BBs, so fail to determine that the j-loop is vectorizable.
+; Therefore, no exchange is performed.
+;
+; for (int i = 0; i < 255; i++) {
+;   for (int j = 0; j < 255; j++) {
+;     for (int k = 0; k < 128; k++) {
+;       E[i][j][k] = D[i+1][j+1][2*k];
+;       if (cond)
+;         D[i][j][k+1] += 1.0;
+;   }
+; }
+
+; CHECK:      --- !Missed
+; CHECK-NEXT: Pass:            loop-interchange
+; CHECK-NEXT: Name:            InterchangeNotProfitable
+; CHECK-NEXT: Function:        multiple_BBs_in_loop
+; CHECK-NEXT: Args:
+; CHECK-NEXT:   - String:          Interchanging loops is not considered to improve cache locality nor vectorization.
+; CHECK:      --- !Missed
+; CHECK-NEXT: Pass:            loop-interchange
+; CHECK-NEXT: Name:            InterchangeNotProfitable
+; CHECK-NEXT: Function:        multiple_BBs_in_loop
+; CHECK-NEXT: Args:
+; CHECK-NEXT:   - String:          Interchanging loops is not considered to improve cache locality nor vectorization.
+define void @multiple_BBs_in_loop() {
+entry:
+  br label %for.i.header
+
+for.i.header:
+  %i = phi i64 [ 0, %entry ], [ %i.inc, %for.i.inc ]
+  %i.inc = add nsw i64 %i, 1
+  br label %for.j.header
+
+for.j.header:
+  %j = phi i64 [ 0, %for.i.header ], [ %j.inc, %for.j.inc ]
+  %j.inc = add nsw i64 %j, 1
+  br label %for.k.body
+
+for.k.body:
+  %k = phi i64 [ 0, %for.j.header ], [ %k.inc, %for.k.inc ]
+  %k.inc = add nsw i64 %k, 1
+  %k.2 = mul nsw i64 %k, 2
+  %d.index = getelementptr nuw inbounds [256 x [256 x [256 x float]]], ptr @D, i64 %i.inc, i64 %j.inc, i64 %k.2
+  %e.index = getelementptr nuw inbounds [256 x [256 x [256 x float]]], ptr @E, i64 %i, i64 %j, i64 %k
+  %d.load = load float, ptr %d.index, align 4
+  store float %d.load, ptr %e.index, align 4
+  %cond = freeze i1 undef
+  br i1 %cond, label %if.then, label %for.k.inc
+
+if.then:
+  %d.index2 = getelementptr nuw inbounds [256 x [256 x [256 x float]]], ptr @D, i64 %i, i64 %j, i64 %k.inc
+  %d.load2 = load float, ptr %d.index2, align 4
+  %add = fadd float %d.load2, 1.0
+  store float %add, ptr %d.index2, align 4
+  br label %for.k.inc
+
+for.k.inc:
+  %cmp.k = icmp eq i64 %k.inc, 128
+  br i1 %cmp.k, label %for.j.inc, label %for.k.body
+
+for.j.inc:
+  %cmp.j = icmp eq i64 %j.inc, 255
+  br i1 %cmp.j, label %for.i.inc, label %for.j.header
+
+for.i.inc:
+  %cmp.i = icmp eq i64 %i.inc, 255
+  br i1 %cmp.i, label %exit, label %for.i.header
+
+exit:
+  ret void
+}



More information about the llvm-commits mailing list