[llvm] [LoopInterchange] Prevent from undoing its own transformation (PR #127473)

Wed Feb 19 22:42:30 PST 2025

https://github.com/kasuga-fj updated https://github.com/llvm/llvm-project/pull/127473

>From 45a837364233b658ea7fc8820c442e49ac9cfcf5 Mon Sep 17 00:00:00 2001
From: Ryotaro Kasuga <kasuga.ryotaro at fujitsu.com>
Date: Fri, 7 Feb 2025 11:02:37 +0000
Subject: [PATCH 1/2] [LoopInterchange] Stop performing unprofitable
 interchange

LoopInterchange uses the bubble-sort fashion algorithm to sort the
loops, but the comparison function (called isProfitable) didn't satisfy
asymmetry. This means that both isProfitable(a, b) and isProfitable(b,
a) can return true, triggering an unprofitable interchange. This patch
fixes the problem and prevents the interchange from performing
unprofitable transformations.
---
 .../lib/Transforms/Scalar/LoopInterchange.cpp | 138 ++++++++++++------
 .../profitability-redundant-interchange.ll    |  80 ++++++++++
 2 files changed, 174 insertions(+), 44 deletions(-)
 create mode 100644 llvm/test/Transforms/LoopInterchange/profitability-redundant-interchange.ll

diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
index f45d90ff13e14..056b69c4e2104 100644
--- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -356,26 +356,25 @@ class LoopInterchangeLegality {
   SmallVector<PHINode *, 8> InnerLoopInductions;
 };
 
+using CostMapTy = DenseMap<const Loop *, std::pair<unsigned, CacheCostTy>>;
+
 /// LoopInterchangeProfitability checks if it is profitable to interchange the
 /// loop.
 class LoopInterchangeProfitability {
 public:
   LoopInterchangeProfitability(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
-                               OptimizationRemarkEmitter *ORE)
-      : OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {}
+                               OptimizationRemarkEmitter *ORE,
+                               const std::optional<CostMapTy> &CM)
+      : OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE), CostMap(CM) {}
 
   /// Check if the loop interchange is profitable.
   bool isProfitable(const Loop *InnerLoop, const Loop *OuterLoop,
                     unsigned InnerLoopId, unsigned OuterLoopId,
-                    CharMatrix &DepMatrix,
-                    const DenseMap<const Loop *, unsigned> &CostMap,
-                    std::unique_ptr<CacheCost> &CC);
+                    CharMatrix &DepMatrix);
 
 private:
   int getInstrOrderCost();
-  std::optional<bool> isProfitablePerLoopCacheAnalysis(
-      const DenseMap<const Loop *, unsigned> &CostMap,
-      std::unique_ptr<CacheCost> &CC);
+  std::optional<bool> isProfitablePerLoopCacheAnalysis();
   std::optional<bool> isProfitablePerInstrOrderCost();
   std::optional<bool> isProfitableForVectorization(unsigned InnerLoopId,
                                                    unsigned OuterLoopId,
@@ -388,6 +387,8 @@ class LoopInterchangeProfitability {
 
   /// Interface to emit optimization remarks.
   OptimizationRemarkEmitter *ORE;
+
+  const std::optional<CostMapTy> &CostMap;
 };
 
 /// LoopInterchangeTransform interchanges the loop.
@@ -497,11 +498,13 @@ struct LoopInterchange {
     // indicates the loop should be placed as the innermost loop.
     //
     // For the old pass manager CacheCost would be null.
-    DenseMap<const Loop *, unsigned> CostMap;
+    std::optional<CostMapTy> CostMap = std::nullopt;
     if (CC != nullptr) {
+      CostMap = CostMapTy();
       const auto &LoopCosts = CC->getLoopCosts();
       for (unsigned i = 0; i < LoopCosts.size(); i++) {
-        CostMap[LoopCosts[i].first] = i;
+        const auto &Cost = LoopCosts[i];
+        (*CostMap)[Cost.first] = std::make_pair(i, Cost.second);
       }
     }
     // We try to achieve the globally optimal memory access for the loopnest,
@@ -537,7 +540,7 @@ struct LoopInterchange {
   bool processLoop(Loop *InnerLoop, Loop *OuterLoop, unsigned InnerLoopId,
                    unsigned OuterLoopId,
                    std::vector<std::vector<char>> &DependencyMatrix,
-                   const DenseMap<const Loop *, unsigned> &CostMap) {
+                   const std::optional<CostMapTy> &CostMap) {
     LLVM_DEBUG(dbgs() << "Processing InnerLoopId = " << InnerLoopId
                       << " and OuterLoopId = " << OuterLoopId << "\n");
     LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, ORE);
@@ -546,9 +549,9 @@ struct LoopInterchange {
       return false;
     }
     LLVM_DEBUG(dbgs() << "Loops are legal to interchange\n");
-    LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE);
+    LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE, CostMap);
     if (!LIP.isProfitable(InnerLoop, OuterLoop, InnerLoopId, OuterLoopId,
-                          DependencyMatrix, CostMap, CC)) {
+                          DependencyMatrix)) {
       LLVM_DEBUG(dbgs() << "Interchanging loops not profitable.\n");
       return false;
     }
@@ -1127,29 +1130,60 @@ int LoopInterchangeProfitability::getInstrOrderCost() {
 }
 
 std::optional<bool>
-LoopInterchangeProfitability::isProfitablePerLoopCacheAnalysis(
-    const DenseMap<const Loop *, unsigned> &CostMap,
-    std::unique_ptr<CacheCost> &CC) {
+LoopInterchangeProfitability::isProfitablePerLoopCacheAnalysis() {
   // This is the new cost model returned from loop cache analysis.
   // A smaller index means the loop should be placed an outer loop, and vice
   // versa.
-  if (CostMap.contains(InnerLoop) && CostMap.contains(OuterLoop)) {
-    unsigned InnerIndex = 0, OuterIndex = 0;
-    InnerIndex = CostMap.find(InnerLoop)->second;
-    OuterIndex = CostMap.find(OuterLoop)->second;
-    LLVM_DEBUG(dbgs() << "InnerIndex = " << InnerIndex
-                      << ", OuterIndex = " << OuterIndex << "\n");
-    if (InnerIndex < OuterIndex)
-      return std::optional<bool>(true);
-    assert(InnerIndex != OuterIndex && "CostMap should assign unique "
-                                       "numbers to each loop");
-    if (CC->getLoopCost(*OuterLoop) == CC->getLoopCost(*InnerLoop))
-      return std::nullopt;
-    return std::optional<bool>(false);
-  }
-  return std::nullopt;
+  if (!CostMap.has_value())
+    return std::nullopt;
+
+  auto InnerIte = CostMap->find(InnerLoop);
+  auto OuterIte = CostMap->find(OuterLoop);
+  if (InnerIte == CostMap->end() || OuterIte == CostMap->end())
+    return std::nullopt;
+
+  const auto &[InnerIndex, InnerCost] = InnerIte->second;
+  const auto &[OuterIndex, OuterCost] = OuterIte->second;
+  LLVM_DEBUG(dbgs() << "InnerIndex = " << InnerIndex
+                    << ", OuterIndex = " << OuterIndex << "\n");
+  assert(InnerIndex != OuterIndex && "CostMap should assign unique "
+                                     "numbers to each loop");
+
+  if (InnerCost == OuterCost)
+    return std::nullopt;
+
+  return InnerIndex < OuterIndex;
 }
 
+// This function doesn't satisfy transitivity. Consider the following case.
+//
+// ```
+// for (int k = 0; k < N; k++) {
+//   for (int j = 0; j < N; j++) {
+//     for (int i = 0; i < N; i++) {
+//       dst0[i][j][k] += aa[i][j] + bb[i][j] + cc[j][k];
+//       dst1[k][j][i] += dd[i][j] + ee[i][j] + ff[j][k];
+//     }
+//   }
+// }
+//
+// ```
+//
+// The getInstrOrderCost will return the following value.
+//
+//  Outer | Inner | Cost
+// -------+-------+------
+//    k   |   j   |  -2
+//    j   |   i   |  -4
+//    k   |   i   |   0
+//
+// This means that this function says interchanging (k, j) loops and (j, i)
+// loops are profitable, but not (k, i). The root cause of this is that the
+// getInstrOrderCost only see the loops we are checking. We can resolve this if
+// we also consider the order going through other inductions. As for the above
+// case, we can induce that interchanging `k` and `i` is profitable (it is
+// better to move the `k` loop to inner position) by `bb[i][j]` and `cc[j][k]`.
+// However, such accurate calculation is expensive, so that we don't do it.
 std::optional<bool>
 LoopInterchangeProfitability::isProfitablePerInstrOrderCost() {
   // Legacy cost model: this is rough cost estimation algorithm. It counts the
@@ -1184,11 +1218,27 @@ std::optional<bool> LoopInterchangeProfitability::isProfitableForVectorization(
   return std::optional<bool>(!DepMatrix.empty());
 }
 
-bool LoopInterchangeProfitability::isProfitable(
-    const Loop *InnerLoop, const Loop *OuterLoop, unsigned InnerLoopId,
-    unsigned OuterLoopId, CharMatrix &DepMatrix,
-    const DenseMap<const Loop *, unsigned> &CostMap,
-    std::unique_ptr<CacheCost> &CC) {
+// The bubble-sort fashion algorithm is adopted to sort the loop nest, so the
+// comparison function should ideally induce a strict weak ordering required by
+// some standard C++ libraries. In particular, isProfitable should hold the
+// following properties.
+//
+// Asymmetry: If isProfitable(a, b) is true then isProfitable(b, a) is false.
+// Transitivity: If both isProfitable(a, b) and isProfitable(b, c) is true then
+// isProfitable(a, c) is true.
+//
+// The most important thing is not to make unprofitable interchange. From this
+// point of view, asymmetry is important. This is because if both
+// isProfitable(a, b) and isProfitable(b, a) are true, then an unprofitable
+// transformation (one of them) will be performed. On the other hand, a lack of
+// transitivity might cause some optimization opportunities to be lost, but
+// won't trigger an unprofitable one. Moreover, guaranteeing transitivity is
+// expensive. Therefore, isProfitable only holds the asymmetry.
+bool LoopInterchangeProfitability::isProfitable(const Loop *InnerLoop,
+                                                const Loop *OuterLoop,
+                                                unsigned InnerLoopId,
+                                                unsigned OuterLoopId,
+                                                CharMatrix &DepMatrix) {
   // isProfitable() is structured to avoid endless loop interchange.
   // If loop cache analysis could decide the profitability then,
   // profitability check will stop and return the analysis result.
@@ -1197,15 +1247,14 @@ bool LoopInterchangeProfitability::isProfitable(
   // profitable for InstrOrderCost. Likewise, if InstrOrderCost failed to
   // analysis the profitability then only, isProfitableForVectorization
   // will decide.
-  std::optional<bool> shouldInterchange =
-      isProfitablePerLoopCacheAnalysis(CostMap, CC);
-  if (!shouldInterchange.has_value()) {
-    shouldInterchange = isProfitablePerInstrOrderCost();
-    if (!shouldInterchange.has_value())
-      shouldInterchange =
+  std::optional<bool> ShouldInterchange = isProfitablePerLoopCacheAnalysis();
+  if (!ShouldInterchange.has_value()) {
+    ShouldInterchange = isProfitablePerInstrOrderCost();
+    if (!ShouldInterchange.has_value())
+      ShouldInterchange =
           isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix);
   }
-  if (!shouldInterchange.has_value()) {
+  if (!ShouldInterchange.has_value()) {
     ORE->emit([&]() {
       return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable",
                                       InnerLoop->getStartLoc(),
@@ -1214,7 +1263,8 @@ bool LoopInterchangeProfitability::isProfitable(
                 "interchange.";
     });
     return false;
-  } else if (!shouldInterchange.value()) {
+  }
+  if (!ShouldInterchange.value()) {
     ORE->emit([&]() {
       return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable",
                                       InnerLoop->getStartLoc(),
diff --git a/llvm/test/Transforms/LoopInterchange/profitability-redundant-interchange.ll b/llvm/test/Transforms/LoopInterchange/profitability-redundant-interchange.ll
new file mode 100644
index 0000000000000..0e2c5cd60e292
--- /dev/null
+++ b/llvm/test/Transforms/LoopInterchange/profitability-redundant-interchange.ll
@@ -0,0 +1,80 @@
+; RUN: opt < %s -passes=loop-interchange -cache-line-size=1 -pass-remarks-output=%t -disable-output \
+; RUN:      -verify-dom-info -verify-loop-info
+; RUN: FileCheck -input-file %t %s
+
+
+; Test that the same pair of loops are not interchanged twice. This is the case
+; when the cost computed by CacheCost is the same for the loop of `j` and `k`.
+;
+; #define N 4
+; int a[N*N][N*N][N*N];
+; void f() {
+;   for (int i = 0; i < N; i++)
+;     for (int j = 1; j < 2*N; j++)
+;       for (int k = 1; k < 2*N; k++)
+;         a[i][k+1][j-1] -= a[i+N-1][k][j];
+; }
+
+; CHECK:      --- !Passed
+; CHECK-NEXT: Pass:            loop-interchange
+; CHECK-NEXT: Name:            Interchanged
+; CHECK-NEXT: Function:        f
+; CHECK-NEXT: Args:
+; CHECK-NEXT:    - String:          Loop interchanged with enclosing loop.
+; CHECK-NEXT: ...
+; CHECK-NEXT: --- !Missed
+; CHECK-NEXT: Pass:            loop-interchange
+; CHECK-NEXT: Name:            Dependence
+; CHECK-NEXT: Function:        f
+; CHECK-NEXT: Args:
+; CHECK-NEXT:  - String:       Cannot interchange loops due to dependences.
+; CHECK-NEXT: ...
+; CHECK-NEXT: --- !Missed
+; CHECK-NEXT: Pass:            loop-interchange
+; CHECK-NEXT: Name:            InterchangeNotProfitable
+; CHECK-NEXT: Function:        f
+; CHECK-NEXT: Args:
+; CHECK-NEXT:   - String:          Interchanging loops is not considered to improve cache locality nor vectorization.
+; CHECK-NEXT: ...
+
+ at a = dso_local local_unnamed_addr global [16 x [16 x [16 x i32]]] zeroinitializer, align 4
+
+define dso_local void @f() {
+entry:
+  br label %for.cond1.preheader
+
+for.cond1.preheader:
+  %indvars.iv46 = phi i64 [ 0, %entry ], [ %indvars.iv.next47, %for.cond.cleanup3 ]
+  %0 = add nuw nsw i64 %indvars.iv46, 3
+  br label %for.cond5.preheader
+
+for.cond5.preheader:
+  %indvars.iv41 = phi i64 [ 1, %for.cond1.preheader ], [ %indvars.iv.next42, %for.cond.cleanup7 ]
+  %1 = add nsw i64 %indvars.iv41, -1
+  br label %for.body8
+
+for.cond.cleanup3:
+  %indvars.iv.next47 = add nuw nsw i64 %indvars.iv46, 1
+  %exitcond50 = icmp ne i64 %indvars.iv.next47, 4
+  br i1 %exitcond50, label %for.cond1.preheader, label %for.cond.cleanup
+
+for.cond.cleanup7:
+  %indvars.iv.next42 = add nuw nsw i64 %indvars.iv41, 1
+  %exitcond45 = icmp ne i64 %indvars.iv.next42, 8
+  br i1 %exitcond45, label %for.cond5.preheader, label %for.cond.cleanup3
+
+for.body8:
+  %indvars.iv = phi i64 [ 1, %for.cond5.preheader ], [ %indvars.iv.next, %for.body8 ]
+  %arrayidx12 = getelementptr inbounds nuw [16 x [16 x [16 x i32]]], ptr @a, i64 0, i64 %0, i64 %indvars.iv, i64 %indvars.iv41
+  %2 = load i32, ptr %arrayidx12, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %arrayidx20 = getelementptr inbounds [16 x [16 x [16 x i32]]], ptr @a, i64 0, i64 %indvars.iv46, i64 %indvars.iv.next, i64 %1
+  %3 = load i32, ptr %arrayidx20, align 4
+  %sub21 = sub nsw i32 %3, %2
+  store i32 %sub21, ptr %arrayidx20, align 4
+  %exitcond = icmp ne i64 %indvars.iv.next, 8
+  br i1 %exitcond, label %for.body8, label %for.cond.cleanup7
+
+for.cond.cleanup:
+  ret void
+}

>From 10241901d819b0aab36779e58e44e856c172560c Mon Sep 17 00:00:00 2001
From: Ryotaro Kasuga <kasuga.ryotaro at fujitsu.com>
Date: Thu, 20 Feb 2025 06:29:54 +0000
Subject: [PATCH 2/2] Address review comments

---
 .../lib/Transforms/Scalar/LoopInterchange.cpp | 57 ++++---------------
 1 file changed, 11 insertions(+), 46 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
index 056b69c4e2104..5299a207f897e 100644
--- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -388,6 +388,9 @@ class LoopInterchangeProfitability {
   /// Interface to emit optimization remarks.
   OptimizationRemarkEmitter *ORE;
 
+  /// Map each loop to a pair of (index, cost). The index is the best position
+  /// of this loop, according to CacheCostAnalysis. The cost is the actual cost
+  /// as calculated by CacheCostAnalysis.
   const std::optional<CostMapTy> &CostMap;
 };
 
@@ -1149,41 +1152,13 @@ LoopInterchangeProfitability::isProfitablePerLoopCacheAnalysis() {
   assert(InnerIndex != OuterIndex && "CostMap should assign unique "
                                      "numbers to each loop");
 
+  // If the costs are equal, no decision is made. If not, a decision is made
+  // according to the index calculated by CacheCostAnalysis.
   if (InnerCost == OuterCost)
     return std::nullopt;
-
   return InnerIndex < OuterIndex;
 }
 
-// This function doesn't satisfy transitivity. Consider the following case.
-//
-// ```
-// for (int k = 0; k < N; k++) {
-//   for (int j = 0; j < N; j++) {
-//     for (int i = 0; i < N; i++) {
-//       dst0[i][j][k] += aa[i][j] + bb[i][j] + cc[j][k];
-//       dst1[k][j][i] += dd[i][j] + ee[i][j] + ff[j][k];
-//     }
-//   }
-// }
-//
-// ```
-//
-// The getInstrOrderCost will return the following value.
-//
-//  Outer | Inner | Cost
-// -------+-------+------
-//    k   |   j   |  -2
-//    j   |   i   |  -4
-//    k   |   i   |   0
-//
-// This means that this function says interchanging (k, j) loops and (j, i)
-// loops are profitable, but not (k, i). The root cause of this is that the
-// getInstrOrderCost only see the loops we are checking. We can resolve this if
-// we also consider the order going through other inductions. As for the above
-// case, we can induce that interchanging `k` and `i` is profitable (it is
-// better to move the `k` loop to inner position) by `bb[i][j]` and `cc[j][k]`.
-// However, such accurate calculation is expensive, so that we don't do it.
 std::optional<bool>
 LoopInterchangeProfitability::isProfitablePerInstrOrderCost() {
   // Legacy cost model: this is rough cost estimation algorithm. It counts the
@@ -1218,22 +1193,6 @@ std::optional<bool> LoopInterchangeProfitability::isProfitableForVectorization(
   return std::optional<bool>(!DepMatrix.empty());
 }
 
-// The bubble-sort fashion algorithm is adopted to sort the loop nest, so the
-// comparison function should ideally induce a strict weak ordering required by
-// some standard C++ libraries. In particular, isProfitable should hold the
-// following properties.
-//
-// Asymmetry: If isProfitable(a, b) is true then isProfitable(b, a) is false.
-// Transitivity: If both isProfitable(a, b) and isProfitable(b, c) is true then
-// isProfitable(a, c) is true.
-//
-// The most important thing is not to make unprofitable interchange. From this
-// point of view, asymmetry is important. This is because if both
-// isProfitable(a, b) and isProfitable(b, a) are true, then an unprofitable
-// transformation (one of them) will be performed. On the other hand, a lack of
-// transitivity might cause some optimization opportunities to be lost, but
-// won't trigger an unprofitable one. Moreover, guaranteeing transitivity is
-// expensive. Therefore, isProfitable only holds the asymmetry.
 bool LoopInterchangeProfitability::isProfitable(const Loop *InnerLoop,
                                                 const Loop *OuterLoop,
                                                 unsigned InnerLoopId,
@@ -1247,6 +1206,12 @@ bool LoopInterchangeProfitability::isProfitable(const Loop *InnerLoop,
   // profitable for InstrOrderCost. Likewise, if InstrOrderCost failed to
   // analysis the profitability then only, isProfitableForVectorization
   // will decide.
+  //
+  // The bubble sort fashion algorithm is used to sort the loop nest and this
+  // function is utilized like the "comparison function". Once this function
+  // returns true for a loop pair A and B, it must never return true for a pair
+  // B and A. If this happens, it triggers the LoopInterchange to undo its own
+  // transformation.
   std::optional<bool> ShouldInterchange = isProfitablePerLoopCacheAnalysis();
   if (!ShouldInterchange.has_value()) {
     ShouldInterchange = isProfitablePerInstrOrderCost();