[llvm] ee7188c - [LoopInterchange] Correcting the profitability check

Mon Jan 16 11:36:41 PST 2023

Author: Ram-NK
Date: 2023-01-16T14:36:06-05:00
New Revision: ee7188c8b2ab06e740470cee368b137e0781958c

URL: https://github.com/llvm/llvm-project/commit/ee7188c8b2ab06e740470cee368b137e0781958c
DIFF: https://github.com/llvm/llvm-project/commit/ee7188c8b2ab06e740470cee368b137e0781958c.diff

LOG: [LoopInterchange] Correcting the profitability check

Before D135808, There would be endless loop interchange posibility (no
proper priority was there in profitability check. Any profitable check
may leads to loop-interchange). With this patch, there  is no endless
interchange (priority in profitable check is defined. Order of decision
is 'Cache cost' check, 'InstrOrderCost', 'Vectorization'). Corrected the
dependency checking inside isProfitableForVectorization(), corrected the
checking of bad order loops in isProfitablePerInstrOrderCost().

Reviewed By: Meinersbur, bmahjour, #loopoptwg

Differential Revision: https://reviews.llvm.org/D135808

Added: 
    

Modified: 
    llvm/lib/Transforms/Scalar/LoopInterchange.cpp
    llvm/test/Transforms/LoopInterchange/loop-interchange-optimization-remarks.ll
    llvm/test/Transforms/LoopInterchange/perserve-lcssa.ll
    llvm/test/Transforms/LoopInterchange/pr57148.ll
    llvm/test/Transforms/LoopInterchange/profitability.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
index 84de61490f763..0a7c62113c7f6 100644

--- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -311,11 +311,18 @@ class LoopInterchangeProfitability {
   bool isProfitable(const Loop *InnerLoop, const Loop *OuterLoop,
                     unsigned InnerLoopId, unsigned OuterLoopId,
                     CharMatrix &DepMatrix,
-                    const DenseMap<const Loop *, unsigned> &CostMap);
+                    const DenseMap<const Loop *, unsigned> &CostMap,
+                    std::unique_ptr<CacheCost> &CC);
 
 private:
   int getInstrOrderCost();
-
+  std::optional<bool> isProfitablePerLoopCacheAnalysis(
+      const DenseMap<const Loop *, unsigned> &CostMap,
+      std::unique_ptr<CacheCost> &CC);
+  std::optional<bool> isProfitablePerInstrOrderCost();
+  std::optional<bool> isProfitableForVectorization(unsigned InnerLoopId,
+                                                   unsigned OuterLoopId,
+                                                   CharMatrix &DepMatrix);
   Loop *OuterLoop;
   Loop *InnerLoop;
 
@@ -512,7 +519,7 @@ struct LoopInterchange {
     LLVM_DEBUG(dbgs() << "Loops are legal to interchange\n");
     LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE);
     if (!LIP.isProfitable(InnerLoop, OuterLoop, InnerLoopId, OuterLoopId,
-                          DependencyMatrix, CostMap)) {
+                          DependencyMatrix, CostMap, CC)) {
       LLVM_DEBUG(dbgs() << "Interchanging loops not profitable.\n");
       return false;
     }
@@ -1091,31 +1098,10 @@ int LoopInterchangeProfitability::getInstrOrderCost() {
   return GoodOrder - BadOrder;
 }
 
-static bool isProfitableForVectorization(unsigned InnerLoopId,
-                                         unsigned OuterLoopId,
-                                         CharMatrix &DepMatrix) {
-  // TODO: Improve this heuristic to catch more cases.
-  // If the inner loop is loop independent or doesn't carry any dependency it is
-  // profitable to move this to outer position.
-  for (auto &Row : DepMatrix) {
-    if (Row[InnerLoopId] != 'S' && Row[InnerLoopId] != 'I')
-      return false;
-    // TODO: We need to improve this heuristic.
-    if (Row[OuterLoopId] != '=')
-      return false;
-  }
-  // If outer loop has dependence and inner loop is loop independent then it is
-  // profitable to interchange to enable parallelism.
-  // If there are no dependences, interchanging will not improve anything.
-  return !DepMatrix.empty();
-}
-
-bool LoopInterchangeProfitability::isProfitable(
-    const Loop *InnerLoop, const Loop *OuterLoop, unsigned InnerLoopId,
-    unsigned OuterLoopId, CharMatrix &DepMatrix,
-    const DenseMap<const Loop *, unsigned> &CostMap) {
-  // TODO: Remove the legacy cost model.
-
+std::optional<bool>
+LoopInterchangeProfitability::isProfitablePerLoopCacheAnalysis(
+    const DenseMap<const Loop *, unsigned> &CostMap,
+    std::unique_ptr<CacheCost> &CC) {
   // This is the new cost model returned from loop cache analysis.
   // A smaller index means the loop should be placed an outer loop, and vice
   // versa.
@@ -1127,30 +1113,91 @@ bool LoopInterchangeProfitability::isProfitable(
     LLVM_DEBUG(dbgs() << "InnerIndex = " << InnerIndex
                       << ", OuterIndex = " << OuterIndex << "\n");
     if (InnerIndex < OuterIndex)
-      return true;
-  } else {
-    // Legacy cost model: this is rough cost estimation algorithm. It counts the
-    // good and bad order of induction variables in the instruction and allows
-    // reordering if number of bad orders is more than good.
-    int Cost = getInstrOrderCost();
-    LLVM_DEBUG(dbgs() << "Cost = " << Cost << "\n");
-    if (Cost < -LoopInterchangeCostThreshold)
-      return true;
+      return std::optional<bool>(true);
+    assert(InnerIndex != OuterIndex && "CostMap should assign unique "
+                                       "numbers to each loop");
+    if (CC->getLoopCost(*OuterLoop) == CC->getLoopCost(*InnerLoop))
+      return std::nullopt;
+    return std::optional<bool>(false);
   }
+  return std::nullopt;
+}
 
-  // It is not profitable as per current cache profitability model. But check if
-  // we can move this loop outside to improve parallelism.
-  if (isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix))
-    return true;
+std::optional<bool>
+LoopInterchangeProfitability::isProfitablePerInstrOrderCost() {
+  // Legacy cost model: this is rough cost estimation algorithm. It counts the
+  // good and bad order of induction variables in the instruction and allows
+  // reordering if number of bad orders is more than good.
+  int Cost = getInstrOrderCost();
+  LLVM_DEBUG(dbgs() << "Cost = " << Cost << "\n");
+  if (Cost < 0 && Cost < LoopInterchangeCostThreshold)
+    return std::optional<bool>(true);
+
+  return std::nullopt;
+}
 
-  ORE->emit([&]() {
-    return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable",
-                                    InnerLoop->getStartLoc(),
-                                    InnerLoop->getHeader())
-           << "Interchanging loops is too costly and it does not improve "
-              "parallelism.";
-  });
-  return false;
+std::optional<bool> LoopInterchangeProfitability::isProfitableForVectorization(
+    unsigned InnerLoopId, unsigned OuterLoopId, CharMatrix &DepMatrix) {
+  for (auto &Row : DepMatrix) {
+    // If the inner loop is loop independent or doesn't carry any dependency
+    // it is not profitable to move this to outer position, since we are
+    // likely able to do inner loop vectorization already.
+    if (Row[InnerLoopId] == 'I' || Row[InnerLoopId] == '=')
+      return std::optional<bool>(false);
+
+    // If the outer loop is not loop independent it is not profitable to move
+    // this to inner position, since doing so would not enable inner loop
+    // parallelism.
+    if (Row[OuterLoopId] != 'I' && Row[OuterLoopId] != '=')
+      return std::optional<bool>(false);
+  }
+  // If inner loop has dependence and outer loop is loop independent then it
+  // is/ profitable to interchange to enable inner loop parallelism.
+  // If there are no dependences, interchanging will not improve anything.
+  return std::optional<bool>(!DepMatrix.empty());
+}
+
+bool LoopInterchangeProfitability::isProfitable(
+    const Loop *InnerLoop, const Loop *OuterLoop, unsigned InnerLoopId,
+    unsigned OuterLoopId, CharMatrix &DepMatrix,
+    const DenseMap<const Loop *, unsigned> &CostMap,
+    std::unique_ptr<CacheCost> &CC) {
+  // isProfitable() is structured to avoid endless loop interchange.
+  // If loop cache analysis could decide the profitability then,
+  // profitability check will stop and return the analysis result.
+  // If cache analysis failed to analyze the loopnest (e.g.,
+  // due to delinearization issues) then only check whether it is
+  // profitable for InstrOrderCost. Likewise, if InstrOrderCost failed to
+  // analysis the profitability then only, isProfitableForVectorization
+  // will decide.
+  std::optional<bool> shouldInterchange =
+      isProfitablePerLoopCacheAnalysis(CostMap, CC);
+  if (!shouldInterchange.has_value()) {
+    shouldInterchange = isProfitablePerInstrOrderCost();
+    if (!shouldInterchange.has_value())
+      shouldInterchange =
+          isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix);
+  }
+  if (!shouldInterchange.has_value()) {
+    ORE->emit([&]() {
+      return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable",
+                                      InnerLoop->getStartLoc(),
+                                      InnerLoop->getHeader())
+             << "Insufficient information to calculate the cost of loop for "
+                "interchange.";
+    });
+    return false;
+  } else if (!shouldInterchange.value()) {
+    ORE->emit([&]() {
+      return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable",
+                                      InnerLoop->getStartLoc(),
+                                      InnerLoop->getHeader())
+             << "Interchanging loops is not considered to improve cache "
+                "locality nor vectorization.";
+    });
+    return false;
+  }
+  return true;
 }
 
 void LoopInterchangeTransform::removeChildLoop(Loop *OuterLoop,

diff  --git a/llvm/test/Transforms/LoopInterchange/loop-interchange-optimization-remarks.ll b/llvm/test/Transforms/LoopInterchange/loop-interchange-optimization-remarks.ll
index 24f0121a3beae..3c7828a49477c 100644
--- a/llvm/test/Transforms/LoopInterchange/loop-interchange-optimization-remarks.ll
+++ b/llvm/test/Transforms/LoopInterchange/loop-interchange-optimization-remarks.ll
@@ -71,7 +71,7 @@ for.end19:
 ; DELIN-NEXT: Name:            InterchangeNotProfitable
 ; DELIN-NEXT: Function:        test01
 ; DELIN-NEXT: Args:
-; DELIN-NEXT:   - String:          Interchanging loops is too costly and it does not improve parallelism.
+; DELIN-NEXT:   - String:          Interchanging loops is not considered to improve cache locality nor vectorization.
 ; DELIN-NEXT: ...
 
 ;;--------------------------------------Test case 02------------------------------------

diff  --git a/llvm/test/Transforms/LoopInterchange/perserve-lcssa.ll b/llvm/test/Transforms/LoopInterchange/perserve-lcssa.ll
index 664a3c9c92ad4..11d79c0d86f2e 100644
--- a/llvm/test/Transforms/LoopInterchange/perserve-lcssa.ll
+++ b/llvm/test/Transforms/LoopInterchange/perserve-lcssa.ll
@@ -156,20 +156,29 @@ exit:                                             ; preds = %outer.latch
 
 ; Make sure we do not crash for loops without reachable exits.
 define void @no_reachable_exits() {
-; Check we interchanged.
-; CHECK-LABEL: @no_reachable_exits() {
+; Check we do not crash.
+; CHECK-LABEL: @no_reachable_exits(
 ; CHECK-NEXT:  bb:
-; CHECK-NEXT:    br label %inner.ph
-; CHECK-LABEL: outer.ph:
-; CHECK-NEXT:    br label %outer.header
-; CHECK-LABEL: inner.ph:
-; CHECK-NEXT:    br label %inner.body
-; CHECK-LABEL: inner.body:
-; CHECK-NEXT:    %tmp31 = phi i32 [ 0, %inner.ph ], [ %[[IVNEXT:[0-9]]], %inner.body.split ]
-; CHECK-NEXT:    br label %outer.ph
-; CHECK-LABEL: inner.body.split:
-; CHECK-NEXT:    %[[IVNEXT]] = add nsw i32 %tmp31, 1
-; CHECK-NEXT:    br i1 false, label %inner.body, label %exit
+; CHECK-NEXT:    br label [[OUTER_PH:%.*]]
+; CHECK:       outer.ph:
+; CHECK-NEXT:    br label [[OUTER_HEADER:%.*]]
+; CHECK:       outer.header:
+; CHECK-NEXT:    [[TMP2:%.*]] = phi i32 [ 0, [[OUTER_PH]] ], [ [[TMP8:%.*]], [[OUTER_LATCH:%.*]] ]
+; CHECK-NEXT:    br i1 undef, label [[INNER_PH:%.*]], label [[OUTER_LATCH]]
+; CHECK:       inner.ph:
+; CHECK-NEXT:    br label [[INNER_BODY:%.*]]
+; CHECK:       inner.body:
+; CHECK-NEXT:    [[TMP31:%.*]] = phi i32 [ 0, [[INNER_PH]] ], [ [[TMP6:%.*]], [[INNER_BODY]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = load ptr, ptr undef, align 8
+; CHECK-NEXT:    [[TMP6]] = add nsw i32 [[TMP31]], 1
+; CHECK-NEXT:    br i1 false, label [[INNER_BODY]], label [[OUTER_LATCH_LOOPEXIT:%.*]]
+; CHECK:       outer.latch.loopexit:
+; CHECK-NEXT:    br label [[OUTER_LATCH]]
+; CHECK:       outer.latch:
+; CHECK-NEXT:    [[TMP8]] = add nsw i32 [[TMP2]], 1
+; CHECK-NEXT:    br i1 false, label [[OUTER_HEADER]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    unreachable
 
 
 bb:

diff  --git a/llvm/test/Transforms/LoopInterchange/pr57148.ll b/llvm/test/Transforms/LoopInterchange/pr57148.ll
index 362aa93dcc566..e9e78dfe83ed0 100644
--- a/llvm/test/Transforms/LoopInterchange/pr57148.ll
+++ b/llvm/test/Transforms/LoopInterchange/pr57148.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes=loop-interchange -cache-line-size=4 -verify-dom-info -verify-loop-info -verify-scev -verify-loop-lcssa -S | FileCheck %s
+; RUN: opt < %s -passes=loop-interchange -cache-line-size=4 -loop-interchange-threshold=-100 -verify-dom-info -verify-loop-info -verify-scev -verify-loop-lcssa -S | FileCheck %s
 
 ; Make sure the loops are in LCSSA form after loop interchange,
 ; and loop interchange does not hit assertion errors and crash.

diff  --git a/llvm/test/Transforms/LoopInterchange/profitability.ll b/llvm/test/Transforms/LoopInterchange/profitability.ll
index b0b1f8772a666..3ee106c2e3c4d 100644
--- a/llvm/test/Transforms/LoopInterchange/profitability.ll
+++ b/llvm/test/Transforms/LoopInterchange/profitability.ll
@@ -2,6 +2,10 @@
 ; RUN:     -pass-remarks=loop-interchange -pass-remarks-missed=loop-interchange
 ; RUN: FileCheck -input-file %t %s
 
+; RUN: opt < %s -passes=loop-interchange,loop-interchange -cache-line-size=64 \
+; RUN:     -pass-remarks-output=%t -pass-remarks='loop-interchange' -S
+; RUN: cat %t |  FileCheck --check-prefix=PROFIT %s
+
 ;; We test profitability model in these test cases.
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -10,7 +14,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 @B = common global [100 x [100 x i32]] zeroinitializer
 
 ;;---------------------------------------Test case 01---------------------------------
-;; Loops interchange will result in code vectorization and hence profitable. Check for interchange.
+;; Loops interchange will result in better cache locality and hence profitable. Check for interchange.
 ;;   for(int i=1;i<100;i++)
 ;;     for(int j=1;j<100;j++)
 ;;       A[j][i] = A[j - 1][i] + B[j][i];
@@ -162,3 +166,60 @@ for.inc10:
 for.end12:
   ret void
 }
+
+;;---------------------------------------Test case 05---------------------------------
+;; This test is to make sure, that multiple invocations of loop interchange will not
+;; undo previous interchange and will converge to a particular order determined by the
+;; profitability analysis.
+;;   for(int i=1;i<100;i++)
+;;     for(int j=1;j<100;j++)
+;;       A[j][0] = A[j][0] + B[j][i];
+
+; CHECK:      Name:            Interchanged
+; CHECK-NEXT: Function:        interchange_05
+
+; PROFIT-LABEL: --- !Passed
+; PROFIT-NEXT: Pass:            loop-interchange
+; PROFIT-NEXT: Name:            Interchanged
+; PROFIT-LABEL: Function:        interchange_05
+; PROFIT-NEXT: Args:
+; PROFIT-NEXT:   - String:          Loop interchanged with enclosing loop.
+; PROFIT-NEXT: ...
+; PROFIT: --- !Missed
+; PROFIT-NEXT: Pass:            loop-interchange
+; PROFIT-NEXT: Name:            InterchangeNotProfitable
+; PROFIT-NEXT: Function:        interchange_05
+; PROFIT-NEXT: Args:
+; PROFIT-NEXT:   - String:          Interchanging loops is not considered to improve cache locality nor vectorization.
+; PROFIT-NEXT: ...
+define void @interchange_05() {
+entry:
+  br label %for2.preheader
+
+for2.preheader:
+  %i30 = phi i64 [ 1, %entry ], [ %i.next31, %for1.inc14 ]
+  br label %for2
+
+for2:
+  %j = phi i64 [ %i.next, %for2 ], [ 1, %for2.preheader ]
+  %j.prev = add nsw i64 %j,  -1
+  %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %j, i64 0
+  %lv1 = load i32, i32* %arrayidx5
+  %arrayidx9 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @B, i64 0, i64 %j,  i64 %i30
+  %lv2 = load i32, i32* %arrayidx9
+  %add = add nsw i32 %lv1, %lv2
+  %arrayidx13 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %j,  i64 0
+  store i32 %add, i32* %arrayidx13
+  %i.next = add nuw nsw i64 %j,  1
+  %exitcond = icmp eq i64 %j,  99
+  br i1 %exitcond, label %for1.inc14, label %for2
+
+for1.inc14:
+  %i.next31 = add nuw nsw i64 %i30, 1
+  %exitcond33 = icmp eq i64 %i30, 99
+  br i1 %exitcond33, label %for.end16, label %for2.preheader
+
+for.end16:
+  ret void
+}
+