[llvm] ef4ecc3 - [LoopCacheAnalysis] Consider dimension depth of the subscript reference when calculating cost

Mon May 2 13:51:17 PDT 2022

Author: Bardia Mahjour
Date: 2022-05-02T16:49:10-04:00
New Revision: ef4ecc3ceffcf3ef129640c813f823c974f9ba22

URL: https://github.com/llvm/llvm-project/commit/ef4ecc3ceffcf3ef129640c813f823c974f9ba22
DIFF: https://github.com/llvm/llvm-project/commit/ef4ecc3ceffcf3ef129640c813f823c974f9ba22.diff

LOG: [LoopCacheAnalysis] Consider dimension depth of the subscript reference when calculating cost

Reviewed By: congzhe, etiotto

Differential Revision: https://reviews.llvm.org/D123400

Added: 
    llvm/test/Analysis/LoopCacheAnalysis/PowerPC/multi-store.ll

Modified: 
    llvm/include/llvm/Analysis/LoopCacheAnalysis.h
    llvm/lib/Analysis/LoopCacheAnalysis.cpp
    llvm/test/Analysis/LoopCacheAnalysis/PowerPC/LoopnestFixedSize.ll
    llvm/test/Analysis/LoopCacheAnalysis/PowerPC/matvecmul.ll
    llvm/test/Analysis/LoopCacheAnalysis/PowerPC/single-store.ll

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/Analysis/LoopCacheAnalysis.h b/llvm/include/llvm/Analysis/LoopCacheAnalysis.h
index 2c1664743e36f..34d5ce5e6b004 100644

--- a/llvm/include/llvm/Analysis/LoopCacheAnalysis.h
+++ b/llvm/include/llvm/Analysis/LoopCacheAnalysis.h
@@ -111,6 +111,13 @@ class IndexedReference {
   /// smaller than the cache line size \p CLS.
   bool isConsecutive(const Loop &L, unsigned CLS) const;
 
+  /// Retrieve the index of the subscript corresponding to the given loop \p
+  /// L. Return a zero-based positive index if the subscript index is
+  /// succesfully located and a negative value otherwise. For example given the
+  /// indexed reference 'A[i][2j+1][3k+2]', the call
+  /// 'getSubscriptIndex(loop-k)' would return value 2.
+  unsigned getSubscriptIndex(const Loop &L) const;
+
   /// Return the coefficient used in the rightmost dimension.
   const SCEV *getLastCoefficient() const;
 

diff  --git a/llvm/lib/Analysis/LoopCacheAnalysis.cpp b/llvm/lib/Analysis/LoopCacheAnalysis.cpp
index b7806b3614cb7..8ebf77b21afbf 100644
--- a/llvm/lib/Analysis/LoopCacheAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopCacheAnalysis.cpp
@@ -103,14 +103,24 @@ static bool isOneDimensionalArray(const SCEV &AccessFn, const SCEV &ElemSize,
   return StepRec == &ElemSize;
 }
 
-/// Compute the trip count for the given loop \p L. Return the SCEV expression
-/// for the trip count or nullptr if it cannot be computed.
-static const SCEV *computeTripCount(const Loop &L, ScalarEvolution &SE) {
+/// Compute the trip count for the given loop \p L or assume a default value if
+/// it is not a compile time constant. Return the SCEV expression for the trip
+/// count.
+static const SCEV *computeTripCount(const Loop &L, const SCEV &ElemSize,
+                                    ScalarEvolution &SE) {
   const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(&L);
-  if (isa<SCEVCouldNotCompute>(BackedgeTakenCount) ||
-      !isa<SCEVConstant>(BackedgeTakenCount))
-    return nullptr;
-  return SE.getTripCountFromExitCount(BackedgeTakenCount);
+  const SCEV *TripCount = (!isa<SCEVCouldNotCompute>(BackedgeTakenCount) &&
+                           isa<SCEVConstant>(BackedgeTakenCount))
+                              ? SE.getTripCountFromExitCount(BackedgeTakenCount)
+                              : nullptr;
+
+  if (!TripCount) {
+    LLVM_DEBUG(dbgs() << "Trip count of loop " << L.getName()
+               << " could not be computed, using DefaultTripCount\n");
+    TripCount = SE.getConstant(ElemSize.getType(), DefaultTripCount);
+  }
+
+  return TripCount;
 }
 
 //===----------------------------------------------------------------------===//
@@ -274,22 +284,18 @@ CacheCostTy IndexedReference::computeRefCost(const Loop &L,
     return 1;
   }
 
-  const SCEV *TripCount = computeTripCount(L, SE);
-  if (!TripCount) {
-    LLVM_DEBUG(dbgs() << "Trip count of loop " << L.getName()
-                      << " could not be computed, using DefaultTripCount\n");
-    const SCEV *ElemSize = Sizes.back();
-    TripCount = SE.getConstant(ElemSize->getType(), DefaultTripCount);
-  }
+  const SCEV *TripCount = computeTripCount(L, *Sizes.back(), SE);
+  assert(TripCount && "Expecting valid TripCount");
   LLVM_DEBUG(dbgs() << "TripCount=" << *TripCount << "\n");
 
-  // If the indexed reference is 'consecutive' the cost is
-  // (TripCount*Stride)/CLS, otherwise the cost is TripCount.
-  const SCEV *RefCost = TripCount;
-
+  const SCEV *RefCost = nullptr;
   if (isConsecutive(L, CLS)) {
+    // If the indexed reference is 'consecutive' the cost is
+    // (TripCount*Stride)/CLS.
     const SCEV *Coeff = getLastCoefficient();
     const SCEV *ElemSize = Sizes.back();
+    assert(Coeff->getType() == ElemSize->getType() &&
+           "Expecting the same type");
     const SCEV *Stride = SE.getMulExpr(Coeff, ElemSize);
     Type *WiderType = SE.getWiderType(Stride->getType(), TripCount->getType());
     const SCEV *CacheLineSize = SE.getConstant(WiderType, CLS);
@@ -303,10 +309,33 @@ CacheCostTy IndexedReference::computeRefCost(const Loop &L,
     LLVM_DEBUG(dbgs().indent(4)
                << "Access is consecutive: RefCost=(TripCount*Stride)/CLS="
                << *RefCost << "\n");
-  } else
+  } else {
+    // If the indexed reference is not 'consecutive' the cost is proportional to
+    // the trip count and the depth of the dimension which the subject loop
+    // subscript is accessing. We try to estimate this by multiplying the cost
+    // by the trip counts of loops corresponding to the inner dimensions. For
+    // example, given the indexed reference 'A[i][j][k]', and assuming the
+    // i-loop is in the innermost position, the cost would be equal to the
+    // iterations of the i-loop multiplied by iterations of the j-loop.
+    RefCost = TripCount;
+
+    unsigned Index = getSubscriptIndex(L);
+    assert(Index >= 0 && "Cound not locate a valid Index");
+
+    for (unsigned I = Index + 1; I < getNumSubscripts() - 1; ++I) {
+      const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(getSubscript(I));
+      assert(AR && AR->getLoop() && "Expecting valid loop");
+      const SCEV *TripCount =
+          computeTripCount(*AR->getLoop(), *Sizes.back(), SE);
+      Type *WiderType = SE.getWiderType(RefCost->getType(), TripCount->getType());
+      RefCost = SE.getMulExpr(SE.getNoopOrAnyExtend(RefCost, WiderType),
+                              SE.getNoopOrAnyExtend(TripCount, WiderType));
+    }
+
     LLVM_DEBUG(dbgs().indent(4)
-               << "Access is not consecutive: RefCost=TripCount=" << *RefCost
-               << "\n");
+               << "Access is not consecutive: RefCost=" << *RefCost << "\n");
+  }
+  assert(RefCost && "Expecting a valid RefCost");
 
   // Attempt to fold RefCost into a constant.
   if (auto ConstantCost = dyn_cast<SCEVConstant>(RefCost))
@@ -481,6 +510,16 @@ bool IndexedReference::isConsecutive(const Loop &L, unsigned CLS) const {
   return SE.isKnownPredicate(ICmpInst::ICMP_ULT, Stride, CacheLineSize);
 }
 
+unsigned IndexedReference::getSubscriptIndex(const Loop &L) const {
+  for (auto Idx : seq<unsigned>(0, getNumSubscripts())) {
+    const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(getSubscript(Idx));
+    if (AR && AR->getLoop() == &L) {
+      return Idx;
+    }
+  }
+  return -1;
+}
+
 const SCEV *IndexedReference::getLastCoefficient() const {
   const SCEV *LastSubscript = getLastSubscript();
   auto *AR = cast<SCEVAddRecExpr>(LastSubscript);

diff  --git a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/LoopnestFixedSize.ll b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/LoopnestFixedSize.ll
index fa85b71ce094f..bb868b5d43503 100644
--- a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/LoopnestFixedSize.ll
+++ b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/LoopnestFixedSize.ll
@@ -83,14 +83,13 @@ for.end13:                                        ; preds = %for.inc11
 
 declare [2048 x i32]* @func_with_returned_arg([2048 x i32]* returned %arg)
 
-; CHECK: Loop 'for.body' has cost = 4472886244958208
-; CHECK: Loop 'for.body4' has cost = 4472886244958208
-; CHECK: Loop 'for.body8' has cost = 4472886244958208
-; CHECK: Loop 'for.body12' has cost = 4472886244958208
-; CHECK: Loop 'for.body16' has cost = 137728168833024
+; CHECK: Loop 'for.body' has cost = 2112128815104000000
+; CHECK: Loop 'for.body4' has cost = 16762927104000000
+; CHECK: Loop 'for.body8' has cost = 130960368000000
+; CHECK: Loop 'for.body12' has cost = 1047682944000
+; CHECK: Loop 'for.body16' has cost = 32260032000
 
-
-;; #define N 1024
+;; #define N 128
 ;; #define M 2048
 ;; void t3(int a[][N][N][N][M]) {
 ;;   for (int i1 = 0; i1 < N-1; ++i1)
@@ -101,7 +100,7 @@ declare [2048 x i32]* @func_with_returned_arg([2048 x i32]* returned %arg)
 ;;             a[i1][i2][i3][i4][i5] = a[i1+1][i2-2][i3][i4-3][i5+2];
 ;; }
 
-define void @t3([1024 x [1024 x [1024 x [2048 x i32]]]]* %a) {
+define void @t3([128 x [128 x [128 x [2048 x i32]]]]* %a) {
 entry:
   br label %for.body
 
@@ -127,9 +126,9 @@ for.body16:                                       ; preds = %for.body12, %for.bo
   %1 = add nsw i64 %indvars.iv14, -2
   %2 = add nsw i64 %indvars.iv7, -3
   %3 = add nuw nsw i64 %indvars.iv, 2
-  %arrayidx26 = getelementptr inbounds [1024 x [1024 x [1024 x [2048 x i32]]]], [1024 x [1024 x [1024 x [2048 x i32]]]]* %a, i64 %0, i64 %1, i64 %indvars.iv11, i64 %2, i64 %3
+  %arrayidx26 = getelementptr inbounds [128 x [128 x [128 x [2048 x i32]]]], [128 x [128 x [128 x [2048 x i32]]]]* %a, i64 %0, i64 %1, i64 %indvars.iv11, i64 %2, i64 %3
   %4 = load i32, i32* %arrayidx26, align 4
-  %arrayidx36 = getelementptr inbounds [1024 x [1024 x [1024 x [2048 x i32]]]], [1024 x [1024 x [1024 x [2048 x i32]]]]* %a, i64 %indvars.iv18, i64 %indvars.iv14, i64 %indvars.iv11, i64 %indvars.iv7, i64 %indvars.iv
+  %arrayidx36 = getelementptr inbounds [128 x [128 x [128 x [2048 x i32]]]], [128 x [128 x [128 x [2048 x i32]]]]* %a, i64 %indvars.iv18, i64 %indvars.iv14, i64 %indvars.iv11, i64 %indvars.iv7, i64 %indvars.iv
   store i32 %4, i32* %arrayidx36, align 4
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
   %exitcond = icmp ne i64 %indvars.iv.next, 2046
@@ -137,22 +136,22 @@ for.body16:                                       ; preds = %for.body12, %for.bo
 
 for.inc37:                                        ; preds = %for.body16
   %indvars.iv.next8 = add nuw nsw i64 %indvars.iv7, 1
-  %exitcond10 = icmp ne i64 %indvars.iv.next8, 1024
+  %exitcond10 = icmp ne i64 %indvars.iv.next8, 128
   br i1 %exitcond10, label %for.body12, label %for.inc40
 
 for.inc40:                                        ; preds = %for.inc37
   %indvars.iv.next12 = add nuw nsw i64 %indvars.iv11, 1
-  %exitcond13 = icmp ne i64 %indvars.iv.next12, 1024
+  %exitcond13 = icmp ne i64 %indvars.iv.next12, 128
   br i1 %exitcond13, label %for.body8, label %for.inc43
 
 for.inc43:                                        ; preds = %for.inc40
   %indvars.iv.next15 = add nuw nsw i64 %indvars.iv14, 1
-  %exitcond17 = icmp ne i64 %indvars.iv.next15, 1024
+  %exitcond17 = icmp ne i64 %indvars.iv.next15, 128
   br i1 %exitcond17, label %for.body4, label %for.inc46
 
 for.inc46:                                        ; preds = %for.inc43
   %indvars.iv.next19 = add nuw nsw i64 %indvars.iv18, 1
-  %exitcond21 = icmp ne i64 %indvars.iv.next19, 1023
+  %exitcond21 = icmp ne i64 %indvars.iv.next19, 127
   br i1 %exitcond21, label %for.body, label %for.end48
 
 for.end48:                                        ; preds = %for.inc46

diff  --git a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/matvecmul.ll b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/matvecmul.ll
index c843337e0c1e1..2b317c02b7fb8 100644
--- a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/matvecmul.ll
+++ b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/matvecmul.ll
@@ -14,9 +14,9 @@ target triple = "powerpc64le-unknown-linux-gnu"
 ;                 y[k+1][j][i][l] = y[k+1][j][i][l] + b[k][j][i][m][l]*x[k][j][i][m]
 ; }
 
-; CHECK: Loop 'k_loop' has cost = 30000000000
-; CHECK: Loop 'j_loop' has cost = 30000000000
-; CHECK: Loop 'i_loop' has cost = 30000000000
+; CHECK: Loop 'k_loop' has cost = 10200000000000000
+; CHECK: Loop 'j_loop' has cost = 102000000000000
+; CHECK: Loop 'i_loop' has cost = 1020000000000
 ; CHECK: Loop 'm_loop' has cost = 10700000000
 ; CHECK: Loop 'l_loop' has cost = 1300000000
 

diff  --git a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/multi-store.ll b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/multi-store.ll
new file mode 100644
index 0000000000000..89d959e927840
--- /dev/null
+++ b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/multi-store.ll
@@ -0,0 +1,102 @@
+; RUN: opt < %s  -opaque-pointers -passes='print<loop-cache-cost>' -disable-output 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-n32:64-S128-v256:256:256-v512:512:512"
+target triple = "powerpc64le-unknown-linux-gnu"
+
+; CHECK-DAG: Loop 'for.j' has cost = 201000000
+; CHECK-DAG: Loop 'for.i' has cost = 102000000
+; CHECK-DAG: Loop 'for.k' has cost = 90000
+
+;; Test to make sure when we have multiple conflicting access patterns, the 
+;; chosen loop configuration favours the majority of those accesses. 
+;; For example this nest should be ordered as j-i-k.
+;;  for (int i = 0; i < n; i++)
+;;    for (int j = 0; j < n; j++)
+;;      for (int k = 0; k < n; k++) {
+;;        A[i][j][k] = 1;
+;;        B[j][i][k] = 2;
+;;        C[j][i][k] = 3;
+;;      }                            
+
+define void @foo(i32 noundef signext %n, ptr noalias noundef %A, ptr noalias noundef %B, ptr noalias noundef %C) {
+entry:
+  %0 = zext i32 %n to i64
+  %1 = zext i32 %n to i64
+  %2 = zext i32 %n to i64
+  %3 = zext i32 %n to i64
+  %4 = zext i32 %n to i64
+  %5 = zext i32 %n to i64
+  %cmp5 = icmp sgt i32 %n, 0
+  br i1 %cmp5, label %for.i.preheader, label %for.end30
+
+for.i.preheader:                               ; preds = %entry
+  %wide.trip.count16 = zext i32 %n to i64
+  br label %for.i
+
+for.i:                                         ; preds = %for.i.preheader, %for.inc28
+  %indvars.iv13 = phi i64 [ 0, %for.i.preheader ], [ %indvars.iv.next14, %for.inc28 ]
+  %cmp23 = icmp sgt i32 %n, 0
+  br i1 %cmp23, label %for.j.preheader, label %for.inc28
+
+for.j.preheader:                              ; preds = %for.i
+  %wide.trip.count11 = zext i32 %n to i64
+  br label %for.j
+
+for.j:                                        ; preds = %for.j.preheader, %for.inc25
+  %indvars.iv8 = phi i64 [ 0, %for.j.preheader ], [ %indvars.iv.next9, %for.inc25 ]
+  %cmp61 = icmp sgt i32 %n, 0
+  br i1 %cmp61, label %for.k.preheader, label %for.inc25
+
+for.k.preheader:                              ; preds = %for.j
+  %wide.trip.count = zext i32 %n to i64
+  br label %for.k
+
+for.k:                                        ; preds = %for.k.preheader, %for.k
+  %indvars.iv = phi i64 [ 0, %for.k.preheader ], [ %indvars.iv.next, %for.k ]
+  %6 = mul nuw i64 %0, %1
+  %7 = mul nsw i64 %6, %indvars.iv13
+  %arrayidx = getelementptr inbounds i32, ptr %A, i64 %7
+  %8 = mul nuw nsw i64 %indvars.iv8, %1
+  %arrayidx10 = getelementptr inbounds i32, ptr %arrayidx, i64 %8
+  %arrayidx12 = getelementptr inbounds i32, ptr %arrayidx10, i64 %indvars.iv
+  store i32 1, ptr %arrayidx12, align 4
+  %9 = mul nuw i64 %2, %3
+  %10 = mul nsw i64 %9, %indvars.iv8
+  %arrayidx14 = getelementptr inbounds i32, ptr %B, i64 %10
+  %11 = mul nuw nsw i64 %indvars.iv13, %3
+  %arrayidx16 = getelementptr inbounds i32, ptr %arrayidx14, i64 %11
+  %arrayidx18 = getelementptr inbounds i32, ptr %arrayidx16, i64 %indvars.iv
+  store i32 2, ptr %arrayidx18, align 4
+  %12 = mul nuw i64 %4, %5
+  %13 = mul nsw i64 %12, %indvars.iv8
+  %arrayidx20 = getelementptr inbounds i32, ptr %C, i64 %13
+  %14 = mul nuw nsw i64 %indvars.iv13, %5
+  %arrayidx22 = getelementptr inbounds i32, ptr %arrayidx20, i64 %14
+  %arrayidx24 = getelementptr inbounds i32, ptr %arrayidx22, i64 %indvars.iv
+  store i32 3, ptr %arrayidx24, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.k, label %for.inc25.loopexit
+
+for.inc25.loopexit:                               ; preds = %for.k
+  br label %for.inc25
+
+for.inc25:                                        ; preds = %for.inc25.loopexit, %for.j
+  %indvars.iv.next9 = add nuw nsw i64 %indvars.iv8, 1
+  %exitcond12 = icmp ne i64 %indvars.iv.next9, %wide.trip.count11
+  br i1 %exitcond12, label %for.j, label %for.inc28.loopexit
+
+for.inc28.loopexit:                               ; preds = %for.inc25
+  br label %for.inc28
+
+for.inc28:                                        ; preds = %for.inc28.loopexit, %for.i
+  %indvars.iv.next14 = add nuw nsw i64 %indvars.iv13, 1
+  %exitcond17 = icmp ne i64 %indvars.iv.next14, %wide.trip.count16
+  br i1 %exitcond17, label %for.i, label %for.end30.loopexit
+
+for.end30.loopexit:                               ; preds = %for.inc28
+  br label %for.end30
+
+for.end30:                                        ; preds = %for.end30.loopexit, %entry
+  ret void
+}

diff  --git a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/single-store.ll b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/single-store.ll
index 23bd1e1fdbc9f..8aa247b867385 100644
--- a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/single-store.ll
+++ b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/single-store.ll
@@ -10,7 +10,7 @@ target triple = "powerpc64le-unknown-linux-gnu"
 ;         A[2*i+3][3*j-4][2*k+7] = 1;
 ; }
 
-; CHECK: Loop 'for.i' has cost = 1000000
+; CHECK: Loop 'for.i' has cost = 100000000
 ; CHECK: Loop 'for.j' has cost = 1000000
 ; CHECK: Loop 'for.k' has cost = 60000