[llvm] ef4ecc3 - [LoopCacheAnalysis] Consider dimension depth of the subscript reference when calculating cost
Bardia Mahjour via llvm-commits
llvm-commits at lists.llvm.org
Mon May 2 13:51:17 PDT 2022
Author: Bardia Mahjour
Date: 2022-05-02T16:49:10-04:00
New Revision: ef4ecc3ceffcf3ef129640c813f823c974f9ba22
URL: https://github.com/llvm/llvm-project/commit/ef4ecc3ceffcf3ef129640c813f823c974f9ba22
DIFF: https://github.com/llvm/llvm-project/commit/ef4ecc3ceffcf3ef129640c813f823c974f9ba22.diff
LOG: [LoopCacheAnalysis] Consider dimension depth of the subscript reference when calculating cost
Reviewed By: congzhe, etiotto
Differential Revision: https://reviews.llvm.org/D123400
Added:
llvm/test/Analysis/LoopCacheAnalysis/PowerPC/multi-store.ll
Modified:
llvm/include/llvm/Analysis/LoopCacheAnalysis.h
llvm/lib/Analysis/LoopCacheAnalysis.cpp
llvm/test/Analysis/LoopCacheAnalysis/PowerPC/LoopnestFixedSize.ll
llvm/test/Analysis/LoopCacheAnalysis/PowerPC/matvecmul.ll
llvm/test/Analysis/LoopCacheAnalysis/PowerPC/single-store.ll
Removed:
################################################################################
diff --git a/llvm/include/llvm/Analysis/LoopCacheAnalysis.h b/llvm/include/llvm/Analysis/LoopCacheAnalysis.h
index 2c1664743e36f..34d5ce5e6b004 100644
--- a/llvm/include/llvm/Analysis/LoopCacheAnalysis.h
+++ b/llvm/include/llvm/Analysis/LoopCacheAnalysis.h
@@ -111,6 +111,13 @@ class IndexedReference {
/// smaller than the cache line size \p CLS.
bool isConsecutive(const Loop &L, unsigned CLS) const;
+ /// Retrieve the index of the subscript corresponding to the given loop \p
+ /// L. Return a zero-based positive index if the subscript index is
+ /// succesfully located and a negative value otherwise. For example given the
+ /// indexed reference 'A[i][2j+1][3k+2]', the call
+ /// 'getSubscriptIndex(loop-k)' would return value 2.
+ unsigned getSubscriptIndex(const Loop &L) const;
+
/// Return the coefficient used in the rightmost dimension.
const SCEV *getLastCoefficient() const;
diff --git a/llvm/lib/Analysis/LoopCacheAnalysis.cpp b/llvm/lib/Analysis/LoopCacheAnalysis.cpp
index b7806b3614cb7..8ebf77b21afbf 100644
--- a/llvm/lib/Analysis/LoopCacheAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopCacheAnalysis.cpp
@@ -103,14 +103,24 @@ static bool isOneDimensionalArray(const SCEV &AccessFn, const SCEV &ElemSize,
return StepRec == &ElemSize;
}
-/// Compute the trip count for the given loop \p L. Return the SCEV expression
-/// for the trip count or nullptr if it cannot be computed.
-static const SCEV *computeTripCount(const Loop &L, ScalarEvolution &SE) {
+/// Compute the trip count for the given loop \p L or assume a default value if
+/// it is not a compile time constant. Return the SCEV expression for the trip
+/// count.
+static const SCEV *computeTripCount(const Loop &L, const SCEV &ElemSize,
+ ScalarEvolution &SE) {
const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(&L);
- if (isa<SCEVCouldNotCompute>(BackedgeTakenCount) ||
- !isa<SCEVConstant>(BackedgeTakenCount))
- return nullptr;
- return SE.getTripCountFromExitCount(BackedgeTakenCount);
+ const SCEV *TripCount = (!isa<SCEVCouldNotCompute>(BackedgeTakenCount) &&
+ isa<SCEVConstant>(BackedgeTakenCount))
+ ? SE.getTripCountFromExitCount(BackedgeTakenCount)
+ : nullptr;
+
+ if (!TripCount) {
+ LLVM_DEBUG(dbgs() << "Trip count of loop " << L.getName()
+ << " could not be computed, using DefaultTripCount\n");
+ TripCount = SE.getConstant(ElemSize.getType(), DefaultTripCount);
+ }
+
+ return TripCount;
}
//===----------------------------------------------------------------------===//
@@ -274,22 +284,18 @@ CacheCostTy IndexedReference::computeRefCost(const Loop &L,
return 1;
}
- const SCEV *TripCount = computeTripCount(L, SE);
- if (!TripCount) {
- LLVM_DEBUG(dbgs() << "Trip count of loop " << L.getName()
- << " could not be computed, using DefaultTripCount\n");
- const SCEV *ElemSize = Sizes.back();
- TripCount = SE.getConstant(ElemSize->getType(), DefaultTripCount);
- }
+ const SCEV *TripCount = computeTripCount(L, *Sizes.back(), SE);
+ assert(TripCount && "Expecting valid TripCount");
LLVM_DEBUG(dbgs() << "TripCount=" << *TripCount << "\n");
- // If the indexed reference is 'consecutive' the cost is
- // (TripCount*Stride)/CLS, otherwise the cost is TripCount.
- const SCEV *RefCost = TripCount;
-
+ const SCEV *RefCost = nullptr;
if (isConsecutive(L, CLS)) {
+ // If the indexed reference is 'consecutive' the cost is
+ // (TripCount*Stride)/CLS.
const SCEV *Coeff = getLastCoefficient();
const SCEV *ElemSize = Sizes.back();
+ assert(Coeff->getType() == ElemSize->getType() &&
+ "Expecting the same type");
const SCEV *Stride = SE.getMulExpr(Coeff, ElemSize);
Type *WiderType = SE.getWiderType(Stride->getType(), TripCount->getType());
const SCEV *CacheLineSize = SE.getConstant(WiderType, CLS);
@@ -303,10 +309,33 @@ CacheCostTy IndexedReference::computeRefCost(const Loop &L,
LLVM_DEBUG(dbgs().indent(4)
<< "Access is consecutive: RefCost=(TripCount*Stride)/CLS="
<< *RefCost << "\n");
- } else
+ } else {
+ // If the indexed reference is not 'consecutive' the cost is proportional to
+ // the trip count and the depth of the dimension which the subject loop
+ // subscript is accessing. We try to estimate this by multiplying the cost
+ // by the trip counts of loops corresponding to the inner dimensions. For
+ // example, given the indexed reference 'A[i][j][k]', and assuming the
+ // i-loop is in the innermost position, the cost would be equal to the
+ // iterations of the i-loop multiplied by iterations of the j-loop.
+ RefCost = TripCount;
+
+ unsigned Index = getSubscriptIndex(L);
+ assert(Index >= 0 && "Cound not locate a valid Index");
+
+ for (unsigned I = Index + 1; I < getNumSubscripts() - 1; ++I) {
+ const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(getSubscript(I));
+ assert(AR && AR->getLoop() && "Expecting valid loop");
+ const SCEV *TripCount =
+ computeTripCount(*AR->getLoop(), *Sizes.back(), SE);
+ Type *WiderType = SE.getWiderType(RefCost->getType(), TripCount->getType());
+ RefCost = SE.getMulExpr(SE.getNoopOrAnyExtend(RefCost, WiderType),
+ SE.getNoopOrAnyExtend(TripCount, WiderType));
+ }
+
LLVM_DEBUG(dbgs().indent(4)
- << "Access is not consecutive: RefCost=TripCount=" << *RefCost
- << "\n");
+ << "Access is not consecutive: RefCost=" << *RefCost << "\n");
+ }
+ assert(RefCost && "Expecting a valid RefCost");
// Attempt to fold RefCost into a constant.
if (auto ConstantCost = dyn_cast<SCEVConstant>(RefCost))
@@ -481,6 +510,16 @@ bool IndexedReference::isConsecutive(const Loop &L, unsigned CLS) const {
return SE.isKnownPredicate(ICmpInst::ICMP_ULT, Stride, CacheLineSize);
}
+unsigned IndexedReference::getSubscriptIndex(const Loop &L) const {
+ for (auto Idx : seq<unsigned>(0, getNumSubscripts())) {
+ const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(getSubscript(Idx));
+ if (AR && AR->getLoop() == &L) {
+ return Idx;
+ }
+ }
+ return -1;
+}
+
const SCEV *IndexedReference::getLastCoefficient() const {
const SCEV *LastSubscript = getLastSubscript();
auto *AR = cast<SCEVAddRecExpr>(LastSubscript);
diff --git a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/LoopnestFixedSize.ll b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/LoopnestFixedSize.ll
index fa85b71ce094f..bb868b5d43503 100644
--- a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/LoopnestFixedSize.ll
+++ b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/LoopnestFixedSize.ll
@@ -83,14 +83,13 @@ for.end13: ; preds = %for.inc11
declare [2048 x i32]* @func_with_returned_arg([2048 x i32]* returned %arg)
-; CHECK: Loop 'for.body' has cost = 4472886244958208
-; CHECK: Loop 'for.body4' has cost = 4472886244958208
-; CHECK: Loop 'for.body8' has cost = 4472886244958208
-; CHECK: Loop 'for.body12' has cost = 4472886244958208
-; CHECK: Loop 'for.body16' has cost = 137728168833024
+; CHECK: Loop 'for.body' has cost = 2112128815104000000
+; CHECK: Loop 'for.body4' has cost = 16762927104000000
+; CHECK: Loop 'for.body8' has cost = 130960368000000
+; CHECK: Loop 'for.body12' has cost = 1047682944000
+; CHECK: Loop 'for.body16' has cost = 32260032000
-
-;; #define N 1024
+;; #define N 128
;; #define M 2048
;; void t3(int a[][N][N][N][M]) {
;; for (int i1 = 0; i1 < N-1; ++i1)
@@ -101,7 +100,7 @@ declare [2048 x i32]* @func_with_returned_arg([2048 x i32]* returned %arg)
;; a[i1][i2][i3][i4][i5] = a[i1+1][i2-2][i3][i4-3][i5+2];
;; }
-define void @t3([1024 x [1024 x [1024 x [2048 x i32]]]]* %a) {
+define void @t3([128 x [128 x [128 x [2048 x i32]]]]* %a) {
entry:
br label %for.body
@@ -127,9 +126,9 @@ for.body16: ; preds = %for.body12, %for.bo
%1 = add nsw i64 %indvars.iv14, -2
%2 = add nsw i64 %indvars.iv7, -3
%3 = add nuw nsw i64 %indvars.iv, 2
- %arrayidx26 = getelementptr inbounds [1024 x [1024 x [1024 x [2048 x i32]]]], [1024 x [1024 x [1024 x [2048 x i32]]]]* %a, i64 %0, i64 %1, i64 %indvars.iv11, i64 %2, i64 %3
+ %arrayidx26 = getelementptr inbounds [128 x [128 x [128 x [2048 x i32]]]], [128 x [128 x [128 x [2048 x i32]]]]* %a, i64 %0, i64 %1, i64 %indvars.iv11, i64 %2, i64 %3
%4 = load i32, i32* %arrayidx26, align 4
- %arrayidx36 = getelementptr inbounds [1024 x [1024 x [1024 x [2048 x i32]]]], [1024 x [1024 x [1024 x [2048 x i32]]]]* %a, i64 %indvars.iv18, i64 %indvars.iv14, i64 %indvars.iv11, i64 %indvars.iv7, i64 %indvars.iv
+ %arrayidx36 = getelementptr inbounds [128 x [128 x [128 x [2048 x i32]]]], [128 x [128 x [128 x [2048 x i32]]]]* %a, i64 %indvars.iv18, i64 %indvars.iv14, i64 %indvars.iv11, i64 %indvars.iv7, i64 %indvars.iv
store i32 %4, i32* %arrayidx36, align 4
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp ne i64 %indvars.iv.next, 2046
@@ -137,22 +136,22 @@ for.body16: ; preds = %for.body12, %for.bo
for.inc37: ; preds = %for.body16
%indvars.iv.next8 = add nuw nsw i64 %indvars.iv7, 1
- %exitcond10 = icmp ne i64 %indvars.iv.next8, 1024
+ %exitcond10 = icmp ne i64 %indvars.iv.next8, 128
br i1 %exitcond10, label %for.body12, label %for.inc40
for.inc40: ; preds = %for.inc37
%indvars.iv.next12 = add nuw nsw i64 %indvars.iv11, 1
- %exitcond13 = icmp ne i64 %indvars.iv.next12, 1024
+ %exitcond13 = icmp ne i64 %indvars.iv.next12, 128
br i1 %exitcond13, label %for.body8, label %for.inc43
for.inc43: ; preds = %for.inc40
%indvars.iv.next15 = add nuw nsw i64 %indvars.iv14, 1
- %exitcond17 = icmp ne i64 %indvars.iv.next15, 1024
+ %exitcond17 = icmp ne i64 %indvars.iv.next15, 128
br i1 %exitcond17, label %for.body4, label %for.inc46
for.inc46: ; preds = %for.inc43
%indvars.iv.next19 = add nuw nsw i64 %indvars.iv18, 1
- %exitcond21 = icmp ne i64 %indvars.iv.next19, 1023
+ %exitcond21 = icmp ne i64 %indvars.iv.next19, 127
br i1 %exitcond21, label %for.body, label %for.end48
for.end48: ; preds = %for.inc46
diff --git a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/matvecmul.ll b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/matvecmul.ll
index c843337e0c1e1..2b317c02b7fb8 100644
--- a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/matvecmul.ll
+++ b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/matvecmul.ll
@@ -14,9 +14,9 @@ target triple = "powerpc64le-unknown-linux-gnu"
; y[k+1][j][i][l] = y[k+1][j][i][l] + b[k][j][i][m][l]*x[k][j][i][m]
; }
-; CHECK: Loop 'k_loop' has cost = 30000000000
-; CHECK: Loop 'j_loop' has cost = 30000000000
-; CHECK: Loop 'i_loop' has cost = 30000000000
+; CHECK: Loop 'k_loop' has cost = 10200000000000000
+; CHECK: Loop 'j_loop' has cost = 102000000000000
+; CHECK: Loop 'i_loop' has cost = 1020000000000
; CHECK: Loop 'm_loop' has cost = 10700000000
; CHECK: Loop 'l_loop' has cost = 1300000000
diff --git a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/multi-store.ll b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/multi-store.ll
new file mode 100644
index 0000000000000..89d959e927840
--- /dev/null
+++ b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/multi-store.ll
@@ -0,0 +1,102 @@
+; RUN: opt < %s -opaque-pointers -passes='print<loop-cache-cost>' -disable-output 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-n32:64-S128-v256:256:256-v512:512:512"
+target triple = "powerpc64le-unknown-linux-gnu"
+
+; CHECK-DAG: Loop 'for.j' has cost = 201000000
+; CHECK-DAG: Loop 'for.i' has cost = 102000000
+; CHECK-DAG: Loop 'for.k' has cost = 90000
+
+;; Test to make sure when we have multiple conflicting access patterns, the
+;; chosen loop configuration favours the majority of those accesses.
+;; For example this nest should be ordered as j-i-k.
+;; for (int i = 0; i < n; i++)
+;; for (int j = 0; j < n; j++)
+;; for (int k = 0; k < n; k++) {
+;; A[i][j][k] = 1;
+;; B[j][i][k] = 2;
+;; C[j][i][k] = 3;
+;; }
+
+define void @foo(i32 noundef signext %n, ptr noalias noundef %A, ptr noalias noundef %B, ptr noalias noundef %C) {
+entry:
+ %0 = zext i32 %n to i64
+ %1 = zext i32 %n to i64
+ %2 = zext i32 %n to i64
+ %3 = zext i32 %n to i64
+ %4 = zext i32 %n to i64
+ %5 = zext i32 %n to i64
+ %cmp5 = icmp sgt i32 %n, 0
+ br i1 %cmp5, label %for.i.preheader, label %for.end30
+
+for.i.preheader: ; preds = %entry
+ %wide.trip.count16 = zext i32 %n to i64
+ br label %for.i
+
+for.i: ; preds = %for.i.preheader, %for.inc28
+ %indvars.iv13 = phi i64 [ 0, %for.i.preheader ], [ %indvars.iv.next14, %for.inc28 ]
+ %cmp23 = icmp sgt i32 %n, 0
+ br i1 %cmp23, label %for.j.preheader, label %for.inc28
+
+for.j.preheader: ; preds = %for.i
+ %wide.trip.count11 = zext i32 %n to i64
+ br label %for.j
+
+for.j: ; preds = %for.j.preheader, %for.inc25
+ %indvars.iv8 = phi i64 [ 0, %for.j.preheader ], [ %indvars.iv.next9, %for.inc25 ]
+ %cmp61 = icmp sgt i32 %n, 0
+ br i1 %cmp61, label %for.k.preheader, label %for.inc25
+
+for.k.preheader: ; preds = %for.j
+ %wide.trip.count = zext i32 %n to i64
+ br label %for.k
+
+for.k: ; preds = %for.k.preheader, %for.k
+ %indvars.iv = phi i64 [ 0, %for.k.preheader ], [ %indvars.iv.next, %for.k ]
+ %6 = mul nuw i64 %0, %1
+ %7 = mul nsw i64 %6, %indvars.iv13
+ %arrayidx = getelementptr inbounds i32, ptr %A, i64 %7
+ %8 = mul nuw nsw i64 %indvars.iv8, %1
+ %arrayidx10 = getelementptr inbounds i32, ptr %arrayidx, i64 %8
+ %arrayidx12 = getelementptr inbounds i32, ptr %arrayidx10, i64 %indvars.iv
+ store i32 1, ptr %arrayidx12, align 4
+ %9 = mul nuw i64 %2, %3
+ %10 = mul nsw i64 %9, %indvars.iv8
+ %arrayidx14 = getelementptr inbounds i32, ptr %B, i64 %10
+ %11 = mul nuw nsw i64 %indvars.iv13, %3
+ %arrayidx16 = getelementptr inbounds i32, ptr %arrayidx14, i64 %11
+ %arrayidx18 = getelementptr inbounds i32, ptr %arrayidx16, i64 %indvars.iv
+ store i32 2, ptr %arrayidx18, align 4
+ %12 = mul nuw i64 %4, %5
+ %13 = mul nsw i64 %12, %indvars.iv8
+ %arrayidx20 = getelementptr inbounds i32, ptr %C, i64 %13
+ %14 = mul nuw nsw i64 %indvars.iv13, %5
+ %arrayidx22 = getelementptr inbounds i32, ptr %arrayidx20, i64 %14
+ %arrayidx24 = getelementptr inbounds i32, ptr %arrayidx22, i64 %indvars.iv
+ store i32 3, ptr %arrayidx24, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond, label %for.k, label %for.inc25.loopexit
+
+for.inc25.loopexit: ; preds = %for.k
+ br label %for.inc25
+
+for.inc25: ; preds = %for.inc25.loopexit, %for.j
+ %indvars.iv.next9 = add nuw nsw i64 %indvars.iv8, 1
+ %exitcond12 = icmp ne i64 %indvars.iv.next9, %wide.trip.count11
+ br i1 %exitcond12, label %for.j, label %for.inc28.loopexit
+
+for.inc28.loopexit: ; preds = %for.inc25
+ br label %for.inc28
+
+for.inc28: ; preds = %for.inc28.loopexit, %for.i
+ %indvars.iv.next14 = add nuw nsw i64 %indvars.iv13, 1
+ %exitcond17 = icmp ne i64 %indvars.iv.next14, %wide.trip.count16
+ br i1 %exitcond17, label %for.i, label %for.end30.loopexit
+
+for.end30.loopexit: ; preds = %for.inc28
+ br label %for.end30
+
+for.end30: ; preds = %for.end30.loopexit, %entry
+ ret void
+}
diff --git a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/single-store.ll b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/single-store.ll
index 23bd1e1fdbc9f..8aa247b867385 100644
--- a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/single-store.ll
+++ b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/single-store.ll
@@ -10,7 +10,7 @@ target triple = "powerpc64le-unknown-linux-gnu"
; A[2*i+3][3*j-4][2*k+7] = 1;
; }
-; CHECK: Loop 'for.i' has cost = 1000000
+; CHECK: Loop 'for.i' has cost = 100000000
; CHECK: Loop 'for.j' has cost = 1000000
; CHECK: Loop 'for.k' has cost = 60000
More information about the llvm-commits
mailing list