[llvm] Fix loop cache cost to avoid cost of zero for refgroups. (PR #88915)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Apr 17 12:22:54 PDT 2024
https://github.com/RouzbehPaktinat updated https://github.com/llvm/llvm-project/pull/88915
>From 2b6692af6bdc957f2faa40f5e8bc20037bcc19b6 Mon Sep 17 00:00:00 2001
From: RouzbehPaktinat <rouzbeh.paktinat1 at huawei.com>
Date: Wed, 17 Apr 2024 15:20:36 -0400
Subject: [PATCH] Fix loop cache cost to avoid cost of zero for refgroups.
Currently loop cache analysis uses the following formula to evaluate the cost of a RefGroup for a consecutive memory access: "RefCost = (TripCount * Stride) / CLS". The cost evaluates to zero whenever "TripCount * Stride" is smaller than the cache line size, which can produce a wrong cost value for a loop and mislead the LoopInterchange decision.
This patch fixes the problem by rounding the cost up to 1 whenever the division would otherwise evaluate to zero.
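For example, for the B[j] reference group in the test added below (inner trip count 2, i32 elements, so Stride = 4 bytes) with -cache-line-size=64:

    before: RefCost = (TripCount * Stride) / CLS = (2 * 4) / 64 = 0
    after:  RefCost = ceil((2 * 4) / 64) = 1

so the reference group is charged at least the one cache line it actually touches.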
---
llvm/lib/Analysis/LoopCacheAnalysis.cpp | 6 +-
.../interchange-cost-beneficial.ll | 62 +++++++++++++
.../pr43176-move-to-new-latch.ll | 93 +++++--------------
3 files changed, 92 insertions(+), 69 deletions(-)
create mode 100644 llvm/test/Analysis/LoopCacheAnalysis/interchange-cost-beneficial.ll
diff --git a/llvm/lib/Analysis/LoopCacheAnalysis.cpp b/llvm/lib/Analysis/LoopCacheAnalysis.cpp
index 284d8d16d264e9..24dcdf52972590 100644
--- a/llvm/lib/Analysis/LoopCacheAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopCacheAnalysis.cpp
@@ -299,7 +299,11 @@ CacheCostTy IndexedReference::computeRefCost(const Loop &L,
Stride = SE.getNoopOrAnyExtend(Stride, WiderType);
TripCount = SE.getNoopOrZeroExtend(TripCount, WiderType);
const SCEV *Numerator = SE.getMulExpr(Stride, TripCount);
- RefCost = SE.getUDivExpr(Numerator, CacheLineSize);
+ RefCost = SE.getUDivExpr(Numerator, CacheLineSize);
+ // When the result is zero, round it up to one: at least one cache line
+ // must be used, so reporting that zero cache lines are used makes no sense.
+ if (RefCost->isZero())
+   RefCost = SE.getUDivCeilSCEV(Numerator, CacheLineSize);
LLVM_DEBUG(dbgs().indent(4)
<< "Access is consecutive: RefCost=(TripCount*Stride)/CLS="
diff --git a/llvm/test/Analysis/LoopCacheAnalysis/interchange-cost-beneficial.ll b/llvm/test/Analysis/LoopCacheAnalysis/interchange-cost-beneficial.ll
new file mode 100644
index 00000000000000..3086224c582048
--- /dev/null
+++ b/llvm/test/Analysis/LoopCacheAnalysis/interchange-cost-beneficial.ll
@@ -0,0 +1,62 @@
+; RUN: opt < %s -cache-line-size=64 -passes='print<loop-cache-cost>' -disable-output 2>&1 | FileCheck %s
+
+;; This test checks the effect of rounding the cache cost up to 1 when it
+;; would otherwise evaluate to 0: at least 1 cache line is accessed by the
+;; loop nest, so reporting that zero cache lines are used makes no sense.
+;; The costs of the reference groups for B[j], C[j], D[j] and E[j] used to
+;; be calculated as 0; they are now 1, which makes each loop cost more reasonable.
+;
+; void test(int n, int m, int o, int A[2][3], int B[2], int C[2], int D[2], int E[2]) {
+; for (int i = 0; i < 3; i++)
+; for (int j = 0; j < 2; j++)
+; A[j][i] = 1;
+; B[j] = 1;
+; C[j] = 1;
+; D[j] = 1
+; E[j] = 1
+; }
+
+; CHECK: Loop 'for.j' has cost = 18
+; CHECK-NEXT: Loop 'for.i' has cost = 10
+
+define void @test(ptr %A, ptr %B, ptr %C, ptr %D, ptr %E) {
+
+entry:
+ br label %for.i.preheader.split
+
+for.i.preheader.split: ; preds = %entry
+ br label %for.i
+
+for.i: ; preds = %for.inci, %for.i.preheader.split
+ %i = phi i64 [ %inci, %for.inci ], [ 0, %for.i.preheader.split ]
+ br label %for.j
+
+for.j: ; preds = %for.j, %for.i
+ %j = phi i64 [ %incj, %for.j ], [ 0, %for.i ]
+ %mul_j = mul nsw i64 %j, 3
+ %index_j = add i64 %mul_j, %i
+ %arrayidxA = getelementptr inbounds [2 x [3 x i32]], ptr %A, i64 %j, i64 %i
+ store i32 1, ptr %arrayidxA, align 4
+ %arrayidxB = getelementptr inbounds i32, ptr %B, i64 %j
+ store i32 1, ptr %arrayidxB, align 4
+ %arrayidxC = getelementptr inbounds i32, ptr %C, i64 %j
+ store i32 1, ptr %arrayidxC, align 4
+ %arrayidxD = getelementptr inbounds i32, ptr %D, i64 %j
+ store i32 1, ptr %arrayidxD, align 4
+ %arrayidxE = getelementptr inbounds i32, ptr %E, i64 %j
+ store i32 1, ptr %arrayidxE, align 4
+ %incj = add nsw i64 %j, 1
+ %exitcond.us = icmp eq i64 %incj, 2
+ br i1 %exitcond.us, label %for.inci, label %for.j
+
+for.inci: ; preds = %for.j
+ %inci = add nsw i64 %i, 1
+ %exitcond55.us = icmp eq i64 %inci, 3
+ br i1 %exitcond55.us, label %for.end.loopexit, label %for.i
+
+for.end.loopexit: ; preds = %for.inci
+ br label %for.end
+
+for.end: ; preds = %for.end.loopexit
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopInterchange/pr43176-move-to-new-latch.ll b/llvm/test/Transforms/LoopInterchange/pr43176-move-to-new-latch.ll
index 965d95110da466..cc787fa55600a6 100644
--- a/llvm/test/Transforms/LoopInterchange/pr43176-move-to-new-latch.ll
+++ b/llvm/test/Transforms/LoopInterchange/pr43176-move-to-new-latch.ll
@@ -1,42 +1,25 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes=loop-interchange -cache-line-size=64 -verify-loop-lcssa -verify-dom-info -S %s | FileCheck %s
+; RUN: opt < %s -passes=loop-interchange -cache-line-size=64 -pass-remarks-missed='loop-interchange' -pass-remarks-output=%t -S
+; RUN: FileCheck --input-file=%t %s
@b = external dso_local global [5 x i32], align 16
+;; Not profitable to interchange: the access b[i] is invariant with respect to the inner j-loop.
+;;
+;; for(int i=0;i<4;i++) {
+;; for(int j=1;j<4;j++) {
+;; b[i] = ....
+;; }
+;; }
+
+; CHECK: --- !Missed
+; CHECK-NEXT: Pass: loop-interchange
+; CHECK-NEXT: Name: InterchangeNotProfitable
+; CHECK-NEXT: Function: test1
+; CHECK-NEXT: Args:
+; CHECK-NEXT: - String: Interchanging loops is not considered to improve cache locality nor vectorization.
+
define void @test1() {
-; CHECK-LABEL: @test1(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[FOR_BODY2_PREHEADER:%.*]]
-; CHECK: for.body.preheader:
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INC41:%.*]] = phi i32 [ [[INC4:%.*]], [[FOR_INC3:%.*]] ], [ undef, [[FOR_BODY_PREHEADER:%.*]] ]
-; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[INC41]] to i64
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [5 x i32], ptr @b, i64 0, i64 [[IDXPROM]]
-; CHECK-NEXT: br label [[FOR_INC:%.*]]
-; CHECK: for.body2.preheader:
-; CHECK-NEXT: br label [[FOR_BODY2:%.*]]
-; CHECK: for.body2:
-; CHECK-NEXT: [[LSR_IV:%.*]] = phi i32 [ [[TMP1:%.*]], [[FOR_INC_SPLIT:%.*]] ], [ 1, [[FOR_BODY2_PREHEADER]] ]
-; CHECK-NEXT: br label [[FOR_BODY_PREHEADER]]
-; CHECK: for.inc:
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT: store i32 undef, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[LSR_IV]], 4
-; CHECK-NEXT: [[LSR_IV_NEXT:%.*]] = add nuw nsw i32 [[LSR_IV]], 1
-; CHECK-NEXT: br label [[FOR_COND1_FOR_END_CRIT_EDGE:%.*]]
-; CHECK: for.inc.split:
-; CHECK-NEXT: [[TMP1]] = add nuw nsw i32 [[LSR_IV]], 1
-; CHECK-NEXT: [[TMP2:%.*]] = icmp slt i32 [[LSR_IV]], 4
-; CHECK-NEXT: br i1 [[TMP2]], label [[FOR_BODY2]], label [[FOR_COND_FOR_END5_CRIT_EDGE:%.*]]
-; CHECK: for.cond1.for.end_crit_edge:
-; CHECK-NEXT: br label [[FOR_INC3]]
-; CHECK: for.inc3:
-; CHECK-NEXT: [[INC4]] = add nsw i32 [[INC41]], 1
-; CHECK-NEXT: br i1 false, label [[FOR_BODY]], label [[FOR_INC_SPLIT]]
-; CHECK: for.cond.for.end5_crit_edge:
-; CHECK-NEXT: ret void
-;
entry:
br label %for.body
@@ -68,41 +51,15 @@ for.cond.for.end5_crit_edge: ; preds = %for.inc3
ret void
}
+
+; CHECK: --- !Missed
+; CHECK-NEXT: Pass: loop-interchange
+; CHECK-NEXT: Name: InterchangeNotProfitable
+; CHECK-NEXT: Function: test2
+; CHECK-NEXT: Args:
+; CHECK-NEXT: - String: Interchanging loops is not considered to improve cache locality nor vectorization.
+
define void @test2() {
-; CHECK-LABEL: @test2(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[FOR_BODY2_PREHEADER:%.*]]
-; CHECK: for.body.preheader:
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INC41:%.*]] = phi i32 [ [[INC4:%.*]], [[FOR_INC3:%.*]] ], [ undef, [[FOR_BODY_PREHEADER:%.*]] ]
-; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[INC41]] to i64
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [5 x i32], ptr @b, i64 0, i64 [[IDXPROM]]
-; CHECK-NEXT: br label [[FOR_INC:%.*]]
-; CHECK: for.body2.preheader:
-; CHECK-NEXT: br label [[FOR_BODY2:%.*]]
-; CHECK: for.body2:
-; CHECK-NEXT: [[LSR_IV:%.*]] = phi i32 [ [[TMP1:%.*]], [[FOR_INC_SPLIT:%.*]] ], [ 1, [[FOR_BODY2_PREHEADER]] ]
-; CHECK-NEXT: br label [[FOR_BODY_PREHEADER]]
-; CHECK: for.inc:
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[LSR_IV]], 4
-; CHECK-NEXT: [[CMP_ZEXT:%.*]] = zext i1 [[CMP]] to i32
-; CHECK-NEXT: store i32 [[CMP_ZEXT]], ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[LSR_IV_NEXT:%.*]] = add nuw nsw i32 [[LSR_IV]], 1
-; CHECK-NEXT: br label [[FOR_COND1_FOR_END_CRIT_EDGE:%.*]]
-; CHECK: for.inc.split:
-; CHECK-NEXT: [[TMP1]] = add nuw nsw i32 [[LSR_IV]], 1
-; CHECK-NEXT: [[TMP2:%.*]] = icmp slt i32 [[LSR_IV]], 4
-; CHECK-NEXT: br i1 [[TMP2]], label [[FOR_BODY2]], label [[FOR_COND_FOR_END5_CRIT_EDGE:%.*]]
-; CHECK: for.cond1.for.end_crit_edge:
-; CHECK-NEXT: br label [[FOR_INC3]]
-; CHECK: for.inc3:
-; CHECK-NEXT: [[INC4]] = add nsw i32 [[INC41]], 1
-; CHECK-NEXT: br i1 false, label [[FOR_BODY]], label [[FOR_INC_SPLIT]]
-; CHECK: for.cond.for.end5_crit_edge:
-; CHECK-NEXT: ret void
-;
entry:
br label %for.body