[llvm] Fix loop cache cost to avoid cost of zero for refgroups. (PR #88915)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Apr 17 12:22:54 PDT 2024
https://github.com/RouzbehPaktinat updated https://github.com/llvm/llvm-project/pull/88915
>From 2b6692af6bdc957f2faa40f5e8bc20037bcc19b6 Mon Sep 17 00:00:00 2001
From: RouzbehPaktinat <rouzbeh.paktinat1 at huawei.com>
Date: Wed, 17 Apr 2024 15:20:36 -0400
Subject: [PATCH] Fix loop cache cost to avoid cost of zero for refgroups.
Currently loop cache analysis uses the following formula to evaluate the cost of a RefGroup for a consecutive memory access: "RefCost = (TripCount * Stride) / CLS". The cost evaluates to zero whenever "TripCount * Stride" is smaller than the cache line size, which can produce a wrong cost value for a loop and mislead the LoopInterchange decision.
This patch fixes the problem by rounding the cost up to 1 whenever the division would otherwise evaluate to zero.
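For example, for the B[j] reference group in the test added below (inner trip count 2, i32 elements, so Stride = 4 bytes) with -cache-line-size=64:

    before: RefCost = (TripCount * Stride) / CLS = (2 * 4) / 64 = 0
    after:  RefCost = ceil((2 * 4) / 64) = 1

so the reference group is charged at least the one cache line it actually touches.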
---
llvm/lib/Analysis/LoopCacheAnalysis.cpp | 6 +-
.../interchange-cost-beneficial.ll | 62 +++++++++++++
.../pr43176-move-to-new-latch.ll | 93 +++++--------------
3 files changed, 92 insertions(+), 69 deletions(-)
create mode 100644 llvm/test/Analysis/LoopCacheAnalysis/interchange-cost-beneficial.ll
diff --git a/llvm/lib/Analysis/LoopCacheAnalysis.cpp b/llvm/lib/Analysis/LoopCacheAnalysis.cpp
index 284d8d16d264e9..24dcdf52972590 100644
--- a/llvm/lib/Analysis/LoopCacheAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopCacheAnalysis.cpp
@@ -299,7 +299,11 @@ CacheCostTy IndexedReference::computeRefCost(const Loop &L,
Stride = SE.getNoopOrAnyExtend(Stride, WiderType);
TripCount = SE.getNoopOrZeroExtend(TripCount, WiderType);
const SCEV *Numerator = SE.getMulExpr(Stride, TripCount);
- RefCost = SE.getUDivExpr(Numerator, CacheLineSize);
+ RefCost = SE.getUDivExpr(Numerator, CacheLineSize);
+ // When the result is zero, round it up to one: at least one cache line
+ // must be used, so reporting that zero cache lines are used makes no sense.
+ if (RefCost->isZero())
+   RefCost = SE.getUDivCeilSCEV(Numerator, CacheLineSize);
LLVM_DEBUG(dbgs().indent(4)
<< "Access is consecutive: RefCost=(TripCount*Stride)/CLS="
diff --git a/llvm/test/Analysis/LoopCacheAnalysis/interchange-cost-beneficial.ll b/llvm/test/Analysis/LoopCacheAnalysis/interchange-cost-beneficial.ll
new file mode 100644
index 00000000000000..3086224c582048
--- /dev/null
+++ b/llvm/test/Analysis/LoopCacheAnalysis/interchange-cost-beneficial.ll
@@ -0,0 +1,62 @@
+; RUN: opt < %s -cache-line-size=64 -passes='print<loop-cache-cost>' -disable-output 2>&1 | FileCheck %s
+
+;; This test checks the effect of rounding the cache cost up to 1 when it
+;; would otherwise evaluate to 0: at least 1 cache line is accessed by the
+;; loop nest, so reporting that zero cache lines are used makes no sense.
+;; The costs of the reference groups for B[j], C[j], D[j] and E[j] used to
+;; be calculated as 0; they are now 1, which makes each loop cost more reasonable.
+;
+; void test(int n, int m, int o, int A[2][3], int B[2], int C[2], int D[2], int E[2]) {
+; for (int i = 0; i < 3; i++)
+; for (int j = 0; j < 2; j++)
+; A[j][i] = 1;
+; B[j] = 1;
+; C[j] = 1;
+; D[j] = 1
+; E[j] = 1
+; }
+
+; CHECK: Loop 'for.j' has cost = 18
+; CHECK-NEXT: Loop 'for.i' has cost = 10
+
+define void @test(ptr %A, ptr %B, ptr %C, ptr %D, ptr %E) {
+
+entry:
+ br label %for.i.preheader.split
+
+for.i.preheader.split: ; preds = %entry
+ br label %for.i
+
+for.i: ; preds = %for.inci, %for.i.preheader.split
+ %i = phi i64 [ %inci, %for.inci ], [ 0, %for.i.preheader.split ]
+ br label %for.j
+
+for.j: ; preds = %for.j, %for.i
+ %j = phi i64 [ %incj, %for.j ], [ 0, %for.i ]
+ %mul_j = mul nsw i64 %j, 3
+ %index_j = add i64 %mul_j, %i
+ %arrayidxA = getelementptr inbounds [2 x [3 x i32]], ptr %A, i64 %j, i64 %i
+ store i32 1, ptr %arrayidxA, align 4
+ %arrayidxB = getelementptr inbounds i32, ptr %B, i64 %j
+ store i32 1, ptr %arrayidxB, align 4
+ %arrayidxC = getelementptr inbounds i32, ptr %C, i64 %j
+ store i32 1, ptr %arrayidxC, align 4
+ %arrayidxD = getelementptr inbounds i32, ptr %D, i64 %j
+ store i32 1, ptr %arrayidxD, align 4
+ %arrayidxE = getelementptr inbounds i32, ptr %E, i64 %j
+ store i32 1, ptr %arrayidxE, align 4
+ %incj = add nsw i64 %j, 1
+ %exitcond.us = icmp eq i64 %incj, 2
+ br i1 %exitcond.us, label %for.inci, label %for.j
+
+for.inci: ; preds = %for.j
+ %inci = add nsw i64 %i, 1
+ %exitcond55.us = icmp eq i64 %inci, 3
+ br i1 %exitcond55.us, label %for.end.loopexit, label %for.i
+
+for.end.loopexit: ; preds = %for.inci
+ br label %for.end
+
+for.end: ; preds = %for.end.loopexit
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopInterchange/pr43176-move-to-new-latch.ll b/llvm/test/Transforms/LoopInterchange/pr43176-move-to-new-latch.ll
index 965d95110da466..cc787fa55600a6 100644
--- a/llvm/test/Transforms/LoopInterchange/pr43176-move-to-new-latch.ll
+++ b/llvm/test/Transforms/LoopInterchange/pr43176-move-to-new-latch.ll
@@ -1,42 +1,25 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes=loop-interchange -cache-line-size=64 -verify-loop-lcssa -verify-dom-info -S %s | FileCheck %s
+; RUN: opt < %s -passes=loop-interchange -cache-line-size=64 -pass-remarks-missed='loop-interchange' -pass-remarks-output=%t -S
+; RUN: FileCheck --input-file=%t %s
@b = external dso_local global [5 x i32], align 16
+;; Not profitable to interchange: the access b[i] is invariant with respect to the inner j-loop.
+;;
+;; for(int i=0;i<4;i++) {
+;; for(int j=1;j<4;j++) {
+;; b[i] = ....
+;; }
+;; }
+
+; CHECK: --- !Missed
+; CHECK-NEXT: Pass: loop-interchange
+; CHECK-NEXT: Name: InterchangeNotProfitable
+; CHECK-NEXT: Function: test1
+; CHECK-NEXT: Args:
+; CHECK-NEXT: - String: Interchanging loops is not considered to improve cache locality nor vectorization.
+
define void @test1() {
-; CHECK-LABEL: @test1(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[FOR_BODY2_PREHEADER:%.*]]
-; CHECK: for.body.preheader:
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INC41:%.*]] = phi i32 [ [[INC4:%.*]], [[FOR_INC3:%.*]] ], [ undef, [[FOR_BODY_PREHEADER:%.*]] ]
-; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[INC41]] to i64
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [5 x i32], ptr @b, i64 0, i64 [[IDXPROM]]
-; CHECK-NEXT: br label [[FOR_INC:%.*]]
-; CHECK: for.body2.preheader:
-; CHECK-NEXT: br label [[FOR_BODY2:%.*]]
-; CHECK: for.body2:
-; CHECK-NEXT: [[LSR_IV:%.*]] = phi i32 [ [[TMP1:%.*]], [[FOR_INC_SPLIT:%.*]] ], [ 1, [[FOR_BODY2_PREHEADER]] ]
-; CHECK-NEXT: br label [[FOR_BODY_PREHEADER]]
-; CHECK: for.inc:
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT: store i32 undef, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[LSR_IV]], 4
-; CHECK-NEXT: [[LSR_IV_NEXT:%.*]] = add nuw nsw i32 [[LSR_IV]], 1
-; CHECK-NEXT: br label [[FOR_COND1_FOR_END_CRIT_EDGE:%.*]]
-; CHECK: for.inc.split:
-; CHECK-NEXT: [[TMP1]] = add nuw nsw i32 [[LSR_IV]], 1
-; CHECK-NEXT: [[TMP2:%.*]] = icmp slt i32 [[LSR_IV]], 4
-; CHECK-NEXT: br i1 [[TMP2]], label [[FOR_BODY2]], label [[FOR_COND_FOR_END5_CRIT_EDGE:%.*]]
-; CHECK: for.cond1.for.end_crit_edge:
-; CHECK-NEXT: br label [[FOR_INC3]]
-; CHECK: for.inc3:
-; CHECK-NEXT: [[INC4]] = add nsw i32 [[INC41]], 1
-; CHECK-NEXT: br i1 false, label [[FOR_BODY]], label [[FOR_INC_SPLIT]]
-; CHECK: for.cond.for.end5_crit_edge:
-; CHECK-NEXT: ret void
-;
entry:
br label %for.body
@@ -68,41 +51,15 @@ for.cond.for.end5_crit_edge: ; preds = %for.inc3
ret void
}
+
+; CHECK: --- !Missed
+; CHECK-NEXT: Pass: loop-interchange
+; CHECK-NEXT: Name: InterchangeNotProfitable
+; CHECK-NEXT: Function: test2
+; CHECK-NEXT: Args:
+; CHECK-NEXT: - String: Interchanging loops is not considered to improve cache locality nor vectorization.
+
define void @test2() {
-; CHECK-LABEL: @test2(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[FOR_BODY2_PREHEADER:%.*]]
-; CHECK: for.body.preheader:
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INC41:%.*]] = phi i32 [ [[INC4:%.*]], [[FOR_INC3:%.*]] ], [ undef, [[FOR_BODY_PREHEADER:%.*]] ]
-; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[INC41]] to i64
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [5 x i32], ptr @b, i64 0, i64 [[IDXPROM]]
-; CHECK-NEXT: br label [[FOR_INC:%.*]]
-; CHECK: for.body2.preheader:
-; CHECK-NEXT: br label [[FOR_BODY2:%.*]]
-; CHECK: for.body2:
-; CHECK-NEXT: [[LSR_IV:%.*]] = phi i32 [ [[TMP1:%.*]], [[FOR_INC_SPLIT:%.*]] ], [ 1, [[FOR_BODY2_PREHEADER]] ]
-; CHECK-NEXT: br label [[FOR_BODY_PREHEADER]]
-; CHECK: for.inc:
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[LSR_IV]], 4
-; CHECK-NEXT: [[CMP_ZEXT:%.*]] = zext i1 [[CMP]] to i32
-; CHECK-NEXT: store i32 [[CMP_ZEXT]], ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[LSR_IV_NEXT:%.*]] = add nuw nsw i32 [[LSR_IV]], 1
-; CHECK-NEXT: br label [[FOR_COND1_FOR_END_CRIT_EDGE:%.*]]
-; CHECK: for.inc.split:
-; CHECK-NEXT: [[TMP1]] = add nuw nsw i32 [[LSR_IV]], 1
-; CHECK-NEXT: [[TMP2:%.*]] = icmp slt i32 [[LSR_IV]], 4
-; CHECK-NEXT: br i1 [[TMP2]], label [[FOR_BODY2]], label [[FOR_COND_FOR_END5_CRIT_EDGE:%.*]]
-; CHECK: for.cond1.for.end_crit_edge:
-; CHECK-NEXT: br label [[FOR_INC3]]
-; CHECK: for.inc3:
-; CHECK-NEXT: [[INC4]] = add nsw i32 [[INC41]], 1
-; CHECK-NEXT: br i1 false, label [[FOR_BODY]], label [[FOR_INC_SPLIT]]
-; CHECK: for.cond.for.end5_crit_edge:
-; CHECK-NEXT: ret void
-;
entry:
br label %for.body