[llvm] 6702594 - [LoopCacheAnalysis] Fix loop cache cost to always round the cost up to the nearest integer number (#88915)
via llvm-commits
llvm-commits at lists.llvm.org
Mon May 27 06:54:42 PDT 2024
Author: Rouzbeh
Date: 2024-05-27T09:54:39-04:00
New Revision: 670259466b238176ac302c8dedf806d2b2be7e0c
URL: https://github.com/llvm/llvm-project/commit/670259466b238176ac302c8dedf806d2b2be7e0c
DIFF: https://github.com/llvm/llvm-project/commit/670259466b238176ac302c8dedf806d2b2be7e0c.diff
LOG: [LoopCacheAnalysis] Fix loop cache cost to always round the cost up to the nearest integer number (#88915)
Currently, loop cache analysis uses the following formula to evaluate the cost of
a RefGroup for a consecutive memory access:
`RefCost=(TripCount*Stride)/CLS`
This cost evaluates to zero when `TripCount*Stride` is smaller than the
cache line size (CLS). This results in a wrong cost value for the loop and
misleads LoopInterchange decisions, as shown in [this
case](https://llvm.godbolt.org/z/jTz1vn4hn).
This patch fixes the problem by always rounding the cost up to the nearest
integer, so a cost that previously evaluated to 0 now evaluates to 1.
Added:
llvm/test/Analysis/LoopCacheAnalysis/interchange-cost-beneficial.ll
Modified:
llvm/lib/Analysis/LoopCacheAnalysis.cpp
llvm/test/Analysis/LoopCacheAnalysis/PowerPC/LoopnestFixedSize.ll
llvm/test/Analysis/LoopCacheAnalysis/PowerPC/compute-cost.ll
llvm/test/Analysis/LoopCacheAnalysis/PowerPC/loads-store.ll
llvm/test/Analysis/LoopCacheAnalysis/PowerPC/matmul.ll
llvm/test/Analysis/LoopCacheAnalysis/PowerPC/matvecmul.ll
llvm/test/Analysis/LoopCacheAnalysis/PowerPC/multi-store.ll
llvm/test/Analysis/LoopCacheAnalysis/PowerPC/single-store.ll
llvm/test/Analysis/LoopCacheAnalysis/PowerPC/stencil.ll
llvm/test/Analysis/LoopCacheAnalysis/compute-cost.ll
llvm/test/Transforms/LoopInterchange/pr43176-move-to-new-latch.ll
Removed:
################################################################################
diff --git a/llvm/lib/Analysis/LoopCacheAnalysis.cpp b/llvm/lib/Analysis/LoopCacheAnalysis.cpp
index 284d8d16d264e..7ca9f15ad5fca 100644
--- a/llvm/lib/Analysis/LoopCacheAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopCacheAnalysis.cpp
@@ -299,7 +299,12 @@ CacheCostTy IndexedReference::computeRefCost(const Loop &L,
Stride = SE.getNoopOrAnyExtend(Stride, WiderType);
TripCount = SE.getNoopOrZeroExtend(TripCount, WiderType);
const SCEV *Numerator = SE.getMulExpr(Stride, TripCount);
- RefCost = SE.getUDivExpr(Numerator, CacheLineSize);
+ // Round the fractional cost up to the nearest integer.
+ // The impact is most significant when the cost is calculated
+ // to be less than one, because it makes more sense to say
+ // that one cache line is used rather than that zero cache
+ // lines are used.
+ RefCost = SE.getUDivCeilSCEV(Numerator, CacheLineSize);
LLVM_DEBUG(dbgs().indent(4)
<< "Access is consecutive: RefCost=(TripCount*Stride)/CLS="
diff --git a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/LoopnestFixedSize.ll b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/LoopnestFixedSize.ll
index e15f06843500e..5209d290c83da 100644
--- a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/LoopnestFixedSize.ll
+++ b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/LoopnestFixedSize.ll
@@ -7,7 +7,7 @@ target triple = "powerpc64le-unknown-linux-gnu"
; The IR is copied from llvm/test/Analysis/DependenceAnalysis/SimpleSIVNoValidityCheckFixedSize.ll
; CHECK: Loop 'for.body' has cost = 4186116
-; CHECK-NEXT: Loop 'for.body4' has cost = 128898
+; CHECK-NEXT: Loop 'for.body4' has cost = 130944
;; #define N 1024
;; #define M 2048
@@ -49,7 +49,7 @@ for.end13: ; preds = %for.inc11
; CHECK: Loop 'for.body' has cost = 4186116
-; CHECK-NEXT: Loop 'for.body4' has cost = 128898
+; CHECK-NEXT: Loop 'for.body4' has cost = 130944
define void @t2(ptr %a) {
entry:
@@ -87,7 +87,7 @@ declare ptr @func_with_returned_arg(ptr returned %arg)
; CHECK-NEXT: Loop 'for.body4' has cost = 16762927104000000
; CHECK-NEXT: Loop 'for.body8' has cost = 130960368000000
; CHECK-NEXT: Loop 'for.body12' has cost = 1047682944000
-; CHECK-NEXT: Loop 'for.body16' has cost = 32260032000
+; CHECK-NEXT: Loop 'for.body16' has cost = 32772096000
;; #define N 128
;; #define M 2048
diff --git a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/compute-cost.ll b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/compute-cost.ll
index 87f522c982544..7275d04c92b47 100644
--- a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/compute-cost.ll
+++ b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/compute-cost.ll
@@ -38,7 +38,7 @@ for.end: ; preds = %for.cond
; CHECK: Loop 'for.cond' has cost = 100000000
; CHECK: Loop 'for.cond1' has cost = 1000000
-; CHECK: Loop 'for.cond5' has cost = 30000
+; CHECK: Loop 'for.cond5' has cost = 40000
@data = external dso_local global [2 x [4 x [18 x i32]]], align 1
@@ -118,7 +118,7 @@ for.neg.end: ; preds = %for.neg.cond
; access functions. When this is fixed this testcase should have a cost
; approximately 2x higher.
-; CHECK: Loop 'for.cond2' has cost = 2560
+; CHECK: Loop 'for.cond2' has cost = 2561
define void @Test2(ptr %B) {
entry:
br label %for.cond2
@@ -148,7 +148,7 @@ for.end: ; preds = %for.cond
; for (i = 40960; i > 0; i--)
; C[i] = C[i];
-; CHECK: Loop 'for.cond3' has cost = 2560
+; CHECK: Loop 'for.cond3' has cost = 2561
define void @Test3(ptr %C) {
entry:
br label %for.cond3
@@ -177,7 +177,7 @@ for.end: ; preds = %for.cond
; for (i = 0; i < 40960; i++)
; D[i] = D[i];
-; CHECK: Loop 'for.cond4' has cost = 2560
+; CHECK: Loop 'for.cond4' has cost = 2561
define void @Test4(ptr %D) {
entry:
br label %for.cond4
diff --git a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/loads-store.ll b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/loads-store.ll
index 39fe382a41196..efb1d907605a8 100644
--- a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/loads-store.ll
+++ b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/loads-store.ll
@@ -11,8 +11,8 @@ target triple = "powerpc64le-unknown-linux-gnu"
; }
; CHECK: Loop 'for.i' has cost = 3000000
-; CHECK-NEXT: Loop 'for.k' has cost = 2030000
-; CHECK-NEXT: Loop 'for.j' has cost = 1060000
+; CHECK-NEXT: Loop 'for.k' has cost = 2040000
+; CHECK-NEXT: Loop 'for.j' has cost = 1080000
define void @foo(i64 %n, i64 %m, i64 %o, ptr %A, ptr %B, ptr %C) {
entry:
diff --git a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/matmul.ll b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/matmul.ll
index 9538c3c93538a..0e8a25ffb1cac 100644
--- a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/matmul.ll
+++ b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/matmul.ll
@@ -11,8 +11,8 @@ target triple = "powerpc64le-unknown-linux-gnu"
; }
; CHECK:Loop 'for.i' has cost = 2010000
-; CHECK-NEXT:Loop 'for.k' has cost = 1040000
-; CHECK-NEXT:Loop 'for.j' has cost = 70000
+; CHECK-NEXT:Loop 'for.k' has cost = 1050000
+; CHECK-NEXT:Loop 'for.j' has cost = 90000
define void @matmul(i64 %n, i64 %m, i64 %o, ptr %A, ptr %B, ptr %C) {
entry:
diff --git a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/matvecmul.ll b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/matvecmul.ll
index 7bbbe43f5a2fc..bf5425881ce3b 100644
--- a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/matvecmul.ll
+++ b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/matvecmul.ll
@@ -17,8 +17,8 @@ target triple = "powerpc64le-unknown-linux-gnu"
; CHECK: Loop 'k_loop' has cost = 10200000000000000
; CHECK-NEXT: Loop 'j_loop' has cost = 102000000000000
; CHECK-NEXT: Loop 'i_loop' has cost = 1020000000000
-; CHECK-NEXT: Loop 'm_loop' has cost = 10700000000
-; CHECK-NEXT: Loop 'l_loop' has cost = 1300000000
+; CHECK-NEXT: Loop 'm_loop' has cost = 10800000000
+; CHECK-NEXT: Loop 'l_loop' has cost = 1500000000
%_elem_type_of_double = type <{ double }>
diff --git a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/multi-store.ll b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/multi-store.ll
index 63425c7ecef40..b6c2497d45b9b 100644
--- a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/multi-store.ll
+++ b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/multi-store.ll
@@ -5,7 +5,7 @@ target triple = "powerpc64le-unknown-linux-gnu"
; CHECK: Loop 'for.j' has cost = 201000000
; CHECK-NEXT: Loop 'for.i' has cost = 102000000
-; CHECK-NEXT: Loop 'for.k' has cost = 90000
+; CHECK-NEXT: Loop 'for.k' has cost = 120000
;; Test to make sure when we have multiple conflicting access patterns, the
;; chosen loop configuration favours the majority of those accesses.
diff --git a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/single-store.ll b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/single-store.ll
index f583822579cf9..9aa048489bd38 100644
--- a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/single-store.ll
+++ b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/single-store.ll
@@ -12,7 +12,7 @@ target triple = "powerpc64le-unknown-linux-gnu"
; CHECK: Loop 'for.i' has cost = 100000000
; CHECK-NEXT: Loop 'for.j' has cost = 1000000
-; CHECK-NEXT: Loop 'for.k' has cost = 60000
+; CHECK-NEXT: Loop 'for.k' has cost = 70000
define void @foo(i64 %n, i64 %m, i64 %o, ptr %A) {
entry:
@@ -90,7 +90,7 @@ for.end: ; preds = %for.end.loopexit, %
; CHECK: Loop 'for.i' has cost = 100000000
; CHECK-NEXT: Loop 'for.j' has cost = 1000000
-; CHECK-NEXT: Loop 'for.k' has cost = 60000
+; CHECK-NEXT: Loop 'for.k' has cost = 70000
define void @foo2(i64 %n, i64 %m, i64 %o, ptr %A) {
entry:
diff --git a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/stencil.ll b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/stencil.ll
index b79a47aed1ef0..a4be5ba5dbf0b 100644
--- a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/stencil.ll
+++ b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/stencil.ll
@@ -11,8 +11,8 @@ target triple = "powerpc64le-unknown-linux-gnu"
; }
; }
-; CHECK: Loop 'for.i' has cost = 20600
-; CHECK-NEXT: Loop 'for.j' has cost = 800
+; CHECK: Loop 'for.i' has cost = 20800
+; CHECK-NEXT: Loop 'for.j' has cost = 1000
define void @foo(i64 %n, i64 %m, ptr %A, ptr %B, ptr %C) {
entry:
diff --git a/llvm/test/Analysis/LoopCacheAnalysis/compute-cost.ll b/llvm/test/Analysis/LoopCacheAnalysis/compute-cost.ll
index d979645bef579..205cd851fce0d 100644
--- a/llvm/test/Analysis/LoopCacheAnalysis/compute-cost.ll
+++ b/llvm/test/Analysis/LoopCacheAnalysis/compute-cost.ll
@@ -8,6 +8,9 @@
; Check IndexedReference::computeRefCost can handle type differences between
; Stride and TripCount
+; Round costs up to the nearest whole number, e.g. in 'for.cond5' the cost is calculated as 12.5 and
+; it makes more sense to say 13 cache lines are used rather than 12 cache lines.
+
; SMALLER-CACHELINE: Loop 'for.cond' has cost = 256
; LARGER-CACHELINE: Loop 'for.cond' has cost = 32
%struct._Handleitem = type { ptr }
@@ -40,10 +43,10 @@ for.end: ; preds = %for.cond
; SMALLER-CACHELINE: Loop 'for.cond' has cost = 100000000
; SMALLER-CACHELINE: Loop 'for.cond1' has cost = 1000000
-; SMALLER-CACHELINE: Loop 'for.cond5' has cost = 120000
+; SMALLER-CACHELINE: Loop 'for.cond5' has cost = 130000
; LARGER-CACHELINE: Loop 'for.cond' has cost = 100000000
; LARGER-CACHELINE: Loop 'for.cond1' has cost = 1000000
-; LARGER-CACHELINE: Loop 'for.cond5' has cost = 10000
+; LARGER-CACHELINE: Loop 'for.cond5' has cost = 20000
@data = external dso_local global [2 x [4 x [18 x i32]]], align 1
define dso_local void @handle_to_ptr_2(i1 %b0, i1 %b1, i1 %b2) {
@@ -122,8 +125,8 @@ for.neg.end: ; preds = %for.neg.cond
; access functions. When this is fixed this testcase should have a cost
; approximately 2x higher.
-; SMALLER-CACHELINE: Loop 'for.cond2' has cost = 10240
-; LARGER-CACHELINE: Loop 'for.cond2' has cost = 1280
+; SMALLER-CACHELINE: Loop 'for.cond2' has cost = 10241
+; LARGER-CACHELINE: Loop 'for.cond2' has cost = 1281
define void @Test2(ptr %B) {
entry:
br label %for.cond2
@@ -153,8 +156,8 @@ for.end: ; preds = %for.cond
; for (i = 40960; i > 0; i--)
; C[i] = C[i];
-; SMALLER-CACHELINE: Loop 'for.cond3' has cost = 10240
-; LARGER-CACHELINE: Loop 'for.cond3' has cost = 1280
+; SMALLER-CACHELINE: Loop 'for.cond3' has cost = 10241
+; LARGER-CACHELINE: Loop 'for.cond3' has cost = 1281
define void @Test3(ptr %C) {
entry:
br label %for.cond3
@@ -183,8 +186,8 @@ for.end: ; preds = %for.cond
; for (i = 0; i < 40960; i++)
; D[i] = D[i];
-; SMALLER-CACHELINE: Loop 'for.cond4' has cost = 10240
-; LARGER-CACHELINE: Loop 'for.cond4' has cost = 1280
+; SMALLER-CACHELINE: Loop 'for.cond4' has cost = 10241
+; LARGER-CACHELINE: Loop 'for.cond4' has cost = 1281
define void @Test4(ptr %D) {
entry:
br label %for.cond4
diff --git a/llvm/test/Analysis/LoopCacheAnalysis/interchange-cost-beneficial.ll b/llvm/test/Analysis/LoopCacheAnalysis/interchange-cost-beneficial.ll
new file mode 100644
index 0000000000000..3086224c58204
--- /dev/null
+++ b/llvm/test/Analysis/LoopCacheAnalysis/interchange-cost-beneficial.ll
@@ -0,0 +1,62 @@
+; RUN: opt < %s -cache-line-size=64 -passes='print<loop-cache-cost>' -disable-output 2>&1 | FileCheck %s
+
+;; This test checks the effect of rounding cache cost to 1 when it is
+;; evaluated to 0 because at least 1 cache line is accessed by the loopnest.
+;; It does not make sense to output that zero cache lines are used.
+;; The costs of the reference groups for B[j], C[j], D[j] and E[j] were
+;; calculated as 0 before but now they are 1, which makes each loop cost more reasonable.
+;
+; void test(int n, int m, int o, int A[2][3], int B[2], int C[2], int D[2], int E[2]) {
+; for (int i = 0; i < 3; i++)
+; for (int j = 0; j < 2; j++)
+; A[j][i] = 1;
+; B[j] = 1;
+; C[j] = 1;
+; D[j] = 1;
+; E[j] = 1;
+; }
+
+; CHECK: Loop 'for.j' has cost = 18
+; CHECK-NEXT: Loop 'for.i' has cost = 10
+
+define void @test(ptr %A, ptr %B, ptr %C, ptr %D, ptr %E) {
+
+entry:
+ br label %for.i.preheader.split
+
+for.i.preheader.split: ; preds = %for.i.preheader
+ br label %for.i
+
+for.i: ; preds = %for.inci, %for.i.preheader.split
+ %i = phi i64 [ %inci, %for.inci ], [ 0, %for.i.preheader.split ]
+ br label %for.j
+
+for.j: ; preds = %for.incj, %for.i
+ %j = phi i64 [ %incj, %for.j ], [ 0, %for.i ]
+ %mul_j = mul nsw i64 %j, 3
+ %index_j = add i64 %mul_j, %i
+ %arrayidxA = getelementptr inbounds [2 x [ 3 x i32]], ptr %A, i64 %j, i64 %i
+ store i32 1, ptr %arrayidxA, align 4
+ %arrayidxB = getelementptr inbounds i32, ptr %B, i64 %j
+ store i32 1, ptr %arrayidxB, align 4
+ %arrayidxC = getelementptr inbounds i32, ptr %C, i64 %j
+ store i32 1, ptr %arrayidxC, align 4
+ %arrayidxD = getelementptr inbounds i32, ptr %D, i64 %j
+ store i32 1, ptr %arrayidxD, align 4
+ %arrayidxE = getelementptr inbounds i32, ptr %E, i64 %j
+ store i32 1, ptr %arrayidxE, align 4
+ %incj = add nsw i64 %j, 1
+ %exitcond.us = icmp eq i64 %incj, 2
+ br i1 %exitcond.us, label %for.inci, label %for.j
+
+for.inci: ; preds = %for.incj
+ %inci = add nsw i64 %i, 1
+ %exitcond55.us = icmp eq i64 %inci, 3
+ br i1 %exitcond55.us, label %for.end.loopexit, label %for.i
+
+for.end.loopexit: ; preds = %for.inci
+ br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %for.cond1.preheader.lr.ph, %entry
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopInterchange/pr43176-move-to-new-latch.ll b/llvm/test/Transforms/LoopInterchange/pr43176-move-to-new-latch.ll
index 965d95110da46..cc787fa55600a 100644
--- a/llvm/test/Transforms/LoopInterchange/pr43176-move-to-new-latch.ll
+++ b/llvm/test/Transforms/LoopInterchange/pr43176-move-to-new-latch.ll
@@ -1,42 +1,25 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes=loop-interchange -cache-line-size=64 -verify-loop-lcssa -verify-dom-info -S %s | FileCheck %s
+; RUN: opt < %s -passes=loop-interchange -cache-line-size=64 -pass-remarks-missed='loop-interchange' -pass-remarks-output=%t -S
+; RUN: FileCheck --input-file=%t %s
@b = external dso_local global [5 x i32], align 16
+;; Not profitable to interchange, because the access is invariant with respect to the j loop.
+;;
+;; for(int i=0;i<4;i++) {
+;; for(int j=1;j<4;j++) {
+;; b[i] = ....
+;; }
+;; }
+
+; CHECK: --- !Missed
+; CHECK-NEXT: Pass: loop-interchange
+; CHECK-NEXT: Name: InterchangeNotProfitable
+; CHECK-NEXT: Function: test1
+; CHECK-NEXT: Args:
+; CHECK-NEXT: - String: Interchanging loops is not considered to improve cache locality nor vectorization.
+
define void @test1() {
-; CHECK-LABEL: @test1(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[FOR_BODY2_PREHEADER:%.*]]
-; CHECK: for.body.preheader:
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INC41:%.*]] = phi i32 [ [[INC4:%.*]], [[FOR_INC3:%.*]] ], [ undef, [[FOR_BODY_PREHEADER:%.*]] ]
-; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[INC41]] to i64
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [5 x i32], ptr @b, i64 0, i64 [[IDXPROM]]
-; CHECK-NEXT: br label [[FOR_INC:%.*]]
-; CHECK: for.body2.preheader:
-; CHECK-NEXT: br label [[FOR_BODY2:%.*]]
-; CHECK: for.body2:
-; CHECK-NEXT: [[LSR_IV:%.*]] = phi i32 [ [[TMP1:%.*]], [[FOR_INC_SPLIT:%.*]] ], [ 1, [[FOR_BODY2_PREHEADER]] ]
-; CHECK-NEXT: br label [[FOR_BODY_PREHEADER]]
-; CHECK: for.inc:
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT: store i32 undef, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[LSR_IV]], 4
-; CHECK-NEXT: [[LSR_IV_NEXT:%.*]] = add nuw nsw i32 [[LSR_IV]], 1
-; CHECK-NEXT: br label [[FOR_COND1_FOR_END_CRIT_EDGE:%.*]]
-; CHECK: for.inc.split:
-; CHECK-NEXT: [[TMP1]] = add nuw nsw i32 [[LSR_IV]], 1
-; CHECK-NEXT: [[TMP2:%.*]] = icmp slt i32 [[LSR_IV]], 4
-; CHECK-NEXT: br i1 [[TMP2]], label [[FOR_BODY2]], label [[FOR_COND_FOR_END5_CRIT_EDGE:%.*]]
-; CHECK: for.cond1.for.end_crit_edge:
-; CHECK-NEXT: br label [[FOR_INC3]]
-; CHECK: for.inc3:
-; CHECK-NEXT: [[INC4]] = add nsw i32 [[INC41]], 1
-; CHECK-NEXT: br i1 false, label [[FOR_BODY]], label [[FOR_INC_SPLIT]]
-; CHECK: for.cond.for.end5_crit_edge:
-; CHECK-NEXT: ret void
-;
entry:
br label %for.body
@@ -68,41 +51,15 @@ for.cond.for.end5_crit_edge: ; preds = %for.inc3
ret void
}
+
+; CHECK: --- !Missed
+; CHECK-NEXT: Pass: loop-interchange
+; CHECK-NEXT: Name: InterchangeNotProfitable
+; CHECK-NEXT: Function: test2
+; CHECK-NEXT: Args:
+; CHECK-NEXT: - String: Interchanging loops is not considered to improve cache locality nor vectorization.
+
define void @test2() {
-; CHECK-LABEL: @test2(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[FOR_BODY2_PREHEADER:%.*]]
-; CHECK: for.body.preheader:
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INC41:%.*]] = phi i32 [ [[INC4:%.*]], [[FOR_INC3:%.*]] ], [ undef, [[FOR_BODY_PREHEADER:%.*]] ]
-; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[INC41]] to i64
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [5 x i32], ptr @b, i64 0, i64 [[IDXPROM]]
-; CHECK-NEXT: br label [[FOR_INC:%.*]]
-; CHECK: for.body2.preheader:
-; CHECK-NEXT: br label [[FOR_BODY2:%.*]]
-; CHECK: for.body2:
-; CHECK-NEXT: [[LSR_IV:%.*]] = phi i32 [ [[TMP1:%.*]], [[FOR_INC_SPLIT:%.*]] ], [ 1, [[FOR_BODY2_PREHEADER]] ]
-; CHECK-NEXT: br label [[FOR_BODY_PREHEADER]]
-; CHECK: for.inc:
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[LSR_IV]], 4
-; CHECK-NEXT: [[CMP_ZEXT:%.*]] = zext i1 [[CMP]] to i32
-; CHECK-NEXT: store i32 [[CMP_ZEXT]], ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[LSR_IV_NEXT:%.*]] = add nuw nsw i32 [[LSR_IV]], 1
-; CHECK-NEXT: br label [[FOR_COND1_FOR_END_CRIT_EDGE:%.*]]
-; CHECK: for.inc.split:
-; CHECK-NEXT: [[TMP1]] = add nuw nsw i32 [[LSR_IV]], 1
-; CHECK-NEXT: [[TMP2:%.*]] = icmp slt i32 [[LSR_IV]], 4
-; CHECK-NEXT: br i1 [[TMP2]], label [[FOR_BODY2]], label [[FOR_COND_FOR_END5_CRIT_EDGE:%.*]]
-; CHECK: for.cond1.for.end_crit_edge:
-; CHECK-NEXT: br label [[FOR_INC3]]
-; CHECK: for.inc3:
-; CHECK-NEXT: [[INC4]] = add nsw i32 [[INC41]], 1
-; CHECK-NEXT: br i1 false, label [[FOR_BODY]], label [[FOR_INC_SPLIT]]
-; CHECK: for.cond.for.end5_crit_edge:
-; CHECK-NEXT: ret void
-;
entry:
br label %for.body
More information about the llvm-commits
mailing list