[llvm] [LoopCacheAnalysis] Fix loop cache cost to always round the cost up to the nearest integer number (PR #88915)

Thu May 2 17:39:16 PDT 2024

https://github.com/Narutoworld updated https://github.com/llvm/llvm-project/pull/88915

>From 7227448c07da476fc7d63c834518af2eff4b2c8d Mon Sep 17 00:00:00 2001
From: RouzbehPaktinat <rouzbeh.paktinat1 at huawei.com>
Date: Wed, 24 Apr 2024 10:31:37 -0400
Subject: [PATCH] [LoopCacheAnalysis] Fix loop cache cost to always round the
 cost up to the nearest integer number.

Currently loop cache analysis uses following formula to evaluate cost of an RefGroup for a consecutive memory access: "RefCost=(TripCount*Stride)/CLS". When the cost is fractional, it's always rounded to the smaller integer number but this is problematic specially when the cost is calculated to be a number less than one which will be rounded to zero. But it makes more sense to say one cache line is used in this case rather than zero cache lines.

This patch fixes the problem by always rounding the cost up to the nearest larger integer number.
---
 llvm/lib/Analysis/LoopCacheAnalysis.cpp       |  7 +-
 .../PowerPC/LoopnestFixedSize.ll              |  6 +-
 .../LoopCacheAnalysis/PowerPC/compute-cost.ll |  8 +-
 .../LoopCacheAnalysis/PowerPC/loads-store.ll  |  4 +-
 .../LoopCacheAnalysis/PowerPC/matmul.ll       |  4 +-
 .../LoopCacheAnalysis/PowerPC/matvecmul.ll    |  4 +-
 .../LoopCacheAnalysis/PowerPC/multi-store.ll  |  2 +-
 .../LoopCacheAnalysis/PowerPC/single-store.ll |  4 +-
 .../LoopCacheAnalysis/PowerPC/stencil.ll      |  4 +-
 .../LoopCacheAnalysis/compute-cost.ll         | 19 ++--
 .../interchange-cost-beneficial.ll            | 62 +++++++++++++
 .../pr43176-move-to-new-latch.ll              | 93 +++++--------------
 12 files changed, 122 insertions(+), 95 deletions(-)
 create mode 100644 llvm/test/Analysis/LoopCacheAnalysis/interchange-cost-beneficial.ll

diff --git a/llvm/lib/Analysis/LoopCacheAnalysis.cpp b/llvm/lib/Analysis/LoopCacheAnalysis.cpp
index 284d8d16d264e9..7ca9f15ad5fca0 100644
--- a/llvm/lib/Analysis/LoopCacheAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopCacheAnalysis.cpp
@@ -299,7 +299,12 @@ CacheCostTy IndexedReference::computeRefCost(const Loop &L,
     Stride = SE.getNoopOrAnyExtend(Stride, WiderType);
     TripCount = SE.getNoopOrZeroExtend(TripCount, WiderType);
     const SCEV *Numerator = SE.getMulExpr(Stride, TripCount);
-    RefCost = SE.getUDivExpr(Numerator, CacheLineSize);
+    // Round the fractional cost up to the nearest integer number.
+    // The impact is the most significant when cost is calculated
+    // to be a number less than one, because it makes more sense
+    // to say one cache line is used rather than zero cache line
+    // is used.
+    RefCost = SE.getUDivCeilSCEV(Numerator, CacheLineSize);
 
     LLVM_DEBUG(dbgs().indent(4)
                << "Access is consecutive: RefCost=(TripCount*Stride)/CLS="
diff --git a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/LoopnestFixedSize.ll b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/LoopnestFixedSize.ll
index e15f06843500e5..5209d290c83daa 100644
--- a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/LoopnestFixedSize.ll
+++ b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/LoopnestFixedSize.ll
@@ -7,7 +7,7 @@ target triple = "powerpc64le-unknown-linux-gnu"
 ; The IR is copied from llvm/test/Analysis/DependenceAnalysis/SimpleSIVNoValidityCheckFixedSize.ll
 
 ; CHECK: Loop 'for.body' has cost = 4186116
-; CHECK-NEXT: Loop 'for.body4' has cost = 128898
+; CHECK-NEXT: Loop 'for.body4' has cost = 130944
 
 ;; #define N 1024
 ;; #define M 2048
@@ -49,7 +49,7 @@ for.end13:                                        ; preds = %for.inc11
 
 
 ; CHECK: Loop 'for.body' has cost = 4186116
-; CHECK-NEXT: Loop 'for.body4' has cost = 128898
+; CHECK-NEXT: Loop 'for.body4' has cost = 130944
 
 define void @t2(ptr %a) {
 entry:
@@ -87,7 +87,7 @@ declare ptr @func_with_returned_arg(ptr returned %arg)
 ; CHECK-NEXT: Loop 'for.body4' has cost = 16762927104000000
 ; CHECK-NEXT: Loop 'for.body8' has cost = 130960368000000
 ; CHECK-NEXT: Loop 'for.body12' has cost = 1047682944000
-; CHECK-NEXT: Loop 'for.body16' has cost = 32260032000
+; CHECK-NEXT: Loop 'for.body16' has cost = 32772096000
 
 ;; #define N 128
 ;; #define M 2048
diff --git a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/compute-cost.ll b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/compute-cost.ll
index 87f522c982544e..7275d04c92b472 100644
--- a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/compute-cost.ll
+++ b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/compute-cost.ll
@@ -38,7 +38,7 @@ for.end:                                          ; preds = %for.cond
 
 ; CHECK: Loop 'for.cond' has cost = 100000000
 ; CHECK: Loop 'for.cond1' has cost = 1000000
-; CHECK: Loop 'for.cond5' has cost = 30000
+; CHECK: Loop 'for.cond5' has cost = 40000
 
 @data = external dso_local global [2 x [4 x [18 x i32]]], align 1
 
@@ -118,7 +118,7 @@ for.neg.end:                                          ; preds = %for.neg.cond
 ; access functions. When this is fixed this testcase should have a cost
 ; approximately 2x higher.
 
-; CHECK: Loop 'for.cond2' has cost = 2560
+; CHECK: Loop 'for.cond2' has cost = 2561
 define void @Test2(ptr %B) {
 entry:
   br label %for.cond2
@@ -148,7 +148,7 @@ for.end:                                          ; preds = %for.cond
 ;   for (i = 40960; i > 0; i--)
 ;     C[i] = C[i];
 
-; CHECK: Loop 'for.cond3' has cost = 2560
+; CHECK: Loop 'for.cond3' has cost = 2561
 define void @Test3(ptr %C) {
 entry:
   br label %for.cond3
@@ -177,7 +177,7 @@ for.end:                                          ; preds = %for.cond
 ;  for (i = 0; i < 40960; i++)
 ;     D[i] = D[i];
 
-; CHECK: Loop 'for.cond4' has cost = 2560
+; CHECK: Loop 'for.cond4' has cost = 2561
 define void @Test4(ptr %D) {
 entry:
   br label %for.cond4
diff --git a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/loads-store.ll b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/loads-store.ll
index 39fe382a41196a..efb1d907605a82 100644
--- a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/loads-store.ll
+++ b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/loads-store.ll
@@ -11,8 +11,8 @@ target triple = "powerpc64le-unknown-linux-gnu"
 ; }
 
 ; CHECK: Loop 'for.i' has cost = 3000000
-; CHECK-NEXT: Loop 'for.k' has cost = 2030000
-; CHECK-NEXT: Loop 'for.j' has cost = 1060000
+; CHECK-NEXT: Loop 'for.k' has cost = 2040000
+; CHECK-NEXT: Loop 'for.j' has cost = 1080000
 
 define void @foo(i64 %n, i64 %m, i64 %o, ptr %A, ptr %B, ptr %C) {
 entry:
diff --git a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/matmul.ll b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/matmul.ll
index 9538c3c93538aa..0e8a25ffb1cac5 100644
--- a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/matmul.ll
+++ b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/matmul.ll
@@ -11,8 +11,8 @@ target triple = "powerpc64le-unknown-linux-gnu"
 ; }
 
 ; CHECK:Loop 'for.i' has cost = 2010000
-; CHECK-NEXT:Loop 'for.k' has cost = 1040000
-; CHECK-NEXT:Loop 'for.j' has cost = 70000
+; CHECK-NEXT:Loop 'for.k' has cost = 1050000
+; CHECK-NEXT:Loop 'for.j' has cost = 90000
     
 define void @matmul(i64 %n, i64 %m, i64 %o, ptr %A, ptr %B, ptr %C) {
 entry:
diff --git a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/matvecmul.ll b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/matvecmul.ll
index 7bbbe43f5a2fcd..bf5425881ce3b9 100644
--- a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/matvecmul.ll
+++ b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/matvecmul.ll
@@ -17,8 +17,8 @@ target triple = "powerpc64le-unknown-linux-gnu"
 ; CHECK: Loop 'k_loop' has cost = 10200000000000000
 ; CHECK-NEXT: Loop 'j_loop' has cost = 102000000000000
 ; CHECK-NEXT: Loop 'i_loop' has cost = 1020000000000
-; CHECK-NEXT: Loop 'm_loop' has cost = 10700000000
-; CHECK-NEXT: Loop 'l_loop' has cost = 1300000000
+; CHECK-NEXT: Loop 'm_loop' has cost = 10800000000
+; CHECK-NEXT: Loop 'l_loop' has cost = 1500000000
 
 %_elem_type_of_double = type <{ double }>
 
diff --git a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/multi-store.ll b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/multi-store.ll
index 63425c7ecef40d..b6c2497d45b9b6 100644
--- a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/multi-store.ll
+++ b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/multi-store.ll
@@ -5,7 +5,7 @@ target triple = "powerpc64le-unknown-linux-gnu"
 
 ; CHECK: Loop 'for.j' has cost = 201000000
 ; CHECK-NEXT: Loop 'for.i' has cost = 102000000
-; CHECK-NEXT: Loop 'for.k' has cost = 90000
+; CHECK-NEXT: Loop 'for.k' has cost = 120000
 
 ;; Test to make sure when we have multiple conflicting access patterns, the 
 ;; chosen loop configuration favours the majority of those accesses. 
diff --git a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/single-store.ll b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/single-store.ll
index f583822579cf90..9aa048489bd382 100644
--- a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/single-store.ll
+++ b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/single-store.ll
@@ -12,7 +12,7 @@ target triple = "powerpc64le-unknown-linux-gnu"
 
 ; CHECK: Loop 'for.i' has cost = 100000000
 ; CHECK-NEXT: Loop 'for.j' has cost = 1000000
-; CHECK-NEXT: Loop 'for.k' has cost = 60000
+; CHECK-NEXT: Loop 'for.k' has cost = 70000
 
 define void @foo(i64 %n, i64 %m, i64 %o, ptr %A) {
 entry:
@@ -90,7 +90,7 @@ for.end:                                          ; preds = %for.end.loopexit, %
 
 ; CHECK: Loop 'for.i' has cost = 100000000
 ; CHECK-NEXT: Loop 'for.j' has cost = 1000000
-; CHECK-NEXT: Loop 'for.k' has cost = 60000
+; CHECK-NEXT: Loop 'for.k' has cost = 70000
 
 define void @foo2(i64 %n, i64 %m, i64 %o, ptr %A) {
 entry:
diff --git a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/stencil.ll b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/stencil.ll
index b79a47aed1ef08..a4be5ba5dbf0b3 100644
--- a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/stencil.ll
+++ b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/stencil.ll
@@ -11,8 +11,8 @@ target triple = "powerpc64le-unknown-linux-gnu"
 ;     }   
 ; }
 
-; CHECK: Loop 'for.i' has cost = 20600
-; CHECK-NEXT: Loop 'for.j' has cost = 800
+; CHECK: Loop 'for.i' has cost = 20800
+; CHECK-NEXT: Loop 'for.j' has cost = 1000
 
 define void @foo(i64 %n, i64 %m, ptr %A, ptr %B, ptr %C) {
 entry:
diff --git a/llvm/test/Analysis/LoopCacheAnalysis/compute-cost.ll b/llvm/test/Analysis/LoopCacheAnalysis/compute-cost.ll
index d979645bef579c..205cd851fce0de 100644
--- a/llvm/test/Analysis/LoopCacheAnalysis/compute-cost.ll
+++ b/llvm/test/Analysis/LoopCacheAnalysis/compute-cost.ll
@@ -8,6 +8,9 @@
 ; Check IndexedReference::computeRefCost can handle type differences between
 ; Stride and TripCount
 
+; Round costs up to the nearest whole number i.e. in 'for.cond5' cost is calculated 12.5 and
+; it makes more sense to say 13 cache lines are used rather than 12 cache lines.
+
 ; SMALLER-CACHELINE: Loop 'for.cond' has cost = 256
 ; LARGER-CACHELINE: Loop 'for.cond' has cost = 32
 %struct._Handleitem = type { ptr }
@@ -40,10 +43,10 @@ for.end:                                          ; preds = %for.cond
 
 ; SMALLER-CACHELINE: Loop 'for.cond' has cost = 100000000
 ; SMALLER-CACHELINE: Loop 'for.cond1' has cost = 1000000
-; SMALLER-CACHELINE: Loop 'for.cond5' has cost = 120000
+; SMALLER-CACHELINE: Loop 'for.cond5' has cost = 130000
 ; LARGER-CACHELINE: Loop 'for.cond' has cost = 100000000
 ; LARGER-CACHELINE: Loop 'for.cond1' has cost = 1000000
-; LARGER-CACHELINE: Loop 'for.cond5' has cost = 10000
+; LARGER-CACHELINE: Loop 'for.cond5' has cost = 20000
 @data = external dso_local global [2 x [4 x [18 x i32]]], align 1
 
 define dso_local void @handle_to_ptr_2(i1 %b0, i1 %b1, i1 %b2) {
@@ -122,8 +125,8 @@ for.neg.end:                                          ; preds = %for.neg.cond
 ; access functions. When this is fixed this testcase should have a cost
 ; approximately 2x higher.
 
-; SMALLER-CACHELINE: Loop 'for.cond2' has cost = 10240
-; LARGER-CACHELINE: Loop 'for.cond2' has cost = 1280
+; SMALLER-CACHELINE: Loop 'for.cond2' has cost = 10241
+; LARGER-CACHELINE: Loop 'for.cond2' has cost = 1281
 define void @Test2(ptr %B) {
 entry:
   br label %for.cond2
@@ -153,8 +156,8 @@ for.end:                                          ; preds = %for.cond
 ;   for (i = 40960; i > 0; i--)
 ;     C[i] = C[i];
 
-; SMALLER-CACHELINE: Loop 'for.cond3' has cost = 10240
-; LARGER-CACHELINE: Loop 'for.cond3' has cost = 1280
+; SMALLER-CACHELINE: Loop 'for.cond3' has cost = 10241
+; LARGER-CACHELINE: Loop 'for.cond3' has cost = 1281
 define void @Test3(ptr %C) {
 entry:
   br label %for.cond3
@@ -183,8 +186,8 @@ for.end:                                          ; preds = %for.cond
 ;  for (i = 0; i < 40960; i++)
 ;     D[i] = D[i];
 
-; SMALLER-CACHELINE: Loop 'for.cond4' has cost = 10240
-; LARGER-CACHELINE: Loop 'for.cond4' has cost = 1280
+; SMALLER-CACHELINE: Loop 'for.cond4' has cost = 10241
+; LARGER-CACHELINE: Loop 'for.cond4' has cost = 1281
 define void @Test4(ptr %D) {
 entry:
   br label %for.cond4
diff --git a/llvm/test/Analysis/LoopCacheAnalysis/interchange-cost-beneficial.ll b/llvm/test/Analysis/LoopCacheAnalysis/interchange-cost-beneficial.ll
new file mode 100644
index 00000000000000..3086224c582048
--- /dev/null
+++ b/llvm/test/Analysis/LoopCacheAnalysis/interchange-cost-beneficial.ll
@@ -0,0 +1,62 @@
+; RUN: opt <  %s  -cache-line-size=64 -passes='print<loop-cache-cost>' -disable-output 2>&1 | FileCheck  %s
+
+;; This test checks the effect of rounding cache cost to 1 when it is 
+;; evaluated to 0 because at least 1 cache line is accessed by the loopnest.
+;; It does not make sense to output that zero cache lines are used.
+;; The cost of reference group for B[j], C[j], D[j] and E[j] were 
+;; calculted 0 before but now they are 1 which makes each loop cost more reasonable.
+;
+; void test(int n, int m, int o, int A[2][3], int B[2], int C[2], int D[2], int E[2]) {
+;   for (int i = 0; i < 3; i++)
+;     for (int j = 0; j < 2; j++)
+;        A[j][i] = 1;
+;        B[j] = 1;
+;        C[j] = 1;
+;        D[j] = 1
+;        E[j] = 1
+; }
+
+; CHECK: Loop 'for.j' has cost = 18
+; CHECK-NEXT: Loop 'for.i' has cost = 10
+
+define void @test(ptr %A, ptr %B, ptr %C, ptr %D, ptr %E) {
+
+entry:
+  br label %for.i.preheader.split
+
+for.i.preheader.split:                            ; preds = %for.i.preheader
+  br label %for.i
+
+for.i:                                            ; preds = %for.inci, %for.i.preheader.split
+  %i = phi i64 [ %inci, %for.inci ], [ 0, %for.i.preheader.split ]
+  br label %for.j
+
+for.j:                                            ; preds = %for.incj, %for.i
+  %j = phi i64 [ %incj, %for.j ], [ 0, %for.i ]
+  %mul_j = mul nsw i64 %j, 3
+  %index_j = add i64 %mul_j, %i
+  %arrayidxA = getelementptr inbounds [2 x [ 3 x i32]], ptr %A, i64 %j, i64 %i
+  store i32 1, ptr %arrayidxA, align 4
+  %arrayidxB = getelementptr inbounds i32, ptr %B, i64 %j
+  store i32 1, ptr %arrayidxB, align 4
+  %arrayidxC = getelementptr inbounds i32, ptr %C, i64 %j
+  store i32 1, ptr %arrayidxC, align 4
+  %arrayidxD = getelementptr inbounds i32, ptr %D, i64 %j
+  store i32 1, ptr %arrayidxD, align 4
+  %arrayidxE = getelementptr inbounds i32, ptr %E, i64 %j
+  store i32 1, ptr %arrayidxE, align 4
+  %incj = add nsw i64 %j, 1
+  %exitcond.us = icmp eq i64 %incj, 2
+  br i1 %exitcond.us, label %for.inci, label %for.j
+
+for.inci:                                         ; preds = %for.incj
+  %inci = add nsw i64 %i, 1
+  %exitcond55.us = icmp eq i64 %inci, 3
+  br i1 %exitcond55.us, label %for.end.loopexit, label %for.i
+
+for.end.loopexit:                                 ; preds = %for.inci
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %for.cond1.preheader.lr.ph, %entry
+  ret void
+}
diff --git a/llvm/test/Transforms/LoopInterchange/pr43176-move-to-new-latch.ll b/llvm/test/Transforms/LoopInterchange/pr43176-move-to-new-latch.ll
index 965d95110da466..cc787fa55600a6 100644
--- a/llvm/test/Transforms/LoopInterchange/pr43176-move-to-new-latch.ll
+++ b/llvm/test/Transforms/LoopInterchange/pr43176-move-to-new-latch.ll
@@ -1,42 +1,25 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes=loop-interchange -cache-line-size=64 -verify-loop-lcssa -verify-dom-info -S %s | FileCheck %s
+; RUN: opt < %s -passes=loop-interchange -cache-line-size=64 -pass-remarks-missed='loop-interchange' -pass-remarks-output=%t -S
+; RUN: FileCheck --input-file=%t %s
 
 @b = external dso_local global [5 x i32], align 16
 
+;; Not profitable to interchange, because the access is invariant to j loop.
+;;
+;;  for(int i=0;i<4;i++) {
+;;    for(int j=1;j<4;j++) {
+;;      b[i] = ....
+;;    }
+;; }
+
+; CHECK: --- !Missed
+; CHECK-NEXT: Pass:            loop-interchange
+; CHECK-NEXT: Name:            InterchangeNotProfitable
+; CHECK-NEXT: Function:        test1
+; CHECK-NEXT: Args:
+; CHECK-NEXT:  - String:          Interchanging loops is not considered to improve cache locality nor vectorization.
+
 define void @test1() {
-; CHECK-LABEL: @test1(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br label [[FOR_BODY2_PREHEADER:%.*]]
-; CHECK:       for.body.preheader:
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INC41:%.*]] = phi i32 [ [[INC4:%.*]], [[FOR_INC3:%.*]] ], [ undef, [[FOR_BODY_PREHEADER:%.*]] ]
-; CHECK-NEXT:    [[IDXPROM:%.*]] = sext i32 [[INC41]] to i64
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [5 x i32], ptr @b, i64 0, i64 [[IDXPROM]]
-; CHECK-NEXT:    br label [[FOR_INC:%.*]]
-; CHECK:       for.body2.preheader:
-; CHECK-NEXT:    br label [[FOR_BODY2:%.*]]
-; CHECK:       for.body2:
-; CHECK-NEXT:    [[LSR_IV:%.*]] = phi i32 [ [[TMP1:%.*]], [[FOR_INC_SPLIT:%.*]] ], [ 1, [[FOR_BODY2_PREHEADER]] ]
-; CHECK-NEXT:    br label [[FOR_BODY_PREHEADER]]
-; CHECK:       for.inc:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT:    store i32 undef, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[LSR_IV]], 4
-; CHECK-NEXT:    [[LSR_IV_NEXT:%.*]] = add nuw nsw i32 [[LSR_IV]], 1
-; CHECK-NEXT:    br label [[FOR_COND1_FOR_END_CRIT_EDGE:%.*]]
-; CHECK:       for.inc.split:
-; CHECK-NEXT:    [[TMP1]] = add nuw nsw i32 [[LSR_IV]], 1
-; CHECK-NEXT:    [[TMP2:%.*]] = icmp slt i32 [[LSR_IV]], 4
-; CHECK-NEXT:    br i1 [[TMP2]], label [[FOR_BODY2]], label [[FOR_COND_FOR_END5_CRIT_EDGE:%.*]]
-; CHECK:       for.cond1.for.end_crit_edge:
-; CHECK-NEXT:    br label [[FOR_INC3]]
-; CHECK:       for.inc3:
-; CHECK-NEXT:    [[INC4]] = add nsw i32 [[INC41]], 1
-; CHECK-NEXT:    br i1 false, label [[FOR_BODY]], label [[FOR_INC_SPLIT]]
-; CHECK:       for.cond.for.end5_crit_edge:
-; CHECK-NEXT:    ret void
-;
 entry:
   br label %for.body
 
@@ -68,41 +51,15 @@ for.cond.for.end5_crit_edge:                      ; preds = %for.inc3
   ret void
 }
 
+
+; CHECK: --- !Missed
+; CHECK-NEXT: Pass:            loop-interchange
+; CHECK-NEXT: Name:            InterchangeNotProfitable
+; CHECK-NEXT: Function:        test2
+; CHECK-NEXT: Args:
+; CHECK-NEXT:  - String:          Interchanging loops is not considered to improve cache locality nor vectorization.
+
 define void @test2() {
-; CHECK-LABEL: @test2(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br label [[FOR_BODY2_PREHEADER:%.*]]
-; CHECK:       for.body.preheader:
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INC41:%.*]] = phi i32 [ [[INC4:%.*]], [[FOR_INC3:%.*]] ], [ undef, [[FOR_BODY_PREHEADER:%.*]] ]
-; CHECK-NEXT:    [[IDXPROM:%.*]] = sext i32 [[INC41]] to i64
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [5 x i32], ptr @b, i64 0, i64 [[IDXPROM]]
-; CHECK-NEXT:    br label [[FOR_INC:%.*]]
-; CHECK:       for.body2.preheader:
-; CHECK-NEXT:    br label [[FOR_BODY2:%.*]]
-; CHECK:       for.body2:
-; CHECK-NEXT:    [[LSR_IV:%.*]] = phi i32 [ [[TMP1:%.*]], [[FOR_INC_SPLIT:%.*]] ], [ 1, [[FOR_BODY2_PREHEADER]] ]
-; CHECK-NEXT:    br label [[FOR_BODY_PREHEADER]]
-; CHECK:       for.inc:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[LSR_IV]], 4
-; CHECK-NEXT:    [[CMP_ZEXT:%.*]] = zext i1 [[CMP]] to i32
-; CHECK-NEXT:    store i32 [[CMP_ZEXT]], ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[LSR_IV_NEXT:%.*]] = add nuw nsw i32 [[LSR_IV]], 1
-; CHECK-NEXT:    br label [[FOR_COND1_FOR_END_CRIT_EDGE:%.*]]
-; CHECK:       for.inc.split:
-; CHECK-NEXT:    [[TMP1]] = add nuw nsw i32 [[LSR_IV]], 1
-; CHECK-NEXT:    [[TMP2:%.*]] = icmp slt i32 [[LSR_IV]], 4
-; CHECK-NEXT:    br i1 [[TMP2]], label [[FOR_BODY2]], label [[FOR_COND_FOR_END5_CRIT_EDGE:%.*]]
-; CHECK:       for.cond1.for.end_crit_edge:
-; CHECK-NEXT:    br label [[FOR_INC3]]
-; CHECK:       for.inc3:
-; CHECK-NEXT:    [[INC4]] = add nsw i32 [[INC41]], 1
-; CHECK-NEXT:    br i1 false, label [[FOR_BODY]], label [[FOR_INC_SPLIT]]
-; CHECK:       for.cond.for.end5_crit_edge:
-; CHECK-NEXT:    ret void
-;
 entry:
   br label %for.body