[llvm] [LoopVectorize][NFC] Rewrite tests to check output of vplan cost model (PR #113697)

Thu Oct 31 03:11:05 PDT 2024

https://github.com/david-arm updated https://github.com/llvm/llvm-project/pull/113697

>From ff9ba7c18c488f98b3e802966eb08f1ca4d335bf Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood at arm.com>
Date: Fri, 25 Oct 2024 14:36:43 +0000
Subject: [PATCH 1/2] [LoopVectorize][NFC] Rewrite tests to check output of
 vplan cost model

Currently it's not possible to improve the cost model for tail-folded
loops because as soon as you add a VPInstruction::computeCost function
that adds the costs of instructions such as VPInstruction::ActiveLaneMask
and VPInstruction::ExplicitVectorLength the assert in
LoopVectorizationPlanner::computeBestVF fails for some tests. This is
because the VF chosen by the legacy cost model doesn't match the vplan
cost model. See PR #90191. This assert is currently inhibiting attempts
to improve the cost model.

I would like to remove the assert since we've been using the vplan
cost model for 2 months now and that feels long enough to me. However,
in order to do that we have to fix up a whole bunch of tests that rely
upon the legacy cost model output. I've tried my best to update
these tests to use vplan output instead.

There are still a whole bunch of vectoriser tests in

Analysis/CostModel/X86

that depend upon the legacy cost model, and also I feel they shouldn't
really live there either. These can be fixed up in a separate patch!
---
 .../AArch64/aarch64-predication.ll            |   2 +-
 .../extractvalue-no-scalarization-required.ll |  19 +-
 .../LoopVectorize/AArch64/intrinsiccost.ll    |  14 +-
 .../LoopVectorize/AArch64/masked-op-cost.ll   |   4 +-
 .../AArch64/maximize-bandwidth-invalidate.ll  |   8 +-
 .../AArch64/no_vector_instructions.ll         |   2 -
 .../scalable-fp-ext-trunc-illegal-type.ll     |   2 +-
 .../scalable-vectorization-cost-tuning.ll     |  47 +--
 .../LoopVectorize/AArch64/select-costs.ll     |  22 +-
 .../AArch64/sve-inductions-unusual-types.ll   |  12 +-
 .../AArch64/sve-strict-fadd-cost.ll           |  20 +-
 .../AArch64/type-shrinkage-zext-costs.ll      |  34 +-
 .../LoopVectorize/ARM/mve-icmpcost.ll         | 309 ++++++++++--------
 .../LoopVectorize/ARM/mve-saddsatcost.ll      |   8 +-
 .../LoopVectorize/ARM/mve-selectandorcost.ll  |   4 +-
 .../LoopVectorize/ARM/mve-shiftcost.ll        |   4 +-
 .../LoopVectorize/RISCV/force-vect-msg.ll     |   2 +-
 .../SystemZ/mem-interleaving-costs-02.ll      |  31 +-
 .../Transforms/LoopVectorize/X86/fneg-cost.ll |   4 +-
 .../LoopVectorize/X86/mul_slm_16bit.ll        |  28 +-
 .../LoopVectorize/X86/reduction-small-size.ll |  37 ++-
 .../LoopVectorize/X86/redundant-vf2-cost.ll   |  10 +-
 .../X86/uint64_to_fp64-cost-model.ll          |   4 +-
 .../LoopVectorize/X86/uniformshift.ll         |   4 +-
 .../X86/vector-scalar-select-cost.ll          |   8 +-
 25 files changed, 362 insertions(+), 277 deletions(-)

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll b/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll
index dd3d4215a3f63a..e6d93ea192e56b 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll
@@ -13,7 +13,7 @@ target triple = "aarch64--linux-gnu"
 ; %var4 a lower scalarization overhead.
 ;
 ; COST-LABEL:  predicated_udiv_scalarized_operand
-; COST:        LV: Found an estimated cost of 5 for VF 2 For instruction: %var4 = udiv i64 %var2, %var3
+; COST:        Cost of 5 for VF 2: profitable to scalarize   %var4 = udiv i64 %var2, %var3
 ;
 ;
 define i64 @predicated_udiv_scalarized_operand(ptr %a, i64 %x) optsize {
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll b/llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll
index 28830f9bcd11e1..aa78113ebaa48c 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll
@@ -11,9 +11,14 @@
 ; CM: LV: Found uniform instruction:   %a = extractvalue { i64, i64 } %sv, 0
 ; CM: LV: Found uniform instruction:   %b = extractvalue { i64, i64 } %sv, 1
 
+; Ensure the extractvalue + add instructions are hoisted out
+; CM: vector.ph:
+; CM:  CLONE ir<%a> = extractvalue ir<%sv>
+; CM:  CLONE ir<%b> = extractvalue ir<%sv>
+; CM:  WIDEN ir<%add> = add ir<%a>, ir<%b>
+; CM:  Successor(s): vector loop
+
 ; CM: LV: Scalar loop costs: 5.
-; CM: LV: Found an estimated cost of 0 for VF 2 For instruction:   %a = extractvalue { i64, i64 } %sv, 0
-; CM-NEXT: LV: Found an estimated cost of 0 for VF 2 For instruction:   %b = extractvalue { i64, i64 } %sv, 1
 
 ; Check that the extractvalue operands are actually free in vector code.
 
@@ -58,12 +63,14 @@ exit:
 ; Similar to the test case above, but checks getVectorCallCost as well.
 declare float @powf(float, float) readnone nounwind
 
-; CM: LV: Found uniform instruction:   %a = extractvalue { float, float } %sv, 0
-; CM: LV: Found uniform instruction:   %b = extractvalue { float, float } %sv, 1
+; Ensure the extractvalue + add instructions are hoisted out
+; CM: vector.ph:
+; CM:  CLONE ir<%a> = extractvalue ir<%sv>
+; CM:  CLONE ir<%b> = extractvalue ir<%sv>
+; CM:  WIDEN ir<%add> = add ir<%a>, ir<%b>
+; CM:  Successor(s): vector loop
 
 ; CM: LV: Scalar loop costs: 14.
-; CM: LV: Found an estimated cost of 0 for VF 2 For instruction:   %a = extractvalue { float, float } %sv, 0
-; CM-NEXT: LV: Found an estimated cost of 0 for VF 2 For instruction:   %b = extractvalue { float, float } %sv, 1
 
 ; FORCED-LABEL: define void @test_getVectorCallCost
 
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll
index c9990487665b98..064eead452852d 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll
@@ -8,9 +8,9 @@ target triple = "aarch64--linux-gnu"
 
 ; CHECK-COST-LABEL: sadd
 ; CHECK-COST: Found an estimated cost of 6 for VF 1 For instruction:   %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset)
-; CHECK-COST: Found an estimated cost of 4 for VF 2 For instruction:   %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset)
-; CHECK-COST: Found an estimated cost of 1 for VF 4 For instruction:   %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset)
-; CHECK-COST: Found an estimated cost of 1 for VF 8 For instruction:   %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset)
+; CHECK-COST: Cost of 4 for VF 2: WIDEN-INTRINSIC ir<%1> = call llvm.sadd.sat(ir<%0>, ir<%offset>)
+; CHECK-COST: Cost of 1 for VF 4: WIDEN-INTRINSIC ir<%1> = call llvm.sadd.sat(ir<%0>, ir<%offset>)
+; CHECK-COST: Cost of 1 for VF 8: WIDEN-INTRINSIC ir<%1> = call llvm.sadd.sat(ir<%0>, ir<%offset>)
 
 define void @saddsat(ptr nocapture readonly %pSrc, i16 signext %offset, ptr nocapture noalias %pDst, i32 %blockSize) #0 {
 ; CHECK-LABEL: @saddsat(
@@ -95,10 +95,10 @@ while.end:                                        ; preds = %while.body, %entry
 
 ; CHECK-COST-LABEL: umin
 ; CHECK-COST: Found an estimated cost of 2 for VF 1 For instruction:   %1 = tail call i8 @llvm.umin.i8(i8 %0, i8 %offset)
-; CHECK-COST: Found an estimated cost of 1 for VF 2 For instruction:   %1 = tail call i8 @llvm.umin.i8(i8 %0, i8 %offset)
-; CHECK-COST: Found an estimated cost of 1 for VF 4 For instruction:   %1 = tail call i8 @llvm.umin.i8(i8 %0, i8 %offset)
-; CHECK-COST: Found an estimated cost of 1 for VF 8 For instruction:   %1 = tail call i8 @llvm.umin.i8(i8 %0, i8 %offset)
-; CHECK-COST: Found an estimated cost of 1 for VF 16 For instruction:   %1 = tail call i8 @llvm.umin.i8(i8 %0, i8 %offset)
+; CHECK-COST: Cost of 1 for VF 2: WIDEN-INTRINSIC ir<%1> = call llvm.umin(ir<%0>, ir<%offset>)
+; CHECK-COST: Cost of 1 for VF 4: WIDEN-INTRINSIC ir<%1> = call llvm.umin(ir<%0>, ir<%offset>)
+; CHECK-COST: Cost of 1 for VF 8: WIDEN-INTRINSIC ir<%1> = call llvm.umin(ir<%0>, ir<%offset>)
+; CHECK-COST: Cost of 1 for VF 16: WIDEN-INTRINSIC ir<%1> = call llvm.umin(ir<%0>, ir<%offset>)
 
 define void @umin(ptr nocapture readonly %pSrc, i8 signext %offset, ptr nocapture noalias %pDst, i32 %blockSize) #0 {
 ; CHECK-LABEL: @umin(
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll
index 6b4cfa091c45e7..93bc131ee5c5a4 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll
@@ -5,8 +5,8 @@
 target triple = "aarch64-unknown-linux-gnu"
 
 ; CHECK-COST: Checking a loop in 'fixed_width'
-; CHECK-COST: Found an estimated cost of 10 for VF 2 For instruction:   store i32 2, ptr %arrayidx1, align 4
-; CHECK-COST: Found an estimated cost of 20 for VF 4 For instruction:   store i32 2, ptr %arrayidx1, align 4
+; CHECK-COST: Cost of 10 for VF 2: WIDEN store vp<%6>, ir<2>, vp<%5>
+; CHECK-COST: Cost of 20 for VF 4: WIDEN store vp<%6>, ir<2>, vp<%5>
 ; CHECK-COST: Selecting VF: 1.
 
 ; We should decide this loop is not worth vectorising using fixed width vectors
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/maximize-bandwidth-invalidate.ll b/llvm/test/Transforms/LoopVectorize/AArch64/maximize-bandwidth-invalidate.ll
index 5b90456a4f458b..05e0b7ca7520f2 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/maximize-bandwidth-invalidate.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/maximize-bandwidth-invalidate.ll
@@ -10,10 +10,10 @@ target triple = "aarch64"
 ; due to invalid cost decisions. The loop below has a low maximum trip count,
 ; so will be masked.
 
-; COST: LV: Found an estimated cost of 3000000 for VF 2 For instruction:   %0 = load
-; COST: LV: Found an estimated cost of 3000000 for VF 4 For instruction:   %0 = load
-; COST: LV: Found an estimated cost of 3000000 for VF 8 For instruction:   %0 = load
-; COST: LV: Found an estimated cost of 3000000 for VF 16 For instruction:   %0 = load
+; COST: Cost of 3000000 for VF 2: REPLICATE ir<%0> = load
+; COST: Cost of 3000000 for VF 4: REPLICATE ir<%0> = load
+; COST: Cost of 3000000 for VF 8: REPLICATE ir<%0> = load
+; COST: Cost of 3000000 for VF 16: REPLICATE ir<%0> = load
 ; COST: LV: Selecting VF: 1.
 
 define i32 @test(ptr nocapture noundef readonly %pInVec, ptr nocapture noundef readonly %pInA1, ptr nocapture noundef readonly %pInA2, ptr nocapture noundef readonly %pInA3, ptr nocapture noundef readonly %pInA4, i32 noundef %numCols) {
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/no_vector_instructions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/no_vector_instructions.ll
index 785241d342ddc5..089d279d152455 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/no_vector_instructions.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/no_vector_instructions.ll
@@ -6,7 +6,6 @@ target triple = "aarch64--linux-gnu"
 
 ; CHECK-LABEL: all_scalar
 ; CHECK:       LV: Found scalar instruction: %i.next = add nuw nsw i64 %i, 2
-; CHECK:       LV: Found an estimated cost of 1 for VF 2 For instruction: %i.next = add nuw nsw i64 %i, 2
 ; CHECK:       LV: Not considering vector loop of width 2 because it will not generate any vector instructions
 ;
 define void @all_scalar(ptr %a, i64 %n) {
@@ -27,7 +26,6 @@ for.end:
 
 ; CHECK-LABEL: PR33193
 ; CHECK:       LV: Found scalar instruction: %i.next = zext i32 %j.next to i64
-; CHECK:       LV: Found an estimated cost of 0 for VF 8 For instruction: %i.next = zext i32 %j.next to i64
 ; CHECK:       LV: Not considering vector loop of width 8 because it will not generate any vector instructions
 %struct.a = type { i32, i8 }
 define void @PR33193(ptr %a, i64 %n) {
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-fp-ext-trunc-illegal-type.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-fp-ext-trunc-illegal-type.ll
index 92b043a9c29d50..fb5ff66989a679 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-fp-ext-trunc-illegal-type.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-fp-ext-trunc-illegal-type.ll
@@ -9,7 +9,7 @@ target triple = "aarch64-unknown-linux-gnu"
 ;; registers required for a <vscale x 4 x fp128> when trying to maximize
 ;; vector bandwidth with SVE.
 
-; CHECK: LV: Found an estimated cost of Invalid for VF vscale x 2 For instruction: %load.ext = fpext double %load.in to fp128
+; CHECK: Cost of Invalid for VF vscale x 2: WIDEN-CAST ir<%load.ext> = fpext  ir<%load.in> to fp128
 
 define void @load_ext_trunc_store(ptr readonly %in, ptr noalias %out, i64 %N) {
 ; CHECK-LABEL: define void @load_ext_trunc_store(
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll
index f28f77bf1b1558..36c35d5a154d94 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll
@@ -1,49 +1,55 @@
 ; REQUIRES: asserts
 ; RUN: opt -mtriple=aarch64 -mattr=+sve \
 ; RUN:     -force-target-instruction-cost=1 -passes=loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \
-; RUN:     | FileCheck %s --check-prefixes=GENERIC,VF-VSCALE4
+; RUN:     | FileCheck %s --check-prefixes=GENERIC,VF-VSCALE16
 
 ; RUN: opt -mtriple=aarch64 -mattr=+sve -mcpu=generic \
 ; RUN:     -force-target-instruction-cost=1 -passes=loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \
-; RUN:     | FileCheck %s --check-prefixes=GENERIC,VF-VSCALE4
+; RUN:     | FileCheck %s --check-prefixes=GENERIC,VF-VSCALE16
 
 ; RUN: opt -mtriple=aarch64 -mcpu=neoverse-v1 \
 ; RUN:     -force-target-instruction-cost=1 -passes=loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \
-; RUN:     | FileCheck %s --check-prefixes=NEOVERSE-V1,VF-VSCALE4
+; RUN:     | FileCheck %s --check-prefixes=NEOVERSE-V1,VF-VSCALE16
 
 ; RUN: opt -mtriple=aarch64 -mcpu=neoverse-n2 \
 ; RUN:     -force-target-instruction-cost=1 -passes=loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \
-; RUN:     | FileCheck %s --check-prefixes=NEOVERSE-N2,VF-VSCALE4
+; RUN:     | FileCheck %s --check-prefixes=NEOVERSE-N2,VF-VSCALE16
 
-; RUN: opt -mtriple=aarch64 -mcpu=neoverse-n2 \
+; RUN: opt -mtriple=aarch64 -mcpu=neoverse-v2 \
 ; RUN:     -force-target-instruction-cost=1 -passes=loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \
-; RUN:     | FileCheck %s --check-prefixes=NEOVERSE-N2,VF-VSCALE4
+; RUN:     | FileCheck %s --check-prefixes=NEOVERSE-V2,VF-16
+
+; GENERIC: Cost for VF vscale x 2: 11
+; GENERIC: Cost for VF vscale x 4: 11
+; GENERIC: LV: Selecting VF: vscale x 16
 
-; GENERIC: LV: Vector loop of width vscale x 2 costs: 3 (assuming a minimum vscale of 2).
-; GENERIC: LV: Vector loop of width vscale x 4 costs: 1 (assuming a minimum vscale of 2).
+; NEOVERSE-V1: Cost for VF vscale x 2: 11
+; NEOVERSE-V1: Cost for VF vscale x 4: 11
+; NEOVERSE-V1: LV: Selecting VF: vscale x 16
 
-; NEOVERSE-V1: LV: Vector loop of width vscale x 2 costs: 3 (assuming a minimum vscale of 2).
-; NEOVERSE-V1: LV: Vector loop of width vscale x 4 costs: 1 (assuming a minimum vscale of 2).
+; NEOVERSE-N2: Cost for VF vscale x 2: 11
+; NEOVERSE-N2: Cost for VF vscale x 4: 11
+; NEOVERSE-N2: LV: Selecting VF: vscale x 16
 
-; NEOVERSE-N2: LV: Vector loop of width vscale x 2 costs: 6 (assuming a minimum vscale of 1).
-; NEOVERSE-N2: LV: Vector loop of width vscale x 4 costs: 3 (assuming a minimum vscale of 1).
+; NEOVERSE-V2: Cost for VF vscale x 2: 11
+; NEOVERSE-V2: Cost for VF vscale x 4: 11
+; NEOVERSE-V2: LV: Selecting VF: 16
 
-; VF-4: <4 x i32>
-; VF-VSCALE4: <16 x i32>
+; VF-16: <16 x i8>
+; VF-VSCALE16: <vscale x 16 x i8>
 define void @test0(ptr %a, ptr %b, ptr %c) #0 {
 entry:
   br label %loop
 
 loop:
   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
-  %arrayidx = getelementptr inbounds i32, ptr %c, i64 %iv
-  %0 = load i32, ptr %arrayidx, align 4
+  %arrayidx = getelementptr inbounds i8, ptr %c, i64 %iv
+  %0 = load i8, ptr %arrayidx, align 4
   %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %iv
   %1 = load i8, ptr %arrayidx2, align 4
-  %zext = zext i8 %1 to i32
-  %add = add nsw i32 %zext, %0
-  %arrayidx5 = getelementptr inbounds i32, ptr %a, i64 %iv
-  store i32 %add, ptr %arrayidx5, align 4
+  %add = add nsw i8 %0, %1
+  %arrayidx5 = getelementptr inbounds i8, ptr %a, i64 %iv
+  store i8 %add, ptr %arrayidx5, align 4
   %iv.next = add nuw nsw i64 %iv, 1
   %exitcond.not = icmp eq i64 %iv.next, 1024
   br i1 %exitcond.not, label %exit, label %loop
@@ -51,4 +57,3 @@ loop:
 exit:
   ret void
 }
-
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/select-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/select-costs.ll
index 2bcc93127da1e0..6165a73e77f238 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/select-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/select-costs.ll
@@ -6,13 +6,15 @@ target triple = "arm64-apple-ios5.0.0"
 
 define void @selects_1(ptr nocapture %dst, i32 %A, i32 %B, i32 %C, i32 %N) {
 ; CHECK: LV: Checking a loop in 'selects_1'
-; CHECK: LV: Found an estimated cost of 1 for VF 2 For instruction:   %cond = select i1 %cmp1, i32 10, i32 %and
-; CHECK: LV: Found an estimated cost of 1 for VF 2 For instruction:   %cond6 = select i1 %cmp2, i32 30, i32 %and
-; CHECK: LV: Found an estimated cost of 1 for VF 2 For instruction:   %cond11 = select i1 %cmp7, i32 %cond, i32 %cond6
 
-; CHECK: LV: Found an estimated cost of 1 for VF 4 For instruction:   %cond = select i1 %cmp1, i32 10, i32 %and
-; CHECK: LV: Found an estimated cost of 1 for VF 4 For instruction:   %cond6 = select i1 %cmp2, i32 30, i32 %and
-; CHECK: LV: Found an estimated cost of 1 for VF 4 For instruction:   %cond11 = select i1 %cmp7, i32 %cond, i32 %cond6
+; CHECK: Cost of 1 for VF 2: WIDEN-SELECT ir<%cond> = select ir<%cmp1>, ir<10>, ir<%and>
+; CHECK: Cost of 1 for VF 2: WIDEN-SELECT ir<%cond6> = select ir<%cmp2>, ir<30>, ir<%and>
+; CHECK: Cost of 1 for VF 2: WIDEN-SELECT ir<%cond11> = select ir<%cmp7>, ir<%cond>, ir<%cond6>
+
+; CHECK: Cost of 1 for VF 4: WIDEN-SELECT ir<%cond> = select ir<%cmp1>, ir<10>, ir<%and>
+; CHECK: Cost of 1 for VF 4: WIDEN-SELECT ir<%cond6> = select ir<%cmp2>, ir<30>, ir<%and>
+; CHECK: Cost of 1 for VF 4: WIDEN-SELECT ir<%cond11> = select ir<%cmp7>, ir<%cond>, ir<%cond6>
+
 ; CHECK: LV: Selecting VF: 4
 
 entry:
@@ -48,9 +50,11 @@ for.cond.cleanup:                                 ; preds = %for.cond.cleanup.lo
 
 define i32 @multi_user_cmp(ptr readonly %a, i64 noundef %n) {
 ; CHECK: LV: Checking a loop in 'multi_user_cmp'
-; CHECK: LV: Found an estimated cost of 4 for VF 16 For instruction:   %cmp1 = fcmp olt float %load1, 0.000000e+00
-; CHECK: LV: Found an estimated cost of 1 for VF 16 For instruction:   %.any.0.off0 = select i1 %cmp1, i1 true, i1 %any.0.off09
-; CHECK: LV: Found an estimated cost of 1 for VF 16 For instruction:   %all.off = select i1 %cmp1, i1 %all.off.next, i1 false
+; CHECK: Cost of 1 for VF 16:
+; CHECK:  any-of reduction   %all.off = select i1 %cmp1, i1 %all.off.next, i1 false
+; CHECK: Cost of 1 for VF 16:
+; CHECK:  any-of reduction   %.any.0.off0 = select i1 %cmp1, i1 true, i1 %any.0.off09
+; CHECK: Cost of 4 for VF 16: WIDEN ir<%cmp1> = fcmp olt ir<%load1>, ir<0.000000e+00>
 ; CHECK: LV: Selecting VF: 16.
 entry:
   br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll
index 961fa59cadd360..b418905e514af0 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll
@@ -4,9 +4,9 @@
 
 target triple = "aarch64-unknown-linux-gnu"
 
-; DEBUG: Found an estimated cost of Invalid for VF vscale x 1 For instruction:   %indvars.iv1294 = phi i7 [ %indvars.iv.next1295, %for.body ], [ 0, %entry ]
-; DEBUG: Found an estimated cost of Invalid for VF vscale x 1 For instruction:   %addi7 = add i7 %indvars.iv1294, 0
-; DEBUG: Found an estimated cost of Invalid for VF vscale x 1 For instruction:   %indvars.iv.next1295 = add i7 %indvars.iv1294, 1
+; DEBUG: Cost of Invalid for VF vscale x 1: induction instruction   %indvars.iv.next1295 = add i7 %indvars.iv1294, 1
+; DEBUG: Cost of Invalid for VF vscale x 1: induction instruction   %indvars.iv1294 = phi i7 [ %indvars.iv.next1295, %for.body ], [ 0, %entry ]
+; DEBUG: Cost of Invalid for VF vscale x 1: WIDEN ir<%addi7> = add ir<%indvars.iv1294>, ir<0>
 
 define void @induction_i7(ptr %dst) #0 {
 ; CHECK-LABEL: define void @induction_i7(
@@ -71,9 +71,9 @@ for.end:                                          ; preds = %for.body
 }
 
 
-; DEBUG: Found an estimated cost of Invalid for VF vscale x 1 For instruction:   %indvars.iv1294 = phi i3 [ %indvars.iv.next1295, %for.body ], [ 0, %entry ]
-; DEBUG: Found an estimated cost of Invalid for VF vscale x 1 For instruction:   %zexti3 = zext i3 %indvars.iv1294 to i64
-; DEBUG: Found an estimated cost of Invalid for VF vscale x 1 For instruction:   %indvars.iv.next1295 = add i3 %indvars.iv1294, 1
+; DEBUG: Cost of Invalid for VF vscale x 1: induction instruction   %indvars.iv.next1295 = add i3 %indvars.iv1294, 1
+; DEBUG: Cost of Invalid for VF vscale x 1: induction instruction   %indvars.iv1294 = phi i3 [ %indvars.iv.next1295, %for.body ], [ 0, %entry ]
+; DEBUG: Cost of Invalid for VF vscale x 1: WIDEN-CAST ir<%zexti3> = zext  ir<%indvars.iv1294> to i64
 
 define void @induction_i3_zext(ptr %dst) #0 {
 ; CHECK-LABEL: define void @induction_i3_zext(
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-strict-fadd-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-strict-fadd-cost.ll
index 83bd07a3ec02cf..b84e8de678140f 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-strict-fadd-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-strict-fadd-cost.ll
@@ -7,10 +7,15 @@
 
 target triple="aarch64-unknown-linux-gnu"
 
-; CHECK: Found an estimated cost of 4 for VF vscale x 2 For instruction:   %add = fadd float %0, %sum.07
-; CHECK: Found an estimated cost of 8 for VF vscale x 4 For instruction:   %add = fadd float %0, %sum.07
-; CHECK-CPU-NEOVERSE-N2: Found an estimated cost of 2 for VF vscale x 2 For instruction:   %add = fadd float %0, %sum.07
-; CHECK-CPU-NEOVERSE-N2: Found an estimated cost of 4 for VF vscale x 4 For instruction:   %add = fadd float %0, %sum.07
+; CHECK-LABEL: LV: Checking a loop in 'fadd_strict32'
+; CHECK: Cost of 4 for VF vscale x 2:
+; CHECK:  in-loop reduction   %add = fadd float %0, %sum.07
+; CHECK: Cost of 8 for VF vscale x 4:
+; CHECK:  in-loop reduction   %add = fadd float %0, %sum.07
+; CHECK-CPU-NEOVERSE-N2: Cost of 2 for VF vscale x 2:
+; CHECK-CPU-NEOVERSE-N2:  in-loop reduction   %add = fadd float %0, %sum.07
+; CHECK-CPU-NEOVERSE-N2: Cost of 4 for VF vscale x 4:
+; CHECK-CPU-NEOVERSE-N2:  in-loop reduction   %add = fadd float %0, %sum.07
 
 define float @fadd_strict32(ptr noalias nocapture readonly %a, i64 %n) #0 {
 entry:
@@ -31,8 +36,11 @@ for.end:
 }
 
 
-; CHECK: Found an estimated cost of 4 for VF vscale x 2 For instruction:   %add = fadd double %0, %sum.07
-; CHECK-CPU-NEOVERSE-N2: Found an estimated cost of 2 for VF vscale x 2 For instruction:   %add = fadd double %0, %sum.07
+; CHECK-LABEL: LV: Checking a loop in 'fadd_strict64'
+; CHECK: Cost of 4 for VF vscale x 2:
+; CHECK:  in-loop reduction   %add = fadd double %0, %sum.07
+; CHECK-CPU-NEOVERSE-N2: Cost of 2 for VF vscale x 2:
+; CHECK-CPU-NEOVERSE-N2:  in-loop reduction   %add = fadd double %0, %sum.07
 
 define double @fadd_strict64(ptr noalias nocapture readonly %a, i64 %n) #0 {
 entry:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-zext-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-zext-costs.ll
index dec3c286345adf..b28214317b25e2 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-zext-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-zext-costs.ll
@@ -8,15 +8,14 @@ target triple = "aarch64-unknown-linux-gnu"
 
 define void @zext_i8_i16(ptr noalias nocapture readonly %p, ptr noalias nocapture %q, i32 %len) #0 {
 ; CHECK-COST-LABEL: LV: Checking a loop in 'zext_i8_i16'
-; CHECK-COST: LV: Found an estimated cost of 0 for VF 1 For instruction:   %conv = zext i8 %0 to i32
-; CHECK-COST: LV: Found an estimated cost of 1 for VF 2 For instruction:   %conv = zext i8 %0 to i32
-; CHECK-COST: LV: Found an estimated cost of 1 for VF 4 For instruction:   %conv = zext i8 %0 to i32
-; CHECK-COST: LV: Found an estimated cost of 1 for VF 8 For instruction:   %conv = zext i8 %0 to i32
-; CHECK-COST: LV: Found an estimated cost of 2 for VF 16 For instruction:   %conv = zext i8 %0 to i32
-; CHECK-COST: LV: Found an estimated cost of 1 for VF vscale x 1 For instruction:   %conv = zext i8 %0 to i32
-; CHECK-COST: LV: Found an estimated cost of 1 for VF vscale x 2 For instruction:   %conv = zext i8 %0 to i32
-; CHECK-COST: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction:   %conv = zext i8 %0 to i32
-; CHECK-COST: LV: Found an estimated cost of 0 for VF vscale x 8 For instruction:   %conv = zext i8 %0 to i32
+; CHECK-COST: Cost of 1 for VF 2: WIDEN-CAST ir<%conv> = zext  ir<%0> to i16
+; CHECK-COST: Cost of 1 for VF 4: WIDEN-CAST ir<%conv> = zext  ir<%0> to i16
+; CHECK-COST: Cost of 1 for VF 8: WIDEN-CAST ir<%conv> = zext  ir<%0> to i16
+; CHECK-COST: Cost of 2 for VF 16: WIDEN-CAST ir<%conv> = zext  ir<%0> to i16
+; CHECK-COST: Cost of 1 for VF vscale x 1: WIDEN-CAST ir<%conv> = zext  ir<%0> to i16
+; CHECK-COST: Cost of 1 for VF vscale x 2: WIDEN-CAST ir<%conv> = zext  ir<%0> to i16
+; CHECK-COST: Cost of 1 for VF vscale x 4: WIDEN-CAST ir<%conv> = zext  ir<%0> to i16
+; CHECK-COST: Cost of 0 for VF vscale x 8: WIDEN-CAST ir<%conv> = zext  ir<%0> to i16
 ; CHECK-LABEL: define void @zext_i8_i16
 ; CHECK-SAME: (ptr noalias nocapture readonly [[P:%.*]], ptr noalias nocapture [[Q:%.*]], i32 [[LEN:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  entry:
@@ -91,15 +90,14 @@ exit:                                 ; preds = %for.body
 
 define void @sext_i8_i16(ptr noalias nocapture readonly %p, ptr noalias nocapture %q, i32 %len) #0 {
 ; CHECK-COST-LABEL: LV: Checking a loop in 'sext_i8_i16'
-; CHECK-COST: LV: Found an estimated cost of 0 for VF 1 For instruction:   %conv = sext i8 %0 to i32
-; CHECK-COST: LV: Found an estimated cost of 1 for VF 2 For instruction:   %conv = sext i8 %0 to i32
-; CHECK-COST: LV: Found an estimated cost of 1 for VF 4 For instruction:   %conv = sext i8 %0 to i32
-; CHECK-COST: LV: Found an estimated cost of 1 for VF 8 For instruction:   %conv = sext i8 %0 to i32
-; CHECK-COST: LV: Found an estimated cost of 2 for VF 16 For instruction:   %conv = sext i8 %0 to i32
-; CHECK-COST: LV: Found an estimated cost of 1 for VF vscale x 1 For instruction:   %conv = sext i8 %0 to i32
-; CHECK-COST: LV: Found an estimated cost of 1 for VF vscale x 2 For instruction:   %conv = sext i8 %0 to i32
-; CHECK-COST: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction:   %conv = sext i8 %0 to i32
-; CHECK-COST: LV: Found an estimated cost of 0 for VF vscale x 8 For instruction:   %conv = sext i8 %0 to i32
+; CHECK-COST: Cost of 1 for VF 2: WIDEN-CAST ir<%conv> = sext  ir<%0> to i16
+; CHECK-COST: Cost of 1 for VF 4: WIDEN-CAST ir<%conv> = sext  ir<%0> to i16
+; CHECK-COST: Cost of 1 for VF 8: WIDEN-CAST ir<%conv> = sext  ir<%0> to i16
+; CHECK-COST: Cost of 2 for VF 16: WIDEN-CAST ir<%conv> = sext  ir<%0> to i16
+; CHECK-COST: Cost of 1 for VF vscale x 1: WIDEN-CAST ir<%conv> = sext  ir<%0> to i16
+; CHECK-COST: Cost of 1 for VF vscale x 2: WIDEN-CAST ir<%conv> = sext  ir<%0> to i16
+; CHECK-COST: Cost of 1 for VF vscale x 4: WIDEN-CAST ir<%conv> = sext  ir<%0> to i16
+; CHECK-COST: Cost of 0 for VF vscale x 8: WIDEN-CAST ir<%conv> = sext  ir<%0> to i16
 ; CHECK-LABEL: define void @sext_i8_i16
 ; CHECK-SAME: (ptr noalias nocapture readonly [[P:%.*]], ptr noalias nocapture [[Q:%.*]], i32 [[LEN:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-icmpcost.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-icmpcost.ll
index e7a361ecdbff00..929431adaafc4d 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/mve-icmpcost.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-icmpcost.ll
@@ -18,48 +18,57 @@ target triple = "thumbv8.1m.main-arm-none-eabi"
 ; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction:   %exitcond.not = icmp eq i32 %inc, %n
 ; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction:   br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
 ; CHECK: LV: Scalar loop costs: 5.
-; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %i.016 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ]
-; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %arrayidx = getelementptr inbounds i16, ptr %s, i32 %i.016
-; CHECK: LV: Found an estimated cost of 18 for VF 2 For instruction:   %1 = load i16, ptr %arrayidx, align 2
-; CHECK: LV: Found an estimated cost of 4 for VF 2 For instruction:   %conv = sext i16 %1 to i32
-; CHECK: LV: Found an estimated cost of 20 for VF 2 For instruction:   %cmp2 = icmp sgt i32 %conv, %conv1
-; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   br i1 %cmp2, label %if.then, label %for.inc
-; CHECK: LV: Found an estimated cost of 26 for VF 2 For instruction:   %conv6 = add i16 %1, %0
-; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %arrayidx7 = getelementptr inbounds i16, ptr %d, i32 %i.016
-; CHECK: LV: Found an estimated cost of 16 for VF 2 For instruction:   store i16 %conv6, ptr %arrayidx7, align 2
-; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   br label %for.inc
-; CHECK: LV: Found an estimated cost of 1 for VF 2 For instruction:   %inc = add nuw nsw i32 %i.016, 1
-; CHECK: LV: Found an estimated cost of 1 for VF 2 For instruction:   %exitcond.not = icmp eq i32 %inc, %n
-; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
-; CHECK: LV: Vector loop of width 2 costs: 43.
-; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   %i.016 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ]
-; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   %arrayidx = getelementptr inbounds i16, ptr %s, i32 %i.016
-; CHECK: LV: Found an estimated cost of 2 for VF 4 For instruction:   %1 = load i16, ptr %arrayidx, align 2
-; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   %conv = sext i16 %1 to i32
-; CHECK: LV: Found an estimated cost of 2 for VF 4 For instruction:   %cmp2 = icmp sgt i32 %conv, %conv1
-; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   br i1 %cmp2, label %if.then, label %for.inc
-; CHECK: LV: Found an estimated cost of 2 for VF 4 For instruction:   %conv6 = add i16 %1, %0
-; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   %arrayidx7 = getelementptr inbounds i16, ptr %d, i32 %i.016
-; CHECK: LV: Found an estimated cost of 2 for VF 4 For instruction:   store i16 %conv6, ptr %arrayidx7, align 2
-; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   br label %for.inc
-; CHECK: LV: Found an estimated cost of 1 for VF 4 For instruction:   %inc = add nuw nsw i32 %i.016, 1
-; CHECK: LV: Found an estimated cost of 1 for VF 4 For instruction:   %exitcond.not = icmp eq i32 %inc, %n
-; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
-; CHECK: LV: Vector loop of width 4 costs: 2.
-; CHECK: LV: Found an estimated cost of 0 for VF 8 For instruction:   %i.016 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ]
-; CHECK: LV: Found an estimated cost of 0 for VF 8 For instruction:   %arrayidx = getelementptr inbounds i16, ptr %s, i32 %i.016
-; CHECK: LV: Found an estimated cost of 2 for VF 8 For instruction:   %1 = load i16, ptr %arrayidx, align 2
-; CHECK: LV: Found an estimated cost of 2 for VF 8 For instruction:   %conv = sext i16 %1 to i32
-; CHECK: LV: Found an estimated cost of 36 for VF 8 For instruction:   %cmp2 = icmp sgt i32 %conv, %conv1
-; CHECK: LV: Found an estimated cost of 0 for VF 8 For instruction:   br i1 %cmp2, label %if.then, label %for.inc
-; CHECK: LV: Found an estimated cost of 2 for VF 8 For instruction:   %conv6 = add i16 %1, %0
-; CHECK: LV: Found an estimated cost of 0 for VF 8 For instruction:   %arrayidx7 = getelementptr inbounds i16, ptr %d, i32 %i.016
-; CHECK: LV: Found an estimated cost of 2 for VF 8 For instruction:   store i16 %conv6, ptr %arrayidx7, align 2
-; CHECK: LV: Found an estimated cost of 0 for VF 8 For instruction:   br label %for.inc
-; CHECK: LV: Found an estimated cost of 1 for VF 8 For instruction:   %inc = add nuw nsw i32 %i.016, 1
-; CHECK: LV: Found an estimated cost of 1 for VF 8 For instruction:   %exitcond.not = icmp eq i32 %inc, %n
-; CHECK: LV: Found an estimated cost of 0 for VF 8 For instruction:   br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
-; CHECK: LV: Vector loop of width 8 costs: 5.
+; CHECK: Cost of 1 for VF 2: induction instruction   %inc = add nuw nsw i32 %i.016, 1
+; CHECK: Cost of 0 for VF 2: induction instruction   %i.016 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ]
+; CHECK: Cost of 1 for VF 2: exit condition instruction   %exitcond.not = icmp eq i32 %inc, %n
+; CHECK: Cost of 0 for VF 2: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK: Cost of 0 for VF 2: vp<%3> = SCALAR-STEPS vp<%2>, ir<1>
+; CHECK: Cost of 0 for VF 2: CLONE ir<%arrayidx> = getelementptr inbounds ir<%s>, vp<%3>
+; CHECK: Cost of 0 for VF 2: vp<%4> = vector-pointer ir<%arrayidx>
+; CHECK: Cost of 18 for VF 2: WIDEN ir<%1> = load vp<%4>
+; CHECK: Cost of 4 for VF 2: WIDEN-CAST ir<%conv> = sext  ir<%1> to i32
+; CHECK: Cost of 20 for VF 2: WIDEN ir<%cmp2> = icmp sgt ir<%conv>, ir<%conv1>
+; CHECK: Cost of 26 for VF 2: WIDEN ir<%conv6> = add ir<%1>, ir<%0>
+; CHECK: Cost of 0 for VF 2: CLONE ir<%arrayidx7> = getelementptr ir<%d>, vp<%3>
+; CHECK: Cost of 0 for VF 2: vp<%5> = vector-pointer ir<%arrayidx7>
+; CHECK: Cost of 16 for VF 2: WIDEN store vp<%5>, ir<%conv6>, ir<%cmp2>
+; CHECK: Cost of 0 for VF 2: EMIT vp<%index.next> = add nuw vp<%2>, vp<%0>
+; CHECK: Cost of 0 for VF 2: EMIT branch-on-count vp<%index.next>, vp<%1>
+; CHECK: Cost for VF 2: 86
+; CHECK: Cost of 1 for VF 4: induction instruction   %inc = add nuw nsw i32 %i.016, 1
+; CHECK: Cost of 0 for VF 4: induction instruction   %i.016 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ]
+; CHECK: Cost of 1 for VF 4: exit condition instruction   %exitcond.not = icmp eq i32 %inc, %n
+; CHECK: Cost of 0 for VF 4: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK: Cost of 0 for VF 4: vp<%3> = SCALAR-STEPS vp<%2>, ir<1>
+; CHECK: Cost of 0 for VF 4: CLONE ir<%arrayidx> = getelementptr inbounds ir<%s>, vp<%3>
+; CHECK: Cost of 0 for VF 4: vp<%4> = vector-pointer ir<%arrayidx>
+; CHECK: Cost of 2 for VF 4: WIDEN ir<%1> = load vp<%4>
+; CHECK: Cost of 0 for VF 4: WIDEN-CAST ir<%conv> = sext  ir<%1> to i32
+; CHECK: Cost of 2 for VF 4: WIDEN ir<%cmp2> = icmp sgt ir<%conv>, ir<%conv1>
+; CHECK: Cost of 2 for VF 4: WIDEN ir<%conv6> = add ir<%1>, ir<%0>
+; CHECK: Cost of 0 for VF 4: CLONE ir<%arrayidx7> = getelementptr ir<%d>, vp<%3>
+; CHECK: Cost of 0 for VF 4: vp<%5> = vector-pointer ir<%arrayidx7>
+; CHECK: Cost of 2 for VF 4: WIDEN store vp<%5>, ir<%conv6>, ir<%cmp2>
+; CHECK: Cost of 0 for VF 4: EMIT vp<%index.next> = add nuw vp<%2>, vp<%0>
+; CHECK: Cost of 0 for VF 4: EMIT branch-on-count vp<%index.next>, vp<%1>
+; CHECK: Cost for VF 4: 10
+; CHECK: Cost of 1 for VF 8: induction instruction   %inc = add nuw nsw i32 %i.016, 1
+; CHECK: Cost of 0 for VF 8: induction instruction   %i.016 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ]
+; CHECK: Cost of 1 for VF 8: exit condition instruction   %exitcond.not = icmp eq i32 %inc, %n
+; CHECK: Cost of 0 for VF 8: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK: Cost of 0 for VF 8: vp<%3> = SCALAR-STEPS vp<%2>, ir<1>
+; CHECK: Cost of 0 for VF 8: CLONE ir<%arrayidx> = getelementptr inbounds ir<%s>, vp<%3>
+; CHECK: Cost of 0 for VF 8: vp<%4> = vector-pointer ir<%arrayidx>
+; CHECK: Cost of 2 for VF 8: WIDEN ir<%1> = load vp<%4>
+; CHECK: Cost of 2 for VF 8: WIDEN-CAST ir<%conv> = sext  ir<%1> to i32
+; CHECK: Cost of 36 for VF 8: WIDEN ir<%cmp2> = icmp sgt ir<%conv>, ir<%conv1>
+; CHECK: Cost of 2 for VF 8: WIDEN ir<%conv6> = add ir<%1>, ir<%0>
+; CHECK: Cost of 0 for VF 8: CLONE ir<%arrayidx7> = getelementptr ir<%d>, vp<%3>
+; CHECK: Cost of 0 for VF 8: vp<%5> = vector-pointer ir<%arrayidx7>
+; CHECK: Cost of 2 for VF 8: WIDEN store vp<%5>, ir<%conv6>, ir<%cmp2>
+; CHECK: Cost of 0 for VF 8: EMIT vp<%index.next> = add nuw vp<%2>, vp<%0>
+; CHECK: Cost of 0 for VF 8: EMIT branch-on-count vp<%index.next>, vp<%1>
+; CHECK: Cost for VF 8: 46
 ; CHECK: LV: Selecting VF: 4.
 define void @expensive_icmp(ptr noalias nocapture %d, ptr nocapture readonly %s, i32 %n, i16 zeroext %m) #0 {
 entry:
@@ -115,90 +124,134 @@ for.inc:                                          ; preds = %for.body, %if.then
 ; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction:   %cmp.not = icmp eq i32 %dec, 0
 ; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction:   br i1 %cmp.not, label %while.end.loopexit, label %while.body
 ; CHECK: LV: Scalar loop costs: 9.
-; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %blkCnt.012 = phi i32 [ %dec, %while.body ], [ %blockSize, %while.body.preheader ]
-; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %pSrcA.addr.011 = phi ptr [ %incdec.ptr, %while.body ], [ %pSrcA, %while.body.preheader ]
-; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %pDst.addr.010 = phi ptr [ %incdec.ptr5, %while.body ], [ %pDst, %while.body.preheader ]
-; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %pSrcB.addr.09 = phi ptr [ %incdec.ptr2, %while.body ], [ %pSrcB, %while.body.preheader ]
-; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %incdec.ptr = getelementptr inbounds i8, ptr %pSrcA.addr.011, i32 1
-; CHECK: LV: Found an estimated cost of 18 for VF 2 For instruction:   %0 = load i8, ptr %pSrcA.addr.011, align 1
-; CHECK: LV: Found an estimated cost of 4 for VF 2 For instruction:   %conv1 = sext i8 %0 to i32
-; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %incdec.ptr2 = getelementptr inbounds i8, ptr %pSrcB.addr.09, i32 1
-; CHECK: LV: Found an estimated cost of 18 for VF 2 For instruction:   %1 = load i8, ptr %pSrcB.addr.09, align 1
-; CHECK: LV: Found an estimated cost of 4 for VF 2 For instruction:   %conv3 = sext i8 %1 to i32
-; CHECK: LV: Found an estimated cost of 26 for VF 2 For instruction:   %mul = mul nsw i32 %conv3, %conv1
-; CHECK: LV: Found an estimated cost of 18 for VF 2 For instruction:   %shr = ashr i32 %mul, 7
-; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %2 = icmp slt i32 %shr, 127
-; CHECK: LV: Found an estimated cost of 22 for VF 2 For instruction:   %spec.select.i = select i1 %2, i32 %shr, i32 127
-; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %conv4 = trunc i32 %spec.select.i to i8
-; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %incdec.ptr5 = getelementptr inbounds i8, ptr %pDst.addr.010, i32 1
-; CHECK: LV: Found an estimated cost of 18 for VF 2 For instruction:   store i8 %conv4, ptr %pDst.addr.010, align 1
-; CHECK: LV: Found an estimated cost of 1 for VF 2 For instruction:   %dec = add i32 %blkCnt.012, -1
-; CHECK: LV: Found an estimated cost of 1 for VF 2 For instruction:   %cmp.not = icmp eq i32 %dec, 0
-; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   br i1 %cmp.not, label %while.end.loopexit, label %while.body
-; CHECK: LV: Vector loop of width 2 costs: 65.
-; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   %blkCnt.012 = phi i32 [ %dec, %while.body ], [ %blockSize, %while.body.preheader ]
-; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   %pSrcA.addr.011 = phi ptr [ %incdec.ptr, %while.body ], [ %pSrcA, %while.body.preheader ]
-; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   %pDst.addr.010 = phi ptr [ %incdec.ptr5, %while.body ], [ %pDst, %while.body.preheader ]
-; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   %pSrcB.addr.09 = phi ptr [ %incdec.ptr2, %while.body ], [ %pSrcB, %while.body.preheader ]
-; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   %incdec.ptr = getelementptr inbounds i8, ptr %pSrcA.addr.011, i32 1
-; CHECK: LV: Found an estimated cost of 2 for VF 4 For instruction:   %0 = load i8, ptr %pSrcA.addr.011, align 1
-; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   %conv1 = sext i8 %0 to i32
-; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   %incdec.ptr2 = getelementptr inbounds i8, ptr %pSrcB.addr.09, i32 1
-; CHECK: LV: Found an estimated cost of 2 for VF 4 For instruction:   %1 = load i8, ptr %pSrcB.addr.09, align 1
-; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   %conv3 = sext i8 %1 to i32
-; CHECK: LV: Found an estimated cost of 2 for VF 4 For instruction:   %mul = mul nsw i32 %conv3, %conv1
-; CHECK: LV: Found an estimated cost of 2 for VF 4 For instruction:   %shr = ashr i32 %mul, 7
-; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   %2 = icmp slt i32 %shr, 127
-; CHECK: LV: Found an estimated cost of 2 for VF 4 For instruction:   %spec.select.i = select i1 %2, i32 %shr, i32 127
-; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   %conv4 = trunc i32 %spec.select.i to i8
-; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   %incdec.ptr5 = getelementptr inbounds i8, ptr %pDst.addr.010, i32 1
-; CHECK: LV: Found an estimated cost of 2 for VF 4 For instruction:   store i8 %conv4, ptr %pDst.addr.010, align 1
-; CHECK: LV: Found an estimated cost of 1 for VF 4 For instruction:   %dec = add i32 %blkCnt.012, -1
-; CHECK: LV: Found an estimated cost of 1 for VF 4 For instruction:   %cmp.not = icmp eq i32 %dec, 0
-; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   br i1 %cmp.not, label %while.end.loopexit, label %while.body
-; CHECK: LV: Vector loop of width 4 costs: 3.
-; CHECK: LV: Found an estimated cost of 0 for VF 8 For instruction:   %blkCnt.012 = phi i32 [ %dec, %while.body ], [ %blockSize, %while.body.preheader ]
-; CHECK: LV: Found an estimated cost of 0 for VF 8 For instruction:   %pSrcA.addr.011 = phi ptr [ %incdec.ptr, %while.body ], [ %pSrcA, %while.body.preheader ]
-; CHECK: LV: Found an estimated cost of 0 for VF 8 For instruction:   %pDst.addr.010 = phi ptr [ %incdec.ptr5, %while.body ], [ %pDst, %while.body.preheader ]
-; CHECK: LV: Found an estimated cost of 0 for VF 8 For instruction:   %pSrcB.addr.09 = phi ptr [ %incdec.ptr2, %while.body ], [ %pSrcB, %while.body.preheader ]
-; CHECK: LV: Found an estimated cost of 0 for VF 8 For instruction:   %incdec.ptr = getelementptr inbounds i8, ptr %pSrcA.addr.011, i32 1
-; CHECK: LV: Found an estimated cost of 2 for VF 8 For instruction:   %0 = load i8, ptr %pSrcA.addr.011, align 1
-; CHECK: LV: Found an estimated cost of 2 for VF 8 For instruction:   %conv1 = sext i8 %0 to i32
-; CHECK: LV: Found an estimated cost of 0 for VF 8 For instruction:   %incdec.ptr2 = getelementptr inbounds i8, ptr %pSrcB.addr.09, i32 1
-; CHECK: LV: Found an estimated cost of 2 for VF 8 For instruction:   %1 = load i8, ptr %pSrcB.addr.09, align 1
-; CHECK: LV: Found an estimated cost of 2 for VF 8 For instruction:   %conv3 = sext i8 %1 to i32
-; CHECK: LV: Found an estimated cost of 4 for VF 8 For instruction:   %mul = mul nsw i32 %conv3, %conv1
-; CHECK: LV: Found an estimated cost of 4 for VF 8 For instruction:   %shr = ashr i32 %mul, 7
-; CHECK: LV: Found an estimated cost of 0 for VF 8 For instruction:   %2 = icmp slt i32 %shr, 127
-; CHECK: LV: Found an estimated cost of 4 for VF 8 For instruction:   %spec.select.i = select i1 %2, i32 %shr, i32 127
-; CHECK: LV: Found an estimated cost of 2 for VF 8 For instruction:   %conv4 = trunc i32 %spec.select.i to i8
-; CHECK: LV: Found an estimated cost of 0 for VF 8 For instruction:   %incdec.ptr5 = getelementptr inbounds i8, ptr %pDst.addr.010, i32 1
-; CHECK: LV: Found an estimated cost of 2 for VF 8 For instruction:   store i8 %conv4, ptr %pDst.addr.010, align 1
-; CHECK: LV: Found an estimated cost of 1 for VF 8 For instruction:   %dec = add i32 %blkCnt.012, -1
-; CHECK: LV: Found an estimated cost of 1 for VF 8 For instruction:   %cmp.not = icmp eq i32 %dec, 0
-; CHECK: LV: Found an estimated cost of 0 for VF 8 For instruction:   br i1 %cmp.not, label %while.end.loopexit, label %while.body
-; CHECK: LV: Vector loop of width 8 costs: 3.
-; CHECK: LV: Found an estimated cost of 0 for VF 16 For instruction:   %blkCnt.012 = phi i32 [ %dec, %while.body ], [ %blockSize, %while.body.preheader ]
-; CHECK: LV: Found an estimated cost of 0 for VF 16 For instruction:   %pSrcA.addr.011 = phi ptr [ %incdec.ptr, %while.body ], [ %pSrcA, %while.body.preheader ]
-; CHECK: LV: Found an estimated cost of 0 for VF 16 For instruction:   %pDst.addr.010 = phi ptr [ %incdec.ptr5, %while.body ], [ %pDst, %while.body.preheader ]
-; CHECK: LV: Found an estimated cost of 0 for VF 16 For instruction:   %pSrcB.addr.09 = phi ptr [ %incdec.ptr2, %while.body ], [ %pSrcB, %while.body.preheader ]
-; CHECK: LV: Found an estimated cost of 0 for VF 16 For instruction:   %incdec.ptr = getelementptr inbounds i8, ptr %pSrcA.addr.011, i32 1
-; CHECK: LV: Found an estimated cost of 2 for VF 16 For instruction:   %0 = load i8, ptr %pSrcA.addr.011, align 1
-; CHECK: LV: Found an estimated cost of 6 for VF 16 For instruction:   %conv1 = sext i8 %0 to i32
-; CHECK: LV: Found an estimated cost of 0 for VF 16 For instruction:   %incdec.ptr2 = getelementptr inbounds i8, ptr %pSrcB.addr.09, i32 1
-; CHECK: LV: Found an estimated cost of 2 for VF 16 For instruction:   %1 = load i8, ptr %pSrcB.addr.09, align 1
-; CHECK: LV: Found an estimated cost of 6 for VF 16 For instruction:   %conv3 = sext i8 %1 to i32
-; CHECK: LV: Found an estimated cost of 8 for VF 16 For instruction:   %mul = mul nsw i32 %conv3, %conv1
-; CHECK: LV: Found an estimated cost of 8 for VF 16 For instruction:   %shr = ashr i32 %mul, 7
-; CHECK: LV: Found an estimated cost of 0 for VF 16 For instruction:   %2 = icmp slt i32 %shr, 127
-; CHECK: LV: Found an estimated cost of 8 for VF 16 For instruction:   %spec.select.i = select i1 %2, i32 %shr, i32 127
-; CHECK: LV: Found an estimated cost of 6 for VF 16 For instruction:   %conv4 = trunc i32 %spec.select.i to i8
-; CHECK: LV: Found an estimated cost of 0 for VF 16 For instruction:   %incdec.ptr5 = getelementptr inbounds i8, ptr %pDst.addr.010, i32 1
-; CHECK: LV: Found an estimated cost of 2 for VF 16 For instruction:   store i8 %conv4, ptr %pDst.addr.010, align 1
-; CHECK: LV: Found an estimated cost of 1 for VF 16 For instruction:   %dec = add i32 %blkCnt.012, -1
-; CHECK: LV: Found an estimated cost of 1 for VF 16 For instruction:   %cmp.not = icmp eq i32 %dec, 0
-; CHECK: LV: Found an estimated cost of 0 for VF 16 For instruction:   br i1 %cmp.not, label %while.end.loopexit, label %while.body
-; CHECK: LV: Vector loop of width 16 costs: 3.
+; CHECK: Cost of 1 for VF 2: induction instruction   %dec = add i32 %blkCnt.012, -1
+; CHECK: Cost of 0 for VF 2: induction instruction   %blkCnt.012 = phi i32 [ %dec, %while.body ], [ %blockSize, %while.body.preheader ]
+; CHECK: Cost of 0 for VF 2: induction instruction   %incdec.ptr = getelementptr inbounds i8, ptr %pSrcA.addr.011, i32 1
+; CHECK: Cost of 0 for VF 2: induction instruction   %pSrcA.addr.011 = phi ptr [ %incdec.ptr, %while.body ], [ %pSrcA, %while.body.preheader ]
+; CHECK: Cost of 0 for VF 2: induction instruction   %incdec.ptr5 = getelementptr inbounds i8, ptr %pDst.addr.010, i32 1
+; CHECK: Cost of 0 for VF 2: induction instruction   %pDst.addr.010 = phi ptr [ %incdec.ptr5, %while.body ], [ %pDst, %while.body.preheader ]
+; CHECK: Cost of 0 for VF 2: induction instruction   %incdec.ptr2 = getelementptr inbounds i8, ptr %pSrcB.addr.09, i32 1
+; CHECK: Cost of 0 for VF 2: induction instruction   %pSrcB.addr.09 = phi ptr [ %incdec.ptr2, %while.body ], [ %pSrcB, %while.body.preheader ]
+; CHECK: Cost of 1 for VF 2: exit condition instruction   %cmp.not = icmp eq i32 %dec, 0
+; CHECK: Cost of 0 for VF 2: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK: Cost of 0 for VF 2: vp<%3> = SCALAR-STEPS vp<%2>, ir<1>
+; CHECK: Cost of 0 for VF 2: EMIT vp<%next.gep> = ptradd ir<%pSrcA>, vp<%3>
+; CHECK: Cost of 0 for VF 2: vp<%4> = SCALAR-STEPS vp<%2>, ir<1>
+; CHECK: Cost of 0 for VF 2: EMIT vp<%next.gep>.1 = ptradd ir<%pDst>, vp<%4>
+; CHECK: Cost of 0 for VF 2: vp<%5> = SCALAR-STEPS vp<%2>, ir<1>
+; CHECK: Cost of 0 for VF 2: EMIT vp<%next.gep>.2 = ptradd ir<%pSrcB>, vp<%5>
+; CHECK: Cost of 0 for VF 2: vp<%6> = vector-pointer vp<%next.gep>
+; CHECK: Cost of 18 for VF 2: WIDEN ir<%0> = load vp<%6>
+; CHECK: Cost of 4 for VF 2: WIDEN-CAST ir<%conv1> = sext  ir<%0> to i32
+; CHECK: Cost of 0 for VF 2: vp<%7> = vector-pointer vp<%next.gep>.2
+; CHECK: Cost of 18 for VF 2: WIDEN ir<%1> = load vp<%7>
+; CHECK: Cost of 4 for VF 2: WIDEN-CAST ir<%conv3> = sext  ir<%1> to i32
+; CHECK: Cost of 26 for VF 2: WIDEN ir<%mul> = mul nsw ir<%conv3>, ir<%conv1>
+; CHECK: Cost of 18 for VF 2: WIDEN ir<%shr> = ashr ir<%mul>, ir<7>
+; CHECK: Cost of 0 for VF 2: WIDEN ir<%2> = icmp slt ir<%shr>, ir<127>
+; CHECK: Cost of 22 for VF 2: WIDEN-SELECT ir<%spec.select.i> = select ir<%2>, ir<%shr>, ir<127>
+; CHECK: Cost of 0 for VF 2: WIDEN-CAST ir<%conv4> = trunc  ir<%spec.select.i> to i8
+; CHECK: Cost of 0 for VF 2: vp<%8> = vector-pointer vp<%next.gep>.1
+; CHECK: Cost of 18 for VF 2: WIDEN store vp<%8>, ir<%conv4>
+; CHECK: Cost of 0 for VF 2: EMIT vp<%index.next> = add nuw vp<%2>, vp<%0>
+; CHECK: Cost of 0 for VF 2: EMIT branch-on-count vp<%index.next>, vp<%1>
+; CHECK: Cost for VF 2: 130
+; CHECK: Cost of 1 for VF 4: induction instruction   %dec = add i32 %blkCnt.012, -1
+; CHECK: Cost of 0 for VF 4: induction instruction   %blkCnt.012 = phi i32 [ %dec, %while.body ], [ %blockSize, %while.body.preheader ]
+; CHECK: Cost of 0 for VF 4: induction instruction   %incdec.ptr = getelementptr inbounds i8, ptr %pSrcA.addr.011, i32 1
+; CHECK: Cost of 0 for VF 4: induction instruction   %pSrcA.addr.011 = phi ptr [ %incdec.ptr, %while.body ], [ %pSrcA, %while.body.preheader ]
+; CHECK: Cost of 0 for VF 4: induction instruction   %incdec.ptr5 = getelementptr inbounds i8, ptr %pDst.addr.010, i32 1
+; CHECK: Cost of 0 for VF 4: induction instruction   %pDst.addr.010 = phi ptr [ %incdec.ptr5, %while.body ], [ %pDst, %while.body.preheader ]
+; CHECK: Cost of 0 for VF 4: induction instruction   %incdec.ptr2 = getelementptr inbounds i8, ptr %pSrcB.addr.09, i32 1
+; CHECK: Cost of 0 for VF 4: induction instruction   %pSrcB.addr.09 = phi ptr [ %incdec.ptr2, %while.body ], [ %pSrcB, %while.body.preheader ]
+; CHECK: Cost of 1 for VF 4: exit condition instruction   %cmp.not = icmp eq i32 %dec, 0
+; CHECK: Cost of 0 for VF 4: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK: Cost of 0 for VF 4: vp<%3> = SCALAR-STEPS vp<%2>, ir<1>
+; CHECK: Cost of 0 for VF 4: EMIT vp<%next.gep> = ptradd ir<%pSrcA>, vp<%3>
+; CHECK: Cost of 0 for VF 4: vp<%4> = SCALAR-STEPS vp<%2>, ir<1>
+; CHECK: Cost of 0 for VF 4: EMIT vp<%next.gep>.1 = ptradd ir<%pDst>, vp<%4>
+; CHECK: Cost of 0 for VF 4: vp<%5> = SCALAR-STEPS vp<%2>, ir<1>
+; CHECK: Cost of 0 for VF 4: EMIT vp<%next.gep>.2 = ptradd ir<%pSrcB>, vp<%5>
+; CHECK: Cost of 0 for VF 4: vp<%6> = vector-pointer vp<%next.gep>
+; CHECK: Cost of 2 for VF 4: WIDEN ir<%0> = load vp<%6>
+; CHECK: Cost of 0 for VF 4: WIDEN-CAST ir<%conv1> = sext  ir<%0> to i32
+; CHECK: Cost of 0 for VF 4: vp<%7> = vector-pointer vp<%next.gep>.2
+; CHECK: Cost of 2 for VF 4: WIDEN ir<%1> = load vp<%7>
+; CHECK: Cost of 0 for VF 4: WIDEN-CAST ir<%conv3> = sext  ir<%1> to i32
+; CHECK: Cost of 2 for VF 4: WIDEN ir<%mul> = mul nsw ir<%conv3>, ir<%conv1>
+; CHECK: Cost of 2 for VF 4: WIDEN ir<%shr> = ashr ir<%mul>, ir<7>
+; CHECK: Cost of 0 for VF 4: WIDEN ir<%2> = icmp slt ir<%shr>, ir<127>
+; CHECK: Cost of 2 for VF 4: WIDEN-SELECT ir<%spec.select.i> = select ir<%2>, ir<%shr>, ir<127>
+; CHECK: Cost of 0 for VF 4: WIDEN-CAST ir<%conv4> = trunc  ir<%spec.select.i> to i8
+; CHECK: Cost of 0 for VF 4: vp<%8> = vector-pointer vp<%next.gep>.1
+; CHECK: Cost of 2 for VF 4: WIDEN store vp<%8>, ir<%conv4>
+; CHECK: Cost of 0 for VF 4: EMIT vp<%index.next> = add nuw vp<%2>, vp<%0>
+; CHECK: Cost of 0 for VF 4: EMIT branch-on-count vp<%index.next>, vp<%1>
+; CHECK: Cost for VF 4: 14
+; CHECK: Cost of 1 for VF 8: induction instruction   %dec = add i32 %blkCnt.012, -1
+; CHECK: Cost of 0 for VF 8: induction instruction   %blkCnt.012 = phi i32 [ %dec, %while.body ], [ %blockSize, %while.body.preheader ]
+; CHECK: Cost of 0 for VF 8: induction instruction   %incdec.ptr = getelementptr inbounds i8, ptr %pSrcA.addr.011, i32 1
+; CHECK: Cost of 0 for VF 8: induction instruction   %pSrcA.addr.011 = phi ptr [ %incdec.ptr, %while.body ], [ %pSrcA, %while.body.preheader ]
+; CHECK: Cost of 0 for VF 8: induction instruction   %incdec.ptr5 = getelementptr inbounds i8, ptr %pDst.addr.010, i32 1
+; CHECK: Cost of 0 for VF 8: induction instruction   %pDst.addr.010 = phi ptr [ %incdec.ptr5, %while.body ], [ %pDst, %while.body.preheader ]
+; CHECK: Cost of 0 for VF 8: induction instruction   %incdec.ptr2 = getelementptr inbounds i8, ptr %pSrcB.addr.09, i32 1
+; CHECK: Cost of 0 for VF 8: induction instruction   %pSrcB.addr.09 = phi ptr [ %incdec.ptr2, %while.body ], [ %pSrcB, %while.body.preheader ]
+; CHECK: Cost of 1 for VF 8: exit condition instruction   %cmp.not = icmp eq i32 %dec, 0
+; CHECK: Cost of 0 for VF 8: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK: Cost of 0 for VF 8: vp<%3> = SCALAR-STEPS vp<%2>, ir<1>
+; CHECK: Cost of 0 for VF 8: EMIT vp<%next.gep> = ptradd ir<%pSrcA>, vp<%3>
+; CHECK: Cost of 0 for VF 8: vp<%4> = SCALAR-STEPS vp<%2>, ir<1>
+; CHECK: Cost of 0 for VF 8: EMIT vp<%next.gep>.1 = ptradd ir<%pDst>, vp<%4>
+; CHECK: Cost of 0 for VF 8: vp<%5> = SCALAR-STEPS vp<%2>, ir<1>
+; CHECK: Cost of 0 for VF 8: EMIT vp<%next.gep>.2 = ptradd ir<%pSrcB>, vp<%5>
+; CHECK: Cost of 0 for VF 8: vp<%6> = vector-pointer vp<%next.gep>
+; CHECK: Cost of 2 for VF 8: WIDEN ir<%0> = load vp<%6>
+; CHECK: Cost of 2 for VF 8: WIDEN-CAST ir<%conv1> = sext  ir<%0> to i32
+; CHECK: Cost of 0 for VF 8: vp<%7> = vector-pointer vp<%next.gep>.2
+; CHECK: Cost of 2 for VF 8: WIDEN ir<%1> = load vp<%7>
+; CHECK: Cost of 2 for VF 8: WIDEN-CAST ir<%conv3> = sext  ir<%1> to i32
+; CHECK: Cost of 4 for VF 8: WIDEN ir<%mul> = mul nsw ir<%conv3>, ir<%conv1>
+; CHECK: Cost of 4 for VF 8: WIDEN ir<%shr> = ashr ir<%mul>, ir<7>
+; CHECK: Cost of 0 for VF 8: WIDEN ir<%2> = icmp slt ir<%shr>, ir<127>
+; CHECK: Cost of 4 for VF 8: WIDEN-SELECT ir<%spec.select.i> = select ir<%2>, ir<%shr>, ir<127>
+; CHECK: Cost of 2 for VF 8: WIDEN-CAST ir<%conv4> = trunc  ir<%spec.select.i> to i8
+; CHECK: Cost of 0 for VF 8: vp<%8> = vector-pointer vp<%next.gep>.1
+; CHECK: Cost of 2 for VF 8: WIDEN store vp<%8>, ir<%conv4>
+; CHECK: Cost of 0 for VF 8: EMIT vp<%index.next> = add nuw vp<%2>, vp<%0>
+; CHECK: Cost of 0 for VF 8: EMIT branch-on-count vp<%index.next>, vp<%1>
+; CHECK: Cost for VF 8: 26
+; CHECK: Cost of 1 for VF 16: induction instruction   %dec = add i32 %blkCnt.012, -1
+; CHECK: Cost of 0 for VF 16: induction instruction   %blkCnt.012 = phi i32 [ %dec, %while.body ], [ %blockSize, %while.body.preheader ]
+; CHECK: Cost of 0 for VF 16: induction instruction   %incdec.ptr = getelementptr inbounds i8, ptr %pSrcA.addr.011, i32 1
+; CHECK: Cost of 0 for VF 16: induction instruction   %pSrcA.addr.011 = phi ptr [ %incdec.ptr, %while.body ], [ %pSrcA, %while.body.preheader ]
+; CHECK: Cost of 0 for VF 16: induction instruction   %incdec.ptr5 = getelementptr inbounds i8, ptr %pDst.addr.010, i32 1
+; CHECK: Cost of 0 for VF 16: induction instruction   %pDst.addr.010 = phi ptr [ %incdec.ptr5, %while.body ], [ %pDst, %while.body.preheader ]
+; CHECK: Cost of 0 for VF 16: induction instruction   %incdec.ptr2 = getelementptr inbounds i8, ptr %pSrcB.addr.09, i32 1
+; CHECK: Cost of 0 for VF 16: induction instruction   %pSrcB.addr.09 = phi ptr [ %incdec.ptr2, %while.body ], [ %pSrcB, %while.body.preheader ]
+; CHECK: Cost of 1 for VF 16: exit condition instruction   %cmp.not = icmp eq i32 %dec, 0
+; CHECK: Cost of 0 for VF 16: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK: Cost of 0 for VF 16: vp<%3> = SCALAR-STEPS vp<%2>, ir<1>
+; CHECK: Cost of 0 for VF 16: EMIT vp<%next.gep> = ptradd ir<%pSrcA>, vp<%3>
+; CHECK: Cost of 0 for VF 16: vp<%4> = SCALAR-STEPS vp<%2>, ir<1>
+; CHECK: Cost of 0 for VF 16: EMIT vp<%next.gep>.1 = ptradd ir<%pDst>, vp<%4>
+; CHECK: Cost of 0 for VF 16: vp<%5> = SCALAR-STEPS vp<%2>, ir<1>
+; CHECK: Cost of 0 for VF 16: EMIT vp<%next.gep>.2 = ptradd ir<%pSrcB>, vp<%5>
+; CHECK: Cost of 0 for VF 16: vp<%6> = vector-pointer vp<%next.gep>
+; CHECK: Cost of 2 for VF 16: WIDEN ir<%0> = load vp<%6>
+; CHECK: Cost of 6 for VF 16: WIDEN-CAST ir<%conv1> = sext  ir<%0> to i32
+; CHECK: Cost of 0 for VF 16: vp<%7> = vector-pointer vp<%next.gep>.2
+; CHECK: Cost of 2 for VF 16: WIDEN ir<%1> = load vp<%7>
+; CHECK: Cost of 6 for VF 16: WIDEN-CAST ir<%conv3> = sext  ir<%1> to i32
+; CHECK: Cost of 8 for VF 16: WIDEN ir<%mul> = mul nsw ir<%conv3>, ir<%conv1>
+; CHECK: Cost of 8 for VF 16: WIDEN ir<%shr> = ashr ir<%mul>, ir<7>
+; CHECK: Cost of 0 for VF 16: WIDEN ir<%2> = icmp slt ir<%shr>, ir<127>
+; CHECK: Cost of 8 for VF 16: WIDEN-SELECT ir<%spec.select.i> = select ir<%2>, ir<%shr>, ir<127>
+; CHECK: Cost of 6 for VF 16: WIDEN-CAST ir<%conv4> = trunc  ir<%spec.select.i> to i8
+; CHECK: Cost of 0 for VF 16: vp<%8> = vector-pointer vp<%next.gep>.1
+; CHECK: Cost of 2 for VF 16: WIDEN store vp<%8>, ir<%conv4>
+; CHECK: Cost of 0 for VF 16: EMIT vp<%index.next> = add nuw vp<%2>, vp<%0>
+; CHECK: Cost of 0 for VF 16: EMIT branch-on-count vp<%index.next>, vp<%1>
+; CHECK: Cost for VF 16: 50
 ; CHECK: LV: Selecting VF: 16.
 define void @cheap_icmp(ptr nocapture readonly %pSrcA, ptr nocapture readonly %pSrcB, ptr nocapture %pDst, i32 %blockSize) #0 {
 entry:
@@ -238,8 +291,8 @@ while.end:                                        ; preds = %while.end.loopexit,
 }
 
 ; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction:   %cmp1 = fcmp
-; CHECK: LV: Found an estimated cost of 12 for VF 2 For instruction:   %cmp1 = fcmp
-; CHECK: LV: Found an estimated cost of 24 for VF 4 For instruction:   %cmp1 = fcmp
+; CHECK: Cost of 12 for VF 2: WIDEN ir<%cmp1> = fcmp olt ir<%0>, ir<0.000000e+00>
+; CHECK: Cost of 24 for VF 4: WIDEN ir<%cmp1> = fcmp olt ir<%0>, ir<0.000000e+00>
 define void @floatcmp(ptr nocapture readonly %pSrc, ptr nocapture %pDst, i32 %blockSize) #0 {
 entry:
   %cmp.not7 = icmp eq i32 %blockSize, 0
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-saddsatcost.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-saddsatcost.ll
index bd7bafdb6dc8dc..04a97f451770ad 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/mve-saddsatcost.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-saddsatcost.ll
@@ -7,10 +7,10 @@ target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
 target triple = "thumbv8.1m.main-arm-none-eabi"
 
 ; CHECK-COST-LABEL: arm_offset_q15
-; CHECK-COST: Found an estimated cost of 2 for VF 1 For instruction:   %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset)
-; CHECK-COST: Found an estimated cost of 36 for VF 2 For instruction:   %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset)
-; CHECK-COST: Found an estimated cost of 8 for VF 4 For instruction:   %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset)
-; CHECK-COST: Found an estimated cost of 2 for VF 8 For instruction:   %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset)
+; CHECK-COST: LV: Found an estimated cost of 2 for VF 1 For instruction:   %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset)
+; CHECK-COST: Cost of 36 for VF 2: REPLICATE ir<%1> = call @llvm.sadd.sat.i16(ir<%0>, ir<%offset>)
+; CHECK-COST: Cost of 8 for VF 4: WIDEN-INTRINSIC ir<%1> = call llvm.sadd.sat(ir<%0>, ir<%offset>)
+; CHECK-COST: Cost of 2 for VF 8: WIDEN-INTRINSIC ir<%1> = call llvm.sadd.sat(ir<%0>, ir<%offset>)
 
 define void @arm_offset_q15(ptr nocapture readonly %pSrc, i16 signext %offset, ptr nocapture noalias %pDst, i32 %blockSize) #0 {
 ; CHECK-LABEL: @arm_offset_q15(
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-selectandorcost.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-selectandorcost.ll
index fc56754166d609..8bd0feee862244 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/mve-selectandorcost.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-selectandorcost.ll
@@ -8,8 +8,8 @@ target triple = "thumbv8.1m.main-arm-none-eabi"
 
 ; CHECK-COST-LABEL: test
 ; CHECK-COST: LV: Found an estimated cost of 1 for VF 1 For instruction:   %or.cond = select i1 %cmp2, i1 true, i1 %cmp3
-; CHECK-COST: LV: Found an estimated cost of 26 for VF 2 For instruction:   %or.cond = select i1 %cmp2, i1 true, i1 %cmp3
-; CHECK-COST: LV: Found an estimated cost of 2 for VF 4 For instruction:   %or.cond = select i1 %cmp2, i1 true, i1 %cmp3
+; CHECK-COST: Cost of 26 for VF 2: WIDEN-SELECT ir<%or.cond> = select ir<%cmp2>, ir<true>, ir<%cmp3>
+; CHECK-COST: Cost of 2 for VF 4: WIDEN-SELECT ir<%or.cond> = select ir<%cmp2>, ir<true>, ir<%cmp3>
 
 define float @test(ptr nocapture readonly %pA, ptr nocapture readonly %pB, i32 %blockSize) #0 {
 ; CHECK-LABEL: @test(
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-shiftcost.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-shiftcost.ll
index a0ff7629b42c3a..e1b7b935a47f61 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/mve-shiftcost.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-shiftcost.ll
@@ -8,8 +8,8 @@ target triple = "thumbv8.1m.main-none-none-eabi"
 ; CHECK-LABEL: test
 ; CHECK-COST: LV: Found an estimated cost of 0 for VF 1 For instruction:   %and515 = shl i32 %l41, 3
 ; CHECK-COST: LV: Found an estimated cost of 1 for VF 1 For instruction:   %l45 = and i32 %and515, 131072
-; CHECK-COST: LV: Found an estimated cost of 2 for VF 4 For instruction:   %and515 = shl i32 %l41, 3
-; CHECK-COST: LV: Found an estimated cost of 2 for VF 4 For instruction:   %l45 = and i32 %and515, 131072
+; CHECK-COST: Cost of 2 for VF 4: WIDEN ir<%and515> = shl ir<%l41>, ir<3>
+; CHECK-COST: Cost of 2 for VF 4: WIDEN ir<%l45> = and ir<%and515>, ir<131072>
 ; CHECK-NOT: vector.body
 
 define void @test(ptr %src, i32 %N) #0 {
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/force-vect-msg.ll b/llvm/test/Transforms/LoopVectorize/RISCV/force-vect-msg.ll
index f33ef9199ed1a9..61c0c85f3e556c 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/force-vect-msg.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/force-vect-msg.ll
@@ -4,7 +4,7 @@
 ; CHECK: LV: Loop hints: force=enabled
 ; CHECK: LV: Scalar loop costs: 4.
 ; ChosenFactor.Cost is 4, but the real cost will be divided by the width, which is 2.
-; CHECK: LV: Vector loop of width 2 costs: 2.
+; CHECK: Cost for VF 2: 4
 ; Regardless of force vectorization or not, this loop will eventually be vectorized because of the cost model.
 ; Therefore, the following message does not need to be printed even if vectorization is explicitly forced in the metadata.
 ; CHECK-NOT: LV: Vectorization seems to be not beneficial, but was forced by a user.
diff --git a/llvm/test/Transforms/LoopVectorize/SystemZ/mem-interleaving-costs-02.ll b/llvm/test/Transforms/LoopVectorize/SystemZ/mem-interleaving-costs-02.ll
index c75899e2224f19..a0c45962021c8b 100644
--- a/llvm/test/Transforms/LoopVectorize/SystemZ/mem-interleaving-costs-02.ll
+++ b/llvm/test/Transforms/LoopVectorize/SystemZ/mem-interleaving-costs-02.ll
@@ -11,10 +11,12 @@
 ; two vector registers using one vperm each, which gives a cost of 2 + 4 = 6.
 ;
 ; CHECK: LV: Checking a loop in 'fun0'
-; CHECK: LV: Found an estimated cost of 6 for VF 4 For instruction:   %ld0 = load i16
-; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   %ld1 = load i16
-; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   %ld2 = load i16
-; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   %ld3 = load i16
+; CHECK: Cost of 6 for VF 4: INTERLEAVE-GROUP with factor 4 at %ld0, vp<%next.gep>
+; CHECK:   ir<%ld0> = load from index 0
+; CHECK:   ir<%ld1> = load from index 1
+; CHECK:   ir<%ld2> = load from index 2
+; CHECK:   ir<%ld3> = load from index 3
+
 define void @fun0(ptr %ptr, ptr %dst) {
 entry:
   br label %for.body
@@ -48,7 +50,8 @@ for.end:
 ; which gives a cost of 5.
 ;
 ; CHECK: LV: Checking a loop in 'fun1'
-; CHECK: LV: Found an estimated cost of 5 for VF 16 For instruction:   %ld0 = load i8
+; CHECK: Cost of 5 for VF 16: INTERLEAVE-GROUP with factor 3 at %ld0, vp<%next.gep>
+; CHECK:   ir<%ld0> = load from index 0
 define void @fun1(ptr %ptr, ptr %dst) {
 entry:
   br label %for.body
@@ -73,10 +76,11 @@ for.end:
 ; produce the vector values, which gives a cost of 6.
 ;
 ; CHECK: LV: Checking a loop in 'fun2'
-; CHECK: LV: Found an estimated cost of 6 for VF 2 For instruction:   %ld0 = load i8
-; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %ld1 = load i8
-; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %ld2 = load i8
-; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %ld3 = load i8
+; CHECK: Cost of 6 for VF 2: INTERLEAVE-GROUP with factor 32 at %ld0, vp<%next.gep>
+; CHECK:   ir<%ld0> = load from index 0
+; CHECK:   ir<%ld1> = load from index 1
+; CHECK:   ir<%ld2> = load from index 2
+; CHECK:   ir<%ld3> = load from index 3
 define void @fun2(ptr %ptr, ptr %dst) {
 entry:
   br label %for.body
@@ -112,10 +116,11 @@ for.end:
 ; vector register boundary.
 ;
 ; CHECK: LV: Checking a loop in 'fun3'
-; CHECK: LV: Found an estimated cost of 7 for VF 2 For instruction:   %ld0 = load i8
-; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %ld1 = load i8
-; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %ld2 = load i8
-; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %ld3 = load i8
+; CHECK: Cost of 7 for VF 2: INTERLEAVE-GROUP with factor 30 at %ld0, vp<%next.gep>
+; CHECK:   ir<%ld0> = load from index 0
+; CHECK:   ir<%ld1> = load from index 1
+; CHECK:   ir<%ld2> = load from index 2
+; CHECK:   ir<%ld3> = load from index 3
 define void @fun3(ptr %ptr, ptr %dst) {
 entry:
   br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/X86/fneg-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/fneg-cost.ll
index dae341bcf53800..f7d77bafc2a115 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/fneg-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/fneg-cost.ll
@@ -6,8 +6,8 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 target triple = "x86_64-apple-macosx10.8.0"
 
 ; CHECK: Found an estimated cost of 1 for VF 1 For instruction:   %neg = fneg float %{{.*}}
-; CHECK: Found an estimated cost of 1 for VF 2 For instruction:   %neg = fneg float %{{.*}}
-; CHECK: Found an estimated cost of 1 for VF 4 For instruction:   %neg = fneg float %{{.*}}
+; CHECK: Cost of 1 for VF 2: WIDEN ir<%neg> = fneg ir<%0>
+; CHECK: Cost of 1 for VF 4: WIDEN ir<%neg> = fneg ir<%0>
 define void @fneg_cost(ptr %a, i64 %n) {
 entry:
   br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/X86/mul_slm_16bit.ll b/llvm/test/Transforms/LoopVectorize/X86/mul_slm_16bit.ll
index 654376cc54f4a0..a6d38fb794d444 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/mul_slm_16bit.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/mul_slm_16bit.ll
@@ -32,38 +32,38 @@ for.body:                                         ; preds = %for.body.preheader,
   %conv3 = sext i8 %1 to i32
 ; sources of the mul is sext\sext from i8
 ; use pmullw\sext seq.
-; SLM:  cost of 3 for VF 2 {{.*}} mul nsw i32 %conv3, %conv
+; SLM: Cost of 3 for VF 2: WIDEN ir<%mul> = mul nsw ir<%conv3>, ir<%conv>
   %mul = mul nsw i32 %conv3, %conv
 ; sources of the mul is zext\sext from i8
 ; use pmulhw\pmullw\pshuf
-; SLM:  cost of 2 for VF 2 {{.*}} mul nsw i32 %conv4, %conv
+; SLM: Cost of 2 for VF 2: WIDEN ir<%mul2> = mul nsw ir<%conv4>, ir<%conv>
   %conv4 = zext i8 %1 to i32
   %mul2 = mul nsw i32 %conv4, %conv
   %sum0 = add i32 %mul, %mul2
 ; sources of the mul is zext\zext from i8
 ; use pmullw\zext
-; SLM:  cost of 2 for VF 2 {{.*}} mul nsw i32 %conv5, %conv4
+; SLM: Cost of 2 for VF 2: WIDEN ir<%mul3> = mul nsw ir<%conv5>, ir<%conv4>
   %conv5 = zext i8 %0 to i32
   %mul3 = mul nsw i32 %conv5, %conv4
   %sum1 = add i32 %sum0, %mul3
 ; sources of the mul is sext\-120
 ; use pmullw\sext
-; SLM:  cost of 3 for VF 2 {{.*}} mul nsw i32 -120, %conv3
+; SLM: Cost of 3 for VF 2: WIDEN ir<%mul4> = mul nsw ir<-120>, ir<%conv3>
   %mul4 = mul nsw i32 -120, %conv3
   %sum2 = add i32 %sum1, %mul4
 ; sources of the mul is sext\250
 ; use pmulhw\pmullw\pshuf
-; SLM:  cost of 2 for VF 2 {{.*}} mul nsw i32 250, %conv3
+; SLM: Cost of 2 for VF 2: WIDEN ir<%mul5> = mul nsw ir<250>, ir<%conv3>
   %mul5 = mul nsw i32 250, %conv3
   %sum3 = add i32 %sum2, %mul5
 ; sources of the mul is zext\-120
 ; use pmulhw\pmullw\pshuf
-; SLM:  cost of 2 for VF 2 {{.*}} mul nsw i32 -120, %conv4
+; SLM: Cost of 2 for VF 2: WIDEN ir<%mul6> = mul nsw ir<-120>, ir<%conv4>
   %mul6 = mul nsw i32 -120, %conv4
   %sum4 = add i32 %sum3, %mul6
 ; sources of the mul is zext\250
 ; use pmullw\zext
-; SLM:  cost of 2 for VF 2 {{.*}} mul nsw i32 250, %conv4
+; SLM: Cost of 2 for VF 2: WIDEN ir<%mul7> = mul nsw ir<250>, ir<%conv4>
   %mul7 = mul nsw i32 250, %conv4
   %sum5 = add i32 %sum4, %mul7
   %add = add i32 %acc.013, 5
@@ -101,38 +101,38 @@ for.body:                                         ; preds = %for.body.preheader,
   %conv3 = sext i16 %1 to i32
 ; sources of the mul is sext\sext from i16
 ; use pmulhw\pmullw\pshuf seq.
-; SLM:  cost of 3 for VF 4 {{.*}} mul nsw i32 %conv3, %conv
+; SLM: Cost of 2 for VF 4: WIDEN ir<%mul> = mul nsw ir<%conv3>, ir<%conv>
   %mul = mul nsw i32 %conv3, %conv
 ; sources of the mul is zext\sext from i16
 ; use pmulld
-; SLM:  cost of 2 for VF 4 {{.*}} mul nsw i32 %conv4, %conv
+; SLM: Cost of 11 for VF 4: WIDEN ir<%mul2> = mul nsw ir<%conv4>, ir<%conv>
   %conv4 = zext i16 %1 to i32
   %mul2 = mul nsw i32 %conv4, %conv
   %sum0 = add i32 %mul, %mul2
 ; sources of the mul is zext\zext from i16
 ; use pmulhw\pmullw\zext
-; SLM:  cost of 2 for VF 4 {{.*}} mul nsw i32 %conv5, %conv4
+; SLM: Cost of 5 for VF 4: WIDEN ir<%mul3> = mul nsw ir<%conv5>, ir<%conv4>
   %conv5 = zext i16 %0 to i32
   %mul3 = mul nsw i32 %conv5, %conv4
   %sum1 = add i32 %sum0, %mul3
 ; sources of the mul is sext\-32000
 ; use pmulhw\pmullw\sext
-; SLM:  cost of 2 for VF 4 {{.*}} mul nsw i32 -32000, %conv3
+; SLM: Cost of 2 for VF 4: WIDEN ir<%mul4> = mul nsw ir<-32000>, ir<%conv3>
   %mul4 = mul nsw i32 -32000, %conv3
   %sum2 = add i32 %sum1, %mul4
 ; sources of the mul is sext\64000
 ; use pmulld
-; SLM:  cost of 11 for VF 4 {{.*}} mul nsw i32 64000, %conv3
+; SLM: Cost of 11 for VF 4: WIDEN ir<%mul5> = mul nsw ir<64000>, ir<%conv3>
   %mul5 = mul nsw i32 64000, %conv3
   %sum3 = add i32 %sum2, %mul5
 ; sources of the mul is zext\-32000
 ; use pmulld
-; SLM:  cost of 11 for VF 4 {{.*}} mul nsw i32 -32000, %conv4
+; SLM: Cost of 11 for VF 4: WIDEN ir<%mul6> = mul nsw ir<-32000>, ir<%conv4>
   %mul6 = mul nsw i32 -32000, %conv4
   %sum4 = add i32 %sum3, %mul6
 ; sources of the mul is zext\64000
 ; use pmulhw\pmullw\zext
-; SLM:  cost of 5 for VF 4 {{.*}} mul nsw i32 250, %conv4
+; SLM: Cost of 5 for VF 4: WIDEN ir<%mul7> = mul nsw ir<250>, ir<%conv4>
   %mul7 = mul nsw i32 250, %conv4
   %sum5 = add i32 %sum4, %mul7
   %add = add i32 %acc.013, 5
diff --git a/llvm/test/Transforms/LoopVectorize/X86/reduction-small-size.ll b/llvm/test/Transforms/LoopVectorize/X86/reduction-small-size.ll
index 7d435cc85c9dfb..052a963f5458bd 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/reduction-small-size.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/reduction-small-size.ll
@@ -28,21 +28,28 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 ; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction:   %{{.*}} = trunc
 ; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction:   %{{.*}} = icmp
 ; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction:   br
-; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   %{{.*}} = phi
-; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   %{{.*}} = phi
-; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   %{{.*}} = getelementptr
-; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   %{{.*}} = load
-; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   %{{.*}} = zext i8 %{{.*}} to i32
-; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   %{{.*}} = getelementptr
-; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   %{{.*}} = load
-; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   %{{.*}} = zext i8 %{{.*}} to i32
-; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   %{{.*}} = and i32 %{{.*}}, 255
-; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   %{{.*}} = add
-; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   %{{.*}} = add
-; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   %{{.*}} = add
-; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   %{{.*}} = trunc
-; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   %{{.*}} = icmp
-; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   br
+; CHECK: Cost of 1 for VF 2: induction instruction   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+; CHECK: Cost of 1 for VF 2: induction instruction   %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+; CHECK: Cost of 1 for VF 2: exit condition instruction   %exitcond = icmp eq i32 %lftr.wideiv, %n
+; CHECK: Cost of 0 for VF 2: exit condition instruction   %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+; CHECK: Cost of 0 for VF 2: EMIT vp<%3> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK: Cost of 1 for VF 2: WIDEN-REDUCTION-PHI ir<%sum.013> = phi ir<0>, vp<%8>
+; CHECK: Cost of 0 for VF 2: vp<%4> = SCALAR-STEPS vp<%3>, ir<1>
+; CHECK: Cost of 0 for VF 2: CLONE ir<%arrayidx> = getelementptr inbounds ir<%a>, vp<%4>
+; CHECK: Cost of 0 for VF 2: vp<%5> = vector-pointer ir<%arrayidx>
+; CHECK: Cost of 1 for VF 2: WIDEN ir<%0> = load vp<%5>
+; CHECK: Cost of 0 for VF 2: WIDEN-CAST ir<%conv> = zext  ir<%0> to i32
+; CHECK: Cost of 0 for VF 2: CLONE ir<%arrayidx2> = getelementptr inbounds ir<%b>, vp<%4>
+; CHECK: Cost of 0 for VF 2: vp<%6> = vector-pointer ir<%arrayidx2>
+; CHECK: Cost of 1 for VF 2: WIDEN ir<%1> = load vp<%6>
+; CHECK: Cost of 0 for VF 2: WIDEN-CAST ir<%conv3> = zext  ir<%1> to i32
+; CHECK: Cost of 0 for VF 2: WIDEN ir<%conv4> = and ir<%sum.013>, ir<255>
+; CHECK: Cost of 1 for VF 2: WIDEN ir<%add> = add ir<%conv>, ir<%conv4>
+; CHECK: Cost of 1 for VF 2: WIDEN ir<%add5> = add ir<%add>, ir<%conv3>
+; CHECK: Cost of 0 for VF 2: WIDEN-CAST vp<%7> = trunc  ir<%add5> to i8
+; CHECK: Cost of 0 for VF 2: WIDEN-CAST vp<%8> = zext  vp<%7> to i32
+; CHECK: Cost of 0 for VF 2: EMIT vp<%index.next> = add nuw vp<%3>, vp<%0>
+; CHECK: Cost of 0 for VF 2: EMIT branch-on-count vp<%index.next>, vp<%1>
 ;
 define i8 @reduction_i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %n) {
 entry:
diff --git a/llvm/test/Transforms/LoopVectorize/X86/redundant-vf2-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/redundant-vf2-cost.ll
index 1660f1af06c6cd..25796c7c87e4e5 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/redundant-vf2-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/redundant-vf2-cost.ll
@@ -4,11 +4,11 @@
 ; Check that cost model is not executed twice for VF=2 when vectorization is
 ; forced for a particular loop.
 
-; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   %{{[0-9]+}} = load i32
-; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   store i32
-; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   %{{[0-9]+}} = load i32
-; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   store i32
-; CHECK: LV: Vector loop of width 2 costs: {{[0-9]+}}.
+; CHECK: Cost of 1 for VF 2: WIDEN ir<%0> = load vp<%4>
+; CHECK: Cost of 1 for VF 2: WIDEN store vp<%5>, ir<%val>
+; CHECK-NOT: Cost of 1 for VF 2: WIDEN ir<%0> = load vp<%4>
+; CHECK-NOT: Cost of 1 for VF 2: WIDEN store vp<%5>, ir<%val>
+; CHECK: Cost for VF 2: 5
 
 define i32 @foo(ptr %A, i32 %n) {
 entry:
diff --git a/llvm/test/Transforms/LoopVectorize/X86/uint64_to_fp64-cost-model.ll b/llvm/test/Transforms/LoopVectorize/X86/uint64_to_fp64-cost-model.ll
index ef0402aaae802f..b8dcfd31bbc4c0 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/uint64_to_fp64-cost-model.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/uint64_to_fp64-cost-model.ll
@@ -6,8 +6,8 @@ target triple = "x86_64-apple-macosx10.8.0"
 
 
 ; CHECK: cost of 4 for VF 1 For instruction:   %conv = uitofp i64 %tmp to double
-; CHECK: cost of 5 for VF 2 For instruction:   %conv = uitofp i64 %tmp to double
-; CHECK: cost of 10 for VF 4 For instruction:   %conv = uitofp i64 %tmp to double
+; CHECK: Cost of 5 for VF 2: WIDEN-CAST ir<%conv> = uitofp  ir<%tmp> to double
+; CHECK: Cost of 10 for VF 4: WIDEN-CAST ir<%conv> = uitofp  ir<%tmp> to double
 define void @uint64_to_double_cost(ptr noalias nocapture %a, ptr noalias nocapture readonly %b) nounwind {
 entry:
   br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/X86/uniformshift.ll b/llvm/test/Transforms/LoopVectorize/X86/uniformshift.ll
index d56740c8293c09..166875dd55aaef 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/uniformshift.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/uniformshift.ll
@@ -3,8 +3,8 @@
 
 ; CHECK: 'foo'
 ; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction:   %shift = ashr i32 %val, %k
-; CHECK: LV: Found an estimated cost of 2 for VF 2 For instruction:   %shift = ashr i32 %val, %k
-; CHECK: LV: Found an estimated cost of 2 for VF 4 For instruction:   %shift = ashr i32 %val, %k
+; CHECK: Cost of 2 for VF 2: WIDEN ir<%shift> = ashr ir<%val>, ir<%k>
+; CHECK: Cost of 2 for VF 4: WIDEN ir<%shift> = ashr ir<%val>, ir<%k>
 define void @foo(ptr nocapture %p, i32 %k) local_unnamed_addr #0 {
 entry:
   br label %body
diff --git a/llvm/test/Transforms/LoopVectorize/X86/vector-scalar-select-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/vector-scalar-select-cost.ll
index 234587aae1283c..22eb0ca3800333 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/vector-scalar-select-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/vector-scalar-select-cost.ll
@@ -23,8 +23,8 @@ define void @scalarselect(i1 %cond) {
   %7 = getelementptr inbounds [2048 x i32], ptr @a, i64 0, i64 %indvars.iv
 
 ; CHECK: cost of 1 for VF 1 {{.*}}  select i1 %cond, i32 %6, i32 0
-; CHECK: cost of 2 for VF 2 {{.*}}  select i1 %cond, i32 %6, i32 0
-; CHECK: cost of 2 for VF 4 {{.*}}  select i1 %cond, i32 %6, i32 0
+; CHECK: Cost of 2 for VF 2: WIDEN-SELECT ir<%sel> = select ir<%cond>, ir<%6>, ir<0> (condition is loop invariant)
+; CHECK: Cost of 2 for VF 4: WIDEN-SELECT ir<%sel> = select ir<%cond>, ir<%6>, ir<0> (condition is loop invariant)
 
   %sel = select i1 %cond, i32 %6, i32 zeroinitializer
   store i32 %sel, ptr %7, align 4
@@ -52,8 +52,8 @@ define void @vectorselect(i1 %cond) {
   %8 = icmp ult i64 %indvars.iv, 8
 
 ; CHECK: cost of 1 for VF 1 {{.*}}  select i1 %8, i32 %6, i32 0
-; CHECK: cost of 2 for VF 2 {{.*}}  select i1 %8, i32 %6, i32 0
-; CHECK: cost of 2 for VF 4 {{.*}}  select i1 %8, i32 %6, i32 0
+; CHECK: Cost of 2 for VF 2: WIDEN-SELECT ir<%sel> = select ir<%8>, ir<%6>, ir<0>
+; CHECK: Cost of 2 for VF 4: WIDEN-SELECT ir<%sel> = select ir<%8>, ir<%6>, ir<0>
 
   %sel = select i1 %8, i32 %6, i32 zeroinitializer
   store i32 %sel, ptr %7, align 4

>From 9bb9e3f2e1de673004698a8a10301f4154acacad Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood at arm.com>
Date: Thu, 31 Oct 2024 10:08:56 +0000
Subject: [PATCH 2/2] Address review comments

* Try to make tests more robust.
* Re-add some CHECK lines.
* Added more vectoriser cost model debug output showing
estimated cost per lane.
---
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp  | 10 +++++++++-
 .../scalable-vectorization-cost-tuning.ll        | 16 ++++++++--------
 .../AArch64/type-shrinkage-zext-costs.ll         |  1 +
 .../Transforms/LoopVectorize/ARM/mve-icmpcost.ll | 14 ++++++++------
 .../LoopVectorize/RISCV/force-vect-msg.ll        |  2 +-
 .../LoopVectorize/X86/redundant-vf2-cost.ll      | 10 +++++-----
 6 files changed, 32 insertions(+), 21 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index e8653498d32a12..86960a1cfcf5d9 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7382,7 +7382,15 @@ InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
 
   // Now compute and add the VPlan-based cost.
   Cost += Plan.cost(VF, CostCtx);
-  LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost << "\n");
+#ifndef NDEBUG
+  unsigned EstimatedWidth = VF.getKnownMinValue();
+  if (VF.isScalable())
+    if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI))
+      EstimatedWidth *= *VScale;
+  LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost
+                    << " (Estimated cost per lane: " << (Cost / EstimatedWidth)
+                    << ")\n");
+#endif
   return Cost;
 }
 
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll
index 36c35d5a154d94..16afd3f7610260 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll
@@ -19,20 +19,20 @@
 ; RUN:     -force-target-instruction-cost=1 -passes=loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefixes=NEOVERSE-V2,VF-16
 
-; GENERIC: Cost for VF vscale x 2: 11
-; GENERIC: Cost for VF vscale x 4: 11
+; GENERIC: Cost for VF vscale x 2: 11 (Estimated cost per lane: 2)
+; GENERIC: Cost for VF vscale x 4: 11 (Estimated cost per lane: 1)
 ; GENERIC: LV: Selecting VF: vscale x 16
 
-; NEOVERSE-V1: Cost for VF vscale x 2: 11
-; NEOVERSE-V1: Cost for VF vscale x 4: 11
+; NEOVERSE-V1: Cost for VF vscale x 2: 11 (Estimated cost per lane: 2)
+; NEOVERSE-V1: Cost for VF vscale x 4: 11 (Estimated cost per lane: 1)
 ; NEOVERSE-V1: LV: Selecting VF: vscale x 16
 
-; NEOVERSE-N2: Cost for VF vscale x 2: 11
-; NEOVERSE-N2: Cost for VF vscale x 4: 11
+; NEOVERSE-N2: Cost for VF vscale x 2: 11 (Estimated cost per lane: 5)
+; NEOVERSE-N2: Cost for VF vscale x 4: 11 (Estimated cost per lane: 2)
 ; NEOVERSE-N2: LV: Selecting VF: vscale x 16
 
-; NEOVERSE-V2: Cost for VF vscale x 2: 11
-; NEOVERSE-V2: Cost for VF vscale x 4: 11
+; NEOVERSE-V2: Cost for VF vscale x 2: 11 (Estimated cost per lane: 5)
+; NEOVERSE-V2: Cost for VF vscale x 4: 11 (Estimated cost per lane: 2)
 ; NEOVERSE-V2: LV: Selecting VF: 16
 
 ; VF-16: <16 x i8>
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-zext-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-zext-costs.ll
index b28214317b25e2..468cc8f2a72784 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-zext-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-zext-costs.ll
@@ -16,6 +16,7 @@ define void @zext_i8_i16(ptr noalias nocapture readonly %p, ptr noalias nocaptur
 ; CHECK-COST: Cost of 1 for VF vscale x 2: WIDEN-CAST ir<%conv> = zext  ir<%0> to i16
 ; CHECK-COST: Cost of 1 for VF vscale x 4: WIDEN-CAST ir<%conv> = zext  ir<%0> to i16
 ; CHECK-COST: Cost of 0 for VF vscale x 8: WIDEN-CAST ir<%conv> = zext  ir<%0> to i16
+; CHECK-COST: LV: Found an estimated cost of 0 for VF 1 For instruction:   %conv = zext i8 %0 to i32
 ; CHECK-LABEL: define void @zext_i8_i16
 ; CHECK-SAME: (ptr noalias nocapture readonly [[P:%.*]], ptr noalias nocapture [[Q:%.*]], i32 [[LEN:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  entry:
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-icmpcost.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-icmpcost.ll
index 929431adaafc4d..44bb791d687ea9 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/mve-icmpcost.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-icmpcost.ll
@@ -4,6 +4,7 @@
 target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
 target triple = "thumbv8.1m.main-arm-none-eabi"
 
+; CHECK-LABEL: LV: Checking a loop in 'expensive_icmp'
 ; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction:   %i.016 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ]
 ; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction:   %arrayidx = getelementptr inbounds i16, ptr %s, i32 %i.016
 ; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction:   %1 = load i16, ptr %arrayidx, align 2
@@ -34,7 +35,7 @@ target triple = "thumbv8.1m.main-arm-none-eabi"
 ; CHECK: Cost of 16 for VF 2: WIDEN store vp<%5>, ir<%conv6>, ir<%cmp2>
 ; CHECK: Cost of 0 for VF 2: EMIT vp<%index.next> = add nuw vp<%2>, vp<%0>
 ; CHECK: Cost of 0 for VF 2: EMIT branch-on-count vp<%index.next>, vp<%1>
-; CHECK: Cost for VF 2: 86
+; CHECK: Cost for VF 2: 86 (Estimated cost per lane: 43)
 ; CHECK: Cost of 1 for VF 4: induction instruction   %inc = add nuw nsw i32 %i.016, 1
 ; CHECK: Cost of 0 for VF 4: induction instruction   %i.016 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ]
 ; CHECK: Cost of 1 for VF 4: exit condition instruction   %exitcond.not = icmp eq i32 %inc, %n
@@ -51,7 +52,7 @@ target triple = "thumbv8.1m.main-arm-none-eabi"
 ; CHECK: Cost of 2 for VF 4: WIDEN store vp<%5>, ir<%conv6>, ir<%cmp2>
 ; CHECK: Cost of 0 for VF 4: EMIT vp<%index.next> = add nuw vp<%2>, vp<%0>
 ; CHECK: Cost of 0 for VF 4: EMIT branch-on-count vp<%index.next>, vp<%1>
-; CHECK: Cost for VF 4: 10
+; CHECK: Cost for VF 4: 10 (Estimated cost per lane: 2)
 ; CHECK: Cost of 1 for VF 8: induction instruction   %inc = add nuw nsw i32 %i.016, 1
 ; CHECK: Cost of 0 for VF 8: induction instruction   %i.016 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ]
 ; CHECK: Cost of 1 for VF 8: exit condition instruction   %exitcond.not = icmp eq i32 %inc, %n
@@ -68,7 +69,7 @@ target triple = "thumbv8.1m.main-arm-none-eabi"
 ; CHECK: Cost of 2 for VF 8: WIDEN store vp<%5>, ir<%conv6>, ir<%cmp2>
 ; CHECK: Cost of 0 for VF 8: EMIT vp<%index.next> = add nuw vp<%2>, vp<%0>
 ; CHECK: Cost of 0 for VF 8: EMIT branch-on-count vp<%index.next>, vp<%1>
-; CHECK: Cost for VF 8: 46
+; CHECK: Cost for VF 8: 46 (Estimated cost per lane: 5)
 ; CHECK: LV: Selecting VF: 4.
 define void @expensive_icmp(ptr noalias nocapture %d, ptr nocapture readonly %s, i32 %n, i16 zeroext %m) #0 {
 entry:
@@ -103,6 +104,7 @@ for.inc:                                          ; preds = %for.body, %if.then
   br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
 }
 
+; CHECK-LABEL: LV: Checking a loop in 'cheap_icmp'
 ; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction:   %blkCnt.012 = phi i32 [ %dec, %while.body ], [ %blockSize, %while.body.preheader ]
 ; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction:   %pSrcA.addr.011 = phi ptr [ %incdec.ptr, %while.body ], [ %pSrcA, %while.body.preheader ]
 ; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction:   %pDst.addr.010 = phi ptr [ %incdec.ptr5, %while.body ], [ %pDst, %while.body.preheader ]
@@ -155,7 +157,7 @@ for.inc:                                          ; preds = %for.body, %if.then
 ; CHECK: Cost of 18 for VF 2: WIDEN store vp<%8>, ir<%conv4>
 ; CHECK: Cost of 0 for VF 2: EMIT vp<%index.next> = add nuw vp<%2>, vp<%0>
 ; CHECK: Cost of 0 for VF 2: EMIT branch-on-count vp<%index.next>, vp<%1>
-; CHECK: Cost for VF 2: 130
+; CHECK: Cost for VF 2: 130 (Estimated cost per lane: 65)
 ; CHECK: Cost of 1 for VF 4: induction instruction   %dec = add i32 %blkCnt.012, -1
 ; CHECK: Cost of 0 for VF 4: induction instruction   %blkCnt.012 = phi i32 [ %dec, %while.body ], [ %blockSize, %while.body.preheader ]
 ; CHECK: Cost of 0 for VF 4: induction instruction   %incdec.ptr = getelementptr inbounds i8, ptr %pSrcA.addr.011, i32 1
@@ -187,7 +189,7 @@ for.inc:                                          ; preds = %for.body, %if.then
 ; CHECK: Cost of 2 for VF 4: WIDEN store vp<%8>, ir<%conv4>
 ; CHECK: Cost of 0 for VF 4: EMIT vp<%index.next> = add nuw vp<%2>, vp<%0>
 ; CHECK: Cost of 0 for VF 4: EMIT branch-on-count vp<%index.next>, vp<%1>
-; CHECK: Cost for VF 4: 14
+; CHECK: Cost for VF 4: 14 (Estimated cost per lane: 3)
 ; CHECK: Cost of 1 for VF 8: induction instruction   %dec = add i32 %blkCnt.012, -1
 ; CHECK: Cost of 0 for VF 8: induction instruction   %blkCnt.012 = phi i32 [ %dec, %while.body ], [ %blockSize, %while.body.preheader ]
 ; CHECK: Cost of 0 for VF 8: induction instruction   %incdec.ptr = getelementptr inbounds i8, ptr %pSrcA.addr.011, i32 1
@@ -219,7 +221,7 @@ for.inc:                                          ; preds = %for.body, %if.then
 ; CHECK: Cost of 2 for VF 8: WIDEN store vp<%8>, ir<%conv4>
 ; CHECK: Cost of 0 for VF 8: EMIT vp<%index.next> = add nuw vp<%2>, vp<%0>
 ; CHECK: Cost of 0 for VF 8: EMIT branch-on-count vp<%index.next>, vp<%1>
-; CHECK: Cost for VF 8: 26
+; CHECK: Cost for VF 8: 26 (Estimated cost per lane: 3)
 ; CHECK: Cost of 1 for VF 16: induction instruction   %dec = add i32 %blkCnt.012, -1
 ; CHECK: Cost of 0 for VF 16: induction instruction   %blkCnt.012 = phi i32 [ %dec, %while.body ], [ %blockSize, %while.body.preheader ]
 ; CHECK: Cost of 0 for VF 16: induction instruction   %incdec.ptr = getelementptr inbounds i8, ptr %pSrcA.addr.011, i32 1
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/force-vect-msg.ll b/llvm/test/Transforms/LoopVectorize/RISCV/force-vect-msg.ll
index 61c0c85f3e556c..988e3854c9c7b8 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/force-vect-msg.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/force-vect-msg.ll
@@ -4,7 +4,7 @@
 ; CHECK: LV: Loop hints: force=enabled
 ; CHECK: LV: Scalar loop costs: 4.
 ; ChosenFactor.Cost is 4, but the real cost will be divided by the width, which is 2.
-; CHECK: Cost for VF 2: 4
+; CHECK: Cost for VF 2: 4 (Estimated cost per lane: 2)
 ; Regardless of force vectorization or not, this loop will eventually be vectorized because of the cost model.
 ; Therefore, the following message does not need to be printed even if vectorization is explicitly forced in the metadata.
 ; CHECK-NOT: LV: Vectorization seems to be not beneficial, but was forced by a user.
diff --git a/llvm/test/Transforms/LoopVectorize/X86/redundant-vf2-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/redundant-vf2-cost.ll
index 25796c7c87e4e5..3eaa019659ee69 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/redundant-vf2-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/redundant-vf2-cost.ll
@@ -4,11 +4,11 @@
 ; Check that cost model is not executed twice for VF=2 when vectorization is
 ; forced for a particular loop.
 
-; CHECK: Cost of 1 for VF 2: WIDEN ir<%0> = load vp<%4>
-; CHECK: Cost of 1 for VF 2: WIDEN store vp<%5>, ir<%val>
-; CHECK-NOT: Cost of 1 for VF 2: WIDEN ir<%0> = load vp<%4>
-; CHECK-NOT: Cost of 1 for VF 2: WIDEN store vp<%5>, ir<%val>
-; CHECK: Cost for VF 2: 5
+; CHECK: Cost of {{.*}} for VF 2: WIDEN {{.*}} = load
+; CHECK: Cost of {{.*}} for VF 2: WIDEN store
+; CHECK-NOT: Cost of {{.*}} for VF 2: WIDEN {{.*}} = load
+; CHECK-NOT: Cost of {{.*}} for VF 2: WIDEN store
+; CHECK: Cost for VF 2: 5 (Estimated cost per lane: 2)
 
 define i32 @foo(ptr %A, i32 %n) {
 entry: