[llvm] [LoopVectorize][NFC] Rewrite tests to check output of vplan cost model (PR #113697)

Thu Nov 14 04:46:48 PST 2024

https://github.com/david-arm updated https://github.com/llvm/llvm-project/pull/113697

>From f15cf5f03ab3dab838f466e219952743371bba11 Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood at arm.com>
Date: Thu, 14 Nov 2024 12:45:13 +0000
Subject: [PATCH] [LoopVectorize][NFC] Rewrite tests to check output of vplan
 cost model

Currently it's very difficult to improve the cost model for tail-folded
loops because as soon as you add a VPInstruction::computeCost function
that adds the costs of instructions such as VPInstruction::ActiveLaneMask
and VPInstruction::ExplicitVectorLength the assert in
LoopVectorizationPlanner::computeBestVF fails for some tests. This is
because the VF chosen by the legacy cost model doesn't match the vplan
cost model. See PR #90191. This assert is currently making it difficult
to improve the cost model.

Hopefully we will be in a position to remove the assert soon, however
in order to do that we have to fix up a whole bunch of tests that rely
upon the legacy cost model output. I've tried my best to update
these tests to use vplan output instead.

There is still work needed for the VF=1 case because the vplan cost
model is not printed out in this case. I've not attempted to fix those
in this patch.
---
 .../Transforms/Vectorize/LoopVectorize.cpp    |  19 +-
 .../AArch64/aarch64-predication.ll            |   2 +-
 .../extractvalue-no-scalarization-required.ll |  19 +-
 .../LoopVectorize/AArch64/intrinsiccost.ll    |  14 +-
 .../LoopVectorize/AArch64/masked-op-cost.ll   |   4 +-
 .../AArch64/maximize-bandwidth-invalidate.ll  |   8 +-
 .../AArch64/no_vector_instructions.ll         |   2 -
 .../scalable-fp-ext-trunc-illegal-type.ll     |   2 +-
 .../scalable-vectorization-cost-tuning.ll     |  47 +--
 .../LoopVectorize/AArch64/select-costs.ll     |  22 +-
 .../AArch64/sve-inductions-unusual-types.ll   |  12 +-
 .../AArch64/sve-strict-fadd-cost.ll           |  20 +-
 .../AArch64/type-shrinkage-zext-costs.ll      |  33 +-
 .../LoopVectorize/ARM/mve-icmpcost.ll         | 311 +++++++++++-------
 .../LoopVectorize/ARM/mve-saddsatcost.ll      |   8 +-
 .../LoopVectorize/ARM/mve-selectandorcost.ll  |   4 +-
 .../LoopVectorize/ARM/mve-shiftcost.ll        |   4 +-
 .../LoopVectorize/RISCV/force-vect-msg.ll     |   2 +-
 .../SystemZ/mem-interleaving-costs-02.ll      |  31 +-
 .../Transforms/LoopVectorize/X86/fneg-cost.ll |   4 +-
 .../LoopVectorize/X86/mul_slm_16bit.ll        |  28 +-
 .../LoopVectorize/X86/reduction-small-size.ll |  37 ++-
 .../LoopVectorize/X86/redundant-vf2-cost.ll   |  10 +-
 .../X86/uint64_to_fp64-cost-model.ll          |   4 +-
 .../LoopVectorize/X86/uniformshift.ll         |   4 +-
 .../X86/vector-scalar-select-cost.ll          |   8 +-
 26 files changed, 382 insertions(+), 277 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 1ebc62f9843905..c1f632a7c0d3be 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -131,6 +131,7 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/InstructionCost.h"
 #include "llvm/Support/MathExtras.h"
+#include "llvm/Support/NativeFormatting.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
@@ -7403,7 +7404,23 @@ InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
 
   // Now compute and add the VPlan-based cost.
   Cost += Plan.cost(VF, CostCtx);
-  LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost << "\n");
+#ifndef NDEBUG
+  unsigned EstimatedWidth = VF.getKnownMinValue();
+  if (VF.isScalable())
+    if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI))
+      EstimatedWidth *= *VScale;
+  LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost
+                    << " (Estimated cost per lane: ");
+  if (Cost.isValid()) {
+    std::string CostString;
+    raw_string_ostream CostStream(CostString);
+    double CostPerLane = double(*Cost.getValue()) / EstimatedWidth;
+    llvm::write_double(CostStream, CostPerLane, FloatStyle::Fixed, 1);
+    LLVM_DEBUG(dbgs() << CostString);
+  } else /* No point dividing an invalid cost - it will still be invalid */
+    LLVM_DEBUG(dbgs() << "Invalid");
+  LLVM_DEBUG(dbgs() << ")\n");
+#endif
   return Cost;
 }
 
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll b/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll
index dd3d4215a3f63a..e6d93ea192e56b 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll
@@ -13,7 +13,7 @@ target triple = "aarch64--linux-gnu"
 ; %var4 a lower scalarization overhead.
 ;
 ; COST-LABEL:  predicated_udiv_scalarized_operand
-; COST:        LV: Found an estimated cost of 5 for VF 2 For instruction: %var4 = udiv i64 %var2, %var3
+; COST:        Cost of 5 for VF 2: profitable to scalarize   %var4 = udiv i64 %var2, %var3
 ;
 ;
 define i64 @predicated_udiv_scalarized_operand(ptr %a, i64 %x) optsize {
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll b/llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll
index 28830f9bcd11e1..aa78113ebaa48c 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll
@@ -11,9 +11,14 @@
 ; CM: LV: Found uniform instruction:   %a = extractvalue { i64, i64 } %sv, 0
 ; CM: LV: Found uniform instruction:   %b = extractvalue { i64, i64 } %sv, 1
 
+; Ensure the extractvalue + add instructions are hoisted out
+; CM: vector.ph:
+; CM:  CLONE ir<%a> = extractvalue ir<%sv>
+; CM:  CLONE ir<%b> = extractvalue ir<%sv>
+; CM:  WIDEN ir<%add> = add ir<%a>, ir<%b>
+; CM:  Successor(s): vector loop
+
 ; CM: LV: Scalar loop costs: 5.
-; CM: LV: Found an estimated cost of 0 for VF 2 For instruction:   %a = extractvalue { i64, i64 } %sv, 0
-; CM-NEXT: LV: Found an estimated cost of 0 for VF 2 For instruction:   %b = extractvalue { i64, i64 } %sv, 1
 
 ; Check that the extractvalue operands are actually free in vector code.
 
@@ -58,12 +63,14 @@ exit:
 ; Similar to the test case above, but checks getVectorCallCost as well.
 declare float @powf(float, float) readnone nounwind
 
-; CM: LV: Found uniform instruction:   %a = extractvalue { float, float } %sv, 0
-; CM: LV: Found uniform instruction:   %b = extractvalue { float, float } %sv, 1
+; Ensure the extractvalue + add instructions are hoisted out
+; CM: vector.ph:
+; CM:  CLONE ir<%a> = extractvalue ir<%sv>
+; CM:  CLONE ir<%b> = extractvalue ir<%sv>
+; CM:  WIDEN ir<%add> = add ir<%a>, ir<%b>
+; CM:  Successor(s): vector loop
 
 ; CM: LV: Scalar loop costs: 14.
-; CM: LV: Found an estimated cost of 0 for VF 2 For instruction:   %a = extractvalue { float, float } %sv, 0
-; CM-NEXT: LV: Found an estimated cost of 0 for VF 2 For instruction:   %b = extractvalue { float, float } %sv, 1
 
 ; FORCED-LABEL: define void @test_getVectorCallCost
 
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll
index c9990487665b98..064eead452852d 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll
@@ -8,9 +8,9 @@ target triple = "aarch64--linux-gnu"
 
 ; CHECK-COST-LABEL: sadd
 ; CHECK-COST: Found an estimated cost of 6 for VF 1 For instruction:   %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset)
-; CHECK-COST: Found an estimated cost of 4 for VF 2 For instruction:   %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset)
-; CHECK-COST: Found an estimated cost of 1 for VF 4 For instruction:   %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset)
-; CHECK-COST: Found an estimated cost of 1 for VF 8 For instruction:   %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset)
+; CHECK-COST: Cost of 4 for VF 2: WIDEN-INTRINSIC ir<%1> = call llvm.sadd.sat(ir<%0>, ir<%offset>)
+; CHECK-COST: Cost of 1 for VF 4: WIDEN-INTRINSIC ir<%1> = call llvm.sadd.sat(ir<%0>, ir<%offset>)
+; CHECK-COST: Cost of 1 for VF 8: WIDEN-INTRINSIC ir<%1> = call llvm.sadd.sat(ir<%0>, ir<%offset>)
 
 define void @saddsat(ptr nocapture readonly %pSrc, i16 signext %offset, ptr nocapture noalias %pDst, i32 %blockSize) #0 {
 ; CHECK-LABEL: @saddsat(
@@ -95,10 +95,10 @@ while.end:                                        ; preds = %while.body, %entry
 
 ; CHECK-COST-LABEL: umin
 ; CHECK-COST: Found an estimated cost of 2 for VF 1 For instruction:   %1 = tail call i8 @llvm.umin.i8(i8 %0, i8 %offset)
-; CHECK-COST: Found an estimated cost of 1 for VF 2 For instruction:   %1 = tail call i8 @llvm.umin.i8(i8 %0, i8 %offset)
-; CHECK-COST: Found an estimated cost of 1 for VF 4 For instruction:   %1 = tail call i8 @llvm.umin.i8(i8 %0, i8 %offset)
-; CHECK-COST: Found an estimated cost of 1 for VF 8 For instruction:   %1 = tail call i8 @llvm.umin.i8(i8 %0, i8 %offset)
-; CHECK-COST: Found an estimated cost of 1 for VF 16 For instruction:   %1 = tail call i8 @llvm.umin.i8(i8 %0, i8 %offset)
+; CHECK-COST: Cost of 1 for VF 2: WIDEN-INTRINSIC ir<%1> = call llvm.umin(ir<%0>, ir<%offset>)
+; CHECK-COST: Cost of 1 for VF 4: WIDEN-INTRINSIC ir<%1> = call llvm.umin(ir<%0>, ir<%offset>)
+; CHECK-COST: Cost of 1 for VF 8: WIDEN-INTRINSIC ir<%1> = call llvm.umin(ir<%0>, ir<%offset>)
+; CHECK-COST: Cost of 1 for VF 16: WIDEN-INTRINSIC ir<%1> = call llvm.umin(ir<%0>, ir<%offset>)
 
 define void @umin(ptr nocapture readonly %pSrc, i8 signext %offset, ptr nocapture noalias %pDst, i32 %blockSize) #0 {
 ; CHECK-LABEL: @umin(
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll
index 6b4cfa091c45e7..93bc131ee5c5a4 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll
@@ -5,8 +5,8 @@
 target triple = "aarch64-unknown-linux-gnu"
 
 ; CHECK-COST: Checking a loop in 'fixed_width'
-; CHECK-COST: Found an estimated cost of 10 for VF 2 For instruction:   store i32 2, ptr %arrayidx1, align 4
-; CHECK-COST: Found an estimated cost of 20 for VF 4 For instruction:   store i32 2, ptr %arrayidx1, align 4
+; CHECK-COST: Cost of 10 for VF 2: WIDEN store vp<%6>, ir<2>, vp<%5>
+; CHECK-COST: Cost of 20 for VF 4: WIDEN store vp<%6>, ir<2>, vp<%5>
 ; CHECK-COST: Selecting VF: 1.
 
 ; We should decide this loop is not worth vectorising using fixed width vectors
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/maximize-bandwidth-invalidate.ll b/llvm/test/Transforms/LoopVectorize/AArch64/maximize-bandwidth-invalidate.ll
index 5b90456a4f458b..05e0b7ca7520f2 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/maximize-bandwidth-invalidate.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/maximize-bandwidth-invalidate.ll
@@ -10,10 +10,10 @@ target triple = "aarch64"
 ; due to invalid cost decisions. The loop below has a low maximum trip count,
 ; so will be masked.
 
-; COST: LV: Found an estimated cost of 3000000 for VF 2 For instruction:   %0 = load
-; COST: LV: Found an estimated cost of 3000000 for VF 4 For instruction:   %0 = load
-; COST: LV: Found an estimated cost of 3000000 for VF 8 For instruction:   %0 = load
-; COST: LV: Found an estimated cost of 3000000 for VF 16 For instruction:   %0 = load
+; COST: Cost of 3000000 for VF 2: REPLICATE ir<%0> = load
+; COST: Cost of 3000000 for VF 4: REPLICATE ir<%0> = load
+; COST: Cost of 3000000 for VF 8: REPLICATE ir<%0> = load
+; COST: Cost of 3000000 for VF 16: REPLICATE ir<%0> = load
 ; COST: LV: Selecting VF: 1.
 
 define i32 @test(ptr nocapture noundef readonly %pInVec, ptr nocapture noundef readonly %pInA1, ptr nocapture noundef readonly %pInA2, ptr nocapture noundef readonly %pInA3, ptr nocapture noundef readonly %pInA4, i32 noundef %numCols) {
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/no_vector_instructions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/no_vector_instructions.ll
index 785241d342ddc5..089d279d152455 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/no_vector_instructions.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/no_vector_instructions.ll
@@ -6,7 +6,6 @@ target triple = "aarch64--linux-gnu"
 
 ; CHECK-LABEL: all_scalar
 ; CHECK:       LV: Found scalar instruction: %i.next = add nuw nsw i64 %i, 2
-; CHECK:       LV: Found an estimated cost of 1 for VF 2 For instruction: %i.next = add nuw nsw i64 %i, 2
 ; CHECK:       LV: Not considering vector loop of width 2 because it will not generate any vector instructions
 ;
 define void @all_scalar(ptr %a, i64 %n) {
@@ -27,7 +26,6 @@ for.end:
 
 ; CHECK-LABEL: PR33193
 ; CHECK:       LV: Found scalar instruction: %i.next = zext i32 %j.next to i64
-; CHECK:       LV: Found an estimated cost of 0 for VF 8 For instruction: %i.next = zext i32 %j.next to i64
 ; CHECK:       LV: Not considering vector loop of width 8 because it will not generate any vector instructions
 %struct.a = type { i32, i8 }
 define void @PR33193(ptr %a, i64 %n) {
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-fp-ext-trunc-illegal-type.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-fp-ext-trunc-illegal-type.ll
index 92b043a9c29d50..fb5ff66989a679 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-fp-ext-trunc-illegal-type.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-fp-ext-trunc-illegal-type.ll
@@ -9,7 +9,7 @@ target triple = "aarch64-unknown-linux-gnu"
 ;; registers required for a <vscale x 4 x fp128> when trying to maximize
 ;; vector bandwidth with SVE.
 
-; CHECK: LV: Found an estimated cost of Invalid for VF vscale x 2 For instruction: %load.ext = fpext double %load.in to fp128
+; CHECK: Cost of Invalid for VF vscale x 2: WIDEN-CAST ir<%load.ext> = fpext  ir<%load.in> to fp128
 
 define void @load_ext_trunc_store(ptr readonly %in, ptr noalias %out, i64 %N) {
 ; CHECK-LABEL: define void @load_ext_trunc_store(
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll
index f28f77bf1b1558..62c9f22b82c89e 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll
@@ -1,49 +1,55 @@
 ; REQUIRES: asserts
 ; RUN: opt -mtriple=aarch64 -mattr=+sve \
 ; RUN:     -force-target-instruction-cost=1 -passes=loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \
-; RUN:     | FileCheck %s --check-prefixes=GENERIC,VF-VSCALE4
+; RUN:     | FileCheck %s --check-prefixes=GENERIC,VF-VSCALE16
 
 ; RUN: opt -mtriple=aarch64 -mattr=+sve -mcpu=generic \
 ; RUN:     -force-target-instruction-cost=1 -passes=loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \
-; RUN:     | FileCheck %s --check-prefixes=GENERIC,VF-VSCALE4
+; RUN:     | FileCheck %s --check-prefixes=GENERIC,VF-VSCALE16
 
 ; RUN: opt -mtriple=aarch64 -mcpu=neoverse-v1 \
 ; RUN:     -force-target-instruction-cost=1 -passes=loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \
-; RUN:     | FileCheck %s --check-prefixes=NEOVERSE-V1,VF-VSCALE4
+; RUN:     | FileCheck %s --check-prefixes=NEOVERSE-V1,VF-VSCALE16
 
 ; RUN: opt -mtriple=aarch64 -mcpu=neoverse-n2 \
 ; RUN:     -force-target-instruction-cost=1 -passes=loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \
-; RUN:     | FileCheck %s --check-prefixes=NEOVERSE-N2,VF-VSCALE4
+; RUN:     | FileCheck %s --check-prefixes=NEOVERSE-N2,VF-VSCALE16
 
-; RUN: opt -mtriple=aarch64 -mcpu=neoverse-n2 \
+; RUN: opt -mtriple=aarch64 -mcpu=neoverse-v2 \
 ; RUN:     -force-target-instruction-cost=1 -passes=loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \
-; RUN:     | FileCheck %s --check-prefixes=NEOVERSE-N2,VF-VSCALE4
+; RUN:     | FileCheck %s --check-prefixes=NEOVERSE-V2,VF-16
+
+; GENERIC: Cost for VF vscale x 2: 11 (Estimated cost per lane: 2.8)
+; GENERIC: Cost for VF vscale x 4: 11 (Estimated cost per lane: 1.4)
+; GENERIC: LV: Selecting VF: vscale x 16
 
-; GENERIC: LV: Vector loop of width vscale x 2 costs: 3 (assuming a minimum vscale of 2).
-; GENERIC: LV: Vector loop of width vscale x 4 costs: 1 (assuming a minimum vscale of 2).
+; NEOVERSE-V1: Cost for VF vscale x 2: 11 (Estimated cost per lane: 2.8)
+; NEOVERSE-V1: Cost for VF vscale x 4: 11 (Estimated cost per lane: 1.4)
+; NEOVERSE-V1: LV: Selecting VF: vscale x 16
 
-; NEOVERSE-V1: LV: Vector loop of width vscale x 2 costs: 3 (assuming a minimum vscale of 2).
-; NEOVERSE-V1: LV: Vector loop of width vscale x 4 costs: 1 (assuming a minimum vscale of 2).
+; NEOVERSE-N2: Cost for VF vscale x 2: 11 (Estimated cost per lane: 5.5)
+; NEOVERSE-N2: Cost for VF vscale x 4: 11 (Estimated cost per lane: 2.8)
+; NEOVERSE-N2: LV: Selecting VF: vscale x 16
 
-; NEOVERSE-N2: LV: Vector loop of width vscale x 2 costs: 6 (assuming a minimum vscale of 1).
-; NEOVERSE-N2: LV: Vector loop of width vscale x 4 costs: 3 (assuming a minimum vscale of 1).
+; NEOVERSE-V2: Cost for VF vscale x 2: 11 (Estimated cost per lane: 5.5)
+; NEOVERSE-V2: Cost for VF vscale x 4: 11 (Estimated cost per lane: 2.8)
+; NEOVERSE-V2: LV: Selecting VF: 16
 
-; VF-4: <4 x i32>
-; VF-VSCALE4: <16 x i32>
+; VF-16: <16 x i8>
+; VF-VSCALE16: <vscale x 16 x i8>
 define void @test0(ptr %a, ptr %b, ptr %c) #0 {
 entry:
   br label %loop
 
 loop:
   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
-  %arrayidx = getelementptr inbounds i32, ptr %c, i64 %iv
-  %0 = load i32, ptr %arrayidx, align 4
+  %arrayidx = getelementptr inbounds i8, ptr %c, i64 %iv
+  %0 = load i8, ptr %arrayidx, align 4
   %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %iv
   %1 = load i8, ptr %arrayidx2, align 4
-  %zext = zext i8 %1 to i32
-  %add = add nsw i32 %zext, %0
-  %arrayidx5 = getelementptr inbounds i32, ptr %a, i64 %iv
-  store i32 %add, ptr %arrayidx5, align 4
+  %add = add nsw i8 %0, %1
+  %arrayidx5 = getelementptr inbounds i8, ptr %a, i64 %iv
+  store i8 %add, ptr %arrayidx5, align 4
   %iv.next = add nuw nsw i64 %iv, 1
   %exitcond.not = icmp eq i64 %iv.next, 1024
   br i1 %exitcond.not, label %exit, label %loop
@@ -51,4 +57,3 @@ loop:
 exit:
   ret void
 }
-
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/select-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/select-costs.ll
index 2bcc93127da1e0..6165a73e77f238 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/select-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/select-costs.ll
@@ -6,13 +6,15 @@ target triple = "arm64-apple-ios5.0.0"
 
 define void @selects_1(ptr nocapture %dst, i32 %A, i32 %B, i32 %C, i32 %N) {
 ; CHECK: LV: Checking a loop in 'selects_1'
-; CHECK: LV: Found an estimated cost of 1 for VF 2 For instruction:   %cond = select i1 %cmp1, i32 10, i32 %and
-; CHECK: LV: Found an estimated cost of 1 for VF 2 For instruction:   %cond6 = select i1 %cmp2, i32 30, i32 %and
-; CHECK: LV: Found an estimated cost of 1 for VF 2 For instruction:   %cond11 = select i1 %cmp7, i32 %cond, i32 %cond6
 
-; CHECK: LV: Found an estimated cost of 1 for VF 4 For instruction:   %cond = select i1 %cmp1, i32 10, i32 %and
-; CHECK: LV: Found an estimated cost of 1 for VF 4 For instruction:   %cond6 = select i1 %cmp2, i32 30, i32 %and
-; CHECK: LV: Found an estimated cost of 1 for VF 4 For instruction:   %cond11 = select i1 %cmp7, i32 %cond, i32 %cond6
+; CHECK: Cost of 1 for VF 2: WIDEN-SELECT ir<%cond> = select ir<%cmp1>, ir<10>, ir<%and>
+; CHECK: Cost of 1 for VF 2: WIDEN-SELECT ir<%cond6> = select ir<%cmp2>, ir<30>, ir<%and>
+; CHECK: Cost of 1 for VF 2: WIDEN-SELECT ir<%cond11> = select ir<%cmp7>, ir<%cond>, ir<%cond6>
+
+; CHECK: Cost of 1 for VF 4: WIDEN-SELECT ir<%cond> = select ir<%cmp1>, ir<10>, ir<%and>
+; CHECK: Cost of 1 for VF 4: WIDEN-SELECT ir<%cond6> = select ir<%cmp2>, ir<30>, ir<%and>
+; CHECK: Cost of 1 for VF 4: WIDEN-SELECT ir<%cond11> = select ir<%cmp7>, ir<%cond>, ir<%cond6>
+
 ; CHECK: LV: Selecting VF: 4
 
 entry:
@@ -48,9 +50,11 @@ for.cond.cleanup:                                 ; preds = %for.cond.cleanup.lo
 
 define i32 @multi_user_cmp(ptr readonly %a, i64 noundef %n) {
 ; CHECK: LV: Checking a loop in 'multi_user_cmp'
-; CHECK: LV: Found an estimated cost of 4 for VF 16 For instruction:   %cmp1 = fcmp olt float %load1, 0.000000e+00
-; CHECK: LV: Found an estimated cost of 1 for VF 16 For instruction:   %.any.0.off0 = select i1 %cmp1, i1 true, i1 %any.0.off09
-; CHECK: LV: Found an estimated cost of 1 for VF 16 For instruction:   %all.off = select i1 %cmp1, i1 %all.off.next, i1 false
+; CHECK: Cost of 1 for VF 16:
+; CHECK:  any-of reduction   %all.off = select i1 %cmp1, i1 %all.off.next, i1 false
+; CHECK: Cost of 1 for VF 16:
+; CHECK:  any-of reduction   %.any.0.off0 = select i1 %cmp1, i1 true, i1 %any.0.off09
+; CHECK: Cost of 4 for VF 16: WIDEN ir<%cmp1> = fcmp olt ir<%load1>, ir<0.000000e+00>
 ; CHECK: LV: Selecting VF: 16.
 entry:
   br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll
index 961fa59cadd360..b418905e514af0 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll
@@ -4,9 +4,9 @@
 
 target triple = "aarch64-unknown-linux-gnu"
 
-; DEBUG: Found an estimated cost of Invalid for VF vscale x 1 For instruction:   %indvars.iv1294 = phi i7 [ %indvars.iv.next1295, %for.body ], [ 0, %entry ]
-; DEBUG: Found an estimated cost of Invalid for VF vscale x 1 For instruction:   %addi7 = add i7 %indvars.iv1294, 0
-; DEBUG: Found an estimated cost of Invalid for VF vscale x 1 For instruction:   %indvars.iv.next1295 = add i7 %indvars.iv1294, 1
+; DEBUG: Cost of Invalid for VF vscale x 1: induction instruction   %indvars.iv.next1295 = add i7 %indvars.iv1294, 1
+; DEBUG: Cost of Invalid for VF vscale x 1: induction instruction   %indvars.iv1294 = phi i7 [ %indvars.iv.next1295, %for.body ], [ 0, %entry ]
+; DEBUG: Cost of Invalid for VF vscale x 1: WIDEN ir<%addi7> = add ir<%indvars.iv1294>, ir<0>
 
 define void @induction_i7(ptr %dst) #0 {
 ; CHECK-LABEL: define void @induction_i7(
@@ -71,9 +71,9 @@ for.end:                                          ; preds = %for.body
 }
 
 
-; DEBUG: Found an estimated cost of Invalid for VF vscale x 1 For instruction:   %indvars.iv1294 = phi i3 [ %indvars.iv.next1295, %for.body ], [ 0, %entry ]
-; DEBUG: Found an estimated cost of Invalid for VF vscale x 1 For instruction:   %zexti3 = zext i3 %indvars.iv1294 to i64
-; DEBUG: Found an estimated cost of Invalid for VF vscale x 1 For instruction:   %indvars.iv.next1295 = add i3 %indvars.iv1294, 1
+; DEBUG: Cost of Invalid for VF vscale x 1: induction instruction   %indvars.iv.next1295 = add i3 %indvars.iv1294, 1
+; DEBUG: Cost of Invalid for VF vscale x 1: induction instruction   %indvars.iv1294 = phi i3 [ %indvars.iv.next1295, %for.body ], [ 0, %entry ]
+; DEBUG: Cost of Invalid for VF vscale x 1: WIDEN-CAST ir<%zexti3> = zext  ir<%indvars.iv1294> to i64
 
 define void @induction_i3_zext(ptr %dst) #0 {
 ; CHECK-LABEL: define void @induction_i3_zext(
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-strict-fadd-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-strict-fadd-cost.ll
index 83bd07a3ec02cf..b84e8de678140f 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-strict-fadd-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-strict-fadd-cost.ll
@@ -7,10 +7,15 @@
 
 target triple="aarch64-unknown-linux-gnu"
 
-; CHECK: Found an estimated cost of 4 for VF vscale x 2 For instruction:   %add = fadd float %0, %sum.07
-; CHECK: Found an estimated cost of 8 for VF vscale x 4 For instruction:   %add = fadd float %0, %sum.07
-; CHECK-CPU-NEOVERSE-N2: Found an estimated cost of 2 for VF vscale x 2 For instruction:   %add = fadd float %0, %sum.07
-; CHECK-CPU-NEOVERSE-N2: Found an estimated cost of 4 for VF vscale x 4 For instruction:   %add = fadd float %0, %sum.07
+; CHECK-LABEL: LV: Checking a loop in 'fadd_strict32'
+; CHECK: Cost of 4 for VF vscale x 2:
+; CHECK:  in-loop reduction   %add = fadd float %0, %sum.07
+; CHECK: Cost of 8 for VF vscale x 4:
+; CHECK:  in-loop reduction   %add = fadd float %0, %sum.07
+; CHECK-CPU-NEOVERSE-N2: Cost of 2 for VF vscale x 2:
+; CHECK-CPU-NEOVERSE-N2:  in-loop reduction   %add = fadd float %0, %sum.07
+; CHECK-CPU-NEOVERSE-N2: Cost of 4 for VF vscale x 4:
+; CHECK-CPU-NEOVERSE-N2:  in-loop reduction   %add = fadd float %0, %sum.07
 
 define float @fadd_strict32(ptr noalias nocapture readonly %a, i64 %n) #0 {
 entry:
@@ -31,8 +36,11 @@ for.end:
 }
 
 
-; CHECK: Found an estimated cost of 4 for VF vscale x 2 For instruction:   %add = fadd double %0, %sum.07
-; CHECK-CPU-NEOVERSE-N2: Found an estimated cost of 2 for VF vscale x 2 For instruction:   %add = fadd double %0, %sum.07
+; CHECK-LABEL: LV: Checking a loop in 'fadd_strict64'
+; CHECK: Cost of 4 for VF vscale x 2:
+; CHECK:  in-loop reduction   %add = fadd double %0, %sum.07
+; CHECK-CPU-NEOVERSE-N2: Cost of 2 for VF vscale x 2:
+; CHECK-CPU-NEOVERSE-N2:  in-loop reduction   %add = fadd double %0, %sum.07
 
 define double @fadd_strict64(ptr noalias nocapture readonly %a, i64 %n) #0 {
 entry:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-zext-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-zext-costs.ll
index dec3c286345adf..468cc8f2a72784 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-zext-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-zext-costs.ll
@@ -8,15 +8,15 @@ target triple = "aarch64-unknown-linux-gnu"
 
 define void @zext_i8_i16(ptr noalias nocapture readonly %p, ptr noalias nocapture %q, i32 %len) #0 {
 ; CHECK-COST-LABEL: LV: Checking a loop in 'zext_i8_i16'
+; CHECK-COST: Cost of 1 for VF 2: WIDEN-CAST ir<%conv> = zext  ir<%0> to i16
+; CHECK-COST: Cost of 1 for VF 4: WIDEN-CAST ir<%conv> = zext  ir<%0> to i16
+; CHECK-COST: Cost of 1 for VF 8: WIDEN-CAST ir<%conv> = zext  ir<%0> to i16
+; CHECK-COST: Cost of 2 for VF 16: WIDEN-CAST ir<%conv> = zext  ir<%0> to i16
+; CHECK-COST: Cost of 1 for VF vscale x 1: WIDEN-CAST ir<%conv> = zext  ir<%0> to i16
+; CHECK-COST: Cost of 1 for VF vscale x 2: WIDEN-CAST ir<%conv> = zext  ir<%0> to i16
+; CHECK-COST: Cost of 1 for VF vscale x 4: WIDEN-CAST ir<%conv> = zext  ir<%0> to i16
+; CHECK-COST: Cost of 0 for VF vscale x 8: WIDEN-CAST ir<%conv> = zext  ir<%0> to i16
 ; CHECK-COST: LV: Found an estimated cost of 0 for VF 1 For instruction:   %conv = zext i8 %0 to i32
-; CHECK-COST: LV: Found an estimated cost of 1 for VF 2 For instruction:   %conv = zext i8 %0 to i32
-; CHECK-COST: LV: Found an estimated cost of 1 for VF 4 For instruction:   %conv = zext i8 %0 to i32
-; CHECK-COST: LV: Found an estimated cost of 1 for VF 8 For instruction:   %conv = zext i8 %0 to i32
-; CHECK-COST: LV: Found an estimated cost of 2 for VF 16 For instruction:   %conv = zext i8 %0 to i32
-; CHECK-COST: LV: Found an estimated cost of 1 for VF vscale x 1 For instruction:   %conv = zext i8 %0 to i32
-; CHECK-COST: LV: Found an estimated cost of 1 for VF vscale x 2 For instruction:   %conv = zext i8 %0 to i32
-; CHECK-COST: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction:   %conv = zext i8 %0 to i32
-; CHECK-COST: LV: Found an estimated cost of 0 for VF vscale x 8 For instruction:   %conv = zext i8 %0 to i32
 ; CHECK-LABEL: define void @zext_i8_i16
 ; CHECK-SAME: (ptr noalias nocapture readonly [[P:%.*]], ptr noalias nocapture [[Q:%.*]], i32 [[LEN:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  entry:
@@ -91,15 +91,14 @@ exit:                                 ; preds = %for.body
 
 define void @sext_i8_i16(ptr noalias nocapture readonly %p, ptr noalias nocapture %q, i32 %len) #0 {
 ; CHECK-COST-LABEL: LV: Checking a loop in 'sext_i8_i16'
-; CHECK-COST: LV: Found an estimated cost of 0 for VF 1 For instruction:   %conv = sext i8 %0 to i32
-; CHECK-COST: LV: Found an estimated cost of 1 for VF 2 For instruction:   %conv = sext i8 %0 to i32
-; CHECK-COST: LV: Found an estimated cost of 1 for VF 4 For instruction:   %conv = sext i8 %0 to i32
-; CHECK-COST: LV: Found an estimated cost of 1 for VF 8 For instruction:   %conv = sext i8 %0 to i32
-; CHECK-COST: LV: Found an estimated cost of 2 for VF 16 For instruction:   %conv = sext i8 %0 to i32
-; CHECK-COST: LV: Found an estimated cost of 1 for VF vscale x 1 For instruction:   %conv = sext i8 %0 to i32
-; CHECK-COST: LV: Found an estimated cost of 1 for VF vscale x 2 For instruction:   %conv = sext i8 %0 to i32
-; CHECK-COST: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction:   %conv = sext i8 %0 to i32
-; CHECK-COST: LV: Found an estimated cost of 0 for VF vscale x 8 For instruction:   %conv = sext i8 %0 to i32
+; CHECK-COST: Cost of 1 for VF 2: WIDEN-CAST ir<%conv> = sext  ir<%0> to i16
+; CHECK-COST: Cost of 1 for VF 4: WIDEN-CAST ir<%conv> = sext  ir<%0> to i16
+; CHECK-COST: Cost of 1 for VF 8: WIDEN-CAST ir<%conv> = sext  ir<%0> to i16
+; CHECK-COST: Cost of 2 for VF 16: WIDEN-CAST ir<%conv> = sext  ir<%0> to i16
+; CHECK-COST: Cost of 1 for VF vscale x 1: WIDEN-CAST ir<%conv> = sext  ir<%0> to i16
+; CHECK-COST: Cost of 1 for VF vscale x 2: WIDEN-CAST ir<%conv> = sext  ir<%0> to i16
+; CHECK-COST: Cost of 1 for VF vscale x 4: WIDEN-CAST ir<%conv> = sext  ir<%0> to i16
+; CHECK-COST: Cost of 0 for VF vscale x 8: WIDEN-CAST ir<%conv> = sext  ir<%0> to i16
 ; CHECK-LABEL: define void @sext_i8_i16
 ; CHECK-SAME: (ptr noalias nocapture readonly [[P:%.*]], ptr noalias nocapture [[Q:%.*]], i32 [[LEN:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-icmpcost.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-icmpcost.ll
index e7a361ecdbff00..dab7c22b9fde4a 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/mve-icmpcost.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-icmpcost.ll
@@ -4,6 +4,7 @@
 target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
 target triple = "thumbv8.1m.main-arm-none-eabi"
 
+; CHECK-LABEL: LV: Checking a loop in 'expensive_icmp'
 ; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction:   %i.016 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ]
 ; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction:   %arrayidx = getelementptr inbounds i16, ptr %s, i32 %i.016
 ; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction:   %1 = load i16, ptr %arrayidx, align 2
@@ -18,48 +19,57 @@ target triple = "thumbv8.1m.main-arm-none-eabi"
 ; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction:   %exitcond.not = icmp eq i32 %inc, %n
 ; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction:   br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
 ; CHECK: LV: Scalar loop costs: 5.
-; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %i.016 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ]
-; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %arrayidx = getelementptr inbounds i16, ptr %s, i32 %i.016
-; CHECK: LV: Found an estimated cost of 18 for VF 2 For instruction:   %1 = load i16, ptr %arrayidx, align 2
-; CHECK: LV: Found an estimated cost of 4 for VF 2 For instruction:   %conv = sext i16 %1 to i32
-; CHECK: LV: Found an estimated cost of 20 for VF 2 For instruction:   %cmp2 = icmp sgt i32 %conv, %conv1
-; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   br i1 %cmp2, label %if.then, label %for.inc
-; CHECK: LV: Found an estimated cost of 26 for VF 2 For instruction:   %conv6 = add i16 %1, %0
-; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %arrayidx7 = getelementptr inbounds i16, ptr %d, i32 %i.016
-; CHECK: LV: Found an estimated cost of 16 for VF 2 For instruction:   store i16 %conv6, ptr %arrayidx7, align 2
-; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   br label %for.inc
-; CHECK: LV: Found an estimated cost of 1 for VF 2 For instruction:   %inc = add nuw nsw i32 %i.016, 1
-; CHECK: LV: Found an estimated cost of 1 for VF 2 For instruction:   %exitcond.not = icmp eq i32 %inc, %n
-; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
-; CHECK: LV: Vector loop of width 2 costs: 43.
-; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   %i.016 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ]
-; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   %arrayidx = getelementptr inbounds i16, ptr %s, i32 %i.016
-; CHECK: LV: Found an estimated cost of 2 for VF 4 For instruction:   %1 = load i16, ptr %arrayidx, align 2
-; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   %conv = sext i16 %1 to i32
-; CHECK: LV: Found an estimated cost of 2 for VF 4 For instruction:   %cmp2 = icmp sgt i32 %conv, %conv1
-; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   br i1 %cmp2, label %if.then, label %for.inc
-; CHECK: LV: Found an estimated cost of 2 for VF 4 For instruction:   %conv6 = add i16 %1, %0
-; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   %arrayidx7 = getelementptr inbounds i16, ptr %d, i32 %i.016
-; CHECK: LV: Found an estimated cost of 2 for VF 4 For instruction:   store i16 %conv6, ptr %arrayidx7, align 2
-; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   br label %for.inc
-; CHECK: LV: Found an estimated cost of 1 for VF 4 For instruction:   %inc = add nuw nsw i32 %i.016, 1
-; CHECK: LV: Found an estimated cost of 1 for VF 4 For instruction:   %exitcond.not = icmp eq i32 %inc, %n
-; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
-; CHECK: LV: Vector loop of width 4 costs: 2.
-; CHECK: LV: Found an estimated cost of 0 for VF 8 For instruction:   %i.016 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ]
-; CHECK: LV: Found an estimated cost of 0 for VF 8 For instruction:   %arrayidx = getelementptr inbounds i16, ptr %s, i32 %i.016
-; CHECK: LV: Found an estimated cost of 2 for VF 8 For instruction:   %1 = load i16, ptr %arrayidx, align 2
-; CHECK: LV: Found an estimated cost of 2 for VF 8 For instruction:   %conv = sext i16 %1 to i32
-; CHECK: LV: Found an estimated cost of 36 for VF 8 For instruction:   %cmp2 = icmp sgt i32 %conv, %conv1
-; CHECK: LV: Found an estimated cost of 0 for VF 8 For instruction:   br i1 %cmp2, label %if.then, label %for.inc
-; CHECK: LV: Found an estimated cost of 2 for VF 8 For instruction:   %conv6 = add i16 %1, %0
-; CHECK: LV: Found an estimated cost of 0 for VF 8 For instruction:   %arrayidx7 = getelementptr inbounds i16, ptr %d, i32 %i.016
-; CHECK: LV: Found an estimated cost of 2 for VF 8 For instruction:   store i16 %conv6, ptr %arrayidx7, align 2
-; CHECK: LV: Found an estimated cost of 0 for VF 8 For instruction:   br label %for.inc
-; CHECK: LV: Found an estimated cost of 1 for VF 8 For instruction:   %inc = add nuw nsw i32 %i.016, 1
-; CHECK: LV: Found an estimated cost of 1 for VF 8 For instruction:   %exitcond.not = icmp eq i32 %inc, %n
-; CHECK: LV: Found an estimated cost of 0 for VF 8 For instruction:   br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
-; CHECK: LV: Vector loop of width 8 costs: 5.
+; CHECK: Cost of 1 for VF 2: induction instruction   %inc = add nuw nsw i32 %i.016, 1
+; CHECK: Cost of 0 for VF 2: induction instruction   %i.016 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ]
+; CHECK: Cost of 1 for VF 2: exit condition instruction   %exitcond.not = icmp eq i32 %inc, %n
+; CHECK: Cost of 0 for VF 2: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK: Cost of 0 for VF 2: vp<%3> = SCALAR-STEPS vp<%2>, ir<1>
+; CHECK: Cost of 0 for VF 2: CLONE ir<%arrayidx> = getelementptr inbounds ir<%s>, vp<%3>
+; CHECK: Cost of 0 for VF 2: vp<%4> = vector-pointer ir<%arrayidx>
+; CHECK: Cost of 18 for VF 2: WIDEN ir<%1> = load vp<%4>
+; CHECK: Cost of 4 for VF 2: WIDEN-CAST ir<%conv> = sext  ir<%1> to i32
+; CHECK: Cost of 20 for VF 2: WIDEN ir<%cmp2> = icmp sgt ir<%conv>, ir<%conv1>
+; CHECK: Cost of 26 for VF 2: WIDEN ir<%conv6> = add ir<%1>, ir<%0>
+; CHECK: Cost of 0 for VF 2: CLONE ir<%arrayidx7> = getelementptr ir<%d>, vp<%3>
+; CHECK: Cost of 0 for VF 2: vp<%5> = vector-pointer ir<%arrayidx7>
+; CHECK: Cost of 16 for VF 2: WIDEN store vp<%5>, ir<%conv6>, ir<%cmp2>
+; CHECK: Cost of 0 for VF 2: EMIT vp<%index.next> = add nuw vp<%2>, vp<%0>
+; CHECK: Cost of 0 for VF 2: EMIT branch-on-count vp<%index.next>, vp<%1>
+; CHECK: Cost for VF 2: 86 (Estimated cost per lane: 43.0)
+; CHECK: Cost of 1 for VF 4: induction instruction   %inc = add nuw nsw i32 %i.016, 1
+; CHECK: Cost of 0 for VF 4: induction instruction   %i.016 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ]
+; CHECK: Cost of 1 for VF 4: exit condition instruction   %exitcond.not = icmp eq i32 %inc, %n
+; CHECK: Cost of 0 for VF 4: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK: Cost of 0 for VF 4: vp<%3> = SCALAR-STEPS vp<%2>, ir<1>
+; CHECK: Cost of 0 for VF 4: CLONE ir<%arrayidx> = getelementptr inbounds ir<%s>, vp<%3>
+; CHECK: Cost of 0 for VF 4: vp<%4> = vector-pointer ir<%arrayidx>
+; CHECK: Cost of 2 for VF 4: WIDEN ir<%1> = load vp<%4>
+; CHECK: Cost of 0 for VF 4: WIDEN-CAST ir<%conv> = sext  ir<%1> to i32
+; CHECK: Cost of 2 for VF 4: WIDEN ir<%cmp2> = icmp sgt ir<%conv>, ir<%conv1>
+; CHECK: Cost of 2 for VF 4: WIDEN ir<%conv6> = add ir<%1>, ir<%0>
+; CHECK: Cost of 0 for VF 4: CLONE ir<%arrayidx7> = getelementptr ir<%d>, vp<%3>
+; CHECK: Cost of 0 for VF 4: vp<%5> = vector-pointer ir<%arrayidx7>
+; CHECK: Cost of 2 for VF 4: WIDEN store vp<%5>, ir<%conv6>, ir<%cmp2>
+; CHECK: Cost of 0 for VF 4: EMIT vp<%index.next> = add nuw vp<%2>, vp<%0>
+; CHECK: Cost of 0 for VF 4: EMIT branch-on-count vp<%index.next>, vp<%1>
+; CHECK: Cost for VF 4: 10 (Estimated cost per lane: 2.5)
+; CHECK: Cost of 1 for VF 8: induction instruction   %inc = add nuw nsw i32 %i.016, 1
+; CHECK: Cost of 0 for VF 8: induction instruction   %i.016 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ]
+; CHECK: Cost of 1 for VF 8: exit condition instruction   %exitcond.not = icmp eq i32 %inc, %n
+; CHECK: Cost of 0 for VF 8: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK: Cost of 0 for VF 8: vp<%3> = SCALAR-STEPS vp<%2>, ir<1>
+; CHECK: Cost of 0 for VF 8: CLONE ir<%arrayidx> = getelementptr inbounds ir<%s>, vp<%3>
+; CHECK: Cost of 0 for VF 8: vp<%4> = vector-pointer ir<%arrayidx>
+; CHECK: Cost of 2 for VF 8: WIDEN ir<%1> = load vp<%4>
+; CHECK: Cost of 2 for VF 8: WIDEN-CAST ir<%conv> = sext  ir<%1> to i32
+; CHECK: Cost of 36 for VF 8: WIDEN ir<%cmp2> = icmp sgt ir<%conv>, ir<%conv1>
+; CHECK: Cost of 2 for VF 8: WIDEN ir<%conv6> = add ir<%1>, ir<%0>
+; CHECK: Cost of 0 for VF 8: CLONE ir<%arrayidx7> = getelementptr ir<%d>, vp<%3>
+; CHECK: Cost of 0 for VF 8: vp<%5> = vector-pointer ir<%arrayidx7>
+; CHECK: Cost of 2 for VF 8: WIDEN store vp<%5>, ir<%conv6>, ir<%cmp2>
+; CHECK: Cost of 0 for VF 8: EMIT vp<%index.next> = add nuw vp<%2>, vp<%0>
+; CHECK: Cost of 0 for VF 8: EMIT branch-on-count vp<%index.next>, vp<%1>
+; CHECK: Cost for VF 8: 46 (Estimated cost per lane: 5.8)
 ; CHECK: LV: Selecting VF: 4.
 define void @expensive_icmp(ptr noalias nocapture %d, ptr nocapture readonly %s, i32 %n, i16 zeroext %m) #0 {
 entry:
@@ -94,6 +104,7 @@ for.inc:                                          ; preds = %for.body, %if.then
   br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
 }
 
+; CHECK-LABEL: LV: Checking a loop in 'cheap_icmp'
 ; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction:   %blkCnt.012 = phi i32 [ %dec, %while.body ], [ %blockSize, %while.body.preheader ]
 ; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction:   %pSrcA.addr.011 = phi ptr [ %incdec.ptr, %while.body ], [ %pSrcA, %while.body.preheader ]
 ; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction:   %pDst.addr.010 = phi ptr [ %incdec.ptr5, %while.body ], [ %pDst, %while.body.preheader ]
@@ -115,90 +126,134 @@ for.inc:                                          ; preds = %for.body, %if.then
 ; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction:   %cmp.not = icmp eq i32 %dec, 0
 ; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction:   br i1 %cmp.not, label %while.end.loopexit, label %while.body
 ; CHECK: LV: Scalar loop costs: 9.
-; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %blkCnt.012 = phi i32 [ %dec, %while.body ], [ %blockSize, %while.body.preheader ]
-; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %pSrcA.addr.011 = phi ptr [ %incdec.ptr, %while.body ], [ %pSrcA, %while.body.preheader ]
-; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %pDst.addr.010 = phi ptr [ %incdec.ptr5, %while.body ], [ %pDst, %while.body.preheader ]
-; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %pSrcB.addr.09 = phi ptr [ %incdec.ptr2, %while.body ], [ %pSrcB, %while.body.preheader ]
-; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %incdec.ptr = getelementptr inbounds i8, ptr %pSrcA.addr.011, i32 1
-; CHECK: LV: Found an estimated cost of 18 for VF 2 For instruction:   %0 = load i8, ptr %pSrcA.addr.011, align 1
-; CHECK: LV: Found an estimated cost of 4 for VF 2 For instruction:   %conv1 = sext i8 %0 to i32
-; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %incdec.ptr2 = getelementptr inbounds i8, ptr %pSrcB.addr.09, i32 1
-; CHECK: LV: Found an estimated cost of 18 for VF 2 For instruction:   %1 = load i8, ptr %pSrcB.addr.09, align 1
-; CHECK: LV: Found an estimated cost of 4 for VF 2 For instruction:   %conv3 = sext i8 %1 to i32
-; CHECK: LV: Found an estimated cost of 26 for VF 2 For instruction:   %mul = mul nsw i32 %conv3, %conv1
-; CHECK: LV: Found an estimated cost of 18 for VF 2 For instruction:   %shr = ashr i32 %mul, 7
-; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %2 = icmp slt i32 %shr, 127
-; CHECK: LV: Found an estimated cost of 22 for VF 2 For instruction:   %spec.select.i = select i1 %2, i32 %shr, i32 127
-; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %conv4 = trunc i32 %spec.select.i to i8
-; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %incdec.ptr5 = getelementptr inbounds i8, ptr %pDst.addr.010, i32 1
-; CHECK: LV: Found an estimated cost of 18 for VF 2 For instruction:   store i8 %conv4, ptr %pDst.addr.010, align 1
-; CHECK: LV: Found an estimated cost of 1 for VF 2 For instruction:   %dec = add i32 %blkCnt.012, -1
-; CHECK: LV: Found an estimated cost of 1 for VF 2 For instruction:   %cmp.not = icmp eq i32 %dec, 0
-; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   br i1 %cmp.not, label %while.end.loopexit, label %while.body
-; CHECK: LV: Vector loop of width 2 costs: 65.
-; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   %blkCnt.012 = phi i32 [ %dec, %while.body ], [ %blockSize, %while.body.preheader ]
-; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   %pSrcA.addr.011 = phi ptr [ %incdec.ptr, %while.body ], [ %pSrcA, %while.body.preheader ]
-; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   %pDst.addr.010 = phi ptr [ %incdec.ptr5, %while.body ], [ %pDst, %while.body.preheader ]
-; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   %pSrcB.addr.09 = phi ptr [ %incdec.ptr2, %while.body ], [ %pSrcB, %while.body.preheader ]
-; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   %incdec.ptr = getelementptr inbounds i8, ptr %pSrcA.addr.011, i32 1
-; CHECK: LV: Found an estimated cost of 2 for VF 4 For instruction:   %0 = load i8, ptr %pSrcA.addr.011, align 1
-; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   %conv1 = sext i8 %0 to i32
-; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   %incdec.ptr2 = getelementptr inbounds i8, ptr %pSrcB.addr.09, i32 1
-; CHECK: LV: Found an estimated cost of 2 for VF 4 For instruction:   %1 = load i8, ptr %pSrcB.addr.09, align 1
-; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   %conv3 = sext i8 %1 to i32
-; CHECK: LV: Found an estimated cost of 2 for VF 4 For instruction:   %mul = mul nsw i32 %conv3, %conv1
-; CHECK: LV: Found an estimated cost of 2 for VF 4 For instruction:   %shr = ashr i32 %mul, 7
-; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   %2 = icmp slt i32 %shr, 127
-; CHECK: LV: Found an estimated cost of 2 for VF 4 For instruction:   %spec.select.i = select i1 %2, i32 %shr, i32 127
-; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   %conv4 = trunc i32 %spec.select.i to i8
-; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   %incdec.ptr5 = getelementptr inbounds i8, ptr %pDst.addr.010, i32 1
-; CHECK: LV: Found an estimated cost of 2 for VF 4 For instruction:   store i8 %conv4, ptr %pDst.addr.010, align 1
-; CHECK: LV: Found an estimated cost of 1 for VF 4 For instruction:   %dec = add i32 %blkCnt.012, -1
-; CHECK: LV: Found an estimated cost of 1 for VF 4 For instruction:   %cmp.not = icmp eq i32 %dec, 0
-; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   br i1 %cmp.not, label %while.end.loopexit, label %while.body
-; CHECK: LV: Vector loop of width 4 costs: 3.
-; CHECK: LV: Found an estimated cost of 0 for VF 8 For instruction:   %blkCnt.012 = phi i32 [ %dec, %while.body ], [ %blockSize, %while.body.preheader ]
-; CHECK: LV: Found an estimated cost of 0 for VF 8 For instruction:   %pSrcA.addr.011 = phi ptr [ %incdec.ptr, %while.body ], [ %pSrcA, %while.body.preheader ]
-; CHECK: LV: Found an estimated cost of 0 for VF 8 For instruction:   %pDst.addr.010 = phi ptr [ %incdec.ptr5, %while.body ], [ %pDst, %while.body.preheader ]
-; CHECK: LV: Found an estimated cost of 0 for VF 8 For instruction:   %pSrcB.addr.09 = phi ptr [ %incdec.ptr2, %while.body ], [ %pSrcB, %while.body.preheader ]
-; CHECK: LV: Found an estimated cost of 0 for VF 8 For instruction:   %incdec.ptr = getelementptr inbounds i8, ptr %pSrcA.addr.011, i32 1
-; CHECK: LV: Found an estimated cost of 2 for VF 8 For instruction:   %0 = load i8, ptr %pSrcA.addr.011, align 1
-; CHECK: LV: Found an estimated cost of 2 for VF 8 For instruction:   %conv1 = sext i8 %0 to i32
-; CHECK: LV: Found an estimated cost of 0 for VF 8 For instruction:   %incdec.ptr2 = getelementptr inbounds i8, ptr %pSrcB.addr.09, i32 1
-; CHECK: LV: Found an estimated cost of 2 for VF 8 For instruction:   %1 = load i8, ptr %pSrcB.addr.09, align 1
-; CHECK: LV: Found an estimated cost of 2 for VF 8 For instruction:   %conv3 = sext i8 %1 to i32
-; CHECK: LV: Found an estimated cost of 4 for VF 8 For instruction:   %mul = mul nsw i32 %conv3, %conv1
-; CHECK: LV: Found an estimated cost of 4 for VF 8 For instruction:   %shr = ashr i32 %mul, 7
-; CHECK: LV: Found an estimated cost of 0 for VF 8 For instruction:   %2 = icmp slt i32 %shr, 127
-; CHECK: LV: Found an estimated cost of 4 for VF 8 For instruction:   %spec.select.i = select i1 %2, i32 %shr, i32 127
-; CHECK: LV: Found an estimated cost of 2 for VF 8 For instruction:   %conv4 = trunc i32 %spec.select.i to i8
-; CHECK: LV: Found an estimated cost of 0 for VF 8 For instruction:   %incdec.ptr5 = getelementptr inbounds i8, ptr %pDst.addr.010, i32 1
-; CHECK: LV: Found an estimated cost of 2 for VF 8 For instruction:   store i8 %conv4, ptr %pDst.addr.010, align 1
-; CHECK: LV: Found an estimated cost of 1 for VF 8 For instruction:   %dec = add i32 %blkCnt.012, -1
-; CHECK: LV: Found an estimated cost of 1 for VF 8 For instruction:   %cmp.not = icmp eq i32 %dec, 0
-; CHECK: LV: Found an estimated cost of 0 for VF 8 For instruction:   br i1 %cmp.not, label %while.end.loopexit, label %while.body
-; CHECK: LV: Vector loop of width 8 costs: 3.
-; CHECK: LV: Found an estimated cost of 0 for VF 16 For instruction:   %blkCnt.012 = phi i32 [ %dec, %while.body ], [ %blockSize, %while.body.preheader ]
-; CHECK: LV: Found an estimated cost of 0 for VF 16 For instruction:   %pSrcA.addr.011 = phi ptr [ %incdec.ptr, %while.body ], [ %pSrcA, %while.body.preheader ]
-; CHECK: LV: Found an estimated cost of 0 for VF 16 For instruction:   %pDst.addr.010 = phi ptr [ %incdec.ptr5, %while.body ], [ %pDst, %while.body.preheader ]
-; CHECK: LV: Found an estimated cost of 0 for VF 16 For instruction:   %pSrcB.addr.09 = phi ptr [ %incdec.ptr2, %while.body ], [ %pSrcB, %while.body.preheader ]
-; CHECK: LV: Found an estimated cost of 0 for VF 16 For instruction:   %incdec.ptr = getelementptr inbounds i8, ptr %pSrcA.addr.011, i32 1
-; CHECK: LV: Found an estimated cost of 2 for VF 16 For instruction:   %0 = load i8, ptr %pSrcA.addr.011, align 1
-; CHECK: LV: Found an estimated cost of 6 for VF 16 For instruction:   %conv1 = sext i8 %0 to i32
-; CHECK: LV: Found an estimated cost of 0 for VF 16 For instruction:   %incdec.ptr2 = getelementptr inbounds i8, ptr %pSrcB.addr.09, i32 1
-; CHECK: LV: Found an estimated cost of 2 for VF 16 For instruction:   %1 = load i8, ptr %pSrcB.addr.09, align 1
-; CHECK: LV: Found an estimated cost of 6 for VF 16 For instruction:   %conv3 = sext i8 %1 to i32
-; CHECK: LV: Found an estimated cost of 8 for VF 16 For instruction:   %mul = mul nsw i32 %conv3, %conv1
-; CHECK: LV: Found an estimated cost of 8 for VF 16 For instruction:   %shr = ashr i32 %mul, 7
-; CHECK: LV: Found an estimated cost of 0 for VF 16 For instruction:   %2 = icmp slt i32 %shr, 127
-; CHECK: LV: Found an estimated cost of 8 for VF 16 For instruction:   %spec.select.i = select i1 %2, i32 %shr, i32 127
-; CHECK: LV: Found an estimated cost of 6 for VF 16 For instruction:   %conv4 = trunc i32 %spec.select.i to i8
-; CHECK: LV: Found an estimated cost of 0 for VF 16 For instruction:   %incdec.ptr5 = getelementptr inbounds i8, ptr %pDst.addr.010, i32 1
-; CHECK: LV: Found an estimated cost of 2 for VF 16 For instruction:   store i8 %conv4, ptr %pDst.addr.010, align 1
-; CHECK: LV: Found an estimated cost of 1 for VF 16 For instruction:   %dec = add i32 %blkCnt.012, -1
-; CHECK: LV: Found an estimated cost of 1 for VF 16 For instruction:   %cmp.not = icmp eq i32 %dec, 0
-; CHECK: LV: Found an estimated cost of 0 for VF 16 For instruction:   br i1 %cmp.not, label %while.end.loopexit, label %while.body
-; CHECK: LV: Vector loop of width 16 costs: 3.
+; CHECK: Cost of 1 for VF 2: induction instruction   %dec = add i32 %blkCnt.012, -1
+; CHECK: Cost of 0 for VF 2: induction instruction   %blkCnt.012 = phi i32 [ %dec, %while.body ], [ %blockSize, %while.body.preheader ]
+; CHECK: Cost of 0 for VF 2: induction instruction   %incdec.ptr = getelementptr inbounds i8, ptr %pSrcA.addr.011, i32 1
+; CHECK: Cost of 0 for VF 2: induction instruction   %pSrcA.addr.011 = phi ptr [ %incdec.ptr, %while.body ], [ %pSrcA, %while.body.preheader ]
+; CHECK: Cost of 0 for VF 2: induction instruction   %incdec.ptr5 = getelementptr inbounds i8, ptr %pDst.addr.010, i32 1
+; CHECK: Cost of 0 for VF 2: induction instruction   %pDst.addr.010 = phi ptr [ %incdec.ptr5, %while.body ], [ %pDst, %while.body.preheader ]
+; CHECK: Cost of 0 for VF 2: induction instruction   %incdec.ptr2 = getelementptr inbounds i8, ptr %pSrcB.addr.09, i32 1
+; CHECK: Cost of 0 for VF 2: induction instruction   %pSrcB.addr.09 = phi ptr [ %incdec.ptr2, %while.body ], [ %pSrcB, %while.body.preheader ]
+; CHECK: Cost of 1 for VF 2: exit condition instruction   %cmp.not = icmp eq i32 %dec, 0
+; CHECK: Cost of 0 for VF 2: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK: Cost of 0 for VF 2: vp<%3> = SCALAR-STEPS vp<%2>, ir<1>
+; CHECK: Cost of 0 for VF 2: EMIT vp<%next.gep> = ptradd ir<%pSrcA>, vp<%3>
+; CHECK: Cost of 0 for VF 2: vp<%4> = SCALAR-STEPS vp<%2>, ir<1>
+; CHECK: Cost of 0 for VF 2: EMIT vp<%next.gep>.1 = ptradd ir<%pDst>, vp<%4>
+; CHECK: Cost of 0 for VF 2: vp<%5> = SCALAR-STEPS vp<%2>, ir<1>
+; CHECK: Cost of 0 for VF 2: EMIT vp<%next.gep>.2 = ptradd ir<%pSrcB>, vp<%5>
+; CHECK: Cost of 0 for VF 2: vp<%6> = vector-pointer vp<%next.gep>
+; CHECK: Cost of 18 for VF 2: WIDEN ir<%0> = load vp<%6>
+; CHECK: Cost of 4 for VF 2: WIDEN-CAST ir<%conv1> = sext  ir<%0> to i32
+; CHECK: Cost of 0 for VF 2: vp<%7> = vector-pointer vp<%next.gep>.2
+; CHECK: Cost of 18 for VF 2: WIDEN ir<%1> = load vp<%7>
+; CHECK: Cost of 4 for VF 2: WIDEN-CAST ir<%conv3> = sext  ir<%1> to i32
+; CHECK: Cost of 26 for VF 2: WIDEN ir<%mul> = mul nsw ir<%conv3>, ir<%conv1>
+; CHECK: Cost of 18 for VF 2: WIDEN ir<%shr> = ashr ir<%mul>, ir<7>
+; CHECK: Cost of 0 for VF 2: WIDEN ir<%2> = icmp slt ir<%shr>, ir<127>
+; CHECK: Cost of 22 for VF 2: WIDEN-SELECT ir<%spec.select.i> = select ir<%2>, ir<%shr>, ir<127>
+; CHECK: Cost of 0 for VF 2: WIDEN-CAST ir<%conv4> = trunc  ir<%spec.select.i> to i8
+; CHECK: Cost of 0 for VF 2: vp<%8> = vector-pointer vp<%next.gep>.1
+; CHECK: Cost of 18 for VF 2: WIDEN store vp<%8>, ir<%conv4>
+; CHECK: Cost of 0 for VF 2: EMIT vp<%index.next> = add nuw vp<%2>, vp<%0>
+; CHECK: Cost of 0 for VF 2: EMIT branch-on-count vp<%index.next>, vp<%1>
+; CHECK: Cost for VF 2: 130 (Estimated cost per lane: 65.0)
+; CHECK: Cost of 1 for VF 4: induction instruction   %dec = add i32 %blkCnt.012, -1
+; CHECK: Cost of 0 for VF 4: induction instruction   %blkCnt.012 = phi i32 [ %dec, %while.body ], [ %blockSize, %while.body.preheader ]
+; CHECK: Cost of 0 for VF 4: induction instruction   %incdec.ptr = getelementptr inbounds i8, ptr %pSrcA.addr.011, i32 1
+; CHECK: Cost of 0 for VF 4: induction instruction   %pSrcA.addr.011 = phi ptr [ %incdec.ptr, %while.body ], [ %pSrcA, %while.body.preheader ]
+; CHECK: Cost of 0 for VF 4: induction instruction   %incdec.ptr5 = getelementptr inbounds i8, ptr %pDst.addr.010, i32 1
+; CHECK: Cost of 0 for VF 4: induction instruction   %pDst.addr.010 = phi ptr [ %incdec.ptr5, %while.body ], [ %pDst, %while.body.preheader ]
+; CHECK: Cost of 0 for VF 4: induction instruction   %incdec.ptr2 = getelementptr inbounds i8, ptr %pSrcB.addr.09, i32 1
+; CHECK: Cost of 0 for VF 4: induction instruction   %pSrcB.addr.09 = phi ptr [ %incdec.ptr2, %while.body ], [ %pSrcB, %while.body.preheader ]
+; CHECK: Cost of 1 for VF 4: exit condition instruction   %cmp.not = icmp eq i32 %dec, 0
+; CHECK: Cost of 0 for VF 4: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK: Cost of 0 for VF 4: vp<%3> = SCALAR-STEPS vp<%2>, ir<1>
+; CHECK: Cost of 0 for VF 4: EMIT vp<%next.gep> = ptradd ir<%pSrcA>, vp<%3>
+; CHECK: Cost of 0 for VF 4: vp<%4> = SCALAR-STEPS vp<%2>, ir<1>
+; CHECK: Cost of 0 for VF 4: EMIT vp<%next.gep>.1 = ptradd ir<%pDst>, vp<%4>
+; CHECK: Cost of 0 for VF 4: vp<%5> = SCALAR-STEPS vp<%2>, ir<1>
+; CHECK: Cost of 0 for VF 4: EMIT vp<%next.gep>.2 = ptradd ir<%pSrcB>, vp<%5>
+; CHECK: Cost of 0 for VF 4: vp<%6> = vector-pointer vp<%next.gep>
+; CHECK: Cost of 2 for VF 4: WIDEN ir<%0> = load vp<%6>
+; CHECK: Cost of 0 for VF 4: WIDEN-CAST ir<%conv1> = sext  ir<%0> to i32
+; CHECK: Cost of 0 for VF 4: vp<%7> = vector-pointer vp<%next.gep>.2
+; CHECK: Cost of 2 for VF 4: WIDEN ir<%1> = load vp<%7>
+; CHECK: Cost of 0 for VF 4: WIDEN-CAST ir<%conv3> = sext  ir<%1> to i32
+; CHECK: Cost of 2 for VF 4: WIDEN ir<%mul> = mul nsw ir<%conv3>, ir<%conv1>
+; CHECK: Cost of 2 for VF 4: WIDEN ir<%shr> = ashr ir<%mul>, ir<7>
+; CHECK: Cost of 0 for VF 4: WIDEN ir<%2> = icmp slt ir<%shr>, ir<127>
+; CHECK: Cost of 2 for VF 4: WIDEN-SELECT ir<%spec.select.i> = select ir<%2>, ir<%shr>, ir<127>
+; CHECK: Cost of 0 for VF 4: WIDEN-CAST ir<%conv4> = trunc  ir<%spec.select.i> to i8
+; CHECK: Cost of 0 for VF 4: vp<%8> = vector-pointer vp<%next.gep>.1
+; CHECK: Cost of 2 for VF 4: WIDEN store vp<%8>, ir<%conv4>
+; CHECK: Cost of 0 for VF 4: EMIT vp<%index.next> = add nuw vp<%2>, vp<%0>
+; CHECK: Cost of 0 for VF 4: EMIT branch-on-count vp<%index.next>, vp<%1>
+; CHECK: Cost for VF 4: 14 (Estimated cost per lane: 3.5)
+; CHECK: Cost of 1 for VF 8: induction instruction   %dec = add i32 %blkCnt.012, -1
+; CHECK: Cost of 0 for VF 8: induction instruction   %blkCnt.012 = phi i32 [ %dec, %while.body ], [ %blockSize, %while.body.preheader ]
+; CHECK: Cost of 0 for VF 8: induction instruction   %incdec.ptr = getelementptr inbounds i8, ptr %pSrcA.addr.011, i32 1
+; CHECK: Cost of 0 for VF 8: induction instruction   %pSrcA.addr.011 = phi ptr [ %incdec.ptr, %while.body ], [ %pSrcA, %while.body.preheader ]
+; CHECK: Cost of 0 for VF 8: induction instruction   %incdec.ptr5 = getelementptr inbounds i8, ptr %pDst.addr.010, i32 1
+; CHECK: Cost of 0 for VF 8: induction instruction   %pDst.addr.010 = phi ptr [ %incdec.ptr5, %while.body ], [ %pDst, %while.body.preheader ]
+; CHECK: Cost of 0 for VF 8: induction instruction   %incdec.ptr2 = getelementptr inbounds i8, ptr %pSrcB.addr.09, i32 1
+; CHECK: Cost of 0 for VF 8: induction instruction   %pSrcB.addr.09 = phi ptr [ %incdec.ptr2, %while.body ], [ %pSrcB, %while.body.preheader ]
+; CHECK: Cost of 1 for VF 8: exit condition instruction   %cmp.not = icmp eq i32 %dec, 0
+; CHECK: Cost of 0 for VF 8: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK: Cost of 0 for VF 8: vp<%3> = SCALAR-STEPS vp<%2>, ir<1>
+; CHECK: Cost of 0 for VF 8: EMIT vp<%next.gep> = ptradd ir<%pSrcA>, vp<%3>
+; CHECK: Cost of 0 for VF 8: vp<%4> = SCALAR-STEPS vp<%2>, ir<1>
+; CHECK: Cost of 0 for VF 8: EMIT vp<%next.gep>.1 = ptradd ir<%pDst>, vp<%4>
+; CHECK: Cost of 0 for VF 8: vp<%5> = SCALAR-STEPS vp<%2>, ir<1>
+; CHECK: Cost of 0 for VF 8: EMIT vp<%next.gep>.2 = ptradd ir<%pSrcB>, vp<%5>
+; CHECK: Cost of 0 for VF 8: vp<%6> = vector-pointer vp<%next.gep>
+; CHECK: Cost of 2 for VF 8: WIDEN ir<%0> = load vp<%6>
+; CHECK: Cost of 2 for VF 8: WIDEN-CAST ir<%conv1> = sext  ir<%0> to i32
+; CHECK: Cost of 0 for VF 8: vp<%7> = vector-pointer vp<%next.gep>.2
+; CHECK: Cost of 2 for VF 8: WIDEN ir<%1> = load vp<%7>
+; CHECK: Cost of 2 for VF 8: WIDEN-CAST ir<%conv3> = sext  ir<%1> to i32
+; CHECK: Cost of 4 for VF 8: WIDEN ir<%mul> = mul nsw ir<%conv3>, ir<%conv1>
+; CHECK: Cost of 4 for VF 8: WIDEN ir<%shr> = ashr ir<%mul>, ir<7>
+; CHECK: Cost of 0 for VF 8: WIDEN ir<%2> = icmp slt ir<%shr>, ir<127>
+; CHECK: Cost of 4 for VF 8: WIDEN-SELECT ir<%spec.select.i> = select ir<%2>, ir<%shr>, ir<127>
+; CHECK: Cost of 2 for VF 8: WIDEN-CAST ir<%conv4> = trunc  ir<%spec.select.i> to i8
+; CHECK: Cost of 0 for VF 8: vp<%8> = vector-pointer vp<%next.gep>.1
+; CHECK: Cost of 2 for VF 8: WIDEN store vp<%8>, ir<%conv4>
+; CHECK: Cost of 0 for VF 8: EMIT vp<%index.next> = add nuw vp<%2>, vp<%0>
+; CHECK: Cost of 0 for VF 8: EMIT branch-on-count vp<%index.next>, vp<%1>
+; CHECK: Cost for VF 8: 26 (Estimated cost per lane: 3.2)
+; CHECK: Cost of 1 for VF 16: induction instruction   %dec = add i32 %blkCnt.012, -1
+; CHECK: Cost of 0 for VF 16: induction instruction   %blkCnt.012 = phi i32 [ %dec, %while.body ], [ %blockSize, %while.body.preheader ]
+; CHECK: Cost of 0 for VF 16: induction instruction   %incdec.ptr = getelementptr inbounds i8, ptr %pSrcA.addr.011, i32 1
+; CHECK: Cost of 0 for VF 16: induction instruction   %pSrcA.addr.011 = phi ptr [ %incdec.ptr, %while.body ], [ %pSrcA, %while.body.preheader ]
+; CHECK: Cost of 0 for VF 16: induction instruction   %incdec.ptr5 = getelementptr inbounds i8, ptr %pDst.addr.010, i32 1
+; CHECK: Cost of 0 for VF 16: induction instruction   %pDst.addr.010 = phi ptr [ %incdec.ptr5, %while.body ], [ %pDst, %while.body.preheader ]
+; CHECK: Cost of 0 for VF 16: induction instruction   %incdec.ptr2 = getelementptr inbounds i8, ptr %pSrcB.addr.09, i32 1
+; CHECK: Cost of 0 for VF 16: induction instruction   %pSrcB.addr.09 = phi ptr [ %incdec.ptr2, %while.body ], [ %pSrcB, %while.body.preheader ]
+; CHECK: Cost of 1 for VF 16: exit condition instruction   %cmp.not = icmp eq i32 %dec, 0
+; CHECK: Cost of 0 for VF 16: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK: Cost of 0 for VF 16: vp<%3> = SCALAR-STEPS vp<%2>, ir<1>
+; CHECK: Cost of 0 for VF 16: EMIT vp<%next.gep> = ptradd ir<%pSrcA>, vp<%3>
+; CHECK: Cost of 0 for VF 16: vp<%4> = SCALAR-STEPS vp<%2>, ir<1>
+; CHECK: Cost of 0 for VF 16: EMIT vp<%next.gep>.1 = ptradd ir<%pDst>, vp<%4>
+; CHECK: Cost of 0 for VF 16: vp<%5> = SCALAR-STEPS vp<%2>, ir<1>
+; CHECK: Cost of 0 for VF 16: EMIT vp<%next.gep>.2 = ptradd ir<%pSrcB>, vp<%5>
+; CHECK: Cost of 0 for VF 16: vp<%6> = vector-pointer vp<%next.gep>
+; CHECK: Cost of 2 for VF 16: WIDEN ir<%0> = load vp<%6>
+; CHECK: Cost of 6 for VF 16: WIDEN-CAST ir<%conv1> = sext  ir<%0> to i32
+; CHECK: Cost of 0 for VF 16: vp<%7> = vector-pointer vp<%next.gep>.2
+; CHECK: Cost of 2 for VF 16: WIDEN ir<%1> = load vp<%7>
+; CHECK: Cost of 6 for VF 16: WIDEN-CAST ir<%conv3> = sext  ir<%1> to i32
+; CHECK: Cost of 8 for VF 16: WIDEN ir<%mul> = mul nsw ir<%conv3>, ir<%conv1>
+; CHECK: Cost of 8 for VF 16: WIDEN ir<%shr> = ashr ir<%mul>, ir<7>
+; CHECK: Cost of 0 for VF 16: WIDEN ir<%2> = icmp slt ir<%shr>, ir<127>
+; CHECK: Cost of 8 for VF 16: WIDEN-SELECT ir<%spec.select.i> = select ir<%2>, ir<%shr>, ir<127>
+; CHECK: Cost of 6 for VF 16: WIDEN-CAST ir<%conv4> = trunc  ir<%spec.select.i> to i8
+; CHECK: Cost of 0 for VF 16: vp<%8> = vector-pointer vp<%next.gep>.1
+; CHECK: Cost of 2 for VF 16: WIDEN store vp<%8>, ir<%conv4>
+; CHECK: Cost of 0 for VF 16: EMIT vp<%index.next> = add nuw vp<%2>, vp<%0>
+; CHECK: Cost of 0 for VF 16: EMIT branch-on-count vp<%index.next>, vp<%1>
+; CHECK: Cost for VF 16: 50
 ; CHECK: LV: Selecting VF: 16.
 define void @cheap_icmp(ptr nocapture readonly %pSrcA, ptr nocapture readonly %pSrcB, ptr nocapture %pDst, i32 %blockSize) #0 {
 entry:
@@ -238,8 +293,8 @@ while.end:                                        ; preds = %while.end.loopexit,
 }
 
 ; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction:   %cmp1 = fcmp
-; CHECK: LV: Found an estimated cost of 12 for VF 2 For instruction:   %cmp1 = fcmp
-; CHECK: LV: Found an estimated cost of 24 for VF 4 For instruction:   %cmp1 = fcmp
+; CHECK: Cost of 12 for VF 2: WIDEN ir<%cmp1> = fcmp olt ir<%0>, ir<0.000000e+00>
+; CHECK: Cost of 24 for VF 4: WIDEN ir<%cmp1> = fcmp olt ir<%0>, ir<0.000000e+00>
 define void @floatcmp(ptr nocapture readonly %pSrc, ptr nocapture %pDst, i32 %blockSize) #0 {
 entry:
   %cmp.not7 = icmp eq i32 %blockSize, 0
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-saddsatcost.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-saddsatcost.ll
index bd7bafdb6dc8dc..04a97f451770ad 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/mve-saddsatcost.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-saddsatcost.ll
@@ -7,10 +7,10 @@ target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
 target triple = "thumbv8.1m.main-arm-none-eabi"
 
 ; CHECK-COST-LABEL: arm_offset_q15
-; CHECK-COST: Found an estimated cost of 2 for VF 1 For instruction:   %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset)
-; CHECK-COST: Found an estimated cost of 36 for VF 2 For instruction:   %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset)
-; CHECK-COST: Found an estimated cost of 8 for VF 4 For instruction:   %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset)
-; CHECK-COST: Found an estimated cost of 2 for VF 8 For instruction:   %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset)
+; CHECK-COST: LV: Found an estimated cost of 2 for VF 1 For instruction:   %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset)
+; CHECK-COST: Cost of 36 for VF 2: REPLICATE ir<%1> = call @llvm.sadd.sat.i16(ir<%0>, ir<%offset>)
+; CHECK-COST: Cost of 8 for VF 4: WIDEN-INTRINSIC ir<%1> = call llvm.sadd.sat(ir<%0>, ir<%offset>)
+; CHECK-COST: Cost of 2 for VF 8: WIDEN-INTRINSIC ir<%1> = call llvm.sadd.sat(ir<%0>, ir<%offset>)
 
 define void @arm_offset_q15(ptr nocapture readonly %pSrc, i16 signext %offset, ptr nocapture noalias %pDst, i32 %blockSize) #0 {
 ; CHECK-LABEL: @arm_offset_q15(
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-selectandorcost.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-selectandorcost.ll
index 60c3b52f2003a6..025b8738f78290 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/mve-selectandorcost.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-selectandorcost.ll
@@ -8,8 +8,8 @@ target triple = "thumbv8.1m.main-arm-none-eabi"
 
 ; CHECK-COST-LABEL: test
 ; CHECK-COST: LV: Found an estimated cost of 1 for VF 1 For instruction:   %or.cond = select i1 %cmp2, i1 true, i1 %cmp3
-; CHECK-COST: LV: Found an estimated cost of 26 for VF 2 For instruction:   %or.cond = select i1 %cmp2, i1 true, i1 %cmp3
-; CHECK-COST: LV: Found an estimated cost of 2 for VF 4 For instruction:   %or.cond = select i1 %cmp2, i1 true, i1 %cmp3
+; CHECK-COST: Cost of 26 for VF 2: WIDEN-SELECT ir<%or.cond> = select ir<%cmp2>, ir<true>, ir<%cmp3>
+; CHECK-COST: Cost of 2 for VF 4: WIDEN-SELECT ir<%or.cond> = select ir<%cmp2>, ir<true>, ir<%cmp3>
 
 define float @test(ptr nocapture readonly %pA, ptr nocapture readonly %pB, i32 %blockSize) #0 {
 ; CHECK-LABEL: @test(
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-shiftcost.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-shiftcost.ll
index a0ff7629b42c3a..e1b7b935a47f61 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/mve-shiftcost.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-shiftcost.ll
@@ -8,8 +8,8 @@ target triple = "thumbv8.1m.main-none-none-eabi"
 ; CHECK-LABEL: test
 ; CHECK-COST: LV: Found an estimated cost of 0 for VF 1 For instruction:   %and515 = shl i32 %l41, 3
 ; CHECK-COST: LV: Found an estimated cost of 1 for VF 1 For instruction:   %l45 = and i32 %and515, 131072
-; CHECK-COST: LV: Found an estimated cost of 2 for VF 4 For instruction:   %and515 = shl i32 %l41, 3
-; CHECK-COST: LV: Found an estimated cost of 2 for VF 4 For instruction:   %l45 = and i32 %and515, 131072
+; CHECK-COST: Cost of 2 for VF 4: WIDEN ir<%and515> = shl ir<%l41>, ir<3>
+; CHECK-COST: Cost of 2 for VF 4: WIDEN ir<%l45> = and ir<%and515>, ir<131072>
 ; CHECK-NOT: vector.body
 
 define void @test(ptr %src, i32 %N) #0 {
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/force-vect-msg.ll b/llvm/test/Transforms/LoopVectorize/RISCV/force-vect-msg.ll
index f33ef9199ed1a9..09a1aaab6cc2da 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/force-vect-msg.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/force-vect-msg.ll
@@ -4,7 +4,7 @@
 ; CHECK: LV: Loop hints: force=enabled
 ; CHECK: LV: Scalar loop costs: 4.
 ; ChosenFactor.Cost is 4, but the real cost will be divided by the width, which is 2.
-; CHECK: LV: Vector loop of width 2 costs: 2.
+; CHECK: Cost for VF 2: 4 (Estimated cost per lane: 2.0)
 ; Regardless of force vectorization or not, this loop will eventually be vectorized because of the cost model.
 ; Therefore, the following message does not need to be printed even if vectorization is explicitly forced in the metadata.
 ; CHECK-NOT: LV: Vectorization seems to be not beneficial, but was forced by a user.
diff --git a/llvm/test/Transforms/LoopVectorize/SystemZ/mem-interleaving-costs-02.ll b/llvm/test/Transforms/LoopVectorize/SystemZ/mem-interleaving-costs-02.ll
index c75899e2224f19..a0c45962021c8b 100644
--- a/llvm/test/Transforms/LoopVectorize/SystemZ/mem-interleaving-costs-02.ll
+++ b/llvm/test/Transforms/LoopVectorize/SystemZ/mem-interleaving-costs-02.ll
@@ -11,10 +11,12 @@
 ; two vector registers using one vperm each, which gives a cost of 2 + 4 = 6.
 ;
 ; CHECK: LV: Checking a loop in 'fun0'
-; CHECK: LV: Found an estimated cost of 6 for VF 4 For instruction:   %ld0 = load i16
-; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   %ld1 = load i16
-; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   %ld2 = load i16
-; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   %ld3 = load i16
+; CHECK: Cost of 6 for VF 4: INTERLEAVE-GROUP with factor 4 at %ld0, vp<%next.gep>
+; CHECK:   ir<%ld0> = load from index 0
+; CHECK:   ir<%ld1> = load from index 1
+; CHECK:   ir<%ld2> = load from index 2
+; CHECK:   ir<%ld3> = load from index 3
+
 define void @fun0(ptr %ptr, ptr %dst) {
 entry:
   br label %for.body
@@ -48,7 +50,8 @@ for.end:
 ; which gives a cost of 5.
 ;
 ; CHECK: LV: Checking a loop in 'fun1'
-; CHECK: LV: Found an estimated cost of 5 for VF 16 For instruction:   %ld0 = load i8
+; CHECK: Cost of 5 for VF 16: INTERLEAVE-GROUP with factor 3 at %ld0, vp<%next.gep>
+; CHECK:   ir<%ld0> = load from index 0
 define void @fun1(ptr %ptr, ptr %dst) {
 entry:
   br label %for.body
@@ -73,10 +76,11 @@ for.end:
 ; produce the vector values, which gives a cost of 6.
 ;
 ; CHECK: LV: Checking a loop in 'fun2'
-; CHECK: LV: Found an estimated cost of 6 for VF 2 For instruction:   %ld0 = load i8
-; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %ld1 = load i8
-; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %ld2 = load i8
-; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %ld3 = load i8
+; CHECK: Cost of 6 for VF 2: INTERLEAVE-GROUP with factor 32 at %ld0, vp<%next.gep>
+; CHECK:   ir<%ld0> = load from index 0
+; CHECK:   ir<%ld1> = load from index 1
+; CHECK:   ir<%ld2> = load from index 2
+; CHECK:   ir<%ld3> = load from index 3
 define void @fun2(ptr %ptr, ptr %dst) {
 entry:
   br label %for.body
@@ -112,10 +116,11 @@ for.end:
 ; vector register boundary.
 ;
 ; CHECK: LV: Checking a loop in 'fun3'
-; CHECK: LV: Found an estimated cost of 7 for VF 2 For instruction:   %ld0 = load i8
-; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %ld1 = load i8
-; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %ld2 = load i8
-; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %ld3 = load i8
+; CHECK: Cost of 7 for VF 2: INTERLEAVE-GROUP with factor 30 at %ld0, vp<%next.gep>
+; CHECK:   ir<%ld0> = load from index 0
+; CHECK:   ir<%ld1> = load from index 1
+; CHECK:   ir<%ld2> = load from index 2
+; CHECK:   ir<%ld3> = load from index 3
 define void @fun3(ptr %ptr, ptr %dst) {
 entry:
   br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/X86/fneg-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/fneg-cost.ll
index dae341bcf53800..f7d77bafc2a115 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/fneg-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/fneg-cost.ll
@@ -6,8 +6,8 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 target triple = "x86_64-apple-macosx10.8.0"
 
 ; CHECK: Found an estimated cost of 1 for VF 1 For instruction:   %neg = fneg float %{{.*}}
-; CHECK: Found an estimated cost of 1 for VF 2 For instruction:   %neg = fneg float %{{.*}}
-; CHECK: Found an estimated cost of 1 for VF 4 For instruction:   %neg = fneg float %{{.*}}
+; CHECK: Cost of 1 for VF 2: WIDEN ir<%neg> = fneg ir<%0>
+; CHECK: Cost of 1 for VF 4: WIDEN ir<%neg> = fneg ir<%0>
 define void @fneg_cost(ptr %a, i64 %n) {
 entry:
   br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/X86/mul_slm_16bit.ll b/llvm/test/Transforms/LoopVectorize/X86/mul_slm_16bit.ll
index 654376cc54f4a0..a6d38fb794d444 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/mul_slm_16bit.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/mul_slm_16bit.ll
@@ -32,38 +32,38 @@ for.body:                                         ; preds = %for.body.preheader,
   %conv3 = sext i8 %1 to i32
 ; sources of the mul is sext\sext from i8
 ; use pmullw\sext seq.
-; SLM:  cost of 3 for VF 2 {{.*}} mul nsw i32 %conv3, %conv
+; SLM: Cost of 3 for VF 2: WIDEN ir<%mul> = mul nsw ir<%conv3>, ir<%conv>
   %mul = mul nsw i32 %conv3, %conv
 ; sources of the mul is zext\sext from i8
 ; use pmulhw\pmullw\pshuf
-; SLM:  cost of 2 for VF 2 {{.*}} mul nsw i32 %conv4, %conv
+; SLM: Cost of 2 for VF 2: WIDEN ir<%mul2> = mul nsw ir<%conv4>, ir<%conv>
   %conv4 = zext i8 %1 to i32
   %mul2 = mul nsw i32 %conv4, %conv
   %sum0 = add i32 %mul, %mul2
 ; sources of the mul is zext\zext from i8
 ; use pmullw\zext
-; SLM:  cost of 2 for VF 2 {{.*}} mul nsw i32 %conv5, %conv4
+; SLM: Cost of 2 for VF 2: WIDEN ir<%mul3> = mul nsw ir<%conv5>, ir<%conv4>
   %conv5 = zext i8 %0 to i32
   %mul3 = mul nsw i32 %conv5, %conv4
   %sum1 = add i32 %sum0, %mul3
 ; sources of the mul is sext\-120
 ; use pmullw\sext
-; SLM:  cost of 3 for VF 2 {{.*}} mul nsw i32 -120, %conv3
+; SLM: Cost of 3 for VF 2: WIDEN ir<%mul4> = mul nsw ir<-120>, ir<%conv3>
   %mul4 = mul nsw i32 -120, %conv3
   %sum2 = add i32 %sum1, %mul4
 ; sources of the mul is sext\250
 ; use pmulhw\pmullw\pshuf
-; SLM:  cost of 2 for VF 2 {{.*}} mul nsw i32 250, %conv3
+; SLM: Cost of 2 for VF 2: WIDEN ir<%mul5> = mul nsw ir<250>, ir<%conv3>
   %mul5 = mul nsw i32 250, %conv3
   %sum3 = add i32 %sum2, %mul5
 ; sources of the mul is zext\-120
 ; use pmulhw\pmullw\pshuf
-; SLM:  cost of 2 for VF 2 {{.*}} mul nsw i32 -120, %conv4
+; SLM: Cost of 2 for VF 2: WIDEN ir<%mul6> = mul nsw ir<-120>, ir<%conv4>
   %mul6 = mul nsw i32 -120, %conv4
   %sum4 = add i32 %sum3, %mul6
 ; sources of the mul is zext\250
 ; use pmullw\zext
-; SLM:  cost of 2 for VF 2 {{.*}} mul nsw i32 250, %conv4
+; SLM: Cost of 2 for VF 2: WIDEN ir<%mul7> = mul nsw ir<250>, ir<%conv4>
   %mul7 = mul nsw i32 250, %conv4
   %sum5 = add i32 %sum4, %mul7
   %add = add i32 %acc.013, 5
@@ -101,38 +101,38 @@ for.body:                                         ; preds = %for.body.preheader,
   %conv3 = sext i16 %1 to i32
 ; sources of the mul is sext\sext from i16
 ; use pmulhw\pmullw\pshuf seq.
-; SLM:  cost of 3 for VF 4 {{.*}} mul nsw i32 %conv3, %conv
+; SLM: Cost of 2 for VF 4: WIDEN ir<%mul> = mul nsw ir<%conv3>, ir<%conv>
   %mul = mul nsw i32 %conv3, %conv
 ; sources of the mul is zext\sext from i16
 ; use pmulld
-; SLM:  cost of 2 for VF 4 {{.*}} mul nsw i32 %conv4, %conv
+; SLM: Cost of 11 for VF 4: WIDEN ir<%mul2> = mul nsw ir<%conv4>, ir<%conv>
   %conv4 = zext i16 %1 to i32
   %mul2 = mul nsw i32 %conv4, %conv
   %sum0 = add i32 %mul, %mul2
 ; sources of the mul is zext\zext from i16
 ; use pmulhw\pmullw\zext
-; SLM:  cost of 2 for VF 4 {{.*}} mul nsw i32 %conv5, %conv4
+; SLM: Cost of 5 for VF 4: WIDEN ir<%mul3> = mul nsw ir<%conv5>, ir<%conv4>
   %conv5 = zext i16 %0 to i32
   %mul3 = mul nsw i32 %conv5, %conv4
   %sum1 = add i32 %sum0, %mul3
 ; sources of the mul is sext\-32000
 ; use pmulhw\pmullw\sext
-; SLM:  cost of 2 for VF 4 {{.*}} mul nsw i32 -32000, %conv3
+; SLM: Cost of 2 for VF 4: WIDEN ir<%mul4> = mul nsw ir<-32000>, ir<%conv3>
   %mul4 = mul nsw i32 -32000, %conv3
   %sum2 = add i32 %sum1, %mul4
 ; sources of the mul is sext\64000
 ; use pmulld
-; SLM:  cost of 11 for VF 4 {{.*}} mul nsw i32 64000, %conv3
+; SLM: Cost of 11 for VF 4: WIDEN ir<%mul5> = mul nsw ir<64000>, ir<%conv3>
   %mul5 = mul nsw i32 64000, %conv3
   %sum3 = add i32 %sum2, %mul5
 ; sources of the mul is zext\-32000
 ; use pmulld
-; SLM:  cost of 11 for VF 4 {{.*}} mul nsw i32 -32000, %conv4
+; SLM: Cost of 11 for VF 4: WIDEN ir<%mul6> = mul nsw ir<-32000>, ir<%conv4>
   %mul6 = mul nsw i32 -32000, %conv4
   %sum4 = add i32 %sum3, %mul6
 ; sources of the mul is zext\64000
 ; use pmulhw\pmullw\zext
-; SLM:  cost of 5 for VF 4 {{.*}} mul nsw i32 250, %conv4
+; SLM: Cost of 5 for VF 4: WIDEN ir<%mul7> = mul nsw ir<250>, ir<%conv4>
   %mul7 = mul nsw i32 250, %conv4
   %sum5 = add i32 %sum4, %mul7
   %add = add i32 %acc.013, 5
diff --git a/llvm/test/Transforms/LoopVectorize/X86/reduction-small-size.ll b/llvm/test/Transforms/LoopVectorize/X86/reduction-small-size.ll
index 7d435cc85c9dfb..052a963f5458bd 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/reduction-small-size.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/reduction-small-size.ll
@@ -28,21 +28,28 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 ; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction:   %{{.*}} = trunc
 ; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction:   %{{.*}} = icmp
 ; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction:   br
-; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   %{{.*}} = phi
-; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   %{{.*}} = phi
-; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   %{{.*}} = getelementptr
-; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   %{{.*}} = load
-; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   %{{.*}} = zext i8 %{{.*}} to i32
-; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   %{{.*}} = getelementptr
-; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   %{{.*}} = load
-; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   %{{.*}} = zext i8 %{{.*}} to i32
-; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   %{{.*}} = and i32 %{{.*}}, 255
-; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   %{{.*}} = add
-; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   %{{.*}} = add
-; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   %{{.*}} = add
-; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   %{{.*}} = trunc
-; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   %{{.*}} = icmp
-; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   br
+; CHECK: Cost of 1 for VF 2: induction instruction   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+; CHECK: Cost of 1 for VF 2: induction instruction   %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+; CHECK: Cost of 1 for VF 2: exit condition instruction   %exitcond = icmp eq i32 %lftr.wideiv, %n
+; CHECK: Cost of 0 for VF 2: exit condition instruction   %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+; CHECK: Cost of 0 for VF 2: EMIT vp<%3> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK: Cost of 1 for VF 2: WIDEN-REDUCTION-PHI ir<%sum.013> = phi ir<0>, vp<%8>
+; CHECK: Cost of 0 for VF 2: vp<%4> = SCALAR-STEPS vp<%3>, ir<1>
+; CHECK: Cost of 0 for VF 2: CLONE ir<%arrayidx> = getelementptr inbounds ir<%a>, vp<%4>
+; CHECK: Cost of 0 for VF 2: vp<%5> = vector-pointer ir<%arrayidx>
+; CHECK: Cost of 1 for VF 2: WIDEN ir<%0> = load vp<%5>
+; CHECK: Cost of 0 for VF 2: WIDEN-CAST ir<%conv> = zext  ir<%0> to i32
+; CHECK: Cost of 0 for VF 2: CLONE ir<%arrayidx2> = getelementptr inbounds ir<%b>, vp<%4>
+; CHECK: Cost of 0 for VF 2: vp<%6> = vector-pointer ir<%arrayidx2>
+; CHECK: Cost of 1 for VF 2: WIDEN ir<%1> = load vp<%6>
+; CHECK: Cost of 0 for VF 2: WIDEN-CAST ir<%conv3> = zext  ir<%1> to i32
+; CHECK: Cost of 0 for VF 2: WIDEN ir<%conv4> = and ir<%sum.013>, ir<255>
+; CHECK: Cost of 1 for VF 2: WIDEN ir<%add> = add ir<%conv>, ir<%conv4>
+; CHECK: Cost of 1 for VF 2: WIDEN ir<%add5> = add ir<%add>, ir<%conv3>
+; CHECK: Cost of 0 for VF 2: WIDEN-CAST vp<%7> = trunc  ir<%add5> to i8
+; CHECK: Cost of 0 for VF 2: WIDEN-CAST vp<%8> = zext  vp<%7> to i32
+; CHECK: Cost of 0 for VF 2: EMIT vp<%index.next> = add nuw vp<%3>, vp<%0>
+; CHECK: Cost of 0 for VF 2: EMIT branch-on-count vp<%index.next>, vp<%1>
 ;
 define i8 @reduction_i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %n) {
 entry:
diff --git a/llvm/test/Transforms/LoopVectorize/X86/redundant-vf2-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/redundant-vf2-cost.ll
index 1660f1af06c6cd..c8f85ba383aebe 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/redundant-vf2-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/redundant-vf2-cost.ll
@@ -4,11 +4,11 @@
 ; Check that cost model is not executed twice for VF=2 when vectorization is
 ; forced for a particular loop.
 
-; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   %{{[0-9]+}} = load i32
-; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   store i32
-; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   %{{[0-9]+}} = load i32
-; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   store i32
-; CHECK: LV: Vector loop of width 2 costs: {{[0-9]+}}.
+; CHECK: Cost of {{.*}} for VF 2: WIDEN {{.*}} = load
+; CHECK: Cost of {{.*}} for VF 2: WIDEN store
+; CHECK-NOT: Cost of {{.*}} for VF 2: WIDEN {{.*}} = load
+; CHECK-NOT: Cost of {{.*}} for VF 2: WIDEN store
+; CHECK: Cost for VF 2: 5 (Estimated cost per lane: 2.5)
 
 define i32 @foo(ptr %A, i32 %n) {
 entry:
diff --git a/llvm/test/Transforms/LoopVectorize/X86/uint64_to_fp64-cost-model.ll b/llvm/test/Transforms/LoopVectorize/X86/uint64_to_fp64-cost-model.ll
index ef0402aaae802f..b8dcfd31bbc4c0 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/uint64_to_fp64-cost-model.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/uint64_to_fp64-cost-model.ll
@@ -6,8 +6,8 @@ target triple = "x86_64-apple-macosx10.8.0"
 
 
 ; CHECK: cost of 4 for VF 1 For instruction:   %conv = uitofp i64 %tmp to double
-; CHECK: cost of 5 for VF 2 For instruction:   %conv = uitofp i64 %tmp to double
-; CHECK: cost of 10 for VF 4 For instruction:   %conv = uitofp i64 %tmp to double
+; CHECK: Cost of 5 for VF 2: WIDEN-CAST ir<%conv> = uitofp  ir<%tmp> to double
+; CHECK: Cost of 10 for VF 4: WIDEN-CAST ir<%conv> = uitofp  ir<%tmp> to double
 define void @uint64_to_double_cost(ptr noalias nocapture %a, ptr noalias nocapture readonly %b) nounwind {
 entry:
   br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/X86/uniformshift.ll b/llvm/test/Transforms/LoopVectorize/X86/uniformshift.ll
index d56740c8293c09..166875dd55aaef 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/uniformshift.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/uniformshift.ll
@@ -3,8 +3,8 @@
 
 ; CHECK: 'foo'
 ; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction:   %shift = ashr i32 %val, %k
-; CHECK: LV: Found an estimated cost of 2 for VF 2 For instruction:   %shift = ashr i32 %val, %k
-; CHECK: LV: Found an estimated cost of 2 for VF 4 For instruction:   %shift = ashr i32 %val, %k
+; CHECK: Cost of 2 for VF 2: WIDEN ir<%shift> = ashr ir<%val>, ir<%k>
+; CHECK: Cost of 2 for VF 4: WIDEN ir<%shift> = ashr ir<%val>, ir<%k>
 define void @foo(ptr nocapture %p, i32 %k) local_unnamed_addr #0 {
 entry:
   br label %body
diff --git a/llvm/test/Transforms/LoopVectorize/X86/vector-scalar-select-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/vector-scalar-select-cost.ll
index 234587aae1283c..22eb0ca3800333 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/vector-scalar-select-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/vector-scalar-select-cost.ll
@@ -23,8 +23,8 @@ define void @scalarselect(i1 %cond) {
   %7 = getelementptr inbounds [2048 x i32], ptr @a, i64 0, i64 %indvars.iv
 
 ; CHECK: cost of 1 for VF 1 {{.*}}  select i1 %cond, i32 %6, i32 0
-; CHECK: cost of 2 for VF 2 {{.*}}  select i1 %cond, i32 %6, i32 0
-; CHECK: cost of 2 for VF 4 {{.*}}  select i1 %cond, i32 %6, i32 0
+; CHECK: Cost of 2 for VF 2: WIDEN-SELECT ir<%sel> = select ir<%cond>, ir<%6>, ir<0> (condition is loop invariant)
+; CHECK: Cost of 2 for VF 4: WIDEN-SELECT ir<%sel> = select ir<%cond>, ir<%6>, ir<0> (condition is loop invariant)
 
   %sel = select i1 %cond, i32 %6, i32 zeroinitializer
   store i32 %sel, ptr %7, align 4
@@ -52,8 +52,8 @@ define void @vectorselect(i1 %cond) {
   %8 = icmp ult i64 %indvars.iv, 8
 
 ; CHECK: cost of 1 for VF 1 {{.*}}  select i1 %8, i32 %6, i32 0
-; CHECK: cost of 2 for VF 2 {{.*}}  select i1 %8, i32 %6, i32 0
-; CHECK: cost of 2 for VF 4 {{.*}}  select i1 %8, i32 %6, i32 0
+; CHECK: Cost of 2 for VF 2: WIDEN-SELECT ir<%sel> = select ir<%8>, ir<%6>, ir<0>
+; CHECK: Cost of 2 for VF 4: WIDEN-SELECT ir<%sel> = select ir<%8>, ir<%6>, ir<0>
 
   %sel = select i1 %8, i32 %6, i32 zeroinitializer
   store i32 %sel, ptr %7, align 4