[llvm] [Support] Always scale InstructionCost::Value (PR #178962)

Ryan Buchner via llvm-commits llvm-commits at lists.llvm.org
Tue Mar 10 01:40:27 PDT 2026


https://github.com/bababuck updated https://github.com/llvm/llvm-project/pull/178962

>From 8768038e113ccfd79ad9621b4001b7d300ec2f2b Mon Sep 17 00:00:00 2001
From: bababuck <buchner.ryan at gmail.com>
Date: Fri, 30 Jan 2026 12:26:39 -0800
Subject: [PATCH 01/15] [Support] Always scale InstructionCost::Value

Allows for fractional instruction costs, down to a granularity of
1/ScalingFactor, with little overhead.

It is likely that I still need to update some tests for other backends.

The only functional change because of this commit is that finer granularity is
now supported. In LoopVectorizer, there are some calculations that divide a cost
by some value. Before, a rounded answer was produced, but now the result is more
accurate since we can represent a fractional cost. For instance:

Before:
InstructionCost(3) / 6 = 0

After (with ScalingFactor 120):
InstructionCost(3) / 6 = 60 / 120

Also, there is a decrease in the maximum value of InstructionCost, as the largest
value is now `std::numeric_limits<CostType>::max() / ScalingFactor`.
---
 llvm/include/llvm/Support/InstructionCost.h   | 30 ++++++-
 llvm/lib/Support/InstructionCost.cpp          |  2 +-
 .../interchange-refcost-overflow.ll           |  6 +-
 .../LoopVectorize/AArch64/call-costs.ll       | 84 +++++++++++++++----
 .../LoopVectorize/RISCV/predicated-costs.ll   | 32 ++++---
 5 files changed, 120 insertions(+), 34 deletions(-)

diff --git a/llvm/include/llvm/Support/InstructionCost.h b/llvm/include/llvm/Support/InstructionCost.h
index 507c16666b958..a8237694050f9 100644
--- a/llvm/include/llvm/Support/InstructionCost.h
+++ b/llvm/include/llvm/Support/InstructionCost.h
@@ -59,6 +59,10 @@ class InstructionCost {
       State = Invalid;
   }
 
+  // 120 chosen since it is the least common multiple of 2, 3, 4, 5, 6, 8,
+  // which are realistic issue widths.
+  static constexpr CostType ScalingFactor = 120;
+
   static constexpr CostType MaxValue = std::numeric_limits<CostType>::max();
   static constexpr CostType MinValue = std::numeric_limits<CostType>::min();
 
@@ -67,7 +71,16 @@ class InstructionCost {
   InstructionCost() = default;
 
   InstructionCost(CostState) = delete;
-  InstructionCost(CostType Val) : Value(Val), State(Valid) {}
+  InstructionCost(CostType Val) : Value(), State(Valid) {
+    InstructionCost::CostType Result;
+    if (MulOverflow(Val, ScalingFactor, Result)) {
+      if (Val > 0)
+        Result = MaxValue;
+      else
+        Result = MinValue;
+    }
+    Value = Result;
+  }
 
   static InstructionCost getMax() { return MaxValue; }
   static InstructionCost getMin() { return MinValue; }
@@ -87,7 +100,7 @@ class InstructionCost {
   /// and comparisons.
   CostType getValue() const {
     assert(isValid());
-    return Value;
+    return Value / ScalingFactor;
   }
 
   /// For all of the arithmetic operators provided here any invalid state is
@@ -141,6 +154,8 @@ class InstructionCost {
         Result = MaxValue;
       else
         Result = MinValue;
+    } else {
+      Result /= ScalingFactor;
     }
 
     Value = Result;
@@ -155,7 +170,16 @@ class InstructionCost {
 
   InstructionCost &operator/=(const InstructionCost &RHS) {
     propagateState(RHS);
-    Value /= RHS.Value;
+    // Saturating multiply.
+    InstructionCost::CostType Result;
+    if (MulOverflow(Value, ScalingFactor, Result)) {
+      if (Value > 0)
+        Result = MaxValue;
+      else
+        Result = MinValue;
+    }
+    Result /= RHS.Value;
+    Value = Result;
     return *this;
   }
 
diff --git a/llvm/lib/Support/InstructionCost.cpp b/llvm/lib/Support/InstructionCost.cpp
index c485ce9107af9..6b4eb6d2f1ed6 100644
--- a/llvm/lib/Support/InstructionCost.cpp
+++ b/llvm/lib/Support/InstructionCost.cpp
@@ -18,7 +18,7 @@ using namespace llvm;
 
 void InstructionCost::print(raw_ostream &OS) const {
   if (isValid())
-    OS << Value;
+    OS << (Value / ScalingFactor);
   else
     OS << "Invalid";
 }
diff --git a/llvm/test/Analysis/LoopCacheAnalysis/interchange-refcost-overflow.ll b/llvm/test/Analysis/LoopCacheAnalysis/interchange-refcost-overflow.ll
index 52a530b2feebb..88f265872902e 100644
--- a/llvm/test/Analysis/LoopCacheAnalysis/interchange-refcost-overflow.ll
+++ b/llvm/test/Analysis/LoopCacheAnalysis/interchange-refcost-overflow.ll
@@ -10,9 +10,9 @@
 ;         A[c][d][e] = 0;
 ; }
 
-; CHECK: Loop 'outer.loop' has cost = 9223372036854775807
-; CHECK: Loop 'middle.loop' has cost = 9223372036854775807
-; CHECK: Loop 'inner.loop' has cost = 9223372036854775807
+; CHECK: Loop 'outer.loop' has cost = 76861433640456465
+; CHECK: Loop 'middle.loop' has cost = 76861433640456465
+; CHECK: Loop 'inner.loop' has cost = 76861433640456465
 
 @A = local_unnamed_addr global [11 x [11 x [11 x i32]]] zeroinitializer, align 16
 
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/call-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/call-costs.ll
index 95b4dcb23dd47..abb215ae595d7 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/call-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/call-costs.ll
@@ -104,26 +104,81 @@ exit:
 define void @call_scalarized(ptr noalias %src, ptr noalias %dst) {
 ; CHECK-LABEL: define void @call_scalarized(
 ; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
-; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    br label %[[LOOP_HEADER:.*]]
 ; CHECK:       [[LOOP_HEADER]]:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 100, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
-; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], -1
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[LOOP_HEADER]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE8:.*]] ]
+; CHECK-NEXT:    [[IV:%.*]] = sub i64 100, [[INDEX]]
+; CHECK-NEXT:    [[IV_NEXT:%.*]] = add i64 [[IV]], -1
 ; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr double, ptr [[SRC]], i64 [[IV_NEXT]]
-; CHECK-NEXT:    [[L:%.*]] = load double, ptr [[GEP_SRC]], align 8
-; CHECK-NEXT:    [[CMP295:%.*]] = fcmp une double [[L]], 4.000000e+00
-; CHECK-NEXT:    [[CMP299:%.*]] = fcmp ugt double [[L]], 0.000000e+00
-; CHECK-NEXT:    [[OR_COND:%.*]] = or i1 [[CMP295]], [[CMP299]]
-; CHECK-NEXT:    br i1 [[OR_COND]], label %[[LOOP_LATCH]], label %[[THEN:.*]]
-; CHECK:       [[THEN]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr double, ptr [[GEP_SRC]], i64 0
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr double, ptr [[TMP2]], i64 -1
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr double, ptr [[GEP_SRC]], i64 -2
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr double, ptr [[TMP4]], i64 -1
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP3]], align 8
+; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <2 x double>, ptr [[TMP5]], align 8
+; CHECK-NEXT:    [[REVERSE:%.*]] = shufflevector <2 x double> [[WIDE_LOAD]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[REVERSE2:%.*]] = shufflevector <2 x double> [[WIDE_LOAD1]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[TMP6:%.*]] = fcmp une <2 x double> [[REVERSE]], splat (double 4.000000e+00)
+; CHECK-NEXT:    [[TMP7:%.*]] = fcmp une <2 x double> [[REVERSE2]], splat (double 4.000000e+00)
+; CHECK-NEXT:    [[TMP8:%.*]] = fcmp ugt <2 x double> [[REVERSE]], zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = fcmp ugt <2 x double> [[REVERSE2]], zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = or <2 x i1> [[TMP6]], [[TMP8]]
+; CHECK-NEXT:    [[TMP11:%.*]] = or <2 x i1> [[TMP7]], [[TMP9]]
+; CHECK-NEXT:    [[TMP12:%.*]] = xor <2 x i1> [[TMP10]], splat (i1 true)
+; CHECK-NEXT:    [[TMP13:%.*]] = xor <2 x i1> [[TMP11]], splat (i1 true)
+; CHECK-NEXT:    [[OR_COND:%.*]] = extractelement <2 x i1> [[TMP12]], i32 0
+; CHECK-NEXT:    br i1 [[OR_COND]], label %[[LOOP_LATCH:.*]], label %[[THEN:.*]]
+; CHECK:       [[LOOP_LATCH]]:
+; CHECK-NEXT:    [[TMP15:%.*]] = add i64 [[IV]], 0
+; CHECK-NEXT:    [[TMP16:%.*]] = add i64 [[TMP15]], -1
+; CHECK-NEXT:    [[GEP_DST:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP16]]
+; CHECK-NEXT:    [[L:%.*]] = extractelement <2 x double> [[REVERSE]], i32 0
 ; CHECK-NEXT:    [[SQRT:%.*]] = call double @llvm.sqrt.f64(double [[L]])
-; CHECK-NEXT:    [[GEP_DST:%.*]] = getelementptr double, ptr [[DST]], i64 [[IV_NEXT]]
 ; CHECK-NEXT:    store double [[SQRT]], ptr [[GEP_DST]], align 8
-; CHECK-NEXT:    br label %[[LOOP_LATCH]]
-; CHECK:       [[LOOP_LATCH]]:
-; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 0
-; CHECK-NEXT:    br i1 [[TOBOOL_NOT]], label %[[EXIT:.*]], label %[[LOOP_HEADER]]
+; CHECK-NEXT:    br label %[[THEN]]
+; CHECK:       [[THEN]]:
+; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <2 x i1> [[TMP12]], i32 1
+; CHECK-NEXT:    br i1 [[TMP20]], label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]]
+; CHECK:       [[PRED_STORE_IF3]]:
+; CHECK-NEXT:    [[TMP21:%.*]] = add i64 [[IV]], -1
+; CHECK-NEXT:    [[TMP22:%.*]] = add i64 [[TMP21]], -1
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP22]]
+; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <2 x double> [[REVERSE]], i32 1
+; CHECK-NEXT:    [[TMP25:%.*]] = call double @llvm.sqrt.f64(double [[TMP24]])
+; CHECK-NEXT:    store double [[TMP25]], ptr [[TMP23]], align 8
+; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE4]]
+; CHECK:       [[PRED_STORE_CONTINUE4]]:
+; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <2 x i1> [[TMP13]], i32 0
+; CHECK-NEXT:    br i1 [[TMP26]], label %[[EXIT:.*]], label %[[PRED_STORE_CONTINUE6:.*]]
 ; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[TMP27:%.*]] = add i64 [[IV]], -2
+; CHECK-NEXT:    [[TMP28:%.*]] = add i64 [[TMP27]], -1
+; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP28]]
+; CHECK-NEXT:    [[TMP30:%.*]] = extractelement <2 x double> [[REVERSE2]], i32 0
+; CHECK-NEXT:    [[TMP31:%.*]] = call double @llvm.sqrt.f64(double [[TMP30]])
+; CHECK-NEXT:    store double [[TMP31]], ptr [[TMP29]], align 8
+; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE6]]
+; CHECK:       [[PRED_STORE_CONTINUE6]]:
+; CHECK-NEXT:    [[TMP32:%.*]] = extractelement <2 x i1> [[TMP13]], i32 1
+; CHECK-NEXT:    br i1 [[TMP32]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8]]
+; CHECK:       [[PRED_STORE_IF7]]:
+; CHECK-NEXT:    [[TMP33:%.*]] = add i64 [[IV]], -3
+; CHECK-NEXT:    [[TMP34:%.*]] = add i64 [[TMP33]], -1
+; CHECK-NEXT:    [[TMP35:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP34]]
+; CHECK-NEXT:    [[TMP36:%.*]] = extractelement <2 x double> [[REVERSE2]], i32 1
+; CHECK-NEXT:    [[TMP37:%.*]] = call double @llvm.sqrt.f64(double [[TMP36]])
+; CHECK-NEXT:    store double [[TMP37]], ptr [[TMP35]], align 8
+; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE8]]
+; CHECK:       [[PRED_STORE_CONTINUE8]]:
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP38:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
+; CHECK-NEXT:    br i1 [[TMP38]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    br label %[[EXIT1:.*]]
+; CHECK:       [[EXIT1]]:
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -211,4 +266,5 @@ declare i64 @llvm.fshl.i64(i64, i64, i64)
 ; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
 ; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
 ; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
 ;.
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/predicated-costs.ll b/llvm/test/Transforms/LoopVectorize/RISCV/predicated-costs.ll
index e0ab30b0ae5cc..2316a478becc5 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/predicated-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/predicated-costs.ll
@@ -8,23 +8,29 @@
 define void @nested(ptr noalias %p0, ptr noalias %p1, i1 %c0, i1 %c1) {
 ; CHECK-LABEL: define void @nested(
 ; CHECK-SAME: ptr noalias [[P0:%.*]], ptr noalias [[P1:%.*]], i1 [[C0:%.*]], i1 [[C1:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
-; CHECK-NEXT:    [[IV1:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LATCH:.*]] ]
-; CHECK-NEXT:    br i1 [[C0]], label %[[THEN_0:.*]], label %[[LATCH]]
-; CHECK:       [[THEN_0]]:
-; CHECK-NEXT:    br i1 [[C1]], label %[[THEN_1:.*]], label %[[LATCH]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i1> poison, i1 [[C1]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i1> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i1> poison, i1 [[C0]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i1> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP0:%.*]] = select <vscale x 4 x i1> [[BROADCAST_SPLAT2]], <vscale x 4 x i1> [[BROADCAST_SPLAT]], <vscale x 4 x i1> zeroinitializer
+; CHECK-NEXT:    br label %[[THEN_1:.*]]
 ; CHECK:       [[THEN_1]]:
+; CHECK-NEXT:    [[IV1:%.*]] = phi i32 [ 0, %[[LOOP]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[THEN_1]] ]
+; CHECK-NEXT:    [[AVL:%.*]] = phi i32 [ 1024, %[[LOOP]] ], [ [[AVL_NEXT:%.*]], %[[THEN_1]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[AVL]], i32 4, i1 true)
 ; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr i32, ptr [[P0]], i32 [[IV1]]
-; CHECK-NEXT:    [[X:%.*]] = load i32, ptr [[GEP2]], align 4
-; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr i32, ptr [[P1]], i32 [[X]]
-; CHECK-NEXT:    store i32 0, ptr [[GEP1]], align 4
-; CHECK-NEXT:    br label %[[LATCH]]
+; CHECK-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[GEP2]], <vscale x 4 x i1> [[TMP0]], i32 [[TMP1]])
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i32, ptr [[P1]], <vscale x 4 x i32> [[VP_OP_LOAD]]
+; CHECK-NEXT:    call void @llvm.vp.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x ptr> align 4 [[TMP3]], <vscale x 4 x i1> [[TMP0]], i32 [[TMP1]])
+; CHECK-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i32 [[TMP1]], [[IV1]]
+; CHECK-NEXT:    [[AVL_NEXT]] = sub nuw i32 [[AVL]], [[TMP1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[AVL_NEXT]], 0
+; CHECK-NEXT:    br i1 [[TMP4]], label %[[LATCH:.*]], label %[[THEN_1]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       [[LATCH]]:
-; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV1]], 1
-; CHECK-NEXT:    [[DONE:%.*]] = icmp eq i32 [[IV_NEXT]], 1024
-; CHECK-NEXT:    br i1 [[DONE]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK-NEXT:    br label %[[EXIT:.*]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret void
 ;
@@ -80,7 +86,7 @@ define void @always_taken(ptr noalias %p0, ptr noalias %p1, i1 %c0, i1 %c1) {
 ; CHECK-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i32 [[TMP1]], [[EVL_BASED_IV]]
 ; CHECK-NEXT:    [[AVL_NEXT]] = sub nuw i32 [[AVL]], [[TMP1]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[AVL_NEXT]], 0
-; CHECK-NEXT:    br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    br label %[[EXIT:.*]]
 ; CHECK:       [[EXIT]]:

>From 74f68f0a3ba1f054b59f76a9c69bb3b57d23b90a Mon Sep 17 00:00:00 2001
From: bababuck <buchner.ryan at gmail.com>
Date: Wed, 4 Feb 2026 16:28:46 -0800
Subject: [PATCH 02/15] [X86] Update X86 tests

---
 .../LoopVectorize/X86/cost-model-i386.ll      |  49 ++++-
 .../LoopVectorize/X86/masked_load_store.ll    |  68 ++++---
 .../LoopVectorize/X86/predicate-switch.ll     | 185 ++++++++++++++++--
 .../X86/replicate-uniform-call.ll             |  44 ++++-
 .../X86/pr48844-br-to-switch-vectorization.ll |  82 +++++++-
 5 files changed, 377 insertions(+), 51 deletions(-)

diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-model-i386.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-model-i386.ll
index 14f20464093cf..e53fdbc12d919 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/cost-model-i386.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/cost-model-i386.ll
@@ -6,26 +6,63 @@ target triple = "i386-unknow-linux"
 define void @icmp_predicate_and_branch_cost(i32 %size, ptr %dst, i64 %conv5.i) #0 {
 ; CHECK-LABEL: @icmp_predicate_and_branch_cost(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[SIZE:%.*]], 7
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr i32 [[TMP0]], 3
+; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i32 [[TMP1]], 1
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP2]], 16
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[TMP2]], 16
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[TMP2]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i32 [[N_VEC]], 8
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i64> poison, i64 [[CONV5_I:%.*]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT]], <16 x i64> poison, <16 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <16 x i32> poison, i32 [[SIZE]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT1]], <16 x i32> poison, <16 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[LOOP_HEADER:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[LOOP_HEADER]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <16 x i32> [ <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56, i32 64, i32 72, i32 80, i32 88, i32 96, i32 104, i32 112, i32 120>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[LOOP_HEADER]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = zext <16 x i32> [[VEC_IND]] to <16 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = add <16 x i64> [[TMP4]], splat (i64 8)
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp ugt <16 x i64> [[TMP5]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp uge <16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
+; CHECK-NEXT:    [[TMP8:%.*]] = or <16 x i32> [[BROADCAST_SPLAT2]], [[VEC_IND]]
+; CHECK-NEXT:    [[TMP9:%.*]] = trunc <16 x i32> [[TMP8]] to <16 x i8>
+; CHECK-NEXT:    [[TMP10:%.*]] = select <16 x i1> [[TMP6]], <16 x i1> [[TMP7]], <16 x i1> zeroinitializer
+; CHECK-NEXT:    [[PREDPHI:%.*]] = select <16 x i1> [[TMP10]], <16 x i8> zeroinitializer, <16 x i8> [[TMP9]]
+; CHECK-NEXT:    [[PREDPHI3:%.*]] = select <16 x i1> [[TMP6]], <16 x i8> [[PREDPHI]], <16 x i8> splat (i8 1)
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <16 x i8> [[PREDPHI3]], i32 15
+; CHECK-NEXT:    store i8 [[TMP11]], ptr [[DST:%.*]], align 1
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <16 x i32> [[VEC_IND]], splat (i32 128)
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP_HEADER]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[TMP3]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[LOOP_HEADER1:%.*]]
 ; CHECK:       loop.header:
-; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
 ; CHECK-NEXT:    [[EXT_IV:%.*]] = zext i32 [[IV]] to i64
 ; CHECK-NEXT:    [[ADD_IV:%.*]] = add i64 [[EXT_IV]], 8
-; CHECK-NEXT:    [[C_1:%.*]] = icmp ugt i64 [[ADD_IV]], [[CONV5_I:%.*]]
+; CHECK-NEXT:    [[C_1:%.*]] = icmp ugt i64 [[ADD_IV]], [[CONV5_I]]
 ; CHECK-NEXT:    br i1 [[C_1]], label [[THEN_1:%.*]], label [[LOOP_LATCH]]
 ; CHECK:       then.1:
-; CHECK-NEXT:    [[C_2:%.*]] = icmp ult i32 [[IV]], [[SIZE:%.*]]
+; CHECK-NEXT:    [[C_2:%.*]] = icmp ult i32 [[IV]], [[SIZE]]
 ; CHECK-NEXT:    br i1 [[C_2]], label [[THEN_2:%.*]], label [[LOOP_LATCH]]
 ; CHECK:       then.2:
 ; CHECK-NEXT:    [[OR:%.*]] = or i32 [[SIZE]], [[IV]]
 ; CHECK-NEXT:    [[TRUNC_OR:%.*]] = trunc i32 [[OR]] to i8
 ; CHECK-NEXT:    br label [[LOOP_LATCH]]
 ; CHECK:       loop.latch:
-; CHECK-NEXT:    [[SINK:%.*]] = phi i8 [ [[TRUNC_OR]], [[THEN_2]] ], [ 1, [[LOOP_HEADER]] ], [ 0, [[THEN_1]] ]
-; CHECK-NEXT:    store i8 [[SINK]], ptr [[DST:%.*]], align 1
+; CHECK-NEXT:    [[SINK:%.*]] = phi i8 [ [[TRUNC_OR]], [[THEN_2]] ], [ 1, [[LOOP_HEADER1]] ], [ 0, [[THEN_1]] ]
+; CHECK-NEXT:    store i8 [[SINK]], ptr [[DST]], align 1
 ; CHECK-NEXT:    [[IV_NEXT]] = trunc i64 [[ADD_IV]] to i32
 ; CHECK-NEXT:    [[C_3:%.*]] = icmp ugt i32 [[SIZE]], [[IV]]
-; CHECK-NEXT:    br i1 [[C_3]], label [[LOOP_HEADER]], label [[EXIT:%.*]]
+; CHECK-NEXT:    br i1 [[C_3]], label [[LOOP_HEADER1]], label [[EXIT]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll b/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll
index 1808e80a97060..6dc5813720a89 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll
@@ -1073,27 +1073,49 @@ for.end:                                          ; preds = %for.inc
 define void @foo6(ptr nocapture readonly %in, ptr nocapture %out, i32 %size, ptr nocapture readonly %trigger) local_unnamed_addr #0 {
 ; AVX1-LABEL: define void @foo6(
 ; AVX1-SAME: ptr readonly captures(none) [[IN:%.*]], ptr captures(none) [[OUT:%.*]], i32 [[SIZE:%.*]], ptr readonly captures(none) [[TRIGGER:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; AVX1-NEXT:  [[ENTRY:.*]]:
+; AVX1-NEXT:  [[ENTRY:.*:]]
 ; AVX1-NEXT:    br label %[[FOR_BODY:.*]]
 ; AVX1:       [[FOR_BODY]]:
-; AVX1-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 4095, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_INC:.*]] ]
-; AVX1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[INDVARS_IV]]
-; AVX1-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; AVX1-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[TMP0]], 0
-; AVX1-NEXT:    br i1 [[CMP1]], label %[[IF_THEN:.*]], label %[[FOR_INC]]
-; AVX1:       [[IF_THEN]]:
-; AVX1-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds double, ptr [[IN]], i64 [[INDVARS_IV]]
-; AVX1-NEXT:    [[TMP1:%.*]] = load double, ptr [[ARRAYIDX3]], align 8
-; AVX1-NEXT:    [[ADD:%.*]] = fadd double [[TMP1]], 5.000000e-01
-; AVX1-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds double, ptr [[OUT]], i64 [[INDVARS_IV]]
-; AVX1-NEXT:    store double [[ADD]], ptr [[ARRAYIDX5]], align 8
-; AVX1-NEXT:    br label %[[FOR_INC]]
+; AVX1-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[OUT]], i64 32768
+; AVX1-NEXT:    [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[TRIGGER]], i64 16384
+; AVX1-NEXT:    [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[IN]], i64 32768
+; AVX1-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[OUT]], [[SCEVGEP1]]
+; AVX1-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[TRIGGER]], [[SCEVGEP]]
+; AVX1-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; AVX1-NEXT:    [[BOUND03:%.*]] = icmp ult ptr [[OUT]], [[SCEVGEP2]]
+; AVX1-NEXT:    [[BOUND14:%.*]] = icmp ult ptr [[IN]], [[SCEVGEP]]
+; AVX1-NEXT:    [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]]
+; AVX1-NEXT:    [[CMP1:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]]
+; AVX1-NEXT:    br i1 [[CMP1]], label %[[IF_THEN:.*]], label %[[FOR_INC:.*]]
 ; AVX1:       [[FOR_INC]]:
-; AVX1-NEXT:    [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1
-; AVX1-NEXT:    [[CMP:%.*]] = icmp eq i64 [[INDVARS_IV]], 0
-; AVX1-NEXT:    br i1 [[CMP]], label %[[FOR_END:.*]], label %[[FOR_BODY]]
+; AVX1-NEXT:    br label %[[VECTOR_BODY:.*]]
+; AVX1:       [[VECTOR_BODY]]:
+; AVX1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[FOR_INC]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; AVX1-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 4095, [[INDEX]]
+; AVX1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[OFFSET_IDX]]
+; AVX1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 0
+; AVX1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 -3
+; AVX1-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4, !alias.scope [[META18:![0-9]+]]
+; AVX1-NEXT:    [[REVERSE:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX1-NEXT:    [[TMP3:%.*]] = icmp sgt <4 x i32> [[REVERSE]], zeroinitializer
+; AVX1-NEXT:    [[TMP4:%.*]] = getelementptr double, ptr [[IN]], i64 [[OFFSET_IDX]]
+; AVX1-NEXT:    [[TMP5:%.*]] = getelementptr double, ptr [[TMP4]], i64 0
+; AVX1-NEXT:    [[TMP6:%.*]] = getelementptr double, ptr [[TMP5]], i64 -3
+; AVX1-NEXT:    [[REVERSE6:%.*]] = shufflevector <4 x i1> [[TMP3]], <4 x i1> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX1-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr align 8 [[TMP6]], <4 x i1> [[REVERSE6]], <4 x double> poison), !alias.scope [[META21:![0-9]+]]
+; AVX1-NEXT:    [[REVERSE7:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX1-NEXT:    [[TMP7:%.*]] = fadd <4 x double> [[REVERSE7]], splat (double 5.000000e-01)
+; AVX1-NEXT:    [[TMP8:%.*]] = getelementptr double, ptr [[OUT]], i64 [[OFFSET_IDX]]
+; AVX1-NEXT:    [[TMP9:%.*]] = getelementptr double, ptr [[TMP8]], i64 0
+; AVX1-NEXT:    [[TMP10:%.*]] = getelementptr double, ptr [[TMP9]], i64 -3
+; AVX1-NEXT:    [[REVERSE8:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX1-NEXT:    call void @llvm.masked.store.v4f64.p0(<4 x double> [[REVERSE8]], ptr align 8 [[TMP10]], <4 x i1> [[REVERSE6]]), !alias.scope [[META23:![0-9]+]], !noalias [[META25:![0-9]+]]
+; AVX1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; AVX1-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+; AVX1-NEXT:    br i1 [[TMP11]], label %[[FOR_END:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
 ; AVX1:       [[FOR_END]]:
-; AVX1-NEXT:    ret void
+; AVX1-NEXT:    br [[FOR_END1:label %.*]]
+; AVX1:       [[IF_THEN]]:
 ;
 ; AVX2-LABEL: define void @foo6(
 ; AVX2-SAME: ptr readonly captures(none) [[IN:%.*]], ptr captures(none) [[OUT:%.*]], i32 [[SIZE:%.*]], ptr readonly captures(none) [[TRIGGER:%.*]]) local_unnamed_addr #[[ATTR0]] {
@@ -1373,13 +1395,13 @@ define void @foo7(ptr noalias nocapture %out, ptr noalias nocapture readonly %in
 ; AVX1-NEXT:    call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr align 8 [[TMP39]], <4 x i1> [[TMP34]])
 ; AVX1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; AVX1-NEXT:    [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; AVX1-NEXT:    br i1 [[TMP40]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; AVX1-NEXT:    br i1 [[TMP40]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
 ; AVX1:       [[MIDDLE_BLOCK]]:
 ; AVX1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
 ; AVX1-NEXT:    br i1 [[CMP_N]], [[FOR_END_LOOPEXIT:label %.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
 ; AVX1:       [[VEC_EPILOG_ITER_CHECK]]:
 ; AVX1-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4
-; AVX1-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF19:![0-9]+]]
+; AVX1-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF29:![0-9]+]]
 ; AVX1:       [[VEC_EPILOG_PH]]:
 ; AVX1-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; AVX1-NEXT:    [[N_MOD_VF8:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4
@@ -1399,7 +1421,7 @@ define void @foo7(ptr noalias nocapture %out, ptr noalias nocapture readonly %in
 ; AVX1-NEXT:    call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr align 8 [[TMP52]], <4 x i1> [[TMP51]])
 ; AVX1-NEXT:    [[INDEX_NEXT13]] = add nuw i64 [[INDEX10]], 4
 ; AVX1-NEXT:    [[TMP54:%.*]] = icmp eq i64 [[INDEX_NEXT13]], [[N_VEC9]]
-; AVX1-NEXT:    br i1 [[TMP54]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; AVX1-NEXT:    br i1 [[TMP54]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
 ; AVX1:       [[VEC_EPILOG_MIDDLE_BLOCK]]:
 ; AVX1-NEXT:    [[CMP_N14:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC9]]
 ; AVX1-NEXT:    br i1 [[CMP_N14]], [[FOR_END_LOOPEXIT]], label %[[VEC_EPILOG_SCALAR_PH]]
@@ -1694,13 +1716,13 @@ define void @foo8(ptr noalias nocapture %out, ptr noalias nocapture readonly %in
 ; AVX1-NEXT:    call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr align 8 [[TMP39]], <4 x i1> [[TMP34]])
 ; AVX1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; AVX1-NEXT:    [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; AVX1-NEXT:    br i1 [[TMP40]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; AVX1-NEXT:    br i1 [[TMP40]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]]
 ; AVX1:       [[MIDDLE_BLOCK]]:
 ; AVX1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
 ; AVX1-NEXT:    br i1 [[CMP_N]], [[FOR_END_LOOPEXIT:label %.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
 ; AVX1:       [[VEC_EPILOG_ITER_CHECK]]:
 ; AVX1-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4
-; AVX1-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF19]]
+; AVX1-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF29]]
 ; AVX1:       [[VEC_EPILOG_PH]]:
 ; AVX1-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; AVX1-NEXT:    [[N_MOD_VF8:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4
@@ -1720,7 +1742,7 @@ define void @foo8(ptr noalias nocapture %out, ptr noalias nocapture readonly %in
 ; AVX1-NEXT:    call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr align 8 [[TMP52]], <4 x i1> [[TMP51]])
 ; AVX1-NEXT:    [[INDEX_NEXT13]] = add nuw i64 [[INDEX10]], 4
 ; AVX1-NEXT:    [[TMP54:%.*]] = icmp eq i64 [[INDEX_NEXT13]], [[N_VEC9]]
-; AVX1-NEXT:    br i1 [[TMP54]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
+; AVX1-NEXT:    br i1 [[TMP54]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP33:![0-9]+]]
 ; AVX1:       [[VEC_EPILOG_MIDDLE_BLOCK]]:
 ; AVX1-NEXT:    [[CMP_N14:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC9]]
 ; AVX1-NEXT:    br i1 [[CMP_N14]], [[FOR_END_LOOPEXIT]], label %[[VEC_EPILOG_SCALAR_PH]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/predicate-switch.ll b/llvm/test/Transforms/LoopVectorize/X86/predicate-switch.ll
index 5a396f88b1a64..75a0623a2e0d3 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/predicate-switch.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/predicate-switch.ll
@@ -288,9 +288,46 @@ define void @switch_all_dests_distinct(ptr %start, ptr %end) {
 ; COST-LABEL: define void @switch_all_dests_distinct(
 ; COST-SAME: ptr [[START:%.*]], ptr [[END:%.*]]) #[[ATTR0]] {
 ; COST-NEXT:  [[ENTRY:.*]]:
+; COST-NEXT:    [[START2:%.*]] = ptrtoint ptr [[START]] to i64
+; COST-NEXT:    [[END1:%.*]] = ptrtoint ptr [[END]] to i64
+; COST-NEXT:    [[TMP0:%.*]] = add i64 [[END1]], -8
+; COST-NEXT:    [[TMP1:%.*]] = sub i64 [[TMP0]], [[START2]]
+; COST-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP1]], 3
+; COST-NEXT:    [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1
+; COST-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 4
+; COST-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; COST:       [[VECTOR_PH]]:
+; COST-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 4
+; COST-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]]
+; COST-NEXT:    [[TMP4:%.*]] = mul i64 [[N_VEC]], 8
+; COST-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP4]]
 ; COST-NEXT:    br label %[[LOOP_HEADER:.*]]
 ; COST:       [[LOOP_HEADER]]:
-; COST-NEXT:    [[PTR_IV:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
+; COST-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[LOOP_HEADER]] ]
+; COST-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8
+; COST-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[OFFSET_IDX]]
+; COST-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[NEXT_GEP]], align 1
+; COST-NEXT:    [[TMP6:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], splat (i64 -12)
+; COST-NEXT:    [[TMP7:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], splat (i64 13)
+; COST-NEXT:    [[TMP8:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], zeroinitializer
+; COST-NEXT:    [[TMP9:%.*]] = or <4 x i1> [[TMP6]], [[TMP7]]
+; COST-NEXT:    [[TMP10:%.*]] = or <4 x i1> [[TMP9]], [[TMP8]]
+; COST-NEXT:    [[TMP11:%.*]] = xor <4 x i1> [[TMP10]], splat (i1 true)
+; COST-NEXT:    call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 1), ptr align 1 [[NEXT_GEP]], <4 x i1> [[TMP8]])
+; COST-NEXT:    call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr align 1 [[NEXT_GEP]], <4 x i1> [[TMP7]])
+; COST-NEXT:    call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 42), ptr align 1 [[NEXT_GEP]], <4 x i1> [[TMP6]])
+; COST-NEXT:    call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 2), ptr align 1 [[NEXT_GEP]], <4 x i1> [[TMP11]])
+; COST-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; COST-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; COST-NEXT:    br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP6:![0-9]+]]
+; COST:       [[MIDDLE_BLOCK]]:
+; COST-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
+; COST-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; COST:       [[SCALAR_PH]]:
+; COST-NEXT:    [[BC_RESUME_VAL:%.*]] = phi ptr [ [[TMP5]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[ENTRY]] ]
+; COST-NEXT:    br label %[[LOOP_HEADER1:.*]]
+; COST:       [[LOOP_HEADER1]]:
+; COST-NEXT:    [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
 ; COST-NEXT:    [[L:%.*]] = load i64, ptr [[PTR_IV]], align 1
 ; COST-NEXT:    switch i64 [[L]], label %[[DEFAULT:.*]] [
 ; COST-NEXT:      i64 -12, label %[[IF_THEN_1:.*]]
@@ -312,7 +349,7 @@ define void @switch_all_dests_distinct(ptr %start, ptr %end) {
 ; COST:       [[LOOP_LATCH]]:
 ; COST-NEXT:    [[PTR_IV_NEXT]] = getelementptr inbounds i64, ptr [[PTR_IV]], i64 1
 ; COST-NEXT:    [[EC:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[END]]
-; COST-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP_HEADER]]
+; COST-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER1]], !llvm.loop [[LOOP7:![0-9]+]]
 ; COST:       [[EXIT]]:
 ; COST-NEXT:    ret void
 ;
@@ -469,7 +506,7 @@ define void @switch_all_dests_distinct_variant_using_branches(ptr %start, ptr %e
 ; COST-NEXT:    call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 42), ptr align 1 [[NEXT_GEP]], <4 x i1> [[TMP7]])
 ; COST-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; COST-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; COST-NEXT:    br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; COST-NEXT:    br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; COST:       [[MIDDLE_BLOCK]]:
 ; COST-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
 ; COST-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
@@ -502,7 +539,7 @@ define void @switch_all_dests_distinct_variant_using_branches(ptr %start, ptr %e
 ; COST:       [[LOOP_LATCH]]:
 ; COST-NEXT:    [[PTR_IV_NEXT]] = getelementptr inbounds i64, ptr [[PTR_IV]], i64 1
 ; COST-NEXT:    [[EC:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[END]]
-; COST-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP7:![0-9]+]]
+; COST-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP9:![0-9]+]]
 ; COST:       [[EXIT]]:
 ; COST-NEXT:    ret void
 ;
@@ -639,9 +676,49 @@ define void @switch_multiple_common_dests(ptr %start, ptr %end) {
 ; COST-LABEL: define void @switch_multiple_common_dests(
 ; COST-SAME: ptr [[START:%.*]], ptr [[END:%.*]]) #[[ATTR0]] {
 ; COST-NEXT:  [[ENTRY:.*]]:
+; COST-NEXT:    [[START2:%.*]] = ptrtoint ptr [[START]] to i64
+; COST-NEXT:    [[END1:%.*]] = ptrtoint ptr [[END]] to i64
+; COST-NEXT:    [[TMP0:%.*]] = add i64 [[END1]], -8
+; COST-NEXT:    [[TMP1:%.*]] = sub i64 [[TMP0]], [[START2]]
+; COST-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP1]], 3
+; COST-NEXT:    [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1
+; COST-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 4
+; COST-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; COST:       [[VECTOR_PH]]:
+; COST-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 4
+; COST-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]]
+; COST-NEXT:    [[TMP4:%.*]] = mul i64 [[N_VEC]], 8
+; COST-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP4]]
 ; COST-NEXT:    br label %[[LOOP_HEADER:.*]]
 ; COST:       [[LOOP_HEADER]]:
-; COST-NEXT:    [[PTR_IV:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
+; COST-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[LOOP_HEADER]] ]
+; COST-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8
+; COST-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[OFFSET_IDX]]
+; COST-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[NEXT_GEP]], align 1
+; COST-NEXT:    [[TMP6:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], splat (i64 -12)
+; COST-NEXT:    [[TMP7:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], zeroinitializer
+; COST-NEXT:    [[TMP8:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], splat (i64 13)
+; COST-NEXT:    [[TMP9:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], splat (i64 14)
+; COST-NEXT:    [[TMP10:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], splat (i64 15)
+; COST-NEXT:    [[TMP11:%.*]] = or <4 x i1> [[TMP6]], [[TMP7]]
+; COST-NEXT:    [[TMP12:%.*]] = or <4 x i1> [[TMP8]], [[TMP9]]
+; COST-NEXT:    [[TMP13:%.*]] = or <4 x i1> [[TMP12]], [[TMP10]]
+; COST-NEXT:    [[TMP14:%.*]] = or <4 x i1> [[TMP11]], [[TMP13]]
+; COST-NEXT:    [[TMP15:%.*]] = xor <4 x i1> [[TMP14]], splat (i1 true)
+; COST-NEXT:    call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr align 1 [[NEXT_GEP]], <4 x i1> [[TMP13]])
+; COST-NEXT:    call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 42), ptr align 1 [[NEXT_GEP]], <4 x i1> [[TMP11]])
+; COST-NEXT:    call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 2), ptr align 1 [[NEXT_GEP]], <4 x i1> [[TMP15]])
+; COST-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; COST-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; COST-NEXT:    br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP10:![0-9]+]]
+; COST:       [[MIDDLE_BLOCK]]:
+; COST-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
+; COST-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; COST:       [[SCALAR_PH]]:
+; COST-NEXT:    [[BC_RESUME_VAL:%.*]] = phi ptr [ [[TMP5]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[ENTRY]] ]
+; COST-NEXT:    br label %[[LOOP_HEADER1:.*]]
+; COST:       [[LOOP_HEADER1]]:
+; COST-NEXT:    [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
 ; COST-NEXT:    [[L:%.*]] = load i64, ptr [[PTR_IV]], align 1
 ; COST-NEXT:    switch i64 [[L]], label %[[DEFAULT:.*]] [
 ; COST-NEXT:      i64 -12, label %[[IF_THEN_1:.*]]
@@ -662,7 +739,7 @@ define void @switch_multiple_common_dests(ptr %start, ptr %end) {
 ; COST:       [[LOOP_LATCH]]:
 ; COST-NEXT:    [[PTR_IV_NEXT]] = getelementptr inbounds i64, ptr [[PTR_IV]], i64 1
 ; COST-NEXT:    [[EC:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[END]]
-; COST-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP_HEADER]]
+; COST-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER1]], !llvm.loop [[LOOP11:![0-9]+]]
 ; COST:       [[EXIT]]:
 ; COST-NEXT:    ret void
 ;
@@ -790,9 +867,43 @@ define void @switch4_default_common_dest_with_case(ptr %start, ptr %end) {
 ; COST-LABEL: define void @switch4_default_common_dest_with_case(
 ; COST-SAME: ptr [[START:%.*]], ptr [[END:%.*]]) #[[ATTR0]] {
 ; COST-NEXT:  [[ENTRY:.*]]:
+; COST-NEXT:    [[START2:%.*]] = ptrtoint ptr [[START]] to i64
+; COST-NEXT:    [[END1:%.*]] = ptrtoint ptr [[END]] to i64
+; COST-NEXT:    [[TMP0:%.*]] = add i64 [[END1]], -8
+; COST-NEXT:    [[TMP1:%.*]] = sub i64 [[TMP0]], [[START2]]
+; COST-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP1]], 3
+; COST-NEXT:    [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1
+; COST-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 4
+; COST-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; COST:       [[VECTOR_PH]]:
+; COST-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 4
+; COST-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]]
+; COST-NEXT:    [[TMP4:%.*]] = mul i64 [[N_VEC]], 8
+; COST-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP4]]
 ; COST-NEXT:    br label %[[LOOP_HEADER:.*]]
 ; COST:       [[LOOP_HEADER]]:
-; COST-NEXT:    [[PTR_IV:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
+; COST-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[LOOP_HEADER]] ]
+; COST-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8
+; COST-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[OFFSET_IDX]]
+; COST-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[NEXT_GEP]], align 1
+; COST-NEXT:    [[TMP6:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], splat (i64 -12)
+; COST-NEXT:    [[TMP7:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], splat (i64 13)
+; COST-NEXT:    [[TMP8:%.*]] = or <4 x i1> [[TMP6]], [[TMP7]]
+; COST-NEXT:    [[TMP9:%.*]] = xor <4 x i1> [[TMP8]], splat (i1 true)
+; COST-NEXT:    call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr align 1 [[NEXT_GEP]], <4 x i1> [[TMP7]])
+; COST-NEXT:    call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 42), ptr align 1 [[NEXT_GEP]], <4 x i1> [[TMP6]])
+; COST-NEXT:    call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 2), ptr align 1 [[NEXT_GEP]], <4 x i1> [[TMP9]])
+; COST-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; COST-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; COST-NEXT:    br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP12:![0-9]+]]
+; COST:       [[MIDDLE_BLOCK]]:
+; COST-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
+; COST-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; COST:       [[SCALAR_PH]]:
+; COST-NEXT:    [[BC_RESUME_VAL:%.*]] = phi ptr [ [[TMP5]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[ENTRY]] ]
+; COST-NEXT:    br label %[[LOOP_HEADER1:.*]]
+; COST:       [[LOOP_HEADER1]]:
+; COST-NEXT:    [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
 ; COST-NEXT:    [[L:%.*]] = load i64, ptr [[PTR_IV]], align 1
 ; COST-NEXT:    switch i64 [[L]], label %[[DEFAULT:.*]] [
 ; COST-NEXT:      i64 -12, label %[[IF_THEN_1:.*]]
@@ -811,7 +922,7 @@ define void @switch4_default_common_dest_with_case(ptr %start, ptr %end) {
 ; COST:       [[LOOP_LATCH]]:
 ; COST-NEXT:    [[PTR_IV_NEXT]] = getelementptr inbounds i64, ptr [[PTR_IV]], i64 1
 ; COST-NEXT:    [[EC:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[END]]
-; COST-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP_HEADER]]
+; COST-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER1]], !llvm.loop [[LOOP13:![0-9]+]]
 ; COST:       [[EXIT]]:
 ; COST-NEXT:    ret void
 ;
@@ -957,7 +1068,7 @@ define void @switch_under_br_default_common_dest_with_case(ptr %start, ptr %end,
 ; COST-NEXT:    call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 2), ptr align 1 [[NEXT_GEP]], <4 x i1> [[TMP14]])
 ; COST-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; COST-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; COST-NEXT:    br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; COST-NEXT:    br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
 ; COST:       [[MIDDLE_BLOCK]]:
 ; COST-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
 ; COST-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
@@ -987,7 +1098,7 @@ define void @switch_under_br_default_common_dest_with_case(ptr %start, ptr %end,
 ; COST:       [[LOOP_LATCH]]:
 ; COST-NEXT:    [[PTR_IV_NEXT]] = getelementptr inbounds i64, ptr [[PTR_IV]], i64 1
 ; COST-NEXT:    [[EC:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[END]]
-; COST-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP9:![0-9]+]]
+; COST-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP15:![0-9]+]]
 ; COST:       [[EXIT]]:
 ; COST-NEXT:    ret void
 ;
@@ -1116,9 +1227,51 @@ define void @br_under_switch_default_common_dest_with_case(ptr %start, ptr %end,
 ; COST-LABEL: define void @br_under_switch_default_common_dest_with_case(
 ; COST-SAME: ptr [[START:%.*]], ptr [[END:%.*]], i64 [[X:%.*]]) #[[ATTR0]] {
 ; COST-NEXT:  [[ENTRY:.*]]:
+; COST-NEXT:    [[START2:%.*]] = ptrtoint ptr [[START]] to i64
+; COST-NEXT:    [[END1:%.*]] = ptrtoint ptr [[END]] to i64
+; COST-NEXT:    [[TMP0:%.*]] = add i64 [[END1]], -8
+; COST-NEXT:    [[TMP1:%.*]] = sub i64 [[TMP0]], [[START2]]
+; COST-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP1]], 3
+; COST-NEXT:    [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1
+; COST-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 4
+; COST-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; COST:       [[VECTOR_PH]]:
+; COST-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 4
+; COST-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]]
+; COST-NEXT:    [[TMP4:%.*]] = mul i64 [[N_VEC]], 8
+; COST-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP4]]
+; COST-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[X]], i64 0
+; COST-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
 ; COST-NEXT:    br label %[[LOOP_HEADER:.*]]
 ; COST:       [[LOOP_HEADER]]:
-; COST-NEXT:    [[PTR_IV:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
+; COST-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[LOOP_HEADER]] ]
+; COST-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8
+; COST-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[OFFSET_IDX]]
+; COST-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[NEXT_GEP]], align 1
+; COST-NEXT:    [[TMP6:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], splat (i64 -12)
+; COST-NEXT:    [[TMP7:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], splat (i64 13)
+; COST-NEXT:    [[TMP8:%.*]] = or <4 x i1> [[TMP6]], [[TMP7]]
+; COST-NEXT:    [[TMP9:%.*]] = xor <4 x i1> [[TMP8]], splat (i1 true)
+; COST-NEXT:    [[TMP10:%.*]] = icmp ule <4 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; COST-NEXT:    [[TMP11:%.*]] = xor <4 x i1> [[TMP10]], splat (i1 true)
+; COST-NEXT:    [[TMP12:%.*]] = select <4 x i1> [[TMP6]], <4 x i1> [[TMP11]], <4 x i1> zeroinitializer
+; COST-NEXT:    [[TMP13:%.*]] = or <4 x i1> [[TMP12]], [[TMP7]]
+; COST-NEXT:    call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr align 1 [[NEXT_GEP]], <4 x i1> [[TMP13]])
+; COST-NEXT:    [[TMP14:%.*]] = select <4 x i1> [[TMP6]], <4 x i1> [[TMP10]], <4 x i1> zeroinitializer
+; COST-NEXT:    call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 42), ptr align 1 [[NEXT_GEP]], <4 x i1> [[TMP14]])
+; COST-NEXT:    [[TMP15:%.*]] = or <4 x i1> [[TMP14]], [[TMP9]]
+; COST-NEXT:    call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 2), ptr align 1 [[NEXT_GEP]], <4 x i1> [[TMP15]])
+; COST-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; COST-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; COST-NEXT:    br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP16:![0-9]+]]
+; COST:       [[MIDDLE_BLOCK]]:
+; COST-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
+; COST-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; COST:       [[SCALAR_PH]]:
+; COST-NEXT:    [[BC_RESUME_VAL:%.*]] = phi ptr [ [[TMP5]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[ENTRY]] ]
+; COST-NEXT:    br label %[[LOOP_HEADER1:.*]]
+; COST:       [[LOOP_HEADER1]]:
+; COST-NEXT:    [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
 ; COST-NEXT:    [[L:%.*]] = load i64, ptr [[PTR_IV]], align 1
 ; COST-NEXT:    switch i64 [[L]], label %[[DEFAULT:.*]] [
 ; COST-NEXT:      i64 -12, label %[[IF_THEN_1:.*]]
@@ -1140,7 +1293,7 @@ define void @br_under_switch_default_common_dest_with_case(ptr %start, ptr %end,
 ; COST:       [[LOOP_LATCH]]:
 ; COST-NEXT:    [[PTR_IV_NEXT]] = getelementptr inbounds i64, ptr [[PTR_IV]], i64 1
 ; COST-NEXT:    [[EC:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[END]]
-; COST-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP_HEADER]]
+; COST-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER1]], !llvm.loop [[LOOP17:![0-9]+]]
 ; COST:       [[EXIT]]:
 ; COST-NEXT:    ret void
 ;
@@ -1433,6 +1586,14 @@ exit:
 ; COST: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
 ; COST: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
 ; COST: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]}
+; COST: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]}
+; COST: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]}
+; COST: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]}
+; COST: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]}
+; COST: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]], [[META2]]}
+; COST: [[LOOP15]] = distinct !{[[LOOP15]], [[META2]], [[META1]]}
+; COST: [[LOOP16]] = distinct !{[[LOOP16]], [[META1]], [[META2]]}
+; COST: [[LOOP17]] = distinct !{[[LOOP17]], [[META2]], [[META1]]}
 ;.
 ; FORCED: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
 ; FORCED: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/replicate-uniform-call.ll b/llvm/test/Transforms/LoopVectorize/X86/replicate-uniform-call.ll
index 611a097024134..53c6dccddc710 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/replicate-uniform-call.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/replicate-uniform-call.ll
@@ -8,26 +8,49 @@ target triple = "x86_64-unknown-linux-gnu"
 define void @smax_call_uniform(ptr %dst, i64 %x) {
 ; CHECK-LABEL: define void @smax_call_uniform(
 ; CHECK-SAME: ptr [[DST:%.*]], i64 [[X:%.*]]) {
-; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[C:%.*]] = icmp ult i8 -68, -69
 ; CHECK-NEXT:    [[MUL:%.*]] = mul nuw nsw i64 [[X]], 0
 ; CHECK-NEXT:    br label %[[LOOP_HEADER:.*]]
 ; CHECK:       [[LOOP_HEADER]]:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
-; CHECK-NEXT:    br i1 [[C]], label %[[LOOP_LATCH]], label %[[ELSE:.*]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i1> poison, i1 [[C]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT]], <2 x i1> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP0:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[LOOP_HEADER]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_UREM_CONTINUE6:.*]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i1> [[TMP0]], i32 0
+; CHECK-NEXT:    br i1 [[TMP1]], label %[[ELSE:.*]], label %[[LOOP_LATCH:.*]]
 ; CHECK:       [[ELSE]]:
-; CHECK-NEXT:    [[REM:%.*]] = urem i64 [[MUL]], [[X]]
-; CHECK-NEXT:    [[SMAX:%.*]] = tail call i64 @llvm.smax.i64(i64 [[REM]], i64 0)
 ; CHECK-NEXT:    br label %[[LOOP_LATCH]]
 ; CHECK:       [[LOOP_LATCH]]:
-; CHECK-NEXT:    [[PREDPHI7:%.*]] = phi i64 [ 1, %[[LOOP_HEADER]] ], [ [[SMAX]], %[[ELSE]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i1> [[TMP0]], i32 1
+; CHECK-NEXT:    br i1 [[TMP2]], label %[[PRED_UREM_IF1:.*]], label %[[PRED_UREM_CONTINUE2:.*]]
+; CHECK:       [[PRED_UREM_IF1]]:
+; CHECK-NEXT:    br label %[[PRED_UREM_CONTINUE2]]
+; CHECK:       [[PRED_UREM_CONTINUE2]]:
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i1> [[TMP0]], i32 0
+; CHECK-NEXT:    br i1 [[TMP3]], label %[[PRED_UREM_IF3:.*]], label %[[PRED_UREM_CONTINUE4:.*]]
+; CHECK:       [[PRED_UREM_IF3]]:
+; CHECK-NEXT:    br label %[[PRED_UREM_CONTINUE4]]
+; CHECK:       [[PRED_UREM_CONTINUE4]]:
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i1> [[TMP0]], i32 1
+; CHECK-NEXT:    br i1 [[TMP4]], label %[[PRED_UREM_IF5:.*]], label %[[PRED_UREM_CONTINUE6]]
+; CHECK:       [[PRED_UREM_IF5]]:
+; CHECK-NEXT:    br label %[[PRED_UREM_CONTINUE6]]
+; CHECK:       [[PRED_UREM_CONTINUE6]]:
+; CHECK-NEXT:    [[TMP5:%.*]] = tail call i64 @llvm.smax.i64(i64 0, i64 0)
+; CHECK-NEXT:    [[PREDPHI7:%.*]] = select i1 [[C]], i64 1, i64 [[TMP5]]
 ; CHECK-NEXT:    [[TMP17:%.*]] = add i64 [[PREDPHI7]], 1
 ; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP17]]
 ; CHECK-NEXT:    store i64 0, ptr [[TMP19]], align 8
-; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[IV]], 1
+; CHECK-NEXT:    store i64 0, ptr [[TMP19]], align 8
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-NEXT:    br i1 [[TMP20]], label %[[EXIT:.*]], label %[[LOOP_HEADER]]
+; CHECK-NEXT:    br i1 [[TMP20]], label %[[EXIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    br label %[[EXIT1:.*]]
+; CHECK:       [[EXIT1]]:
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -58,3 +81,8 @@ exit:
 }
 
 declare i64 @llvm.smax.i64(i64, i64)
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+;.
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/pr48844-br-to-switch-vectorization.ll b/llvm/test/Transforms/PhaseOrdering/X86/pr48844-br-to-switch-vectorization.ll
index dcfebe32302be..d8ce1d9be901e 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/pr48844-br-to-switch-vectorization.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/pr48844-br-to-switch-vectorization.ll
@@ -12,8 +12,86 @@ define dso_local void @test(ptr %start, ptr %end) #0 {
 ; AVX-NEXT:  entry:
 ; AVX-NEXT:    [[I11_NOT1:%.*]] = icmp eq ptr [[START:%.*]], [[END:%.*]]
 ; AVX-NEXT:    br i1 [[I11_NOT1]], label [[EXIT:%.*]], label [[BB12:%.*]]
+; AVX:       iter.check:
+; AVX-NEXT:    [[END3:%.*]] = ptrtoint ptr [[END]] to i64
+; AVX-NEXT:    [[START4:%.*]] = ptrtoint ptr [[START]] to i64
+; AVX-NEXT:    [[TMP0:%.*]] = add i64 [[END3]], -4
+; AVX-NEXT:    [[TMP1:%.*]] = sub i64 [[TMP0]], [[START4]]
+; AVX-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP1]], 2
+; AVX-NEXT:    [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1
+; AVX-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 28
+; AVX-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[BB12_PREHEADER:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
+; AVX:       vector.main.loop.iter.check:
+; AVX-NEXT:    [[MIN_ITERS_CHECK5:%.*]] = icmp ult i64 [[TMP1]], 124
+; AVX-NEXT:    br i1 [[MIN_ITERS_CHECK5]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
+; AVX:       vector.ph:
+; AVX-NEXT:    [[N_MOD_VF:%.*]] = and i64 [[TMP3]], 24
+; AVX-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP3]], 9223372036854775776
+; AVX-NEXT:    br label [[VECTOR_BODY:%.*]]
+; AVX:       vector.body:
+; AVX-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; AVX-NEXT:    [[TMP4:%.*]] = shl i64 [[INDEX]], 2
+; AVX-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP4]]
+; AVX-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 32
+; AVX-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 64
+; AVX-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 96
+; AVX-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[NEXT_GEP]], align 4
+; AVX-NEXT:    [[WIDE_LOAD6:%.*]] = load <8 x i32>, ptr [[TMP5]], align 4
+; AVX-NEXT:    [[WIDE_LOAD7:%.*]] = load <8 x i32>, ptr [[TMP6]], align 4
+; AVX-NEXT:    [[WIDE_LOAD8:%.*]] = load <8 x i32>, ptr [[TMP7]], align 4
+; AVX-NEXT:    [[TMP8:%.*]] = icmp eq <8 x i32> [[WIDE_LOAD]], splat (i32 -12)
+; AVX-NEXT:    [[TMP9:%.*]] = icmp eq <8 x i32> [[WIDE_LOAD6]], splat (i32 -12)
+; AVX-NEXT:    [[TMP10:%.*]] = icmp eq <8 x i32> [[WIDE_LOAD7]], splat (i32 -12)
+; AVX-NEXT:    [[TMP11:%.*]] = icmp eq <8 x i32> [[WIDE_LOAD8]], splat (i32 -12)
+; AVX-NEXT:    [[TMP12:%.*]] = icmp eq <8 x i32> [[WIDE_LOAD]], splat (i32 13)
+; AVX-NEXT:    [[TMP13:%.*]] = icmp eq <8 x i32> [[WIDE_LOAD6]], splat (i32 13)
+; AVX-NEXT:    [[TMP14:%.*]] = icmp eq <8 x i32> [[WIDE_LOAD7]], splat (i32 13)
+; AVX-NEXT:    [[TMP15:%.*]] = icmp eq <8 x i32> [[WIDE_LOAD8]], splat (i32 13)
+; AVX-NEXT:    [[TMP16:%.*]] = or <8 x i1> [[TMP8]], [[TMP12]]
+; AVX-NEXT:    [[TMP17:%.*]] = or <8 x i1> [[TMP9]], [[TMP13]]
+; AVX-NEXT:    [[TMP18:%.*]] = or <8 x i1> [[TMP10]], [[TMP14]]
+; AVX-NEXT:    [[TMP19:%.*]] = or <8 x i1> [[TMP11]], [[TMP15]]
+; AVX-NEXT:    tail call void @llvm.masked.store.v8i32.p0(<8 x i32> splat (i32 42), ptr align 4 [[NEXT_GEP]], <8 x i1> [[TMP16]])
+; AVX-NEXT:    tail call void @llvm.masked.store.v8i32.p0(<8 x i32> splat (i32 42), ptr align 4 [[TMP5]], <8 x i1> [[TMP17]])
+; AVX-NEXT:    tail call void @llvm.masked.store.v8i32.p0(<8 x i32> splat (i32 42), ptr align 4 [[TMP6]], <8 x i1> [[TMP18]])
+; AVX-NEXT:    tail call void @llvm.masked.store.v8i32.p0(<8 x i32> splat (i32 42), ptr align 4 [[TMP7]], <8 x i1> [[TMP19]])
+; AVX-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
+; AVX-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; AVX-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; AVX:       middle.block:
+; AVX-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
+; AVX-NEXT:    br i1 [[CMP_N]], label [[EXIT]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
+; AVX:       vec.epilog.iter.check:
+; AVX-NEXT:    [[TMP21:%.*]] = shl i64 [[N_VEC]], 2
+; AVX-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP21]]
+; AVX-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
+; AVX-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[BB12_PREHEADER]], label [[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]]
+; AVX:       vec.epilog.ph:
+; AVX-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; AVX-NEXT:    [[N_VEC10:%.*]] = and i64 [[TMP3]], 9223372036854775800
+; AVX-NEXT:    [[TMP22:%.*]] = shl i64 [[N_VEC10]], 2
+; AVX-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP22]]
+; AVX-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
+; AVX:       vec.epilog.vector.body:
+; AVX-NEXT:    [[INDEX11:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT14:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; AVX-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX11]], 2
+; AVX-NEXT:    [[NEXT_GEP12:%.*]] = getelementptr i8, ptr [[START]], i64 [[OFFSET_IDX]]
+; AVX-NEXT:    [[WIDE_LOAD13:%.*]] = load <8 x i32>, ptr [[NEXT_GEP12]], align 4
+; AVX-NEXT:    [[TMP24:%.*]] = icmp eq <8 x i32> [[WIDE_LOAD13]], splat (i32 -12)
+; AVX-NEXT:    [[TMP25:%.*]] = icmp eq <8 x i32> [[WIDE_LOAD13]], splat (i32 13)
+; AVX-NEXT:    [[TMP26:%.*]] = or <8 x i1> [[TMP24]], [[TMP25]]
+; AVX-NEXT:    tail call void @llvm.masked.store.v8i32.p0(<8 x i32> splat (i32 42), ptr align 4 [[NEXT_GEP12]], <8 x i1> [[TMP26]])
+; AVX-NEXT:    [[INDEX_NEXT14]] = add nuw i64 [[INDEX11]], 8
+; AVX-NEXT:    [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT14]], [[N_VEC10]]
+; AVX-NEXT:    br i1 [[TMP27]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; AVX:       vec.epilog.middle.block:
+; AVX-NEXT:    [[CMP_N15:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC10]]
+; AVX-NEXT:    br i1 [[CMP_N15]], label [[EXIT]], label [[BB12_PREHEADER]]
+; AVX:       bb12.preheader:
+; AVX-NEXT:    [[PTR2_PH:%.*]] = phi ptr [ [[START]], [[BB12]] ], [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[TMP23]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; AVX-NEXT:    br label [[BB13:%.*]]
 ; AVX:       bb12:
-; AVX-NEXT:    [[PTR2:%.*]] = phi ptr [ [[PTR_NEXT:%.*]], [[LATCH:%.*]] ], [ [[START]], [[ENTRY:%.*]] ]
+; AVX-NEXT:    [[PTR2:%.*]] = phi ptr [ [[PTR_NEXT:%.*]], [[LATCH:%.*]] ], [ [[PTR2_PH]], [[BB12_PREHEADER]] ]
 ; AVX-NEXT:    [[VAL:%.*]] = load i32, ptr [[PTR2]], align 4
 ; AVX-NEXT:    switch i32 [[VAL]], label [[LATCH]] [
 ; AVX-NEXT:      i32 -12, label [[STORE:%.*]]
@@ -25,7 +103,7 @@ define dso_local void @test(ptr %start, ptr %end) #0 {
 ; AVX:       latch:
 ; AVX-NEXT:    [[PTR_NEXT]] = getelementptr inbounds nuw i8, ptr [[PTR2]], i64 4
 ; AVX-NEXT:    [[I11_NOT:%.*]] = icmp eq ptr [[PTR_NEXT]], [[END]]
-; AVX-NEXT:    br i1 [[I11_NOT]], label [[EXIT]], label [[BB12]]
+; AVX-NEXT:    br i1 [[I11_NOT]], label [[EXIT]], label [[BB13]], !llvm.loop [[LOOP5:![0-9]+]]
 ; AVX:       exit:
 ; AVX-NEXT:    ret void
 ;

>From dbc63b8065a5797a527e4c0227e70d1616e032dc Mon Sep 17 00:00:00 2001
From: bababuck <buchner.ryan at gmail.com>
Date: Wed, 4 Feb 2026 16:29:10 -0800
Subject: [PATCH 03/15] [Support] Optimize InstructionCost for compile time

---
 llvm/include/llvm/Support/InstructionCost.h | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/llvm/include/llvm/Support/InstructionCost.h b/llvm/include/llvm/Support/InstructionCost.h
index a8237694050f9..f6cc41ebb61ef 100644
--- a/llvm/include/llvm/Support/InstructionCost.h
+++ b/llvm/include/llvm/Support/InstructionCost.h
@@ -65,6 +65,8 @@ class InstructionCost {
 
   static constexpr CostType MaxValue = std::numeric_limits<CostType>::max();
   static constexpr CostType MinValue = std::numeric_limits<CostType>::min();
+  static constexpr CostType MaxInputValue = MaxValue / ScalingFactor;
+  static constexpr CostType MinInputValue = MinValue / ScalingFactor;
 
 public:
   // A default constructed InstructionCost is a valid zero cost
@@ -72,14 +74,13 @@ class InstructionCost {
 
   InstructionCost(CostState) = delete;
   InstructionCost(CostType Val) : Value(), State(Valid) {
-    InstructionCost::CostType Result;
-    if (MulOverflow(Val, ScalingFactor, Result)) {
-      if (Val > 0)
-        Result = MaxValue;
-      else
-        Result = MinValue;
-    }
-    Value = Result;
+    if (Val > MaxInputValue) [[unlikely]]
+      Val = MaxValue;
+    else if (Val < MinInputValue) [[unlikely]]
+      Val = MinValue;
+    else [[likely]]
+      Val *= ScalingFactor;
+    Value = Val;
   }
 
   static InstructionCost getMax() { return MaxValue; }
@@ -184,8 +185,7 @@ class InstructionCost {
   }
 
   InstructionCost &operator/=(const CostType RHS) {
-    InstructionCost RHS2(RHS);
-    *this /= RHS2;
+    Value /= RHS;
     return *this;
   }
 

>From f7ceb864113ea51631f8603be3a5087ca4f93988 Mon Sep 17 00:00:00 2001
From: bababuck <buchner.ryan at gmail.com>
Date: Thu, 5 Feb 2026 17:26:11 -0800
Subject: [PATCH 04/15] [utils] Adjust InstructionCost::ScalingFactor to 4 to
 optimize for compile time

Because the scaling factor is now a power of two, the multiplies and divides
used for scaling can be implemented as shifts.
---
 llvm/include/llvm/Support/InstructionCost.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/llvm/include/llvm/Support/InstructionCost.h b/llvm/include/llvm/Support/InstructionCost.h
index f6cc41ebb61ef..0b26e22340e0f 100644
--- a/llvm/include/llvm/Support/InstructionCost.h
+++ b/llvm/include/llvm/Support/InstructionCost.h
@@ -59,9 +59,8 @@ class InstructionCost {
       State = Invalid;
   }
 
-  // 120 chosen since least common factor of 2, 3, 4, 5, 6, 8
-  // which are realistic issue widths
-  static constexpr CostType ScalingFactor = 120;
+  // Matches GCC, can use shift rather than multiply/divide to scale
+  static constexpr CostType ScalingFactor = 4;
 
   static constexpr CostType MaxValue = std::numeric_limits<CostType>::max();
   static constexpr CostType MinValue = std::numeric_limits<CostType>::min();

>From 3e8bab67184e4d64a7b8dc2da206b7bb8339e5be Mon Sep 17 00:00:00 2001
From: bababuck <buchner.ryan at gmail.com>
Date: Thu, 5 Feb 2026 22:09:09 -0800
Subject: [PATCH 05/15] Update test for ScalingFactor change

---
 .../LoopCacheAnalysis/interchange-refcost-overflow.ll       | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/llvm/test/Analysis/LoopCacheAnalysis/interchange-refcost-overflow.ll b/llvm/test/Analysis/LoopCacheAnalysis/interchange-refcost-overflow.ll
index 88f265872902e..e6416605bdb30 100644
--- a/llvm/test/Analysis/LoopCacheAnalysis/interchange-refcost-overflow.ll
+++ b/llvm/test/Analysis/LoopCacheAnalysis/interchange-refcost-overflow.ll
@@ -10,9 +10,9 @@
 ;         A[c][d][e] = 0;
 ; }
 
-; CHECK: Loop 'outer.loop' has cost = 76861433640456465
-; CHECK: Loop 'middle.loop' has cost = 76861433640456465
-; CHECK: Loop 'inner.loop' has cost = 76861433640456465
+; CHECK: Loop 'outer.loop' has cost = 2305843009213693951
+; CHECK: Loop 'middle.loop' has cost = 2305843009213693951
+; CHECK: Loop 'inner.loop' has cost = 2305843009213693951
 
 @A = local_unnamed_addr global [11 x [11 x [11 x i32]]] zeroinitializer, align 16
 

>From b39bde9a95b507eb9f5c93cdd8d36b9c28cca58b Mon Sep 17 00:00:00 2001
From: bababuck <buchner.ryan at gmail.com>
Date: Mon, 9 Feb 2026 13:38:52 -0800
Subject: [PATCH 06/15] Remove compiler branch hints since they are not
 supported by C++17

---
 llvm/include/llvm/Support/InstructionCost.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/llvm/include/llvm/Support/InstructionCost.h b/llvm/include/llvm/Support/InstructionCost.h
index 0b26e22340e0f..f725612c4aaf3 100644
--- a/llvm/include/llvm/Support/InstructionCost.h
+++ b/llvm/include/llvm/Support/InstructionCost.h
@@ -73,11 +73,11 @@ class InstructionCost {
 
   InstructionCost(CostState) = delete;
   InstructionCost(CostType Val) : Value(), State(Valid) {
-    if (Val > MaxInputValue) [[unlikely]]
+    if (Val > MaxInputValue)
       Val = MaxValue;
-    else if (Val < MinInputValue) [[unlikely]]
+    else if (Val < MinInputValue)
       Val = MinValue;
-    else [[likely]]
+    else
       Val *= ScalingFactor;
     Value = Val;
   }

>From 3dc22c2f8b69b697738613d2da94538a089ce5e6 Mon Sep 17 00:00:00 2001
From: bababuck <buchner.ryan at gmail.com>
Date: Mon, 9 Feb 2026 13:39:10 -0800
Subject: [PATCH 07/15] Update PowerPC test

---
 .../Analysis/LoopCacheAnalysis/PowerPC/LoopnestFixedSize.ll     | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/LoopnestFixedSize.ll b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/LoopnestFixedSize.ll
index 5209d290c83da..0cad33a04f39c 100644
--- a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/LoopnestFixedSize.ll
+++ b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/LoopnestFixedSize.ll
@@ -83,7 +83,7 @@ for.end13:                                        ; preds = %for.inc11
 
 declare ptr @func_with_returned_arg(ptr returned %arg)
 
-; CHECK: Loop 'for.body' has cost = 2112128815104000000
+; CHECK: Loop 'for.body' has cost = 2305843009213693951
 ; CHECK-NEXT: Loop 'for.body4' has cost = 16762927104000000
 ; CHECK-NEXT: Loop 'for.body8' has cost = 130960368000000
 ; CHECK-NEXT: Loop 'for.body12' has cost = 1047682944000

>From 7d0a595445ac1834f5992cf9c1108f1e9d6914b8 Mon Sep 17 00:00:00 2001
From: bababuck <buchner.ryan at gmail.com>
Date: Wed, 25 Feb 2026 11:35:19 -0800
Subject: [PATCH 08/15] Revert "[Support] Optimize InstructionCost for compile
 time"

This reverts commit dbc63b8065a5797a527e4c0227e70d1616e032dc.
---
 llvm/include/llvm/Support/InstructionCost.h | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/llvm/include/llvm/Support/InstructionCost.h b/llvm/include/llvm/Support/InstructionCost.h
index f725612c4aaf3..c6e0f7ca38863 100644
--- a/llvm/include/llvm/Support/InstructionCost.h
+++ b/llvm/include/llvm/Support/InstructionCost.h
@@ -64,8 +64,6 @@ class InstructionCost {
 
   static constexpr CostType MaxValue = std::numeric_limits<CostType>::max();
   static constexpr CostType MinValue = std::numeric_limits<CostType>::min();
-  static constexpr CostType MaxInputValue = MaxValue / ScalingFactor;
-  static constexpr CostType MinInputValue = MinValue / ScalingFactor;
 
 public:
   // A default constructed InstructionCost is a valid zero cost
@@ -73,13 +71,14 @@ class InstructionCost {
 
   InstructionCost(CostState) = delete;
   InstructionCost(CostType Val) : Value(), State(Valid) {
-    if (Val > MaxInputValue)
-      Val = MaxValue;
-    else if (Val < MinInputValue)
-      Val = MinValue;
-    else
-      Val *= ScalingFactor;
-    Value = Val;
+    InstructionCost::CostType Result;
+    if (MulOverflow(Val, ScalingFactor, Result)) {
+      if (Val > 0)
+        Result = MaxValue;
+      else
+        Result = MinValue;
+    }
+    Value = Result;
   }
 
   static InstructionCost getMax() { return MaxValue; }

>From 975c04a5c97476ef69485b2d6006aea5bfb6dc3b Mon Sep 17 00:00:00 2001
From: bababuck <buchner.ryan at gmail.com>
Date: Wed, 25 Feb 2026 11:47:58 -0800
Subject: [PATCH 09/15] [Support] Update InstructionCost::print() to report the
 raw value and scaling factor

Previously, print() reported only Value / ScalingFactor, i.e. the cost
truncated to a whole number, so any fractional part was hidden.
---
 llvm/lib/Support/InstructionCost.cpp          |   9 +-
 .../PowerPC/LoopnestFixedSize.ll              |   2 +-
 .../interchange-refcost-overflow.ll           |   6 +-
 .../LoopVectorize/ARM/scalar-block-cost.ll    |   2 +-
 .../WebAssembly/memory-interleave.ll          | 118 +++++++++---------
 .../interleaved-load-f32-stride-3.ll          |   4 +-
 .../interleaved-load-f32-stride-5.ll          |   4 +-
 .../interleaved-load-f32-stride-7.ll          |   6 +-
 .../interleaved-load-f64-stride-3.ll          |   4 +-
 .../interleaved-load-f64-stride-5.ll          |   4 +-
 .../interleaved-load-f64-stride-7.ll          |   6 +-
 .../interleaved-load-i16-stride-3.ll          |   4 +-
 .../interleaved-load-i16-stride-5.ll          |   4 +-
 .../interleaved-load-i16-stride-7.ll          |   6 +-
 .../interleaved-load-i32-stride-3.ll          |   4 +-
 ...erleaved-load-i32-stride-4-indices-012u.ll |   6 +-
 .../interleaved-load-i32-stride-5.ll          |   4 +-
 .../interleaved-load-i32-stride-7.ll          |   6 +-
 .../interleaved-load-i64-stride-3.ll          |   4 +-
 .../interleaved-load-i64-stride-5.ll          |   4 +-
 .../interleaved-load-i64-stride-7.ll          |   6 +-
 .../CostModel/interleaved-load-i8-stride-5.ll |   4 +-
 .../CostModel/interleaved-load-i8-stride-7.ll |   6 +-
 .../CostModel/masked-interleaved-store-i16.ll |   6 +-
 .../masked-scatter-i32-with-i8-index.ll       |  12 +-
 .../masked-scatter-i64-with-i8-index.ll       |   9 +-
 .../X86/CostModel/masked-store-i16.ll         |   6 +-
 .../X86/CostModel/masked-store-i32.ll         |   6 +-
 .../X86/CostModel/masked-store-i64.ll         |   4 +-
 .../X86/CostModel/masked-store-i8.ll          |  14 +--
 30 files changed, 142 insertions(+), 138 deletions(-)

diff --git a/llvm/lib/Support/InstructionCost.cpp b/llvm/lib/Support/InstructionCost.cpp
index 6b4eb6d2f1ed6..36ea2a5bed925 100644
--- a/llvm/lib/Support/InstructionCost.cpp
+++ b/llvm/lib/Support/InstructionCost.cpp
@@ -17,8 +17,11 @@
 using namespace llvm;
 
 void InstructionCost::print(raw_ostream &OS) const {
-  if (isValid())
-    OS << (Value / ScalingFactor);
-  else
+  if (isValid()) {
+    if (Value % ScalingFactor)
+      OS << Value << "/" << ScalingFactor;
+    else
+      OS << (Value / ScalingFactor);
+  } else
     OS << "Invalid";
 }
diff --git a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/LoopnestFixedSize.ll b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/LoopnestFixedSize.ll
index 0cad33a04f39c..d8e67acf6b48c 100644
--- a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/LoopnestFixedSize.ll
+++ b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/LoopnestFixedSize.ll
@@ -83,7 +83,7 @@ for.end13:                                        ; preds = %for.inc11
 
 declare ptr @func_with_returned_arg(ptr returned %arg)
 
-; CHECK: Loop 'for.body' has cost = 2305843009213693951
+; CHECK: Loop 'for.body' has cost = 9223372036854775807/4
 ; CHECK-NEXT: Loop 'for.body4' has cost = 16762927104000000
 ; CHECK-NEXT: Loop 'for.body8' has cost = 130960368000000
 ; CHECK-NEXT: Loop 'for.body12' has cost = 1047682944000
diff --git a/llvm/test/Analysis/LoopCacheAnalysis/interchange-refcost-overflow.ll b/llvm/test/Analysis/LoopCacheAnalysis/interchange-refcost-overflow.ll
index e6416605bdb30..edd3246bdb019 100644
--- a/llvm/test/Analysis/LoopCacheAnalysis/interchange-refcost-overflow.ll
+++ b/llvm/test/Analysis/LoopCacheAnalysis/interchange-refcost-overflow.ll
@@ -10,9 +10,9 @@
 ;         A[c][d][e] = 0;
 ; }
 
-; CHECK: Loop 'outer.loop' has cost = 2305843009213693951
-; CHECK: Loop 'middle.loop' has cost = 2305843009213693951
-; CHECK: Loop 'inner.loop' has cost = 2305843009213693951
+; CHECK: Loop 'outer.loop' has cost = 9223372036854775807/4
+; CHECK: Loop 'middle.loop' has cost = 9223372036854775807/4
+; CHECK: Loop 'inner.loop' has cost = 9223372036854775807/4
 
 @A = local_unnamed_addr global [11 x [11 x [11 x i32]]] zeroinitializer, align 16
 
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/scalar-block-cost.ll b/llvm/test/Transforms/LoopVectorize/ARM/scalar-block-cost.ll
index d0c11946c9deb..0d26e20bb0e80 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/scalar-block-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/scalar-block-cost.ll
@@ -57,7 +57,7 @@ define void @if_convert(ptr %a, ptr %b, i32 %start, i32 %end) #0 {
 ; CHECK-COST-2-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction:   %inc = add nsw i32 %i.032, 1
 ; CHECK-COST-2-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction:   %exitcond.not = icmp eq i32 %inc, %end
 ; CHECK-COST-2-NEXT: LV: Found an estimated cost of 0 for VF 1 For instruction:   br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
-; CHECK-COST-2-NEXT: LV: Scalar loop costs: 8.
+; CHECK-COST-2-NEXT: LV: Scalar loop costs: 34/4.
 
 entry:
   %cmp31 = icmp slt i32 %start, %end
diff --git a/llvm/test/Transforms/LoopVectorize/WebAssembly/memory-interleave.ll b/llvm/test/Transforms/LoopVectorize/WebAssembly/memory-interleave.ll
index 54cbab78b1e29..e64b43c13b110 100644
--- a/llvm/test/Transforms/LoopVectorize/WebAssembly/memory-interleave.ll
+++ b/llvm/test/Transforms/LoopVectorize/WebAssembly/memory-interleave.ll
@@ -22,7 +22,7 @@ target triple = "wasm32-unknown-wasi"
 ; CHECK: Cost of 7 for VF 2: INTERLEAVE-GROUP with factor 2 at %10
 ; CHECK: Cost of 6 for VF 4: INTERLEAVE-GROUP with factor 2 at %10
 ; CHECK: LV: Scalar loop costs: 12.
-; CHECK: LV: Vector loop of width 2 costs: 13.
+; CHECK: LV: Vector loop of width 2 costs: 54/4.
 ; CHECK: LV: Vector loop of width 4 costs: 6.
 ; CHECK: LV: Selecting VF: 4
 define hidden void @two_ints_same_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
@@ -57,7 +57,7 @@ define hidden void @two_ints_same_op(ptr noalias nocapture noundef writeonly %0,
 ; CHECK: Cost of 7 for VF 2: INTERLEAVE-GROUP with factor 2 at %10
 ; CHECK: Cost of 6 for VF 4: INTERLEAVE-GROUP with factor 2 at %10
 ; CHECK: LV: Scalar loop costs: 12.
-; CHECK: LV: Vector loop of width 2 costs: 13.
+; CHECK: LV: Vector loop of width 2 costs: 54/4.
 ; CHECK: LV: Vector loop of width 4 costs: 6.
 ; CHECK: LV: Selecting VF: 4
 define hidden void @two_ints_vary_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
@@ -93,11 +93,11 @@ define hidden void @two_ints_vary_op(ptr noalias nocapture noundef writeonly %0,
 ; CHECK: LV: Found an estimated cost of 6 for VF 2 For instruction: %10 = load i32, ptr %9
 ; CHECK: LV: Found an estimated cost of 6 for VF 2 For instruction: %12 = load i32, ptr %11
 ; CHECK: LV: Found an estimated cost of 6 for VF 2 For instruction: store i32 %25, ptr %26
-; CHECK: LV: Vector loop of width 2 costs: 30.
+; CHECK: LV: Vector loop of width 2 costs: 122/4.
 ; CHECK: LV: Found an estimated cost of 12 for VF 4 For instruction: %10 = load i32, ptr %9
 ; CHECK: LV: Found an estimated cost of 12 for VF 4 For instruction: %12 = load i32, ptr %11
 ; CHECK: LV: Found an estimated cost of 12 for VF 4 For instruction: store i32 %25, ptr %26
-; CHECK: LV: Vector loop of width 4 costs: 28.
+; CHECK: LV: Vector loop of width 4 costs: 115/4.
 ; CHECK: LV: Selecting VF: 1
 define hidden void @three_ints(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
   %5 = icmp eq i32 %3, 0
@@ -139,15 +139,15 @@ define hidden void @three_ints(ptr noalias nocapture noundef writeonly %0, ptr n
 ; CHECK: LV: Found an estimated cost of 6 for VF 2 For instruction: %10 = load i16
 ; CHECK: LV: Found an estimated cost of 6 for VF 2 For instruction: %12 = load i16
 ; CHECK: LV: Found an estimated cost of 6 for VF 2 For instruction: store i16 %25
-; CHECK: LV: Vector loop of width 2 costs: 30.
+; CHECK: LV: Vector loop of width 2 costs: 122/4.
 ; CHECK: LV: Found an estimated cost of 12 for VF 4 For instruction: %10 = load i16
 ; CHECK: LV: Found an estimated cost of 12 for VF 4 For instruction: %12 = load i16
 ; CHECK: LV: Found an estimated cost of 12 for VF 4 For instruction: store i16 %25
-; CHECK: LV: Vector loop of width 4 costs: 28.
+; CHECK: LV: Vector loop of width 4 costs: 115/4.
 ; CHECK: LV: Found an estimated cost of 24 for VF 8 For instruction: %10 = load i16
 ; CHECK: LV: Found an estimated cost of 24 for VF 8 For instruction: %12 = load i16
 ; CHECK: LV: Found an estimated cost of 24 for VF 8 For instruction: store i16 %25
-; CHECK: LV: Vector loop of width 8 costs: 27.
+; CHECK: LV: Vector loop of width 8 costs: 111/4.
 ; CHECK: LV: Selecting VF: 1
 define hidden void @three_shorts(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
   %5 = icmp eq i32 %3, 0
@@ -197,11 +197,11 @@ define hidden void @three_shorts(ptr noalias nocapture noundef writeonly %0, ptr
 ; CHECK: LV: Found an estimated cost of 18 for VF 4 For instruction: %10 = load i16
 ; CHECK: LV: Found an estimated cost of 18 for VF 4 For instruction: %12 = load i16
 ; CHECK: LV: Found an estimated cost of 18 for VF 4 For instruction: store i16
-; CHECK: LV: Vector loop of width 4 costs: 15.
+; CHECK: LV: Vector loop of width 4 costs: 62/4.
 ; CHECK: LV: Found an estimated cost of 68 for VF 8 For instruction: %10 = load i16
 ; CHECK: LV: Found an estimated cost of 68 for VF 8 For instruction: %12 = load i16
 ; CHECK: LV: Found an estimated cost of 68 for VF 8 For instruction: store i16
-; CHECK: LV: Vector loop of width 8 costs: 26
+; CHECK: LV: Vector loop of width 8 costs: 106/4
 ; CHECK: LV: Selecting VF: 4
 define hidden void @four_shorts_same_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
   %5 = icmp eq i32 %3, 0
@@ -257,11 +257,11 @@ define hidden void @four_shorts_same_op(ptr noalias nocapture noundef writeonly
 ; CHECK: LV: Found an estimated cost of 18 for VF 4 For instruction: %10 = load i16
 ; CHECK: LV: Found an estimated cost of 18 for VF 4 For instruction: %12 = load i16
 ; CHECK: LV: Found an estimated cost of 18 for VF 4 For instruction: store i16 %31
-; CHECK: LV: Vector loop of width 4 costs: 15.
+; CHECK: LV: Vector loop of width 4 costs: 62/4.
 ; CHECK: LV: Found an estimated cost of 68 for VF 8 For instruction: %10 = load i16
 ; CHECK: LV: Found an estimated cost of 68 for VF 8 For instruction: %12 = load i16
 ; CHECK: LV: Found an estimated cost of 68 for VF 8 For instruction: store i16 %31
-; CHECK: LV: Vector loop of width 8 costs: 26.
+; CHECK: LV: Vector loop of width 8 costs: 106/4.
 ; CHECK: LV: Selecting VF: 4
 define hidden void @four_shorts_split_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
   %5 = icmp eq i32 %3, 0
@@ -317,11 +317,11 @@ define hidden void @four_shorts_split_op(ptr noalias nocapture noundef writeonly
 ; CHECK: LV: Found an estimated cost of 18 for VF 4 For instruction: %10 = load i16
 ; CHECK: LV: Found an estimated cost of 18 for VF 4 For instruction: %12 = load i16
 ; CHECK: LV: Found an estimated cost of 18 for VF 4 For instruction: store i16
-; CHECK: LV: Vector loop of width 4 costs: 15.
+; CHECK: LV: Vector loop of width 4 costs: 62/4.
 ; CHECK: LV: Found an estimated cost of 68 for VF 8 For instruction: %10 = load i16
 ; CHECK: LV: Found an estimated cost of 68 for VF 8 For instruction: %12 = load i16
 ; CHECK: LV: Found an estimated cost of 68 for VF 8 For instruction: store i16
-; CHECK: LV: Vector loop of width 8 costs: 26.
+; CHECK: LV: Vector loop of width 8 costs: 106/4.
 ; CHECK: LV: Selecting VF: 4
 define hidden void @four_shorts_interleave_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
   %5 = icmp eq i32 %3, 0
@@ -371,7 +371,7 @@ define hidden void @four_shorts_interleave_op(ptr noalias nocapture noundef writ
 ; CHECK: LV: Found an estimated cost of 84 for VF 8 For instruction: %10 = load i8
 ; CHECK: LV: Found an estimated cost of 84 for VF 8 For instruction: %12 = load i8
 ; CHECK: LV: Found an estimated cost of 84 for VF 8 For instruction: store i8 %37
-; CHECK: LV: Vector loop of width 8 costs: 32
+; CHECK: LV: Vector loop of width 8 costs: 130/4
 ; CHECK: LV: Found an estimated cost of 168 for VF 16 For instruction: %10 = load i8
 ; CHECK: LV: Found an estimated cost of 168 for VF 16 For instruction: %12 = load i8
 ; CHECK: LV: Found an estimated cost of 168 for VF 16 For instruction: store i8 %37
@@ -438,7 +438,7 @@ define hidden void @five_shorts(ptr noalias nocapture noundef writeonly %0, ptr
 ; CHECK: LV: Found an estimated cost of 11 for VF 4 For instruction: %10 = load i8
 ; CHECK: LV: Found an estimated cost of 12 for VF 4 For instruction: %13 = mul i8
 ; CHECK: LV: Found an estimated cost of 11 for VF 4 For instruction: store i8
-; CHECK: LV: Vector loop of width 4 costs: 15.
+; CHECK: LV: Vector loop of width 4 costs: 61/4.
 ; CHECK: LV: Found an estimated cost of 7 for VF 8 For instruction: %12 = load i8
 ; CHECK: LV: Found an estimated cost of 4 for VF 8 For instruction: %13 = mul i8
 ; CHECK: LV: Found an estimated cost of 7 for VF 8 For instruction: store i8
@@ -446,7 +446,7 @@ define hidden void @five_shorts(ptr noalias nocapture noundef writeonly %0, ptr
 ; CHECK: LV: Found an estimated cost of 6 for VF 16 For instruction: %12 = load i8
 ; CHECK: LV: Found an estimated cost of 4 for VF 16 For instruction: %13 = mul i8
 ; CHECK: LV: Found an estimated cost of 6 for VF 16 For instruction: store i8
-; CHECK: LV: Vector loop of width 16 costs: 1.
+; CHECK: LV: Vector loop of width 16 costs: 7/4.
 ; CHECK: LV: Selecting VF: 16.
 define hidden void @two_bytes_same_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
   %5 = icmp eq i32 %3, 0
@@ -484,19 +484,19 @@ define hidden void @two_bytes_same_op(ptr noalias nocapture noundef writeonly %0
 ; CHECK: LV: Found an estimated cost of 6 for VF 2 For instruction: %12 = load i8
 ; CHECK: LV: Found an estimated cost of 6 for VF 2 For instruction: %13 = mul i8
 ; CHECK: LV: Found an estimated cost of 6 for VF 2 For instruction: store i8 %13
-; CHECK: LV: Vector loop of width 2 costs: 23.
+; CHECK: LV: Vector loop of width 2 costs: 94/4.
 ; CHECK: LV: Found an estimated cost of 11 for VF 4 For instruction: %10 = load i8
 ; CHECK: LV: Found an estimated cost of 12 for VF 4 For instruction: %13 = mul i8
 ; CHECK: LV: Found an estimated cost of 11 for VF 4 For instruction: store i8
-; CHECK: LV: Vector loop of width 4 costs: 12.
+; CHECK: LV: Vector loop of width 4 costs: 50/4.
 ; CHECK: LV: Found an estimated cost of 7 for VF 8 For instruction: %12 = load i8
 ; CHECK: LV: Found an estimated cost of 4 for VF 8 For instruction: %13 = mul i8
 ; CHECK: LV: Found an estimated cost of 7 for VF 8 For instruction: store i8
-; CHECK: LV: Vector loop of width 8 costs: 3.
+; CHECK: LV: Vector loop of width 8 costs: 15/4.
 ; CHECK: LV: Found an estimated cost of 6 for VF 16 For instruction: %12 = load i8
 ; CHECK: LV: Found an estimated cost of 4 for VF 16 For instruction: %13 = mul i8
 ; CHECK: LV: Found an estimated cost of 6 for VF 16 For instruction: store i8 %19
-; CHECK: LV: Vector loop of width 16 costs: 1.
+; CHECK: LV: Vector loop of width 16 costs: 6/4.
 ; CHECK: LV: Selecting VF: 16.
 define hidden void @two_bytes_vary_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
   %5 = icmp eq i32 %3, 0
@@ -528,16 +528,16 @@ define hidden void @two_bytes_vary_op(ptr noalias nocapture noundef writeonly %0
 
 ; CHECK-LABEL: three_bytes_same_op
 ; CHECK: LV: Scalar loop costs: 16.
-; CHECK: LV: Vector loop of width 2 costs: 30.
-; CHECK: LV: Vector loop of width 4 costs: 28.
+; CHECK: LV: Vector loop of width 2 costs: 122/4.
+; CHECK: LV: Vector loop of width 4 costs: 115/4.
 ; CHECK: LV: Found an estimated cost of 24 for VF 8 For instruction: %10 = load i8, ptr %9
 ; CHECK: LV: Found an estimated cost of 24 for VF 8 For instruction: %12 = load i8, ptr %11
 ; CHECK: LV: Found an estimated cost of 24 for VF 8 For instruction: store i8 %25
-; CHECK: LV: Vector loop of width 8 costs: 27.
+; CHECK: LV: Vector loop of width 8 costs: 111/4.
 ; CHECK: LV: Found an estimated cost of 48 for VF 16 For instruction: %10 = load i8, ptr %9
 ; CHECK: LV: Found an estimated cost of 48 for VF 16 For instruction: %12 = load i8, ptr %11
 ; CHECK: LV: Found an estimated cost of 48 for VF 16 For instruction: store i8 %25
-; CHECK: LV: Vector loop of width 16 costs: 27.
+; CHECK: LV: Vector loop of width 16 costs: 109/4.
 ; CHECK: LV: Selecting VF: 1.
 define hidden void @three_bytes_same_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
   %5 = icmp eq i32 %3, 0
@@ -576,16 +576,16 @@ define hidden void @three_bytes_same_op(ptr noalias nocapture noundef writeonly
 
 ; CHECK-LABEL: three_bytes_interleave_op
 ; CHECK: LV: Scalar loop costs: 16.
-; CHECK: LV: Vector loop of width 2 costs: 30.
-; CHECK: LV: Vector loop of width 4 costs: 28.
+; CHECK: LV: Vector loop of width 2 costs: 122/4.
+; CHECK: LV: Vector loop of width 4 costs: 115/4.
 ; CHECK: LV: Found an estimated cost of 24 for VF 8 For instruction: %10 = load i8, ptr %9
 ; CHECK: LV: Found an estimated cost of 24 for VF 8 For instruction: %12 = load i8, ptr %11
 ; CHECK: LV: Found an estimated cost of 24 for VF 8 For instruction: store i8 %25
-; CHECK: LV: Vector loop of width 8 costs: 27.
+; CHECK: LV: Vector loop of width 8 costs: 111/4.
 ; CHECK: LV: Found an estimated cost of 48 for VF 16 For instruction: %10 = load i8, ptr %9
 ; CHECK: LV: Found an estimated cost of 48 for VF 16 For instruction: %12 = load i8, ptr %11
 ; CHECK: LV: Found an estimated cost of 48 for VF 16 For instruction: store i8 %25
-; CHECK: LV: Vector loop of width 16 costs: 27.
+; CHECK: LV: Vector loop of width 16 costs: 109/4.
 ; CHECK: LV: Selecting VF: 1.
 define hidden void @three_bytes_interleave_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
   %5 = icmp eq i32 %3, 0
@@ -631,15 +631,15 @@ define hidden void @three_bytes_interleave_op(ptr noalias nocapture noundef writ
 ; CHECK: LV: Found an estimated cost of 18 for VF 4 For instruction: %10 = load i8
 ; CHECK: LV: Found an estimated cost of 18 for VF 4 For instruction: %12 = load i8
 ; CHECK: LV: Found an estimated cost of 18 for VF 4 For instruction: store i8
-; CHECK: LV: Vector loop of width 4 costs: 15.
+; CHECK: LV: Vector loop of width 4 costs: 62/4.
 ; CHECK: LV: Found an estimated cost of 26 for VF 8 For instruction: %10 = load i8
 ; CHECK: LV: Found an estimated cost of 26 for VF 8 For instruction: %12 = load i8
 ; CHECK: LV: Found an estimated cost of 26 for VF 8 For instruction: store i8
-; CHECK: LV: Vector loop of width 8 costs: 10.
+; CHECK: LV: Vector loop of width 8 costs: 43/4.
 ; CHECK: LV: Found an estimated cost of 132 for VF 16 For instruction: %10 = load i8
 ; CHECK: LV: Found an estimated cost of 132 for VF 16 For instruction: %12 = load i8
 ; CHECK: LV: Found an estimated cost of 132 for VF 16 For instruction: store i8
-; CHECK: LV: Vector loop of width 16 costs: 25.
+; CHECK: LV: Vector loop of width 16 costs: 101/4.
 ; CHECK: LV: Selecting VF: 8.
 define hidden void @four_bytes_same_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
   %5 = icmp eq i32 %3, 0
@@ -698,12 +698,12 @@ define hidden void @four_bytes_same_op(ptr noalias nocapture noundef writeonly %
 ; CHECK: LV: Found an estimated cost of 26 for VF 8 For instruction: %12 = load i8
 ; CHECK: LV: Found an estimated cost of 4 for VF 8 For instruction: %13 = mul i8
 ; CHECK: LV: Found an estimated cost of 26 for VF 8 For instruction: store i8
-; CHECK: LV: Vector loop of width 8 costs: 11.
+; CHECK: LV: Vector loop of width 8 costs: 46/4.
 ; CHECK: LV: Found an estimated cost of 132 for VF 16 For instruction: %10 = load i8
 ; CHECK: LV: Found an estimated cost of 132 for VF 16 For instruction: %12 = load i8
 ; CHECK: LV: Found an estimated cost of 4 for VF 16 For instruction: %13 = mul i8
 ; CHECK: LV: Found an estimated cost of 132 for VF 16 For instruction: store i8
-; CHECK: LV: Vector loop of width 16 costs: 25
+; CHECK: LV: Vector loop of width 16 costs: 102/4
 ; CHECK: LV: Selecting VF: 8
 define hidden void @four_bytes_split_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
   %5 = icmp eq i32 %3, 0
@@ -757,15 +757,15 @@ define hidden void @four_bytes_split_op(ptr noalias nocapture noundef writeonly
 ; CHECK: LV: Found an estimated cost of 18 for VF 4 For instruction: %10 = load i8
 ; CHECK: LV: Found an estimated cost of 18 for VF 4 For instruction: %12 = load i8
 ; CHECK: LV: Found an estimated cost of 18 for VF 4 For instruction: store i8
-; CHECK: LV: Vector loop of width 4 costs: 15
+; CHECK: LV: Vector loop of width 4 costs: 62/4
 ; CHECK: LV: Found an estimated cost of 26 for VF 8 For instruction: %10 = load i8
 ; CHECK: LV: Found an estimated cost of 26 for VF 8 For instruction: %12 = load i8
 ; CHECK: LV: Found an estimated cost of 26 for VF 8 For instruction: store i8
-; CHECK: LV: Vector loop of width 8 costs: 10
+; CHECK: LV: Vector loop of width 8 costs: 43/4
 ; CHECK: LV: Found an estimated cost of 132 for VF 16 For instruction: %10 = load i8
 ; CHECK: LV: Found an estimated cost of 132 for VF 16 For instruction: %12 = load i8
 ; CHECK: LV: Found an estimated cost of 132 for VF 16 For instruction: store i8
-; CHECK: LV: Vector loop of width 16 costs: 25
+; CHECK: LV: Vector loop of width 16 costs: 101/4
 ; CHECK: LV: Selecting VF: 8
 define hidden void @four_bytes_interleave_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
   %5 = icmp eq i32 %3, 0
@@ -817,7 +817,7 @@ define hidden void @four_bytes_interleave_op(ptr noalias nocapture noundef write
 ; CHECK: LV: Found an estimated cost of 66 for VF 4 For instruction: %10 = load i8
 ; CHECK: LV: Found an estimated cost of 66 for VF 4 For instruction: %12 = load i8
 ; CHECK: LV: Found an estimated cost of 66 for VF 4 For instruction: store i8 %55
-; CHECK: LV: Vector loop of width 4 costs: 74
+; CHECK: LV: Vector loop of width 4 costs: 298/4
 ; CHECK: LV: Found an estimated cost of 132 for VF 8 For instruction: %10 = load i8
 ; CHECK: LV: Found an estimated cost of 132 for VF 8 For instruction: %12 = load i8
 ; CHECK: LV: Found an estimated cost of 132 for VF 8 For instruction: store i8 %55
@@ -825,7 +825,7 @@ define hidden void @four_bytes_interleave_op(ptr noalias nocapture noundef write
 ; CHECK: LV: Found an estimated cost of 264 for VF 16 For instruction: %10 = load i8
 ; CHECK: LV: Found an estimated cost of 264 for VF 16 For instruction: %12 = load i8
 ; CHECK: LV: Found an estimated cost of 264 for VF 16 For instruction: store i8 %55
-; CHECK: LV: Vector loop of width 16 costs: 51
+; CHECK: LV: Vector loop of width 16 costs: 207/4
 ; CHECK: LV: Selecting VF: 1
 define hidden void @eight_bytes_same_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
   %5 = icmp eq i32 %3, 0
@@ -901,7 +901,7 @@ define hidden void @eight_bytes_same_op(ptr noalias nocapture noundef writeonly
 ; CHECK: LV: Found an estimated cost of 264 for VF 16 For instruction: %10 = load i8
 ; CHECK: LV: Found an estimated cost of 264 for VF 16 For instruction: %12 = load i8
 ; CHECK: LV: Found an estimated cost of 264 for VF 16 For instruction: store i8 %55
-; CHECK: LV: Vector loop of width 16 costs: 50
+; CHECK: LV: Vector loop of width 16 costs: 201/4
 ; CHECK: LV: Selecting VF: 1
 define hidden void @eight_bytes_split_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
   %5 = icmp eq i32 %3, 0
@@ -977,7 +977,7 @@ define hidden void @eight_bytes_split_op(ptr noalias nocapture noundef writeonly
 ; CHECK: LV: Found an estimated cost of 264 for VF 16 For instruction: %10 = load i8
 ; CHECK: LV: Found an estimated cost of 264 for VF 16 For instruction: %12 = load i8
 ; CHECK: LV: Found an estimated cost of 264 for VF 16 For instruction: store i8 %55
-; CHECK: LV: Vector loop of width 16 costs: 50
+; CHECK: LV: Vector loop of width 16 costs: 201/4
 ; CHECK: LV: Selecting VF: 1
 define hidden void @eight_bytes_interleave_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
   %5 = icmp eq i32 %3, 0
@@ -1126,7 +1126,7 @@ define hidden void @four_bytes_into_four_ints_same_op(ptr noalias nocapture noun
 ; CHECK: LV: Found an estimated cost of 6 for VF 2 For instruction: %10 = load i8
 ; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction: %11 = zext i8
 ; CHECK: LV: Found an estimated cost of 14 for VF 2 For instruction: store i32
-; CHECK: LV: Vector loop of width 2 costs: 35.
+; CHECK: LV: Vector loop of width 2 costs: 142/4.
 ; CHECK: LV: Found an estimated cost of 18 for VF 4 For instruction: %10 = load i8
 ; CHECK: LV: Found an estimated cost of 2 for VF 4 For instruction:  %11 = zext i8
 ; CHECK: LV: Found an estimated cost of 24 for VF 4 For instruction: store i32
@@ -1184,8 +1184,8 @@ define hidden void @four_bytes_into_four_ints_vary_op(ptr noalias nocapture noun
 ; CHECK-LABEL: scale_uv_row_down2
 ; CHECK: LV: Scalar loop costs: 10.
 ; CHECK: LV: Vector loop of width 2 costs: 13.
-; CHECK: LV: Vector loop of width 4 costs: 8.
-; CHECK: LV: Vector loop of width 8 costs: 4.
+; CHECK: LV: Vector loop of width 4 costs: 35/4.
+; CHECK: LV: Vector loop of width 8 costs: 19/4.
 ; CHECK: LV: Vector loop of width 16 costs: 5.
 ; CHECK: LV: Selecting VF: 8.
 define hidden void @scale_uv_row_down2(ptr nocapture noundef readonly %0, i32 noundef %1, ptr nocapture noundef writeonly %2, i32 noundef %3) {
@@ -1219,7 +1219,7 @@ define hidden void @scale_uv_row_down2(ptr nocapture noundef readonly %0, i32 no
 ; CHECK: LV: Found an estimated cost of 18 for VF 4 For instruction: %14 = load i8
 ; CHECK: LV: Found an estimated cost of 18 for VF 4 For instruction: %20 = load i8
 ; CHECK: LV: Found an estimated cost of 11 for VF 4 For instruction: store i8 %48
-; CHECK: LV: Vector loop of width 4 costs: 18.
+; CHECK: LV: Vector loop of width 4 costs: 73/4.
 ; CHECK: LV: Found an estimated cost of 26 for VF 8 For instruction: %14 = load i8
 ; CHECK: LV: Found an estimated cost of 26 for VF 8 For instruction: %20 = load i8
 ; CHECK: LV: Found an estimated cost of 7 for VF 8 For instruction: store i8 %48
@@ -1299,12 +1299,12 @@ define hidden void @scale_uv_row_down2_box(ptr nocapture noundef readonly %0, i3
 ; CHECK: LV: Vector loop of width 2 costs: 25.
 ; CHECK: LV: Found an estimated cost of 18 for VF 4 For instruction: %10 = load i8
 ; CHECK: LV: Found an estimated cost of 11 for VF 4 For instruction: store i8
-; CHECK: LV: Vector loop of width 4 costs: 11.
+; CHECK: LV: Vector loop of width 4 costs: 47/4.
 ; CHECK: LV: Found an estimated cost of 26 for VF 8 For instruction: %10 = load i8
 ; CHECK: LV: Found an estimated cost of 7 for VF 8 For instruction: store i8
-; CHECK: LV: Vector loop of width 8 costs: 6.
+; CHECK: LV: Vector loop of width 8 costs: 27/4.
 ; CHECK: LV: Found an estimated cost of 132 for VF 16 For instruction: %10 = load i8
-; CHECK: LV: Vector loop of width 16 costs: 10.
+; CHECK: LV: Vector loop of width 16 costs: 43/4.
 ; CHECK: LV: Selecting VF: 8.
 define hidden void @scale_uv_row_down2_linear(ptr nocapture noundef readonly %0, i32 noundef %1, ptr nocapture noundef writeonly %2, i32 noundef %3) {
   %5 = icmp sgt i32 %3, 0
@@ -1356,7 +1356,7 @@ define hidden void @scale_uv_row_down2_linear(ptr nocapture noundef readonly %0,
 ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2
 ; CHECK: LV: Scalar loop costs: 14.
 ; CHECK: LV: Vector loop of width 2 costs: 19.
-; CHECK: LV: Vector loop of width 4 costs: 15.
+; CHECK: LV: Vector loop of width 4 costs: 62/4.
 ; CHECK: LV: Selecting VF: 1.
 define hidden void @two_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
@@ -1397,7 +1397,7 @@ for.body:                                         ; preds = %entry, %for.body
 ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2
 ; CHECK: LV: Scalar loop costs: 14.
 ; CHECK: LV: Vector loop of width 2 costs: 19.
-; CHECK: LV: Vector loop of width 4 costs: 15.
+; CHECK: LV: Vector loop of width 4 costs: 62/4.
 ; CHECK: LV: Selecting VF: 1.
 define hidden void @two_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
@@ -1519,7 +1519,7 @@ for.body:                                         ; preds = %entry, %for.body
 ; CHECK: Cost of 11 for VF 4: INTERLEAVE-GROUP with factor 2
 ; CHECK: LV: Scalar loop costs: 16
 ; CHECK: LV: Vector loop of width 2 costs: 26
-; CHECK: LV: Vector loop of width 4 costs: 16.
+; CHECK: LV: Vector loop of width 4 costs: 67/4.
 ; CHECK: LV: Selecting VF: 1.
 define hidden void @two_floats_two_bytes_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
@@ -1559,7 +1559,7 @@ for.body:                                         ; preds = %entry, %for.body
 ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2
 ; CHECK: LV: Scalar loop costs: 16
 ; CHECK: LV: Vector loop of width 2 costs: 26
-; CHECK: LV: Vector loop of width 4 costs: 16.
+; CHECK: LV: Vector loop of width 4 costs: 67/4.
 ; CHECK: LV: Selecting VF: 1.
 define hidden void @two_floats_two_bytes_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
@@ -1688,8 +1688,8 @@ for.body:                                         ; preds = %entry, %for.body
 ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2
 ; CHECK: Cost of 7 for VF 4: INTERLEAVE-GROUP with factor 2
 ; CHECK: LV: Scalar loop costs: 16
-; CHECK: LV: Vector loop of width 2 costs: 23
-; CHECK: LV: Vector loop of width 4 costs: 14
+; CHECK: LV: Vector loop of width 2 costs: 94/4
+; CHECK: LV: Vector loop of width 4 costs: 59/4
 ; CHECK: LV: Selecting VF: 4
 define hidden void @two_floats_two_shorts_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
@@ -1730,8 +1730,8 @@ for.body:                                         ; preds = %entry, %for.body
 ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2
 ; CHECK: Cost of 7 for VF 4: INTERLEAVE-GROUP with factor 2
 ; CHECK: LV: Scalar loop costs: 16
-; CHECK: LV: Vector loop of width 2 costs: 23
-; CHECK: LV: Vector loop of width 4 costs: 14
+; CHECK: LV: Vector loop of width 2 costs: 94/4
+; CHECK: LV: Vector loop of width 4 costs: 59/4
 ; CHECK: LV: Selecting VF: 4
 define hidden void @two_floats_two_shorts_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
@@ -1997,7 +1997,7 @@ for.body:                                         ; preds = %entry, %for.body
 ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4
 ; CHECK: LV: Scalar loop costs: 28
 ; CHECK: LV: Vector loop of width 2 costs: 48
-; CHECK: LV: Vector loop of width 4 costs: 31
+; CHECK: LV: Vector loop of width 4 costs: 126/4
 ; CHECK: LV: Selecting VF: 1
 define hidden void @four_floats_four_bytes_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
@@ -2054,7 +2054,7 @@ for.body:                                         ; preds = %entry, %for.body
 ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4
 ; CHECK: LV: Scalar loop costs: 28
 ; CHECK: LV: Vector loop of width 2 costs: 48
-; CHECK: LV: Vector loop of width 4 costs: 31
+; CHECK: LV: Vector loop of width 4 costs: 126/4
 ; CHECK: LV: Selecting VF: 1
 define hidden void @four_floats_four_bytes_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
@@ -2236,7 +2236,7 @@ for.body:                                         ; preds = %entry, %for.body
 ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4
 ; CHECK: LV: Scalar loop costs: 28
 ; CHECK: LV: Vector loop of width 2 costs: 41
-; CHECK: LV: Vector loop of width 4 costs: 29
+; CHECK: LV: Vector loop of width 4 costs: 118/4
 ; CHECK: LV: Selecting VF: 1
 define hidden void @four_floats_four_shorts_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
@@ -2294,7 +2294,7 @@ for.body:                                         ; preds = %entry, %for.body
 ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4
 ; CHECK: LV: Scalar loop costs: 28
 ; CHECK: LV: Vector loop of width 2 costs: 41
-; CHECK: LV: Vector loop of width 4 costs: 29
+; CHECK: LV: Vector loop of width 4 costs: 118/4
 ; CHECK: LV: Selecting VF: 1
 define hidden void @four_floats_four_shorts_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-3.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-3.ll
index a6f190bd5eb96..5dc29c599851e 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-3.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-3.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load float, ptr %in0"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(/[0-9]+)? for VF [0-9]+ For instruction:\s*%v0 = load float, ptr %in0"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -43,7 +43,7 @@ define void @test() {
 ; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4
 ; AVX512:  LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4
 ; AVX512:  LV: Found an estimated cost of 4 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4
-; AVX512:  LV: Found an estimated cost of 6 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 26/4 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4
 ; AVX512:  LV: Found an estimated cost of 12 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4
 ; AVX512:  LV: Found an estimated cost of 51 for VF 32 For instruction: %v0 = load float, ptr %in0, align 4
 ; AVX512:  LV: Found an estimated cost of 210 for VF 64 For instruction: %v0 = load float, ptr %in0, align 4
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-5.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-5.ll
index d8aadbe04b72f..dab94232f2ceb 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-5.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-5.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load float, ptr %in."
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(/[0-9]+)? for VF [0-9]+ For instruction:\s*%v. = load float, ptr %in."
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -124,7 +124,7 @@ define void @test() {
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load float, ptr %in2, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load float, ptr %in3, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load float, ptr %in4, align 4
-; AVX512:  LV: Found an estimated cost of 9 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 38/4 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load float, ptr %in1, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load float, ptr %in2, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load float, ptr %in3, align 4
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-7.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-7.ll
index d8dc847535154..2b63f769b9add 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-7.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-7.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load float, ptr %in."
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(/[0-9]+)? for VF [0-9]+ For instruction:\s*%v. = load float, ptr %in."
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -166,14 +166,14 @@ define void @test() {
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load float, ptr %in4, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load float, ptr %in5, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load float, ptr %in6, align 4
-; AVX512:  LV: Found an estimated cost of 12 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 50/4 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load float, ptr %in1, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load float, ptr %in2, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load float, ptr %in3, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load float, ptr %in4, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load float, ptr %in5, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load float, ptr %in6, align 4
-; AVX512:  LV: Found an estimated cost of 35 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 142/4 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load float, ptr %in1, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load float, ptr %in2, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load float, ptr %in3, align 4
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-3.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-3.ll
index 868a9c2ab62cf..1ce36c309926c 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-3.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-3.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load double, ptr %in."
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(/[0-9]+)? for VF [0-9]+ For instruction:\s*%v. = load double, ptr %in."
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -82,7 +82,7 @@ define void @test() {
 ; AVX512:  LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8
-; AVX512:  LV: Found an estimated cost of 6 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 26/4 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8
 ; AVX512:  LV: Found an estimated cost of 12 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-5.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-5.ll
index d5d6be704b757..9939cee37ac36 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-5.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-5.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load double, ptr %in."
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(/[0-9]+)? for VF [0-9]+ For instruction:\s*%v. = load double, ptr %in."
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -109,7 +109,7 @@ define void @test() {
 ; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
 ; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
 ; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
-; AVX512:  LV: Found an estimated cost of 9 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 38/4 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load double, ptr %in3, align 8
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-7.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-7.ll
index a5a6a1e155d50..a8ec595bc4df5 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-7.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-7.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load double, ptr %in."
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(/[0-9]+)? for VF [0-9]+ For instruction:\s*%v. = load double, ptr %in."
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -145,14 +145,14 @@ define void @test() {
 ; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
 ; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load double, ptr %in5, align 8
 ; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load double, ptr %in6, align 8
-; AVX512:  LV: Found an estimated cost of 12 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 50/4 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load double, ptr %in3, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load double, ptr %in4, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load double, ptr %in5, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load double, ptr %in6, align 8
-; AVX512:  LV: Found an estimated cost of 35 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 142/4 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load double, ptr %in3, align 8
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i16-stride-3.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i16-stride-3.ll
index f87927cffc3c9..9542c89a54ab3 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i16-stride-3.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i16-stride-3.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load i16, ptr %in0"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(/[0-9]+)? for VF [0-9]+ For instruction:\s*%v0 = load i16, ptr %in0"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -55,7 +55,7 @@ define void @test() {
 ; AVX512BW:  LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2
 ; AVX512BW:  LV: Found an estimated cost of 7 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2
 ; AVX512BW:  LV: Found an estimated cost of 7 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2
-; AVX512BW:  LV: Found an estimated cost of 9 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW:  LV: Found an estimated cost of 38/4 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2
 ; AVX512BW:  LV: Found an estimated cost of 18 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2
 ; AVX512BW:  LV: Found an estimated cost of 81 for VF 64 For instruction: %v0 = load i16, ptr %in0, align 2
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i16-stride-5.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i16-stride-5.ll
index 0124a86a1548c..05607f0d55e4c 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i16-stride-5.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i16-stride-5.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load i16, ptr %in."
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(/[0-9]+)? for VF [0-9]+ For instruction:\s*%v. = load i16, ptr %in."
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -182,7 +182,7 @@ define void @test() {
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i16, ptr %in2, align 2
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i16, ptr %in3, align 2
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i16, ptr %in4, align 2
-; AVX512BW:  LV: Found an estimated cost of 14 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW:  LV: Found an estimated cost of 58/4 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i16, ptr %in1, align 2
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i16, ptr %in2, align 2
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i16, ptr %in3, align 2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i16-stride-7.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i16-stride-7.ll
index 392a677c95e3e..8455363b1af33 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i16-stride-7.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i16-stride-7.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load i16, ptr %in."
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(/[0-9]+)? for VF [0-9]+ For instruction:\s*%v. = load i16, ptr %in."
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -246,14 +246,14 @@ define void @test() {
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i16, ptr %in4, align 2
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i16, ptr %in5, align 2
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i16, ptr %in6, align 2
-; AVX512BW:  LV: Found an estimated cost of 19 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW:  LV: Found an estimated cost of 78/4 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i16, ptr %in1, align 2
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i16, ptr %in2, align 2
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i16, ptr %in3, align 2
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i16, ptr %in4, align 2
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i16, ptr %in5, align 2
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i16, ptr %in6, align 2
-; AVX512BW:  LV: Found an estimated cost of 56 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW:  LV: Found an estimated cost of 226/4 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i16, ptr %in1, align 2
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i16, ptr %in2, align 2
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i16, ptr %in3, align 2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-3.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-3.ll
index cee1dc84445b5..04ee20385520d 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-3.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-3.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load i32, ptr %in0"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(/[0-9]+)? for VF [0-9]+ For instruction:\s*%v0 = load i32, ptr %in0"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -43,7 +43,7 @@ define void @test() {
 ; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
 ; AVX512:  LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4
 ; AVX512:  LV: Found an estimated cost of 4 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
-; AVX512:  LV: Found an estimated cost of 6 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 26/4 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
 ; AVX512:  LV: Found an estimated cost of 12 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4
 ; AVX512:  LV: Found an estimated cost of 51 for VF 32 For instruction: %v0 = load i32, ptr %in0, align 4
 ; AVX512:  LV: Found an estimated cost of 210 for VF 64 For instruction: %v0 = load i32, ptr %in0, align 4
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-4-indices-012u.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-4-indices-012u.ll
index 0f1e265a5c7de..c16c693080327 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-4-indices-012u.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-4-indices-012u.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load i32, ptr %in0"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(/[0-9]+)? for VF [0-9]+ For instruction:\s*%v0 = load i32, ptr %in0"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -43,8 +43,8 @@ define void @test() {
 ; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
 ; AVX512:  LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4
 ; AVX512:  LV: Found an estimated cost of 4 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
-; AVX512:  LV: Found an estimated cost of 6 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
-; AVX512:  LV: Found an estimated cost of 17 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 26/4 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 70/4 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4
 ; AVX512:  LV: Found an estimated cost of 71 for VF 32 For instruction: %v0 = load i32, ptr %in0, align 4
 ;
 entry:
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-5.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-5.ll
index 0b2f091221319..53452dd6b2ad1 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-5.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-5.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load i32, ptr %in."
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(/[0-9]+)? for VF [0-9]+ For instruction:\s*%v. = load i32, ptr %in."
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -124,7 +124,7 @@ define void @test() {
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i32, ptr %in2, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i32, ptr %in3, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i32, ptr %in4, align 4
-; AVX512:  LV: Found an estimated cost of 9 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 38/4 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i32, ptr %in1, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i32, ptr %in2, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i32, ptr %in3, align 4
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-7.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-7.ll
index 91bb334817c2d..0060e5a708f6c 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-7.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-7.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load i32, ptr %in."
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(/[0-9]+)? for VF [0-9]+ For instruction:\s*%v. = load i32, ptr %in."
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -166,14 +166,14 @@ define void @test() {
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i32, ptr %in4, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i32, ptr %in5, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i32, ptr %in6, align 4
-; AVX512:  LV: Found an estimated cost of 12 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 50/4 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i32, ptr %in1, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i32, ptr %in2, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i32, ptr %in3, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i32, ptr %in4, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i32, ptr %in5, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i32, ptr %in6, align 4
-; AVX512:  LV: Found an estimated cost of 35 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 142/4 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i32, ptr %in1, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i32, ptr %in2, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i32, ptr %in3, align 4
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i64-stride-3.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i64-stride-3.ll
index d8c64e3c17357..fc9b9a330e78f 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i64-stride-3.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i64-stride-3.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load i64, ptr %in."
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(/[0-9]+)? for VF [0-9]+ For instruction:\s*%v. = load i64, ptr %in."
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -82,7 +82,7 @@ define void @test() {
 ; AVX512:  LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8
-; AVX512:  LV: Found an estimated cost of 6 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 26/4 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8
 ; AVX512:  LV: Found an estimated cost of 12 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i64-stride-5.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i64-stride-5.ll
index d1a51ae3f779d..0718b4da81166 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i64-stride-5.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i64-stride-5.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load i64, ptr %in."
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(/[0-9]+)? for VF [0-9]+ For instruction:\s*%v. = load i64, ptr %in."
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -99,7 +99,7 @@ define void @test() {
 ; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
 ; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
 ; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
-; AVX512:  LV: Found an estimated cost of 9 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 38/4 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i64, ptr %in3, align 8
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i64-stride-7.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i64-stride-7.ll
index 73429492d47e8..6fb69a6456e21 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i64-stride-7.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i64-stride-7.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load i64, ptr %in."
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(/[0-9]+)? for VF [0-9]+ For instruction:\s*%v. = load i64, ptr %in."
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -145,14 +145,14 @@ define void @test() {
 ; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
 ; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i64, ptr %in5, align 8
 ; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i64, ptr %in6, align 8
-; AVX512:  LV: Found an estimated cost of 12 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 50/4 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i64, ptr %in3, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i64, ptr %in4, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i64, ptr %in5, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i64, ptr %in6, align 8
-; AVX512:  LV: Found an estimated cost of 35 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 142/4 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i64, ptr %in3, align 8
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i8-stride-5.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i8-stride-5.ll
index cce8887ad2447..103e3aee94f61 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i8-stride-5.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i8-stride-5.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load i8, ptr %in."
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(/[0-9]+)? for VF [0-9]+ For instruction:\s*%v. = load i8, ptr %in."
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -187,7 +187,7 @@ define void @test() {
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i8, ptr %in2, align 1
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i8, ptr %in3, align 1
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i8, ptr %in4, align 1
-; AVX512BW:  LV: Found an estimated cost of 99 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW:  LV: Found an estimated cost of 398/4 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i8, ptr %in1, align 1
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i8, ptr %in2, align 1
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i8, ptr %in3, align 1
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i8-stride-7.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i8-stride-7.ll
index e8a2637b9f5b4..9527618904ced 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i8-stride-7.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i8-stride-7.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load i8, ptr %in."
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(/[0-9]+)? for VF [0-9]+ For instruction:\s*%v. = load i8, ptr %in."
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -253,14 +253,14 @@ define void @test() {
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i8, ptr %in4, align 1
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i8, ptr %in5, align 1
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i8, ptr %in6, align 1
-; AVX512BW:  LV: Found an estimated cost of 138 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW:  LV: Found an estimated cost of 554/4 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i8, ptr %in1, align 1
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i8, ptr %in2, align 1
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i8, ptr %in3, align 1
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i8, ptr %in4, align 1
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i8, ptr %in5, align 1
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i8, ptr %in6, align 1
-; AVX512BW:  LV: Found an estimated cost of 413 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW:  LV: Found an estimated cost of 1654/4 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i8, ptr %in1, align 1
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i8, ptr %in2, align 1
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i8, ptr %in3, align 1
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-interleaved-store-i16.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-interleaved-store-i16.ll
index c2c04ce6f5ff5..3b9e14163aeaa 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-interleaved-store-i16.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-interleaved-store-i16.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*store i16 %[0,2], ptr %[a-zA-Z0-7]+, align 2"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(/[0-9]+)? for VF [0-9]+ For instruction:\s*store i16 %[0,2], ptr %[a-zA-Z0-7]+, align 2"
 ; RUN: opt -passes=loop-vectorize -enable-interleaved-mem-accesses -prefer-predicate-over-epilogue=predicate-dont-vectorize -S -mcpu=skx --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=DISABLED_MASKED_STRIDED
 ; RUN: opt -passes=loop-vectorize -enable-interleaved-mem-accesses -enable-masked-interleaved-mem-accesses -prefer-predicate-over-epilogue=predicate-dont-vectorize -S -mcpu=skx --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=ENABLED_MASKED_STRIDED
 ; REQUIRES: asserts
@@ -157,7 +157,7 @@ define void @test(ptr noalias nocapture %points, ptr noalias nocapture readonly
 ; DISABLED_MASKED_STRIDED:  LV: Found an estimated cost of 2 for VF 2 For instruction: store i16 %0, ptr %arrayidx6, align 2
 ; DISABLED_MASKED_STRIDED:  LV: Found an estimated cost of 4 for VF 4 For instruction: store i16 %0, ptr %arrayidx6, align 2
 ; DISABLED_MASKED_STRIDED:  LV: Found an estimated cost of 8 for VF 8 For instruction: store i16 %0, ptr %arrayidx6, align 2
-; DISABLED_MASKED_STRIDED:  LV: Found an estimated cost of 16 for VF 16 For instruction: store i16 %0, ptr %arrayidx6, align 2
+; DISABLED_MASKED_STRIDED:  LV: Found an estimated cost of 66/4 for VF 16 For instruction: store i16 %0, ptr %arrayidx6, align 2
 ;
 ; ENABLED_MASKED_STRIDED-LABEL: 'test'
 ; ENABLED_MASKED_STRIDED:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %0, ptr %arrayidx6, align 2
@@ -165,7 +165,7 @@ define void @test(ptr noalias nocapture %points, ptr noalias nocapture readonly
 ; ENABLED_MASKED_STRIDED:  LV: Found an estimated cost of 2 for VF 2 For instruction: store i16 %0, ptr %arrayidx6, align 2
 ; ENABLED_MASKED_STRIDED:  LV: Found an estimated cost of 4 for VF 4 For instruction: store i16 %0, ptr %arrayidx6, align 2
 ; ENABLED_MASKED_STRIDED:  LV: Found an estimated cost of 8 for VF 8 For instruction: store i16 %0, ptr %arrayidx6, align 2
-; ENABLED_MASKED_STRIDED:  LV: Found an estimated cost of 16 for VF 16 For instruction: store i16 %0, ptr %arrayidx6, align 2
+; ENABLED_MASKED_STRIDED:  LV: Found an estimated cost of 66/4 for VF 16 For instruction: store i16 %0, ptr %arrayidx6, align 2
 ;
 entry:
   br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-scatter-i32-with-i8-index.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-scatter-i32-with-i8-index.ll
index 5e67bd57754e4..e8e75ba3383cb 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-scatter-i32-with-i8-index.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-scatter-i32-with-i8-index.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*store i32 %valB, ptr %out, align 4"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(/[0-9]+)? for VF [0-9]+ For instruction:\s*store i32 %valB, ptr %out, align 4"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse4.2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=SSE42
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=AVX1
@@ -18,8 +18,8 @@ target triple = "x86_64-unknown-linux-gnu"
 define void @test() {
 ; SSE2-LABEL: 'test'
 ; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %valB, ptr %out, align 4
-; SSE2:  LV: Found an estimated cost of 2 for VF 2 For instruction: store i32 %valB, ptr %out, align 4
-; SSE2:  LV: Found an estimated cost of 5 for VF 4 For instruction: store i32 %valB, ptr %out, align 4
+; SSE2:  LV: Found an estimated cost of 10/4 for VF 2 For instruction: store i32 %valB, ptr %out, align 4
+; SSE2:  LV: Found an estimated cost of 22/4 for VF 4 For instruction: store i32 %valB, ptr %out, align 4
 ; SSE2:  LV: Found an estimated cost of 11 for VF 8 For instruction: store i32 %valB, ptr %out, align 4
 ; SSE2:  LV: Found an estimated cost of 22 for VF 16 For instruction: store i32 %valB, ptr %out, align 4
 ;
@@ -34,7 +34,7 @@ define void @test() {
 ; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %valB, ptr %out, align 4
 ; AVX1:  LV: Found an estimated cost of 2 for VF 2 For instruction: store i32 %valB, ptr %out, align 4
 ; AVX1:  LV: Found an estimated cost of 4 for VF 4 For instruction: store i32 %valB, ptr %out, align 4
-; AVX1:  LV: Found an estimated cost of 8 for VF 8 For instruction: store i32 %valB, ptr %out, align 4
+; AVX1:  LV: Found an estimated cost of 34/4 for VF 8 For instruction: store i32 %valB, ptr %out, align 4
 ; AVX1:  LV: Found an estimated cost of 17 for VF 16 For instruction: store i32 %valB, ptr %out, align 4
 ; AVX1:  LV: Found an estimated cost of 34 for VF 32 For instruction: store i32 %valB, ptr %out, align 4
 ;
@@ -42,14 +42,14 @@ define void @test() {
 ; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %valB, ptr %out, align 4
 ; AVX2:  LV: Found an estimated cost of 2 for VF 2 For instruction: store i32 %valB, ptr %out, align 4
 ; AVX2:  LV: Found an estimated cost of 4 for VF 4 For instruction: store i32 %valB, ptr %out, align 4
-; AVX2:  LV: Found an estimated cost of 8 for VF 8 For instruction: store i32 %valB, ptr %out, align 4
+; AVX2:  LV: Found an estimated cost of 34/4 for VF 8 For instruction: store i32 %valB, ptr %out, align 4
 ; AVX2:  LV: Found an estimated cost of 17 for VF 16 For instruction: store i32 %valB, ptr %out, align 4
 ; AVX2:  LV: Found an estimated cost of 34 for VF 32 For instruction: store i32 %valB, ptr %out, align 4
 ;
 ; AVX512-LABEL: 'test'
 ; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %valB, ptr %out, align 4
 ; AVX512:  LV: Found an estimated cost of 5 for VF 2 For instruction: store i32 %valB, ptr %out, align 4
-; AVX512:  LV: Found an estimated cost of 10 for VF 4 For instruction: store i32 %valB, ptr %out, align 4
+; AVX512:  LV: Found an estimated cost of 42/4 for VF 4 For instruction: store i32 %valB, ptr %out, align 4
 ; AVX512:  LV: Found an estimated cost of 10 for VF 8 For instruction: store i32 %valB, ptr %out, align 4
 ; AVX512:  LV: Found an estimated cost of 18 for VF 16 For instruction: store i32 %valB, ptr %out, align 4
 ; AVX512:  LV: Found an estimated cost of 36 for VF 32 For instruction: store i32 %valB, ptr %out, align 4
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-scatter-i64-with-i8-index.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-scatter-i64-with-i8-index.ll
index faa2aa43d4934..c3b5fea5d650f 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-scatter-i64-with-i8-index.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-scatter-i64-with-i8-index.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*store i64 %valB, ptr %out, align 8"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(/[0-9]+)? for VF [0-9]+ For instruction:\s*store i64 %valB, ptr %out, align 8"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse4.2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=SSE42
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=AVX1
@@ -18,7 +18,7 @@ target triple = "x86_64-unknown-linux-gnu"
 define void @test() {
 ; SSE2-LABEL: 'test'
 ; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %valB, ptr %out, align 8
-; SSE2:  LV: Found an estimated cost of 2 for VF 2 For instruction: store i64 %valB, ptr %out, align 8
+; SSE2:  LV: Found an estimated cost of 10/4 for VF 2 For instruction: store i64 %valB, ptr %out, align 8
 ; SSE2:  LV: Found an estimated cost of 5 for VF 4 For instruction: store i64 %valB, ptr %out, align 8
 ; SSE2:  LV: Found an estimated cost of 10 for VF 8 For instruction: store i64 %valB, ptr %out, align 8
 ; SSE2:  LV: Found an estimated cost of 20 for VF 16 For instruction: store i64 %valB, ptr %out, align 8
@@ -32,8 +32,9 @@ define void @test() {
 ;
 ; AVX1-LABEL: 'test'
 ; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %valB, ptr %out, align 8
+; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %valB, ptr %out, align 8
 ; AVX1:  LV: Found an estimated cost of 2 for VF 2 For instruction: store i64 %valB, ptr %out, align 8
-; AVX1:  LV: Found an estimated cost of 4 for VF 4 For instruction: store i64 %valB, ptr %out, align 8
+; AVX1:  LV: Found an estimated cost of 18/4 for VF 4 For instruction: store i64 %valB, ptr %out, align 8
 ; AVX1:  LV: Found an estimated cost of 9 for VF 8 For instruction: store i64 %valB, ptr %out, align 8
 ; AVX1:  LV: Found an estimated cost of 18 for VF 16 For instruction: store i64 %valB, ptr %out, align 8
 ; AVX1:  LV: Found an estimated cost of 36 for VF 32 For instruction: store i64 %valB, ptr %out, align 8
@@ -41,7 +42,7 @@ define void @test() {
 ; AVX2-LABEL: 'test'
 ; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %valB, ptr %out, align 8
 ; AVX2:  LV: Found an estimated cost of 2 for VF 2 For instruction: store i64 %valB, ptr %out, align 8
-; AVX2:  LV: Found an estimated cost of 4 for VF 4 For instruction: store i64 %valB, ptr %out, align 8
+; AVX2:  LV: Found an estimated cost of 18/4 for VF 4 For instruction: store i64 %valB, ptr %out, align 8
 ; AVX2:  LV: Found an estimated cost of 9 for VF 8 For instruction: store i64 %valB, ptr %out, align 8
 ; AVX2:  LV: Found an estimated cost of 18 for VF 16 For instruction: store i64 %valB, ptr %out, align 8
 ; AVX2-NOFAST: LV: Found an estimated cost of 36 for VF 32 For instruction: store i64 %valB, ptr %out, align 8
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i16.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i16.ll
index 1d51a32a520a9..5c53ebf358e32 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i16.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i16.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*store i16 %valB, ptr %out, align 2"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(/[0-9]+)? for VF [0-9]+ For instruction:\s*store i16 %valB, ptr %out, align 2"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=SSE
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse4.2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=SSE
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=AVX1
@@ -27,7 +27,7 @@ define void @test(ptr %C) {
 ; AVX1:  LV: Found an estimated cost of 2 for VF 2 For instruction: store i16 %valB, ptr %out, align 2
 ; AVX1:  LV: Found an estimated cost of 4 for VF 4 For instruction: store i16 %valB, ptr %out, align 2
 ; AVX1:  LV: Found an estimated cost of 8 for VF 8 For instruction: store i16 %valB, ptr %out, align 2
-; AVX1:  LV: Found an estimated cost of 16 for VF 16 For instruction: store i16 %valB, ptr %out, align 2
+; AVX1:  LV: Found an estimated cost of 66/4 for VF 16 For instruction: store i16 %valB, ptr %out, align 2
 ; AVX1:  LV: Found an estimated cost of 33 for VF 32 For instruction: store i16 %valB, ptr %out, align 2
 ;
 ; AVX2-LABEL: 'test'
@@ -35,7 +35,7 @@ define void @test(ptr %C) {
 ; AVX2:  LV: Found an estimated cost of 2 for VF 2 For instruction: store i16 %valB, ptr %out, align 2
 ; AVX2:  LV: Found an estimated cost of 4 for VF 4 For instruction: store i16 %valB, ptr %out, align 2
 ; AVX2:  LV: Found an estimated cost of 8 for VF 8 For instruction: store i16 %valB, ptr %out, align 2
-; AVX2:  LV: Found an estimated cost of 16 for VF 16 For instruction: store i16 %valB, ptr %out, align 2
+; AVX2:  LV: Found an estimated cost of 66/4 for VF 16 For instruction: store i16 %valB, ptr %out, align 2
 ; AVX2:  LV: Found an estimated cost of 33 for VF 32 For instruction: store i16 %valB, ptr %out, align 2
 ;
 ; AVX512-LABEL: 'test'
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i32.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i32.ll
index f011d06d319bb..d530337f5c4c1 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i32.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i32.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*store i32 %valB, ptr %out, align 4"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(/[0-9]+)? for VF [0-9]+ For instruction:\s*store i32 %valB, ptr %out, align 4"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse4.2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=SSE42
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=AVX1
@@ -17,8 +17,8 @@ target triple = "x86_64-unknown-linux-gnu"
 define void @test(ptr %C) {
 ; SSE2-LABEL: 'test'
 ; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %valB, ptr %out, align 4
-; SSE2:  LV: Found an estimated cost of 2 for VF 2 For instruction: store i32 %valB, ptr %out, align 4
-; SSE2:  LV: Found an estimated cost of 5 for VF 4 For instruction: store i32 %valB, ptr %out, align 4
+; SSE2:  LV: Found an estimated cost of 10/4 for VF 2 For instruction: store i32 %valB, ptr %out, align 4
+; SSE2:  LV: Found an estimated cost of 22/4 for VF 4 For instruction: store i32 %valB, ptr %out, align 4
 ; SSE2:  LV: Found an estimated cost of 11 for VF 8 For instruction: store i32 %valB, ptr %out, align 4
 ; SSE2:  LV: Found an estimated cost of 22 for VF 16 For instruction: store i32 %valB, ptr %out, align 4
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i64.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i64.ll
index c004b16ae207d..efa2d0ef0d5fe 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i64.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i64.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*store i64 %valB, ptr %out, align 8"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(/[0-9]+)? for VF [0-9]+ For instruction:\s*store i64 %valB, ptr %out, align 8"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse4.2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=SSE42
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=AVX1
@@ -17,7 +17,7 @@ target triple = "x86_64-unknown-linux-gnu"
 define void @test(ptr %C) {
 ; SSE2-LABEL: 'test'
 ; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %valB, ptr %out, align 8
-; SSE2:  LV: Found an estimated cost of 2 for VF 2 For instruction: store i64 %valB, ptr %out, align 8
+; SSE2:  LV: Found an estimated cost of 10/4 for VF 2 For instruction: store i64 %valB, ptr %out, align 8
 ; SSE2:  LV: Found an estimated cost of 5 for VF 4 For instruction: store i64 %valB, ptr %out, align 8
 ; SSE2:  LV: Found an estimated cost of 10 for VF 8 For instruction: store i64 %valB, ptr %out, align 8
 ; SSE2:  LV: Found an estimated cost of 20 for VF 16 For instruction: store i64 %valB, ptr %out, align 8
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i8.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i8.ll
index 8bbe624849783..d880b7c464316 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i8.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i8.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*store i8 %valB, ptr %out, align 1"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(/[0-9]+)? for VF [0-9]+ For instruction:\s*store i8 %valB, ptr %out, align 1"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse4.2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=SSE42
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=AVX1
@@ -17,10 +17,10 @@ target triple = "x86_64-unknown-linux-gnu"
 define void @test(ptr %C) {
 ; SSE2-LABEL: 'test'
 ; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %valB, ptr %out, align 1
-; SSE2:  LV: Found an estimated cost of 2 for VF 2 For instruction: store i8 %valB, ptr %out, align 1
-; SSE2:  LV: Found an estimated cost of 5 for VF 4 For instruction: store i8 %valB, ptr %out, align 1
-; SSE2:  LV: Found an estimated cost of 11 for VF 8 For instruction: store i8 %valB, ptr %out, align 1
-; SSE2:  LV: Found an estimated cost of 23 for VF 16 For instruction: store i8 %valB, ptr %out, align 1
+; SSE2:  LV: Found an estimated cost of 10/4 for VF 2 For instruction: store i8 %valB, ptr %out, align 1
+; SSE2:  LV: Found an estimated cost of 22/4 for VF 4 For instruction: store i8 %valB, ptr %out, align 1
+; SSE2:  LV: Found an estimated cost of 46/4 for VF 8 For instruction: store i8 %valB, ptr %out, align 1
+; SSE2:  LV: Found an estimated cost of 94/4 for VF 16 For instruction: store i8 %valB, ptr %out, align 1
 ;
 ; SSE42-LABEL: 'test'
 ; SSE42:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %valB, ptr %out, align 1
@@ -35,7 +35,7 @@ define void @test(ptr %C) {
 ; AVX1:  LV: Found an estimated cost of 4 for VF 4 For instruction: store i8 %valB, ptr %out, align 1
 ; AVX1:  LV: Found an estimated cost of 8 for VF 8 For instruction: store i8 %valB, ptr %out, align 1
 ; AVX1:  LV: Found an estimated cost of 16 for VF 16 For instruction: store i8 %valB, ptr %out, align 1
-; AVX1:  LV: Found an estimated cost of 32 for VF 32 For instruction: store i8 %valB, ptr %out, align 1
+; AVX1:  LV: Found an estimated cost of 130/4 for VF 32 For instruction: store i8 %valB, ptr %out, align 1
 ;
 ; AVX2-LABEL: 'test'
 ; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %valB, ptr %out, align 1
@@ -43,7 +43,7 @@ define void @test(ptr %C) {
 ; AVX2:  LV: Found an estimated cost of 4 for VF 4 For instruction: store i8 %valB, ptr %out, align 1
 ; AVX2:  LV: Found an estimated cost of 8 for VF 8 For instruction: store i8 %valB, ptr %out, align 1
 ; AVX2:  LV: Found an estimated cost of 16 for VF 16 For instruction: store i8 %valB, ptr %out, align 1
-; AVX2:  LV: Found an estimated cost of 32 for VF 32 For instruction: store i8 %valB, ptr %out, align 1
+; AVX2:  LV: Found an estimated cost of 130/4 for VF 32 For instruction: store i8 %valB, ptr %out, align 1
 ;
 ; AVX512-LABEL: 'test'
 ; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %valB, ptr %out, align 1

>From 443a6edbaa1a3ea529e562f445287629000c1810 Mon Sep 17 00:00:00 2001
From: bababuck <buchner.ryan at gmail.com>
Date: Mon, 9 Mar 2026 15:50:39 -0700
Subject: [PATCH 10/15] Run update_analyze_test_checks on various tests

These tests have not been updated recently and are missing check lines.
---
 .../interleaved-load-f32-stride-5.ll          | 10 ++++++
 .../interleaved-load-f32-stride-7.ll          | 21 +++++++++++
 .../interleaved-load-f64-stride-3.ll          |  9 +++++
 .../interleaved-load-f64-stride-5.ll          | 20 +++++++++++
 .../interleaved-load-f64-stride-7.ll          | 35 +++++++++++++++++++
 ...erleaved-load-i32-stride-4-indices-012u.ll |  1 +
 .../interleaved-load-i32-stride-5.ll          | 10 ++++++
 .../interleaved-load-i32-stride-7.ll          | 21 +++++++++++
 .../interleaved-load-i64-stride-3.ll          |  9 +++++
 .../interleaved-load-i64-stride-5.ll          | 30 ++++++++++++++++
 .../interleaved-load-i64-stride-7.ll          | 35 +++++++++++++++++++
 .../masked-scatter-i32-with-i8-index.ll       |  5 +++
 .../masked-scatter-i64-with-i8-index.ll       | 30 +++++++++++-----
 .../X86/CostModel/masked-store-i16.ll         |  4 +++
 .../X86/CostModel/masked-store-i32.ll         |  5 +++
 .../X86/CostModel/masked-store-i64.ll         |  5 +++
 .../X86/CostModel/masked-store-i8.ll          |  5 +++
 17 files changed, 246 insertions(+), 9 deletions(-)

diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-5.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-5.ll
index dab94232f2ceb..59ea30903294e 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-5.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-5.ll
@@ -38,6 +38,11 @@ define void @test() {
 ; SSE2:  LV: Found an estimated cost of 14 for VF 8 For instruction: %v2 = load float, ptr %in2, align 4
 ; SSE2:  LV: Found an estimated cost of 14 for VF 8 For instruction: %v3 = load float, ptr %in3, align 4
 ; SSE2:  LV: Found an estimated cost of 14 for VF 8 For instruction: %v4 = load float, ptr %in4, align 4
+; SSE2:  LV: Found an estimated cost of 28 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4
+; SSE2:  LV: Found an estimated cost of 28 for VF 16 For instruction: %v1 = load float, ptr %in1, align 4
+; SSE2:  LV: Found an estimated cost of 28 for VF 16 For instruction: %v2 = load float, ptr %in2, align 4
+; SSE2:  LV: Found an estimated cost of 28 for VF 16 For instruction: %v3 = load float, ptr %in3, align 4
+; SSE2:  LV: Found an estimated cost of 28 for VF 16 For instruction: %v4 = load float, ptr %in4, align 4
 ;
 ; AVX1-LABEL: 'test'
 ; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4
@@ -70,6 +75,11 @@ define void @test() {
 ; AVX1:  LV: Found an estimated cost of 30 for VF 16 For instruction: %v2 = load float, ptr %in2, align 4
 ; AVX1:  LV: Found an estimated cost of 30 for VF 16 For instruction: %v3 = load float, ptr %in3, align 4
 ; AVX1:  LV: Found an estimated cost of 30 for VF 16 For instruction: %v4 = load float, ptr %in4, align 4
+; AVX1:  LV: Found an estimated cost of 60 for VF 32 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX1:  LV: Found an estimated cost of 60 for VF 32 For instruction: %v1 = load float, ptr %in1, align 4
+; AVX1:  LV: Found an estimated cost of 60 for VF 32 For instruction: %v2 = load float, ptr %in2, align 4
+; AVX1:  LV: Found an estimated cost of 60 for VF 32 For instruction: %v3 = load float, ptr %in3, align 4
+; AVX1:  LV: Found an estimated cost of 60 for VF 32 For instruction: %v4 = load float, ptr %in4, align 4
 ;
 ; AVX2-LABEL: 'test'
 ; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-7.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-7.ll
index 2b63f769b9add..3ba523f50cc34 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-7.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-7.ll
@@ -48,6 +48,13 @@ define void @test() {
 ; SSE2:  LV: Found an estimated cost of 14 for VF 8 For instruction: %v4 = load float, ptr %in4, align 4
 ; SSE2:  LV: Found an estimated cost of 14 for VF 8 For instruction: %v5 = load float, ptr %in5, align 4
 ; SSE2:  LV: Found an estimated cost of 14 for VF 8 For instruction: %v6 = load float, ptr %in6, align 4
+; SSE2:  LV: Found an estimated cost of 28 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4
+; SSE2:  LV: Found an estimated cost of 28 for VF 16 For instruction: %v1 = load float, ptr %in1, align 4
+; SSE2:  LV: Found an estimated cost of 28 for VF 16 For instruction: %v2 = load float, ptr %in2, align 4
+; SSE2:  LV: Found an estimated cost of 28 for VF 16 For instruction: %v3 = load float, ptr %in3, align 4
+; SSE2:  LV: Found an estimated cost of 28 for VF 16 For instruction: %v4 = load float, ptr %in4, align 4
+; SSE2:  LV: Found an estimated cost of 28 for VF 16 For instruction: %v5 = load float, ptr %in5, align 4
+; SSE2:  LV: Found an estimated cost of 28 for VF 16 For instruction: %v6 = load float, ptr %in6, align 4
 ;
 ; AVX1-LABEL: 'test'
 ; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4
@@ -92,6 +99,13 @@ define void @test() {
 ; AVX1:  LV: Found an estimated cost of 30 for VF 16 For instruction: %v4 = load float, ptr %in4, align 4
 ; AVX1:  LV: Found an estimated cost of 30 for VF 16 For instruction: %v5 = load float, ptr %in5, align 4
 ; AVX1:  LV: Found an estimated cost of 30 for VF 16 For instruction: %v6 = load float, ptr %in6, align 4
+; AVX1:  LV: Found an estimated cost of 60 for VF 32 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX1:  LV: Found an estimated cost of 60 for VF 32 For instruction: %v1 = load float, ptr %in1, align 4
+; AVX1:  LV: Found an estimated cost of 60 for VF 32 For instruction: %v2 = load float, ptr %in2, align 4
+; AVX1:  LV: Found an estimated cost of 60 for VF 32 For instruction: %v3 = load float, ptr %in3, align 4
+; AVX1:  LV: Found an estimated cost of 60 for VF 32 For instruction: %v4 = load float, ptr %in4, align 4
+; AVX1:  LV: Found an estimated cost of 60 for VF 32 For instruction: %v5 = load float, ptr %in5, align 4
+; AVX1:  LV: Found an estimated cost of 60 for VF 32 For instruction: %v6 = load float, ptr %in6, align 4
 ;
 ; AVX2-LABEL: 'test'
 ; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4
@@ -187,6 +201,13 @@ define void @test() {
 ; AVX512:  LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load float, ptr %in4, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load float, ptr %in5, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load float, ptr %in6, align 4
+; AVX512:  LV: Found an estimated cost of 40 for VF 32 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 40 for VF 32 For instruction: %v1 = load float, ptr %in1, align 4
+; AVX512:  LV: Found an estimated cost of 40 for VF 32 For instruction: %v2 = load float, ptr %in2, align 4
+; AVX512:  LV: Found an estimated cost of 40 for VF 32 For instruction: %v3 = load float, ptr %in3, align 4
+; AVX512:  LV: Found an estimated cost of 40 for VF 32 For instruction: %v4 = load float, ptr %in4, align 4
+; AVX512:  LV: Found an estimated cost of 40 for VF 32 For instruction: %v5 = load float, ptr %in5, align 4
+; AVX512:  LV: Found an estimated cost of 40 for VF 32 For instruction: %v6 = load float, ptr %in6, align 4
 ;
 entry:
   br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-3.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-3.ll
index 1ce36c309926c..c72235375b970 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-3.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-3.ll
@@ -28,6 +28,9 @@ define void @test() {
 ; SSE2:  LV: Found an estimated cost of 12 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
 ; SSE2:  LV: Found an estimated cost of 12 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8
 ; SSE2:  LV: Found an estimated cost of 12 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8
+; SSE2:  LV: Found an estimated cost of 24 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2:  LV: Found an estimated cost of 24 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8
+; SSE2:  LV: Found an estimated cost of 24 for VF 16 For instruction: %v2 = load double, ptr %in2, align 8
 ;
 ; AVX1-LABEL: 'test'
 ; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
@@ -48,6 +51,9 @@ define void @test() {
 ; AVX1:  LV: Found an estimated cost of 28 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8
 ; AVX1:  LV: Found an estimated cost of 28 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8
 ; AVX1:  LV: Found an estimated cost of 28 for VF 16 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX1:  LV: Found an estimated cost of 56 for VF 32 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1:  LV: Found an estimated cost of 56 for VF 32 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1:  LV: Found an estimated cost of 56 for VF 32 For instruction: %v2 = load double, ptr %in2, align 8
 ;
 ; AVX2-LABEL: 'test'
 ; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
@@ -94,6 +100,9 @@ define void @test() {
 ; AVX512:  LV: Found an estimated cost of 40 for VF 32 For instruction: %v0 = load double, ptr %in0, align 8
 ; AVX512:  LV: Found an estimated cost of 40 for VF 32 For instruction: %v1 = load double, ptr %in1, align 8
 ; AVX512:  LV: Found an estimated cost of 40 for VF 32 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512:  LV: Found an estimated cost of 80 for VF 64 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 80 for VF 64 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512:  LV: Found an estimated cost of 80 for VF 64 For instruction: %v2 = load double, ptr %in2, align 8
 ;
 entry:
   br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-5.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-5.ll
index 9939cee37ac36..46c64c289e7c4 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-5.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-5.ll
@@ -33,6 +33,16 @@ define void @test() {
 ; SSE2:  LV: Found an estimated cost of 6 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8
 ; SSE2:  LV: Found an estimated cost of 6 for VF 4 For instruction: %v3 = load double, ptr %in3, align 8
 ; SSE2:  LV: Found an estimated cost of 6 for VF 4 For instruction: %v4 = load double, ptr %in4, align 8
+; SSE2:  LV: Found an estimated cost of 12 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2:  LV: Found an estimated cost of 12 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8
+; SSE2:  LV: Found an estimated cost of 12 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8
+; SSE2:  LV: Found an estimated cost of 12 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8
+; SSE2:  LV: Found an estimated cost of 12 for VF 8 For instruction: %v4 = load double, ptr %in4, align 8
+; SSE2:  LV: Found an estimated cost of 24 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2:  LV: Found an estimated cost of 24 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8
+; SSE2:  LV: Found an estimated cost of 24 for VF 16 For instruction: %v2 = load double, ptr %in2, align 8
+; SSE2:  LV: Found an estimated cost of 24 for VF 16 For instruction: %v3 = load double, ptr %in3, align 8
+; SSE2:  LV: Found an estimated cost of 24 for VF 16 For instruction: %v4 = load double, ptr %in4, align 8
 ;
 ; AVX1-LABEL: 'test'
 ; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
@@ -60,6 +70,16 @@ define void @test() {
 ; AVX1:  LV: Found an estimated cost of 14 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8
 ; AVX1:  LV: Found an estimated cost of 14 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8
 ; AVX1:  LV: Found an estimated cost of 14 for VF 8 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX1:  LV: Found an estimated cost of 28 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1:  LV: Found an estimated cost of 28 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1:  LV: Found an estimated cost of 28 for VF 16 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX1:  LV: Found an estimated cost of 28 for VF 16 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX1:  LV: Found an estimated cost of 28 for VF 16 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX1:  LV: Found an estimated cost of 56 for VF 32 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1:  LV: Found an estimated cost of 56 for VF 32 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1:  LV: Found an estimated cost of 56 for VF 32 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX1:  LV: Found an estimated cost of 56 for VF 32 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX1:  LV: Found an estimated cost of 56 for VF 32 For instruction: %v4 = load double, ptr %in4, align 8
 ;
 ; AVX2-LABEL: 'test'
 ; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-7.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-7.ll
index a8ec595bc4df5..c26a732e01bac 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-7.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-7.ll
@@ -41,6 +41,20 @@ define void @test() {
 ; SSE2:  LV: Found an estimated cost of 6 for VF 4 For instruction: %v4 = load double, ptr %in4, align 8
 ; SSE2:  LV: Found an estimated cost of 6 for VF 4 For instruction: %v5 = load double, ptr %in5, align 8
 ; SSE2:  LV: Found an estimated cost of 6 for VF 4 For instruction: %v6 = load double, ptr %in6, align 8
+; SSE2:  LV: Found an estimated cost of 12 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2:  LV: Found an estimated cost of 12 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8
+; SSE2:  LV: Found an estimated cost of 12 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8
+; SSE2:  LV: Found an estimated cost of 12 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8
+; SSE2:  LV: Found an estimated cost of 12 for VF 8 For instruction: %v4 = load double, ptr %in4, align 8
+; SSE2:  LV: Found an estimated cost of 12 for VF 8 For instruction: %v5 = load double, ptr %in5, align 8
+; SSE2:  LV: Found an estimated cost of 12 for VF 8 For instruction: %v6 = load double, ptr %in6, align 8
+; SSE2:  LV: Found an estimated cost of 24 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2:  LV: Found an estimated cost of 24 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8
+; SSE2:  LV: Found an estimated cost of 24 for VF 16 For instruction: %v2 = load double, ptr %in2, align 8
+; SSE2:  LV: Found an estimated cost of 24 for VF 16 For instruction: %v3 = load double, ptr %in3, align 8
+; SSE2:  LV: Found an estimated cost of 24 for VF 16 For instruction: %v4 = load double, ptr %in4, align 8
+; SSE2:  LV: Found an estimated cost of 24 for VF 16 For instruction: %v5 = load double, ptr %in5, align 8
+; SSE2:  LV: Found an estimated cost of 24 for VF 16 For instruction: %v6 = load double, ptr %in6, align 8
 ;
 ; AVX1-LABEL: 'test'
 ; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
@@ -78,6 +92,20 @@ define void @test() {
 ; AVX1:  LV: Found an estimated cost of 14 for VF 8 For instruction: %v4 = load double, ptr %in4, align 8
 ; AVX1:  LV: Found an estimated cost of 14 for VF 8 For instruction: %v5 = load double, ptr %in5, align 8
 ; AVX1:  LV: Found an estimated cost of 14 for VF 8 For instruction: %v6 = load double, ptr %in6, align 8
+; AVX1:  LV: Found an estimated cost of 28 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1:  LV: Found an estimated cost of 28 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1:  LV: Found an estimated cost of 28 for VF 16 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX1:  LV: Found an estimated cost of 28 for VF 16 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX1:  LV: Found an estimated cost of 28 for VF 16 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX1:  LV: Found an estimated cost of 28 for VF 16 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX1:  LV: Found an estimated cost of 28 for VF 16 For instruction: %v6 = load double, ptr %in6, align 8
+; AVX1:  LV: Found an estimated cost of 56 for VF 32 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1:  LV: Found an estimated cost of 56 for VF 32 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1:  LV: Found an estimated cost of 56 for VF 32 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX1:  LV: Found an estimated cost of 56 for VF 32 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX1:  LV: Found an estimated cost of 56 for VF 32 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX1:  LV: Found an estimated cost of 56 for VF 32 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX1:  LV: Found an estimated cost of 56 for VF 32 For instruction: %v6 = load double, ptr %in6, align 8
 ;
 ; AVX2-LABEL: 'test'
 ; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
@@ -173,6 +201,13 @@ define void @test() {
 ; AVX512:  LV: Found an estimated cost of 20 for VF 16 For instruction: %v4 = load double, ptr %in4, align 8
 ; AVX512:  LV: Found an estimated cost of 20 for VF 16 For instruction: %v5 = load double, ptr %in5, align 8
 ; AVX512:  LV: Found an estimated cost of 20 for VF 16 For instruction: %v6 = load double, ptr %in6, align 8
+; AVX512:  LV: Found an estimated cost of 40 for VF 32 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 40 for VF 32 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512:  LV: Found an estimated cost of 40 for VF 32 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512:  LV: Found an estimated cost of 40 for VF 32 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512:  LV: Found an estimated cost of 40 for VF 32 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX512:  LV: Found an estimated cost of 40 for VF 32 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX512:  LV: Found an estimated cost of 40 for VF 32 For instruction: %v6 = load double, ptr %in6, align 8
 ;
 entry:
   br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-4-indices-012u.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-4-indices-012u.ll
index c16c693080327..3a60a5be7b281 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-4-indices-012u.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-4-indices-012u.ll
@@ -46,6 +46,7 @@ define void @test() {
 ; AVX512:  LV: Found an estimated cost of 26/4 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
 ; AVX512:  LV: Found an estimated cost of 70/4 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4
 ; AVX512:  LV: Found an estimated cost of 71 for VF 32 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 80 for VF 64 For instruction: %v0 = load i32, ptr %in0, align 4
 ;
 entry:
   br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-5.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-5.ll
index 53452dd6b2ad1..19d130cb8407c 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-5.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-5.ll
@@ -38,6 +38,11 @@ define void @test() {
 ; SSE2:  LV: Found an estimated cost of 22 for VF 8 For instruction: %v2 = load i32, ptr %in2, align 4
 ; SSE2:  LV: Found an estimated cost of 22 for VF 8 For instruction: %v3 = load i32, ptr %in3, align 4
 ; SSE2:  LV: Found an estimated cost of 22 for VF 8 For instruction: %v4 = load i32, ptr %in4, align 4
+; SSE2:  LV: Found an estimated cost of 44 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4
+; SSE2:  LV: Found an estimated cost of 44 for VF 16 For instruction: %v1 = load i32, ptr %in1, align 4
+; SSE2:  LV: Found an estimated cost of 44 for VF 16 For instruction: %v2 = load i32, ptr %in2, align 4
+; SSE2:  LV: Found an estimated cost of 44 for VF 16 For instruction: %v3 = load i32, ptr %in3, align 4
+; SSE2:  LV: Found an estimated cost of 44 for VF 16 For instruction: %v4 = load i32, ptr %in4, align 4
 ;
 ; AVX1-LABEL: 'test'
 ; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
@@ -70,6 +75,11 @@ define void @test() {
 ; AVX1:  LV: Found an estimated cost of 34 for VF 16 For instruction: %v2 = load i32, ptr %in2, align 4
 ; AVX1:  LV: Found an estimated cost of 34 for VF 16 For instruction: %v3 = load i32, ptr %in3, align 4
 ; AVX1:  LV: Found an estimated cost of 34 for VF 16 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX1:  LV: Found an estimated cost of 68 for VF 32 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1:  LV: Found an estimated cost of 68 for VF 32 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX1:  LV: Found an estimated cost of 68 for VF 32 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX1:  LV: Found an estimated cost of 68 for VF 32 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX1:  LV: Found an estimated cost of 68 for VF 32 For instruction: %v4 = load i32, ptr %in4, align 4
 ;
 ; AVX2-LABEL: 'test'
 ; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-7.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-7.ll
index 0060e5a708f6c..12e7f37b8f043 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-7.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-7.ll
@@ -48,6 +48,13 @@ define void @test() {
 ; SSE2:  LV: Found an estimated cost of 22 for VF 8 For instruction: %v4 = load i32, ptr %in4, align 4
 ; SSE2:  LV: Found an estimated cost of 22 for VF 8 For instruction: %v5 = load i32, ptr %in5, align 4
 ; SSE2:  LV: Found an estimated cost of 22 for VF 8 For instruction: %v6 = load i32, ptr %in6, align 4
+; SSE2:  LV: Found an estimated cost of 44 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4
+; SSE2:  LV: Found an estimated cost of 44 for VF 16 For instruction: %v1 = load i32, ptr %in1, align 4
+; SSE2:  LV: Found an estimated cost of 44 for VF 16 For instruction: %v2 = load i32, ptr %in2, align 4
+; SSE2:  LV: Found an estimated cost of 44 for VF 16 For instruction: %v3 = load i32, ptr %in3, align 4
+; SSE2:  LV: Found an estimated cost of 44 for VF 16 For instruction: %v4 = load i32, ptr %in4, align 4
+; SSE2:  LV: Found an estimated cost of 44 for VF 16 For instruction: %v5 = load i32, ptr %in5, align 4
+; SSE2:  LV: Found an estimated cost of 44 for VF 16 For instruction: %v6 = load i32, ptr %in6, align 4
 ;
 ; AVX1-LABEL: 'test'
 ; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
@@ -92,6 +99,13 @@ define void @test() {
 ; AVX1:  LV: Found an estimated cost of 34 for VF 16 For instruction: %v4 = load i32, ptr %in4, align 4
 ; AVX1:  LV: Found an estimated cost of 34 for VF 16 For instruction: %v5 = load i32, ptr %in5, align 4
 ; AVX1:  LV: Found an estimated cost of 34 for VF 16 For instruction: %v6 = load i32, ptr %in6, align 4
+; AVX1:  LV: Found an estimated cost of 68 for VF 32 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1:  LV: Found an estimated cost of 68 for VF 32 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX1:  LV: Found an estimated cost of 68 for VF 32 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX1:  LV: Found an estimated cost of 68 for VF 32 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX1:  LV: Found an estimated cost of 68 for VF 32 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX1:  LV: Found an estimated cost of 68 for VF 32 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX1:  LV: Found an estimated cost of 68 for VF 32 For instruction: %v6 = load i32, ptr %in6, align 4
 ;
 ; AVX2-LABEL: 'test'
 ; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
@@ -187,6 +201,13 @@ define void @test() {
 ; AVX512:  LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i32, ptr %in4, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i32, ptr %in5, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i32, ptr %in6, align 4
+; AVX512:  LV: Found an estimated cost of 40 for VF 32 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 40 for VF 32 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX512:  LV: Found an estimated cost of 40 for VF 32 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX512:  LV: Found an estimated cost of 40 for VF 32 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX512:  LV: Found an estimated cost of 40 for VF 32 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX512:  LV: Found an estimated cost of 40 for VF 32 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX512:  LV: Found an estimated cost of 40 for VF 32 For instruction: %v6 = load i32, ptr %in6, align 4
 ;
 entry:
   br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i64-stride-3.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i64-stride-3.ll
index fc9b9a330e78f..0013efda39e91 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i64-stride-3.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i64-stride-3.ll
@@ -28,6 +28,9 @@ define void @test() {
 ; SSE2:  LV: Found an estimated cost of 20 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
 ; SSE2:  LV: Found an estimated cost of 20 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8
 ; SSE2:  LV: Found an estimated cost of 20 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8
+; SSE2:  LV: Found an estimated cost of 40 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2:  LV: Found an estimated cost of 40 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8
+; SSE2:  LV: Found an estimated cost of 40 for VF 16 For instruction: %v2 = load i64, ptr %in2, align 8
 ;
 ; AVX1-LABEL: 'test'
 ; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
@@ -48,6 +51,9 @@ define void @test() {
 ; AVX1:  LV: Found an estimated cost of 36 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8
 ; AVX1:  LV: Found an estimated cost of 36 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8
 ; AVX1:  LV: Found an estimated cost of 36 for VF 16 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX1:  LV: Found an estimated cost of 72 for VF 32 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1:  LV: Found an estimated cost of 72 for VF 32 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX1:  LV: Found an estimated cost of 72 for VF 32 For instruction: %v2 = load i64, ptr %in2, align 8
 ;
 ; AVX2-LABEL: 'test'
 ; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
@@ -94,6 +100,9 @@ define void @test() {
 ; AVX512:  LV: Found an estimated cost of 40 for VF 32 For instruction: %v0 = load i64, ptr %in0, align 8
 ; AVX512:  LV: Found an estimated cost of 40 for VF 32 For instruction: %v1 = load i64, ptr %in1, align 8
 ; AVX512:  LV: Found an estimated cost of 40 for VF 32 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512:  LV: Found an estimated cost of 80 for VF 64 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 80 for VF 64 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512:  LV: Found an estimated cost of 80 for VF 64 For instruction: %v2 = load i64, ptr %in2, align 8
 ;
 entry:
   br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i64-stride-5.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i64-stride-5.ll
index 0718b4da81166..4edb3959a524b 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i64-stride-5.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i64-stride-5.ll
@@ -33,6 +33,16 @@ define void @test() {
 ; SSE2:  LV: Found an estimated cost of 10 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8
 ; SSE2:  LV: Found an estimated cost of 10 for VF 4 For instruction: %v3 = load i64, ptr %in3, align 8
 ; SSE2:  LV: Found an estimated cost of 10 for VF 4 For instruction: %v4 = load i64, ptr %in4, align 8
+; SSE2:  LV: Found an estimated cost of 20 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2:  LV: Found an estimated cost of 20 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8
+; SSE2:  LV: Found an estimated cost of 20 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8
+; SSE2:  LV: Found an estimated cost of 20 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8
+; SSE2:  LV: Found an estimated cost of 20 for VF 8 For instruction: %v4 = load i64, ptr %in4, align 8
+; SSE2:  LV: Found an estimated cost of 40 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2:  LV: Found an estimated cost of 40 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8
+; SSE2:  LV: Found an estimated cost of 40 for VF 16 For instruction: %v2 = load i64, ptr %in2, align 8
+; SSE2:  LV: Found an estimated cost of 40 for VF 16 For instruction: %v3 = load i64, ptr %in3, align 8
+; SSE2:  LV: Found an estimated cost of 40 for VF 16 For instruction: %v4 = load i64, ptr %in4, align 8
 ;
 ; AVX1-LABEL: 'test'
 ; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
@@ -60,6 +70,16 @@ define void @test() {
 ; AVX1:  LV: Found an estimated cost of 18 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8
 ; AVX1:  LV: Found an estimated cost of 18 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8
 ; AVX1:  LV: Found an estimated cost of 18 for VF 8 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX1:  LV: Found an estimated cost of 36 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1:  LV: Found an estimated cost of 36 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX1:  LV: Found an estimated cost of 36 for VF 16 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX1:  LV: Found an estimated cost of 36 for VF 16 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX1:  LV: Found an estimated cost of 36 for VF 16 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX1:  LV: Found an estimated cost of 72 for VF 32 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1:  LV: Found an estimated cost of 72 for VF 32 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX1:  LV: Found an estimated cost of 72 for VF 32 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX1:  LV: Found an estimated cost of 72 for VF 32 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX1:  LV: Found an estimated cost of 72 for VF 32 For instruction: %v4 = load i64, ptr %in4, align 8
 ;
 ; AVX2-LABEL: 'test'
 ; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
@@ -87,6 +107,16 @@ define void @test() {
 ; AVX2:  LV: Found an estimated cost of 18 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8
 ; AVX2:  LV: Found an estimated cost of 18 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8
 ; AVX2:  LV: Found an estimated cost of 18 for VF 8 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX2:  LV: Found an estimated cost of 36 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2:  LV: Found an estimated cost of 36 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2:  LV: Found an estimated cost of 36 for VF 16 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2:  LV: Found an estimated cost of 36 for VF 16 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX2:  LV: Found an estimated cost of 36 for VF 16 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX2:  LV: Found an estimated cost of 72 for VF 32 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2:  LV: Found an estimated cost of 72 for VF 32 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2:  LV: Found an estimated cost of 72 for VF 32 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2:  LV: Found an estimated cost of 72 for VF 32 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX2:  LV: Found an estimated cost of 72 for VF 32 For instruction: %v4 = load i64, ptr %in4, align 8
 ;
 ; AVX512-LABEL: 'test'
 ; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i64-stride-7.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i64-stride-7.ll
index 6fb69a6456e21..2fbfc8fd9f746 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i64-stride-7.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i64-stride-7.ll
@@ -41,6 +41,20 @@ define void @test() {
 ; SSE2:  LV: Found an estimated cost of 10 for VF 4 For instruction: %v4 = load i64, ptr %in4, align 8
 ; SSE2:  LV: Found an estimated cost of 10 for VF 4 For instruction: %v5 = load i64, ptr %in5, align 8
 ; SSE2:  LV: Found an estimated cost of 10 for VF 4 For instruction: %v6 = load i64, ptr %in6, align 8
+; SSE2:  LV: Found an estimated cost of 20 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2:  LV: Found an estimated cost of 20 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8
+; SSE2:  LV: Found an estimated cost of 20 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8
+; SSE2:  LV: Found an estimated cost of 20 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8
+; SSE2:  LV: Found an estimated cost of 20 for VF 8 For instruction: %v4 = load i64, ptr %in4, align 8
+; SSE2:  LV: Found an estimated cost of 20 for VF 8 For instruction: %v5 = load i64, ptr %in5, align 8
+; SSE2:  LV: Found an estimated cost of 20 for VF 8 For instruction: %v6 = load i64, ptr %in6, align 8
+; SSE2:  LV: Found an estimated cost of 40 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2:  LV: Found an estimated cost of 40 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8
+; SSE2:  LV: Found an estimated cost of 40 for VF 16 For instruction: %v2 = load i64, ptr %in2, align 8
+; SSE2:  LV: Found an estimated cost of 40 for VF 16 For instruction: %v3 = load i64, ptr %in3, align 8
+; SSE2:  LV: Found an estimated cost of 40 for VF 16 For instruction: %v4 = load i64, ptr %in4, align 8
+; SSE2:  LV: Found an estimated cost of 40 for VF 16 For instruction: %v5 = load i64, ptr %in5, align 8
+; SSE2:  LV: Found an estimated cost of 40 for VF 16 For instruction: %v6 = load i64, ptr %in6, align 8
 ;
 ; AVX1-LABEL: 'test'
 ; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
@@ -78,6 +92,20 @@ define void @test() {
 ; AVX1:  LV: Found an estimated cost of 18 for VF 8 For instruction: %v4 = load i64, ptr %in4, align 8
 ; AVX1:  LV: Found an estimated cost of 18 for VF 8 For instruction: %v5 = load i64, ptr %in5, align 8
 ; AVX1:  LV: Found an estimated cost of 18 for VF 8 For instruction: %v6 = load i64, ptr %in6, align 8
+; AVX1:  LV: Found an estimated cost of 36 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1:  LV: Found an estimated cost of 36 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX1:  LV: Found an estimated cost of 36 for VF 16 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX1:  LV: Found an estimated cost of 36 for VF 16 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX1:  LV: Found an estimated cost of 36 for VF 16 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX1:  LV: Found an estimated cost of 36 for VF 16 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX1:  LV: Found an estimated cost of 36 for VF 16 For instruction: %v6 = load i64, ptr %in6, align 8
+; AVX1:  LV: Found an estimated cost of 72 for VF 32 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1:  LV: Found an estimated cost of 72 for VF 32 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX1:  LV: Found an estimated cost of 72 for VF 32 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX1:  LV: Found an estimated cost of 72 for VF 32 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX1:  LV: Found an estimated cost of 72 for VF 32 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX1:  LV: Found an estimated cost of 72 for VF 32 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX1:  LV: Found an estimated cost of 72 for VF 32 For instruction: %v6 = load i64, ptr %in6, align 8
 ;
 ; AVX2-LABEL: 'test'
 ; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
@@ -173,6 +201,13 @@ define void @test() {
 ; AVX512:  LV: Found an estimated cost of 20 for VF 16 For instruction: %v4 = load i64, ptr %in4, align 8
 ; AVX512:  LV: Found an estimated cost of 20 for VF 16 For instruction: %v5 = load i64, ptr %in5, align 8
 ; AVX512:  LV: Found an estimated cost of 20 for VF 16 For instruction: %v6 = load i64, ptr %in6, align 8
+; AVX512:  LV: Found an estimated cost of 40 for VF 32 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 40 for VF 32 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512:  LV: Found an estimated cost of 40 for VF 32 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512:  LV: Found an estimated cost of 40 for VF 32 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX512:  LV: Found an estimated cost of 40 for VF 32 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX512:  LV: Found an estimated cost of 40 for VF 32 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX512:  LV: Found an estimated cost of 40 for VF 32 For instruction: %v6 = load i64, ptr %in6, align 8
 ;
 entry:
   br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-scatter-i32-with-i8-index.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-scatter-i32-with-i8-index.ll
index e8e75ba3383cb..c1f04dbac2d82 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-scatter-i32-with-i8-index.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-scatter-i32-with-i8-index.ll
@@ -18,6 +18,7 @@ target triple = "x86_64-unknown-linux-gnu"
 define void @test() {
 ; SSE2-LABEL: 'test'
 ; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %valB, ptr %out, align 4
+; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %valB, ptr %out, align 4
 ; SSE2:  LV: Found an estimated cost of 10/4 for VF 2 For instruction: store i32 %valB, ptr %out, align 4
 ; SSE2:  LV: Found an estimated cost of 22/4 for VF 4 For instruction: store i32 %valB, ptr %out, align 4
 ; SSE2:  LV: Found an estimated cost of 11 for VF 8 For instruction: store i32 %valB, ptr %out, align 4
@@ -25,6 +26,7 @@ define void @test() {
 ;
 ; SSE42-LABEL: 'test'
 ; SSE42:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %valB, ptr %out, align 4
+; SSE42:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %valB, ptr %out, align 4
 ; SSE42:  LV: Found an estimated cost of 2 for VF 2 For instruction: store i32 %valB, ptr %out, align 4
 ; SSE42:  LV: Found an estimated cost of 4 for VF 4 For instruction: store i32 %valB, ptr %out, align 4
 ; SSE42:  LV: Found an estimated cost of 8 for VF 8 For instruction: store i32 %valB, ptr %out, align 4
@@ -32,6 +34,7 @@ define void @test() {
 ;
 ; AVX1-LABEL: 'test'
 ; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %valB, ptr %out, align 4
+; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %valB, ptr %out, align 4
 ; AVX1:  LV: Found an estimated cost of 2 for VF 2 For instruction: store i32 %valB, ptr %out, align 4
 ; AVX1:  LV: Found an estimated cost of 4 for VF 4 For instruction: store i32 %valB, ptr %out, align 4
 ; AVX1:  LV: Found an estimated cost of 34/4 for VF 8 For instruction: store i32 %valB, ptr %out, align 4
@@ -40,6 +43,7 @@ define void @test() {
 ;
 ; AVX2-LABEL: 'test'
 ; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %valB, ptr %out, align 4
+; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %valB, ptr %out, align 4
 ; AVX2:  LV: Found an estimated cost of 2 for VF 2 For instruction: store i32 %valB, ptr %out, align 4
 ; AVX2:  LV: Found an estimated cost of 4 for VF 4 For instruction: store i32 %valB, ptr %out, align 4
 ; AVX2:  LV: Found an estimated cost of 34/4 for VF 8 For instruction: store i32 %valB, ptr %out, align 4
@@ -48,6 +52,7 @@ define void @test() {
 ;
 ; AVX512-LABEL: 'test'
 ; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %valB, ptr %out, align 4
+; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %valB, ptr %out, align 4
 ; AVX512:  LV: Found an estimated cost of 5 for VF 2 For instruction: store i32 %valB, ptr %out, align 4
 ; AVX512:  LV: Found an estimated cost of 42/4 for VF 4 For instruction: store i32 %valB, ptr %out, align 4
 ; AVX512:  LV: Found an estimated cost of 10 for VF 8 For instruction: store i32 %valB, ptr %out, align 4
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-scatter-i64-with-i8-index.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-scatter-i64-with-i8-index.ll
index c3b5fea5d650f..a1bbc4006eafa 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-scatter-i64-with-i8-index.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-scatter-i64-with-i8-index.ll
@@ -2,8 +2,8 @@
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse4.2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=SSE42
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=AVX1
-; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2,-fast-gather --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=AVX2,AVX2-NOFAST
-; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2,+fast-gather --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=AVX2
+; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2,-fast-gather --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=AVX2-NOFAST
+; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2,+fast-gather --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=AVX2-FAST
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512bw --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=AVX512
 
 ; REQUIRES: asserts
@@ -18,6 +18,7 @@ target triple = "x86_64-unknown-linux-gnu"
 define void @test() {
 ; SSE2-LABEL: 'test'
 ; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %valB, ptr %out, align 8
+; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %valB, ptr %out, align 8
 ; SSE2:  LV: Found an estimated cost of 10/4 for VF 2 For instruction: store i64 %valB, ptr %out, align 8
 ; SSE2:  LV: Found an estimated cost of 5 for VF 4 For instruction: store i64 %valB, ptr %out, align 8
 ; SSE2:  LV: Found an estimated cost of 10 for VF 8 For instruction: store i64 %valB, ptr %out, align 8
@@ -25,6 +26,7 @@ define void @test() {
 ;
 ; SSE42-LABEL: 'test'
 ; SSE42:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %valB, ptr %out, align 8
+; SSE42:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %valB, ptr %out, align 8
 ; SSE42:  LV: Found an estimated cost of 2 for VF 2 For instruction: store i64 %valB, ptr %out, align 8
 ; SSE42:  LV: Found an estimated cost of 4 for VF 4 For instruction: store i64 %valB, ptr %out, align 8
 ; SSE42:  LV: Found an estimated cost of 8 for VF 8 For instruction: store i64 %valB, ptr %out, align 8
@@ -39,16 +41,26 @@ define void @test() {
 ; AVX1:  LV: Found an estimated cost of 18 for VF 16 For instruction: store i64 %valB, ptr %out, align 8
 ; AVX1:  LV: Found an estimated cost of 36 for VF 32 For instruction: store i64 %valB, ptr %out, align 8
 ;
-; AVX2-LABEL: 'test'
-; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %valB, ptr %out, align 8
-; AVX2:  LV: Found an estimated cost of 2 for VF 2 For instruction: store i64 %valB, ptr %out, align 8
-; AVX2:  LV: Found an estimated cost of 18/4 for VF 4 For instruction: store i64 %valB, ptr %out, align 8
-; AVX2:  LV: Found an estimated cost of 9 for VF 8 For instruction: store i64 %valB, ptr %out, align 8
-; AVX2:  LV: Found an estimated cost of 18 for VF 16 For instruction: store i64 %valB, ptr %out, align 8
-; AVX2-NOFAST: LV: Found an estimated cost of 36 for VF 32 For instruction: store i64 %valB, ptr %out, align 8
+; AVX2-NOFAST-LABEL: 'test'
+; AVX2-NOFAST:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %valB, ptr %out, align 8
+; AVX2-NOFAST:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %valB, ptr %out, align 8
+; AVX2-NOFAST:  LV: Found an estimated cost of 2 for VF 2 For instruction: store i64 %valB, ptr %out, align 8
+; AVX2-NOFAST:  LV: Found an estimated cost of 18/4 for VF 4 For instruction: store i64 %valB, ptr %out, align 8
+; AVX2-NOFAST:  LV: Found an estimated cost of 9 for VF 8 For instruction: store i64 %valB, ptr %out, align 8
+; AVX2-NOFAST:  LV: Found an estimated cost of 18 for VF 16 For instruction: store i64 %valB, ptr %out, align 8
+; AVX2-NOFAST:  LV: Found an estimated cost of 36 for VF 32 For instruction: store i64 %valB, ptr %out, align 8
+;
+; AVX2-FAST-LABEL: 'test'
+; AVX2-FAST:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %valB, ptr %out, align 8
+; AVX2-FAST:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %valB, ptr %out, align 8
+; AVX2-FAST:  LV: Found an estimated cost of 2 for VF 2 For instruction: store i64 %valB, ptr %out, align 8
+; AVX2-FAST:  LV: Found an estimated cost of 18/4 for VF 4 For instruction: store i64 %valB, ptr %out, align 8
+; AVX2-FAST:  LV: Found an estimated cost of 9 for VF 8 For instruction: store i64 %valB, ptr %out, align 8
+; AVX2-FAST:  LV: Found an estimated cost of 18 for VF 16 For instruction: store i64 %valB, ptr %out, align 8
 ;
 ; AVX512-LABEL: 'test'
 ; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %valB, ptr %out, align 8
+; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %valB, ptr %out, align 8
 ; AVX512:  LV: Found an estimated cost of 5 for VF 2 For instruction: store i64 %valB, ptr %out, align 8
 ; AVX512:  LV: Found an estimated cost of 11 for VF 4 For instruction: store i64 %valB, ptr %out, align 8
 ; AVX512:  LV: Found an estimated cost of 10 for VF 8 For instruction: store i64 %valB, ptr %out, align 8
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i16.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i16.ll
index 5c53ebf358e32..4aaf4fee8e56d 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i16.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i16.ll
@@ -17,6 +17,7 @@ target triple = "x86_64-unknown-linux-gnu"
 define void @test(ptr %C) {
 ; SSE-LABEL: 'test'
 ; SSE:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %valB, ptr %out, align 2
+; SSE:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %valB, ptr %out, align 2
 ; SSE:  LV: Found an estimated cost of 2 for VF 2 For instruction: store i16 %valB, ptr %out, align 2
 ; SSE:  LV: Found an estimated cost of 4 for VF 4 For instruction: store i16 %valB, ptr %out, align 2
 ; SSE:  LV: Found an estimated cost of 8 for VF 8 For instruction: store i16 %valB, ptr %out, align 2
@@ -24,6 +25,7 @@ define void @test(ptr %C) {
 ;
 ; AVX1-LABEL: 'test'
 ; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %valB, ptr %out, align 2
+; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %valB, ptr %out, align 2
 ; AVX1:  LV: Found an estimated cost of 2 for VF 2 For instruction: store i16 %valB, ptr %out, align 2
 ; AVX1:  LV: Found an estimated cost of 4 for VF 4 For instruction: store i16 %valB, ptr %out, align 2
 ; AVX1:  LV: Found an estimated cost of 8 for VF 8 For instruction: store i16 %valB, ptr %out, align 2
@@ -32,6 +34,7 @@ define void @test(ptr %C) {
 ;
 ; AVX2-LABEL: 'test'
 ; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %valB, ptr %out, align 2
+; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %valB, ptr %out, align 2
 ; AVX2:  LV: Found an estimated cost of 2 for VF 2 For instruction: store i16 %valB, ptr %out, align 2
 ; AVX2:  LV: Found an estimated cost of 4 for VF 4 For instruction: store i16 %valB, ptr %out, align 2
 ; AVX2:  LV: Found an estimated cost of 8 for VF 8 For instruction: store i16 %valB, ptr %out, align 2
@@ -40,6 +43,7 @@ define void @test(ptr %C) {
 ;
 ; AVX512-LABEL: 'test'
 ; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %valB, ptr %out, align 2
+; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %valB, ptr %out, align 2
 ; AVX512:  LV: Found an estimated cost of 2 for VF 2 For instruction: store i16 %valB, ptr %out, align 2
 ; AVX512:  LV: Found an estimated cost of 2 for VF 4 For instruction: store i16 %valB, ptr %out, align 2
 ; AVX512:  LV: Found an estimated cost of 1 for VF 8 For instruction: store i16 %valB, ptr %out, align 2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i32.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i32.ll
index d530337f5c4c1..4c9d4d68d07e7 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i32.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i32.ll
@@ -17,6 +17,7 @@ target triple = "x86_64-unknown-linux-gnu"
 define void @test(ptr %C) {
 ; SSE2-LABEL: 'test'
 ; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %valB, ptr %out, align 4
+; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %valB, ptr %out, align 4
 ; SSE2:  LV: Found an estimated cost of 10/4 for VF 2 For instruction: store i32 %valB, ptr %out, align 4
 ; SSE2:  LV: Found an estimated cost of 22/4 for VF 4 For instruction: store i32 %valB, ptr %out, align 4
 ; SSE2:  LV: Found an estimated cost of 11 for VF 8 For instruction: store i32 %valB, ptr %out, align 4
@@ -24,6 +25,7 @@ define void @test(ptr %C) {
 ;
 ; SSE42-LABEL: 'test'
 ; SSE42:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %valB, ptr %out, align 4
+; SSE42:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %valB, ptr %out, align 4
 ; SSE42:  LV: Found an estimated cost of 2 for VF 2 For instruction: store i32 %valB, ptr %out, align 4
 ; SSE42:  LV: Found an estimated cost of 4 for VF 4 For instruction: store i32 %valB, ptr %out, align 4
 ; SSE42:  LV: Found an estimated cost of 8 for VF 8 For instruction: store i32 %valB, ptr %out, align 4
@@ -31,6 +33,7 @@ define void @test(ptr %C) {
 ;
 ; AVX1-LABEL: 'test'
 ; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %valB, ptr %out, align 4
+; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %valB, ptr %out, align 4
 ; AVX1:  LV: Found an estimated cost of 9 for VF 2 For instruction: store i32 %valB, ptr %out, align 4
 ; AVX1:  LV: Found an estimated cost of 8 for VF 4 For instruction: store i32 %valB, ptr %out, align 4
 ; AVX1:  LV: Found an estimated cost of 8 for VF 8 For instruction: store i32 %valB, ptr %out, align 4
@@ -39,6 +42,7 @@ define void @test(ptr %C) {
 ;
 ; AVX2-LABEL: 'test'
 ; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %valB, ptr %out, align 4
+; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %valB, ptr %out, align 4
 ; AVX2:  LV: Found an estimated cost of 9 for VF 2 For instruction: store i32 %valB, ptr %out, align 4
 ; AVX2:  LV: Found an estimated cost of 8 for VF 4 For instruction: store i32 %valB, ptr %out, align 4
 ; AVX2:  LV: Found an estimated cost of 8 for VF 8 For instruction: store i32 %valB, ptr %out, align 4
@@ -47,6 +51,7 @@ define void @test(ptr %C) {
 ;
 ; AVX512-LABEL: 'test'
 ; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %valB, ptr %out, align 4
+; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %valB, ptr %out, align 4
 ; AVX512:  LV: Found an estimated cost of 2 for VF 2 For instruction: store i32 %valB, ptr %out, align 4
 ; AVX512:  LV: Found an estimated cost of 1 for VF 4 For instruction: store i32 %valB, ptr %out, align 4
 ; AVX512:  LV: Found an estimated cost of 1 for VF 8 For instruction: store i32 %valB, ptr %out, align 4
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i64.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i64.ll
index efa2d0ef0d5fe..3592139a79f6a 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i64.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i64.ll
@@ -17,6 +17,7 @@ target triple = "x86_64-unknown-linux-gnu"
 define void @test(ptr %C) {
 ; SSE2-LABEL: 'test'
 ; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %valB, ptr %out, align 8
+; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %valB, ptr %out, align 8
 ; SSE2:  LV: Found an estimated cost of 10/4 for VF 2 For instruction: store i64 %valB, ptr %out, align 8
 ; SSE2:  LV: Found an estimated cost of 5 for VF 4 For instruction: store i64 %valB, ptr %out, align 8
 ; SSE2:  LV: Found an estimated cost of 10 for VF 8 For instruction: store i64 %valB, ptr %out, align 8
@@ -24,6 +25,7 @@ define void @test(ptr %C) {
 ;
 ; SSE42-LABEL: 'test'
 ; SSE42:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %valB, ptr %out, align 8
+; SSE42:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %valB, ptr %out, align 8
 ; SSE42:  LV: Found an estimated cost of 2 for VF 2 For instruction: store i64 %valB, ptr %out, align 8
 ; SSE42:  LV: Found an estimated cost of 4 for VF 4 For instruction: store i64 %valB, ptr %out, align 8
 ; SSE42:  LV: Found an estimated cost of 8 for VF 8 For instruction: store i64 %valB, ptr %out, align 8
@@ -31,6 +33,7 @@ define void @test(ptr %C) {
 ;
 ; AVX1-LABEL: 'test'
 ; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %valB, ptr %out, align 8
+; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %valB, ptr %out, align 8
 ; AVX1:  LV: Found an estimated cost of 8 for VF 2 For instruction: store i64 %valB, ptr %out, align 8
 ; AVX1:  LV: Found an estimated cost of 8 for VF 4 For instruction: store i64 %valB, ptr %out, align 8
 ; AVX1:  LV: Found an estimated cost of 16 for VF 8 For instruction: store i64 %valB, ptr %out, align 8
@@ -39,6 +42,7 @@ define void @test(ptr %C) {
 ;
 ; AVX2-LABEL: 'test'
 ; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %valB, ptr %out, align 8
+; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %valB, ptr %out, align 8
 ; AVX2:  LV: Found an estimated cost of 8 for VF 2 For instruction: store i64 %valB, ptr %out, align 8
 ; AVX2:  LV: Found an estimated cost of 8 for VF 4 For instruction: store i64 %valB, ptr %out, align 8
 ; AVX2:  LV: Found an estimated cost of 16 for VF 8 For instruction: store i64 %valB, ptr %out, align 8
@@ -47,6 +51,7 @@ define void @test(ptr %C) {
 ;
 ; AVX512-LABEL: 'test'
 ; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %valB, ptr %out, align 8
+; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %valB, ptr %out, align 8
 ; AVX512:  LV: Found an estimated cost of 1 for VF 2 For instruction: store i64 %valB, ptr %out, align 8
 ; AVX512:  LV: Found an estimated cost of 1 for VF 4 For instruction: store i64 %valB, ptr %out, align 8
 ; AVX512:  LV: Found an estimated cost of 1 for VF 8 For instruction: store i64 %valB, ptr %out, align 8
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i8.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i8.ll
index d880b7c464316..0b96adabd8de3 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i8.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i8.ll
@@ -17,6 +17,7 @@ target triple = "x86_64-unknown-linux-gnu"
 define void @test(ptr %C) {
 ; SSE2-LABEL: 'test'
 ; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %valB, ptr %out, align 1
+; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %valB, ptr %out, align 1
 ; SSE2:  LV: Found an estimated cost of 10/4 for VF 2 For instruction: store i8 %valB, ptr %out, align 1
 ; SSE2:  LV: Found an estimated cost of 22/4 for VF 4 For instruction: store i8 %valB, ptr %out, align 1
 ; SSE2:  LV: Found an estimated cost of 46/4 for VF 8 For instruction: store i8 %valB, ptr %out, align 1
@@ -24,6 +25,7 @@ define void @test(ptr %C) {
 ;
 ; SSE42-LABEL: 'test'
 ; SSE42:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %valB, ptr %out, align 1
+; SSE42:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %valB, ptr %out, align 1
 ; SSE42:  LV: Found an estimated cost of 2 for VF 2 For instruction: store i8 %valB, ptr %out, align 1
 ; SSE42:  LV: Found an estimated cost of 4 for VF 4 For instruction: store i8 %valB, ptr %out, align 1
 ; SSE42:  LV: Found an estimated cost of 8 for VF 8 For instruction: store i8 %valB, ptr %out, align 1
@@ -31,6 +33,7 @@ define void @test(ptr %C) {
 ;
 ; AVX1-LABEL: 'test'
 ; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %valB, ptr %out, align 1
+; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %valB, ptr %out, align 1
 ; AVX1:  LV: Found an estimated cost of 2 for VF 2 For instruction: store i8 %valB, ptr %out, align 1
 ; AVX1:  LV: Found an estimated cost of 4 for VF 4 For instruction: store i8 %valB, ptr %out, align 1
 ; AVX1:  LV: Found an estimated cost of 8 for VF 8 For instruction: store i8 %valB, ptr %out, align 1
@@ -39,6 +42,7 @@ define void @test(ptr %C) {
 ;
 ; AVX2-LABEL: 'test'
 ; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %valB, ptr %out, align 1
+; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %valB, ptr %out, align 1
 ; AVX2:  LV: Found an estimated cost of 2 for VF 2 For instruction: store i8 %valB, ptr %out, align 1
 ; AVX2:  LV: Found an estimated cost of 4 for VF 4 For instruction: store i8 %valB, ptr %out, align 1
 ; AVX2:  LV: Found an estimated cost of 8 for VF 8 For instruction: store i8 %valB, ptr %out, align 1
@@ -47,6 +51,7 @@ define void @test(ptr %C) {
 ;
 ; AVX512-LABEL: 'test'
 ; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %valB, ptr %out, align 1
+; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %valB, ptr %out, align 1
 ; AVX512:  LV: Found an estimated cost of 2 for VF 2 For instruction: store i8 %valB, ptr %out, align 1
 ; AVX512:  LV: Found an estimated cost of 2 for VF 4 For instruction: store i8 %valB, ptr %out, align 1
 ; AVX512:  LV: Found an estimated cost of 2 for VF 8 For instruction: store i8 %valB, ptr %out, align 1

>From 1a6237e2c2258bd758e609fe2f12aac6b9ea9920 Mon Sep 17 00:00:00 2001
From: bababuck <buchner.ryan at gmail.com>
Date: Mon, 9 Mar 2026 10:38:19 -0700
Subject: [PATCH 11/15] [Support] ScalingFactor -> CostGranularity

---
 llvm/include/llvm/Support/InstructionCost.h | 10 +++++-----
 llvm/lib/Support/InstructionCost.cpp        |  6 +++---
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/llvm/include/llvm/Support/InstructionCost.h b/llvm/include/llvm/Support/InstructionCost.h
index c6e0f7ca38863..eb7b03baa1a8e 100644
--- a/llvm/include/llvm/Support/InstructionCost.h
+++ b/llvm/include/llvm/Support/InstructionCost.h
@@ -60,7 +60,7 @@ class InstructionCost {
   }
 
   // Matches GCC, can use shift rather than multiply/divide to scale
-  static constexpr CostType ScalingFactor = 4;
+  static constexpr CostType CostGranularity = 4;
 
   static constexpr CostType MaxValue = std::numeric_limits<CostType>::max();
   static constexpr CostType MinValue = std::numeric_limits<CostType>::min();
@@ -72,7 +72,7 @@ class InstructionCost {
   InstructionCost(CostState) = delete;
   InstructionCost(CostType Val) : Value(), State(Valid) {
     InstructionCost::CostType Result;
-    if (MulOverflow(Val, ScalingFactor, Result)) {
+    if (MulOverflow(Val, CostGranularity, Result)) {
       if (Val > 0)
         Result = MaxValue;
       else
@@ -99,7 +99,7 @@ class InstructionCost {
   /// and comparisons.
   CostType getValue() const {
     assert(isValid());
-    return Value / ScalingFactor;
+    return Value / CostGranularity;
   }
 
   /// For all of the arithmetic operators provided here any invalid state is
@@ -154,7 +154,7 @@ class InstructionCost {
       else
         Result = MinValue;
     } else {
-      Result /= ScalingFactor;
+      Result /= CostGranularity;
     }
 
     Value = Result;
@@ -171,7 +171,7 @@ class InstructionCost {
     propagateState(RHS);
     // Saturating multiply.
     InstructionCost::CostType Result;
-    if (MulOverflow(Value, ScalingFactor, Result)) {
+    if (MulOverflow(Value, CostGranularity, Result)) {
       if (Value > 0)
         Result = MaxValue;
       else
diff --git a/llvm/lib/Support/InstructionCost.cpp b/llvm/lib/Support/InstructionCost.cpp
index 36ea2a5bed925..7a061277d11db 100644
--- a/llvm/lib/Support/InstructionCost.cpp
+++ b/llvm/lib/Support/InstructionCost.cpp
@@ -18,10 +18,10 @@ using namespace llvm;
 
 void InstructionCost::print(raw_ostream &OS) const {
   if (isValid()) {
-    if (Value % ScalingFactor)
-      OS << Value << "/" << ScalingFactor;
+    if (Value % CostGranularity)
+      OS << Value << "/" << CostGranularity;
     else
-      OS << (Value / ScalingFactor);
+      OS << (Value / CostGranularity);
   } else
     OS << "Invalid";
 }

>From f4f38c98b9b30decfbc85bd3f813b50d5ceb3d5c Mon Sep 17 00:00:00 2001
From: bababuck <buchner.ryan at gmail.com>
Date: Mon, 9 Mar 2026 15:31:53 -0700
Subject: [PATCH 12/15] [Support] Use ternary operator where appropriate

---
 llvm/include/llvm/Support/InstructionCost.h | 16 ++++------------
 1 file changed, 4 insertions(+), 12 deletions(-)

diff --git a/llvm/include/llvm/Support/InstructionCost.h b/llvm/include/llvm/Support/InstructionCost.h
index eb7b03baa1a8e..5f8c64b39057a 100644
--- a/llvm/include/llvm/Support/InstructionCost.h
+++ b/llvm/include/llvm/Support/InstructionCost.h
@@ -72,12 +72,8 @@ class InstructionCost {
   InstructionCost(CostState) = delete;
   InstructionCost(CostType Val) : Value(), State(Valid) {
     InstructionCost::CostType Result;
-    if (MulOverflow(Val, CostGranularity, Result)) {
-      if (Val > 0)
-        Result = MaxValue;
-      else
-        Result = MinValue;
-    }
+    if (MulOverflow(Val, CostGranularity, Result))
+      Result = Val > 0 ? MaxValue : MinValue;
     Value = Result;
   }
 
@@ -171,12 +167,8 @@ class InstructionCost {
     propagateState(RHS);
     // Saturating multiply.
     InstructionCost::CostType Result;
-    if (MulOverflow(Value, CostGranularity, Result)) {
-      if (Value > 0)
-        Result = MaxValue;
-      else
-        Result = MinValue;
-    }
+    if (MulOverflow(Value, CostGranularity, Result))
+      Result = Value > 0 ? MaxValue : MinValue;
     Result /= RHS.Value;
     Value = Result;
     return *this;

>From da76098394c7ca7072d04c218791a543a4a1bb96 Mon Sep 17 00:00:00 2001
From: bababuck <buchner.ryan at gmail.com>
Date: Mon, 9 Mar 2026 10:54:37 -0700
Subject: [PATCH 13/15] [Support] Update formatting to show both decimal and
 fractional values

---
 llvm/lib/Support/InstructionCost.cpp          |  12 +-
 .../PowerPC/LoopnestFixedSize.ll              |   2 +-
 .../interchange-refcost-overflow.ll           |   6 +-
 .../LoopVectorize/ARM/scalar-block-cost.ll    |   2 +-
 .../WebAssembly/memory-interleave.ll          | 118 +++++++++---------
 .../interleaved-load-f32-stride-3.ll          |   4 +-
 .../interleaved-load-f32-stride-5.ll          |   4 +-
 .../interleaved-load-f32-stride-7.ll          |   6 +-
 .../interleaved-load-f64-stride-3.ll          |   4 +-
 .../interleaved-load-f64-stride-5.ll          |   4 +-
 .../interleaved-load-f64-stride-7.ll          |   6 +-
 .../interleaved-load-i16-stride-3.ll          |   4 +-
 .../interleaved-load-i16-stride-5.ll          |   4 +-
 .../interleaved-load-i16-stride-7.ll          |   6 +-
 .../interleaved-load-i32-stride-3.ll          |   4 +-
 ...erleaved-load-i32-stride-4-indices-012u.ll |   6 +-
 .../interleaved-load-i32-stride-5.ll          |   4 +-
 .../interleaved-load-i32-stride-7.ll          |   6 +-
 .../interleaved-load-i64-stride-3.ll          |   4 +-
 .../interleaved-load-i64-stride-5.ll          |   4 +-
 .../interleaved-load-i64-stride-7.ll          |   6 +-
 .../CostModel/interleaved-load-i8-stride-5.ll |   4 +-
 .../CostModel/interleaved-load-i8-stride-7.ll |   6 +-
 .../CostModel/masked-interleaved-store-i16.ll |   6 +-
 .../masked-scatter-i32-with-i8-index.ll       |  12 +-
 .../masked-scatter-i64-with-i8-index.ll       |  10 +-
 .../X86/CostModel/masked-store-i16.ll         |   6 +-
 .../X86/CostModel/masked-store-i32.ll         |   6 +-
 .../X86/CostModel/masked-store-i64.ll         |   4 +-
 .../X86/CostModel/masked-store-i8.ll          |  14 +--
 30 files changed, 144 insertions(+), 140 deletions(-)

diff --git a/llvm/lib/Support/InstructionCost.cpp b/llvm/lib/Support/InstructionCost.cpp
index 7a061277d11db..2abd3dce2c133 100644
--- a/llvm/lib/Support/InstructionCost.cpp
+++ b/llvm/lib/Support/InstructionCost.cpp
@@ -11,6 +11,7 @@
 /// instructions.
 //===----------------------------------------------------------------------===//
 
+#include "llvm/Support/Format.h"
 #include "llvm/Support/InstructionCost.h"
 #include "llvm/Support/raw_ostream.h"
 
@@ -18,10 +19,13 @@ using namespace llvm;
 
 void InstructionCost::print(raw_ostream &OS) const {
   if (isValid()) {
-    if (Value % CostGranularity)
-      OS << Value << "/" << CostGranularity;
-    else
+    if (Value % CostGranularity) {
+      double DecimalValue = static_cast<double>(Value) / CostGranularity;
+      OS << format("%.2f", DecimalValue) << " (" << Value << "/" << CostGranularity << ")";
+    } else {
       OS << (Value / CostGranularity);
-  } else
+    }
+  } else {
     OS << "Invalid";
+  }
 }
diff --git a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/LoopnestFixedSize.ll b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/LoopnestFixedSize.ll
index d8e67acf6b48c..8649598f7f7ab 100644
--- a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/LoopnestFixedSize.ll
+++ b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/LoopnestFixedSize.ll
@@ -83,7 +83,7 @@ for.end13:                                        ; preds = %for.inc11
 
 declare ptr @func_with_returned_arg(ptr returned %arg)
 
-; CHECK: Loop 'for.body' has cost = 9223372036854775807/4
+; CHECK: Loop 'for.body' has cost = 2305843009213693952.00 (9223372036854775807/4)
 ; CHECK-NEXT: Loop 'for.body4' has cost = 16762927104000000
 ; CHECK-NEXT: Loop 'for.body8' has cost = 130960368000000
 ; CHECK-NEXT: Loop 'for.body12' has cost = 1047682944000
diff --git a/llvm/test/Analysis/LoopCacheAnalysis/interchange-refcost-overflow.ll b/llvm/test/Analysis/LoopCacheAnalysis/interchange-refcost-overflow.ll
index edd3246bdb019..96b42dc9e5571 100644
--- a/llvm/test/Analysis/LoopCacheAnalysis/interchange-refcost-overflow.ll
+++ b/llvm/test/Analysis/LoopCacheAnalysis/interchange-refcost-overflow.ll
@@ -10,9 +10,9 @@
 ;         A[c][d][e] = 0;
 ; }
 
-; CHECK: Loop 'outer.loop' has cost = 9223372036854775807/4
-; CHECK: Loop 'middle.loop' has cost = 9223372036854775807/4
-; CHECK: Loop 'inner.loop' has cost = 9223372036854775807/4
+; CHECK: Loop 'outer.loop' has cost = 2305843009213693952.00 (9223372036854775807/4)
+; CHECK: Loop 'middle.loop' has cost = 2305843009213693952.00 (9223372036854775807/4)
+; CHECK: Loop 'inner.loop' has cost = 2305843009213693952.00 (9223372036854775807/4)
 
 @A = local_unnamed_addr global [11 x [11 x [11 x i32]]] zeroinitializer, align 16
 
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/scalar-block-cost.ll b/llvm/test/Transforms/LoopVectorize/ARM/scalar-block-cost.ll
index 0d26e20bb0e80..05064068c79bc 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/scalar-block-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/scalar-block-cost.ll
@@ -57,7 +57,7 @@ define void @if_convert(ptr %a, ptr %b, i32 %start, i32 %end) #0 {
 ; CHECK-COST-2-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction:   %inc = add nsw i32 %i.032, 1
 ; CHECK-COST-2-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction:   %exitcond.not = icmp eq i32 %inc, %end
 ; CHECK-COST-2-NEXT: LV: Found an estimated cost of 0 for VF 1 For instruction:   br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
-; CHECK-COST-2-NEXT: LV: Scalar loop costs: 34/4.
+; CHECK-COST-2-NEXT: LV: Scalar loop costs: 8.50 (34/4).
 
 entry:
   %cmp31 = icmp slt i32 %start, %end
diff --git a/llvm/test/Transforms/LoopVectorize/WebAssembly/memory-interleave.ll b/llvm/test/Transforms/LoopVectorize/WebAssembly/memory-interleave.ll
index e64b43c13b110..b81017047b34a 100644
--- a/llvm/test/Transforms/LoopVectorize/WebAssembly/memory-interleave.ll
+++ b/llvm/test/Transforms/LoopVectorize/WebAssembly/memory-interleave.ll
@@ -22,7 +22,7 @@ target triple = "wasm32-unknown-wasi"
 ; CHECK: Cost of 7 for VF 2: INTERLEAVE-GROUP with factor 2 at %10
 ; CHECK: Cost of 6 for VF 4: INTERLEAVE-GROUP with factor 2 at %10
 ; CHECK: LV: Scalar loop costs: 12.
-; CHECK: LV: Vector loop of width 2 costs: 54/4.
+; CHECK: LV: Vector loop of width 2 costs: 13.50 (54/4).
 ; CHECK: LV: Vector loop of width 4 costs: 6.
 ; CHECK: LV: Selecting VF: 4
 define hidden void @two_ints_same_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
@@ -57,7 +57,7 @@ define hidden void @two_ints_same_op(ptr noalias nocapture noundef writeonly %0,
 ; CHECK: Cost of 7 for VF 2: INTERLEAVE-GROUP with factor 2 at %10
 ; CHECK: Cost of 6 for VF 4: INTERLEAVE-GROUP with factor 2 at %10
 ; CHECK: LV: Scalar loop costs: 12.
-; CHECK: LV: Vector loop of width 2 costs: 54/4.
+; CHECK: LV: Vector loop of width 2 costs: 13.50 (54/4).
 ; CHECK: LV: Vector loop of width 4 costs: 6.
 ; CHECK: LV: Selecting VF: 4
 define hidden void @two_ints_vary_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
@@ -93,11 +93,11 @@ define hidden void @two_ints_vary_op(ptr noalias nocapture noundef writeonly %0,
 ; CHECK: LV: Found an estimated cost of 6 for VF 2 For instruction: %10 = load i32, ptr %9
 ; CHECK: LV: Found an estimated cost of 6 for VF 2 For instruction: %12 = load i32, ptr %11
 ; CHECK: LV: Found an estimated cost of 6 for VF 2 For instruction: store i32 %25, ptr %26
-; CHECK: LV: Vector loop of width 2 costs: 122/4.
+; CHECK: LV: Vector loop of width 2 costs: 30.50 (122/4).
 ; CHECK: LV: Found an estimated cost of 12 for VF 4 For instruction: %10 = load i32, ptr %9
 ; CHECK: LV: Found an estimated cost of 12 for VF 4 For instruction: %12 = load i32, ptr %11
 ; CHECK: LV: Found an estimated cost of 12 for VF 4 For instruction: store i32 %25, ptr %26
-; CHECK: LV: Vector loop of width 4 costs: 115/4.
+; CHECK: LV: Vector loop of width 4 costs: 28.75 (115/4).
 ; CHECK: LV: Selecting VF: 1
 define hidden void @three_ints(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
   %5 = icmp eq i32 %3, 0
@@ -139,15 +139,15 @@ define hidden void @three_ints(ptr noalias nocapture noundef writeonly %0, ptr n
 ; CHECK: LV: Found an estimated cost of 6 for VF 2 For instruction: %10 = load i16
 ; CHECK: LV: Found an estimated cost of 6 for VF 2 For instruction: %12 = load i16
 ; CHECK: LV: Found an estimated cost of 6 for VF 2 For instruction: store i16 %25
-; CHECK: LV: Vector loop of width 2 costs: 122/4.
+; CHECK: LV: Vector loop of width 2 costs: 30.50 (122/4).
 ; CHECK: LV: Found an estimated cost of 12 for VF 4 For instruction: %10 = load i16
 ; CHECK: LV: Found an estimated cost of 12 for VF 4 For instruction: %12 = load i16
 ; CHECK: LV: Found an estimated cost of 12 for VF 4 For instruction: store i16 %25
-; CHECK: LV: Vector loop of width 4 costs: 115/4.
+; CHECK: LV: Vector loop of width 4 costs: 28.75 (115/4).
 ; CHECK: LV: Found an estimated cost of 24 for VF 8 For instruction: %10 = load i16
 ; CHECK: LV: Found an estimated cost of 24 for VF 8 For instruction: %12 = load i16
 ; CHECK: LV: Found an estimated cost of 24 for VF 8 For instruction: store i16 %25
-; CHECK: LV: Vector loop of width 8 costs: 111/4.
+; CHECK: LV: Vector loop of width 8 costs: 27.75 (111/4).
 ; CHECK: LV: Selecting VF: 1
 define hidden void @three_shorts(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
   %5 = icmp eq i32 %3, 0
@@ -197,11 +197,11 @@ define hidden void @three_shorts(ptr noalias nocapture noundef writeonly %0, ptr
 ; CHECK: LV: Found an estimated cost of 18 for VF 4 For instruction: %10 = load i16
 ; CHECK: LV: Found an estimated cost of 18 for VF 4 For instruction: %12 = load i16
 ; CHECK: LV: Found an estimated cost of 18 for VF 4 For instruction: store i16
-; CHECK: LV: Vector loop of width 4 costs: 62/4.
+; CHECK: LV: Vector loop of width 4 costs: 15.50 (62/4).
 ; CHECK: LV: Found an estimated cost of 68 for VF 8 For instruction: %10 = load i16
 ; CHECK: LV: Found an estimated cost of 68 for VF 8 For instruction: %12 = load i16
 ; CHECK: LV: Found an estimated cost of 68 for VF 8 For instruction: store i16
-; CHECK: LV: Vector loop of width 8 costs: 106/4
+; CHECK: LV: Vector loop of width 8 costs: 26.50 (106/4)
 ; CHECK: LV: Selecting VF: 4
 define hidden void @four_shorts_same_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
   %5 = icmp eq i32 %3, 0
@@ -257,11 +257,11 @@ define hidden void @four_shorts_same_op(ptr noalias nocapture noundef writeonly
 ; CHECK: LV: Found an estimated cost of 18 for VF 4 For instruction: %10 = load i16
 ; CHECK: LV: Found an estimated cost of 18 for VF 4 For instruction: %12 = load i16
 ; CHECK: LV: Found an estimated cost of 18 for VF 4 For instruction: store i16 %31
-; CHECK: LV: Vector loop of width 4 costs: 62/4.
+; CHECK: LV: Vector loop of width 4 costs: 15.50 (62/4).
 ; CHECK: LV: Found an estimated cost of 68 for VF 8 For instruction: %10 = load i16
 ; CHECK: LV: Found an estimated cost of 68 for VF 8 For instruction: %12 = load i16
 ; CHECK: LV: Found an estimated cost of 68 for VF 8 For instruction: store i16 %31
-; CHECK: LV: Vector loop of width 8 costs: 106/4.
+; CHECK: LV: Vector loop of width 8 costs: 26.50 (106/4).
 ; CHECK: LV: Selecting VF: 4
 define hidden void @four_shorts_split_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
   %5 = icmp eq i32 %3, 0
@@ -317,11 +317,11 @@ define hidden void @four_shorts_split_op(ptr noalias nocapture noundef writeonly
 ; CHECK: LV: Found an estimated cost of 18 for VF 4 For instruction: %10 = load i16
 ; CHECK: LV: Found an estimated cost of 18 for VF 4 For instruction: %12 = load i16
 ; CHECK: LV: Found an estimated cost of 18 for VF 4 For instruction: store i16
-; CHECK: LV: Vector loop of width 4 costs: 62/4.
+; CHECK: LV: Vector loop of width 4 costs: 15.50 (62/4).
 ; CHECK: LV: Found an estimated cost of 68 for VF 8 For instruction: %10 = load i16
 ; CHECK: LV: Found an estimated cost of 68 for VF 8 For instruction: %12 = load i16
 ; CHECK: LV: Found an estimated cost of 68 for VF 8 For instruction: store i16
-; CHECK: LV: Vector loop of width 8 costs: 106/4.
+; CHECK: LV: Vector loop of width 8 costs: 26.50 (106/4).
 ; CHECK: LV: Selecting VF: 4
 define hidden void @four_shorts_interleave_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
   %5 = icmp eq i32 %3, 0
@@ -371,7 +371,7 @@ define hidden void @four_shorts_interleave_op(ptr noalias nocapture noundef writ
 ; CHECK: LV: Found an estimated cost of 84 for VF 8 For instruction: %10 = load i8
 ; CHECK: LV: Found an estimated cost of 84 for VF 8 For instruction: %12 = load i8
 ; CHECK: LV: Found an estimated cost of 84 for VF 8 For instruction: store i8 %37
-; CHECK: LV: Vector loop of width 8 costs: 130/4
+; CHECK: LV: Vector loop of width 8 costs: 32.50 (130/4)
 ; CHECK: LV: Found an estimated cost of 168 for VF 16 For instruction: %10 = load i8
 ; CHECK: LV: Found an estimated cost of 168 for VF 16 For instruction: %12 = load i8
 ; CHECK: LV: Found an estimated cost of 168 for VF 16 For instruction: store i8 %37
@@ -438,7 +438,7 @@ define hidden void @five_shorts(ptr noalias nocapture noundef writeonly %0, ptr
 ; CHECK: LV: Found an estimated cost of 11 for VF 4 For instruction: %10 = load i8
 ; CHECK: LV: Found an estimated cost of 12 for VF 4 For instruction: %13 = mul i8
 ; CHECK: LV: Found an estimated cost of 11 for VF 4 For instruction: store i8
-; CHECK: LV: Vector loop of width 4 costs: 61/4.
+; CHECK: LV: Vector loop of width 4 costs: 15.25 (61/4).
 ; CHECK: LV: Found an estimated cost of 7 for VF 8 For instruction: %12 = load i8
 ; CHECK: LV: Found an estimated cost of 4 for VF 8 For instruction: %13 = mul i8
 ; CHECK: LV: Found an estimated cost of 7 for VF 8 For instruction: store i8
@@ -446,7 +446,7 @@ define hidden void @five_shorts(ptr noalias nocapture noundef writeonly %0, ptr
 ; CHECK: LV: Found an estimated cost of 6 for VF 16 For instruction: %12 = load i8
 ; CHECK: LV: Found an estimated cost of 4 for VF 16 For instruction: %13 = mul i8
 ; CHECK: LV: Found an estimated cost of 6 for VF 16 For instruction: store i8
-; CHECK: LV: Vector loop of width 16 costs: 7/4.
+; CHECK: LV: Vector loop of width 16 costs: 1.75 (7/4).
 ; CHECK: LV: Selecting VF: 16.
 define hidden void @two_bytes_same_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
   %5 = icmp eq i32 %3, 0
@@ -484,19 +484,19 @@ define hidden void @two_bytes_same_op(ptr noalias nocapture noundef writeonly %0
 ; CHECK: LV: Found an estimated cost of 6 for VF 2 For instruction: %12 = load i8
 ; CHECK: LV: Found an estimated cost of 6 for VF 2 For instruction: %13 = mul i8
 ; CHECK: LV: Found an estimated cost of 6 for VF 2 For instruction: store i8 %13
-; CHECK: LV: Vector loop of width 2 costs: 94/4.
+; CHECK: LV: Vector loop of width 2 costs: 23.50 (94/4).
 ; CHECK: LV: Found an estimated cost of 11 for VF 4 For instruction: %10 = load i8
 ; CHECK: LV: Found an estimated cost of 12 for VF 4 For instruction: %13 = mul i8
 ; CHECK: LV: Found an estimated cost of 11 for VF 4 For instruction: store i8
-; CHECK: LV: Vector loop of width 4 costs: 50/4.
+; CHECK: LV: Vector loop of width 4 costs: 12.50 (50/4).
 ; CHECK: LV: Found an estimated cost of 7 for VF 8 For instruction: %12 = load i8
 ; CHECK: LV: Found an estimated cost of 4 for VF 8 For instruction: %13 = mul i8
 ; CHECK: LV: Found an estimated cost of 7 for VF 8 For instruction: store i8
-; CHECK: LV: Vector loop of width 8 costs: 15/4.
+; CHECK: LV: Vector loop of width 8 costs: 3.75 (15/4).
 ; CHECK: LV: Found an estimated cost of 6 for VF 16 For instruction: %12 = load i8
 ; CHECK: LV: Found an estimated cost of 4 for VF 16 For instruction: %13 = mul i8
 ; CHECK: LV: Found an estimated cost of 6 for VF 16 For instruction: store i8 %19
-; CHECK: LV: Vector loop of width 16 costs: 6/4.
+; CHECK: LV: Vector loop of width 16 costs: 1.50 (6/4).
 ; CHECK: LV: Selecting VF: 16.
 define hidden void @two_bytes_vary_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
   %5 = icmp eq i32 %3, 0
@@ -528,16 +528,16 @@ define hidden void @two_bytes_vary_op(ptr noalias nocapture noundef writeonly %0
 
 ; CHECK-LABEL: three_bytes_same_op
 ; CHECK: LV: Scalar loop costs: 16.
-; CHECK: LV: Vector loop of width 2 costs: 122/4.
-; CHECK: LV: Vector loop of width 4 costs: 115/4.
+; CHECK: LV: Vector loop of width 2 costs: 30.50 (122/4).
+; CHECK: LV: Vector loop of width 4 costs: 28.75 (115/4).
 ; CHECK: LV: Found an estimated cost of 24 for VF 8 For instruction: %10 = load i8, ptr %9
 ; CHECK: LV: Found an estimated cost of 24 for VF 8 For instruction: %12 = load i8, ptr %11
 ; CHECK: LV: Found an estimated cost of 24 for VF 8 For instruction: store i8 %25
-; CHECK: LV: Vector loop of width 8 costs: 111/4.
+; CHECK: LV: Vector loop of width 8 costs: 27.75 (111/4).
 ; CHECK: LV: Found an estimated cost of 48 for VF 16 For instruction: %10 = load i8, ptr %9
 ; CHECK: LV: Found an estimated cost of 48 for VF 16 For instruction: %12 = load i8, ptr %11
 ; CHECK: LV: Found an estimated cost of 48 for VF 16 For instruction: store i8 %25
-; CHECK: LV: Vector loop of width 16 costs: 109/4.
+; CHECK: LV: Vector loop of width 16 costs: 27.25 (109/4).
 ; CHECK: LV: Selecting VF: 1.
 define hidden void @three_bytes_same_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
   %5 = icmp eq i32 %3, 0
@@ -576,16 +576,16 @@ define hidden void @three_bytes_same_op(ptr noalias nocapture noundef writeonly
 
 ; CHECK-LABEL: three_bytes_interleave_op
 ; CHECK: LV: Scalar loop costs: 16.
-; CHECK: LV: Vector loop of width 2 costs: 122/4.
-; CHECK: LV: Vector loop of width 4 costs: 115/4.
+; CHECK: LV: Vector loop of width 2 costs: 30.50 (122/4).
+; CHECK: LV: Vector loop of width 4 costs: 28.75 (115/4).
 ; CHECK: LV: Found an estimated cost of 24 for VF 8 For instruction: %10 = load i8, ptr %9
 ; CHECK: LV: Found an estimated cost of 24 for VF 8 For instruction: %12 = load i8, ptr %11
 ; CHECK: LV: Found an estimated cost of 24 for VF 8 For instruction: store i8 %25
-; CHECK: LV: Vector loop of width 8 costs: 111/4.
+; CHECK: LV: Vector loop of width 8 costs: 27.75 (111/4).
 ; CHECK: LV: Found an estimated cost of 48 for VF 16 For instruction: %10 = load i8, ptr %9
 ; CHECK: LV: Found an estimated cost of 48 for VF 16 For instruction: %12 = load i8, ptr %11
 ; CHECK: LV: Found an estimated cost of 48 for VF 16 For instruction: store i8 %25
-; CHECK: LV: Vector loop of width 16 costs: 109/4.
+; CHECK: LV: Vector loop of width 16 costs: 27.25 (109/4).
 ; CHECK: LV: Selecting VF: 1.
 define hidden void @three_bytes_interleave_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
   %5 = icmp eq i32 %3, 0
@@ -631,15 +631,15 @@ define hidden void @three_bytes_interleave_op(ptr noalias nocapture noundef writ
 ; CHECK: LV: Found an estimated cost of 18 for VF 4 For instruction: %10 = load i8
 ; CHECK: LV: Found an estimated cost of 18 for VF 4 For instruction: %12 = load i8
 ; CHECK: LV: Found an estimated cost of 18 for VF 4 For instruction: store i8
-; CHECK: LV: Vector loop of width 4 costs: 62/4.
+; CHECK: LV: Vector loop of width 4 costs: 15.50 (62/4).
 ; CHECK: LV: Found an estimated cost of 26 for VF 8 For instruction: %10 = load i8
 ; CHECK: LV: Found an estimated cost of 26 for VF 8 For instruction: %12 = load i8
 ; CHECK: LV: Found an estimated cost of 26 for VF 8 For instruction: store i8
-; CHECK: LV: Vector loop of width 8 costs: 43/4.
+; CHECK: LV: Vector loop of width 8 costs: 10.75 (43/4).
 ; CHECK: LV: Found an estimated cost of 132 for VF 16 For instruction: %10 = load i8
 ; CHECK: LV: Found an estimated cost of 132 for VF 16 For instruction: %12 = load i8
 ; CHECK: LV: Found an estimated cost of 132 for VF 16 For instruction: store i8
-; CHECK: LV: Vector loop of width 16 costs: 101/4.
+; CHECK: LV: Vector loop of width 16 costs: 25.25 (101/4).
 ; CHECK: LV: Selecting VF: 8.
 define hidden void @four_bytes_same_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
   %5 = icmp eq i32 %3, 0
@@ -698,12 +698,12 @@ define hidden void @four_bytes_same_op(ptr noalias nocapture noundef writeonly %
 ; CHECK: LV: Found an estimated cost of 26 for VF 8 For instruction: %12 = load i8
 ; CHECK: LV: Found an estimated cost of 4 for VF 8 For instruction: %13 = mul i8
 ; CHECK: LV: Found an estimated cost of 26 for VF 8 For instruction: store i8
-; CHECK: LV: Vector loop of width 8 costs: 46/4.
+; CHECK: LV: Vector loop of width 8 costs: 11.50 (46/4).
 ; CHECK: LV: Found an estimated cost of 132 for VF 16 For instruction: %10 = load i8
 ; CHECK: LV: Found an estimated cost of 132 for VF 16 For instruction: %12 = load i8
 ; CHECK: LV: Found an estimated cost of 4 for VF 16 For instruction: %13 = mul i8
 ; CHECK: LV: Found an estimated cost of 132 for VF 16 For instruction: store i8
-; CHECK: LV: Vector loop of width 16 costs: 102/4
+; CHECK: LV: Vector loop of width 16 costs: 25.50 (102/4)
 ; CHECK: LV: Selecting VF: 8
 define hidden void @four_bytes_split_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
   %5 = icmp eq i32 %3, 0
@@ -757,15 +757,15 @@ define hidden void @four_bytes_split_op(ptr noalias nocapture noundef writeonly
 ; CHECK: LV: Found an estimated cost of 18 for VF 4 For instruction: %10 = load i8
 ; CHECK: LV: Found an estimated cost of 18 for VF 4 For instruction: %12 = load i8
 ; CHECK: LV: Found an estimated cost of 18 for VF 4 For instruction: store i8
-; CHECK: LV: Vector loop of width 4 costs: 62/4
+; CHECK: LV: Vector loop of width 4 costs: 15.50 (62/4)
 ; CHECK: LV: Found an estimated cost of 26 for VF 8 For instruction: %10 = load i8
 ; CHECK: LV: Found an estimated cost of 26 for VF 8 For instruction: %12 = load i8
 ; CHECK: LV: Found an estimated cost of 26 for VF 8 For instruction: store i8
-; CHECK: LV: Vector loop of width 8 costs: 43/4
+; CHECK: LV: Vector loop of width 8 costs: 10.75 (43/4)
 ; CHECK: LV: Found an estimated cost of 132 for VF 16 For instruction: %10 = load i8
 ; CHECK: LV: Found an estimated cost of 132 for VF 16 For instruction: %12 = load i8
 ; CHECK: LV: Found an estimated cost of 132 for VF 16 For instruction: store i8
-; CHECK: LV: Vector loop of width 16 costs: 101/4
+; CHECK: LV: Vector loop of width 16 costs: 25.25 (101/4)
 ; CHECK: LV: Selecting VF: 8
 define hidden void @four_bytes_interleave_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
   %5 = icmp eq i32 %3, 0
@@ -817,7 +817,7 @@ define hidden void @four_bytes_interleave_op(ptr noalias nocapture noundef write
 ; CHECK: LV: Found an estimated cost of 66 for VF 4 For instruction: %10 = load i8
 ; CHECK: LV: Found an estimated cost of 66 for VF 4 For instruction: %12 = load i8
 ; CHECK: LV: Found an estimated cost of 66 for VF 4 For instruction: store i8 %55
-; CHECK: LV: Vector loop of width 4 costs: 298/4
+; CHECK: LV: Vector loop of width 4 costs: 74.50 (298/4)
 ; CHECK: LV: Found an estimated cost of 132 for VF 8 For instruction: %10 = load i8
 ; CHECK: LV: Found an estimated cost of 132 for VF 8 For instruction: %12 = load i8
 ; CHECK: LV: Found an estimated cost of 132 for VF 8 For instruction: store i8 %55
@@ -825,7 +825,7 @@ define hidden void @four_bytes_interleave_op(ptr noalias nocapture noundef write
 ; CHECK: LV: Found an estimated cost of 264 for VF 16 For instruction: %10 = load i8
 ; CHECK: LV: Found an estimated cost of 264 for VF 16 For instruction: %12 = load i8
 ; CHECK: LV: Found an estimated cost of 264 for VF 16 For instruction: store i8 %55
-; CHECK: LV: Vector loop of width 16 costs: 207/4
+; CHECK: LV: Vector loop of width 16 costs: 51.75 (207/4)
 ; CHECK: LV: Selecting VF: 1
 define hidden void @eight_bytes_same_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
   %5 = icmp eq i32 %3, 0
@@ -901,7 +901,7 @@ define hidden void @eight_bytes_same_op(ptr noalias nocapture noundef writeonly
 ; CHECK: LV: Found an estimated cost of 264 for VF 16 For instruction: %10 = load i8
 ; CHECK: LV: Found an estimated cost of 264 for VF 16 For instruction: %12 = load i8
 ; CHECK: LV: Found an estimated cost of 264 for VF 16 For instruction: store i8 %55
-; CHECK: LV: Vector loop of width 16 costs: 201/4
+; CHECK: LV: Vector loop of width 16 costs: 50.25 (201/4)
 ; CHECK: LV: Selecting VF: 1
 define hidden void @eight_bytes_split_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
   %5 = icmp eq i32 %3, 0
@@ -977,7 +977,7 @@ define hidden void @eight_bytes_split_op(ptr noalias nocapture noundef writeonly
 ; CHECK: LV: Found an estimated cost of 264 for VF 16 For instruction: %10 = load i8
 ; CHECK: LV: Found an estimated cost of 264 for VF 16 For instruction: %12 = load i8
 ; CHECK: LV: Found an estimated cost of 264 for VF 16 For instruction: store i8 %55
-; CHECK: LV: Vector loop of width 16 costs: 201/4
+; CHECK: LV: Vector loop of width 16 costs: 50.25 (201/4)
 ; CHECK: LV: Selecting VF: 1
 define hidden void @eight_bytes_interleave_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
   %5 = icmp eq i32 %3, 0
@@ -1126,7 +1126,7 @@ define hidden void @four_bytes_into_four_ints_same_op(ptr noalias nocapture noun
 ; CHECK: LV: Found an estimated cost of 6 for VF 2 For instruction: %10 = load i8
 ; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction: %11 = zext i8
 ; CHECK: LV: Found an estimated cost of 14 for VF 2 For instruction: store i32
-; CHECK: LV: Vector loop of width 2 costs: 142/4.
+; CHECK: LV: Vector loop of width 2 costs: 35.50 (142/4).
 ; CHECK: LV: Found an estimated cost of 18 for VF 4 For instruction: %10 = load i8
 ; CHECK: LV: Found an estimated cost of 2 for VF 4 For instruction:  %11 = zext i8
 ; CHECK: LV: Found an estimated cost of 24 for VF 4 For instruction: store i32
@@ -1184,8 +1184,8 @@ define hidden void @four_bytes_into_four_ints_vary_op(ptr noalias nocapture noun
 ; CHECK-LABEL: scale_uv_row_down2
 ; CHECK: LV: Scalar loop costs: 10.
 ; CHECK: LV: Vector loop of width 2 costs: 13.
-; CHECK: LV: Vector loop of width 4 costs: 35/4.
-; CHECK: LV: Vector loop of width 8 costs: 19/4.
+; CHECK: LV: Vector loop of width 4 costs: 8.75 (35/4).
+; CHECK: LV: Vector loop of width 8 costs: 4.75 (19/4).
 ; CHECK: LV: Vector loop of width 16 costs: 5.
 ; CHECK: LV: Selecting VF: 8.
 define hidden void @scale_uv_row_down2(ptr nocapture noundef readonly %0, i32 noundef %1, ptr nocapture noundef writeonly %2, i32 noundef %3) {
@@ -1219,7 +1219,7 @@ define hidden void @scale_uv_row_down2(ptr nocapture noundef readonly %0, i32 no
 ; CHECK: LV: Found an estimated cost of 18 for VF 4 For instruction: %14 = load i8
 ; CHECK: LV: Found an estimated cost of 18 for VF 4 For instruction: %20 = load i8
 ; CHECK: LV: Found an estimated cost of 11 for VF 4 For instruction: store i8 %48
-; CHECK: LV: Vector loop of width 4 costs: 73/4.
+; CHECK: LV: Vector loop of width 4 costs: 18.25 (73/4).
 ; CHECK: LV: Found an estimated cost of 26 for VF 8 For instruction: %14 = load i8
 ; CHECK: LV: Found an estimated cost of 26 for VF 8 For instruction: %20 = load i8
 ; CHECK: LV: Found an estimated cost of 7 for VF 8 For instruction: store i8 %48
@@ -1299,12 +1299,12 @@ define hidden void @scale_uv_row_down2_box(ptr nocapture noundef readonly %0, i3
 ; CHECK: LV: Vector loop of width 2 costs: 25.
 ; CHECK: LV: Found an estimated cost of 18 for VF 4 For instruction: %10 = load i8
 ; CHECK: LV: Found an estimated cost of 11 for VF 4 For instruction: store i8
-; CHECK: LV: Vector loop of width 4 costs: 47/4.
+; CHECK: LV: Vector loop of width 4 costs: 11.75 (47/4).
 ; CHECK: LV: Found an estimated cost of 26 for VF 8 For instruction: %10 = load i8
 ; CHECK: LV: Found an estimated cost of 7 for VF 8 For instruction: store i8
-; CHECK: LV: Vector loop of width 8 costs: 27/4.
+; CHECK: LV: Vector loop of width 8 costs: 6.75 (27/4).
 ; CHECK: LV: Found an estimated cost of 132 for VF 16 For instruction: %10 = load i8
-; CHECK: LV: Vector loop of width 16 costs: 43/4.
+; CHECK: LV: Vector loop of width 16 costs: 10.75 (43/4).
 ; CHECK: LV: Selecting VF: 8.
 define hidden void @scale_uv_row_down2_linear(ptr nocapture noundef readonly %0, i32 noundef %1, ptr nocapture noundef writeonly %2, i32 noundef %3) {
   %5 = icmp sgt i32 %3, 0
@@ -1356,7 +1356,7 @@ define hidden void @scale_uv_row_down2_linear(ptr nocapture noundef readonly %0,
 ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2
 ; CHECK: LV: Scalar loop costs: 14.
 ; CHECK: LV: Vector loop of width 2 costs: 19.
-; CHECK: LV: Vector loop of width 4 costs: 62/4.
+; CHECK: LV: Vector loop of width 4 costs: 15.50 (62/4).
 ; CHECK: LV: Selecting VF: 1.
 define hidden void @two_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
@@ -1397,7 +1397,7 @@ for.body:                                         ; preds = %entry, %for.body
 ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2
 ; CHECK: LV: Scalar loop costs: 14.
 ; CHECK: LV: Vector loop of width 2 costs: 19.
-; CHECK: LV: Vector loop of width 4 costs: 62/4.
+; CHECK: LV: Vector loop of width 4 costs: 15.50 (62/4).
 ; CHECK: LV: Selecting VF: 1.
 define hidden void @two_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
@@ -1519,7 +1519,7 @@ for.body:                                         ; preds = %entry, %for.body
 ; CHECK: Cost of 11 for VF 4: INTERLEAVE-GROUP with factor 2
 ; CHECK: LV: Scalar loop costs: 16
 ; CHECK: LV: Vector loop of width 2 costs: 26
-; CHECK: LV: Vector loop of width 4 costs: 67/4.
+; CHECK: LV: Vector loop of width 4 costs: 16.75 (67/4).
 ; CHECK: LV: Selecting VF: 1.
 define hidden void @two_floats_two_bytes_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
@@ -1559,7 +1559,7 @@ for.body:                                         ; preds = %entry, %for.body
 ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2
 ; CHECK: LV: Scalar loop costs: 16
 ; CHECK: LV: Vector loop of width 2 costs: 26
-; CHECK: LV: Vector loop of width 4 costs: 67/4.
+; CHECK: LV: Vector loop of width 4 costs: 16.75 (67/4).
 ; CHECK: LV: Selecting VF: 1.
 define hidden void @two_floats_two_bytes_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
@@ -1688,8 +1688,8 @@ for.body:                                         ; preds = %entry, %for.body
 ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2
 ; CHECK: Cost of 7 for VF 4: INTERLEAVE-GROUP with factor 2
 ; CHECK: LV: Scalar loop costs: 16
-; CHECK: LV: Vector loop of width 2 costs: 94/4
-; CHECK: LV: Vector loop of width 4 costs: 59/4
+; CHECK: LV: Vector loop of width 2 costs: 23.50 (94/4)
+; CHECK: LV: Vector loop of width 4 costs: 14.75 (59/4)
 ; CHECK: LV: Selecting VF: 4
 define hidden void @two_floats_two_shorts_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
@@ -1730,8 +1730,8 @@ for.body:                                         ; preds = %entry, %for.body
 ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2
 ; CHECK: Cost of 7 for VF 4: INTERLEAVE-GROUP with factor 2
 ; CHECK: LV: Scalar loop costs: 16
-; CHECK: LV: Vector loop of width 2 costs: 94/4
-; CHECK: LV: Vector loop of width 4 costs: 59/4
+; CHECK: LV: Vector loop of width 2 costs: 23.50 (94/4)
+; CHECK: LV: Vector loop of width 4 costs: 14.75 (59/4)
 ; CHECK: LV: Selecting VF: 4
 define hidden void @two_floats_two_shorts_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
@@ -1997,7 +1997,7 @@ for.body:                                         ; preds = %entry, %for.body
 ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4
 ; CHECK: LV: Scalar loop costs: 28
 ; CHECK: LV: Vector loop of width 2 costs: 48
-; CHECK: LV: Vector loop of width 4 costs: 126/4
+; CHECK: LV: Vector loop of width 4 costs: 31.50 (126/4)
 ; CHECK: LV: Selecting VF: 1
 define hidden void @four_floats_four_bytes_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
@@ -2054,7 +2054,7 @@ for.body:                                         ; preds = %entry, %for.body
 ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4
 ; CHECK: LV: Scalar loop costs: 28
 ; CHECK: LV: Vector loop of width 2 costs: 48
-; CHECK: LV: Vector loop of width 4 costs: 126/4
+; CHECK: LV: Vector loop of width 4 costs: 31.50 (126/4)
 ; CHECK: LV: Selecting VF: 1
 define hidden void @four_floats_four_bytes_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
@@ -2236,7 +2236,7 @@ for.body:                                         ; preds = %entry, %for.body
 ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4
 ; CHECK: LV: Scalar loop costs: 28
 ; CHECK: LV: Vector loop of width 2 costs: 41
-; CHECK: LV: Vector loop of width 4 costs: 118/4
+; CHECK: LV: Vector loop of width 4 costs: 29.50 (118/4)
 ; CHECK: LV: Selecting VF: 1
 define hidden void @four_floats_four_shorts_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
@@ -2294,7 +2294,7 @@ for.body:                                         ; preds = %entry, %for.body
 ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4
 ; CHECK: LV: Scalar loop costs: 28
 ; CHECK: LV: Vector loop of width 2 costs: 41
-; CHECK: LV: Vector loop of width 4 costs: 118/4
+; CHECK: LV: Vector loop of width 4 costs: 29.50 (118/4)
 ; CHECK: LV: Selecting VF: 1
 define hidden void @four_floats_four_shorts_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-3.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-3.ll
index 5dc29c599851e..d1332da8d2f07 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-3.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-3.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(/[0-9]+)? for VF [0-9]+ For instruction:\s*%v0 = load float, ptr %in0"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(\.[0-9]+)? (\([0-9]+/[0-9]+\) )?for VF [0-9]+ For instruction:\s*%v0 = load float, ptr %in0"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -43,7 +43,7 @@ define void @test() {
 ; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4
 ; AVX512:  LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4
 ; AVX512:  LV: Found an estimated cost of 4 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4
-; AVX512:  LV: Found an estimated cost of 26/4 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 6.50 (26/4) for VF 8 For instruction: %v0 = load float, ptr %in0, align 4
 ; AVX512:  LV: Found an estimated cost of 12 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4
 ; AVX512:  LV: Found an estimated cost of 51 for VF 32 For instruction: %v0 = load float, ptr %in0, align 4
 ; AVX512:  LV: Found an estimated cost of 210 for VF 64 For instruction: %v0 = load float, ptr %in0, align 4
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-5.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-5.ll
index 59ea30903294e..5bc85746af9ba 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-5.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-5.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(/[0-9]+)? for VF [0-9]+ For instruction:\s*%v. = load float, ptr %in."
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(\.[0-9]+)? (\([0-9]+/[0-9]+\) )?for VF [0-9]+ For instruction:\s*%v. = load float, ptr %in."
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -134,7 +134,7 @@ define void @test() {
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load float, ptr %in2, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load float, ptr %in3, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load float, ptr %in4, align 4
-; AVX512:  LV: Found an estimated cost of 38/4 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 9.50 (38/4) for VF 4 For instruction: %v0 = load float, ptr %in0, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load float, ptr %in1, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load float, ptr %in2, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load float, ptr %in3, align 4
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-7.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-7.ll
index 3ba523f50cc34..b2fa902249018 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-7.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-7.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(/[0-9]+)? for VF [0-9]+ For instruction:\s*%v. = load float, ptr %in."
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(\.[0-9]+)? (\([0-9]+/[0-9]+\) )?for VF [0-9]+ For instruction:\s*%v. = load float, ptr %in."
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -180,14 +180,14 @@ define void @test() {
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load float, ptr %in4, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load float, ptr %in5, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load float, ptr %in6, align 4
-; AVX512:  LV: Found an estimated cost of 50/4 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 12.50 (50/4) for VF 4 For instruction: %v0 = load float, ptr %in0, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load float, ptr %in1, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load float, ptr %in2, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load float, ptr %in3, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load float, ptr %in4, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load float, ptr %in5, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load float, ptr %in6, align 4
-; AVX512:  LV: Found an estimated cost of 142/4 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 35.50 (142/4) for VF 8 For instruction: %v0 = load float, ptr %in0, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load float, ptr %in1, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load float, ptr %in2, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load float, ptr %in3, align 4
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-3.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-3.ll
index c72235375b970..1b3b24b32e7c2 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-3.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-3.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(/[0-9]+)? for VF [0-9]+ For instruction:\s*%v. = load double, ptr %in."
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(\.[0-9]+)? (\([0-9]+/[0-9]+\) )?for VF [0-9]+ For instruction:\s*%v. = load double, ptr %in."
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -88,7 +88,7 @@ define void @test() {
 ; AVX512:  LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8
-; AVX512:  LV: Found an estimated cost of 26/4 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 6.50 (26/4) for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8
 ; AVX512:  LV: Found an estimated cost of 12 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-5.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-5.ll
index 46c64c289e7c4..5413f6bce5427 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-5.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-5.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(/[0-9]+)? for VF [0-9]+ For instruction:\s*%v. = load double, ptr %in."
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(\.[0-9]+)? (\([0-9]+/[0-9]+\) )?for VF [0-9]+ For instruction:\s*%v. = load double, ptr %in."
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -129,7 +129,7 @@ define void @test() {
 ; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
 ; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
 ; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
-; AVX512:  LV: Found an estimated cost of 38/4 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 9.50 (38/4) for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load double, ptr %in3, align 8
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-7.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-7.ll
index c26a732e01bac..6d62c5ab28e61 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-7.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-7.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(/[0-9]+)? for VF [0-9]+ For instruction:\s*%v. = load double, ptr %in."
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(\.[0-9]+)? (\([0-9]+/[0-9]+\) )?for VF [0-9]+ For instruction:\s*%v. = load double, ptr %in."
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -173,14 +173,14 @@ define void @test() {
 ; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
 ; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load double, ptr %in5, align 8
 ; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load double, ptr %in6, align 8
-; AVX512:  LV: Found an estimated cost of 50/4 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 12.50 (50/4) for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load double, ptr %in3, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load double, ptr %in4, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load double, ptr %in5, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load double, ptr %in6, align 8
-; AVX512:  LV: Found an estimated cost of 142/4 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 35.50 (142/4) for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load double, ptr %in3, align 8
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i16-stride-3.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i16-stride-3.ll
index 9542c89a54ab3..88a4beb8b5170 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i16-stride-3.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i16-stride-3.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(/[0-9]+)? for VF [0-9]+ For instruction:\s*%v0 = load i16, ptr %in0"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(\.[0-9]+)? (\([0-9]+/[0-9]+\) )?for VF [0-9]+ For instruction:\s*%v0 = load i16, ptr %in0"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -55,7 +55,7 @@ define void @test() {
 ; AVX512BW:  LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2
 ; AVX512BW:  LV: Found an estimated cost of 7 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2
 ; AVX512BW:  LV: Found an estimated cost of 7 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2
-; AVX512BW:  LV: Found an estimated cost of 38/4 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW:  LV: Found an estimated cost of 9.50 (38/4) for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2
 ; AVX512BW:  LV: Found an estimated cost of 18 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2
 ; AVX512BW:  LV: Found an estimated cost of 81 for VF 64 For instruction: %v0 = load i16, ptr %in0, align 2
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i16-stride-5.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i16-stride-5.ll
index 05607f0d55e4c..f99d056c707f8 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i16-stride-5.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i16-stride-5.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(/[0-9]+)? for VF [0-9]+ For instruction:\s*%v. = load i16, ptr %in."
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(\.[0-9]+)? (\([0-9]+/[0-9]+\) )?for VF [0-9]+ For instruction:\s*%v. = load i16, ptr %in."
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -182,7 +182,7 @@ define void @test() {
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i16, ptr %in2, align 2
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i16, ptr %in3, align 2
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i16, ptr %in4, align 2
-; AVX512BW:  LV: Found an estimated cost of 58/4 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW:  LV: Found an estimated cost of 14.50 (58/4) for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i16, ptr %in1, align 2
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i16, ptr %in2, align 2
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i16, ptr %in3, align 2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i16-stride-7.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i16-stride-7.ll
index 8455363b1af33..0b14e7b637c25 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i16-stride-7.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i16-stride-7.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(/[0-9]+)? for VF [0-9]+ For instruction:\s*%v. = load i16, ptr %in."
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(\.[0-9]+)? (\([0-9]+/[0-9]+\) )?for VF [0-9]+ For instruction:\s*%v. = load i16, ptr %in."
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -246,14 +246,14 @@ define void @test() {
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i16, ptr %in4, align 2
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i16, ptr %in5, align 2
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i16, ptr %in6, align 2
-; AVX512BW:  LV: Found an estimated cost of 78/4 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW:  LV: Found an estimated cost of 19.50 (78/4) for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i16, ptr %in1, align 2
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i16, ptr %in2, align 2
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i16, ptr %in3, align 2
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i16, ptr %in4, align 2
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i16, ptr %in5, align 2
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i16, ptr %in6, align 2
-; AVX512BW:  LV: Found an estimated cost of 226/4 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW:  LV: Found an estimated cost of 56.50 (226/4) for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i16, ptr %in1, align 2
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i16, ptr %in2, align 2
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i16, ptr %in3, align 2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-3.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-3.ll
index 04ee20385520d..ab7050f9e5a21 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-3.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-3.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(/[0-9]+)? for VF [0-9]+ For instruction:\s*%v0 = load i32, ptr %in0"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(\.[0-9]+)? (\([0-9]+/[0-9]+\) )?for VF [0-9]+ For instruction:\s*%v0 = load i32, ptr %in0"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -43,7 +43,7 @@ define void @test() {
 ; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
 ; AVX512:  LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4
 ; AVX512:  LV: Found an estimated cost of 4 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
-; AVX512:  LV: Found an estimated cost of 26/4 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 6.50 (26/4) for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
 ; AVX512:  LV: Found an estimated cost of 12 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4
 ; AVX512:  LV: Found an estimated cost of 51 for VF 32 For instruction: %v0 = load i32, ptr %in0, align 4
 ; AVX512:  LV: Found an estimated cost of 210 for VF 64 For instruction: %v0 = load i32, ptr %in0, align 4
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-4-indices-012u.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-4-indices-012u.ll
index 3a60a5be7b281..14b3256110df9 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-4-indices-012u.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-4-indices-012u.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(/[0-9]+)? for VF [0-9]+ For instruction:\s*%v0 = load i32, ptr %in0"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(\.[0-9]+)? (\([0-9]+/[0-9]+\) )?for VF [0-9]+ For instruction:\s*%v0 = load i32, ptr %in0"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -43,8 +43,8 @@ define void @test() {
 ; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
 ; AVX512:  LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4
 ; AVX512:  LV: Found an estimated cost of 4 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
-; AVX512:  LV: Found an estimated cost of 26/4 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
-; AVX512:  LV: Found an estimated cost of 70/4 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 6.50 (26/4) for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 17.50 (70/4) for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4
 ; AVX512:  LV: Found an estimated cost of 71 for VF 32 For instruction: %v0 = load i32, ptr %in0, align 4
 ; AVX512:  LV: Found an estimated cost of 80 for VF 64 For instruction: %v0 = load i32, ptr %in0, align 4
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-5.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-5.ll
index 19d130cb8407c..623d37bf8c2c8 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-5.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-5.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(/[0-9]+)? for VF [0-9]+ For instruction:\s*%v. = load i32, ptr %in."
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(\.[0-9]+)? (\([0-9]+/[0-9]+\) )?for VF [0-9]+ For instruction:\s*%v. = load i32, ptr %in."
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -134,7 +134,7 @@ define void @test() {
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i32, ptr %in2, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i32, ptr %in3, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i32, ptr %in4, align 4
-; AVX512:  LV: Found an estimated cost of 38/4 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 9.50 (38/4) for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i32, ptr %in1, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i32, ptr %in2, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i32, ptr %in3, align 4
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-7.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-7.ll
index 12e7f37b8f043..beeea5b554208 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-7.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-7.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(/[0-9]+)? for VF [0-9]+ For instruction:\s*%v. = load i32, ptr %in."
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(\.[0-9]+)? (\([0-9]+/[0-9]+\) )?for VF [0-9]+ For instruction:\s*%v. = load i32, ptr %in."
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -180,14 +180,14 @@ define void @test() {
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i32, ptr %in4, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i32, ptr %in5, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i32, ptr %in6, align 4
-; AVX512:  LV: Found an estimated cost of 50/4 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 12.50 (50/4) for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i32, ptr %in1, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i32, ptr %in2, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i32, ptr %in3, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i32, ptr %in4, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i32, ptr %in5, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i32, ptr %in6, align 4
-; AVX512:  LV: Found an estimated cost of 142/4 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 35.50 (142/4) for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i32, ptr %in1, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i32, ptr %in2, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i32, ptr %in3, align 4
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i64-stride-3.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i64-stride-3.ll
index 0013efda39e91..24a2ef36decf9 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i64-stride-3.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i64-stride-3.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(/[0-9]+)? for VF [0-9]+ For instruction:\s*%v. = load i64, ptr %in."
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(\.[0-9]+)? (\([0-9]+/[0-9]+\) )?for VF [0-9]+ For instruction:\s*%v. = load i64, ptr %in."
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -88,7 +88,7 @@ define void @test() {
 ; AVX512:  LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8
-; AVX512:  LV: Found an estimated cost of 26/4 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 6.50 (26/4) for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8
 ; AVX512:  LV: Found an estimated cost of 12 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i64-stride-5.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i64-stride-5.ll
index 4edb3959a524b..07962ca560c20 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i64-stride-5.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i64-stride-5.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(/[0-9]+)? for VF [0-9]+ For instruction:\s*%v. = load i64, ptr %in."
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(\.[0-9]+)? (\([0-9]+/[0-9]+\) )?for VF [0-9]+ For instruction:\s*%v. = load i64, ptr %in."
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -129,7 +129,7 @@ define void @test() {
 ; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
 ; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
 ; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
-; AVX512:  LV: Found an estimated cost of 38/4 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 9.50 (38/4) for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i64, ptr %in3, align 8
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i64-stride-7.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i64-stride-7.ll
index 2fbfc8fd9f746..3964b1d45ae3a 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i64-stride-7.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i64-stride-7.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(/[0-9]+)? for VF [0-9]+ For instruction:\s*%v. = load i64, ptr %in."
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(\.[0-9]+)? (\([0-9]+/[0-9]+\) )?for VF [0-9]+ For instruction:\s*%v. = load i64, ptr %in."
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -173,14 +173,14 @@ define void @test() {
 ; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
 ; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i64, ptr %in5, align 8
 ; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i64, ptr %in6, align 8
-; AVX512:  LV: Found an estimated cost of 50/4 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 12.50 (50/4) for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i64, ptr %in3, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i64, ptr %in4, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i64, ptr %in5, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i64, ptr %in6, align 8
-; AVX512:  LV: Found an estimated cost of 142/4 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 35.50 (142/4) for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i64, ptr %in3, align 8
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i8-stride-5.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i8-stride-5.ll
index 103e3aee94f61..2e071f0df74c9 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i8-stride-5.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i8-stride-5.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(/[0-9]+)? for VF [0-9]+ For instruction:\s*%v. = load i8, ptr %in."
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(\.[0-9]+)? (\([0-9]+/[0-9]+\) )?for VF [0-9]+ For instruction:\s*%v. = load i8, ptr %in."
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -187,7 +187,7 @@ define void @test() {
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i8, ptr %in2, align 1
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i8, ptr %in3, align 1
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i8, ptr %in4, align 1
-; AVX512BW:  LV: Found an estimated cost of 398/4 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW:  LV: Found an estimated cost of 99.50 (398/4) for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i8, ptr %in1, align 1
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i8, ptr %in2, align 1
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i8, ptr %in3, align 1
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i8-stride-7.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i8-stride-7.ll
index 9527618904ced..c2fccc77b7b5e 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i8-stride-7.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i8-stride-7.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(/[0-9]+)? for VF [0-9]+ For instruction:\s*%v. = load i8, ptr %in."
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(\.[0-9]+)? (\([0-9]+/[0-9]+\) )?for VF [0-9]+ For instruction:\s*%v. = load i8, ptr %in."
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -253,14 +253,14 @@ define void @test() {
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i8, ptr %in4, align 1
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i8, ptr %in5, align 1
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i8, ptr %in6, align 1
-; AVX512BW:  LV: Found an estimated cost of 554/4 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW:  LV: Found an estimated cost of 138.50 (554/4) for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i8, ptr %in1, align 1
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i8, ptr %in2, align 1
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i8, ptr %in3, align 1
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i8, ptr %in4, align 1
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i8, ptr %in5, align 1
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i8, ptr %in6, align 1
-; AVX512BW:  LV: Found an estimated cost of 1654/4 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW:  LV: Found an estimated cost of 413.50 (1654/4) for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i8, ptr %in1, align 1
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i8, ptr %in2, align 1
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i8, ptr %in3, align 1
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-interleaved-store-i16.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-interleaved-store-i16.ll
index 3b9e14163aeaa..e2bc6aded0567 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-interleaved-store-i16.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-interleaved-store-i16.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(/[0-9]+)? for VF [0-9]+ For instruction:\s*store i16 %[0,2], ptr %[a-zA-Z0-7]+, align 2"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(\.[0-9]+)? (\([0-9]+/[0-9]+\) )?for VF [0-9]+ For instruction:\s*store i16 %[0,2], ptr %[a-zA-Z0-7]+, align 2"
 ; RUN: opt -passes=loop-vectorize -enable-interleaved-mem-accesses -prefer-predicate-over-epilogue=predicate-dont-vectorize -S -mcpu=skx --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=DISABLED_MASKED_STRIDED
 ; RUN: opt -passes=loop-vectorize -enable-interleaved-mem-accesses -enable-masked-interleaved-mem-accesses -prefer-predicate-over-epilogue=predicate-dont-vectorize -S -mcpu=skx --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=ENABLED_MASKED_STRIDED
 ; REQUIRES: asserts
@@ -157,7 +157,7 @@ define void @test(ptr noalias nocapture %points, ptr noalias nocapture readonly
 ; DISABLED_MASKED_STRIDED:  LV: Found an estimated cost of 2 for VF 2 For instruction: store i16 %0, ptr %arrayidx6, align 2
 ; DISABLED_MASKED_STRIDED:  LV: Found an estimated cost of 4 for VF 4 For instruction: store i16 %0, ptr %arrayidx6, align 2
 ; DISABLED_MASKED_STRIDED:  LV: Found an estimated cost of 8 for VF 8 For instruction: store i16 %0, ptr %arrayidx6, align 2
-; DISABLED_MASKED_STRIDED:  LV: Found an estimated cost of 66/4 for VF 16 For instruction: store i16 %0, ptr %arrayidx6, align 2
+; DISABLED_MASKED_STRIDED:  LV: Found an estimated cost of 16.50 (66/4) for VF 16 For instruction: store i16 %0, ptr %arrayidx6, align 2
 ;
 ; ENABLED_MASKED_STRIDED-LABEL: 'test'
 ; ENABLED_MASKED_STRIDED:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %0, ptr %arrayidx6, align 2
@@ -165,7 +165,7 @@ define void @test(ptr noalias nocapture %points, ptr noalias nocapture readonly
 ; ENABLED_MASKED_STRIDED:  LV: Found an estimated cost of 2 for VF 2 For instruction: store i16 %0, ptr %arrayidx6, align 2
 ; ENABLED_MASKED_STRIDED:  LV: Found an estimated cost of 4 for VF 4 For instruction: store i16 %0, ptr %arrayidx6, align 2
 ; ENABLED_MASKED_STRIDED:  LV: Found an estimated cost of 8 for VF 8 For instruction: store i16 %0, ptr %arrayidx6, align 2
-; ENABLED_MASKED_STRIDED:  LV: Found an estimated cost of 66/4 for VF 16 For instruction: store i16 %0, ptr %arrayidx6, align 2
+; ENABLED_MASKED_STRIDED:  LV: Found an estimated cost of 16.50 (66/4) for VF 16 For instruction: store i16 %0, ptr %arrayidx6, align 2
 ;
 entry:
   br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-scatter-i32-with-i8-index.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-scatter-i32-with-i8-index.ll
index c1f04dbac2d82..46b3ff36be17e 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-scatter-i32-with-i8-index.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-scatter-i32-with-i8-index.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(/[0-9]+)? for VF [0-9]+ For instruction:\s*store i32 %valB, ptr %out, align 4"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(\.[0-9]+)? (\([0-9]+/[0-9]+\) )?for VF [0-9]+ For instruction:\s*store i32 %valB, ptr %out, align 4"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse4.2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=SSE42
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=AVX1
@@ -19,8 +19,8 @@ define void @test() {
 ; SSE2-LABEL: 'test'
 ; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %valB, ptr %out, align 4
 ; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %valB, ptr %out, align 4
-; SSE2:  LV: Found an estimated cost of 10/4 for VF 2 For instruction: store i32 %valB, ptr %out, align 4
-; SSE2:  LV: Found an estimated cost of 22/4 for VF 4 For instruction: store i32 %valB, ptr %out, align 4
+; SSE2:  LV: Found an estimated cost of 2.50 (10/4) for VF 2 For instruction: store i32 %valB, ptr %out, align 4
+; SSE2:  LV: Found an estimated cost of 5.50 (22/4) for VF 4 For instruction: store i32 %valB, ptr %out, align 4
 ; SSE2:  LV: Found an estimated cost of 11 for VF 8 For instruction: store i32 %valB, ptr %out, align 4
 ; SSE2:  LV: Found an estimated cost of 22 for VF 16 For instruction: store i32 %valB, ptr %out, align 4
 ;
@@ -37,7 +37,7 @@ define void @test() {
 ; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %valB, ptr %out, align 4
 ; AVX1:  LV: Found an estimated cost of 2 for VF 2 For instruction: store i32 %valB, ptr %out, align 4
 ; AVX1:  LV: Found an estimated cost of 4 for VF 4 For instruction: store i32 %valB, ptr %out, align 4
-; AVX1:  LV: Found an estimated cost of 34/4 for VF 8 For instruction: store i32 %valB, ptr %out, align 4
+; AVX1:  LV: Found an estimated cost of 8.50 (34/4) for VF 8 For instruction: store i32 %valB, ptr %out, align 4
 ; AVX1:  LV: Found an estimated cost of 17 for VF 16 For instruction: store i32 %valB, ptr %out, align 4
 ; AVX1:  LV: Found an estimated cost of 34 for VF 32 For instruction: store i32 %valB, ptr %out, align 4
 ;
@@ -46,7 +46,7 @@ define void @test() {
 ; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %valB, ptr %out, align 4
 ; AVX2:  LV: Found an estimated cost of 2 for VF 2 For instruction: store i32 %valB, ptr %out, align 4
 ; AVX2:  LV: Found an estimated cost of 4 for VF 4 For instruction: store i32 %valB, ptr %out, align 4
-; AVX2:  LV: Found an estimated cost of 34/4 for VF 8 For instruction: store i32 %valB, ptr %out, align 4
+; AVX2:  LV: Found an estimated cost of 8.50 (34/4) for VF 8 For instruction: store i32 %valB, ptr %out, align 4
 ; AVX2:  LV: Found an estimated cost of 17 for VF 16 For instruction: store i32 %valB, ptr %out, align 4
 ; AVX2:  LV: Found an estimated cost of 34 for VF 32 For instruction: store i32 %valB, ptr %out, align 4
 ;
@@ -54,7 +54,7 @@ define void @test() {
 ; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %valB, ptr %out, align 4
 ; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %valB, ptr %out, align 4
 ; AVX512:  LV: Found an estimated cost of 5 for VF 2 For instruction: store i32 %valB, ptr %out, align 4
-; AVX512:  LV: Found an estimated cost of 42/4 for VF 4 For instruction: store i32 %valB, ptr %out, align 4
+; AVX512:  LV: Found an estimated cost of 10.50 (42/4) for VF 4 For instruction: store i32 %valB, ptr %out, align 4
 ; AVX512:  LV: Found an estimated cost of 10 for VF 8 For instruction: store i32 %valB, ptr %out, align 4
 ; AVX512:  LV: Found an estimated cost of 18 for VF 16 For instruction: store i32 %valB, ptr %out, align 4
 ; AVX512:  LV: Found an estimated cost of 36 for VF 32 For instruction: store i32 %valB, ptr %out, align 4
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-scatter-i64-with-i8-index.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-scatter-i64-with-i8-index.ll
index a1bbc4006eafa..6ffedeac86fe9 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-scatter-i64-with-i8-index.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-scatter-i64-with-i8-index.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(/[0-9]+)? for VF [0-9]+ For instruction:\s*store i64 %valB, ptr %out, align 8"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(\.[0-9]+)? (\([0-9]+/[0-9]+\) )?for VF [0-9]+ For instruction:\s*store i64 %valB, ptr %out, align 8"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse4.2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=SSE42
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=AVX1
@@ -19,7 +19,7 @@ define void @test() {
 ; SSE2-LABEL: 'test'
 ; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %valB, ptr %out, align 8
 ; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %valB, ptr %out, align 8
-; SSE2:  LV: Found an estimated cost of 10/4 for VF 2 For instruction: store i64 %valB, ptr %out, align 8
+; SSE2:  LV: Found an estimated cost of 2.50 (10/4) for VF 2 For instruction: store i64 %valB, ptr %out, align 8
 ; SSE2:  LV: Found an estimated cost of 5 for VF 4 For instruction: store i64 %valB, ptr %out, align 8
 ; SSE2:  LV: Found an estimated cost of 10 for VF 8 For instruction: store i64 %valB, ptr %out, align 8
 ; SSE2:  LV: Found an estimated cost of 20 for VF 16 For instruction: store i64 %valB, ptr %out, align 8
@@ -36,7 +36,7 @@ define void @test() {
 ; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %valB, ptr %out, align 8
 ; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %valB, ptr %out, align 8
 ; AVX1:  LV: Found an estimated cost of 2 for VF 2 For instruction: store i64 %valB, ptr %out, align 8
-; AVX1:  LV: Found an estimated cost of 18/4 for VF 4 For instruction: store i64 %valB, ptr %out, align 8
+; AVX1:  LV: Found an estimated cost of 4.50 (18/4) for VF 4 For instruction: store i64 %valB, ptr %out, align 8
 ; AVX1:  LV: Found an estimated cost of 9 for VF 8 For instruction: store i64 %valB, ptr %out, align 8
 ; AVX1:  LV: Found an estimated cost of 18 for VF 16 For instruction: store i64 %valB, ptr %out, align 8
 ; AVX1:  LV: Found an estimated cost of 36 for VF 32 For instruction: store i64 %valB, ptr %out, align 8
@@ -45,7 +45,7 @@ define void @test() {
 ; AVX2-NOFAST:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %valB, ptr %out, align 8
 ; AVX2-NOFAST:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %valB, ptr %out, align 8
 ; AVX2-NOFAST:  LV: Found an estimated cost of 2 for VF 2 For instruction: store i64 %valB, ptr %out, align 8
-; AVX2-NOFAST:  LV: Found an estimated cost of 18/4 for VF 4 For instruction: store i64 %valB, ptr %out, align 8
+; AVX2-NOFAST:  LV: Found an estimated cost of 4.50 (18/4) for VF 4 For instruction: store i64 %valB, ptr %out, align 8
 ; AVX2-NOFAST:  LV: Found an estimated cost of 9 for VF 8 For instruction: store i64 %valB, ptr %out, align 8
 ; AVX2-NOFAST:  LV: Found an estimated cost of 18 for VF 16 For instruction: store i64 %valB, ptr %out, align 8
 ; AVX2-NOFAST:  LV: Found an estimated cost of 36 for VF 32 For instruction: store i64 %valB, ptr %out, align 8
@@ -54,7 +54,7 @@ define void @test() {
 ; AVX2-FAST:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %valB, ptr %out, align 8
 ; AVX2-FAST:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %valB, ptr %out, align 8
 ; AVX2-FAST:  LV: Found an estimated cost of 2 for VF 2 For instruction: store i64 %valB, ptr %out, align 8
-; AVX2-FAST:  LV: Found an estimated cost of 18/4 for VF 4 For instruction: store i64 %valB, ptr %out, align 8
+; AVX2-FAST:  LV: Found an estimated cost of 4.50 (18/4) for VF 4 For instruction: store i64 %valB, ptr %out, align 8
 ; AVX2-FAST:  LV: Found an estimated cost of 9 for VF 8 For instruction: store i64 %valB, ptr %out, align 8
 ; AVX2-FAST:  LV: Found an estimated cost of 18 for VF 16 For instruction: store i64 %valB, ptr %out, align 8
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i16.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i16.ll
index 4aaf4fee8e56d..225d54e9c4c07 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i16.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i16.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(/[0-9]+)? for VF [0-9]+ For instruction:\s*store i16 %valB, ptr %out, align 2"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(\.[0-9]+)? (\([0-9]+/[0-9]+\) )?for VF [0-9]+ For instruction:\s*store i16 %valB, ptr %out, align 2"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=SSE
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse4.2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=SSE
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=AVX1
@@ -29,7 +29,7 @@ define void @test(ptr %C) {
 ; AVX1:  LV: Found an estimated cost of 2 for VF 2 For instruction: store i16 %valB, ptr %out, align 2
 ; AVX1:  LV: Found an estimated cost of 4 for VF 4 For instruction: store i16 %valB, ptr %out, align 2
 ; AVX1:  LV: Found an estimated cost of 8 for VF 8 For instruction: store i16 %valB, ptr %out, align 2
-; AVX1:  LV: Found an estimated cost of 66/4 for VF 16 For instruction: store i16 %valB, ptr %out, align 2
+; AVX1:  LV: Found an estimated cost of 16.50 (66/4) for VF 16 For instruction: store i16 %valB, ptr %out, align 2
 ; AVX1:  LV: Found an estimated cost of 33 for VF 32 For instruction: store i16 %valB, ptr %out, align 2
 ;
 ; AVX2-LABEL: 'test'
@@ -38,7 +38,7 @@ define void @test(ptr %C) {
 ; AVX2:  LV: Found an estimated cost of 2 for VF 2 For instruction: store i16 %valB, ptr %out, align 2
 ; AVX2:  LV: Found an estimated cost of 4 for VF 4 For instruction: store i16 %valB, ptr %out, align 2
 ; AVX2:  LV: Found an estimated cost of 8 for VF 8 For instruction: store i16 %valB, ptr %out, align 2
-; AVX2:  LV: Found an estimated cost of 66/4 for VF 16 For instruction: store i16 %valB, ptr %out, align 2
+; AVX2:  LV: Found an estimated cost of 16.50 (66/4) for VF 16 For instruction: store i16 %valB, ptr %out, align 2
 ; AVX2:  LV: Found an estimated cost of 33 for VF 32 For instruction: store i16 %valB, ptr %out, align 2
 ;
 ; AVX512-LABEL: 'test'
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i32.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i32.ll
index 4c9d4d68d07e7..fd53de666b9ae 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i32.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i32.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(/[0-9]+)? for VF [0-9]+ For instruction:\s*store i32 %valB, ptr %out, align 4"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(\.[0-9]+)? (\([0-9]+/[0-9]+\) )?for VF [0-9]+ For instruction:\s*store i32 %valB, ptr %out, align 4"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse4.2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=SSE42
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=AVX1
@@ -18,8 +18,8 @@ define void @test(ptr %C) {
 ; SSE2-LABEL: 'test'
 ; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %valB, ptr %out, align 4
 ; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %valB, ptr %out, align 4
-; SSE2:  LV: Found an estimated cost of 10/4 for VF 2 For instruction: store i32 %valB, ptr %out, align 4
-; SSE2:  LV: Found an estimated cost of 22/4 for VF 4 For instruction: store i32 %valB, ptr %out, align 4
+; SSE2:  LV: Found an estimated cost of 2.50 (10/4) for VF 2 For instruction: store i32 %valB, ptr %out, align 4
+; SSE2:  LV: Found an estimated cost of 5.50 (22/4) for VF 4 For instruction: store i32 %valB, ptr %out, align 4
 ; SSE2:  LV: Found an estimated cost of 11 for VF 8 For instruction: store i32 %valB, ptr %out, align 4
 ; SSE2:  LV: Found an estimated cost of 22 for VF 16 For instruction: store i32 %valB, ptr %out, align 4
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i64.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i64.ll
index 3592139a79f6a..d458103f8548c 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i64.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i64.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(/[0-9]+)? for VF [0-9]+ For instruction:\s*store i64 %valB, ptr %out, align 8"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(\.[0-9]+)? (\([0-9]+/[0-9]+\) )?for VF [0-9]+ For instruction:\s*store i64 %valB, ptr %out, align 8"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse4.2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=SSE42
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=AVX1
@@ -18,7 +18,7 @@ define void @test(ptr %C) {
 ; SSE2-LABEL: 'test'
 ; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %valB, ptr %out, align 8
 ; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %valB, ptr %out, align 8
-; SSE2:  LV: Found an estimated cost of 10/4 for VF 2 For instruction: store i64 %valB, ptr %out, align 8
+; SSE2:  LV: Found an estimated cost of 2.50 (10/4) for VF 2 For instruction: store i64 %valB, ptr %out, align 8
 ; SSE2:  LV: Found an estimated cost of 5 for VF 4 For instruction: store i64 %valB, ptr %out, align 8
 ; SSE2:  LV: Found an estimated cost of 10 for VF 8 For instruction: store i64 %valB, ptr %out, align 8
 ; SSE2:  LV: Found an estimated cost of 20 for VF 16 For instruction: store i64 %valB, ptr %out, align 8
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i8.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i8.ll
index 0b96adabd8de3..e8e16e88b6622 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i8.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i8.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(/[0-9]+)? for VF [0-9]+ For instruction:\s*store i8 %valB, ptr %out, align 1"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(\.[0-9]+)? (\([0-9]+/[0-9]+\) )?for VF [0-9]+ For instruction:\s*store i8 %valB, ptr %out, align 1"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse4.2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=SSE42
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=AVX1
@@ -18,10 +18,10 @@ define void @test(ptr %C) {
 ; SSE2-LABEL: 'test'
 ; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %valB, ptr %out, align 1
 ; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %valB, ptr %out, align 1
-; SSE2:  LV: Found an estimated cost of 10/4 for VF 2 For instruction: store i8 %valB, ptr %out, align 1
-; SSE2:  LV: Found an estimated cost of 22/4 for VF 4 For instruction: store i8 %valB, ptr %out, align 1
-; SSE2:  LV: Found an estimated cost of 46/4 for VF 8 For instruction: store i8 %valB, ptr %out, align 1
-; SSE2:  LV: Found an estimated cost of 94/4 for VF 16 For instruction: store i8 %valB, ptr %out, align 1
+; SSE2:  LV: Found an estimated cost of 2.50 (10/4) for VF 2 For instruction: store i8 %valB, ptr %out, align 1
+; SSE2:  LV: Found an estimated cost of 5.50 (22/4) for VF 4 For instruction: store i8 %valB, ptr %out, align 1
+; SSE2:  LV: Found an estimated cost of 11.50 (46/4) for VF 8 For instruction: store i8 %valB, ptr %out, align 1
+; SSE2:  LV: Found an estimated cost of 23.50 (94/4) for VF 16 For instruction: store i8 %valB, ptr %out, align 1
 ;
 ; SSE42-LABEL: 'test'
 ; SSE42:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %valB, ptr %out, align 1
@@ -38,7 +38,7 @@ define void @test(ptr %C) {
 ; AVX1:  LV: Found an estimated cost of 4 for VF 4 For instruction: store i8 %valB, ptr %out, align 1
 ; AVX1:  LV: Found an estimated cost of 8 for VF 8 For instruction: store i8 %valB, ptr %out, align 1
 ; AVX1:  LV: Found an estimated cost of 16 for VF 16 For instruction: store i8 %valB, ptr %out, align 1
-; AVX1:  LV: Found an estimated cost of 130/4 for VF 32 For instruction: store i8 %valB, ptr %out, align 1
+; AVX1:  LV: Found an estimated cost of 32.50 (130/4) for VF 32 For instruction: store i8 %valB, ptr %out, align 1
 ;
 ; AVX2-LABEL: 'test'
 ; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %valB, ptr %out, align 1
@@ -47,7 +47,7 @@ define void @test(ptr %C) {
 ; AVX2:  LV: Found an estimated cost of 4 for VF 4 For instruction: store i8 %valB, ptr %out, align 1
 ; AVX2:  LV: Found an estimated cost of 8 for VF 8 For instruction: store i8 %valB, ptr %out, align 1
 ; AVX2:  LV: Found an estimated cost of 16 for VF 16 For instruction: store i8 %valB, ptr %out, align 1
-; AVX2:  LV: Found an estimated cost of 130/4 for VF 32 For instruction: store i8 %valB, ptr %out, align 1
+; AVX2:  LV: Found an estimated cost of 32.50 (130/4) for VF 32 For instruction: store i8 %valB, ptr %out, align 1
 ;
 ; AVX512-LABEL: 'test'
 ; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %valB, ptr %out, align 1

>From 71bdccc7c5c9d95ec5b53de9307d3bf2c3dc6405 Mon Sep 17 00:00:00 2001
From: bababuck <buchner.ryan at gmail.com>
Date: Tue, 10 Mar 2026 00:39:57 -0700
Subject: [PATCH 14/15] Format

---
 llvm/lib/Support/InstructionCost.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Support/InstructionCost.cpp b/llvm/lib/Support/InstructionCost.cpp
index 2abd3dce2c133..13e5053a7f53f 100644
--- a/llvm/lib/Support/InstructionCost.cpp
+++ b/llvm/lib/Support/InstructionCost.cpp
@@ -11,8 +11,8 @@
 /// instructions.
 //===----------------------------------------------------------------------===//
 
-#include "llvm/Support/Format.h"
 #include "llvm/Support/InstructionCost.h"
+#include "llvm/Support/Format.h"
 #include "llvm/Support/raw_ostream.h"
 
 using namespace llvm;

>From 0dcdb7d90f75eed12e8346779440cbafebe1b51e Mon Sep 17 00:00:00 2001
From: bababuck <buchner.ryan at gmail.com>
Date: Tue, 10 Mar 2026 00:23:50 -0700
Subject: [PATCH 15/15] Fix InstructionCost print for large numbers and adjust
 formatting

---
 llvm/lib/Support/InstructionCost.cpp          |  19 ++-
 .../PowerPC/LoopnestFixedSize.ll              |   2 +-
 .../interchange-refcost-overflow.ll           |   6 +-
 .../LoopVectorize/ARM/scalar-block-cost.ll    |   2 +-
 .../WebAssembly/memory-interleave.ll          | 118 +++++++++---------
 .../interleaved-load-f32-stride-3.ll          |   2 +-
 .../interleaved-load-f32-stride-5.ll          |   2 +-
 .../interleaved-load-f32-stride-7.ll          |   4 +-
 .../interleaved-load-f64-stride-3.ll          |   2 +-
 .../interleaved-load-f64-stride-5.ll          |   2 +-
 .../interleaved-load-f64-stride-7.ll          |   4 +-
 .../interleaved-load-i16-stride-3.ll          |   2 +-
 .../interleaved-load-i16-stride-5.ll          |   2 +-
 .../interleaved-load-i16-stride-7.ll          |   4 +-
 .../interleaved-load-i32-stride-3.ll          |   2 +-
 ...erleaved-load-i32-stride-4-indices-012u.ll |   4 +-
 .../interleaved-load-i32-stride-5.ll          |   2 +-
 .../interleaved-load-i32-stride-7.ll          |   4 +-
 .../interleaved-load-i64-stride-3.ll          |   2 +-
 .../interleaved-load-i64-stride-5.ll          |   2 +-
 .../interleaved-load-i64-stride-7.ll          |   4 +-
 .../CostModel/interleaved-load-i8-stride-5.ll |   2 +-
 .../CostModel/interleaved-load-i8-stride-7.ll |   4 +-
 .../CostModel/masked-interleaved-store-i16.ll |   4 +-
 .../masked-scatter-i32-with-i8-index.ll       |  10 +-
 .../masked-scatter-i64-with-i8-index.ll       |   8 +-
 .../X86/CostModel/masked-store-i16.ll         |   4 +-
 .../X86/CostModel/masked-store-i32.ll         |   4 +-
 .../X86/CostModel/masked-store-i64.ll         |   2 +-
 .../X86/CostModel/masked-store-i8.ll          |  12 +-
 30 files changed, 125 insertions(+), 116 deletions(-)

diff --git a/llvm/lib/Support/InstructionCost.cpp b/llvm/lib/Support/InstructionCost.cpp
index 13e5053a7f53f..5e06b9109cd88 100644
--- a/llvm/lib/Support/InstructionCost.cpp
+++ b/llvm/lib/Support/InstructionCost.cpp
@@ -19,11 +19,20 @@ using namespace llvm;
 
 void InstructionCost::print(raw_ostream &OS) const {
   if (isValid()) {
-    if (Value % CostGranularity) {
-      double DecimalValue = static_cast<double>(Value) / CostGranularity;
-      OS << format("%.2f", DecimalValue) << " (" << Value << "/" << CostGranularity << ")";
-    } else {
-      OS << (Value / CostGranularity);
+    CostType WholeNumber = Value / CostGranularity;
+    CostType Remainder = Value % CostGranularity;
+    OS << WholeNumber;
+    assert(CostGranularity == 4 && "Hardcoded for CostGranularity=4");
+    switch (Remainder) {
+    case 1:
+      OS << ".25";
+      break;
+    case 2:
+      OS << ".5";
+      break;
+    case 3:
+      OS << ".75";
+      break;
     }
   } else {
     OS << "Invalid";
diff --git a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/LoopnestFixedSize.ll b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/LoopnestFixedSize.ll
index 8649598f7f7ab..02b09c0b3c684 100644
--- a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/LoopnestFixedSize.ll
+++ b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/LoopnestFixedSize.ll
@@ -83,7 +83,7 @@ for.end13:                                        ; preds = %for.inc11
 
 declare ptr @func_with_returned_arg(ptr returned %arg)
 
-; CHECK: Loop 'for.body' has cost = 2305843009213693952.00 (9223372036854775807/4)
+; CHECK: Loop 'for.body' has cost = 2305843009213693951.75
 ; CHECK-NEXT: Loop 'for.body4' has cost = 16762927104000000
 ; CHECK-NEXT: Loop 'for.body8' has cost = 130960368000000
 ; CHECK-NEXT: Loop 'for.body12' has cost = 1047682944000
diff --git a/llvm/test/Analysis/LoopCacheAnalysis/interchange-refcost-overflow.ll b/llvm/test/Analysis/LoopCacheAnalysis/interchange-refcost-overflow.ll
index 96b42dc9e5571..90dd96d322e92 100644
--- a/llvm/test/Analysis/LoopCacheAnalysis/interchange-refcost-overflow.ll
+++ b/llvm/test/Analysis/LoopCacheAnalysis/interchange-refcost-overflow.ll
@@ -10,9 +10,9 @@
 ;         A[c][d][e] = 0;
 ; }
 
-; CHECK: Loop 'outer.loop' has cost = 2305843009213693952.00 (9223372036854775807/4)
-; CHECK: Loop 'middle.loop' has cost = 2305843009213693952.00 (9223372036854775807/4)
-; CHECK: Loop 'inner.loop' has cost = 2305843009213693952.00 (9223372036854775807/4)
+; CHECK: Loop 'outer.loop' has cost = 2305843009213693951.75
+; CHECK: Loop 'middle.loop' has cost = 2305843009213693951.75
+; CHECK: Loop 'inner.loop' has cost = 2305843009213693951.75
 
 @A = local_unnamed_addr global [11 x [11 x [11 x i32]]] zeroinitializer, align 16
 
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/scalar-block-cost.ll b/llvm/test/Transforms/LoopVectorize/ARM/scalar-block-cost.ll
index 05064068c79bc..34c7381200aec 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/scalar-block-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/scalar-block-cost.ll
@@ -57,7 +57,7 @@ define void @if_convert(ptr %a, ptr %b, i32 %start, i32 %end) #0 {
 ; CHECK-COST-2-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction:   %inc = add nsw i32 %i.032, 1
 ; CHECK-COST-2-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction:   %exitcond.not = icmp eq i32 %inc, %end
 ; CHECK-COST-2-NEXT: LV: Found an estimated cost of 0 for VF 1 For instruction:   br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
-; CHECK-COST-2-NEXT: LV: Scalar loop costs: 8.50 (34/4).
+; CHECK-COST-2-NEXT: LV: Scalar loop costs: 8.5.
 
 entry:
   %cmp31 = icmp slt i32 %start, %end
diff --git a/llvm/test/Transforms/LoopVectorize/WebAssembly/memory-interleave.ll b/llvm/test/Transforms/LoopVectorize/WebAssembly/memory-interleave.ll
index b81017047b34a..ada71fac561da 100644
--- a/llvm/test/Transforms/LoopVectorize/WebAssembly/memory-interleave.ll
+++ b/llvm/test/Transforms/LoopVectorize/WebAssembly/memory-interleave.ll
@@ -22,7 +22,7 @@ target triple = "wasm32-unknown-wasi"
 ; CHECK: Cost of 7 for VF 2: INTERLEAVE-GROUP with factor 2 at %10
 ; CHECK: Cost of 6 for VF 4: INTERLEAVE-GROUP with factor 2 at %10
 ; CHECK: LV: Scalar loop costs: 12.
-; CHECK: LV: Vector loop of width 2 costs: 13.50 (54/4).
+; CHECK: LV: Vector loop of width 2 costs: 13.5.
 ; CHECK: LV: Vector loop of width 4 costs: 6.
 ; CHECK: LV: Selecting VF: 4
 define hidden void @two_ints_same_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
@@ -57,7 +57,7 @@ define hidden void @two_ints_same_op(ptr noalias nocapture noundef writeonly %0,
 ; CHECK: Cost of 7 for VF 2: INTERLEAVE-GROUP with factor 2 at %10
 ; CHECK: Cost of 6 for VF 4: INTERLEAVE-GROUP with factor 2 at %10
 ; CHECK: LV: Scalar loop costs: 12.
-; CHECK: LV: Vector loop of width 2 costs: 13.50 (54/4).
+; CHECK: LV: Vector loop of width 2 costs: 13.5.
 ; CHECK: LV: Vector loop of width 4 costs: 6.
 ; CHECK: LV: Selecting VF: 4
 define hidden void @two_ints_vary_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
@@ -93,11 +93,11 @@ define hidden void @two_ints_vary_op(ptr noalias nocapture noundef writeonly %0,
 ; CHECK: LV: Found an estimated cost of 6 for VF 2 For instruction: %10 = load i32, ptr %9
 ; CHECK: LV: Found an estimated cost of 6 for VF 2 For instruction: %12 = load i32, ptr %11
 ; CHECK: LV: Found an estimated cost of 6 for VF 2 For instruction: store i32 %25, ptr %26
-; CHECK: LV: Vector loop of width 2 costs: 30.50 (122/4).
+; CHECK: LV: Vector loop of width 2 costs: 30.5.
 ; CHECK: LV: Found an estimated cost of 12 for VF 4 For instruction: %10 = load i32, ptr %9
 ; CHECK: LV: Found an estimated cost of 12 for VF 4 For instruction: %12 = load i32, ptr %11
 ; CHECK: LV: Found an estimated cost of 12 for VF 4 For instruction: store i32 %25, ptr %26
-; CHECK: LV: Vector loop of width 4 costs: 28.75 (115/4).
+; CHECK: LV: Vector loop of width 4 costs: 28.75.
 ; CHECK: LV: Selecting VF: 1
 define hidden void @three_ints(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
   %5 = icmp eq i32 %3, 0
@@ -139,15 +139,15 @@ define hidden void @three_ints(ptr noalias nocapture noundef writeonly %0, ptr n
 ; CHECK: LV: Found an estimated cost of 6 for VF 2 For instruction: %10 = load i16
 ; CHECK: LV: Found an estimated cost of 6 for VF 2 For instruction: %12 = load i16
 ; CHECK: LV: Found an estimated cost of 6 for VF 2 For instruction: store i16 %25
-; CHECK: LV: Vector loop of width 2 costs: 30.50 (122/4).
+; CHECK: LV: Vector loop of width 2 costs: 30.5.
 ; CHECK: LV: Found an estimated cost of 12 for VF 4 For instruction: %10 = load i16
 ; CHECK: LV: Found an estimated cost of 12 for VF 4 For instruction: %12 = load i16
 ; CHECK: LV: Found an estimated cost of 12 for VF 4 For instruction: store i16 %25
-; CHECK: LV: Vector loop of width 4 costs: 28.75 (115/4).
+; CHECK: LV: Vector loop of width 4 costs: 28.75.
 ; CHECK: LV: Found an estimated cost of 24 for VF 8 For instruction: %10 = load i16
 ; CHECK: LV: Found an estimated cost of 24 for VF 8 For instruction: %12 = load i16
 ; CHECK: LV: Found an estimated cost of 24 for VF 8 For instruction: store i16 %25
-; CHECK: LV: Vector loop of width 8 costs: 27.75 (111/4).
+; CHECK: LV: Vector loop of width 8 costs: 27.75.
 ; CHECK: LV: Selecting VF: 1
 define hidden void @three_shorts(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
   %5 = icmp eq i32 %3, 0
@@ -197,11 +197,11 @@ define hidden void @three_shorts(ptr noalias nocapture noundef writeonly %0, ptr
 ; CHECK: LV: Found an estimated cost of 18 for VF 4 For instruction: %10 = load i16
 ; CHECK: LV: Found an estimated cost of 18 for VF 4 For instruction: %12 = load i16
 ; CHECK: LV: Found an estimated cost of 18 for VF 4 For instruction: store i16
-; CHECK: LV: Vector loop of width 4 costs: 15.50 (62/4).
+; CHECK: LV: Vector loop of width 4 costs: 15.5.
 ; CHECK: LV: Found an estimated cost of 68 for VF 8 For instruction: %10 = load i16
 ; CHECK: LV: Found an estimated cost of 68 for VF 8 For instruction: %12 = load i16
 ; CHECK: LV: Found an estimated cost of 68 for VF 8 For instruction: store i16
-; CHECK: LV: Vector loop of width 8 costs: 26.50 (106/4)
+; CHECK: LV: Vector loop of width 8 costs: 26.5
 ; CHECK: LV: Selecting VF: 4
 define hidden void @four_shorts_same_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
   %5 = icmp eq i32 %3, 0
@@ -257,11 +257,11 @@ define hidden void @four_shorts_same_op(ptr noalias nocapture noundef writeonly
 ; CHECK: LV: Found an estimated cost of 18 for VF 4 For instruction: %10 = load i16
 ; CHECK: LV: Found an estimated cost of 18 for VF 4 For instruction: %12 = load i16
 ; CHECK: LV: Found an estimated cost of 18 for VF 4 For instruction: store i16 %31
-; CHECK: LV: Vector loop of width 4 costs: 15.50 (62/4).
+; CHECK: LV: Vector loop of width 4 costs: 15.5.
 ; CHECK: LV: Found an estimated cost of 68 for VF 8 For instruction: %10 = load i16
 ; CHECK: LV: Found an estimated cost of 68 for VF 8 For instruction: %12 = load i16
 ; CHECK: LV: Found an estimated cost of 68 for VF 8 For instruction: store i16 %31
-; CHECK: LV: Vector loop of width 8 costs: 26.50 (106/4).
+; CHECK: LV: Vector loop of width 8 costs: 26.5.
 ; CHECK: LV: Selecting VF: 4
 define hidden void @four_shorts_split_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
   %5 = icmp eq i32 %3, 0
@@ -317,11 +317,11 @@ define hidden void @four_shorts_split_op(ptr noalias nocapture noundef writeonly
 ; CHECK: LV: Found an estimated cost of 18 for VF 4 For instruction: %10 = load i16
 ; CHECK: LV: Found an estimated cost of 18 for VF 4 For instruction: %12 = load i16
 ; CHECK: LV: Found an estimated cost of 18 for VF 4 For instruction: store i16
-; CHECK: LV: Vector loop of width 4 costs: 15.50 (62/4).
+; CHECK: LV: Vector loop of width 4 costs: 15.5.
 ; CHECK: LV: Found an estimated cost of 68 for VF 8 For instruction: %10 = load i16
 ; CHECK: LV: Found an estimated cost of 68 for VF 8 For instruction: %12 = load i16
 ; CHECK: LV: Found an estimated cost of 68 for VF 8 For instruction: store i16
-; CHECK: LV: Vector loop of width 8 costs: 26.50 (106/4).
+; CHECK: LV: Vector loop of width 8 costs: 26.5.
 ; CHECK: LV: Selecting VF: 4
 define hidden void @four_shorts_interleave_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
   %5 = icmp eq i32 %3, 0
@@ -371,7 +371,7 @@ define hidden void @four_shorts_interleave_op(ptr noalias nocapture noundef writ
 ; CHECK: LV: Found an estimated cost of 84 for VF 8 For instruction: %10 = load i8
 ; CHECK: LV: Found an estimated cost of 84 for VF 8 For instruction: %12 = load i8
 ; CHECK: LV: Found an estimated cost of 84 for VF 8 For instruction: store i8 %37
-; CHECK: LV: Vector loop of width 8 costs: 32.50 (130/4)
+; CHECK: LV: Vector loop of width 8 costs: 32.5
 ; CHECK: LV: Found an estimated cost of 168 for VF 16 For instruction: %10 = load i8
 ; CHECK: LV: Found an estimated cost of 168 for VF 16 For instruction: %12 = load i8
 ; CHECK: LV: Found an estimated cost of 168 for VF 16 For instruction: store i8 %37
@@ -438,7 +438,7 @@ define hidden void @five_shorts(ptr noalias nocapture noundef writeonly %0, ptr
 ; CHECK: LV: Found an estimated cost of 11 for VF 4 For instruction: %10 = load i8
 ; CHECK: LV: Found an estimated cost of 12 for VF 4 For instruction: %13 = mul i8
 ; CHECK: LV: Found an estimated cost of 11 for VF 4 For instruction: store i8
-; CHECK: LV: Vector loop of width 4 costs: 15.25 (61/4).
+; CHECK: LV: Vector loop of width 4 costs: 15.25.
 ; CHECK: LV: Found an estimated cost of 7 for VF 8 For instruction: %12 = load i8
 ; CHECK: LV: Found an estimated cost of 4 for VF 8 For instruction: %13 = mul i8
 ; CHECK: LV: Found an estimated cost of 7 for VF 8 For instruction: store i8
@@ -446,7 +446,7 @@ define hidden void @five_shorts(ptr noalias nocapture noundef writeonly %0, ptr
 ; CHECK: LV: Found an estimated cost of 6 for VF 16 For instruction: %12 = load i8
 ; CHECK: LV: Found an estimated cost of 4 for VF 16 For instruction: %13 = mul i8
 ; CHECK: LV: Found an estimated cost of 6 for VF 16 For instruction: store i8
-; CHECK: LV: Vector loop of width 16 costs: 1.75 (7/4).
+; CHECK: LV: Vector loop of width 16 costs: 1.75.
 ; CHECK: LV: Selecting VF: 16.
 define hidden void @two_bytes_same_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
   %5 = icmp eq i32 %3, 0
@@ -484,19 +484,19 @@ define hidden void @two_bytes_same_op(ptr noalias nocapture noundef writeonly %0
 ; CHECK: LV: Found an estimated cost of 6 for VF 2 For instruction: %12 = load i8
 ; CHECK: LV: Found an estimated cost of 6 for VF 2 For instruction: %13 = mul i8
 ; CHECK: LV: Found an estimated cost of 6 for VF 2 For instruction: store i8 %13
-; CHECK: LV: Vector loop of width 2 costs: 23.50 (94/4).
+; CHECK: LV: Vector loop of width 2 costs: 23.5.
 ; CHECK: LV: Found an estimated cost of 11 for VF 4 For instruction: %10 = load i8
 ; CHECK: LV: Found an estimated cost of 12 for VF 4 For instruction: %13 = mul i8
 ; CHECK: LV: Found an estimated cost of 11 for VF 4 For instruction: store i8
-; CHECK: LV: Vector loop of width 4 costs: 12.50 (50/4).
+; CHECK: LV: Vector loop of width 4 costs: 12.5.
 ; CHECK: LV: Found an estimated cost of 7 for VF 8 For instruction: %12 = load i8
 ; CHECK: LV: Found an estimated cost of 4 for VF 8 For instruction: %13 = mul i8
 ; CHECK: LV: Found an estimated cost of 7 for VF 8 For instruction: store i8
-; CHECK: LV: Vector loop of width 8 costs: 3.75 (15/4).
+; CHECK: LV: Vector loop of width 8 costs: 3.75.
 ; CHECK: LV: Found an estimated cost of 6 for VF 16 For instruction: %12 = load i8
 ; CHECK: LV: Found an estimated cost of 4 for VF 16 For instruction: %13 = mul i8
 ; CHECK: LV: Found an estimated cost of 6 for VF 16 For instruction: store i8 %19
-; CHECK: LV: Vector loop of width 16 costs: 1.50 (6/4).
+; CHECK: LV: Vector loop of width 16 costs: 1.5.
 ; CHECK: LV: Selecting VF: 16.
 define hidden void @two_bytes_vary_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
   %5 = icmp eq i32 %3, 0
@@ -528,16 +528,16 @@ define hidden void @two_bytes_vary_op(ptr noalias nocapture noundef writeonly %0
 
 ; CHECK-LABEL: three_bytes_same_op
 ; CHECK: LV: Scalar loop costs: 16.
-; CHECK: LV: Vector loop of width 2 costs: 30.50 (122/4).
-; CHECK: LV: Vector loop of width 4 costs: 28.75 (115/4).
+; CHECK: LV: Vector loop of width 2 costs: 30.5.
+; CHECK: LV: Vector loop of width 4 costs: 28.75.
 ; CHECK: LV: Found an estimated cost of 24 for VF 8 For instruction: %10 = load i8, ptr %9
 ; CHECK: LV: Found an estimated cost of 24 for VF 8 For instruction: %12 = load i8, ptr %11
 ; CHECK: LV: Found an estimated cost of 24 for VF 8 For instruction: store i8 %25
-; CHECK: LV: Vector loop of width 8 costs: 27.75 (111/4).
+; CHECK: LV: Vector loop of width 8 costs: 27.75.
 ; CHECK: LV: Found an estimated cost of 48 for VF 16 For instruction: %10 = load i8, ptr %9
 ; CHECK: LV: Found an estimated cost of 48 for VF 16 For instruction: %12 = load i8, ptr %11
 ; CHECK: LV: Found an estimated cost of 48 for VF 16 For instruction: store i8 %25
-; CHECK: LV: Vector loop of width 16 costs: 27.25 (109/4).
+; CHECK: LV: Vector loop of width 16 costs: 27.25.
 ; CHECK: LV: Selecting VF: 1.
 define hidden void @three_bytes_same_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
   %5 = icmp eq i32 %3, 0
@@ -576,16 +576,16 @@ define hidden void @three_bytes_same_op(ptr noalias nocapture noundef writeonly
 
 ; CHECK-LABEL: three_bytes_interleave_op
 ; CHECK: LV: Scalar loop costs: 16.
-; CHECK: LV: Vector loop of width 2 costs: 30.50 (122/4).
-; CHECK: LV: Vector loop of width 4 costs: 28.75 (115/4).
+; CHECK: LV: Vector loop of width 2 costs: 30.5.
+; CHECK: LV: Vector loop of width 4 costs: 28.75.
 ; CHECK: LV: Found an estimated cost of 24 for VF 8 For instruction: %10 = load i8, ptr %9
 ; CHECK: LV: Found an estimated cost of 24 for VF 8 For instruction: %12 = load i8, ptr %11
 ; CHECK: LV: Found an estimated cost of 24 for VF 8 For instruction: store i8 %25
-; CHECK: LV: Vector loop of width 8 costs: 27.75 (111/4).
+; CHECK: LV: Vector loop of width 8 costs: 27.75.
 ; CHECK: LV: Found an estimated cost of 48 for VF 16 For instruction: %10 = load i8, ptr %9
 ; CHECK: LV: Found an estimated cost of 48 for VF 16 For instruction: %12 = load i8, ptr %11
 ; CHECK: LV: Found an estimated cost of 48 for VF 16 For instruction: store i8 %25
-; CHECK: LV: Vector loop of width 16 costs: 27.25 (109/4).
+; CHECK: LV: Vector loop of width 16 costs: 27.25.
 ; CHECK: LV: Selecting VF: 1.
 define hidden void @three_bytes_interleave_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
   %5 = icmp eq i32 %3, 0
@@ -631,15 +631,15 @@ define hidden void @three_bytes_interleave_op(ptr noalias nocapture noundef writ
 ; CHECK: LV: Found an estimated cost of 18 for VF 4 For instruction: %10 = load i8
 ; CHECK: LV: Found an estimated cost of 18 for VF 4 For instruction: %12 = load i8
 ; CHECK: LV: Found an estimated cost of 18 for VF 4 For instruction: store i8
-; CHECK: LV: Vector loop of width 4 costs: 15.50 (62/4).
+; CHECK: LV: Vector loop of width 4 costs: 15.5.
 ; CHECK: LV: Found an estimated cost of 26 for VF 8 For instruction: %10 = load i8
 ; CHECK: LV: Found an estimated cost of 26 for VF 8 For instruction: %12 = load i8
 ; CHECK: LV: Found an estimated cost of 26 for VF 8 For instruction: store i8
-; CHECK: LV: Vector loop of width 8 costs: 10.75 (43/4).
+; CHECK: LV: Vector loop of width 8 costs: 10.75.
 ; CHECK: LV: Found an estimated cost of 132 for VF 16 For instruction: %10 = load i8
 ; CHECK: LV: Found an estimated cost of 132 for VF 16 For instruction: %12 = load i8
 ; CHECK: LV: Found an estimated cost of 132 for VF 16 For instruction: store i8
-; CHECK: LV: Vector loop of width 16 costs: 25.25 (101/4).
+; CHECK: LV: Vector loop of width 16 costs: 25.25.
 ; CHECK: LV: Selecting VF: 8.
 define hidden void @four_bytes_same_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
   %5 = icmp eq i32 %3, 0
@@ -698,12 +698,12 @@ define hidden void @four_bytes_same_op(ptr noalias nocapture noundef writeonly %
 ; CHECK: LV: Found an estimated cost of 26 for VF 8 For instruction: %12 = load i8
 ; CHECK: LV: Found an estimated cost of 4 for VF 8 For instruction: %13 = mul i8
 ; CHECK: LV: Found an estimated cost of 26 for VF 8 For instruction: store i8
-; CHECK: LV: Vector loop of width 8 costs: 11.50 (46/4).
+; CHECK: LV: Vector loop of width 8 costs: 11.5.
 ; CHECK: LV: Found an estimated cost of 132 for VF 16 For instruction: %10 = load i8
 ; CHECK: LV: Found an estimated cost of 132 for VF 16 For instruction: %12 = load i8
 ; CHECK: LV: Found an estimated cost of 4 for VF 16 For instruction: %13 = mul i8
 ; CHECK: LV: Found an estimated cost of 132 for VF 16 For instruction: store i8
-; CHECK: LV: Vector loop of width 16 costs: 25.50 (102/4)
+; CHECK: LV: Vector loop of width 16 costs: 25.5
 ; CHECK: LV: Selecting VF: 8
 define hidden void @four_bytes_split_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
   %5 = icmp eq i32 %3, 0
@@ -757,15 +757,15 @@ define hidden void @four_bytes_split_op(ptr noalias nocapture noundef writeonly
 ; CHECK: LV: Found an estimated cost of 18 for VF 4 For instruction: %10 = load i8
 ; CHECK: LV: Found an estimated cost of 18 for VF 4 For instruction: %12 = load i8
 ; CHECK: LV: Found an estimated cost of 18 for VF 4 For instruction: store i8
-; CHECK: LV: Vector loop of width 4 costs: 15.50 (62/4)
+; CHECK: LV: Vector loop of width 4 costs: 15.5
 ; CHECK: LV: Found an estimated cost of 26 for VF 8 For instruction: %10 = load i8
 ; CHECK: LV: Found an estimated cost of 26 for VF 8 For instruction: %12 = load i8
 ; CHECK: LV: Found an estimated cost of 26 for VF 8 For instruction: store i8
-; CHECK: LV: Vector loop of width 8 costs: 10.75 (43/4)
+; CHECK: LV: Vector loop of width 8 costs: 10.75
 ; CHECK: LV: Found an estimated cost of 132 for VF 16 For instruction: %10 = load i8
 ; CHECK: LV: Found an estimated cost of 132 for VF 16 For instruction: %12 = load i8
 ; CHECK: LV: Found an estimated cost of 132 for VF 16 For instruction: store i8
-; CHECK: LV: Vector loop of width 16 costs: 25.25 (101/4)
+; CHECK: LV: Vector loop of width 16 costs: 25.25
 ; CHECK: LV: Selecting VF: 8
 define hidden void @four_bytes_interleave_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
   %5 = icmp eq i32 %3, 0
@@ -817,7 +817,7 @@ define hidden void @four_bytes_interleave_op(ptr noalias nocapture noundef write
 ; CHECK: LV: Found an estimated cost of 66 for VF 4 For instruction: %10 = load i8
 ; CHECK: LV: Found an estimated cost of 66 for VF 4 For instruction: %12 = load i8
 ; CHECK: LV: Found an estimated cost of 66 for VF 4 For instruction: store i8 %55
-; CHECK: LV: Vector loop of width 4 costs: 74.50 (298/4)
+; CHECK: LV: Vector loop of width 4 costs: 74.5
 ; CHECK: LV: Found an estimated cost of 132 for VF 8 For instruction: %10 = load i8
 ; CHECK: LV: Found an estimated cost of 132 for VF 8 For instruction: %12 = load i8
 ; CHECK: LV: Found an estimated cost of 132 for VF 8 For instruction: store i8 %55
@@ -825,7 +825,7 @@ define hidden void @four_bytes_interleave_op(ptr noalias nocapture noundef write
 ; CHECK: LV: Found an estimated cost of 264 for VF 16 For instruction: %10 = load i8
 ; CHECK: LV: Found an estimated cost of 264 for VF 16 For instruction: %12 = load i8
 ; CHECK: LV: Found an estimated cost of 264 for VF 16 For instruction: store i8 %55
-; CHECK: LV: Vector loop of width 16 costs: 51.75 (207/4)
+; CHECK: LV: Vector loop of width 16 costs: 51.75
 ; CHECK: LV: Selecting VF: 1
 define hidden void @eight_bytes_same_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
   %5 = icmp eq i32 %3, 0
@@ -901,7 +901,7 @@ define hidden void @eight_bytes_same_op(ptr noalias nocapture noundef writeonly
 ; CHECK: LV: Found an estimated cost of 264 for VF 16 For instruction: %10 = load i8
 ; CHECK: LV: Found an estimated cost of 264 for VF 16 For instruction: %12 = load i8
 ; CHECK: LV: Found an estimated cost of 264 for VF 16 For instruction: store i8 %55
-; CHECK: LV: Vector loop of width 16 costs: 50.25 (201/4)
+; CHECK: LV: Vector loop of width 16 costs: 50.25
 ; CHECK: LV: Selecting VF: 1
 define hidden void @eight_bytes_split_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
   %5 = icmp eq i32 %3, 0
@@ -977,7 +977,7 @@ define hidden void @eight_bytes_split_op(ptr noalias nocapture noundef writeonly
 ; CHECK: LV: Found an estimated cost of 264 for VF 16 For instruction: %10 = load i8
 ; CHECK: LV: Found an estimated cost of 264 for VF 16 For instruction: %12 = load i8
 ; CHECK: LV: Found an estimated cost of 264 for VF 16 For instruction: store i8 %55
-; CHECK: LV: Vector loop of width 16 costs: 50.25 (201/4)
+; CHECK: LV: Vector loop of width 16 costs: 50.25
 ; CHECK: LV: Selecting VF: 1
 define hidden void @eight_bytes_interleave_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
   %5 = icmp eq i32 %3, 0
@@ -1126,7 +1126,7 @@ define hidden void @four_bytes_into_four_ints_same_op(ptr noalias nocapture noun
 ; CHECK: LV: Found an estimated cost of 6 for VF 2 For instruction: %10 = load i8
 ; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction: %11 = zext i8
 ; CHECK: LV: Found an estimated cost of 14 for VF 2 For instruction: store i32
-; CHECK: LV: Vector loop of width 2 costs: 35.50 (142/4).
+; CHECK: LV: Vector loop of width 2 costs: 35.5.
 ; CHECK: LV: Found an estimated cost of 18 for VF 4 For instruction: %10 = load i8
 ; CHECK: LV: Found an estimated cost of 2 for VF 4 For instruction:  %11 = zext i8
 ; CHECK: LV: Found an estimated cost of 24 for VF 4 For instruction: store i32
@@ -1184,8 +1184,8 @@ define hidden void @four_bytes_into_four_ints_vary_op(ptr noalias nocapture noun
 ; CHECK-LABEL: scale_uv_row_down2
 ; CHECK: LV: Scalar loop costs: 10.
 ; CHECK: LV: Vector loop of width 2 costs: 13.
-; CHECK: LV: Vector loop of width 4 costs: 8.75 (35/4).
-; CHECK: LV: Vector loop of width 8 costs: 4.75 (19/4).
+; CHECK: LV: Vector loop of width 4 costs: 8.75.
+; CHECK: LV: Vector loop of width 8 costs: 4.75.
 ; CHECK: LV: Vector loop of width 16 costs: 5.
 ; CHECK: LV: Selecting VF: 8.
 define hidden void @scale_uv_row_down2(ptr nocapture noundef readonly %0, i32 noundef %1, ptr nocapture noundef writeonly %2, i32 noundef %3) {
@@ -1219,7 +1219,7 @@ define hidden void @scale_uv_row_down2(ptr nocapture noundef readonly %0, i32 no
 ; CHECK: LV: Found an estimated cost of 18 for VF 4 For instruction: %14 = load i8
 ; CHECK: LV: Found an estimated cost of 18 for VF 4 For instruction: %20 = load i8
 ; CHECK: LV: Found an estimated cost of 11 for VF 4 For instruction: store i8 %48
-; CHECK: LV: Vector loop of width 4 costs: 18.25 (73/4).
+; CHECK: LV: Vector loop of width 4 costs: 18.25.
 ; CHECK: LV: Found an estimated cost of 26 for VF 8 For instruction: %14 = load i8
 ; CHECK: LV: Found an estimated cost of 26 for VF 8 For instruction: %20 = load i8
 ; CHECK: LV: Found an estimated cost of 7 for VF 8 For instruction: store i8 %48
@@ -1299,12 +1299,12 @@ define hidden void @scale_uv_row_down2_box(ptr nocapture noundef readonly %0, i3
 ; CHECK: LV: Vector loop of width 2 costs: 25.
 ; CHECK: LV: Found an estimated cost of 18 for VF 4 For instruction: %10 = load i8
 ; CHECK: LV: Found an estimated cost of 11 for VF 4 For instruction: store i8
-; CHECK: LV: Vector loop of width 4 costs: 11.75 (47/4).
+; CHECK: LV: Vector loop of width 4 costs: 11.75.
 ; CHECK: LV: Found an estimated cost of 26 for VF 8 For instruction: %10 = load i8
 ; CHECK: LV: Found an estimated cost of 7 for VF 8 For instruction: store i8
-; CHECK: LV: Vector loop of width 8 costs: 6.75 (27/4).
+; CHECK: LV: Vector loop of width 8 costs: 6.75.
 ; CHECK: LV: Found an estimated cost of 132 for VF 16 For instruction: %10 = load i8
-; CHECK: LV: Vector loop of width 16 costs: 10.75 (43/4).
+; CHECK: LV: Vector loop of width 16 costs: 10.75.
 ; CHECK: LV: Selecting VF: 8.
 define hidden void @scale_uv_row_down2_linear(ptr nocapture noundef readonly %0, i32 noundef %1, ptr nocapture noundef writeonly %2, i32 noundef %3) {
   %5 = icmp sgt i32 %3, 0
@@ -1356,7 +1356,7 @@ define hidden void @scale_uv_row_down2_linear(ptr nocapture noundef readonly %0,
 ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2
 ; CHECK: LV: Scalar loop costs: 14.
 ; CHECK: LV: Vector loop of width 2 costs: 19.
-; CHECK: LV: Vector loop of width 4 costs: 15.50 (62/4).
+; CHECK: LV: Vector loop of width 4 costs: 15.5.
 ; CHECK: LV: Selecting VF: 1.
 define hidden void @two_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
@@ -1397,7 +1397,7 @@ for.body:                                         ; preds = %entry, %for.body
 ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2
 ; CHECK: LV: Scalar loop costs: 14.
 ; CHECK: LV: Vector loop of width 2 costs: 19.
-; CHECK: LV: Vector loop of width 4 costs: 15.50 (62/4).
+; CHECK: LV: Vector loop of width 4 costs: 15.5.
 ; CHECK: LV: Selecting VF: 1.
 define hidden void @two_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
@@ -1519,7 +1519,7 @@ for.body:                                         ; preds = %entry, %for.body
 ; CHECK: Cost of 11 for VF 4: INTERLEAVE-GROUP with factor 2
 ; CHECK: LV: Scalar loop costs: 16
 ; CHECK: LV: Vector loop of width 2 costs: 26
-; CHECK: LV: Vector loop of width 4 costs: 16.75 (67/4).
+; CHECK: LV: Vector loop of width 4 costs: 16.75.
 ; CHECK: LV: Selecting VF: 1.
 define hidden void @two_floats_two_bytes_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
@@ -1559,7 +1559,7 @@ for.body:                                         ; preds = %entry, %for.body
 ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2
 ; CHECK: LV: Scalar loop costs: 16
 ; CHECK: LV: Vector loop of width 2 costs: 26
-; CHECK: LV: Vector loop of width 4 costs: 16.75 (67/4).
+; CHECK: LV: Vector loop of width 4 costs: 16.75.
 ; CHECK: LV: Selecting VF: 1.
 define hidden void @two_floats_two_bytes_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
@@ -1688,8 +1688,8 @@ for.body:                                         ; preds = %entry, %for.body
 ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2
 ; CHECK: Cost of 7 for VF 4: INTERLEAVE-GROUP with factor 2
 ; CHECK: LV: Scalar loop costs: 16
-; CHECK: LV: Vector loop of width 2 costs: 23.50 (94/4)
-; CHECK: LV: Vector loop of width 4 costs: 14.75 (59/4)
+; CHECK: LV: Vector loop of width 2 costs: 23.5
+; CHECK: LV: Vector loop of width 4 costs: 14.75
 ; CHECK: LV: Selecting VF: 4
 define hidden void @two_floats_two_shorts_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
@@ -1730,8 +1730,8 @@ for.body:                                         ; preds = %entry, %for.body
 ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2
 ; CHECK: Cost of 7 for VF 4: INTERLEAVE-GROUP with factor 2
 ; CHECK: LV: Scalar loop costs: 16
-; CHECK: LV: Vector loop of width 2 costs: 23.50 (94/4)
-; CHECK: LV: Vector loop of width 4 costs: 14.75 (59/4)
+; CHECK: LV: Vector loop of width 2 costs: 23.5
+; CHECK: LV: Vector loop of width 4 costs: 14.75
 ; CHECK: LV: Selecting VF: 4
 define hidden void @two_floats_two_shorts_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
@@ -1997,7 +1997,7 @@ for.body:                                         ; preds = %entry, %for.body
 ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4
 ; CHECK: LV: Scalar loop costs: 28
 ; CHECK: LV: Vector loop of width 2 costs: 48
-; CHECK: LV: Vector loop of width 4 costs: 31.50 (126/4)
+; CHECK: LV: Vector loop of width 4 costs: 31.5
 ; CHECK: LV: Selecting VF: 1
 define hidden void @four_floats_four_bytes_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
@@ -2054,7 +2054,7 @@ for.body:                                         ; preds = %entry, %for.body
 ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4
 ; CHECK: LV: Scalar loop costs: 28
 ; CHECK: LV: Vector loop of width 2 costs: 48
-; CHECK: LV: Vector loop of width 4 costs: 31.50 (126/4)
+; CHECK: LV: Vector loop of width 4 costs: 31.5
 ; CHECK: LV: Selecting VF: 1
 define hidden void @four_floats_four_bytes_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
@@ -2236,7 +2236,7 @@ for.body:                                         ; preds = %entry, %for.body
 ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4
 ; CHECK: LV: Scalar loop costs: 28
 ; CHECK: LV: Vector loop of width 2 costs: 41
-; CHECK: LV: Vector loop of width 4 costs: 29.50 (118/4)
+; CHECK: LV: Vector loop of width 4 costs: 29.5
 ; CHECK: LV: Selecting VF: 1
 define hidden void @four_floats_four_shorts_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
@@ -2294,7 +2294,7 @@ for.body:                                         ; preds = %entry, %for.body
 ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4
 ; CHECK: LV: Scalar loop costs: 28
 ; CHECK: LV: Vector loop of width 2 costs: 41
-; CHECK: LV: Vector loop of width 4 costs: 29.50 (118/4)
+; CHECK: LV: Vector loop of width 4 costs: 29.5
 ; CHECK: LV: Selecting VF: 1
 define hidden void @four_floats_four_shorts_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-3.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-3.ll
index d1332da8d2f07..9db9552a5c8ce 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-3.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-3.ll
@@ -43,7 +43,7 @@ define void @test() {
 ; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4
 ; AVX512:  LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4
 ; AVX512:  LV: Found an estimated cost of 4 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4
-; AVX512:  LV: Found an estimated cost of 6.50 (26/4) for VF 8 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 6.5 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4
 ; AVX512:  LV: Found an estimated cost of 12 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4
 ; AVX512:  LV: Found an estimated cost of 51 for VF 32 For instruction: %v0 = load float, ptr %in0, align 4
 ; AVX512:  LV: Found an estimated cost of 210 for VF 64 For instruction: %v0 = load float, ptr %in0, align 4
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-5.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-5.ll
index 5bc85746af9ba..67ced9388c171 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-5.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-5.ll
@@ -134,7 +134,7 @@ define void @test() {
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load float, ptr %in2, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load float, ptr %in3, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load float, ptr %in4, align 4
-; AVX512:  LV: Found an estimated cost of 9.50 (38/4) for VF 4 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 9.5 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load float, ptr %in1, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load float, ptr %in2, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load float, ptr %in3, align 4
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-7.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-7.ll
index b2fa902249018..97d2d39f300a0 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-7.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-7.ll
@@ -180,14 +180,14 @@ define void @test() {
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load float, ptr %in4, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load float, ptr %in5, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load float, ptr %in6, align 4
-; AVX512:  LV: Found an estimated cost of 12.50 (50/4) for VF 4 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 12.5 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load float, ptr %in1, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load float, ptr %in2, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load float, ptr %in3, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load float, ptr %in4, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load float, ptr %in5, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load float, ptr %in6, align 4
-; AVX512:  LV: Found an estimated cost of 35.50 (142/4) for VF 8 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 35.5 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load float, ptr %in1, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load float, ptr %in2, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load float, ptr %in3, align 4
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-3.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-3.ll
index 1b3b24b32e7c2..36d3e98c49b64 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-3.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-3.ll
@@ -88,7 +88,7 @@ define void @test() {
 ; AVX512:  LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8
-; AVX512:  LV: Found an estimated cost of 6.50 (26/4) for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 6.5 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8
 ; AVX512:  LV: Found an estimated cost of 12 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-5.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-5.ll
index 5413f6bce5427..7101a92299bc0 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-5.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-5.ll
@@ -129,7 +129,7 @@ define void @test() {
 ; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
 ; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
 ; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
-; AVX512:  LV: Found an estimated cost of 9.50 (38/4) for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 9.5 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load double, ptr %in3, align 8
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-7.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-7.ll
index 6d62c5ab28e61..cb8a04eba2034 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-7.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-7.ll
@@ -173,14 +173,14 @@ define void @test() {
 ; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
 ; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load double, ptr %in5, align 8
 ; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load double, ptr %in6, align 8
-; AVX512:  LV: Found an estimated cost of 12.50 (50/4) for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 12.5 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load double, ptr %in3, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load double, ptr %in4, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load double, ptr %in5, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load double, ptr %in6, align 8
-; AVX512:  LV: Found an estimated cost of 35.50 (142/4) for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 35.5 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load double, ptr %in3, align 8
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i16-stride-3.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i16-stride-3.ll
index 88a4beb8b5170..5ce556fb356de 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i16-stride-3.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i16-stride-3.ll
@@ -55,7 +55,7 @@ define void @test() {
 ; AVX512BW:  LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2
 ; AVX512BW:  LV: Found an estimated cost of 7 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2
 ; AVX512BW:  LV: Found an estimated cost of 7 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2
-; AVX512BW:  LV: Found an estimated cost of 9.50 (38/4) for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW:  LV: Found an estimated cost of 9.5 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2
 ; AVX512BW:  LV: Found an estimated cost of 18 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2
 ; AVX512BW:  LV: Found an estimated cost of 81 for VF 64 For instruction: %v0 = load i16, ptr %in0, align 2
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i16-stride-5.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i16-stride-5.ll
index f99d056c707f8..bf1ee0665d4ae 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i16-stride-5.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i16-stride-5.ll
@@ -182,7 +182,7 @@ define void @test() {
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i16, ptr %in2, align 2
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i16, ptr %in3, align 2
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i16, ptr %in4, align 2
-; AVX512BW:  LV: Found an estimated cost of 14.50 (58/4) for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW:  LV: Found an estimated cost of 14.5 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i16, ptr %in1, align 2
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i16, ptr %in2, align 2
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i16, ptr %in3, align 2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i16-stride-7.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i16-stride-7.ll
index 0b14e7b637c25..0f6353f79ee4a 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i16-stride-7.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i16-stride-7.ll
@@ -246,14 +246,14 @@ define void @test() {
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i16, ptr %in4, align 2
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i16, ptr %in5, align 2
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i16, ptr %in6, align 2
-; AVX512BW:  LV: Found an estimated cost of 19.50 (78/4) for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW:  LV: Found an estimated cost of 19.5 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i16, ptr %in1, align 2
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i16, ptr %in2, align 2
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i16, ptr %in3, align 2
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i16, ptr %in4, align 2
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i16, ptr %in5, align 2
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i16, ptr %in6, align 2
-; AVX512BW:  LV: Found an estimated cost of 56.50 (226/4) for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW:  LV: Found an estimated cost of 56.5 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i16, ptr %in1, align 2
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i16, ptr %in2, align 2
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i16, ptr %in3, align 2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-3.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-3.ll
index ab7050f9e5a21..faa0d95c86fab 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-3.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-3.ll
@@ -43,7 +43,7 @@ define void @test() {
 ; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
 ; AVX512:  LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4
 ; AVX512:  LV: Found an estimated cost of 4 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
-; AVX512:  LV: Found an estimated cost of 6.50 (26/4) for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 6.5 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
 ; AVX512:  LV: Found an estimated cost of 12 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4
 ; AVX512:  LV: Found an estimated cost of 51 for VF 32 For instruction: %v0 = load i32, ptr %in0, align 4
 ; AVX512:  LV: Found an estimated cost of 210 for VF 64 For instruction: %v0 = load i32, ptr %in0, align 4
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-4-indices-012u.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-4-indices-012u.ll
index 14b3256110df9..5cabbc505e85c 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-4-indices-012u.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-4-indices-012u.ll
@@ -43,8 +43,8 @@ define void @test() {
 ; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
 ; AVX512:  LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4
 ; AVX512:  LV: Found an estimated cost of 4 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
-; AVX512:  LV: Found an estimated cost of 6.50 (26/4) for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
-; AVX512:  LV: Found an estimated cost of 17.50 (70/4) for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 6.5 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 17.5 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4
 ; AVX512:  LV: Found an estimated cost of 71 for VF 32 For instruction: %v0 = load i32, ptr %in0, align 4
 ; AVX512:  LV: Found an estimated cost of 80 for VF 64 For instruction: %v0 = load i32, ptr %in0, align 4
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-5.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-5.ll
index 623d37bf8c2c8..4b9af87b0ec99 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-5.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-5.ll
@@ -134,7 +134,7 @@ define void @test() {
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i32, ptr %in2, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i32, ptr %in3, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i32, ptr %in4, align 4
-; AVX512:  LV: Found an estimated cost of 9.50 (38/4) for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 9.5 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i32, ptr %in1, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i32, ptr %in2, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i32, ptr %in3, align 4
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-7.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-7.ll
index beeea5b554208..360db0329aabf 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-7.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-7.ll
@@ -180,14 +180,14 @@ define void @test() {
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i32, ptr %in4, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i32, ptr %in5, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i32, ptr %in6, align 4
-; AVX512:  LV: Found an estimated cost of 12.50 (50/4) for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 12.5 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i32, ptr %in1, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i32, ptr %in2, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i32, ptr %in3, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i32, ptr %in4, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i32, ptr %in5, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i32, ptr %in6, align 4
-; AVX512:  LV: Found an estimated cost of 35.50 (142/4) for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 35.5 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i32, ptr %in1, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i32, ptr %in2, align 4
 ; AVX512:  LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i32, ptr %in3, align 4
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i64-stride-3.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i64-stride-3.ll
index 24a2ef36decf9..078c9ef5769f2 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i64-stride-3.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i64-stride-3.ll
@@ -88,7 +88,7 @@ define void @test() {
 ; AVX512:  LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8
-; AVX512:  LV: Found an estimated cost of 6.50 (26/4) for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 6.5 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8
 ; AVX512:  LV: Found an estimated cost of 12 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i64-stride-5.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i64-stride-5.ll
index 07962ca560c20..6568dff68e34b 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i64-stride-5.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i64-stride-5.ll
@@ -129,7 +129,7 @@ define void @test() {
 ; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
 ; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
 ; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
-; AVX512:  LV: Found an estimated cost of 9.50 (38/4) for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 9.5 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i64, ptr %in3, align 8
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i64-stride-7.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i64-stride-7.ll
index 3964b1d45ae3a..5282caa8d6adc 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i64-stride-7.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i64-stride-7.ll
@@ -173,14 +173,14 @@ define void @test() {
 ; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
 ; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i64, ptr %in5, align 8
 ; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i64, ptr %in6, align 8
-; AVX512:  LV: Found an estimated cost of 12.50 (50/4) for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 12.5 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i64, ptr %in3, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i64, ptr %in4, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i64, ptr %in5, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i64, ptr %in6, align 8
-; AVX512:  LV: Found an estimated cost of 35.50 (142/4) for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 35.5 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8
 ; AVX512:  LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i64, ptr %in3, align 8
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i8-stride-5.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i8-stride-5.ll
index 2e071f0df74c9..ece32265e2ace 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i8-stride-5.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i8-stride-5.ll
@@ -187,7 +187,7 @@ define void @test() {
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i8, ptr %in2, align 1
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i8, ptr %in3, align 1
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i8, ptr %in4, align 1
-; AVX512BW:  LV: Found an estimated cost of 99.50 (398/4) for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW:  LV: Found an estimated cost of 99.5 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i8, ptr %in1, align 1
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i8, ptr %in2, align 1
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i8, ptr %in3, align 1
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i8-stride-7.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i8-stride-7.ll
index c2fccc77b7b5e..5a7c900d21116 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i8-stride-7.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i8-stride-7.ll
@@ -253,14 +253,14 @@ define void @test() {
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i8, ptr %in4, align 1
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i8, ptr %in5, align 1
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i8, ptr %in6, align 1
-; AVX512BW:  LV: Found an estimated cost of 138.50 (554/4) for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW:  LV: Found an estimated cost of 138.5 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i8, ptr %in1, align 1
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i8, ptr %in2, align 1
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i8, ptr %in3, align 1
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i8, ptr %in4, align 1
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i8, ptr %in5, align 1
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i8, ptr %in6, align 1
-; AVX512BW:  LV: Found an estimated cost of 413.50 (1654/4) for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW:  LV: Found an estimated cost of 413.5 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i8, ptr %in1, align 1
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i8, ptr %in2, align 1
 ; AVX512BW:  LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i8, ptr %in3, align 1
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-interleaved-store-i16.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-interleaved-store-i16.ll
index e2bc6aded0567..ff84b8bcd3550 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-interleaved-store-i16.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-interleaved-store-i16.ll
@@ -157,7 +157,7 @@ define void @test(ptr noalias nocapture %points, ptr noalias nocapture readonly
 ; DISABLED_MASKED_STRIDED:  LV: Found an estimated cost of 2 for VF 2 For instruction: store i16 %0, ptr %arrayidx6, align 2
 ; DISABLED_MASKED_STRIDED:  LV: Found an estimated cost of 4 for VF 4 For instruction: store i16 %0, ptr %arrayidx6, align 2
 ; DISABLED_MASKED_STRIDED:  LV: Found an estimated cost of 8 for VF 8 For instruction: store i16 %0, ptr %arrayidx6, align 2
-; DISABLED_MASKED_STRIDED:  LV: Found an estimated cost of 16.50 (66/4) for VF 16 For instruction: store i16 %0, ptr %arrayidx6, align 2
+; DISABLED_MASKED_STRIDED:  LV: Found an estimated cost of 16.5 for VF 16 For instruction: store i16 %0, ptr %arrayidx6, align 2
 ;
 ; ENABLED_MASKED_STRIDED-LABEL: 'test'
 ; ENABLED_MASKED_STRIDED:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %0, ptr %arrayidx6, align 2
@@ -165,7 +165,7 @@ define void @test(ptr noalias nocapture %points, ptr noalias nocapture readonly
 ; ENABLED_MASKED_STRIDED:  LV: Found an estimated cost of 2 for VF 2 For instruction: store i16 %0, ptr %arrayidx6, align 2
 ; ENABLED_MASKED_STRIDED:  LV: Found an estimated cost of 4 for VF 4 For instruction: store i16 %0, ptr %arrayidx6, align 2
 ; ENABLED_MASKED_STRIDED:  LV: Found an estimated cost of 8 for VF 8 For instruction: store i16 %0, ptr %arrayidx6, align 2
-; ENABLED_MASKED_STRIDED:  LV: Found an estimated cost of 16.50 (66/4) for VF 16 For instruction: store i16 %0, ptr %arrayidx6, align 2
+; ENABLED_MASKED_STRIDED:  LV: Found an estimated cost of 16.5 for VF 16 For instruction: store i16 %0, ptr %arrayidx6, align 2
 ;
 entry:
   br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-scatter-i32-with-i8-index.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-scatter-i32-with-i8-index.ll
index 46b3ff36be17e..121f1e8179373 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-scatter-i32-with-i8-index.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-scatter-i32-with-i8-index.ll
@@ -19,8 +19,8 @@ define void @test() {
 ; SSE2-LABEL: 'test'
 ; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %valB, ptr %out, align 4
 ; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %valB, ptr %out, align 4
-; SSE2:  LV: Found an estimated cost of 2.50 (10/4) for VF 2 For instruction: store i32 %valB, ptr %out, align 4
-; SSE2:  LV: Found an estimated cost of 5.50 (22/4) for VF 4 For instruction: store i32 %valB, ptr %out, align 4
+; SSE2:  LV: Found an estimated cost of 2.5 for VF 2 For instruction: store i32 %valB, ptr %out, align 4
+; SSE2:  LV: Found an estimated cost of 5.5 for VF 4 For instruction: store i32 %valB, ptr %out, align 4
 ; SSE2:  LV: Found an estimated cost of 11 for VF 8 For instruction: store i32 %valB, ptr %out, align 4
 ; SSE2:  LV: Found an estimated cost of 22 for VF 16 For instruction: store i32 %valB, ptr %out, align 4
 ;
@@ -37,7 +37,7 @@ define void @test() {
 ; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %valB, ptr %out, align 4
 ; AVX1:  LV: Found an estimated cost of 2 for VF 2 For instruction: store i32 %valB, ptr %out, align 4
 ; AVX1:  LV: Found an estimated cost of 4 for VF 4 For instruction: store i32 %valB, ptr %out, align 4
-; AVX1:  LV: Found an estimated cost of 8.50 (34/4) for VF 8 For instruction: store i32 %valB, ptr %out, align 4
+; AVX1:  LV: Found an estimated cost of 8.5 for VF 8 For instruction: store i32 %valB, ptr %out, align 4
 ; AVX1:  LV: Found an estimated cost of 17 for VF 16 For instruction: store i32 %valB, ptr %out, align 4
 ; AVX1:  LV: Found an estimated cost of 34 for VF 32 For instruction: store i32 %valB, ptr %out, align 4
 ;
@@ -46,7 +46,7 @@ define void @test() {
 ; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %valB, ptr %out, align 4
 ; AVX2:  LV: Found an estimated cost of 2 for VF 2 For instruction: store i32 %valB, ptr %out, align 4
 ; AVX2:  LV: Found an estimated cost of 4 for VF 4 For instruction: store i32 %valB, ptr %out, align 4
-; AVX2:  LV: Found an estimated cost of 8.50 (34/4) for VF 8 For instruction: store i32 %valB, ptr %out, align 4
+; AVX2:  LV: Found an estimated cost of 8.5 for VF 8 For instruction: store i32 %valB, ptr %out, align 4
 ; AVX2:  LV: Found an estimated cost of 17 for VF 16 For instruction: store i32 %valB, ptr %out, align 4
 ; AVX2:  LV: Found an estimated cost of 34 for VF 32 For instruction: store i32 %valB, ptr %out, align 4
 ;
@@ -54,7 +54,7 @@ define void @test() {
 ; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %valB, ptr %out, align 4
 ; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %valB, ptr %out, align 4
 ; AVX512:  LV: Found an estimated cost of 5 for VF 2 For instruction: store i32 %valB, ptr %out, align 4
-; AVX512:  LV: Found an estimated cost of 10.50 (42/4) for VF 4 For instruction: store i32 %valB, ptr %out, align 4
+; AVX512:  LV: Found an estimated cost of 10.5 for VF 4 For instruction: store i32 %valB, ptr %out, align 4
 ; AVX512:  LV: Found an estimated cost of 10 for VF 8 For instruction: store i32 %valB, ptr %out, align 4
 ; AVX512:  LV: Found an estimated cost of 18 for VF 16 For instruction: store i32 %valB, ptr %out, align 4
 ; AVX512:  LV: Found an estimated cost of 36 for VF 32 For instruction: store i32 %valB, ptr %out, align 4
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-scatter-i64-with-i8-index.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-scatter-i64-with-i8-index.ll
index 6ffedeac86fe9..0e8adf423b519 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-scatter-i64-with-i8-index.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-scatter-i64-with-i8-index.ll
@@ -19,7 +19,7 @@ define void @test() {
 ; SSE2-LABEL: 'test'
 ; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %valB, ptr %out, align 8
 ; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %valB, ptr %out, align 8
-; SSE2:  LV: Found an estimated cost of 2.50 (10/4) for VF 2 For instruction: store i64 %valB, ptr %out, align 8
+; SSE2:  LV: Found an estimated cost of 2.5 for VF 2 For instruction: store i64 %valB, ptr %out, align 8
 ; SSE2:  LV: Found an estimated cost of 5 for VF 4 For instruction: store i64 %valB, ptr %out, align 8
 ; SSE2:  LV: Found an estimated cost of 10 for VF 8 For instruction: store i64 %valB, ptr %out, align 8
 ; SSE2:  LV: Found an estimated cost of 20 for VF 16 For instruction: store i64 %valB, ptr %out, align 8
@@ -36,7 +36,7 @@ define void @test() {
 ; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %valB, ptr %out, align 8
 ; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %valB, ptr %out, align 8
 ; AVX1:  LV: Found an estimated cost of 2 for VF 2 For instruction: store i64 %valB, ptr %out, align 8
-; AVX1:  LV: Found an estimated cost of 4.50 (18/4) for VF 4 For instruction: store i64 %valB, ptr %out, align 8
+; AVX1:  LV: Found an estimated cost of 4.5 for VF 4 For instruction: store i64 %valB, ptr %out, align 8
 ; AVX1:  LV: Found an estimated cost of 9 for VF 8 For instruction: store i64 %valB, ptr %out, align 8
 ; AVX1:  LV: Found an estimated cost of 18 for VF 16 For instruction: store i64 %valB, ptr %out, align 8
 ; AVX1:  LV: Found an estimated cost of 36 for VF 32 For instruction: store i64 %valB, ptr %out, align 8
@@ -45,7 +45,7 @@ define void @test() {
 ; AVX2-NOFAST:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %valB, ptr %out, align 8
 ; AVX2-NOFAST:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %valB, ptr %out, align 8
 ; AVX2-NOFAST:  LV: Found an estimated cost of 2 for VF 2 For instruction: store i64 %valB, ptr %out, align 8
-; AVX2-NOFAST:  LV: Found an estimated cost of 4.50 (18/4) for VF 4 For instruction: store i64 %valB, ptr %out, align 8
+; AVX2-NOFAST:  LV: Found an estimated cost of 4.5 for VF 4 For instruction: store i64 %valB, ptr %out, align 8
 ; AVX2-NOFAST:  LV: Found an estimated cost of 9 for VF 8 For instruction: store i64 %valB, ptr %out, align 8
 ; AVX2-NOFAST:  LV: Found an estimated cost of 18 for VF 16 For instruction: store i64 %valB, ptr %out, align 8
 ; AVX2-NOFAST:  LV: Found an estimated cost of 36 for VF 32 For instruction: store i64 %valB, ptr %out, align 8
@@ -54,7 +54,7 @@ define void @test() {
 ; AVX2-FAST:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %valB, ptr %out, align 8
 ; AVX2-FAST:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %valB, ptr %out, align 8
 ; AVX2-FAST:  LV: Found an estimated cost of 2 for VF 2 For instruction: store i64 %valB, ptr %out, align 8
-; AVX2-FAST:  LV: Found an estimated cost of 4.50 (18/4) for VF 4 For instruction: store i64 %valB, ptr %out, align 8
+; AVX2-FAST:  LV: Found an estimated cost of 4.5 for VF 4 For instruction: store i64 %valB, ptr %out, align 8
 ; AVX2-FAST:  LV: Found an estimated cost of 9 for VF 8 For instruction: store i64 %valB, ptr %out, align 8
 ; AVX2-FAST:  LV: Found an estimated cost of 18 for VF 16 For instruction: store i64 %valB, ptr %out, align 8
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i16.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i16.ll
index 225d54e9c4c07..9a5a134aa59ee 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i16.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i16.ll
@@ -29,7 +29,7 @@ define void @test(ptr %C) {
 ; AVX1:  LV: Found an estimated cost of 2 for VF 2 For instruction: store i16 %valB, ptr %out, align 2
 ; AVX1:  LV: Found an estimated cost of 4 for VF 4 For instruction: store i16 %valB, ptr %out, align 2
 ; AVX1:  LV: Found an estimated cost of 8 for VF 8 For instruction: store i16 %valB, ptr %out, align 2
-; AVX1:  LV: Found an estimated cost of 16.50 (66/4) for VF 16 For instruction: store i16 %valB, ptr %out, align 2
+; AVX1:  LV: Found an estimated cost of 16.5 for VF 16 For instruction: store i16 %valB, ptr %out, align 2
 ; AVX1:  LV: Found an estimated cost of 33 for VF 32 For instruction: store i16 %valB, ptr %out, align 2
 ;
 ; AVX2-LABEL: 'test'
@@ -38,7 +38,7 @@ define void @test(ptr %C) {
 ; AVX2:  LV: Found an estimated cost of 2 for VF 2 For instruction: store i16 %valB, ptr %out, align 2
 ; AVX2:  LV: Found an estimated cost of 4 for VF 4 For instruction: store i16 %valB, ptr %out, align 2
 ; AVX2:  LV: Found an estimated cost of 8 for VF 8 For instruction: store i16 %valB, ptr %out, align 2
-; AVX2:  LV: Found an estimated cost of 16.50 (66/4) for VF 16 For instruction: store i16 %valB, ptr %out, align 2
+; AVX2:  LV: Found an estimated cost of 16.5 for VF 16 For instruction: store i16 %valB, ptr %out, align 2
 ; AVX2:  LV: Found an estimated cost of 33 for VF 32 For instruction: store i16 %valB, ptr %out, align 2
 ;
 ; AVX512-LABEL: 'test'
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i32.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i32.ll
index fd53de666b9ae..5814821d7db92 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i32.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i32.ll
@@ -18,8 +18,8 @@ define void @test(ptr %C) {
 ; SSE2-LABEL: 'test'
 ; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %valB, ptr %out, align 4
 ; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %valB, ptr %out, align 4
-; SSE2:  LV: Found an estimated cost of 2.50 (10/4) for VF 2 For instruction: store i32 %valB, ptr %out, align 4
-; SSE2:  LV: Found an estimated cost of 5.50 (22/4) for VF 4 For instruction: store i32 %valB, ptr %out, align 4
+; SSE2:  LV: Found an estimated cost of 2.5 for VF 2 For instruction: store i32 %valB, ptr %out, align 4
+; SSE2:  LV: Found an estimated cost of 5.5 for VF 4 For instruction: store i32 %valB, ptr %out, align 4
 ; SSE2:  LV: Found an estimated cost of 11 for VF 8 For instruction: store i32 %valB, ptr %out, align 4
 ; SSE2:  LV: Found an estimated cost of 22 for VF 16 For instruction: store i32 %valB, ptr %out, align 4
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i64.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i64.ll
index d458103f8548c..917a37293c866 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i64.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i64.ll
@@ -18,7 +18,7 @@ define void @test(ptr %C) {
 ; SSE2-LABEL: 'test'
 ; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %valB, ptr %out, align 8
 ; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %valB, ptr %out, align 8
-; SSE2:  LV: Found an estimated cost of 2.50 (10/4) for VF 2 For instruction: store i64 %valB, ptr %out, align 8
+; SSE2:  LV: Found an estimated cost of 2.5 for VF 2 For instruction: store i64 %valB, ptr %out, align 8
 ; SSE2:  LV: Found an estimated cost of 5 for VF 4 For instruction: store i64 %valB, ptr %out, align 8
 ; SSE2:  LV: Found an estimated cost of 10 for VF 8 For instruction: store i64 %valB, ptr %out, align 8
 ; SSE2:  LV: Found an estimated cost of 20 for VF 16 For instruction: store i64 %valB, ptr %out, align 8
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i8.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i8.ll
index e8e16e88b6622..26bae6a8ea3cf 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i8.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i8.ll
@@ -18,10 +18,10 @@ define void @test(ptr %C) {
 ; SSE2-LABEL: 'test'
 ; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %valB, ptr %out, align 1
 ; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %valB, ptr %out, align 1
-; SSE2:  LV: Found an estimated cost of 2.50 (10/4) for VF 2 For instruction: store i8 %valB, ptr %out, align 1
-; SSE2:  LV: Found an estimated cost of 5.50 (22/4) for VF 4 For instruction: store i8 %valB, ptr %out, align 1
-; SSE2:  LV: Found an estimated cost of 11.50 (46/4) for VF 8 For instruction: store i8 %valB, ptr %out, align 1
-; SSE2:  LV: Found an estimated cost of 23.50 (94/4) for VF 16 For instruction: store i8 %valB, ptr %out, align 1
+; SSE2:  LV: Found an estimated cost of 2.5 for VF 2 For instruction: store i8 %valB, ptr %out, align 1
+; SSE2:  LV: Found an estimated cost of 5.5 for VF 4 For instruction: store i8 %valB, ptr %out, align 1
+; SSE2:  LV: Found an estimated cost of 11.5 for VF 8 For instruction: store i8 %valB, ptr %out, align 1
+; SSE2:  LV: Found an estimated cost of 23.5 for VF 16 For instruction: store i8 %valB, ptr %out, align 1
 ;
 ; SSE42-LABEL: 'test'
 ; SSE42:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %valB, ptr %out, align 1
@@ -38,7 +38,7 @@ define void @test(ptr %C) {
 ; AVX1:  LV: Found an estimated cost of 4 for VF 4 For instruction: store i8 %valB, ptr %out, align 1
 ; AVX1:  LV: Found an estimated cost of 8 for VF 8 For instruction: store i8 %valB, ptr %out, align 1
 ; AVX1:  LV: Found an estimated cost of 16 for VF 16 For instruction: store i8 %valB, ptr %out, align 1
-; AVX1:  LV: Found an estimated cost of 32.50 (130/4) for VF 32 For instruction: store i8 %valB, ptr %out, align 1
+; AVX1:  LV: Found an estimated cost of 32.5 for VF 32 For instruction: store i8 %valB, ptr %out, align 1
 ;
 ; AVX2-LABEL: 'test'
 ; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %valB, ptr %out, align 1
@@ -47,7 +47,7 @@ define void @test(ptr %C) {
 ; AVX2:  LV: Found an estimated cost of 4 for VF 4 For instruction: store i8 %valB, ptr %out, align 1
 ; AVX2:  LV: Found an estimated cost of 8 for VF 8 For instruction: store i8 %valB, ptr %out, align 1
 ; AVX2:  LV: Found an estimated cost of 16 for VF 16 For instruction: store i8 %valB, ptr %out, align 1
-; AVX2:  LV: Found an estimated cost of 32.50 (130/4) for VF 32 For instruction: store i8 %valB, ptr %out, align 1
+; AVX2:  LV: Found an estimated cost of 32.5 for VF 32 For instruction: store i8 %valB, ptr %out, align 1
 ;
 ; AVX512-LABEL: 'test'
 ; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %valB, ptr %out, align 1



More information about the llvm-commits mailing list