[llvm] 2a859b2 - [AArch64] Change the cost of vector insert/extract to 2

Fri Jul 28 13:26:57 PDT 2023

Author: David Green
Date: 2023-07-28T21:26:50+01:00
New Revision: 2a859b20146108af84c741a509dc0e534e045768

URL: https://github.com/llvm/llvm-project/commit/2a859b20146108af84c741a509dc0e534e045768
DIFF: https://github.com/llvm/llvm-project/commit/2a859b20146108af84c741a509dc0e534e045768.diff

LOG: [AArch64] Change the cost of vector insert/extract to 2

The cost of vector instructions has always been high under AArch64, in order to
add a high cost for inserts/extracts, shuffles and scalarization. This is a
conservative approach to limit the scope of unusual SLP vectorization where the
codegen ends up being quite poor, but has always been higher than the correct
costs would be for any specific core.

This relaxes that, reducing the vector insert/extract cost from 3 to 2. It is a
generalization of D142359 to all AArch64 cpus. The ScalarizationOverhead is
also overridden for integer vector at the same time, to remove the effect of
lane 0 being considered free for integer vectors (something that should only be
true for float when scalarizing).

The lower insert/extract cost will reduce the cost of insert, extracts,
shuffling and scalarization. The adjustments of ScalaizationOverhead will
increase the cost on integer, especially for small vectors. The end result will
be lower cost for float and long-integer types, some higher cost for some
smaller vectors. This, along with the raw insert/extract cost being lower, will
generally mean more vectorization from the Loop and SLP vectorizer.

We may end up regretting this, as that vectorization is not always profitable.
In all the benchmarking I have done this is generally an improvement in the
overall performance, and I've attempted to address the places where it wasn't
with other costmodel adjustments.

Differential Revision: https://reviews.llvm.org/D155459

Added: 
    

Modified: 
    llvm/lib/Target/AArch64/AArch64Subtarget.h
    llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
    llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
    llvm/test/Analysis/CostModel/AArch64/arith-fp.ll
    llvm/test/Analysis/CostModel/AArch64/arith-overflow.ll
    llvm/test/Analysis/CostModel/AArch64/bswap.ll
    llvm/test/Analysis/CostModel/AArch64/cast.ll
    llvm/test/Analysis/CostModel/AArch64/cmp.ll
    llvm/test/Analysis/CostModel/AArch64/ctlz.ll
    llvm/test/Analysis/CostModel/AArch64/cttz.ll
    llvm/test/Analysis/CostModel/AArch64/div.ll
    llvm/test/Analysis/CostModel/AArch64/fptoi_sat.ll
    llvm/test/Analysis/CostModel/AArch64/free-widening-casts.ll
    llvm/test/Analysis/CostModel/AArch64/fshl.ll
    llvm/test/Analysis/CostModel/AArch64/fshr.ll
    llvm/test/Analysis/CostModel/AArch64/getIntrinsicInstrCost-vector-reverse.ll
    llvm/test/Analysis/CostModel/AArch64/insert-extract.ll
    llvm/test/Analysis/CostModel/AArch64/masked_ldst.ll
    llvm/test/Analysis/CostModel/AArch64/mem-op-cost-model.ll
    llvm/test/Analysis/CostModel/AArch64/min-max.ll
    llvm/test/Analysis/CostModel/AArch64/reduce-fadd.ll
    llvm/test/Analysis/CostModel/AArch64/reduce-minmax.ll
    llvm/test/Analysis/CostModel/AArch64/rem.ll
    llvm/test/Analysis/CostModel/AArch64/shuffle-load.ll
    llvm/test/Analysis/CostModel/AArch64/shuffle-other.ll
    llvm/test/Analysis/CostModel/AArch64/shuffle-select.ll
    llvm/test/Analysis/CostModel/AArch64/sve-insert-extract.ll
    llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
    llvm/test/Analysis/CostModel/AArch64/vector-select.ll
    llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll
    llvm/test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll
    llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll
    llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll
    llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll
    llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd-cost.ll
    llvm/test/Transforms/LoopVectorize/AArch64/unsafe-vf-hint-remark.ll
    llvm/test/Transforms/LowerMatrixIntrinsics/dot-product-float.ll
    llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll
    llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll
    llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll
    llvm/test/Transforms/SLPVectorizer/AArch64/landing_pad.ll
    llvm/test/Transforms/SLPVectorizer/AArch64/matmul.ll
    llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll
    llvm/test/Transforms/SLPVectorizer/AArch64/multiple_reduction.ll
    llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll
    llvm/test/Transforms/VectorCombine/AArch64/load-extractelement-scalarization.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h
index 9ab86684856eca..5086155f526462 100644

--- a/llvm/lib/Target/AArch64/AArch64Subtarget.h
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h
@@ -103,7 +103,7 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo {
 #include "AArch64GenSubtargetInfo.inc"
 
   uint8_t MaxInterleaveFactor = 2;
-  uint8_t VectorInsertExtractBaseCost = 3;
+  uint8_t VectorInsertExtractBaseCost = 2;
   uint16_t CacheLineSize = 0;
   uint16_t PrefetchDistance = 0;
   uint16_t MinPrefetchStride = 1;

diff  --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 353e96856b8f36..73dac451dbfd15 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -2560,6 +2560,18 @@ InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I,
   return getVectorInstrCostHelper(&I, Val, Index, true /* HasRealUse */);
 }
 
+InstructionCost AArch64TTIImpl::getScalarizationOverhead(
+    VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
+    TTI::TargetCostKind CostKind) {
+  if (isa<ScalableVectorType>(Ty))
+    return InstructionCost::getInvalid();
+  if (Ty->getElementType()->isFloatingPointTy())
+    return BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
+                                           CostKind);
+  return DemandedElts.popcount() * (Insert + Extract) *
+         ST->getVectorInsertExtractBaseCost();
+}
+
 InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
     unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
     TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,

diff  --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 787cb3c5d34b59..88e9bf70200399 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -385,6 +385,11 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
                                  VectorType *SubTp,
                                  ArrayRef<const Value *> Args = std::nullopt);
 
+  InstructionCost getScalarizationOverhead(VectorType *Ty,
+                                           const APInt &DemandedElts,
+                                           bool Insert, bool Extract,
+                                           TTI::TargetCostKind CostKind);
+
   /// Return the cost of the scaling factor used in the addressing
   /// mode represented by AM for this target, for a load/store
   /// of the specified type.

diff  --git a/llvm/test/Analysis/CostModel/AArch64/arith-fp.ll b/llvm/test/Analysis/CostModel/AArch64/arith-fp.ll
index dfbd2d8b39d003..c352892354fc24 100644
--- a/llvm/test/Analysis/CostModel/AArch64/arith-fp.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/arith-fp.ll
@@ -198,16 +198,16 @@ define i32 @fdiv(i32 %arg) {
 define i32 @frem(i32 %arg) {
 ; CHECK-LABEL: 'frem'
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F16 = frem half undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V4F16 = frem <4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %V8F16 = frem <8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 116 for instruction: %V16F16 = frem <16 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4F16 = frem <4 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V8F16 = frem <8 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V16F16 = frem <16 x half> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F32 = frem float undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2F32 = frem <2 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V4F32 = frem <4 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V8F32 = frem <8 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2F32 = frem <2 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4F32 = frem <4 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8F32 = frem <8 x float> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F64 = frem double undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2F64 = frem <2 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4F64 = frem <4 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2F64 = frem <2 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4F64 = frem <4 x double> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %F16 = frem half undef, undef

diff  --git a/llvm/test/Analysis/CostModel/AArch64/arith-overflow.ll b/llvm/test/Analysis/CostModel/AArch64/arith-overflow.ll
index f88fe270a74102..91379b1f9f8747 100644
--- a/llvm/test/Analysis/CostModel/AArch64/arith-overflow.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/arith-overflow.ll
@@ -355,9 +355,9 @@ declare {<64 x i8>, <64 x i1>}  @llvm.smul.with.overflow.v64i8(<64 x i8>, <64 x
 define i32 @smul(i32 %arg) {
 ; RECIP-LABEL: 'smul'
 ; RECIP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
-; RECIP-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; RECIP-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; RECIP-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; RECIP-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; RECIP-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; RECIP-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 undef, i32 undef)
 ; RECIP-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; RECIP-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
@@ -437,9 +437,9 @@ declare {<64 x i8>, <64 x i1>}  @llvm.umul.with.overflow.v64i8(<64 x i8>, <64 x
 define i32 @umul(i32 %arg) {
 ; RECIP-LABEL: 'umul'
 ; RECIP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
-; RECIP-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; RECIP-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; RECIP-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; RECIP-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; RECIP-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; RECIP-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
 ; RECIP-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; RECIP-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)

diff  --git a/llvm/test/Analysis/CostModel/AArch64/bswap.ll b/llvm/test/Analysis/CostModel/AArch64/bswap.ll
index e5a1827e75a14d..6cb2975a12b82a 100644
--- a/llvm/test/Analysis/CostModel/AArch64/bswap.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/bswap.ll
@@ -44,7 +44,7 @@ define void @neon() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64 = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i64 = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v3i32 = call <3 x i32> @llvm.bswap.v3i32(<3 x i32> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v4i48 = call <4 x i48> @llvm.bswap.v4i48(<4 x i48> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v4i48 = call <4 x i48> @llvm.bswap.v4i48(<4 x i48> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %v4i16 = call <4 x i16> @llvm.bswap.v4i16(<4 x i16> undef)

diff  --git a/llvm/test/Analysis/CostModel/AArch64/cast.ll b/llvm/test/Analysis/CostModel/AArch64/cast.ll
index 870ee4bdb39502..5dd37e83f4796f 100644
--- a/llvm/test/Analysis/CostModel/AArch64/cast.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/cast.ll
@@ -947,8 +947,8 @@ define i32 @casts_no_users() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r107 = fptosi <2 x double> undef to <2 x i32>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r108 = fptoui <2 x double> undef to <2 x i64>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r109 = fptosi <2 x double> undef to <2 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %r110 = fptoui <4 x float> undef to <4 x i1>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %r111 = fptosi <4 x float> undef to <4 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r110 = fptoui <4 x float> undef to <4 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r111 = fptosi <4 x float> undef to <4 x i1>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r112 = fptoui <4 x float> undef to <4 x i8>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r113 = fptosi <4 x float> undef to <4 x i8>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r114 = fptoui <4 x float> undef to <4 x i16>
@@ -967,8 +967,8 @@ define i32 @casts_no_users() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %r127 = fptosi <4 x double> undef to <4 x i32>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r128 = fptoui <4 x double> undef to <4 x i64>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r129 = fptosi <4 x double> undef to <4 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %r130 = fptoui <8 x float> undef to <8 x i1>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %r131 = fptosi <8 x float> undef to <8 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %r130 = fptoui <8 x float> undef to <8 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %r131 = fptosi <8 x float> undef to <8 x i1>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %r132 = fptoui <8 x float> undef to <8 x i8>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %r133 = fptosi <8 x float> undef to <8 x i8>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %r134 = fptoui <8 x float> undef to <8 x i16>
@@ -987,8 +987,8 @@ define i32 @casts_no_users() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r147 = fptosi <8 x double> undef to <8 x i32>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r148 = fptoui <8 x double> undef to <8 x i64>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r149 = fptosi <8 x double> undef to <8 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 115 for instruction: %r150 = fptoui <16 x float> undef to <16 x i1>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 115 for instruction: %r151 = fptosi <16 x float> undef to <16 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 83 for instruction: %r150 = fptoui <16 x float> undef to <16 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 83 for instruction: %r151 = fptosi <16 x float> undef to <16 x i1>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %r152 = fptoui <16 x float> undef to <16 x i8>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %r153 = fptosi <16 x float> undef to <16 x i8>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r154 = fptoui <16 x float> undef to <16 x i16>
@@ -1373,8 +1373,8 @@ define i32 @casts_no_users() {
 ; FIXED-MIN-256-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r107 = fptosi <2 x double> undef to <2 x i32>
 ; FIXED-MIN-256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r108 = fptoui <2 x double> undef to <2 x i64>
 ; FIXED-MIN-256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r109 = fptosi <2 x double> undef to <2 x i64>
-; FIXED-MIN-256-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %r110 = fptoui <4 x float> undef to <4 x i1>
-; FIXED-MIN-256-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %r111 = fptosi <4 x float> undef to <4 x i1>
+; FIXED-MIN-256-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r110 = fptoui <4 x float> undef to <4 x i1>
+; FIXED-MIN-256-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r111 = fptosi <4 x float> undef to <4 x i1>
 ; FIXED-MIN-256-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r112 = fptoui <4 x float> undef to <4 x i8>
 ; FIXED-MIN-256-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r113 = fptosi <4 x float> undef to <4 x i8>
 ; FIXED-MIN-256-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r114 = fptoui <4 x float> undef to <4 x i16>
@@ -1586,8 +1586,8 @@ define i32 @casts_no_users() {
 ; FIXED-MIN-2048-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r107 = fptosi <2 x double> undef to <2 x i32>
 ; FIXED-MIN-2048-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r108 = fptoui <2 x double> undef to <2 x i64>
 ; FIXED-MIN-2048-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r109 = fptosi <2 x double> undef to <2 x i64>
-; FIXED-MIN-2048-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %r110 = fptoui <4 x float> undef to <4 x i1>
-; FIXED-MIN-2048-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %r111 = fptosi <4 x float> undef to <4 x i1>
+; FIXED-MIN-2048-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r110 = fptoui <4 x float> undef to <4 x i1>
+; FIXED-MIN-2048-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r111 = fptosi <4 x float> undef to <4 x i1>
 ; FIXED-MIN-2048-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r112 = fptoui <4 x float> undef to <4 x i8>
 ; FIXED-MIN-2048-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r113 = fptosi <4 x float> undef to <4 x i8>
 ; FIXED-MIN-2048-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r114 = fptoui <4 x float> undef to <4 x i16>
@@ -3214,9 +3214,9 @@ define i32 @store_truncs() {
 
 define void @extend_extract() {
 ; CHECK-LABEL: 'extend_extract'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %e8 = extractelement <8 x i8> undef, i32 1
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %e16 = extractelement <8 x i16> undef, i32 1
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %e32 = extractelement <8 x i32> undef, i32 1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %e8 = extractelement <8 x i8> undef, i32 1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %e16 = extractelement <8 x i16> undef, i32 1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %e32 = extractelement <8 x i32> undef, i32 1
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s8_16 = sext i8 %e8 to i16
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %z8_16 = zext i8 %e8 to i16
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s8_32 = sext i8 %e8 to i32
@@ -3233,9 +3233,9 @@ define void @extend_extract() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SVE-LABEL: 'extend_extract'
-; SVE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %e8 = extractelement <8 x i8> undef, i32 1
-; SVE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %e16 = extractelement <8 x i16> undef, i32 1
-; SVE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %e32 = extractelement <8 x i32> undef, i32 1
+; SVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %e8 = extractelement <8 x i8> undef, i32 1
+; SVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %e16 = extractelement <8 x i16> undef, i32 1
+; SVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %e32 = extractelement <8 x i32> undef, i32 1
 ; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s8_16 = sext i8 %e8 to i16
 ; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %z8_16 = zext i8 %e8 to i16
 ; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s8_32 = sext i8 %e8 to i32
@@ -3292,38 +3292,38 @@ define void @fp16cast() {
 ; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r95 = fptosi <2 x half> undef to <2 x i16>
 ; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r96 = fptoui <2 x half> undef to <2 x i32>
 ; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r97 = fptosi <2 x half> undef to <2 x i32>
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r98 = fptoui <2 x half> undef to <2 x i64>
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r99 = fptosi <2 x half> undef to <2 x i64>
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r98 = fptoui <2 x half> undef to <2 x i64>
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r99 = fptosi <2 x half> undef to <2 x i64>
 ; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r110 = fptoui <4 x half> undef to <4 x i1>
 ; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r111 = fptosi <4 x half> undef to <4 x i1>
 ; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r112 = fptoui <4 x half> undef to <4 x i8>
 ; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r113 = fptosi <4 x half> undef to <4 x i8>
 ; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r114 = fptoui <4 x half> undef to <4 x i16>
 ; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r115 = fptosi <4 x half> undef to <4 x i16>
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %r116 = fptoui <4 x half> undef to <4 x i32>
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %r117 = fptosi <4 x half> undef to <4 x i32>
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %r118 = fptoui <4 x half> undef to <4 x i64>
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %r119 = fptosi <4 x half> undef to <4 x i64>
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %r130 = fptoui <8 x half> undef to <8 x i1>
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %r131 = fptosi <8 x half> undef to <8 x i1>
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %r132 = fptoui <8 x half> undef to <8 x i8>
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %r133 = fptosi <8 x half> undef to <8 x i8>
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r116 = fptoui <4 x half> undef to <4 x i32>
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r117 = fptosi <4 x half> undef to <4 x i32>
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %r118 = fptoui <4 x half> undef to <4 x i64>
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %r119 = fptosi <4 x half> undef to <4 x i64>
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r130 = fptoui <8 x half> undef to <8 x i1>
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r131 = fptosi <8 x half> undef to <8 x i1>
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r132 = fptoui <8 x half> undef to <8 x i8>
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r133 = fptosi <8 x half> undef to <8 x i8>
 ; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r134 = fptoui <8 x half> undef to <8 x i16>
 ; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r135 = fptosi <8 x half> undef to <8 x i16>
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %r136 = fptoui <8 x half> undef to <8 x i32>
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %r137 = fptosi <8 x half> undef to <8 x i32>
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %r138 = fptoui <8 x half> undef to <8 x i64>
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %r139 = fptosi <8 x half> undef to <8 x i64>
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 129 for instruction: %r150 = fptoui <16 x half> undef to <16 x i1>
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 129 for instruction: %r151 = fptosi <16 x half> undef to <16 x i1>
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 101 for instruction: %r152 = fptoui <16 x half> undef to <16 x i8>
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 101 for instruction: %r153 = fptosi <16 x half> undef to <16 x i8>
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %r136 = fptoui <8 x half> undef to <8 x i32>
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %r137 = fptosi <8 x half> undef to <8 x i32>
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %r138 = fptoui <8 x half> undef to <8 x i64>
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %r139 = fptosi <8 x half> undef to <8 x i64>
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: %r150 = fptoui <16 x half> undef to <16 x i1>
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: %r151 = fptosi <16 x half> undef to <16 x i1>
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: %r152 = fptoui <16 x half> undef to <16 x i8>
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: %r153 = fptosi <16 x half> undef to <16 x i8>
 ; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r154 = fptoui <16 x half> undef to <16 x i16>
 ; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r155 = fptosi <16 x half> undef to <16 x i16>
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 90 for instruction: %r156 = fptoui <16 x half> undef to <16 x i32>
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 90 for instruction: %r157 = fptosi <16 x half> undef to <16 x i32>
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %r158 = fptoui <16 x half> undef to <16 x i64>
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %r159 = fptosi <16 x half> undef to <16 x i64>
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %r156 = fptoui <16 x half> undef to <16 x i32>
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %r157 = fptosi <16 x half> undef to <16 x i32>
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 86 for instruction: %r158 = fptoui <16 x half> undef to <16 x i64>
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 86 for instruction: %r159 = fptosi <16 x half> undef to <16 x i64>
 ; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r250 = uitofp <8 x i1> undef to <8 x half>
 ; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r251 = sitofp <8 x i1> undef to <8 x half>
 ; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r252 = uitofp <8 x i8> undef to <8 x half>
@@ -3438,8 +3438,8 @@ define void @fp16cast() {
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r95 = fptosi <2 x half> undef to <2 x i16>
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r96 = fptoui <2 x half> undef to <2 x i32>
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r97 = fptosi <2 x half> undef to <2 x i32>
-; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r98 = fptoui <2 x half> undef to <2 x i64>
-; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r99 = fptosi <2 x half> undef to <2 x i64>
+; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r98 = fptoui <2 x half> undef to <2 x i64>
+; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r99 = fptosi <2 x half> undef to <2 x i64>
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r110 = fptoui <4 x half> undef to <4 x i1>
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r111 = fptosi <4 x half> undef to <4 x i1>
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r112 = fptoui <4 x half> undef to <4 x i8>
@@ -3448,28 +3448,28 @@ define void @fp16cast() {
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r115 = fptosi <4 x half> undef to <4 x i16>
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r116 = fptoui <4 x half> undef to <4 x i32>
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r117 = fptosi <4 x half> undef to <4 x i32>
-; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %r118 = fptoui <4 x half> undef to <4 x i64>
-; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %r119 = fptosi <4 x half> undef to <4 x i64>
-; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %r130 = fptoui <8 x half> undef to <8 x i1>
-; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %r131 = fptosi <8 x half> undef to <8 x i1>
+; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %r118 = fptoui <4 x half> undef to <4 x i64>
+; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %r119 = fptosi <4 x half> undef to <4 x i64>
+; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r130 = fptoui <8 x half> undef to <8 x i1>
+; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r131 = fptosi <8 x half> undef to <8 x i1>
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r132 = fptoui <8 x half> undef to <8 x i8>
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r133 = fptosi <8 x half> undef to <8 x i8>
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r134 = fptoui <8 x half> undef to <8 x i16>
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r135 = fptosi <8 x half> undef to <8 x i16>
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r136 = fptoui <8 x half> undef to <8 x i32>
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r137 = fptosi <8 x half> undef to <8 x i32>
-; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %r138 = fptoui <8 x half> undef to <8 x i64>
-; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %r139 = fptosi <8 x half> undef to <8 x i64>
-; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 129 for instruction: %r150 = fptoui <16 x half> undef to <16 x i1>
-; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 129 for instruction: %r151 = fptosi <16 x half> undef to <16 x i1>
+; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %r138 = fptoui <8 x half> undef to <8 x i64>
+; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %r139 = fptosi <8 x half> undef to <8 x i64>
+; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: %r150 = fptoui <16 x half> undef to <16 x i1>
+; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: %r151 = fptosi <16 x half> undef to <16 x i1>
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r152 = fptoui <16 x half> undef to <16 x i8>
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r153 = fptosi <16 x half> undef to <16 x i8>
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r154 = fptoui <16 x half> undef to <16 x i16>
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r155 = fptosi <16 x half> undef to <16 x i16>
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r156 = fptoui <16 x half> undef to <16 x i32>
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r157 = fptosi <16 x half> undef to <16 x i32>
-; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %r158 = fptoui <16 x half> undef to <16 x i64>
-; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %r159 = fptosi <16 x half> undef to <16 x i64>
+; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 86 for instruction: %r158 = fptoui <16 x half> undef to <16 x i64>
+; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 86 for instruction: %r159 = fptosi <16 x half> undef to <16 x i64>
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r250 = uitofp <8 x i1> undef to <8 x half>
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r251 = sitofp <8 x i1> undef to <8 x half>
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r252 = uitofp <8 x i8> undef to <8 x half>
@@ -3511,8 +3511,8 @@ define void @fp16cast() {
 ; FIXED-MIN-256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r95 = fptosi <2 x half> undef to <2 x i16>
 ; FIXED-MIN-256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r96 = fptoui <2 x half> undef to <2 x i32>
 ; FIXED-MIN-256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r97 = fptosi <2 x half> undef to <2 x i32>
-; FIXED-MIN-256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r98 = fptoui <2 x half> undef to <2 x i64>
-; FIXED-MIN-256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r99 = fptosi <2 x half> undef to <2 x i64>
+; FIXED-MIN-256-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r98 = fptoui <2 x half> undef to <2 x i64>
+; FIXED-MIN-256-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r99 = fptosi <2 x half> undef to <2 x i64>
 ; FIXED-MIN-256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r110 = fptoui <4 x half> undef to <4 x i1>
 ; FIXED-MIN-256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r111 = fptosi <4 x half> undef to <4 x i1>
 ; FIXED-MIN-256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r112 = fptoui <4 x half> undef to <4 x i8>
@@ -3523,8 +3523,8 @@ define void @fp16cast() {
 ; FIXED-MIN-256-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r117 = fptosi <4 x half> undef to <4 x i32>
 ; FIXED-MIN-256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r118 = fptoui <4 x half> undef to <4 x i64>
 ; FIXED-MIN-256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r119 = fptosi <4 x half> undef to <4 x i64>
-; FIXED-MIN-256-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %r130 = fptoui <8 x half> undef to <8 x i1>
-; FIXED-MIN-256-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %r131 = fptosi <8 x half> undef to <8 x i1>
+; FIXED-MIN-256-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r130 = fptoui <8 x half> undef to <8 x i1>
+; FIXED-MIN-256-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r131 = fptosi <8 x half> undef to <8 x i1>
 ; FIXED-MIN-256-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r132 = fptoui <8 x half> undef to <8 x i8>
 ; FIXED-MIN-256-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r133 = fptosi <8 x half> undef to <8 x i8>
 ; FIXED-MIN-256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r134 = fptoui <8 x half> undef to <8 x i16>
@@ -3584,8 +3584,8 @@ define void @fp16cast() {
 ; FIXED-MIN-2048-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r95 = fptosi <2 x half> undef to <2 x i16>
 ; FIXED-MIN-2048-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r96 = fptoui <2 x half> undef to <2 x i32>
 ; FIXED-MIN-2048-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r97 = fptosi <2 x half> undef to <2 x i32>
-; FIXED-MIN-2048-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r98 = fptoui <2 x half> undef to <2 x i64>
-; FIXED-MIN-2048-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r99 = fptosi <2 x half> undef to <2 x i64>
+; FIXED-MIN-2048-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r98 = fptoui <2 x half> undef to <2 x i64>
+; FIXED-MIN-2048-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r99 = fptosi <2 x half> undef to <2 x i64>
 ; FIXED-MIN-2048-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r110 = fptoui <4 x half> undef to <4 x i1>
 ; FIXED-MIN-2048-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r111 = fptosi <4 x half> undef to <4 x i1>
 ; FIXED-MIN-2048-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r112 = fptoui <4 x half> undef to <4 x i8>
@@ -3596,8 +3596,8 @@ define void @fp16cast() {
 ; FIXED-MIN-2048-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r117 = fptosi <4 x half> undef to <4 x i32>
 ; FIXED-MIN-2048-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r118 = fptoui <4 x half> undef to <4 x i64>
 ; FIXED-MIN-2048-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r119 = fptosi <4 x half> undef to <4 x i64>
-; FIXED-MIN-2048-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %r130 = fptoui <8 x half> undef to <8 x i1>
-; FIXED-MIN-2048-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %r131 = fptosi <8 x half> undef to <8 x i1>
+; FIXED-MIN-2048-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r130 = fptoui <8 x half> undef to <8 x i1>
+; FIXED-MIN-2048-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r131 = fptosi <8 x half> undef to <8 x i1>
 ; FIXED-MIN-2048-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r132 = fptoui <8 x half> undef to <8 x i8>
 ; FIXED-MIN-2048-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r133 = fptosi <8 x half> undef to <8 x i8>
 ; FIXED-MIN-2048-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r134 = fptoui <8 x half> undef to <8 x i16>

diff  --git a/llvm/test/Analysis/CostModel/AArch64/cmp.ll b/llvm/test/Analysis/CostModel/AArch64/cmp.ll
index d595f5e9704169..4aed33f42d469e 100644
--- a/llvm/test/Analysis/CostModel/AArch64/cmp.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/cmp.ll
@@ -17,7 +17,7 @@ define void @cmps() {
 ; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cf16 = fcmp oge half undef, undef
 ; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cf32 = fcmp ogt float undef, undef
 ; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cf64 = fcmp ogt double undef, undef
-; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %cfv816 = fcmp olt <8 x half> undef, undef
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %cfv816 = fcmp olt <8 x half> undef, undef
 ; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cfv432 = fcmp oge <4 x float> undef, undef
 ; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cfv264 = fcmp oge <2 x double> undef, undef
 ; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void

diff  --git a/llvm/test/Analysis/CostModel/AArch64/ctlz.ll b/llvm/test/Analysis/CostModel/AArch64/ctlz.ll
index 10373d18a7a377..913a81b148c603 100644
--- a/llvm/test/Analysis/CostModel/AArch64/ctlz.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/ctlz.ll
@@ -55,7 +55,7 @@ declare i8 @llvm.ctlz.i8(i8)
 define <2 x i64> @test_ctlz_v2i64(<2 x i64> %a) {
 ;
 ; CHECK-LABEL: 'test_ctlz_v2i64'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %ctlz = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %a, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %ctlz = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %a, i1 false)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %ctlz
 ;
   %ctlz = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %a, i1 false)
@@ -154,7 +154,7 @@ define <16 x i8> @test_ctlz_v16i8(<16 x i8> %a) {
 
 define <4 x i64> @test_ctlz_v4i64(<4 x i64> %a) {
 ; CHECK-LABEL: 'test_ctlz_v4i64'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %ctlz = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %a, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %ctlz = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %a, i1 false)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %ctlz
 ;
   %ctlz = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %a, i1 false)

diff  --git a/llvm/test/Analysis/CostModel/AArch64/cttz.ll b/llvm/test/Analysis/CostModel/AArch64/cttz.ll
index e9c92302e5145c..5947e251d8a06c 100644
--- a/llvm/test/Analysis/CostModel/AArch64/cttz.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/cttz.ll
@@ -55,7 +55,7 @@ declare i8 @llvm.cttz.i8(i8)
 define <2 x i64> @test_cttz_v2i64(<2 x i64> %a) {
 ;
 ; CHECK-LABEL: 'test_cttz_v2i64'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %cttz = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %a, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %cttz = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %a, i1 true)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %cttz
 ;
   %cttz = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %a, i1 true)
@@ -65,7 +65,7 @@ define <2 x i64> @test_cttz_v2i64(<2 x i64> %a) {
 define <2 x i32> @test_cttz_v2i32(<2 x i32> %a) {
 ;
 ; CHECK-LABEL: 'test_cttz_v2i32'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %cttz = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %a, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %cttz = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %a, i1 true)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %cttz
 ;
   %cttz = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %a, i1 true)
@@ -75,7 +75,7 @@ define <2 x i32> @test_cttz_v2i32(<2 x i32> %a) {
 define <4 x i32> @test_cttz_v4i32(<4 x i32> %a) {
 ;
 ; CHECK-LABEL: 'test_cttz_v4i32'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %cttz = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %a, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %cttz = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %a, i1 true)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %cttz
 ;
   %cttz = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %a, i1 true)
@@ -85,7 +85,7 @@ define <4 x i32> @test_cttz_v4i32(<4 x i32> %a) {
 define <2 x i16> @test_cttz_v2i16(<2 x i16> %a) {
 ;
 ; CHECK-LABEL: 'test_cttz_v2i16'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %cttz = call <2 x i16> @llvm.cttz.v2i16(<2 x i16> %a, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %cttz = call <2 x i16> @llvm.cttz.v2i16(<2 x i16> %a, i1 true)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i16> %cttz
 ;
   %cttz = call <2 x i16> @llvm.cttz.v2i16(<2 x i16> %a, i1 true)
@@ -95,7 +95,7 @@ define <2 x i16> @test_cttz_v2i16(<2 x i16> %a) {
 define <4 x i16> @test_cttz_v4i16(<4 x i16> %a) {
 ;
 ; CHECK-LABEL: 'test_cttz_v4i16'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %cttz = call <4 x i16> @llvm.cttz.v4i16(<4 x i16> %a, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %cttz = call <4 x i16> @llvm.cttz.v4i16(<4 x i16> %a, i1 true)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i16> %cttz
 ;
   %cttz = call <4 x i16> @llvm.cttz.v4i16(<4 x i16> %a, i1 true)
@@ -105,7 +105,7 @@ define <4 x i16> @test_cttz_v4i16(<4 x i16> %a) {
 define <8 x i16> @test_cttz_v8i16(<8 x i16> %a) {
 ;
 ; CHECK-LABEL: 'test_cttz_v8i16'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %cttz = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %a, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %cttz = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %a, i1 true)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %cttz
 ;
   %cttz = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %a, i1 true)
@@ -115,7 +115,7 @@ define <8 x i16> @test_cttz_v8i16(<8 x i16> %a) {
 define <2 x i8> @test_cttz_v2i8(<2 x i8> %a) {
 ;
 ; CHECK-LABEL: 'test_cttz_v2i8'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %cttz = call <2 x i8> @llvm.cttz.v2i8(<2 x i8> %a, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %cttz = call <2 x i8> @llvm.cttz.v2i8(<2 x i8> %a, i1 true)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i8> %cttz
 ;
   %cttz = call <2 x i8> @llvm.cttz.v2i8(<2 x i8> %a, i1 true)
@@ -125,7 +125,7 @@ define <2 x i8> @test_cttz_v2i8(<2 x i8> %a) {
 define <4 x i8> @test_cttz_v4i8(<4 x i8> %a) {
 ;
 ; CHECK-LABEL: 'test_cttz_v4i8'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %cttz = call <4 x i8> @llvm.cttz.v4i8(<4 x i8> %a, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %cttz = call <4 x i8> @llvm.cttz.v4i8(<4 x i8> %a, i1 true)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i8> %cttz
 ;
   %cttz = call <4 x i8> @llvm.cttz.v4i8(<4 x i8> %a, i1 true)
@@ -135,7 +135,7 @@ define <4 x i8> @test_cttz_v4i8(<4 x i8> %a) {
 define <8 x i8> @test_cttz_v8i8(<8 x i8> %a) {
 ;
 ; CHECK-LABEL: 'test_cttz_v8i8'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %cttz = call <8 x i8> @llvm.cttz.v8i8(<8 x i8> %a, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %cttz = call <8 x i8> @llvm.cttz.v8i8(<8 x i8> %a, i1 true)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i8> %cttz
 ;
   %cttz = call <8 x i8> @llvm.cttz.v8i8(<8 x i8> %a, i1 true)
@@ -145,7 +145,7 @@ define <8 x i8> @test_cttz_v8i8(<8 x i8> %a) {
 define <16 x i8> @test_cttz_v16i8(<16 x i8> %a) {
 ;
 ; CHECK-LABEL: 'test_cttz_v16i8'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 106 for instruction: %cttz = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %a, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %cttz = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %a, i1 true)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %cttz
 ;
   %cttz = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %a, i1 true)
@@ -155,7 +155,7 @@ define <16 x i8> @test_cttz_v16i8(<16 x i8> %a) {
 define <4 x i64> @test_cttz_v4i64(<4 x i64> %a) {
 ;
 ; CHECK-LABEL: 'test_cttz_v4i64'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %cttz = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> %a, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %cttz = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> %a, i1 true)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %cttz
 ;
   %cttz = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> %a, i1 true)
@@ -165,7 +165,7 @@ define <4 x i64> @test_cttz_v4i64(<4 x i64> %a) {
 define <8 x i32> @test_cttz_v8i32(<8 x i32> %a) {
 ;
 ; CHECK-LABEL: 'test_cttz_v8i32'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %cttz = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> %a, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %cttz = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> %a, i1 true)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %cttz
 ;
   %cttz = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> %a, i1 true)
@@ -175,7 +175,7 @@ define <8 x i32> @test_cttz_v8i32(<8 x i32> %a) {
 define <16 x i16> @test_cttz_v16i16(<16 x i16> %a) {
 ;
 ; CHECK-LABEL: 'test_cttz_v16i16'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %cttz = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> %a, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %cttz = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> %a, i1 true)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %cttz
 ;
   %cttz = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> %a, i1 true)
@@ -185,7 +185,7 @@ define <16 x i16> @test_cttz_v16i16(<16 x i16> %a) {
 define <32 x i8> @test_cttz_v32i8(<32 x i8> %a) {
 ;
 ; CHECK-LABEL: 'test_cttz_v32i8'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 212 for instruction: %cttz = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> %a, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %cttz = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> %a, i1 true)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %cttz
 ;
   %cttz = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> %a, i1 true)

diff  --git a/llvm/test/Analysis/CostModel/AArch64/div.ll b/llvm/test/Analysis/CostModel/AArch64/div.ll
index 7486851657cee3..7b52e2dc18aa52 100644
--- a/llvm/test/Analysis/CostModel/AArch64/div.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/div.ll
@@ -6,21 +6,21 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 define i32 @sdiv() {
 ; CHECK-LABEL: 'sdiv'
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = sdiv i64 undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V2i64 = sdiv <2 x i64> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i64 = sdiv <4 x i64> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V8i64 = sdiv <8 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2i64 = sdiv <2 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V4i64 = sdiv <4 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V8i64 = sdiv <8 x i64> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = sdiv i32 undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V4i32 = sdiv <4 x i32> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V8i32 = sdiv <8 x i32> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 208 for instruction: %V16i32 = sdiv <16 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i32 = sdiv <4 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V8i32 = sdiv <8 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V16i32 = sdiv <16 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = sdiv i16 undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %V8i16 = sdiv <8 x i16> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 216 for instruction: %V16i16 = sdiv <16 x i16> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 432 for instruction: %V32i16 = sdiv <32 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8i16 = sdiv <8 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 176 for instruction: %V16i16 = sdiv <16 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 352 for instruction: %V32i16 = sdiv <32 x i16> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = sdiv i8 undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 220 for instruction: %V16i8 = sdiv <16 x i8> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 440 for instruction: %V32i8 = sdiv <32 x i8> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 880 for instruction: %V64i8 = sdiv <64 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 168 for instruction: %V16i8 = sdiv <16 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 336 for instruction: %V32i8 = sdiv <32 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 672 for instruction: %V64i8 = sdiv <64 x i8> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I64 = sdiv i64 undef, undef
@@ -49,21 +49,21 @@ define i32 @sdiv() {
 define i32 @udiv() {
 ; CHECK-LABEL: 'udiv'
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = udiv i64 undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V2i64 = udiv <2 x i64> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i64 = udiv <4 x i64> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V8i64 = udiv <8 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2i64 = udiv <2 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V4i64 = udiv <4 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V8i64 = udiv <8 x i64> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = udiv i32 undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V4i32 = udiv <4 x i32> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V8i32 = udiv <8 x i32> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 208 for instruction: %V16i32 = udiv <16 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i32 = udiv <4 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V8i32 = udiv <8 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V16i32 = udiv <16 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = udiv i16 undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %V8i16 = udiv <8 x i16> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 216 for instruction: %V16i16 = udiv <16 x i16> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 432 for instruction: %V32i16 = udiv <32 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8i16 = udiv <8 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 176 for instruction: %V16i16 = udiv <16 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 352 for instruction: %V32i16 = udiv <32 x i16> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = udiv i8 undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 220 for instruction: %V16i8 = udiv <16 x i8> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 440 for instruction: %V32i8 = udiv <32 x i8> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 880 for instruction: %V64i8 = udiv <64 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 168 for instruction: %V16i8 = udiv <16 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 336 for instruction: %V32i8 = udiv <32 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 672 for instruction: %V64i8 = udiv <64 x i8> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I64 = udiv i64 undef, undef
@@ -92,21 +92,21 @@ define i32 @udiv() {
 define i32 @sdiv_const() {
 ; CHECK-LABEL: 'sdiv_const'
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = sdiv i64 undef, 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V2i64 = sdiv <2 x i64> undef, <i64 6, i64 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i64 = sdiv <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V8i64 = sdiv <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2i64 = sdiv <2 x i64> undef, <i64 6, i64 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V4i64 = sdiv <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V8i64 = sdiv <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = sdiv i32 undef, 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V4i32 = sdiv <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V8i32 = sdiv <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 208 for instruction: %V16i32 = sdiv <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i32 = sdiv <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V8i32 = sdiv <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V16i32 = sdiv <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = sdiv i16 undef, 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %V8i16 = sdiv <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 216 for instruction: %V16i16 = sdiv <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 432 for instruction: %V32i16 = sdiv <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8i16 = sdiv <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 176 for instruction: %V16i16 = sdiv <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 352 for instruction: %V32i16 = sdiv <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = sdiv i8 undef, 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 220 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 440 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 880 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 168 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 336 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 672 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I64 = sdiv i64 undef, 7
@@ -135,21 +135,21 @@ define i32 @sdiv_const() {
 define i32 @udiv_const() {
 ; CHECK-LABEL: 'udiv_const'
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = udiv i64 undef, 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V2i64 = udiv <2 x i64> undef, <i64 6, i64 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i64 = udiv <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V8i64 = udiv <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2i64 = udiv <2 x i64> undef, <i64 6, i64 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V4i64 = udiv <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V8i64 = udiv <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = udiv i32 undef, 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V4i32 = udiv <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V8i32 = udiv <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 208 for instruction: %V16i32 = udiv <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i32 = udiv <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V8i32 = udiv <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V16i32 = udiv <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = udiv i16 undef, 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %V8i16 = udiv <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 216 for instruction: %V16i16 = udiv <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 432 for instruction: %V32i16 = udiv <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8i16 = udiv <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 176 for instruction: %V16i16 = udiv <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 352 for instruction: %V32i16 = udiv <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = udiv i8 undef, 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 220 for instruction: %V16i8 = udiv <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 440 for instruction: %V32i8 = udiv <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 880 for instruction: %V64i8 = udiv <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 168 for instruction: %V16i8 = udiv <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 336 for instruction: %V32i8 = udiv <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 672 for instruction: %V64i8 = udiv <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I64 = udiv i64 undef, 7
@@ -264,21 +264,21 @@ define i32 @udiv_uniformconst() {
 define i32 @sdiv_constpow2() {
 ; CHECK-LABEL: 'sdiv_constpow2'
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = sdiv i64 undef, 16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V2i64 = sdiv <2 x i64> undef, <i64 8, i64 16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i64 = sdiv <4 x i64> undef, <i64 2, i64 4, i64 8, i64 16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V8i64 = sdiv <8 x i64> undef, <i64 2, i64 4, i64 8, i64 16, i64 32, i64 64, i64 128, i64 256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2i64 = sdiv <2 x i64> undef, <i64 8, i64 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V4i64 = sdiv <4 x i64> undef, <i64 2, i64 4, i64 8, i64 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V8i64 = sdiv <8 x i64> undef, <i64 2, i64 4, i64 8, i64 16, i64 32, i64 64, i64 128, i64 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I32 = sdiv i32 undef, 16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V4i32 = sdiv <4 x i32> undef, <i32 2, i32 4, i32 8, i32 16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V8i32 = sdiv <8 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 208 for instruction: %V16i32 = sdiv <16 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256, i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i32 = sdiv <4 x i32> undef, <i32 2, i32 4, i32 8, i32 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V8i32 = sdiv <8 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V16i32 = sdiv <16 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256, i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = sdiv i16 undef, 16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %V8i16 = sdiv <8 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 216 for instruction: %V16i16 = sdiv <16 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 432 for instruction: %V32i16 = sdiv <32 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8i16 = sdiv <8 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 176 for instruction: %V16i16 = sdiv <16 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 352 for instruction: %V32i16 = sdiv <32 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = sdiv i8 undef, 16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 220 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 440 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 880 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 168 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 336 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 672 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I64 = sdiv i64 undef, 16
@@ -307,21 +307,21 @@ define i32 @sdiv_constpow2() {
 define i32 @udiv_constpow2() {
 ; CHECK-LABEL: 'udiv_constpow2'
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = udiv i64 undef, 16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V2i64 = udiv <2 x i64> undef, <i64 8, i64 16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i64 = udiv <4 x i64> undef, <i64 2, i64 4, i64 8, i64 16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V8i64 = udiv <8 x i64> undef, <i64 2, i64 4, i64 8, i64 16, i64 32, i64 64, i64 128, i64 256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2i64 = udiv <2 x i64> undef, <i64 8, i64 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V4i64 = udiv <4 x i64> undef, <i64 2, i64 4, i64 8, i64 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V8i64 = udiv <8 x i64> undef, <i64 2, i64 4, i64 8, i64 16, i64 32, i64 64, i64 128, i64 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = udiv i32 undef, 16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V4i32 = udiv <4 x i32> undef, <i32 2, i32 4, i32 8, i32 16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V8i32 = udiv <8 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 208 for instruction: %V16i32 = udiv <16 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256, i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i32 = udiv <4 x i32> undef, <i32 2, i32 4, i32 8, i32 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V8i32 = udiv <8 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V16i32 = udiv <16 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256, i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = udiv i16 undef, 16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %V8i16 = udiv <8 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 216 for instruction: %V16i16 = udiv <16 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 432 for instruction: %V32i16 = udiv <32 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8i16 = udiv <8 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 176 for instruction: %V16i16 = udiv <16 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 352 for instruction: %V32i16 = udiv <32 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = udiv i8 undef, 16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 220 for instruction: %V16i8 = udiv <16 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 440 for instruction: %V32i8 = udiv <32 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 880 for instruction: %V64i8 = udiv <64 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 168 for instruction: %V16i8 = udiv <16 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 336 for instruction: %V32i8 = udiv <32 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 672 for instruction: %V64i8 = udiv <64 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I64 = udiv i64 undef, 16
@@ -350,21 +350,21 @@ define i32 @udiv_constpow2() {
 define i32 @sdiv_uniformconstpow2() {
 ; CHECK-LABEL: 'sdiv_uniformconstpow2'
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = sdiv i64 undef, 16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2i64 = sdiv <2 x i64> undef, <i64 16, i64 16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V4i64 = sdiv <4 x i64> undef, <i64 16, i64 16, i64 16, i64 16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V8i64 = sdiv <8 x i64> undef, <i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V2i64 = sdiv <2 x i64> undef, <i64 16, i64 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V4i64 = sdiv <4 x i64> undef, <i64 16, i64 16, i64 16, i64 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V8i64 = sdiv <8 x i64> undef, <i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I32 = sdiv i32 undef, 16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V4i32 = sdiv <4 x i32> undef, <i32 16, i32 16, i32 16, i32 16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %V8i32 = sdiv <8 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 116 for instruction: %V16i32 = sdiv <16 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V4i32 = sdiv <4 x i32> undef, <i32 16, i32 16, i32 16, i32 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V8i32 = sdiv <8 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %V16i32 = sdiv <16 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = sdiv i16 undef, 16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V8i16 = sdiv <8 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 122 for instruction: %V16i16 = sdiv <16 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 244 for instruction: %V32i16 = sdiv <32 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V8i16 = sdiv <8 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %V16i16 = sdiv <16 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 204 for instruction: %V32i16 = sdiv <32 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = sdiv i8 undef, 16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 125 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 250 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 500 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 99 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 198 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 396 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I64 = sdiv i64 undef, 16
@@ -436,21 +436,21 @@ define i32 @udiv_uniformconstpow2() {
 define i32 @sdiv_constnegpow2() {
 ; CHECK-LABEL: 'sdiv_constnegpow2'
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = sdiv i64 undef, -16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V2i64 = sdiv <2 x i64> undef, <i64 -8, i64 -16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i64 = sdiv <4 x i64> undef, <i64 -2, i64 -4, i64 -8, i64 -16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V8i64 = sdiv <8 x i64> undef, <i64 -2, i64 -4, i64 -8, i64 -16, i64 -32, i64 -64, i64 -128, i64 -256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2i64 = sdiv <2 x i64> undef, <i64 -8, i64 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V4i64 = sdiv <4 x i64> undef, <i64 -2, i64 -4, i64 -8, i64 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V8i64 = sdiv <8 x i64> undef, <i64 -2, i64 -4, i64 -8, i64 -16, i64 -32, i64 -64, i64 -128, i64 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = sdiv i32 undef, -16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V4i32 = sdiv <4 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V8i32 = sdiv <8 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 208 for instruction: %V16i32 = sdiv <16 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256, i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i32 = sdiv <4 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V8i32 = sdiv <8 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V16i32 = sdiv <16 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256, i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = sdiv i16 undef, -16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %V8i16 = sdiv <8 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 216 for instruction: %V16i16 = sdiv <16 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 432 for instruction: %V32i16 = sdiv <32 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8i16 = sdiv <8 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 176 for instruction: %V16i16 = sdiv <16 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 352 for instruction: %V32i16 = sdiv <32 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = sdiv i8 undef, -16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 220 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 440 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 880 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 168 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 336 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 672 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I64 = sdiv i64 undef, -16
@@ -479,21 +479,21 @@ define i32 @sdiv_constnegpow2() {
 define i32 @udiv_constnegpow2() {
 ; CHECK-LABEL: 'udiv_constnegpow2'
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = udiv i64 undef, -16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V2i64 = udiv <2 x i64> undef, <i64 -8, i64 -16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i64 = udiv <4 x i64> undef, <i64 -2, i64 -4, i64 -8, i64 -16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V8i64 = udiv <8 x i64> undef, <i64 -2, i64 -4, i64 -8, i64 -16, i64 -32, i64 -64, i64 -128, i64 -256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2i64 = udiv <2 x i64> undef, <i64 -8, i64 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V4i64 = udiv <4 x i64> undef, <i64 -2, i64 -4, i64 -8, i64 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V8i64 = udiv <8 x i64> undef, <i64 -2, i64 -4, i64 -8, i64 -16, i64 -32, i64 -64, i64 -128, i64 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = udiv i32 undef, -16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V4i32 = udiv <4 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V8i32 = udiv <8 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 208 for instruction: %V16i32 = udiv <16 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256, i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i32 = udiv <4 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V8i32 = udiv <8 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V16i32 = udiv <16 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256, i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = udiv i16 undef, -16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %V8i16 = udiv <8 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 216 for instruction: %V16i16 = udiv <16 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 432 for instruction: %V32i16 = udiv <32 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8i16 = udiv <8 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 176 for instruction: %V16i16 = udiv <16 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 352 for instruction: %V32i16 = udiv <32 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = udiv i8 undef, -16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 220 for instruction: %V16i8 = udiv <16 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 440 for instruction: %V32i8 = udiv <32 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 880 for instruction: %V64i8 = udiv <64 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 168 for instruction: %V16i8 = udiv <16 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 336 for instruction: %V32i8 = udiv <32 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 672 for instruction: %V64i8 = udiv <64 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I64 = udiv i64 undef, -16

diff  --git a/llvm/test/Analysis/CostModel/AArch64/fptoi_sat.ll b/llvm/test/Analysis/CostModel/AArch64/fptoi_sat.ll
index b7975250fec453..a3524249684833 100644
--- a/llvm/test/Analysis/CostModel/AArch64/fptoi_sat.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/fptoi_sat.ll
@@ -34,7 +34,7 @@ define void @casts() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f32u16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f32(<2 x float> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f32(<2 x float> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32u32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f32(<2 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v2f32s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f32(<2 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v2f32s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f32(<2 x float> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f32u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f32(<2 x float> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v2f64s1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f64(<2 x double> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v2f64u1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f64(<2 x double> undef)
@@ -233,46 +233,46 @@ define void @fp16() {
 ; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f16u32 = call i32 @llvm.fptoui.sat.i32.f16(half undef)
 ; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %f16s64 = call i64 @llvm.fptosi.sat.i64.f16(half undef)
 ; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f16u64 = call i64 @llvm.fptoui.sat.i64.f16(half undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v2f16s1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f16(<2 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %v2f16u1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f16(<2 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v2f16s8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f16(<2 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %v2f16u8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f16(<2 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v2f16s16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f16(<2 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %v2f16u16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f16(<2 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v2f16s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f16(<2 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %v2f16u32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f16(<2 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %v2f16s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f16(<2 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %v2f16u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f16(<2 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 83 for instruction: %v4f16s1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f16(<4 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %v4f16u1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f16(<4 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %v4f16s8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f16(<4 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %v4f16u8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f16(<4 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %v4f16s16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f16(<4 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %v4f16u16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f16(<4 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 101 for instruction: %v4f16s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f16(<4 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 84 for instruction: %v4f16u32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f16(<4 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 163 for instruction: %v4f16s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f16(<4 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %v4f16u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f16(<4 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 271 for instruction: %v8f16s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f16(<8 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 206 for instruction: %v8f16u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f16(<8 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 250 for instruction: %v8f16s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f16(<8 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %v8f16u8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f16(<8 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 201 for instruction: %v8f16s16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f16(<8 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 143 for instruction: %v8f16u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f16(<8 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %v8f16s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f16(<8 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 187 for instruction: %v8f16u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f16(<8 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 366 for instruction: %v8f16s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f16(<8 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 177 for instruction: %v8f16u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f16(<8 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 547 for instruction: %v16f16s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f16(<16 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 413 for instruction: %v16f16u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f16(<16 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 504 for instruction: %v16f16s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f16(<16 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 385 for instruction: %v16f16u8 = call <16 x i8> @llvm.fptoui.sat.v16i8.v16f16(<16 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 360 for instruction: %v16f16s16 = call <16 x i16> @llvm.fptosi.sat.v16i16.v16f16(<16 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 286 for instruction: %v16f16u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f16(<16 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 448 for instruction: %v16f16s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f16(<16 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 374 for instruction: %v16f16u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f16(<16 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 732 for instruction: %v16f16s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f16(<16 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 354 for instruction: %v16f16u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f16(<16 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %v2f16s1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f16(<2 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v2f16u1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f16(<2 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %v2f16s8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f16(<2 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v2f16u8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f16(<2 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %v2f16s16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f16(<2 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v2f16u16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f16(<2 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %v2f16s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f16(<2 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v2f16u32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f16(<2 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %v2f16s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f16(<2 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %v2f16u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f16(<2 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %v4f16s1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f16(<4 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %v4f16u1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f16(<4 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %v4f16s8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f16(<4 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %v4f16u8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f16(<4 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %v4f16s16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f16(<4 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %v4f16u16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f16(<4 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %v4f16s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f16(<4 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v4f16u32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f16(<4 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 149 for instruction: %v4f16s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f16(<4 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %v4f16u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f16(<4 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 186 for instruction: %v8f16s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f16(<8 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %v8f16u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f16(<8 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 186 for instruction: %v8f16s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f16(<8 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %v8f16u8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f16(<8 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 147 for instruction: %v8f16s16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f16(<8 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 101 for instruction: %v8f16u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f16(<8 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 171 for instruction: %v8f16s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f16(<8 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 141 for instruction: %v8f16u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f16(<8 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 325 for instruction: %v8f16s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f16(<8 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 143 for instruction: %v8f16u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f16(<8 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 373 for instruction: %v16f16s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f16(<16 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 281 for instruction: %v16f16u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f16(<16 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 373 for instruction: %v16f16s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f16(<16 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 281 for instruction: %v16f16u8 = call <16 x i8> @llvm.fptoui.sat.v16i8.v16f16(<16 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 262 for instruction: %v16f16s16 = call <16 x i16> @llvm.fptosi.sat.v16i16.v16f16(<16 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 202 for instruction: %v16f16u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f16(<16 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 342 for instruction: %v16f16s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f16(<16 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 282 for instruction: %v16f16u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f16(<16 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 650 for instruction: %v16f16s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f16(<16 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 286 for instruction: %v16f16u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f16(<16 x half> undef)
 ; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-FP16-LABEL: 'fp16'
@@ -292,20 +292,20 @@ define void @fp16() {
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f16u8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f16(<2 x half> undef)
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16s16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f16(<2 x half> undef)
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16u16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f16(<2 x half> undef)
-; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v2f16s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f16(<2 x half> undef)
+; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v2f16s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f16(<2 x half> undef)
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f16u32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f16(<2 x half> undef)
-; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v2f16s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f16(<2 x half> undef)
-; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v2f16u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f16(<2 x half> undef)
+; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %v2f16s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f16(<2 x half> undef)
+; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v2f16u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f16(<2 x half> undef)
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f16s1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f16(<4 x half> undef)
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f16u1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f16(<4 x half> undef)
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f16s8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f16(<4 x half> undef)
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f16u8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f16(<4 x half> undef)
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f16s16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f16(<4 x half> undef)
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f16u16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f16(<4 x half> undef)
-; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v4f16s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f16(<4 x half> undef)
+; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v4f16s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f16(<4 x half> undef)
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f16u32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f16(<4 x half> undef)
-; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %v4f16s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f16(<4 x half> undef)
-; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %v4f16u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f16(<4 x half> undef)
+; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %v4f16s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f16(<4 x half> undef)
+; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %v4f16u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f16(<4 x half> undef)
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f16s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f16(<8 x half> undef)
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f16u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f16(<8 x half> undef)
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f16s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f16(<8 x half> undef)
@@ -314,8 +314,8 @@ define void @fp16() {
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f16u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f16(<8 x half> undef)
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8f16s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f16(<8 x half> undef)
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8f16u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f16(<8 x half> undef)
-; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 198 for instruction: %v8f16s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f16(<8 x half> undef)
-; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %v8f16u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f16(<8 x half> undef)
+; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 206 for instruction: %v8f16s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f16(<8 x half> undef)
+; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %v8f16u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f16(<8 x half> undef)
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v16f16s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f16(<16 x half> undef)
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v16f16u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f16(<16 x half> undef)
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v16f16s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f16(<16 x half> undef)
@@ -324,8 +324,8 @@ define void @fp16() {
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f16(<16 x half> undef)
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v16f16s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f16(<16 x half> undef)
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f16u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f16(<16 x half> undef)
-; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 400 for instruction: %v16f16s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f16(<16 x half> undef)
-; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 78 for instruction: %v16f16u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f16(<16 x half> undef)
+; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 416 for instruction: %v16f16s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f16(<16 x half> undef)
+; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 94 for instruction: %v16f16u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f16(<16 x half> undef)
 ; CHECK-FP16-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %f16s1 = call i1 @llvm.fptosi.sat.i1.f16(half undef)

diff  --git a/llvm/test/Analysis/CostModel/AArch64/free-widening-casts.ll b/llvm/test/Analysis/CostModel/AArch64/free-widening-casts.ll
index 1b62d681e9423b..7595abc71a9d8c 100644
--- a/llvm/test/Analysis/CostModel/AArch64/free-widening-casts.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/free-widening-casts.ll
@@ -616,7 +616,7 @@ define <4 x i64> @neg_llegal_vector_type_2(<4 x i16> %a, <4 x i64> %b) {
 }
 
 ; COST-LABEL: neg_llegal_vector_type_3
-; COST-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %tmp0 = zext <3 x i34> %a to <3 x i68>
+; COST-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %tmp0 = zext <3 x i34> %a to <3 x i68>
 define <3 x i68> @neg_llegal_vector_type_3(<3 x i34> %a, <3 x i68> %b) {
   %tmp0 = zext <3 x i34> %a to <3 x i68>
   %tmp1 = add <3 x i68> %b, %tmp0

diff  --git a/llvm/test/Analysis/CostModel/AArch64/fshl.ll b/llvm/test/Analysis/CostModel/AArch64/fshl.ll
index 6d4557cffd85ee..eab4efe0613b5d 100644
--- a/llvm/test/Analysis/CostModel/AArch64/fshl.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/fshl.ll
@@ -116,7 +116,7 @@ entry:
 
 define <16 x i8> @fshl_v16i8_3rd_arg_var(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) {
 ; CHECK-LABEL: 'fshl_v16i8_3rd_arg_var'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 144 for instruction: %fshl = tail call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 118 for instruction: %fshl = tail call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %fshl
 ;
 entry:
@@ -148,7 +148,7 @@ entry:
 
 define <8 x i16> @fshl_v8i16_3rd_arg_var(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) {
 ; CHECK-LABEL: 'fshl_v8i16_3rd_arg_var'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %fshl = tail call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %fshl = tail call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %fshl
 ;
 entry:
@@ -180,7 +180,7 @@ entry:
 
 define <4 x i32> @fshl_v4i32_3rd_arg_var(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 ; CHECK-LABEL: 'fshl_v4i32_3rd_arg_var'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %fshl = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %fshl = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %fshl
 ;
 entry:
@@ -212,7 +212,7 @@ entry:
 
 define <2 x i64> @fshl_v2i64_3rd_arg_var(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) {
 ; CHECK-LABEL: 'fshl_v2i64_3rd_arg_var'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %fshl = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %fshl = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %fshl
 ;
 entry:
@@ -224,7 +224,7 @@ declare <2 x i64> @llvm.fshl.v4i64(<2 x i64>, <2 x i64>, <2 x i64>)
 
 define <4 x i30> @fshl_v4i30_3rd_arg_var(<4 x i30> %a, <4 x i30> %b, <4 x i30> %c) {
 ; CHECK-LABEL: 'fshl_v4i30_3rd_arg_var'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %fshl = tail call <4 x i30> @llvm.fshl.v4i30(<4 x i30> %a, <4 x i30> %b, <4 x i30> %c)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %fshl = tail call <4 x i30> @llvm.fshl.v4i30(<4 x i30> %a, <4 x i30> %b, <4 x i30> %c)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i30> %fshl
 ;
 entry:
@@ -236,7 +236,7 @@ declare <4 x i30> @llvm.fshl.v4i30(<4 x i30>, <4 x i30>, <4 x i30>)
 
 define <2 x i66> @fshl_v2i66_3rd_arg_vec_const_lanes_
diff erent(<2 x i66> %a, <2 x i66> %b) {
 ; CHECK-LABEL: 'fshl_v2i66_3rd_arg_vec_const_lanes_
diff erent'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %fshl = tail call <2 x i66> @llvm.fshl.v2i66(<2 x i66> %a, <2 x i66> %b, <2 x i66> <i66 1, i66 2>)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %fshl = tail call <2 x i66> @llvm.fshl.v2i66(<2 x i66> %a, <2 x i66> %b, <2 x i66> <i66 1, i66 2>)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i66> %fshl
 ;
 entry:
@@ -259,7 +259,7 @@ declare i66 @llvm.fshl.i66(i66, i66, i66)
 
 define <2 x i128> @fshl_v2i128_3rd_arg_vec_const_lanes_
diff erent(<2 x i128> %a, <2 x i128> %b) {
 ; CHECK-LABEL: 'fshl_v2i128_3rd_arg_vec_const_lanes_
diff erent'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %fshl = tail call <2 x i128> @llvm.fshl.v2i128(<2 x i128> %a, <2 x i128> %b, <2 x i128> <i128 1, i128 2>)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %fshl = tail call <2 x i128> @llvm.fshl.v2i128(<2 x i128> %a, <2 x i128> %b, <2 x i128> <i128 1, i128 2>)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i128> %fshl
 ;
 entry:

diff  --git a/llvm/test/Analysis/CostModel/AArch64/fshr.ll b/llvm/test/Analysis/CostModel/AArch64/fshr.ll
index ab40c287cb5e52..1876f0410c2337 100644
--- a/llvm/test/Analysis/CostModel/AArch64/fshr.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/fshr.ll
@@ -116,7 +116,7 @@ entry:
 
 define <16 x i8> @fshr_v16i8_3rd_arg_var(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) {
 ; CHECK-LABEL: 'fshr_v16i8_3rd_arg_var'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 144 for instruction: %fshr = tail call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 118 for instruction: %fshr = tail call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %fshr
 ;
 entry:
@@ -148,7 +148,7 @@ entry:
 
 define <8 x i16> @fshr_v8i16_3rd_arg_var(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) {
 ; CHECK-LABEL: 'fshr_v8i16_3rd_arg_var'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %fshr = tail call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %fshr = tail call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %fshr
 ;
 entry:
@@ -180,7 +180,7 @@ entry:
 
 define <4 x i32> @fshr_v4i32_3rd_arg_var(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 ; CHECK-LABEL: 'fshr_v4i32_3rd_arg_var'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %fshr = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %fshr = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %fshr
 ;
 entry:
@@ -212,7 +212,7 @@ entry:
 
 define <2 x i64> @fshr_v2i64_3rd_arg_var(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) {
 ; CHECK-LABEL: 'fshr_v2i64_3rd_arg_var'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %fshr = tail call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %fshr = tail call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %fshr
 ;
 entry:
@@ -224,7 +224,7 @@ declare <2 x i64> @llvm.fshr.v4i64(<2 x i64>, <2 x i64>, <2 x i64>)
 
 define <4 x i30> @fshr_v4i30_3rd_arg_var(<4 x i30> %a, <4 x i30> %b, <4 x i30> %c) {
 ; CHECK-LABEL: 'fshr_v4i30_3rd_arg_var'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %fshr = tail call <4 x i30> @llvm.fshr.v4i30(<4 x i30> %a, <4 x i30> %b, <4 x i30> %c)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %fshr = tail call <4 x i30> @llvm.fshr.v4i30(<4 x i30> %a, <4 x i30> %b, <4 x i30> %c)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i30> %fshr
 ;
 entry:
@@ -236,7 +236,7 @@ declare <4 x i30> @llvm.fshr.v4i30(<4 x i30>, <4 x i30>, <4 x i30>)
 
 define <2 x i66> @fshr_v2i66_3rd_arg_vec_const_lanes_
diff erent(<2 x i66> %a, <2 x i66> %b) {
 ; CHECK-LABEL: 'fshr_v2i66_3rd_arg_vec_const_lanes_
diff erent'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %fshr = tail call <2 x i66> @llvm.fshr.v2i66(<2 x i66> %a, <2 x i66> %b, <2 x i66> <i66 1, i66 2>)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %fshr = tail call <2 x i66> @llvm.fshr.v2i66(<2 x i66> %a, <2 x i66> %b, <2 x i66> <i66 1, i66 2>)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i66> %fshr
 ;
 entry:
@@ -259,7 +259,7 @@ declare i66 @llvm.fshr.i66(i66, i66, i66)
 
 define <2 x i128> @fshr_v2i128_3rd_arg_vec_const_lanes_
diff erent(<2 x i128> %a, <2 x i128> %b) {
 ; CHECK-LABEL: 'fshr_v2i128_3rd_arg_vec_const_lanes_
diff erent'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %fshr = tail call <2 x i128> @llvm.fshr.v2i128(<2 x i128> %a, <2 x i128> %b, <2 x i128> <i128 1, i128 2>)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %fshr = tail call <2 x i128> @llvm.fshr.v2i128(<2 x i128> %a, <2 x i128> %b, <2 x i128> <i128 1, i128 2>)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i128> %fshr
 ;
 entry:

diff  --git a/llvm/test/Analysis/CostModel/AArch64/getIntrinsicInstrCost-vector-reverse.ll b/llvm/test/Analysis/CostModel/AArch64/getIntrinsicInstrCost-vector-reverse.ll
index aa512be5f40c59..836a028ad6aa52 100644
--- a/llvm/test/Analysis/CostModel/AArch64/getIntrinsicInstrCost-vector-reverse.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/getIntrinsicInstrCost-vector-reverse.ll
@@ -21,8 +21,8 @@ define void @vector_reverse() #0{
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %12 = call <8 x float> @llvm.experimental.vector.reverse.v8f32(<8 x float> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %13 = call <2 x double> @llvm.experimental.vector.reverse.v2f64(<2 x double> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %14 = call <4 x double> @llvm.experimental.vector.reverse.v4f64(<4 x double> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %15 = call <8 x bfloat> @llvm.experimental.vector.reverse.v8bf16(<8 x bfloat> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 84 for instruction: %16 = call <16 x bfloat> @llvm.experimental.vector.reverse.v16bf16(<16 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %15 = call <8 x bfloat> @llvm.experimental.vector.reverse.v8bf16(<8 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %16 = call <16 x bfloat> @llvm.experimental.vector.reverse.v16bf16(<16 x bfloat> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 

diff  --git a/llvm/test/Analysis/CostModel/AArch64/insert-extract.ll b/llvm/test/Analysis/CostModel/AArch64/insert-extract.ll
index 049e7d783b0cb4..babc1c42c74385 100644
--- a/llvm/test/Analysis/CostModel/AArch64/insert-extract.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/insert-extract.ll
@@ -11,38 +11,38 @@ target triple = "aarch64--linux-gnu"
 
 define void @vectorInstrCost() {
 ; CHECK-LABEL: 'vectorInstrCost'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %ta0 = extractelement <8 x i1> undef, i32 0
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %ta1 = extractelement <8 x i1> undef, i32 1
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %t1 = extractelement <8 x i8> undef, i32 0
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %t2 = extractelement <8 x i8> undef, i32 1
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %t3 = extractelement <4 x i16> undef, i32 0
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %t4 = extractelement <4 x i16> undef, i32 1
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %t5 = extractelement <2 x i32> undef, i32 0
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %t6 = extractelement <2 x i32> undef, i32 1
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %t7 = extractelement <2 x i64> undef, i32 0
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %t8 = extractelement <2 x i64> undef, i32 1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %ta0 = extractelement <8 x i1> undef, i32 0
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %ta1 = extractelement <8 x i1> undef, i32 1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t1 = extractelement <8 x i8> undef, i32 0
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t2 = extractelement <8 x i8> undef, i32 1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t3 = extractelement <4 x i16> undef, i32 0
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t4 = extractelement <4 x i16> undef, i32 1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t5 = extractelement <2 x i32> undef, i32 0
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t6 = extractelement <2 x i32> undef, i32 1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t7 = extractelement <2 x i64> undef, i32 0
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t8 = extractelement <2 x i64> undef, i32 1
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %t9 = extractelement <4 x half> undef, i32 0
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %t10 = extractelement <4 x half> undef, i32 1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t10 = extractelement <4 x half> undef, i32 1
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %t11 = extractelement <2 x float> undef, i32 0
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %t12 = extractelement <2 x float> undef, i32 1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t12 = extractelement <2 x float> undef, i32 1
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %t13 = extractelement <2 x double> undef, i32 0
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %t14 = extractelement <2 x double> undef, i32 1
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t31 = insertelement <8 x i1> undef, i1 false, i32 0
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t41 = insertelement <8 x i1> undef, i1 true, i32 1
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %t30 = insertelement <8 x i8> undef, i8 0, i32 0
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %t40 = insertelement <8 x i8> undef, i8 1, i32 1
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %t50 = insertelement <4 x i16> undef, i16 2, i32 0
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %t60 = insertelement <4 x i16> undef, i16 3, i32 1
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %t70 = insertelement <2 x i32> undef, i32 4, i32 0
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %t80 = insertelement <2 x i32> undef, i32 5, i32 1
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %t90 = insertelement <2 x i64> undef, i64 6, i32 0
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %t100 = insertelement <2 x i64> undef, i64 7, i32 1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t14 = extractelement <2 x double> undef, i32 1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %t31 = insertelement <8 x i1> undef, i1 false, i32 0
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %t41 = insertelement <8 x i1> undef, i1 true, i32 1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t30 = insertelement <8 x i8> undef, i8 0, i32 0
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t40 = insertelement <8 x i8> undef, i8 1, i32 1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t50 = insertelement <4 x i16> undef, i16 2, i32 0
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t60 = insertelement <4 x i16> undef, i16 3, i32 1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t70 = insertelement <2 x i32> undef, i32 4, i32 0
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t80 = insertelement <2 x i32> undef, i32 5, i32 1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t90 = insertelement <2 x i64> undef, i64 6, i32 0
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t100 = insertelement <2 x i64> undef, i64 7, i32 1
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %t110 = insertelement <4 x half> zeroinitializer, half 0xH0000, i64 0
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %t120 = insertelement <4 x half> zeroinitializer, half 0xH0000, i64 1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t120 = insertelement <4 x half> zeroinitializer, half 0xH0000, i64 1
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %t130 = insertelement <2 x float> zeroinitializer, float 0.000000e+00, i64 0
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %t140 = insertelement <2 x float> zeroinitializer, float 0.000000e+00, i64 1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t140 = insertelement <2 x float> zeroinitializer, float 0.000000e+00, i64 1
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %t150 = insertelement <2 x double> zeroinitializer, double 0.000000e+00, i64 0
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %t160 = insertelement <2 x double> zeroinitializer, double 0.000000e+00, i64 1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t160 = insertelement <2 x double> zeroinitializer, double 0.000000e+00, i64 1
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; KRYO-LABEL: 'vectorInstrCost'
@@ -122,7 +122,7 @@ define void @vectorInstrCost() {
 define <8 x i8> @LD1_B(<8 x i8> %vec, ptr noundef %i) {
 ; CHECK-LABEL: 'LD1_B'
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v1 = load i8, ptr %i, align 1
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2 = insertelement <8 x i8> %vec, i8 %v1, i32 1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2 = insertelement <8 x i8> %vec, i8 %v1, i32 1
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i8> %v2
 ;
 ; KRYO-LABEL: 'LD1_B'
@@ -139,7 +139,7 @@ entry:
 define <4 x i16> @LD1_H(<4 x i16> %vec, ptr noundef %i) {
 ; CHECK-LABEL: 'LD1_H'
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v1 = load i16, ptr %i, align 2
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2 = insertelement <4 x i16> %vec, i16 %v1, i32 2
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2 = insertelement <4 x i16> %vec, i16 %v1, i32 2
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i16> %v2
 ;
 ; KRYO-LABEL: 'LD1_H'
@@ -156,7 +156,7 @@ entry:
 define <4 x i32> @LD1_W(<4 x i32> %vec, ptr noundef %i) {
 ; CHECK-LABEL: 'LD1_W'
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v1 = load i32, ptr %i, align 4
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2 = insertelement <4 x i32> %vec, i32 %v1, i32 3
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2 = insertelement <4 x i32> %vec, i32 %v1, i32 3
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %v2
 ;
 ; KRYO-LABEL: 'LD1_W'
@@ -173,7 +173,7 @@ entry:
 define <2 x i64> @LD1_X(<2 x i64> %vec, ptr noundef %i) {
 ; CHECK-LABEL: 'LD1_X'
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v1 = load i64, ptr %i, align 8
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2 = insertelement <2 x i64> %vec, i64 %v1, i32 0
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2 = insertelement <2 x i64> %vec, i64 %v1, i32 0
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %v2
 ;
 ; KRYO-LABEL: 'LD1_X'

diff  --git a/llvm/test/Analysis/CostModel/AArch64/masked_ldst.ll b/llvm/test/Analysis/CostModel/AArch64/masked_ldst.ll
index 92ecfe8c436473..ee41c69baf2c8b 100644
--- a/llvm/test/Analysis/CostModel/AArch64/masked_ldst.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/masked_ldst.ll
@@ -5,24 +5,24 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 
 define void @fixed() {
 ; CHECK-LABEL: 'fixed'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v2i8 = call <2 x i8> @llvm.masked.load.v2i8.p0(ptr undef, i32 8, <2 x i1> undef, <2 x i8> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v4i8 = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr undef, i32 8, <4 x i1> undef, <4 x i8> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: %v8i8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 8, <8 x i1> undef, <8 x i8> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 109 for instruction: %v16i8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 8, <16 x i1> undef, <16 x i8> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v2i16 = call <2 x i16> @llvm.masked.load.v2i16.p0(ptr undef, i32 8, <2 x i1> undef, <2 x i16> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v4i16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 8, <4 x i1> undef, <4 x i16> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: %v8i16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 8, <8 x i1> undef, <8 x i16> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v2i32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 8, <2 x i1> undef, <2 x i32> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v4i32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 8, <4 x i1> undef, <4 x i32> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v2i64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 8, <2 x i1> undef, <2 x i64> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v2f16 = call <2 x half> @llvm.masked.load.v2f16.p0(ptr undef, i32 8, <2 x i1> undef, <2 x half> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v4f16 = call <4 x half> @llvm.masked.load.v4f16.p0(ptr undef, i32 8, <4 x i1> undef, <4 x half> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: %v8f16 = call <8 x half> @llvm.masked.load.v8f16.p0(ptr undef, i32 8, <8 x i1> undef, <8 x half> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v2f32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 8, <2 x i1> undef, <2 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v4f32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 8, <4 x i1> undef, <4 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v2f64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 8, <2 x i1> undef, <2 x double> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %v4i64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 8, <4 x i1> undef, <4 x i64> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 212 for instruction: %v32f16 = call <32 x half> @llvm.masked.load.v32f16.p0(ptr undef, i32 8, <32 x i1> undef, <32 x half> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v2i8 = call <2 x i8> @llvm.masked.load.v2i8.p0(ptr undef, i32 8, <2 x i1> undef, <2 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v4i8 = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr undef, i32 8, <4 x i1> undef, <4 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v8i8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 8, <8 x i1> undef, <8 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v16i8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 8, <16 x i1> undef, <16 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v2i16 = call <2 x i16> @llvm.masked.load.v2i16.p0(ptr undef, i32 8, <2 x i1> undef, <2 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v4i16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 8, <4 x i1> undef, <4 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v8i16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 8, <8 x i1> undef, <8 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v2i32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 8, <2 x i1> undef, <2 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v4i32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 8, <4 x i1> undef, <4 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v2i64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 8, <2 x i1> undef, <2 x i64> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v2f16 = call <2 x half> @llvm.masked.load.v2f16.p0(ptr undef, i32 8, <2 x i1> undef, <2 x half> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %v4f16 = call <4 x half> @llvm.masked.load.v4f16.p0(ptr undef, i32 8, <4 x i1> undef, <4 x half> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %v8f16 = call <8 x half> @llvm.masked.load.v8f16.p0(ptr undef, i32 8, <8 x i1> undef, <8 x half> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v2f32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 8, <2 x i1> undef, <2 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %v4f32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 8, <4 x i1> undef, <4 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v2f64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 8, <2 x i1> undef, <2 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v4i64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 8, <4 x i1> undef, <4 x i64> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 184 for instruction: %v32f16 = call <32 x half> @llvm.masked.load.v32f16.p0(ptr undef, i32 8, <32 x i1> undef, <32 x half> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 entry:

diff  --git a/llvm/test/Analysis/CostModel/AArch64/mem-op-cost-model.ll b/llvm/test/Analysis/CostModel/AArch64/mem-op-cost-model.ll
index ab66d93bb8995a..bf8458ba56e848 100644
--- a/llvm/test/Analysis/CostModel/AArch64/mem-op-cost-model.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/mem-op-cost-model.ll
@@ -190,11 +190,11 @@ declare <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr>, i32 immarg, <4 x i1>,
 define <4 x i8> @gather_load_4xi8_constant_mask(<4 x ptr> %ptrs) {
 ; CHECK:         gather_load_4xi8_constant_mask
 ; CHECK-NEON-LABEL: 'gather_load_4xi8_constant_mask'
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> %ptrs, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i8> undef)
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> %ptrs, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i8> undef)
 ; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i8> %lv
 ;
 ; CHECK-SVE-128-LABEL: 'gather_load_4xi8_constant_mask'
-; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> %ptrs, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i8> undef)
+; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> %ptrs, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i8> undef)
 ; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i8> %lv
 ;
 ; CHECK-SVE-256-LABEL: 'gather_load_4xi8_constant_mask'
@@ -212,11 +212,11 @@ define <4 x i8> @gather_load_4xi8_constant_mask(<4 x ptr> %ptrs) {
 define <4 x i8> @gather_load_4xi8_variable_mask(<4 x ptr> %ptrs, <4 x i1> %cond) {
 ; CHECK:         gather_load_4xi8_variable_mask
 ; CHECK-NEON-LABEL: 'gather_load_4xi8_variable_mask'
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> %ptrs, i32 1, <4 x i1> %cond, <4 x i8> undef)
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> %ptrs, i32 1, <4 x i1> %cond, <4 x i8> undef)
 ; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i8> %lv
 ;
 ; CHECK-SVE-128-LABEL: 'gather_load_4xi8_variable_mask'
-; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> %ptrs, i32 1, <4 x i1> %cond, <4 x i8> undef)
+; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> %ptrs, i32 1, <4 x i1> %cond, <4 x i8> undef)
 ; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i8> %lv
 ;
 ; CHECK-SVE-256-LABEL: 'gather_load_4xi8_variable_mask'
@@ -235,11 +235,11 @@ declare void @llvm.masked.scatter.v4i8.v4p0(<4 x i8>, <4 x ptr>, i32 immarg, <4
 define void @scatter_store_4xi8_constant_mask(<4 x i8> %val, <4 x ptr> %ptrs) {
 ; CHECK:         scatter_store_4xi8_constant_mask
 ; CHECK-NEON-LABEL: 'scatter_store_4xi8_constant_mask'
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> %val, <4 x ptr> %ptrs, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> %val, <4 x ptr> %ptrs, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
 ; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-SVE-128-LABEL: 'scatter_store_4xi8_constant_mask'
-; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> %val, <4 x ptr> %ptrs, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> %val, <4 x ptr> %ptrs, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
 ; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-SVE-256-LABEL: 'scatter_store_4xi8_constant_mask'
@@ -257,11 +257,11 @@ define void @scatter_store_4xi8_constant_mask(<4 x i8> %val, <4 x ptr> %ptrs) {
 define void @scatter_store_4xi8_variable_mask(<4 x i8> %val, <4 x ptr> %ptrs, <4 x i1> %cond) {
 ; CHECK:         scatter_store_4xi8_variable_mask
 ; CHECK-NEON-LABEL: 'scatter_store_4xi8_variable_mask'
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> %val, <4 x ptr> %ptrs, i32 1, <4 x i1> %cond)
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> %val, <4 x ptr> %ptrs, i32 1, <4 x i1> %cond)
 ; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-SVE-128-LABEL: 'scatter_store_4xi8_variable_mask'
-; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> %val, <4 x ptr> %ptrs, i32 1, <4 x i1> %cond)
+; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> %val, <4 x ptr> %ptrs, i32 1, <4 x i1> %cond)
 ; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-SVE-256-LABEL: 'scatter_store_4xi8_variable_mask'
@@ -280,11 +280,11 @@ declare <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr>, i32 immarg, <4 x i1>
 define <4 x i32> @gather_load_4xi32_constant_mask(<4 x ptr> %ptrs) {
 ; CHECK:         gather_load_4xi32_constant_mask
 ; CHECK-NEON-LABEL: 'gather_load_4xi32_constant_mask'
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
 ; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %lv
 ;
 ; CHECK-SVE-128-LABEL: 'gather_load_4xi32_constant_mask'
-; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
+; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
 ; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %lv
 ;
 ; CHECK-SVE-256-LABEL: 'gather_load_4xi32_constant_mask'
@@ -302,11 +302,11 @@ define <4 x i32> @gather_load_4xi32_constant_mask(<4 x ptr> %ptrs) {
 define <4 x i32> @gather_load_4xi32_variable_mask(<4 x ptr> %ptrs, <4 x i1> %cond) {
 ; CHECK:         gather_load_4xi32_variable_mask
 ; CHECK-NEON-LABEL: 'gather_load_4xi32_variable_mask'
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 1, <4 x i1> %cond, <4 x i32> undef)
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 1, <4 x i1> %cond, <4 x i32> undef)
 ; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %lv
 ;
 ; CHECK-SVE-128-LABEL: 'gather_load_4xi32_variable_mask'
-; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 1, <4 x i1> %cond, <4 x i32> undef)
+; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 1, <4 x i1> %cond, <4 x i32> undef)
 ; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %lv
 ;
 ; CHECK-SVE-256-LABEL: 'gather_load_4xi32_variable_mask'
@@ -325,11 +325,11 @@ declare void @llvm.masked.scatter.v4i32.v4p0(<4 x i32>, <4 x ptr>, i32 immarg, <
 define void @scatter_store_4xi32_constant_mask(<4 x i32> %val, <4 x ptr> %ptrs) {
 ; CHECK:         scatter_store_4xi32_constant_mask
 ; CHECK-NEON-LABEL: 'scatter_store_4xi32_constant_mask'
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %val, <4 x ptr> %ptrs, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %val, <4 x ptr> %ptrs, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
 ; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-SVE-128-LABEL: 'scatter_store_4xi32_constant_mask'
-; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %val, <4 x ptr> %ptrs, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %val, <4 x ptr> %ptrs, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
 ; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-SVE-256-LABEL: 'scatter_store_4xi32_constant_mask'
@@ -347,11 +347,11 @@ define void @scatter_store_4xi32_constant_mask(<4 x i32> %val, <4 x ptr> %ptrs)
 define void @scatter_store_4xi32_variable_mask(<4 x i32> %val, <4 x ptr> %ptrs, <4 x i1> %cond) {
 ; CHECK:         scatter_store_4xi32_variable_mask
 ; CHECK-NEON-LABEL: 'scatter_store_4xi32_variable_mask'
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %val, <4 x ptr> %ptrs, i32 1, <4 x i1> %cond)
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %val, <4 x ptr> %ptrs, i32 1, <4 x i1> %cond)
 ; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-SVE-128-LABEL: 'scatter_store_4xi32_variable_mask'
-; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %val, <4 x ptr> %ptrs, i32 1, <4 x i1> %cond)
+; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %val, <4 x ptr> %ptrs, i32 1, <4 x i1> %cond)
 ; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-SVE-256-LABEL: 'scatter_store_4xi32_variable_mask'
@@ -370,11 +370,11 @@ declare <256 x i16> @llvm.masked.gather.v256i16.v256p0(<256 x ptr>, i32, <256 x
 define void @sve_gather_vls(<256 x i1> %v256i1mask) {
 ; CHECK-LABEL: 'sve_scatter_vls'
 ; CHECK-NEON-LABEL: 'sve_gather_vls'
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1952 for instruction: %res.v256i16 = call <256 x i16> @llvm.masked.gather.v256i16.v256p0(<256 x ptr> undef, i32 0, <256 x i1> %v256i1mask, <256 x i16> zeroinitializer)
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1792 for instruction: %res.v256i16 = call <256 x i16> @llvm.masked.gather.v256i16.v256p0(<256 x ptr> undef, i32 0, <256 x i1> %v256i1mask, <256 x i16> zeroinitializer)
 ; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-SVE-128-LABEL: 'sve_gather_vls'
-; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 1952 for instruction: %res.v256i16 = call <256 x i16> @llvm.masked.gather.v256i16.v256p0(<256 x ptr> undef, i32 0, <256 x i1> %v256i1mask, <256 x i16> zeroinitializer)
+; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 1792 for instruction: %res.v256i16 = call <256 x i16> @llvm.masked.gather.v256i16.v256p0(<256 x ptr> undef, i32 0, <256 x i1> %v256i1mask, <256 x i16> zeroinitializer)
 ; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-SVE-256-LABEL: 'sve_gather_vls'
@@ -394,11 +394,11 @@ declare <256 x float> @llvm.masked.gather.v256f32.v256p0(<256 x ptr>, i32, <256
 define void @sve_gather_vls_float(<256 x i1> %v256i1mask) {
 ; CHECK-LABEL: 'sve_gather_vls_float'
 ; CHECK-NEON-LABEL: 'sve_gather_vls_float'
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1856 for instruction: %res.v256f32 = call <256 x float> @llvm.masked.gather.v256f32.v256p0(<256 x ptr> undef, i32 0, <256 x i1> %v256i1mask, <256 x float> zeroinitializer)
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1664 for instruction: %res.v256f32 = call <256 x float> @llvm.masked.gather.v256f32.v256p0(<256 x ptr> undef, i32 0, <256 x i1> %v256i1mask, <256 x float> zeroinitializer)
 ; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-SVE-128-LABEL: 'sve_gather_vls_float'
-; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 1856 for instruction: %res.v256f32 = call <256 x float> @llvm.masked.gather.v256f32.v256p0(<256 x ptr> undef, i32 0, <256 x i1> %v256i1mask, <256 x float> zeroinitializer)
+; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 1664 for instruction: %res.v256f32 = call <256 x float> @llvm.masked.gather.v256f32.v256p0(<256 x ptr> undef, i32 0, <256 x i1> %v256i1mask, <256 x float> zeroinitializer)
 ; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-SVE-256-LABEL: 'sve_gather_vls_float'
@@ -418,11 +418,11 @@ declare void @llvm.masked.scatter.v256i8.v256p0(<256 x i8>, <256 x ptr>, i32, <2
 define void @sve_scatter_vls(<256 x i1> %v256i1mask){
 ; CHECK-LABEL: 'sve_scatter_vls'
 ; CHECK-NEON-LABEL: 'sve_scatter_vls'
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 2000 for instruction: call void @llvm.masked.scatter.v256i8.v256p0(<256 x i8> undef, <256 x ptr> undef, i32 0, <256 x i1> %v256i1mask)
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1792 for instruction: call void @llvm.masked.scatter.v256i8.v256p0(<256 x i8> undef, <256 x ptr> undef, i32 0, <256 x i1> %v256i1mask)
 ; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-SVE-128-LABEL: 'sve_scatter_vls'
-; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 2000 for instruction: call void @llvm.masked.scatter.v256i8.v256p0(<256 x i8> undef, <256 x ptr> undef, i32 0, <256 x i1> %v256i1mask)
+; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 1792 for instruction: call void @llvm.masked.scatter.v256i8.v256p0(<256 x i8> undef, <256 x ptr> undef, i32 0, <256 x i1> %v256i1mask)
 ; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-SVE-256-LABEL: 'sve_scatter_vls'
@@ -442,11 +442,11 @@ declare void @llvm.masked.scatter.v512f16.v512p0(<512 x half>, <512 x ptr>, i32,
 define void @sve_scatter_vls_float(<512 x i1> %v512i1mask){
 ; CHECK-LABEL: 'sve_scatter_vls_float'
 ; CHECK-NEON-LABEL: 'sve_scatter_vls_float'
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 3904 for instruction: call void @llvm.masked.scatter.v512f16.v512p0(<512 x half> undef, <512 x ptr> undef, i32 0, <512 x i1> %v512i1mask)
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 3456 for instruction: call void @llvm.masked.scatter.v512f16.v512p0(<512 x half> undef, <512 x ptr> undef, i32 0, <512 x i1> %v512i1mask)
 ; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-SVE-128-LABEL: 'sve_scatter_vls_float'
-; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 3904 for instruction: call void @llvm.masked.scatter.v512f16.v512p0(<512 x half> undef, <512 x ptr> undef, i32 0, <512 x i1> %v512i1mask)
+; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 3456 for instruction: call void @llvm.masked.scatter.v512f16.v512p0(<512 x half> undef, <512 x ptr> undef, i32 0, <512 x i1> %v512i1mask)
 ; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-SVE-256-LABEL: 'sve_scatter_vls_float'

diff  --git a/llvm/test/Analysis/CostModel/AArch64/min-max.ll b/llvm/test/Analysis/CostModel/AArch64/min-max.ll
index d80eb280737c4e..013d4876c9f379 100644
--- a/llvm/test/Analysis/CostModel/AArch64/min-max.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/min-max.ll
@@ -195,10 +195,10 @@ define void @smax() {
 define void @minnum16() {
 ; CHECK-NOF16-LABEL: 'minnum16'
 ; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.minnum.f16(half undef, half undef)
-; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2f16 = call <2 x half> @llvm.minnum.v2f16(<2 x half> undef, <2 x half> undef)
-; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4f16 = call <4 x half> @llvm.minnum.v4f16(<4 x half> undef, <4 x half> undef)
-; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V8f16 = call <8 x half> @llvm.minnum.v8f16(<8 x half> undef, <8 x half> undef)
-; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %V16f16 = call <16 x half> @llvm.minnum.v16f16(<16 x half> undef, <16 x half> undef)
+; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2f16 = call <2 x half> @llvm.minnum.v2f16(<2 x half> undef, <2 x half> undef)
+; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4f16 = call <4 x half> @llvm.minnum.v4f16(<4 x half> undef, <4 x half> undef)
+; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8f16 = call <8 x half> @llvm.minnum.v8f16(<8 x half> undef, <8 x half> undef)
+; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V16f16 = call <16 x half> @llvm.minnum.v16f16(<16 x half> undef, <16 x half> undef)
 ; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-F16-LABEL: 'minnum16'
@@ -220,10 +220,10 @@ define void @minnum16() {
 define void @maxnum16() {
 ; CHECK-NOF16-LABEL: 'maxnum16'
 ; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.maxnum.f16(half undef, half undef)
-; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2f16 = call <2 x half> @llvm.maxnum.v2f16(<2 x half> undef, <2 x half> undef)
-; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4f16 = call <4 x half> @llvm.maxnum.v4f16(<4 x half> undef, <4 x half> undef)
-; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V8f16 = call <8 x half> @llvm.maxnum.v8f16(<8 x half> undef, <8 x half> undef)
-; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %V16f16 = call <16 x half> @llvm.maxnum.v16f16(<16 x half> undef, <16 x half> undef)
+; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2f16 = call <2 x half> @llvm.maxnum.v2f16(<2 x half> undef, <2 x half> undef)
+; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4f16 = call <4 x half> @llvm.maxnum.v4f16(<4 x half> undef, <4 x half> undef)
+; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8f16 = call <8 x half> @llvm.maxnum.v8f16(<8 x half> undef, <8 x half> undef)
+; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V16f16 = call <16 x half> @llvm.maxnum.v16f16(<16 x half> undef, <16 x half> undef)
 ; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-F16-LABEL: 'maxnum16'
@@ -288,10 +288,10 @@ define void @maxnum() {
 define void @minimum16() {
 ; CHECK-NOF16-LABEL: 'minimum16'
 ; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.minimum.f16(half undef, half undef)
-; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2f16 = call <2 x half> @llvm.minimum.v2f16(<2 x half> undef, <2 x half> undef)
-; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4f16 = call <4 x half> @llvm.minimum.v4f16(<4 x half> undef, <4 x half> undef)
-; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V8f16 = call <8 x half> @llvm.minimum.v8f16(<8 x half> undef, <8 x half> undef)
-; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %V16f16 = call <16 x half> @llvm.minimum.v16f16(<16 x half> undef, <16 x half> undef)
+; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2f16 = call <2 x half> @llvm.minimum.v2f16(<2 x half> undef, <2 x half> undef)
+; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4f16 = call <4 x half> @llvm.minimum.v4f16(<4 x half> undef, <4 x half> undef)
+; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8f16 = call <8 x half> @llvm.minimum.v8f16(<8 x half> undef, <8 x half> undef)
+; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V16f16 = call <16 x half> @llvm.minimum.v16f16(<16 x half> undef, <16 x half> undef)
 ; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-F16-LABEL: 'minimum16'
@@ -313,10 +313,10 @@ define void @minimum16() {
 define void @maximum16() {
 ; CHECK-NOF16-LABEL: 'maximum16'
 ; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.maximum.f16(half undef, half undef)
-; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2f16 = call <2 x half> @llvm.maximum.v2f16(<2 x half> undef, <2 x half> undef)
-; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4f16 = call <4 x half> @llvm.maximum.v4f16(<4 x half> undef, <4 x half> undef)
-; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V8f16 = call <8 x half> @llvm.maximum.v8f16(<8 x half> undef, <8 x half> undef)
-; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %V16f16 = call <16 x half> @llvm.maximum.v16f16(<16 x half> undef, <16 x half> undef)
+; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2f16 = call <2 x half> @llvm.maximum.v2f16(<2 x half> undef, <2 x half> undef)
+; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4f16 = call <4 x half> @llvm.maximum.v4f16(<4 x half> undef, <4 x half> undef)
+; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8f16 = call <8 x half> @llvm.maximum.v8f16(<8 x half> undef, <8 x half> undef)
+; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V16f16 = call <16 x half> @llvm.maximum.v16f16(<16 x half> undef, <16 x half> undef)
 ; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-F16-LABEL: 'maximum16'

diff  --git a/llvm/test/Analysis/CostModel/AArch64/reduce-fadd.ll b/llvm/test/Analysis/CostModel/AArch64/reduce-fadd.ll
index fee770de564b42..737437282d9bd2 100644
--- a/llvm/test/Analysis/CostModel/AArch64/reduce-fadd.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/reduce-fadd.ll
@@ -7,35 +7,35 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 
 define void @strict_fp_reductions() {
 ; CHECK-LABEL: 'strict_fp_reductions'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %fadd_v4f16 = call half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %fadd_v8f16 = call half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %fadd_v4f32 = call float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %fadd_v8f32 = call float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %fadd_v2f64 = call double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %fadd_v4f64 = call double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %fadd_v4f16 = call half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %fadd_v8f16 = call half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %fadd_v4f32 = call float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %fadd_v8f32 = call float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %fadd_v2f64 = call double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %fadd_v4f64 = call double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %fadd_v4f8 = call bfloat @llvm.vector.reduce.fadd.v4bf16(bfloat 0xR0000, <4 x bfloat> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %fadd_v4f128 = call fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; FP16-LABEL: 'strict_fp_reductions'
-; FP16-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %fadd_v4f16 = call half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef)
-; FP16-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %fadd_v8f16 = call half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef)
-; FP16-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %fadd_v4f32 = call float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
-; FP16-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %fadd_v8f32 = call float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
-; FP16-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %fadd_v2f64 = call double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
-; FP16-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %fadd_v4f64 = call double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
+; FP16-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %fadd_v4f16 = call half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef)
+; FP16-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %fadd_v8f16 = call half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef)
+; FP16-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %fadd_v4f32 = call float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
+; FP16-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %fadd_v8f32 = call float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
+; FP16-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %fadd_v2f64 = call double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
+; FP16-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %fadd_v4f64 = call double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
 ; FP16-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %fadd_v4f8 = call bfloat @llvm.vector.reduce.fadd.v4bf16(bfloat 0xR0000, <4 x bfloat> undef)
 ; FP16-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %fadd_v4f128 = call fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef)
 ; FP16-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; BF16-LABEL: 'strict_fp_reductions'
-; BF16-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %fadd_v4f16 = call half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef)
-; BF16-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %fadd_v8f16 = call half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef)
-; BF16-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %fadd_v4f32 = call float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
-; BF16-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %fadd_v8f32 = call float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
-; BF16-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %fadd_v2f64 = call double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
-; BF16-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %fadd_v4f64 = call double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
-; BF16-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %fadd_v4f8 = call bfloat @llvm.vector.reduce.fadd.v4bf16(bfloat 0xR0000, <4 x bfloat> undef)
+; BF16-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %fadd_v4f16 = call half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef)
+; BF16-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %fadd_v8f16 = call half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef)
+; BF16-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %fadd_v4f32 = call float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
+; BF16-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %fadd_v8f32 = call float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
+; BF16-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %fadd_v2f64 = call double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
+; BF16-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %fadd_v4f64 = call double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
+; BF16-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %fadd_v4f8 = call bfloat @llvm.vector.reduce.fadd.v4bf16(bfloat 0xR0000, <4 x bfloat> undef)
 ; BF16-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %fadd_v4f128 = call fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef)
 ; BF16-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
@@ -58,20 +58,20 @@ define void @fast_fp_reductions() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %fadd_v4f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %fadd_v8f16 = call fast half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %fadd_v8f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %fadd_v11f16 = call fast half @llvm.vector.reduce.fadd.v11f16(half 0xH0000, <11 x half> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %fadd_v13f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v13f16(half 0xH0000, <13 x half> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %fadd_v11f16 = call fast half @llvm.vector.reduce.fadd.v11f16(half 0xH0000, <11 x half> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %fadd_v13f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v13f16(half 0xH0000, <13 x half> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f32 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %fadd_v8f32 = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %fadd_v8f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %fadd_v13f32 = call fast float @llvm.vector.reduce.fadd.v13f32(float 0.000000e+00, <13 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %fadd_v5f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v5f32(float 0.000000e+00, <5 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %fadd_v8f32 = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %fadd_v8f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %fadd_v13f32 = call fast float @llvm.vector.reduce.fadd.v13f32(float 0.000000e+00, <13 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %fadd_v5f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v5f32(float 0.000000e+00, <5 x float> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f64 = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %fadd_v4f64 = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %fadd_v4f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %fadd_v7f64 = call fast double @llvm.vector.reduce.fadd.v7f64(double 0.000000e+00, <7 x double> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %fadd_v9f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v9f64(double 0.000000e+00, <9 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %fadd_v4f64 = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %fadd_v4f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %fadd_v7f64 = call fast double @llvm.vector.reduce.fadd.v7f64(double 0.000000e+00, <7 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %fadd_v9f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v9f64(double 0.000000e+00, <9 x double> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %fadd_v4f8 = call reassoc bfloat @llvm.vector.reduce.fadd.v4bf16(bfloat 0xR8000, <4 x bfloat> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %fadd_v4f128 = call reassoc fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
@@ -81,20 +81,20 @@ define void @fast_fp_reductions() {
 ; FP16-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef)
 ; FP16-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %fadd_v8f16 = call fast half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef)
 ; FP16-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %fadd_v8f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef)
-; FP16-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %fadd_v11f16 = call fast half @llvm.vector.reduce.fadd.v11f16(half 0xH0000, <11 x half> undef)
-; FP16-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %fadd_v13f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v13f16(half 0xH0000, <13 x half> undef)
+; FP16-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %fadd_v11f16 = call fast half @llvm.vector.reduce.fadd.v11f16(half 0xH0000, <11 x half> undef)
+; FP16-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %fadd_v13f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v13f16(half 0xH0000, <13 x half> undef)
 ; FP16-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f32 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
 ; FP16-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
-; FP16-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %fadd_v8f32 = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
-; FP16-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %fadd_v8f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
-; FP16-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %fadd_v13f32 = call fast float @llvm.vector.reduce.fadd.v13f32(float 0.000000e+00, <13 x float> undef)
-; FP16-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %fadd_v5f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v5f32(float 0.000000e+00, <5 x float> undef)
+; FP16-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %fadd_v8f32 = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
+; FP16-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %fadd_v8f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
+; FP16-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %fadd_v13f32 = call fast float @llvm.vector.reduce.fadd.v13f32(float 0.000000e+00, <13 x float> undef)
+; FP16-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %fadd_v5f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v5f32(float 0.000000e+00, <5 x float> undef)
 ; FP16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f64 = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
 ; FP16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
-; FP16-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %fadd_v4f64 = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
-; FP16-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %fadd_v4f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
-; FP16-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %fadd_v7f64 = call fast double @llvm.vector.reduce.fadd.v7f64(double 0.000000e+00, <7 x double> undef)
-; FP16-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %fadd_v9f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v9f64(double 0.000000e+00, <9 x double> undef)
+; FP16-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %fadd_v4f64 = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
+; FP16-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %fadd_v4f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
+; FP16-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %fadd_v7f64 = call fast double @llvm.vector.reduce.fadd.v7f64(double 0.000000e+00, <7 x double> undef)
+; FP16-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %fadd_v9f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v9f64(double 0.000000e+00, <9 x double> undef)
 ; FP16-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %fadd_v4f8 = call reassoc bfloat @llvm.vector.reduce.fadd.v4bf16(bfloat 0xR8000, <4 x bfloat> undef)
 ; FP16-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %fadd_v4f128 = call reassoc fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef)
 ; FP16-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
@@ -104,20 +104,20 @@ define void @fast_fp_reductions() {
 ; BF16-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %fadd_v4f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef)
 ; BF16-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %fadd_v8f16 = call fast half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef)
 ; BF16-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %fadd_v8f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef)
-; BF16-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %fadd_v11f16 = call fast half @llvm.vector.reduce.fadd.v11f16(half 0xH0000, <11 x half> undef)
-; BF16-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %fadd_v13f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v13f16(half 0xH0000, <13 x half> undef)
+; BF16-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %fadd_v11f16 = call fast half @llvm.vector.reduce.fadd.v11f16(half 0xH0000, <11 x half> undef)
+; BF16-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %fadd_v13f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v13f16(half 0xH0000, <13 x half> undef)
 ; BF16-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f32 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
 ; BF16-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
-; BF16-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %fadd_v8f32 = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
-; BF16-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %fadd_v8f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
-; BF16-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %fadd_v13f32 = call fast float @llvm.vector.reduce.fadd.v13f32(float 0.000000e+00, <13 x float> undef)
-; BF16-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %fadd_v5f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v5f32(float 0.000000e+00, <5 x float> undef)
+; BF16-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %fadd_v8f32 = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
+; BF16-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %fadd_v8f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
+; BF16-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %fadd_v13f32 = call fast float @llvm.vector.reduce.fadd.v13f32(float 0.000000e+00, <13 x float> undef)
+; BF16-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %fadd_v5f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v5f32(float 0.000000e+00, <5 x float> undef)
 ; BF16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f64 = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
 ; BF16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
-; BF16-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %fadd_v4f64 = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
-; BF16-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %fadd_v4f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
-; BF16-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %fadd_v7f64 = call fast double @llvm.vector.reduce.fadd.v7f64(double 0.000000e+00, <7 x double> undef)
-; BF16-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %fadd_v9f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v9f64(double 0.000000e+00, <9 x double> undef)
+; BF16-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %fadd_v4f64 = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
+; BF16-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %fadd_v4f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
+; BF16-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %fadd_v7f64 = call fast double @llvm.vector.reduce.fadd.v7f64(double 0.000000e+00, <7 x double> undef)
+; BF16-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %fadd_v9f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v9f64(double 0.000000e+00, <9 x double> undef)
 ; BF16-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f8 = call reassoc bfloat @llvm.vector.reduce.fadd.v4bf16(bfloat 0xR8000, <4 x bfloat> undef)
 ; BF16-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %fadd_v4f128 = call reassoc fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef)
 ; BF16-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void

diff  --git a/llvm/test/Analysis/CostModel/AArch64/reduce-minmax.ll b/llvm/test/Analysis/CostModel/AArch64/reduce-minmax.ll
index f5a0d198f65022..dcca118871418d 100644
--- a/llvm/test/Analysis/CostModel/AArch64/reduce-minmax.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/reduce-minmax.ll
@@ -162,14 +162,14 @@ define void @reduce_smax() {
 
 define void @reduce_fmin16() {
 ; CHECK-NOF16-LABEL: 'reduce_fmin16'
-; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2f16 = call half @llvm.vector.reduce.fmin.v2f16(<2 x half> undef)
-; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V4f16 = call half @llvm.vector.reduce.fmin.v4f16(<4 x half> undef)
-; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 237 for instruction: %V8f16 = call half @llvm.vector.reduce.fmin.v8f16(<8 x half> undef)
-; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 350 for instruction: %V16f16 = call half @llvm.vector.reduce.fmin.v16f16(<16 x half> undef)
-; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2f16m = call half @llvm.vector.reduce.fminimum.v2f16(<2 x half> undef)
-; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V4f16m = call half @llvm.vector.reduce.fminimum.v4f16(<4 x half> undef)
-; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 237 for instruction: %V8f16m = call half @llvm.vector.reduce.fminimum.v8f16(<8 x half> undef)
-; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 350 for instruction: %V16f16m = call half @llvm.vector.reduce.fminimum.v16f16(<16 x half> undef)
+; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2f16 = call half @llvm.vector.reduce.fmin.v2f16(<2 x half> undef)
+; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V4f16 = call half @llvm.vector.reduce.fmin.v4f16(<4 x half> undef)
+; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 174 for instruction: %V8f16 = call half @llvm.vector.reduce.fmin.v8f16(<8 x half> undef)
+; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 252 for instruction: %V16f16 = call half @llvm.vector.reduce.fmin.v16f16(<16 x half> undef)
+; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2f16m = call half @llvm.vector.reduce.fminimum.v2f16(<2 x half> undef)
+; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V4f16m = call half @llvm.vector.reduce.fminimum.v4f16(<4 x half> undef)
+; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 174 for instruction: %V8f16m = call half @llvm.vector.reduce.fminimum.v8f16(<8 x half> undef)
+; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 252 for instruction: %V16f16m = call half @llvm.vector.reduce.fminimum.v16f16(<16 x half> undef)
 ; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-F16-LABEL: 'reduce_fmin16'
@@ -196,14 +196,14 @@ define void @reduce_fmin16() {
 
 define void @reduce_fmax16() {
 ; CHECK-NOF16-LABEL: 'reduce_fmax16'
-; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2f16 = call half @llvm.vector.reduce.fmax.v2f16(<2 x half> undef)
-; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V4f16 = call half @llvm.vector.reduce.fmax.v4f16(<4 x half> undef)
-; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 237 for instruction: %V8f16 = call half @llvm.vector.reduce.fmax.v8f16(<8 x half> undef)
-; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 350 for instruction: %V16f16 = call half @llvm.vector.reduce.fmax.v16f16(<16 x half> undef)
-; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2f16m = call half @llvm.vector.reduce.fmaximum.v2f16(<2 x half> undef)
-; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V4f16m = call half @llvm.vector.reduce.fmaximum.v4f16(<4 x half> undef)
-; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 237 for instruction: %V8f16m = call half @llvm.vector.reduce.fmaximum.v8f16(<8 x half> undef)
-; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 350 for instruction: %V16f16m = call half @llvm.vector.reduce.fmaximum.v16f16(<16 x half> undef)
+; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2f16 = call half @llvm.vector.reduce.fmax.v2f16(<2 x half> undef)
+; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V4f16 = call half @llvm.vector.reduce.fmax.v4f16(<4 x half> undef)
+; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 174 for instruction: %V8f16 = call half @llvm.vector.reduce.fmax.v8f16(<8 x half> undef)
+; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 252 for instruction: %V16f16 = call half @llvm.vector.reduce.fmax.v16f16(<16 x half> undef)
+; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2f16m = call half @llvm.vector.reduce.fmaximum.v2f16(<2 x half> undef)
+; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V4f16m = call half @llvm.vector.reduce.fmaximum.v4f16(<4 x half> undef)
+; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 174 for instruction: %V8f16m = call half @llvm.vector.reduce.fmaximum.v8f16(<8 x half> undef)
+; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 252 for instruction: %V16f16m = call half @llvm.vector.reduce.fmaximum.v16f16(<16 x half> undef)
 ; CHECK-NOF16-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-F16-LABEL: 'reduce_fmax16'

diff  --git a/llvm/test/Analysis/CostModel/AArch64/rem.ll b/llvm/test/Analysis/CostModel/AArch64/rem.ll
index ce5ff1e8737618..7519bf1f7c5c6f 100644
--- a/llvm/test/Analysis/CostModel/AArch64/rem.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/rem.ll
@@ -6,21 +6,21 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 define i32 @srem() {
 ; CHECK-LABEL: 'srem'
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = srem i64 undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2i64 = srem <2 x i64> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4i64 = srem <4 x i64> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V8i64 = srem <8 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i64 = srem <2 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i64 = srem <4 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i64 = srem <8 x i64> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = srem i32 undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V4i32 = srem <4 x i32> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V8i32 = srem <8 x i32> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 120 for instruction: %V16i32 = srem <16 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i32 = srem <4 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i32 = srem <8 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i32 = srem <16 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = srem i16 undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V8i16 = srem <8 x i16> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V16i16 = srem <16 x i16> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 264 for instruction: %V32i16 = srem <32 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i16 = srem <8 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i16 = srem <16 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i16 = srem <32 x i16> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = srem i8 undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 138 for instruction: %V16i8 = srem <16 x i8> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 276 for instruction: %V32i8 = srem <32 x i8> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 552 for instruction: %V64i8 = srem <64 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i8 = srem <16 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i8 = srem <32 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 448 for instruction: %V64i8 = srem <64 x i8> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I64 = srem i64 undef, undef
@@ -49,21 +49,21 @@ define i32 @srem() {
 define i32 @urem() {
 ; CHECK-LABEL: 'urem'
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = urem i64 undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2i64 = urem <2 x i64> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4i64 = urem <4 x i64> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V8i64 = urem <8 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i64 = urem <2 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i64 = urem <4 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i64 = urem <8 x i64> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = urem i32 undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V4i32 = urem <4 x i32> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V8i32 = urem <8 x i32> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 120 for instruction: %V16i32 = urem <16 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i32 = urem <4 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i32 = urem <8 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i32 = urem <16 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = urem i16 undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V8i16 = urem <8 x i16> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V16i16 = urem <16 x i16> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 264 for instruction: %V32i16 = urem <32 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i16 = urem <8 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i16 = urem <16 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i16 = urem <32 x i16> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = urem i8 undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 138 for instruction: %V16i8 = urem <16 x i8> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 276 for instruction: %V32i8 = urem <32 x i8> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 552 for instruction: %V64i8 = urem <64 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i8 = urem <16 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i8 = urem <32 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 448 for instruction: %V64i8 = urem <64 x i8> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I64 = urem i64 undef, undef
@@ -92,21 +92,21 @@ define i32 @urem() {
 define i32 @srem_const() {
 ; CHECK-LABEL: 'srem_const'
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %I64 = srem i64 undef, 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2i64 = srem <2 x i64> undef, <i64 6, i64 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4i64 = srem <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V8i64 = srem <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i64 = srem <2 x i64> undef, <i64 6, i64 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i64 = srem <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i64 = srem <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = srem i32 undef, 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V4i32 = srem <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V8i32 = srem <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 120 for instruction: %V16i32 = srem <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i32 = srem <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i32 = srem <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i32 = srem <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = srem i16 undef, 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V8i16 = srem <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V16i16 = srem <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 264 for instruction: %V32i16 = srem <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i16 = srem <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i16 = srem <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i16 = srem <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = srem i8 undef, 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 138 for instruction: %V16i8 = srem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 276 for instruction: %V32i8 = srem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 552 for instruction: %V64i8 = srem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i8 = srem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i8 = srem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 448 for instruction: %V64i8 = srem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I64 = srem i64 undef, 7
@@ -135,21 +135,21 @@ define i32 @srem_const() {
 define i32 @urem_const() {
 ; CHECK-LABEL: 'urem_const'
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %I64 = urem i64 undef, 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2i64 = urem <2 x i64> undef, <i64 6, i64 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4i64 = urem <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V8i64 = urem <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i64 = urem <2 x i64> undef, <i64 6, i64 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i64 = urem <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i64 = urem <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = urem i32 undef, 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V4i32 = urem <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V8i32 = urem <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 120 for instruction: %V16i32 = urem <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i32 = urem <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i32 = urem <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i32 = urem <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = urem i16 undef, 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V8i16 = urem <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V16i16 = urem <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 264 for instruction: %V32i16 = urem <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i16 = urem <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i16 = urem <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i16 = urem <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = urem i8 undef, 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 138 for instruction: %V16i8 = urem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 276 for instruction: %V32i8 = urem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 552 for instruction: %V64i8 = urem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i8 = urem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i8 = urem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 448 for instruction: %V64i8 = urem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I64 = urem i64 undef, 7
@@ -178,21 +178,21 @@ define i32 @urem_const() {
 define i32 @srem_uniformconst() {
 ; CHECK-LABEL: 'srem_uniformconst'
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %I64 = srem i64 undef, 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V2i64 = srem <2 x i64> undef, <i64 7, i64 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i64 = srem <4 x i64> undef, <i64 7, i64 7, i64 7, i64 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V8i64 = srem <8 x i64> undef, <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V2i64 = srem <2 x i64> undef, <i64 7, i64 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V4i64 = srem <4 x i64> undef, <i64 7, i64 7, i64 7, i64 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V8i64 = srem <8 x i64> undef, <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = srem i32 undef, 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V4i32 = srem <4 x i32> undef, <i32 7, i32 7, i32 7, i32 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V8i32 = srem <8 x i32> undef, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 120 for instruction: %V16i32 = srem <16 x i32> undef, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i32 = srem <4 x i32> undef, <i32 7, i32 7, i32 7, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i32 = srem <8 x i32> undef, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i32 = srem <16 x i32> undef, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = srem i16 undef, 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V8i16 = srem <8 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V16i16 = srem <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 264 for instruction: %V32i16 = srem <32 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i16 = srem <8 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i16 = srem <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i16 = srem <32 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = srem i8 undef, 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 138 for instruction: %V16i8 = srem <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 276 for instruction: %V32i8 = srem <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 552 for instruction: %V64i8 = srem <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i8 = srem <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i8 = srem <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 448 for instruction: %V64i8 = srem <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I64 = srem i64 undef, 7
@@ -221,21 +221,21 @@ define i32 @srem_uniformconst() {
 define i32 @urem_uniformconst() {
 ; CHECK-LABEL: 'urem_uniformconst'
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %I64 = urem i64 undef, 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V2i64 = urem <2 x i64> undef, <i64 7, i64 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i64 = urem <4 x i64> undef, <i64 7, i64 7, i64 7, i64 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V8i64 = urem <8 x i64> undef, <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V2i64 = urem <2 x i64> undef, <i64 7, i64 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V4i64 = urem <4 x i64> undef, <i64 7, i64 7, i64 7, i64 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V8i64 = urem <8 x i64> undef, <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = urem i32 undef, 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V4i32 = urem <4 x i32> undef, <i32 7, i32 7, i32 7, i32 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V8i32 = urem <8 x i32> undef, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 120 for instruction: %V16i32 = urem <16 x i32> undef, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i32 = urem <4 x i32> undef, <i32 7, i32 7, i32 7, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i32 = urem <8 x i32> undef, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i32 = urem <16 x i32> undef, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = urem i16 undef, 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V8i16 = urem <8 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V16i16 = urem <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 264 for instruction: %V32i16 = urem <32 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i16 = urem <8 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i16 = urem <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i16 = urem <32 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = urem i8 undef, 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 138 for instruction: %V16i8 = urem <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 276 for instruction: %V32i8 = urem <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 552 for instruction: %V64i8 = urem <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i8 = urem <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i8 = urem <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 448 for instruction: %V64i8 = urem <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I64 = urem i64 undef, 7
@@ -264,21 +264,21 @@ define i32 @urem_uniformconst() {
 define i32 @srem_constpow2() {
 ; CHECK-LABEL: 'srem_constpow2'
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = srem i64 undef, 16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2i64 = srem <2 x i64> undef, <i64 8, i64 16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4i64 = srem <4 x i64> undef, <i64 2, i64 4, i64 8, i64 16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V8i64 = srem <8 x i64> undef, <i64 2, i64 4, i64 8, i64 16, i64 32, i64 64, i64 128, i64 256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i64 = srem <2 x i64> undef, <i64 8, i64 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i64 = srem <4 x i64> undef, <i64 2, i64 4, i64 8, i64 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i64 = srem <8 x i64> undef, <i64 2, i64 4, i64 8, i64 16, i64 32, i64 64, i64 128, i64 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = srem i32 undef, 16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V4i32 = srem <4 x i32> undef, <i32 2, i32 4, i32 8, i32 16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V8i32 = srem <8 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 120 for instruction: %V16i32 = srem <16 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256, i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i32 = srem <4 x i32> undef, <i32 2, i32 4, i32 8, i32 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i32 = srem <8 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i32 = srem <16 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256, i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = srem i16 undef, 16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V8i16 = srem <8 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V16i16 = srem <16 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 264 for instruction: %V32i16 = srem <32 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i16 = srem <8 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i16 = srem <16 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i16 = srem <32 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = srem i8 undef, 16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 138 for instruction: %V16i8 = srem <16 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 276 for instruction: %V32i8 = srem <32 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 552 for instruction: %V64i8 = srem <64 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i8 = srem <16 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i8 = srem <32 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 448 for instruction: %V64i8 = srem <64 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I64 = srem i64 undef, 16
@@ -307,21 +307,21 @@ define i32 @srem_constpow2() {
 define i32 @urem_constpow2() {
 ; CHECK-LABEL: 'urem_constpow2'
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %I64 = urem i64 undef, 16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2i64 = urem <2 x i64> undef, <i64 8, i64 16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4i64 = urem <4 x i64> undef, <i64 2, i64 4, i64 8, i64 16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V8i64 = urem <8 x i64> undef, <i64 2, i64 4, i64 8, i64 16, i64 32, i64 64, i64 128, i64 256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i64 = urem <2 x i64> undef, <i64 8, i64 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i64 = urem <4 x i64> undef, <i64 2, i64 4, i64 8, i64 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i64 = urem <8 x i64> undef, <i64 2, i64 4, i64 8, i64 16, i64 32, i64 64, i64 128, i64 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = urem i32 undef, 16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V4i32 = urem <4 x i32> undef, <i32 2, i32 4, i32 8, i32 16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V8i32 = urem <8 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 120 for instruction: %V16i32 = urem <16 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256, i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i32 = urem <4 x i32> undef, <i32 2, i32 4, i32 8, i32 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i32 = urem <8 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i32 = urem <16 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256, i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = urem i16 undef, 16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V8i16 = urem <8 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V16i16 = urem <16 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 264 for instruction: %V32i16 = urem <32 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i16 = urem <8 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i16 = urem <16 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i16 = urem <32 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = urem i8 undef, 16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 138 for instruction: %V16i8 = urem <16 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 276 for instruction: %V32i8 = urem <32 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 552 for instruction: %V64i8 = urem <64 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i8 = urem <16 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i8 = urem <32 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 448 for instruction: %V64i8 = urem <64 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I64 = urem i64 undef, 16
@@ -350,21 +350,21 @@ define i32 @urem_constpow2() {
 define i32 @srem_uniformconstpow2() {
 ; CHECK-LABEL: 'srem_uniformconstpow2'
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = srem i64 undef, 16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V2i64 = srem <2 x i64> undef, <i64 16, i64 16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V4i64 = srem <4 x i64> undef, <i64 16, i64 16, i64 16, i64 16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V8i64 = srem <8 x i64> undef, <i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V2i64 = srem <2 x i64> undef, <i64 16, i64 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V4i64 = srem <4 x i64> undef, <i64 16, i64 16, i64 16, i64 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8i64 = srem <8 x i64> undef, <i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = srem i32 undef, 16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V4i32 = srem <4 x i32> undef, <i32 16, i32 16, i32 16, i32 16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %V8i32 = srem <8 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 184 for instruction: %V16i32 = srem <16 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V4i32 = srem <4 x i32> undef, <i32 16, i32 16, i32 16, i32 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8i32 = srem <8 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 176 for instruction: %V16i32 = srem <16 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = srem i16 undef, 16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %V8i16 = srem <8 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 196 for instruction: %V16i16 = srem <16 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 392 for instruction: %V32i16 = srem <32 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8i16 = srem <8 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 176 for instruction: %V16i16 = srem <16 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 352 for instruction: %V32i16 = srem <32 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = srem i8 undef, 16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 202 for instruction: %V16i8 = srem <16 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 404 for instruction: %V32i8 = srem <32 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 808 for instruction: %V64i8 = srem <64 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 176 for instruction: %V16i8 = srem <16 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 352 for instruction: %V32i8 = srem <32 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 704 for instruction: %V64i8 = srem <64 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I64 = srem i64 undef, 16
@@ -393,21 +393,21 @@ define i32 @srem_uniformconstpow2() {
 define i32 @urem_uniformconstpow2() {
 ; CHECK-LABEL: 'urem_uniformconstpow2'
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %I64 = urem i64 undef, 16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V2i64 = urem <2 x i64> undef, <i64 16, i64 16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i64 = urem <4 x i64> undef, <i64 16, i64 16, i64 16, i64 16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V8i64 = urem <8 x i64> undef, <i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V2i64 = urem <2 x i64> undef, <i64 16, i64 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V4i64 = urem <4 x i64> undef, <i64 16, i64 16, i64 16, i64 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V8i64 = urem <8 x i64> undef, <i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = urem i32 undef, 16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V4i32 = urem <4 x i32> undef, <i32 16, i32 16, i32 16, i32 16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V8i32 = urem <8 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 120 for instruction: %V16i32 = urem <16 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i32 = urem <4 x i32> undef, <i32 16, i32 16, i32 16, i32 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i32 = urem <8 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i32 = urem <16 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = urem i16 undef, 16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V8i16 = urem <8 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V16i16 = urem <16 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 264 for instruction: %V32i16 = urem <32 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i16 = urem <8 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i16 = urem <16 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i16 = urem <32 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = urem i8 undef, 16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 138 for instruction: %V16i8 = urem <16 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 276 for instruction: %V32i8 = urem <32 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 552 for instruction: %V64i8 = urem <64 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i8 = urem <16 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i8 = urem <32 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 448 for instruction: %V64i8 = urem <64 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I64 = urem i64 undef, 16
@@ -436,21 +436,21 @@ define i32 @urem_uniformconstpow2() {
 define i32 @srem_constnegpow2() {
 ; CHECK-LABEL: 'srem_constnegpow2'
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %I64 = srem i64 undef, -16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2i64 = srem <2 x i64> undef, <i64 -8, i64 -16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4i64 = srem <4 x i64> undef, <i64 -2, i64 -4, i64 -8, i64 -16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V8i64 = srem <8 x i64> undef, <i64 -2, i64 -4, i64 -8, i64 -16, i64 -32, i64 -64, i64 -128, i64 -256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i64 = srem <2 x i64> undef, <i64 -8, i64 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i64 = srem <4 x i64> undef, <i64 -2, i64 -4, i64 -8, i64 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i64 = srem <8 x i64> undef, <i64 -2, i64 -4, i64 -8, i64 -16, i64 -32, i64 -64, i64 -128, i64 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = srem i32 undef, -16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V4i32 = srem <4 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V8i32 = srem <8 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 120 for instruction: %V16i32 = srem <16 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256, i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i32 = srem <4 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i32 = srem <8 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i32 = srem <16 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256, i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = srem i16 undef, -16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V8i16 = srem <8 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V16i16 = srem <16 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 264 for instruction: %V32i16 = srem <32 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i16 = srem <8 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i16 = srem <16 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i16 = srem <32 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = srem i8 undef, -16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 138 for instruction: %V16i8 = srem <16 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 276 for instruction: %V32i8 = srem <32 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 552 for instruction: %V64i8 = srem <64 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i8 = srem <16 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i8 = srem <32 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 448 for instruction: %V64i8 = srem <64 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I64 = srem i64 undef, -16
@@ -479,21 +479,21 @@ define i32 @srem_constnegpow2() {
 define i32 @urem_constnegpow2() {
 ; CHECK-LABEL: 'urem_constnegpow2'
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %I64 = urem i64 undef, -16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2i64 = urem <2 x i64> undef, <i64 -8, i64 -16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4i64 = urem <4 x i64> undef, <i64 -2, i64 -4, i64 -8, i64 -16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V8i64 = urem <8 x i64> undef, <i64 -2, i64 -4, i64 -8, i64 -16, i64 -32, i64 -64, i64 -128, i64 -256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i64 = urem <2 x i64> undef, <i64 -8, i64 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i64 = urem <4 x i64> undef, <i64 -2, i64 -4, i64 -8, i64 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i64 = urem <8 x i64> undef, <i64 -2, i64 -4, i64 -8, i64 -16, i64 -32, i64 -64, i64 -128, i64 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = urem i32 undef, -16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V4i32 = urem <4 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V8i32 = urem <8 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 120 for instruction: %V16i32 = urem <16 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256, i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i32 = urem <4 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i32 = urem <8 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i32 = urem <16 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256, i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = urem i16 undef, -16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V8i16 = urem <8 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V16i16 = urem <16 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 264 for instruction: %V32i16 = urem <32 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i16 = urem <8 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i16 = urem <16 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i16 = urem <32 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = urem i8 undef, -16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 138 for instruction: %V16i8 = urem <16 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 276 for instruction: %V32i8 = urem <32 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 552 for instruction: %V64i8 = urem <64 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i8 = urem <16 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i8 = urem <32 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 448 for instruction: %V64i8 = urem <64 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I64 = urem i64 undef, -16
@@ -522,21 +522,21 @@ define i32 @urem_constnegpow2() {
 define i32 @srem_uniformconstnegpow2() {
 ; CHECK-LABEL: 'srem_uniformconstnegpow2'
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %I64 = srem i64 undef, -16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V2i64 = srem <2 x i64> undef, <i64 -16, i64 -16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i64 = srem <4 x i64> undef, <i64 -16, i64 -16, i64 -16, i64 -16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V8i64 = srem <8 x i64> undef, <i64 -16, i64 -16, i64 -16, i64 -16, i64 -16, i64 -16, i64 -16, i64 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V2i64 = srem <2 x i64> undef, <i64 -16, i64 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V4i64 = srem <4 x i64> undef, <i64 -16, i64 -16, i64 -16, i64 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V8i64 = srem <8 x i64> undef, <i64 -16, i64 -16, i64 -16, i64 -16, i64 -16, i64 -16, i64 -16, i64 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = srem i32 undef, -16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V4i32 = srem <4 x i32> undef, <i32 -16, i32 -16, i32 -16, i32 -16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V8i32 = srem <8 x i32> undef, <i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 120 for instruction: %V16i32 = srem <16 x i32> undef, <i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i32 = srem <4 x i32> undef, <i32 -16, i32 -16, i32 -16, i32 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i32 = srem <8 x i32> undef, <i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i32 = srem <16 x i32> undef, <i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = srem i16 undef, -16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V8i16 = srem <8 x i16> undef, <i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V16i16 = srem <16 x i16> undef, <i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 264 for instruction: %V32i16 = srem <32 x i16> undef, <i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i16 = srem <8 x i16> undef, <i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i16 = srem <16 x i16> undef, <i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i16 = srem <32 x i16> undef, <i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = srem i8 undef, -16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 138 for instruction: %V16i8 = srem <16 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 276 for instruction: %V32i8 = srem <32 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 552 for instruction: %V64i8 = srem <64 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i8 = srem <16 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i8 = srem <32 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 448 for instruction: %V64i8 = srem <64 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I64 = srem i64 undef, -16
@@ -565,21 +565,21 @@ define i32 @srem_uniformconstnegpow2() {
 define i32 @urem_uniformconstnegpow2() {
 ; CHECK-LABEL: 'urem_uniformconstnegpow2'
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %I64 = urem i64 undef, -16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V2i64 = urem <2 x i64> undef, <i64 -16, i64 -16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i64 = urem <4 x i64> undef, <i64 -16, i64 -16, i64 -16, i64 -16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V8i64 = urem <8 x i64> undef, <i64 -16, i64 -16, i64 -16, i64 -16, i64 -16, i64 -16, i64 -16, i64 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V2i64 = urem <2 x i64> undef, <i64 -16, i64 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V4i64 = urem <4 x i64> undef, <i64 -16, i64 -16, i64 -16, i64 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V8i64 = urem <8 x i64> undef, <i64 -16, i64 -16, i64 -16, i64 -16, i64 -16, i64 -16, i64 -16, i64 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = urem i32 undef, -16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V4i32 = urem <4 x i32> undef, <i32 -16, i32 -16, i32 -16, i32 -16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V8i32 = urem <8 x i32> undef, <i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 120 for instruction: %V16i32 = urem <16 x i32> undef, <i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i32 = urem <4 x i32> undef, <i32 -16, i32 -16, i32 -16, i32 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i32 = urem <8 x i32> undef, <i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i32 = urem <16 x i32> undef, <i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = urem i16 undef, -16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V8i16 = urem <8 x i16> undef, <i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V16i16 = urem <16 x i16> undef, <i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 264 for instruction: %V32i16 = urem <32 x i16> undef, <i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i16 = urem <8 x i16> undef, <i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i16 = urem <16 x i16> undef, <i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i16 = urem <32 x i16> undef, <i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = urem i8 undef, -16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 138 for instruction: %V16i8 = urem <16 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 276 for instruction: %V32i8 = urem <32 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 552 for instruction: %V64i8 = urem <64 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i8 = urem <16 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i8 = urem <32 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 448 for instruction: %V64i8 = urem <64 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I64 = urem i64 undef, -16

diff  --git a/llvm/test/Analysis/CostModel/AArch64/shuffle-load.ll b/llvm/test/Analysis/CostModel/AArch64/shuffle-load.ll
index b048f0e86be19e..ec95a313e11e20 100644
--- a/llvm/test/Analysis/CostModel/AArch64/shuffle-load.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/shuffle-load.ll
@@ -265,13 +265,13 @@ entry:
 define <8 x i8> @ld1r_8b_int_shuff(ptr nocapture %x) {
 ; CHECK-LABEL: 'ld1r_8b_int_shuff'
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %tmp = load i8, ptr %x, align 2
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %tmp1 = insertelement <8 x i8> undef, i8 %tmp, i8 0
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %tmp1 = insertelement <8 x i8> undef, i8 %tmp, i8 0
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %lane = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> zeroinitializer
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i8> %lane
 ;
 ; CODESIZE-LABEL: 'ld1r_8b_int_shuff'
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %tmp = load i8, ptr %x, align 2
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %tmp1 = insertelement <8 x i8> undef, i8 %tmp, i8 0
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %tmp1 = insertelement <8 x i8> undef, i8 %tmp, i8 0
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %lane = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> zeroinitializer
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i8> %lane
 ;
@@ -285,13 +285,13 @@ entry:
 define <16 x i8> @ld1r_16b_int_shuff(ptr nocapture %x) {
 ; CHECK-LABEL: 'ld1r_16b_int_shuff'
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %tmp = load i8, ptr %x, align 2
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %tmp1 = insertelement <16 x i8> undef, i8 %tmp, i8 0
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %tmp1 = insertelement <16 x i8> undef, i8 %tmp, i8 0
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %lane = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> zeroinitializer
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %lane
 ;
 ; CODESIZE-LABEL: 'ld1r_16b_int_shuff'
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %tmp = load i8, ptr %x, align 2
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %tmp1 = insertelement <16 x i8> undef, i8 %tmp, i8 0
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %tmp1 = insertelement <16 x i8> undef, i8 %tmp, i8 0
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %lane = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> zeroinitializer
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %lane
 ;
@@ -305,13 +305,13 @@ entry:
 define <4 x i16> @ld1r_4h_int_shuff(ptr nocapture %x) {
 ; CHECK-LABEL: 'ld1r_4h_int_shuff'
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %tmp = load i16, ptr %x, align 2
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %tmp1 = insertelement <4 x i16> undef, i16 %tmp, i16 0
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %tmp1 = insertelement <4 x i16> undef, i16 %tmp, i16 0
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %lane = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> zeroinitializer
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i16> %lane
 ;
 ; CODESIZE-LABEL: 'ld1r_4h_int_shuff'
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %tmp = load i16, ptr %x, align 2
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %tmp1 = insertelement <4 x i16> undef, i16 %tmp, i16 0
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %tmp1 = insertelement <4 x i16> undef, i16 %tmp, i16 0
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %lane = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> zeroinitializer
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i16> %lane
 ;
@@ -325,13 +325,13 @@ entry:
 define <8 x i16> @ld1r_8h_int_shuff(ptr nocapture %x) {
 ; CHECK-LABEL: 'ld1r_8h_int_shuff'
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %tmp = load i16, ptr %x, align 2
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %tmp1 = insertelement <8 x i16> undef, i16 %tmp, i16 0
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %tmp1 = insertelement <8 x i16> undef, i16 %tmp, i16 0
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %lane = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> zeroinitializer
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %lane
 ;
 ; CODESIZE-LABEL: 'ld1r_8h_int_shuff'
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %tmp = load i16, ptr %x, align 2
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %tmp1 = insertelement <8 x i16> undef, i16 %tmp, i16 0
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %tmp1 = insertelement <8 x i16> undef, i16 %tmp, i16 0
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %lane = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> zeroinitializer
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %lane
 ;
@@ -345,13 +345,13 @@ entry:
 define <2 x i32> @ld1r_2s_int_shuff(ptr nocapture %x) {
 ; CHECK-LABEL: 'ld1r_2s_int_shuff'
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %tmp = load i32, ptr %x, align 4
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %tmp1 = insertelement <2 x i32> undef, i32 %tmp, i32 0
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %tmp1 = insertelement <2 x i32> undef, i32 %tmp, i32 0
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %lane = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> zeroinitializer
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %lane
 ;
 ; CODESIZE-LABEL: 'ld1r_2s_int_shuff'
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %tmp = load i32, ptr %x, align 4
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %tmp1 = insertelement <2 x i32> undef, i32 %tmp, i32 0
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %tmp1 = insertelement <2 x i32> undef, i32 %tmp, i32 0
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %lane = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> zeroinitializer
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i32> %lane
 ;
@@ -365,13 +365,13 @@ entry:
 define <4 x i32> @ld1r_4s_int_shuff(ptr nocapture %x) {
 ; CHECK-LABEL: 'ld1r_4s_int_shuff'
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %tmp = load i32, ptr %x, align 4
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %tmp1 = insertelement <4 x i32> undef, i32 %tmp, i32 0
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %tmp1 = insertelement <4 x i32> undef, i32 %tmp, i32 0
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %lane = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <4 x i32> zeroinitializer
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %lane
 ;
 ; CODESIZE-LABEL: 'ld1r_4s_int_shuff'
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %tmp = load i32, ptr %x, align 4
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %tmp1 = insertelement <4 x i32> undef, i32 %tmp, i32 0
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %tmp1 = insertelement <4 x i32> undef, i32 %tmp, i32 0
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %lane = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <4 x i32> zeroinitializer
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %lane
 ;
@@ -385,13 +385,13 @@ entry:
 define <2 x i64> @ld1r_2d_int_shuff(ptr nocapture %x) {
 ; CHECK-LABEL: 'ld1r_2d_int_shuff'
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %tmp = load i64, ptr %x, align 8
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %tmp1 = insertelement <2 x i64> undef, i64 %tmp, i32 0
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %tmp1 = insertelement <2 x i64> undef, i64 %tmp, i32 0
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %lane = shufflevector <2 x i64> %tmp1, <2 x i64> undef, <2 x i32> zeroinitializer
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %lane
 ;
 ; CODESIZE-LABEL: 'ld1r_2d_int_shuff'
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %tmp = load i64, ptr %x, align 8
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %tmp1 = insertelement <2 x i64> undef, i64 %tmp, i32 0
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %tmp1 = insertelement <2 x i64> undef, i64 %tmp, i32 0
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %lane = shufflevector <2 x i64> %tmp1, <2 x i64> undef, <2 x i32> zeroinitializer
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i64> %lane
 ;

diff  --git a/llvm/test/Analysis/CostModel/AArch64/shuffle-other.ll b/llvm/test/Analysis/CostModel/AArch64/shuffle-other.ll
index dead3a6c4edc1d..141eaf42ad7844 100644
--- a/llvm/test/Analysis/CostModel/AArch64/shuffle-other.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/shuffle-other.ll
@@ -87,42 +87,42 @@ define void @concat() {
 
 define void @insert_subvec() {
 ; CHECK-LABEL: 'insert_subvec'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v4i8_2_0 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v4i8_2_0 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i8_2_1 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %v8i8_2_0 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %v8i8_2_0 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i8_2_1 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i8_2_2 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i8_2_3 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v8i8_2_05 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 0, i32 8, i32 9, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 90 for instruction: %v16i8_4_0 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8i8_2_05 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 0, i32 8, i32 9, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %v16i8_4_0 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_4_1 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_4_2 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_4_3 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %v16i8_4_05 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 18, i32 19, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v16i8_4_05 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 18, i32 19, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i16_2_0 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i16_2_1 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %v8i16_2_0 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %v8i16_2_0 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_2_1 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_2_2 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_2_3 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v8i16_2_05 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 0, i32 8, i32 9, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %v16i16_4_0 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8i16_2_05 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 0, i32 8, i32 9, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %v16i16_4_0 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_4_1 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_4_2 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_4_3 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %v16i16_4_05 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 18, i32 19, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v16i16_4_05 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 18, i32 19, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_2_0 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_2_1 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_2_0 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_2_1 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_2_2 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_2_3 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v8i32_2_05 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 0, i32 8, i32 9, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8i32_2_05 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 0, i32 8, i32 9, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16i32_4_0 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_4_1 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_4_2 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_4_3 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v16i32_4_05 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 18, i32 19, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v16i32_4_05 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 18, i32 19, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %v4i8_2_0 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
@@ -169,16 +169,16 @@ define void @insert_subvec() {
 
 define void @multipart() {
 ; CHECK-LABEL: 'multipart'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %v16a = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %v16a = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16b = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %v16c = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %v16d = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %v16c = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %v16d = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32a = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32a4 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32idrev = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 15, i32 14, i32 13, i32 12, i32 16, i32 17, i32 18, i32 19, i32 31, i32 30, i32 29, i32 28>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v32many = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v32many2 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 1, i32 4, i32 8, i32 12, i32 17, i32 20, i32 24, i32 28, i32 2, i32 6, i32 11, i32 14, i32 18, i32 22, i32 27, i32 30>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v323 = shufflevector <3 x i32> undef, <3 x i32> undef, <3 x i32> <i32 2, i32 3, i32 0>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v323 = shufflevector <3 x i32> undef, <3 x i32> undef, <3 x i32> <i32 2, i32 3, i32 0>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64a = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64b = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> zeroinitializer
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v64ab = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> <i32 1, i32 1, i32 0, i32 0>

diff  --git a/llvm/test/Analysis/CostModel/AArch64/shuffle-select.ll b/llvm/test/Analysis/CostModel/AArch64/shuffle-select.ll
index bcb4f0115a2e63..075397afdce791 100644
--- a/llvm/test/Analysis/CostModel/AArch64/shuffle-select.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/shuffle-select.ll
@@ -4,7 +4,7 @@
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 
 ; COST-LABEL: sel.v8i8
-; COST:       Found an estimated cost of 42 for instruction: %tmp0 = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+; COST:       Found an estimated cost of 28 for instruction: %tmp0 = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
 ; CODE-LABEL: sel.v8i8
 ; CODE:       tbl v0.8b, { v0.16b }, v1.8b
 define <8 x i8> @sel.v8i8(<8 x i8> %v0, <8 x i8> %v1) {
@@ -13,7 +13,7 @@ define <8 x i8> @sel.v8i8(<8 x i8> %v0, <8 x i8> %v1) {
 }
 
 ; COST-LABEL: sel.v16i8
-; COST:       Found an estimated cost of 90 for instruction: %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
+; COST:       Found an estimated cost of 60 for instruction: %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
 ; CODE-LABEL: sel.v16i8
 ; CODE:       tbl v0.16b, { v0.16b, v1.16b }, v2.16b
 define <16 x i8> @sel.v16i8(<16 x i8> %v0, <16 x i8> %v1) {
@@ -32,7 +32,7 @@ define <4 x i16> @sel.v4i16(<4 x i16> %v0, <4 x i16> %v1) {
 }
 
 ; COST-LABEL: sel.v8i16
-; COST:       Found an estimated cost of 42 for instruction: %tmp0 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+; COST:       Found an estimated cost of 28 for instruction: %tmp0 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
 ; CODE-LABEL: sel.v8i16
 ; CODE:       tbl v0.16b, { v0.16b, v1.16b }, v2.16b
 define <8 x i16> @sel.v8i16(<8 x i16> %v0, <8 x i16> %v1) {

diff  --git a/llvm/test/Analysis/CostModel/AArch64/sve-insert-extract.ll b/llvm/test/Analysis/CostModel/AArch64/sve-insert-extract.ll
index 29829a2ce9f2de..e6357dacf95d83 100644
--- a/llvm/test/Analysis/CostModel/AArch64/sve-insert-extract.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/sve-insert-extract.ll
@@ -9,11 +9,11 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 
 define void @ins_el0() #0 {
 ; CHECK-DEFAULT-LABEL: 'ins_el0'
-; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vi1 = insertelement <vscale x 16 x i1> zeroinitializer, i1 false, i64 0
-; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v0 = insertelement <vscale x 16 x i8> zeroinitializer, i8 0, i64 0
-; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v1 = insertelement <vscale x 8 x i16> zeroinitializer, i16 0, i64 0
-; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2 = insertelement <vscale x 4 x i32> zeroinitializer, i32 0, i64 0
-; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v3 = insertelement <vscale x 2 x i64> zeroinitializer, i64 0, i64 0
+; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vi1 = insertelement <vscale x 16 x i1> zeroinitializer, i1 false, i64 0
+; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v0 = insertelement <vscale x 16 x i8> zeroinitializer, i8 0, i64 0
+; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v1 = insertelement <vscale x 8 x i16> zeroinitializer, i16 0, i64 0
+; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2 = insertelement <vscale x 4 x i32> zeroinitializer, i32 0, i64 0
+; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3 = insertelement <vscale x 2 x i64> zeroinitializer, i64 0, i64 0
 ; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4 = insertelement <vscale x 4 x float> zeroinitializer, float 0.000000e+00, i64 0
 ; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v5 = insertelement <vscale x 2 x double> zeroinitializer, double 0.000000e+00, i64 0
 ; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
@@ -50,13 +50,13 @@ define void @ins_el0() #0 {
 
 define void @ins_el1() #0 {
 ; CHECK-DEFAULT-LABEL: 'ins_el1'
-; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vi1 = insertelement <vscale x 16 x i1> zeroinitializer, i1 false, i64 1
-; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v0 = insertelement <vscale x 16 x i8> zeroinitializer, i8 0, i64 1
-; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v1 = insertelement <vscale x 8 x i16> zeroinitializer, i16 0, i64 1
-; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2 = insertelement <vscale x 4 x i32> zeroinitializer, i32 0, i64 1
-; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v3 = insertelement <vscale x 2 x i64> zeroinitializer, i64 0, i64 1
-; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4 = insertelement <vscale x 4 x float> zeroinitializer, float 0.000000e+00, i64 1
-; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v5 = insertelement <vscale x 2 x double> zeroinitializer, double 0.000000e+00, i64 1
+; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vi1 = insertelement <vscale x 16 x i1> zeroinitializer, i1 false, i64 1
+; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v0 = insertelement <vscale x 16 x i8> zeroinitializer, i8 0, i64 1
+; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v1 = insertelement <vscale x 8 x i16> zeroinitializer, i16 0, i64 1
+; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2 = insertelement <vscale x 4 x i32> zeroinitializer, i32 0, i64 1
+; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3 = insertelement <vscale x 2 x i64> zeroinitializer, i64 0, i64 1
+; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4 = insertelement <vscale x 4 x float> zeroinitializer, float 0.000000e+00, i64 1
+; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v5 = insertelement <vscale x 2 x double> zeroinitializer, double 0.000000e+00, i64 1
 ; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-LOW-LABEL: 'ins_el1'
@@ -92,11 +92,11 @@ define void @ins_el1() #0 {
 
 define void @ext_el0() #0 {
 ; CHECK-DEFAULT-LABEL: 'ext_el0'
-; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vi1 = extractelement <vscale x 16 x i1> zeroinitializer, i64 0
-; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v0 = extractelement <vscale x 16 x i8> zeroinitializer, i64 0
-; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v1 = extractelement <vscale x 8 x i16> zeroinitializer, i64 0
-; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2 = extractelement <vscale x 4 x i32> zeroinitializer, i64 0
-; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v3 = extractelement <vscale x 2 x i64> zeroinitializer, i64 0
+; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vi1 = extractelement <vscale x 16 x i1> zeroinitializer, i64 0
+; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v0 = extractelement <vscale x 16 x i8> zeroinitializer, i64 0
+; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v1 = extractelement <vscale x 8 x i16> zeroinitializer, i64 0
+; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2 = extractelement <vscale x 4 x i32> zeroinitializer, i64 0
+; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3 = extractelement <vscale x 2 x i64> zeroinitializer, i64 0
 ; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4 = extractelement <vscale x 4 x float> zeroinitializer, i64 0
 ; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v5 = extractelement <vscale x 2 x double> zeroinitializer, i64 0
 ; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
@@ -133,13 +133,13 @@ define void @ext_el0() #0 {
 
 define void @ext_el1() #0 {
 ; CHECK-DEFAULT-LABEL: 'ext_el1'
-; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vi1 = extractelement <vscale x 16 x i1> zeroinitializer, i64 1
-; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v0 = extractelement <vscale x 16 x i8> zeroinitializer, i64 1
-; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v1 = extractelement <vscale x 8 x i16> zeroinitializer, i64 1
-; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2 = extractelement <vscale x 4 x i32> zeroinitializer, i64 1
-; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v3 = extractelement <vscale x 2 x i64> zeroinitializer, i64 1
-; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4 = extractelement <vscale x 4 x float> zeroinitializer, i64 1
-; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v5 = extractelement <vscale x 2 x double> zeroinitializer, i64 1
+; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vi1 = extractelement <vscale x 16 x i1> zeroinitializer, i64 1
+; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v0 = extractelement <vscale x 16 x i8> zeroinitializer, i64 1
+; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v1 = extractelement <vscale x 8 x i16> zeroinitializer, i64 1
+; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2 = extractelement <vscale x 4 x i32> zeroinitializer, i64 1
+; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3 = extractelement <vscale x 2 x i64> zeroinitializer, i64 1
+; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4 = extractelement <vscale x 4 x float> zeroinitializer, i64 1
+; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v5 = extractelement <vscale x 2 x double> zeroinitializer, i64 1
 ; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-LOW-LABEL: 'ext_el1'

diff  --git a/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll b/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
index 81f100beb316a9..3b25f4e7ccb945 100644
--- a/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
@@ -6,8 +6,8 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 
 define void @vector_insert_extract(<vscale x 4 x i32> %v0, <vscale x 16 x i32> %v1, <16 x i32> %v2) {
 ; CHECK-LABEL: 'vector_insert_extract'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: %extract_fixed_from_scalable = call <16 x i32> @llvm.vector.extract.v16i32.nxv4i32(<vscale x 4 x i32> %v0, i64 0)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: %insert_fixed_into_scalable = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> %v0, <16 x i32> %v2, i64 0)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %extract_fixed_from_scalable = call <16 x i32> @llvm.vector.extract.v16i32.nxv4i32(<vscale x 4 x i32> %v0, i64 0)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %insert_fixed_into_scalable = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> %v0, <16 x i32> %v2, i64 0)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %extract_scalable_from_scalable = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> %v1, i64 0)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert_scalable_into_scalable = call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> %v1, <vscale x 4 x i32> %v0, i64 0)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
@@ -618,7 +618,7 @@ define void @get_lane_mask() #0 {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask_v8i1_i32 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 undef, i32 undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask_v4i1_i32 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 undef, i32 undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask_v2i1_i32 = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i32(i32 undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %mask_v32i1_i64 = call <32 x i1> @llvm.get.active.lane.mask.v32i1.i64(i64 undef, i64 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 144 for instruction: %mask_v32i1_i64 = call <32 x i1> @llvm.get.active.lane.mask.v32i1.i64(i64 undef, i64 undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %mask_v16i1_i16 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i16(i16 undef, i16 undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
@@ -633,16 +633,16 @@ define void @get_lane_mask() #0 {
 ; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %mask_nxv2i1_i32 = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i32(i32 undef, i32 undef)
 ; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %mask_nxv32i1_i64 = call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64 undef, i64 undef)
 ; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %mask_nxv16i1_i16 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i16(i16 undef, i16 undef)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %mask_v16i1_i64 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 undef, i64 undef)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %mask_v8i1_i64 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 undef, i64 undef)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %mask_v4i1_i64 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 undef, i64 undef)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %mask_v16i1_i64 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 undef, i64 undef)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %mask_v8i1_i64 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 undef, i64 undef)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %mask_v4i1_i64 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 undef, i64 undef)
 ; TYPE_BASED_ONLY-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %mask_v2i1_i64 = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 undef, i64 undef)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %mask_v16i1_i32 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 undef, i32 undef)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %mask_v8i1_i32 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 undef, i32 undef)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %mask_v4i1_i32 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 undef, i32 undef)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %mask_v16i1_i32 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 undef, i32 undef)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %mask_v8i1_i32 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 undef, i32 undef)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %mask_v4i1_i32 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 undef, i32 undef)
 ; TYPE_BASED_ONLY-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %mask_v2i1_i32 = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i32(i32 undef, i32 undef)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Found an estimated cost of 152 for instruction: %mask_v32i1_i64 = call <32 x i1> @llvm.get.active.lane.mask.v32i1.i64(i64 undef, i64 undef)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %mask_v16i1_i16 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i16(i16 undef, i16 undef)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %mask_v32i1_i64 = call <32 x i1> @llvm.get.active.lane.mask.v32i1.i64(i64 undef, i64 undef)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %mask_v16i1_i16 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i16(i16 undef, i16 undef)
 ; TYPE_BASED_ONLY-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %mask_nxv16i1_i64 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 undef, i64 undef)
@@ -746,11 +746,11 @@ define <vscale x 8 x i32> @masked_gather_nxv8i32(<vscale x 8 x ptr> %ld, <vscale
 
 define <4 x i32> @masked_gather_v4i32(<4 x ptr> %ld, <4 x i1> %masks, <4 x i32> %passthru) {
 ; CHECK-LABEL: 'masked_gather_v4i32'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ld, i32 0, <4 x i1> %masks, <4 x i32> %passthru)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ld, i32 0, <4 x i1> %masks, <4 x i32> %passthru)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %res
 ;
 ; TYPE_BASED_ONLY-LABEL: 'masked_gather_v4i32'
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ld, i32 0, <4 x i1> %masks, <4 x i32> %passthru)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ld, i32 0, <4 x i1> %masks, <4 x i32> %passthru)
 ; TYPE_BASED_ONLY-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %res
 ;
   %res = call <4 x i32> @llvm.masked.gather.v4i32(<4 x ptr> %ld, i32 0, <4 x i1> %masks, <4 x i32> %passthru)
@@ -759,7 +759,7 @@ define <4 x i32> @masked_gather_v4i32(<4 x ptr> %ld, <4 x i1> %masks, <4 x i32>
 
 define <1 x i128> @masked_gather_v1i128(<1 x ptr> %ld, <1 x i1> %masks, <1 x i128> %passthru) {
 ; CHECK-LABEL: 'masked_gather_v1i128'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %res = call <1 x i128> @llvm.masked.gather.v1i128.v1p0(<1 x ptr> %ld, i32 0, <1 x i1> %masks, <1 x i128> %passthru)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %res = call <1 x i128> @llvm.masked.gather.v1i128.v1p0(<1 x ptr> %ld, i32 0, <1 x i1> %masks, <1 x i128> %passthru)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <1 x i128> %res
 ;
 ; TYPE_BASED_ONLY-LABEL: 'masked_gather_v1i128'
@@ -800,11 +800,11 @@ define void @masked_scatter_nxv8i32(<vscale x 8 x i32> %data, <vscale x 8 x ptr>
 
 define void @masked_scatter_v4i32(<4 x i32> %data, <4 x ptr> %ptrs, <4 x i1> %masks) {
 ; CHECK-LABEL: 'masked_scatter_v4i32'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %data, <4 x ptr> %ptrs, i32 0, <4 x i1> %masks)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %data, <4 x ptr> %ptrs, i32 0, <4 x i1> %masks)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; TYPE_BASED_ONLY-LABEL: 'masked_scatter_v4i32'
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %data, <4 x ptr> %ptrs, i32 0, <4 x i1> %masks)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %data, <4 x ptr> %ptrs, i32 0, <4 x i1> %masks)
 ; TYPE_BASED_ONLY-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 
@@ -814,7 +814,7 @@ define void @masked_scatter_v4i32(<4 x i32> %data, <4 x ptr> %ptrs, <4 x i1> %ma
 
 define void @masked_scatter_v1i128(<1 x i128> %data, <1 x ptr> %ptrs, <1 x i1> %masks) {
 ; CHECK-LABEL: 'masked_scatter_v1i128'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v1i128.v1p0(<1 x i128> %data, <1 x ptr> %ptrs, i32 0, <1 x i1> %masks)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v1i128.v1p0(<1 x i128> %data, <1 x ptr> %ptrs, i32 0, <1 x i1> %masks)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; TYPE_BASED_ONLY-LABEL: 'masked_scatter_v1i128'

diff  --git a/llvm/test/Analysis/CostModel/AArch64/vector-select.ll b/llvm/test/Analysis/CostModel/AArch64/vector-select.ll
index 40062605c47c28..5578e609eca329 100644
--- a/llvm/test/Analysis/CostModel/AArch64/vector-select.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/vector-select.ll
@@ -140,7 +140,7 @@ define <3 x i64> @v3i64_select_sle(<3 x i64> %a, <3 x i64> %b, <3 x i64> %c) {
 }
 
 ; COST-LABEL: v2i64_select_no_cmp
-; COST-NEXT:  Cost Model: Found an estimated cost of 5 for instruction:   %s.1 = select <2 x i1> %cond, <2 x i64> %a, <2 x i64> %b
+; COST-NEXT:  Cost Model: Found an estimated cost of 6 for instruction:   %s.1 = select <2 x i1> %cond, <2 x i64> %a, <2 x i64> %b
 
 ; CODE-LABEL: v2i64_select_no_cmp
 ; CODE:       bb.0
@@ -175,7 +175,7 @@ define <4 x half> @v4f16_select_ogt(<4 x half> %a, <4 x half> %b, <4 x half> %c)
 
 define <8 x half> @v8f16_select_ogt(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
 ; COST-LABEL: v8f16_select_ogt
-; COST-NOFP16-NEXT:  Cost Model: Found an estimated cost of 29 for instruction:   %cmp.1 = fcmp ogt <8 x half> %a, %b
+; COST-NOFP16-NEXT:  Cost Model: Found an estimated cost of 22 for instruction:   %cmp.1 = fcmp ogt <8 x half> %a, %b
 ; COST-NOFP16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction:   %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c
 ; COST-FULLFP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction:  %cmp.1 = fcmp ogt <8 x half> %a, %b
 ; COST-FULLFP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction:  %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c
@@ -259,7 +259,7 @@ define <4 x half> @v4f16_select_oge(<4 x half> %a, <4 x half> %b, <4 x half> %c)
 
 define <8 x half> @v8f16_select_oge(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
 ; COST-LABEL: v8f16_select_oge
-; COST-NOFP16-NEXT:  Cost Model: Found an estimated cost of 29 for instruction:   %cmp.1 = fcmp oge <8 x half> %a, %b
+; COST-NOFP16-NEXT:  Cost Model: Found an estimated cost of 22 for instruction:   %cmp.1 = fcmp oge <8 x half> %a, %b
 ; COST-NOFP16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction:   %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c
 ; COST-FULLFP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction:  %cmp.1 = fcmp oge <8 x half> %a, %b
 ; COST-FULLFP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction:  %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c
@@ -343,7 +343,7 @@ define <4 x half> @v4f16_select_olt(<4 x half> %a, <4 x half> %b, <4 x half> %c)
 
 define <8 x half> @v8f16_select_olt(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
 ; COST-LABEL: v8f16_select_olt
-; COST-NOFP16-NEXT:  Cost Model: Found an estimated cost of 29 for instruction:   %cmp.1 = fcmp olt <8 x half> %a, %b
+; COST-NOFP16-NEXT:  Cost Model: Found an estimated cost of 22 for instruction:   %cmp.1 = fcmp olt <8 x half> %a, %b
 ; COST-NOFP16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction:   %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c
 ; COST-FULLFP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction:  %cmp.1 = fcmp olt <8 x half> %a, %b
 ; COST-FULLFP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction:  %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c
@@ -427,7 +427,7 @@ define <4 x half> @v4f16_select_ole(<4 x half> %a, <4 x half> %b, <4 x half> %c)
 
 define <8 x half> @v8f16_select_ole(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
 ; COST-LABEL: v8f16_select_ole
-; COST-NOFP16-NEXT:  Cost Model: Found an estimated cost of 29 for instruction:   %cmp.1 = fcmp ole <8 x half> %a, %b
+; COST-NOFP16-NEXT:  Cost Model: Found an estimated cost of 22 for instruction:   %cmp.1 = fcmp ole <8 x half> %a, %b
 ; COST-NOFP16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction:   %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c
 ; COST-FULLFP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction:  %cmp.1 = fcmp ole <8 x half> %a, %b
 ; COST-FULLFP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction:  %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c
@@ -511,7 +511,7 @@ define <4 x half> @v4f16_select_oeq(<4 x half> %a, <4 x half> %b, <4 x half> %c)
 
 define <8 x half> @v8f16_select_oeq(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
 ; COST-LABEL: v8f16_select_oeq
-; COST-NOFP16-NEXT:  Cost Model: Found an estimated cost of 29 for instruction:   %cmp.1 = fcmp oeq <8 x half> %a, %b
+; COST-NOFP16-NEXT:  Cost Model: Found an estimated cost of 22 for instruction:   %cmp.1 = fcmp oeq <8 x half> %a, %b
 ; COST-NOFP16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction:   %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c
 ; COST-FULLFP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction:  %cmp.1 = fcmp oeq <8 x half> %a, %b
 ; COST-FULLFP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction:  %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c
@@ -597,7 +597,7 @@ define <4 x half> @v4f16_select_one(<4 x half> %a, <4 x half> %b, <4 x half> %c)
 
 define <8 x half> @v8f16_select_one(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
 ; COST-LABEL: v8f16_select_one
-; COST-NOFP16-NEXT:  Cost Model: Found an estimated cost of 29 for instruction:  %cmp.1 = fcmp one <8 x half> %a, %b
+; COST-NOFP16-NEXT:  Cost Model: Found an estimated cost of 22 for instruction:  %cmp.1 = fcmp one <8 x half> %a, %b
 ; COST-NOFP16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction:  %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c
 ; COST-FULLFP16-NEXT: Cost Model: Found an estimated cost of 1 for instruction:  %cmp.1 = fcmp one <8 x half> %a, %b
 ; COST-FULLFP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c
@@ -689,7 +689,7 @@ define <4 x half> @v4f16_select_une(<4 x half> %a, <4 x half> %b, <4 x half> %c)
 
 define <8 x half> @v8f16_select_une(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
 ; COST-LABEL: v8f16_select_une
-; COST-NOFP16-NEXT:  Cost Model: Found an estimated cost of 29 for instruction:   %cmp.1 = fcmp une <8 x half> %a, %b
+; COST-NOFP16-NEXT:  Cost Model: Found an estimated cost of 22 for instruction:   %cmp.1 = fcmp une <8 x half> %a, %b
 ; COST-NOFP16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction:   %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c
 ; COST-FULLFP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction:  %cmp.1 = fcmp une <8 x half> %a, %b
 ; COST-FULLFP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction:  %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c

diff  --git a/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll b/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll
index cc6ef0feecffd2..dd3d4215a3f63a 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll
@@ -13,7 +13,7 @@ target triple = "aarch64--linux-gnu"
 ; %var4 a lower scalarization overhead.
 ;
 ; COST-LABEL:  predicated_udiv_scalarized_operand
-; COST:        LV: Found an estimated cost of 4 for VF 2 For instruction: %var4 = udiv i64 %var2, %var3
+; COST:        LV: Found an estimated cost of 5 for VF 2 For instruction: %var4 = udiv i64 %var2, %var3
 ;
 ;
 define i64 @predicated_udiv_scalarized_operand(ptr %a, i64 %x) optsize {

diff  --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll
index 0184daf5b6fe61..430b0db87b8846 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll
@@ -7,7 +7,7 @@ target triple = "aarch64--linux-gnu"
 %pair = type { i8, i8 }
 
 ; CHECK-LABEL: test
-; CHECK: Found an estimated cost of 14 for VF 2 For instruction:   {{.*}} load i8
+; CHECK: Found an estimated cost of 16 for VF 2 For instruction:   {{.*}} load i8
 ; CHECK: Found an estimated cost of 0 for VF 2 For instruction:   {{.*}} load i8
 ; CHECK-LABEL: entry:
 ; CHECK-LABEL: vector.body:

diff  --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll
index c61ddbcb3408a5..bf9146bbad4cfa 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll
@@ -168,10 +168,10 @@ entry:
 ; gaps.
 ;
 ; VF_2-LABEL: Checking a loop in 'i64_factor_8'
-; VF_2:         Found an estimated cost of 10 for VF 2 For instruction: %tmp2 = load i64, ptr %tmp0, align 8
+; VF_2:         Found an estimated cost of 16 for VF 2 For instruction: %tmp2 = load i64, ptr %tmp0, align 8
 ; VF_2-NEXT:    Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i64, ptr %tmp1, align 8
-; VF_2-NEXT:    Found an estimated cost of 7 for VF 2 For instruction: store i64 0, ptr %tmp0, align 8
-; VF_2-NEXT:    Found an estimated cost of 7 for VF 2 For instruction: store i64 0, ptr %tmp1, align 8
+; VF_2-NEXT:    Found an estimated cost of 8 for VF 2 For instruction: store i64 0, ptr %tmp0, align 8
+; VF_2-NEXT:    Found an estimated cost of 8 for VF 2 For instruction: store i64 0, ptr %tmp1, align 8
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %i64.8, ptr %data, i64 %i, i32 2

diff  --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll
index 6f6ce7249d411e..37ac570aa06c6a 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll
@@ -5,8 +5,8 @@
 target triple = "aarch64-unknown-linux-gnu"
 
 ; CHECK-COST: Checking a loop in 'fixed_width'
-; CHECK-COST: Found an estimated cost of 11 for VF 2 For instruction:   store i32 2, ptr %arrayidx1, align 4
-; CHECK-COST: Found an estimated cost of 25 for VF 4 For instruction:   store i32 2, ptr %arrayidx1, align 4
+; CHECK-COST: Found an estimated cost of 12 for VF 2 For instruction:   store i32 2, ptr %arrayidx1, align 4
+; CHECK-COST: Found an estimated cost of 24 for VF 4 For instruction:   store i32 2, ptr %arrayidx1, align 4
 ; CHECK-COST: Selecting VF: 1.
 
 ; We should decide this loop is not worth vectorising using fixed width vectors

diff  --git a/llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll
index ff544df55286c5..1fcbc8470fc3cb 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll
@@ -16,10 +16,10 @@ target triple = "aarch64--linux-gnu"
 ; as:
 ;
 ; Cost of udiv:
-;   (udiv(2) + extractelement(6) + insertelement(3)) / 2 = 5
+;   (udiv(2) + extractelement(8) + insertelement(4)) / 2 = 7
 ;
 ; CHECK: Scalarizing and predicating: %tmp4 = udiv i32 %tmp2, %tmp3
-; CHECK: Found an estimated cost of 5 for VF 2 For instruction: %tmp4 = udiv i32 %tmp2, %tmp3
+; CHECK: Found an estimated cost of 7 for VF 2 For instruction: %tmp4 = udiv i32 %tmp2, %tmp3
 ;
 define i32 @predicated_udiv(ptr %a, ptr %b, i1 %c, i64 %n) {
 entry:
@@ -57,10 +57,10 @@ for.end:
 ; as:
 ;
 ; Cost of store:
-;   (store(4) + extractelement(3)) / 2 = 3
+;   (store(4) + extractelement(4)) / 2 = 4
 ;
 ; CHECK: Scalarizing and predicating: store i32 %tmp2, ptr %tmp0, align 4
-; CHECK: Found an estimated cost of 3 for VF 2 For instruction: store i32 %tmp2, ptr %tmp0, align 4
+; CHECK: Found an estimated cost of 4 for VF 2 For instruction: store i32 %tmp2, ptr %tmp0, align 4
 ;
 define void @predicated_store(ptr %a, i1 %c, i32 %x, i64 %n) {
 entry:
@@ -94,7 +94,7 @@ for.end:
 ; CHECK: Found scalar instruction:   %addr.next = getelementptr inbounds i32, ptr %addr, i64 1
 ; CHECK: Scalarizing and predicating: store i32 %tmp2, ptr %addr, align 4
 ; CHECK: Found an estimated cost of 0 for VF 2 For instruction:   %addr = phi ptr [ %a, %entry ], [ %addr.next, %for.inc ]
-; CHECK: Found an estimated cost of 3 for VF 2 For instruction: store i32 %tmp2, ptr %addr, align 4
+; CHECK: Found an estimated cost of 4 for VF 2 For instruction: store i32 %tmp2, ptr %addr, align 4
 ;
 define void @predicated_store_phi(ptr %a, i1 %c, i32 %x, i64 %n) {
 entry:
@@ -129,14 +129,14 @@ for.end:
 ; compute the cost as:
 ;
 ; Cost of add:
-;   (add(2) + extractelement(3)) / 2 = 2
+;   (add(2) + extractelement(4)) / 2 = 3
 ; Cost of udiv:
-;   (udiv(2) + extractelement(3) + insertelement(3)) / 2 = 4
+;   (udiv(2) + extractelement(4) + insertelement(4)) / 2 = 5
 ;
 ; CHECK: Scalarizing: %tmp3 = add nsw i32 %tmp2, %x
 ; CHECK: Scalarizing and predicating: %tmp4 = udiv i32 %tmp2, %tmp3
-; CHECK: Found an estimated cost of 2 for VF 2 For instruction: %tmp3 = add nsw i32 %tmp2, %x
-; CHECK: Found an estimated cost of 4 for VF 2 For instruction: %tmp4 = udiv i32 %tmp2, %tmp3
+; CHECK: Found an estimated cost of 3 for VF 2 For instruction: %tmp3 = add nsw i32 %tmp2, %x
+; CHECK: Found an estimated cost of 5 for VF 2 For instruction: %tmp4 = udiv i32 %tmp2, %tmp3
 ;
 define i32 @predicated_udiv_scalarized_operand(ptr %a, i1 %c, i32 %x, i64 %n) {
 entry:
@@ -174,13 +174,13 @@ for.end:
 ; compute the cost as:
 ;
 ; Cost of add:
-;   (add(2) + extractelement(3)) / 2 = 2
+;   (add(2) + extractelement(4)) / 2 = 3
 ; Cost of store:
 ;   store(4) / 2 = 2
 ;
 ; CHECK: Scalarizing: %tmp2 = add nsw i32 %tmp1, %x
 ; CHECK: Scalarizing and predicating: store i32 %tmp2, ptr %tmp0, align 4
-; CHECK: Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = add nsw i32 %tmp1, %x
+; CHECK: Found an estimated cost of 3 for VF 2 For instruction: %tmp2 = add nsw i32 %tmp1, %x
 ; CHECK: Found an estimated cost of 2 for VF 2 For instruction: store i32 %tmp2, ptr %tmp0, align 4
 ;
 define void @predicated_store_scalarized_operand(ptr %a, i1 %c, i32 %x, i64 %n) {
@@ -219,11 +219,11 @@ for.end:
 ; Cost of add:
 ;   add(1) = 1
 ; Cost of sdiv:
-;   (sdiv(2) + extractelement(6) + insertelement(3)) / 2 = 5
+;   (sdiv(2) + extractelement(8) + insertelement(4)) / 2 = 7
 ; Cost of udiv:
-;   (udiv(2) + extractelement(6) + insertelement(3)) / 2 = 5
+;   (udiv(2) + extractelement(8) + insertelement(4)) / 2 = 7
 ; Cost of sub:
-;   (sub(2) + extractelement(3)) / 2 = 2
+;   (sub(2) + extractelement(4)) / 2 = 3
 ; Cost of store:
 ;   store(4) / 2 = 2
 ;
@@ -233,9 +233,9 @@ for.end:
 ; CHECK:     Scalarizing: %tmp5 = sub i32 %tmp4, %x
 ; CHECK:     Scalarizing and predicating: store i32 %tmp5, ptr %tmp0, align 4
 ; CHECK:     Found an estimated cost of 1 for VF 2 For instruction: %tmp2 = add i32 %tmp1, %x
-; CHECK:     Found an estimated cost of 5 for VF 2 For instruction: %tmp3 = sdiv i32 %tmp1, %tmp2
-; CHECK:     Found an estimated cost of 5 for VF 2 For instruction: %tmp4 = udiv i32 %tmp3, %tmp2
-; CHECK:     Found an estimated cost of 2 for VF 2 For instruction: %tmp5 = sub i32 %tmp4, %x
+; CHECK:     Found an estimated cost of 7 for VF 2 For instruction: %tmp3 = sdiv i32 %tmp1, %tmp2
+; CHECK:     Found an estimated cost of 7 for VF 2 For instruction: %tmp4 = udiv i32 %tmp3, %tmp2
+; CHECK:     Found an estimated cost of 3 for VF 2 For instruction: %tmp5 = sub i32 %tmp4, %x
 ; CHECK:     Found an estimated cost of 2 for VF 2 For instruction: store i32 %tmp5, ptr %tmp0, align 4
 ;
 define void @predication_multi_context(ptr %a, i1 %c, i32 %x, i64 %n) {

diff  --git a/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd-cost.ll
index 39313c1b93d23d..622b269e05950a 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd-cost.ll
@@ -6,8 +6,8 @@
 
 target triple="aarch64-unknown-linux-gnu"
 
-; CHECK-VF4: Found an estimated cost of 17 for VF 4 For instruction:   %add = fadd float %0, %sum.07
-; CHECK-VF8: Found an estimated cost of 34 for VF 8 For instruction:   %add = fadd float %0, %sum.07
+; CHECK-VF4: Found an estimated cost of 14 for VF 4 For instruction:   %add = fadd float %0, %sum.07
+; CHECK-VF8: Found an estimated cost of 28 for VF 8 For instruction:   %add = fadd float %0, %sum.07
 
 define float @fadd_strict32(ptr noalias nocapture readonly %a, i64 %n) {
 entry:
@@ -28,8 +28,8 @@ for.end:
 }
 
 
-; CHECK-VF4: Found an estimated cost of 14 for VF 4 For instruction:   %add = fadd double %0, %sum.07
-; CHECK-VF8: Found an estimated cost of 28 for VF 8 For instruction:   %add = fadd double %0, %sum.07
+; CHECK-VF4: Found an estimated cost of 12 for VF 4 For instruction:   %add = fadd double %0, %sum.07
+; CHECK-VF8: Found an estimated cost of 24 for VF 8 For instruction:   %add = fadd double %0, %sum.07
 
 define double @fadd_strict64(ptr noalias nocapture readonly %a, i64 %n) {
 entry:
@@ -49,8 +49,8 @@ for.end:
   ret double %add
 }
 
-; CHECK-VF4: Found an estimated cost of 19 for VF 4 For instruction:   %muladd = tail call float @llvm.fmuladd.f32(float %0, float %1, float %sum.07)
-; CHECK-VF8: Found an estimated cost of 38 for VF 8 For instruction:   %muladd = tail call float @llvm.fmuladd.f32(float %0, float %1, float %sum.07)
+; CHECK-VF4: Found an estimated cost of 16 for VF 4 For instruction:   %muladd = tail call float @llvm.fmuladd.f32(float %0, float %1, float %sum.07)
+; CHECK-VF8: Found an estimated cost of 32 for VF 8 For instruction:   %muladd = tail call float @llvm.fmuladd.f32(float %0, float %1, float %sum.07)
 
 define float @fmuladd_strict32(ptr %a, ptr %b, i64 %n) {
 entry:
@@ -74,8 +74,8 @@ for.end:
 
 declare float @llvm.fmuladd.f32(float, float, float)
 
-; CHECK-VF4: Found an estimated cost of 18 for VF 4 For instruction:   %muladd = tail call double @llvm.fmuladd.f64(double %0, double %1, double %sum.07)
-; CHECK-VF8: Found an estimated cost of 36 for VF 8 For instruction:   %muladd = tail call double @llvm.fmuladd.f64(double %0, double %1, double %sum.07)
+; CHECK-VF4: Found an estimated cost of 16 for VF 4 For instruction:   %muladd = tail call double @llvm.fmuladd.f64(double %0, double %1, double %sum.07)
+; CHECK-VF8: Found an estimated cost of 32 for VF 8 For instruction:   %muladd = tail call double @llvm.fmuladd.f64(double %0, double %1, double %sum.07)
 
 define double @fmuladd_strict64(ptr %a, ptr %b, i64 %n) {
 entry:

diff  --git a/llvm/test/Transforms/LoopVectorize/AArch64/unsafe-vf-hint-remark.ll b/llvm/test/Transforms/LoopVectorize/AArch64/unsafe-vf-hint-remark.ll
index 9b635bf117ad5a..fc20277b1d8588 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/unsafe-vf-hint-remark.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/unsafe-vf-hint-remark.ll
@@ -1,14 +1,13 @@
 ; REQUIRES: asserts
 ; RUN: opt -passes=loop-vectorize -mtriple=arm64-apple-iphoneos -pass-remarks-analysis=loop-vectorize -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck %s
 
-; Specify a large unsafe vectorization factor of 32 that gets clamped to 16,
-; then test an even smaller VF of 2 is selected based on the cost-model.
+; Specify a large unsafe vectorization factor of 32 that gets clamped to 16.
 
 ; CHECK: LV: User VF=32 is unsafe, clamping to max safe VF=16.
 ; CHECK: remark: <unknown>:0:0: User-specified vectorization factor 32 is unsafe, clamping to maximum safe vectorization factor 16
-; CHECK: LV: Selecting VF: 2.
+; CHECK: LV: Selecting VF: 16.
 ; CHECK-LABEL: @test
-; CHECK: <2 x i64>
+; CHECK: <16 x i64>
 define void @test(ptr nocapture %a, ptr nocapture readonly %b) {
 entry:
   br label %loop.header

diff  --git a/llvm/test/Transforms/LowerMatrixIntrinsics/dot-product-float.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/dot-product-float.ll
index aca9b69861223b..4ea6d7f4283909 100644
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/dot-product-float.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/dot-product-float.ll
@@ -5,46 +5,10 @@
 define <1 x float> @dotproduct_float_v6(<6 x float> %a, <6 x float> %b) {
 ; CHECK-LABEL: @dotproduct_float_v6(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[SPLIT:%.*]] = shufflevector <6 x float> [[A:%.*]], <6 x float> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[SPLIT1:%.*]] = shufflevector <6 x float> [[A]], <6 x float> poison, <1 x i32> <i32 1>
-; CHECK-NEXT:    [[SPLIT2:%.*]] = shufflevector <6 x float> [[A]], <6 x float> poison, <1 x i32> <i32 2>
-; CHECK-NEXT:    [[SPLIT3:%.*]] = shufflevector <6 x float> [[A]], <6 x float> poison, <1 x i32> <i32 3>
-; CHECK-NEXT:    [[SPLIT4:%.*]] = shufflevector <6 x float> [[A]], <6 x float> poison, <1 x i32> <i32 4>
-; CHECK-NEXT:    [[SPLIT5:%.*]] = shufflevector <6 x float> [[A]], <6 x float> poison, <1 x i32> <i32 5>
-; CHECK-NEXT:    [[SPLIT6:%.*]] = shufflevector <6 x float> [[B:%.*]], <6 x float> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
-; CHECK-NEXT:    [[BLOCK:%.*]] = shufflevector <1 x float> [[SPLIT]], <1 x float> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <6 x float> [[SPLIT6]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT:%.*]] = insertelement <1 x float> poison, float [[TMP0]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT]], <1 x float> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP1:%.*]] = fmul fast <1 x float> [[BLOCK]], [[SPLAT_SPLAT]]
-; CHECK-NEXT:    [[BLOCK7:%.*]] = shufflevector <1 x float> [[SPLIT1]], <1 x float> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <6 x float> [[SPLIT6]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT8:%.*]] = insertelement <1 x float> poison, float [[TMP2]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT9:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT8]], <1 x float> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP3:%.*]] = call fast <1 x float> @llvm.fmuladd.v1f32(<1 x float> [[BLOCK7]], <1 x float> [[SPLAT_SPLAT9]], <1 x float> [[TMP1]])
-; CHECK-NEXT:    [[BLOCK10:%.*]] = shufflevector <1 x float> [[SPLIT2]], <1 x float> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <6 x float> [[SPLIT6]], i64 2
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT11:%.*]] = insertelement <1 x float> poison, float [[TMP4]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT12:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT11]], <1 x float> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP5:%.*]] = call fast <1 x float> @llvm.fmuladd.v1f32(<1 x float> [[BLOCK10]], <1 x float> [[SPLAT_SPLAT12]], <1 x float> [[TMP3]])
-; CHECK-NEXT:    [[BLOCK13:%.*]] = shufflevector <1 x float> [[SPLIT3]], <1 x float> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <6 x float> [[SPLIT6]], i64 3
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT14:%.*]] = insertelement <1 x float> poison, float [[TMP6]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT15:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT14]], <1 x float> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP7:%.*]] = call fast <1 x float> @llvm.fmuladd.v1f32(<1 x float> [[BLOCK13]], <1 x float> [[SPLAT_SPLAT15]], <1 x float> [[TMP5]])
-; CHECK-NEXT:    [[BLOCK16:%.*]] = shufflevector <1 x float> [[SPLIT4]], <1 x float> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <6 x float> [[SPLIT6]], i64 4
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT17:%.*]] = insertelement <1 x float> poison, float [[TMP8]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT18:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT17]], <1 x float> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP9:%.*]] = call fast <1 x float> @llvm.fmuladd.v1f32(<1 x float> [[BLOCK16]], <1 x float> [[SPLAT_SPLAT18]], <1 x float> [[TMP7]])
-; CHECK-NEXT:    [[BLOCK19:%.*]] = shufflevector <1 x float> [[SPLIT5]], <1 x float> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <6 x float> [[SPLIT6]], i64 5
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT20:%.*]] = insertelement <1 x float> poison, float [[TMP10]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT21:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT20]], <1 x float> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP11:%.*]] = call fast <1 x float> @llvm.fmuladd.v1f32(<1 x float> [[BLOCK19]], <1 x float> [[SPLAT_SPLAT21]], <1 x float> [[TMP9]])
-; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <1 x float> [[TMP11]], <1 x float> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <1 x float> poison, <1 x float> [[TMP12]], <1 x i32> <i32 1>
-; CHECK-NEXT:    ret <1 x float> [[TMP13]]
+; CHECK-NEXT:    [[TMP0:%.*]] = fmul <6 x float> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast float @llvm.vector.reduce.fadd.v6f32(float 0.000000e+00, <6 x float> [[TMP0]])
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <1 x float> poison, float [[TMP1]], i64 0
+; CHECK-NEXT:    ret <1 x float> [[TMP2]]
 ;
 entry:
   %c = tail call fast <1 x float> @llvm.matrix.multiply.v1f32.v6f32.v6f32(<6 x float> %a, <6 x float> %b, i32 1, i32 6, i32 1)
@@ -175,51 +139,12 @@ declare <1 x double> @llvm.matrix.multiply.v1f64.v6f64.v6f64(<6 x double>, <6 x
 define <1 x double> @intrinsic_column_major_load_dot_product_double_v6(ptr %lhs_address, ptr %rhs_address) {
 ; CHECK-LABEL: @intrinsic_column_major_load_dot_product_double_v6(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <1 x double>, ptr [[LHS_ADDRESS:%.*]], align 4
-; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr double, ptr [[LHS_ADDRESS]], i64 1
-; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <1 x double>, ptr [[VEC_GEP]], align 4
-; CHECK-NEXT:    [[VEC_GEP2:%.*]] = getelementptr double, ptr [[LHS_ADDRESS]], i64 2
-; CHECK-NEXT:    [[COL_LOAD3:%.*]] = load <1 x double>, ptr [[VEC_GEP2]], align 4
-; CHECK-NEXT:    [[VEC_GEP4:%.*]] = getelementptr double, ptr [[LHS_ADDRESS]], i64 3
-; CHECK-NEXT:    [[COL_LOAD5:%.*]] = load <1 x double>, ptr [[VEC_GEP4]], align 4
-; CHECK-NEXT:    [[VEC_GEP6:%.*]] = getelementptr double, ptr [[LHS_ADDRESS]], i64 4
-; CHECK-NEXT:    [[COL_LOAD7:%.*]] = load <1 x double>, ptr [[VEC_GEP6]], align 4
-; CHECK-NEXT:    [[VEC_GEP8:%.*]] = getelementptr double, ptr [[LHS_ADDRESS]], i64 5
-; CHECK-NEXT:    [[COL_LOAD9:%.*]] = load <1 x double>, ptr [[VEC_GEP8]], align 4
-; CHECK-NEXT:    [[COL_LOAD10:%.*]] = load <6 x double>, ptr [[RHS_ADDRESS:%.*]], align 4
-; CHECK-NEXT:    [[BLOCK:%.*]] = shufflevector <1 x double> [[COL_LOAD]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <6 x double> [[COL_LOAD10]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT:%.*]] = insertelement <1 x double> poison, double [[TMP0]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP1:%.*]] = fmul fast <1 x double> [[BLOCK]], [[SPLAT_SPLAT]]
-; CHECK-NEXT:    [[BLOCK11:%.*]] = shufflevector <1 x double> [[COL_LOAD1]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <6 x double> [[COL_LOAD10]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT12:%.*]] = insertelement <1 x double> poison, double [[TMP2]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT13:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT12]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP3:%.*]] = call fast <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK11]], <1 x double> [[SPLAT_SPLAT13]], <1 x double> [[TMP1]])
-; CHECK-NEXT:    [[BLOCK14:%.*]] = shufflevector <1 x double> [[COL_LOAD3]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <6 x double> [[COL_LOAD10]], i64 2
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT15:%.*]] = insertelement <1 x double> poison, double [[TMP4]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT16:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT15]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP5:%.*]] = call fast <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK14]], <1 x double> [[SPLAT_SPLAT16]], <1 x double> [[TMP3]])
-; CHECK-NEXT:    [[BLOCK17:%.*]] = shufflevector <1 x double> [[COL_LOAD5]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <6 x double> [[COL_LOAD10]], i64 3
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT18:%.*]] = insertelement <1 x double> poison, double [[TMP6]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT19:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT18]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP7:%.*]] = call fast <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK17]], <1 x double> [[SPLAT_SPLAT19]], <1 x double> [[TMP5]])
-; CHECK-NEXT:    [[BLOCK20:%.*]] = shufflevector <1 x double> [[COL_LOAD7]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <6 x double> [[COL_LOAD10]], i64 4
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT21:%.*]] = insertelement <1 x double> poison, double [[TMP8]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT22:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT21]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP9:%.*]] = call fast <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK20]], <1 x double> [[SPLAT_SPLAT22]], <1 x double> [[TMP7]])
-; CHECK-NEXT:    [[BLOCK23:%.*]] = shufflevector <1 x double> [[COL_LOAD9]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <6 x double> [[COL_LOAD10]], i64 5
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT24:%.*]] = insertelement <1 x double> poison, double [[TMP10]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT25:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT24]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP11:%.*]] = call fast <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK23]], <1 x double> [[SPLAT_SPLAT25]], <1 x double> [[TMP9]])
-; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <1 x double> [[TMP11]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <1 x double> poison, <1 x double> [[TMP12]], <1 x i32> <i32 1>
-; CHECK-NEXT:    ret <1 x double> [[TMP13]]
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <6 x double>, ptr [[RHS_ADDRESS:%.*]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load <6 x double>, ptr [[LHS_ADDRESS:%.*]], align 64
+; CHECK-NEXT:    [[TMP1:%.*]] = fmul <6 x double> [[TMP0]], [[COL_LOAD]]
+; CHECK-NEXT:    [[TMP2:%.*]] = call fast double @llvm.vector.reduce.fadd.v6f64(double 0.000000e+00, <6 x double> [[TMP1]])
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <1 x double> poison, double [[TMP2]], i64 0
+; CHECK-NEXT:    ret <1 x double> [[TMP3]]
 ;
 entry:
   %lhs = tail call fast <6 x double> @llvm.matrix.column.major.load.v6f64.i64(ptr nonnull align 4 %lhs_address, i64 1, i1 false, i32 1, i32 6)

diff  --git a/llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll
index e3ad918498ee7f..cef791633655a8 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll
@@ -12,18 +12,21 @@ define void @test1(<4 x i16> %a, <4 x i16> %b, ptr %p) {
 ; CHECK-NEXT:    [[Z0:%.*]] = zext <4 x i16> [[A:%.*]] to <4 x i32>
 ; CHECK-NEXT:    [[Z1:%.*]] = zext <4 x i16> [[B:%.*]] to <4 x i32>
 ; CHECK-NEXT:    [[SUB0:%.*]] = sub <4 x i32> [[Z0]], [[Z1]]
-; CHECK-NEXT:    [[TMP0:%.*]] = sext <4 x i32> [[SUB0]] to <4 x i64>
-; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i64> [[TMP0]], i32 0
-; CHECK-NEXT:    [[GEP0:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 [[TMP1]]
+; CHECK-NEXT:    [[E0:%.*]] = extractelement <4 x i32> [[SUB0]], i32 0
+; CHECK-NEXT:    [[S0:%.*]] = sext i32 [[E0]] to i64
+; CHECK-NEXT:    [[GEP0:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 [[S0]]
 ; CHECK-NEXT:    [[LOAD0:%.*]] = load i64, ptr [[GEP0]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i64> [[TMP0]], i32 1
+; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <4 x i32> [[SUB0]], <4 x i32> poison, <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT:    [[TMP1:%.*]] = sext <2 x i32> [[TMP0]] to <2 x i64>
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0
 ; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[TMP2]]
 ; CHECK-NEXT:    [[LOAD1:%.*]] = load i64, ptr [[GEP1]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i64> [[TMP0]], i32 2
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
 ; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[TMP3]]
 ; CHECK-NEXT:    [[LOAD2:%.*]] = load i64, ptr [[GEP2]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP0]], i32 3
-; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[TMP4]]
+; CHECK-NEXT:    [[E3:%.*]] = extractelement <4 x i32> [[SUB0]], i32 3
+; CHECK-NEXT:    [[S3:%.*]] = sext i32 [[E3]] to i64
+; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[S3]]
 ; CHECK-NEXT:    [[LOAD3:%.*]] = load i64, ptr [[GEP3]], align 4
 ; CHECK-NEXT:    call void @foo(i64 [[LOAD0]], i64 [[LOAD1]], i64 [[LOAD2]], i64 [[LOAD3]])
 ; CHECK-NEXT:    ret void
@@ -58,23 +61,25 @@ define void @test2(<4 x i16> %a, <4 x i16> %b, i64 %c0, i64 %c1, i64 %c2, i64 %c
 ; CHECK-NEXT:    [[Z0:%.*]] = zext <4 x i16> [[A:%.*]] to <4 x i32>
 ; CHECK-NEXT:    [[Z1:%.*]] = zext <4 x i16> [[B:%.*]] to <4 x i32>
 ; CHECK-NEXT:    [[SUB0:%.*]] = sub <4 x i32> [[Z0]], [[Z1]]
-; CHECK-NEXT:    [[TMP0:%.*]] = sext <4 x i32> [[SUB0]] to <4 x i64>
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i64> poison, i64 [[C0:%.*]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i64> [[TMP1]], i64 [[C1:%.*]], i32 1
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[C2:%.*]], i32 2
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i64> [[TMP3]], i64 [[C3:%.*]], i32 3
-; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i64> [[TMP0]], [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP5]], i32 0
-; CHECK-NEXT:    [[GEP0:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 [[TMP6]]
+; CHECK-NEXT:    [[E0:%.*]] = extractelement <4 x i32> [[SUB0]], i32 0
+; CHECK-NEXT:    [[S0:%.*]] = sext i32 [[E0]] to i64
+; CHECK-NEXT:    [[A0:%.*]] = add i64 [[S0]], [[C0:%.*]]
+; CHECK-NEXT:    [[GEP0:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 [[A0]]
 ; CHECK-NEXT:    [[LOAD0:%.*]] = load i64, ptr [[GEP0]], align 4
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP5]], i32 1
-; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[TMP7]]
+; CHECK-NEXT:    [[E1:%.*]] = extractelement <4 x i32> [[SUB0]], i32 1
+; CHECK-NEXT:    [[S1:%.*]] = sext i32 [[E1]] to i64
+; CHECK-NEXT:    [[A1:%.*]] = add i64 [[S1]], [[C1:%.*]]
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[A1]]
 ; CHECK-NEXT:    [[LOAD1:%.*]] = load i64, ptr [[GEP1]], align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x i64> [[TMP5]], i32 2
-; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[TMP8]]
+; CHECK-NEXT:    [[E2:%.*]] = extractelement <4 x i32> [[SUB0]], i32 2
+; CHECK-NEXT:    [[S2:%.*]] = sext i32 [[E2]] to i64
+; CHECK-NEXT:    [[A2:%.*]] = add i64 [[S2]], [[C2:%.*]]
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[A2]]
 ; CHECK-NEXT:    [[LOAD2:%.*]] = load i64, ptr [[GEP2]], align 4
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x i64> [[TMP5]], i32 3
-; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[TMP9]]
+; CHECK-NEXT:    [[E3:%.*]] = extractelement <4 x i32> [[SUB0]], i32 3
+; CHECK-NEXT:    [[S3:%.*]] = sext i32 [[E3]] to i64
+; CHECK-NEXT:    [[A3:%.*]] = add i64 [[S3]], [[C3:%.*]]
+; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[A3]]
 ; CHECK-NEXT:    [[LOAD3:%.*]] = load i64, ptr [[GEP3]], align 4
 ; CHECK-NEXT:    call void @foo(i64 [[LOAD0]], i64 [[LOAD1]], i64 [[LOAD2]], i64 [[LOAD3]])
 ; CHECK-NEXT:    ret void

diff  --git a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll
index 9efad068b8ec0b..3a2a482a1ae7fb 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll
@@ -10,7 +10,7 @@ target triple = "aarch64--linux-gnu"
 ; REMARK-LABEL: Function: gather_multiple_use
 ; REMARK:       Args:
 ; REMARK-NEXT:    - String: 'Vectorized horizontal reduction with cost '
-; REMARK-NEXT:    - Cost: '-7'
+; REMARK-NEXT:    - Cost: '-8'
 ;
 ; REMARK-NOT: Function: gather_load
 

diff  --git a/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll
index a567b6c71e898d..c36c8c27a8ef7b 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll
@@ -30,7 +30,7 @@ target triple = "aarch64--linux-gnu"
 ; YAML-NEXT: Function:        getelementptr_4x32
 ; YAML-NEXT: Args:
 ; YAML-NEXT:   - String:          'SLP vectorized with cost '
-; YAML-NEXT:   - Cost:            '6'
+; YAML-NEXT:   - Cost:            '4'
 ; YAML-NEXT:   - String:          ' and with tree size '
 ; YAML-NEXT:   - TreeSize:        '3'
 

diff  --git a/llvm/test/Transforms/SLPVectorizer/AArch64/landing_pad.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/landing_pad.ll
index 8e32a14b3a4b4a..e4b6c06b79fc1b 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/landing_pad.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/landing_pad.ll
@@ -8,7 +8,7 @@
 ; YAML-NEXT: Function:        foo
 ; YAML-NEXT: Args:
 ; YAML-NEXT:   - String:          'SLP vectorized with cost '
-; YAML-NEXT:   - Cost:            '3'
+; YAML-NEXT:   - Cost:            '2'
 ; YAML-NEXT:   - String:          ' and with tree size '
 ; YAML-NEXT:   - TreeSize:        '2'
 
@@ -28,7 +28,7 @@
 ; YAML-NEXT: Function:        foo
 ; YAML-NEXT: Args:
 ; YAML-NEXT:   - String:          'SLP vectorized with cost '
-; YAML-NEXT:   - Cost:            '1'
+; YAML-NEXT:   - Cost:            '2'
 ; YAML-NEXT:   - String:          ' and with tree size '
 ; YAML-NEXT:   - TreeSize:        '9'
 

diff  --git a/llvm/test/Transforms/SLPVectorizer/AArch64/matmul.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/matmul.ll
index ffa2713ac024a1..10f07f158175d0 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/matmul.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/matmul.ll
@@ -11,66 +11,46 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 define void @wrap_mul4(ptr nocapture %Out, ptr nocapture readonly %A, ptr nocapture readonly %B) {
 ; CHECK-LABEL: @wrap_mul4(
 ; CHECK-NEXT:    [[TEMP:%.*]] = load double, ptr [[A:%.*]], align 8
-; CHECK-NEXT:    [[TEMP1:%.*]] = load double, ptr [[B:%.*]], align 8
-; CHECK-NEXT:    [[MUL_I:%.*]] = fmul double [[TEMP]], [[TEMP1]]
 ; CHECK-NEXT:    [[ARRAYIDX5_I:%.*]] = getelementptr inbounds [2 x double], ptr [[A]], i64 0, i64 1
 ; CHECK-NEXT:    [[TEMP2:%.*]] = load double, ptr [[ARRAYIDX5_I]], align 8
-; CHECK-NEXT:    [[ARRAYIDX7_I:%.*]] = getelementptr inbounds [4 x double], ptr [[B]], i64 1, i64 0
-; CHECK-NEXT:    [[TEMP3:%.*]] = load double, ptr [[ARRAYIDX7_I]], align 8
-; CHECK-NEXT:    [[MUL8_I:%.*]] = fmul double [[TEMP2]], [[TEMP3]]
-; CHECK-NEXT:    [[ADD_I:%.*]] = fadd double [[MUL_I]], [[MUL8_I]]
-; CHECK-NEXT:    [[ARRAYIDX13_I:%.*]] = getelementptr inbounds [4 x double], ptr [[B]], i64 0, i64 1
-; CHECK-NEXT:    [[TEMP4:%.*]] = load double, ptr [[ARRAYIDX13_I]], align 8
-; CHECK-NEXT:    [[MUL14_I:%.*]] = fmul double [[TEMP]], [[TEMP4]]
-; CHECK-NEXT:    [[ARRAYIDX18_I:%.*]] = getelementptr inbounds [4 x double], ptr [[B]], i64 1, i64 1
-; CHECK-NEXT:    [[TEMP5:%.*]] = load double, ptr [[ARRAYIDX18_I]], align 8
-; CHECK-NEXT:    [[MUL19_I:%.*]] = fmul double [[TEMP2]], [[TEMP5]]
-; CHECK-NEXT:    [[ADD20_I:%.*]] = fadd double [[MUL14_I]], [[MUL19_I]]
+; CHECK-NEXT:    [[ARRAYIDX7_I:%.*]] = getelementptr inbounds [4 x double], ptr [[B:%.*]], i64 1, i64 0
 ; CHECK-NEXT:    [[ARRAYIDX25_I:%.*]] = getelementptr inbounds [4 x double], ptr [[B]], i64 0, i64 2
-; CHECK-NEXT:    [[TEMP6:%.*]] = load double, ptr [[ARRAYIDX25_I]], align 8
-; CHECK-NEXT:    [[MUL26_I:%.*]] = fmul double [[TEMP]], [[TEMP6]]
 ; CHECK-NEXT:    [[ARRAYIDX30_I:%.*]] = getelementptr inbounds [4 x double], ptr [[B]], i64 1, i64 2
-; CHECK-NEXT:    [[TEMP7:%.*]] = load double, ptr [[ARRAYIDX30_I]], align 8
-; CHECK-NEXT:    [[MUL31_I:%.*]] = fmul double [[TEMP2]], [[TEMP7]]
-; CHECK-NEXT:    [[ADD32_I:%.*]] = fadd double [[MUL26_I]], [[MUL31_I]]
-; CHECK-NEXT:    [[ARRAYIDX37_I:%.*]] = getelementptr inbounds [4 x double], ptr [[B]], i64 0, i64 3
-; CHECK-NEXT:    [[TEMP8:%.*]] = load double, ptr [[ARRAYIDX37_I]], align 8
-; CHECK-NEXT:    [[MUL38_I:%.*]] = fmul double [[TEMP]], [[TEMP8]]
-; CHECK-NEXT:    [[ARRAYIDX42_I:%.*]] = getelementptr inbounds [4 x double], ptr [[B]], i64 1, i64 3
-; CHECK-NEXT:    [[TEMP9:%.*]] = load double, ptr [[ARRAYIDX42_I]], align 8
-; CHECK-NEXT:    [[MUL43_I:%.*]] = fmul double [[TEMP2]], [[TEMP9]]
-; CHECK-NEXT:    [[ADD44_I:%.*]] = fadd double [[MUL38_I]], [[MUL43_I]]
 ; CHECK-NEXT:    [[ARRAYIDX47_I:%.*]] = getelementptr inbounds [2 x double], ptr [[A]], i64 1, i64 0
 ; CHECK-NEXT:    [[TEMP10:%.*]] = load double, ptr [[ARRAYIDX47_I]], align 8
-; CHECK-NEXT:    [[MUL50_I:%.*]] = fmul double [[TEMP1]], [[TEMP10]]
 ; CHECK-NEXT:    [[ARRAYIDX52_I:%.*]] = getelementptr inbounds [2 x double], ptr [[A]], i64 1, i64 1
 ; CHECK-NEXT:    [[TEMP11:%.*]] = load double, ptr [[ARRAYIDX52_I]], align 8
-; CHECK-NEXT:    [[MUL55_I:%.*]] = fmul double [[TEMP3]], [[TEMP11]]
-; CHECK-NEXT:    [[ADD56_I:%.*]] = fadd double [[MUL50_I]], [[MUL55_I]]
-; CHECK-NEXT:    [[MUL62_I:%.*]] = fmul double [[TEMP4]], [[TEMP10]]
-; CHECK-NEXT:    [[MUL67_I:%.*]] = fmul double [[TEMP5]], [[TEMP11]]
-; CHECK-NEXT:    [[ADD68_I:%.*]] = fadd double [[MUL62_I]], [[MUL67_I]]
-; CHECK-NEXT:    [[MUL74_I:%.*]] = fmul double [[TEMP6]], [[TEMP10]]
-; CHECK-NEXT:    [[MUL79_I:%.*]] = fmul double [[TEMP7]], [[TEMP11]]
-; CHECK-NEXT:    [[ADD80_I:%.*]] = fadd double [[MUL74_I]], [[MUL79_I]]
-; CHECK-NEXT:    [[MUL86_I:%.*]] = fmul double [[TEMP8]], [[TEMP10]]
-; CHECK-NEXT:    [[MUL91_I:%.*]] = fmul double [[TEMP9]], [[TEMP11]]
-; CHECK-NEXT:    [[ADD92_I:%.*]] = fadd double [[MUL86_I]], [[MUL91_I]]
-; CHECK-NEXT:    store double [[ADD_I]], ptr [[OUT:%.*]], align 8
-; CHECK-NEXT:    [[RES_I_SROA_4_0_OUT2_I_SROA_IDX2:%.*]] = getelementptr inbounds double, ptr [[OUT]], i64 1
-; CHECK-NEXT:    store double [[ADD20_I]], ptr [[RES_I_SROA_4_0_OUT2_I_SROA_IDX2]], align 8
-; CHECK-NEXT:    [[RES_I_SROA_5_0_OUT2_I_SROA_IDX4:%.*]] = getelementptr inbounds double, ptr [[OUT]], i64 2
-; CHECK-NEXT:    store double [[ADD32_I]], ptr [[RES_I_SROA_5_0_OUT2_I_SROA_IDX4]], align 8
-; CHECK-NEXT:    [[RES_I_SROA_6_0_OUT2_I_SROA_IDX6:%.*]] = getelementptr inbounds double, ptr [[OUT]], i64 3
-; CHECK-NEXT:    store double [[ADD44_I]], ptr [[RES_I_SROA_6_0_OUT2_I_SROA_IDX6]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B]], align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[TEMP]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = fmul <2 x double> [[TMP3]], [[TMP1]]
+; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x double>, ptr [[ARRAYIDX7_I]], align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x double> poison, double [[TEMP2]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = fmul <2 x double> [[TMP7]], [[TMP5]]
+; CHECK-NEXT:    [[TMP9:%.*]] = fadd <2 x double> [[TMP4]], [[TMP8]]
+; CHECK-NEXT:    [[RES_I_SROA_5_0_OUT2_I_SROA_IDX4:%.*]] = getelementptr inbounds double, ptr [[OUT:%.*]], i64 2
+; CHECK-NEXT:    [[TMP10:%.*]] = load <2 x double>, ptr [[ARRAYIDX25_I]], align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = fmul <2 x double> [[TMP3]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = load <2 x double>, ptr [[ARRAYIDX30_I]], align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = fmul <2 x double> [[TMP7]], [[TMP12]]
+; CHECK-NEXT:    [[TMP14:%.*]] = fadd <2 x double> [[TMP11]], [[TMP13]]
+; CHECK-NEXT:    store <2 x double> [[TMP9]], ptr [[OUT]], align 8
+; CHECK-NEXT:    store <2 x double> [[TMP14]], ptr [[RES_I_SROA_5_0_OUT2_I_SROA_IDX4]], align 8
 ; CHECK-NEXT:    [[RES_I_SROA_7_0_OUT2_I_SROA_IDX8:%.*]] = getelementptr inbounds double, ptr [[OUT]], i64 4
-; CHECK-NEXT:    store double [[ADD56_I]], ptr [[RES_I_SROA_7_0_OUT2_I_SROA_IDX8]], align 8
-; CHECK-NEXT:    [[RES_I_SROA_8_0_OUT2_I_SROA_IDX10:%.*]] = getelementptr inbounds double, ptr [[OUT]], i64 5
-; CHECK-NEXT:    store double [[ADD68_I]], ptr [[RES_I_SROA_8_0_OUT2_I_SROA_IDX10]], align 8
+; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <2 x double> poison, double [[TEMP10]], i32 0
+; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <2 x double> [[TMP15]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP17:%.*]] = fmul <2 x double> [[TMP1]], [[TMP16]]
+; CHECK-NEXT:    [[TMP18:%.*]] = insertelement <2 x double> poison, double [[TEMP11]], i32 0
+; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <2 x double> [[TMP18]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP20:%.*]] = fmul <2 x double> [[TMP5]], [[TMP19]]
+; CHECK-NEXT:    [[TMP21:%.*]] = fadd <2 x double> [[TMP17]], [[TMP20]]
+; CHECK-NEXT:    store <2 x double> [[TMP21]], ptr [[RES_I_SROA_7_0_OUT2_I_SROA_IDX8]], align 8
 ; CHECK-NEXT:    [[RES_I_SROA_9_0_OUT2_I_SROA_IDX12:%.*]] = getelementptr inbounds double, ptr [[OUT]], i64 6
-; CHECK-NEXT:    store double [[ADD80_I]], ptr [[RES_I_SROA_9_0_OUT2_I_SROA_IDX12]], align 8
-; CHECK-NEXT:    [[RES_I_SROA_10_0_OUT2_I_SROA_IDX14:%.*]] = getelementptr inbounds double, ptr [[OUT]], i64 7
-; CHECK-NEXT:    store double [[ADD92_I]], ptr [[RES_I_SROA_10_0_OUT2_I_SROA_IDX14]], align 8
+; CHECK-NEXT:    [[TMP22:%.*]] = fmul <2 x double> [[TMP10]], [[TMP16]]
+; CHECK-NEXT:    [[TMP23:%.*]] = fmul <2 x double> [[TMP12]], [[TMP19]]
+; CHECK-NEXT:    [[TMP24:%.*]] = fadd <2 x double> [[TMP22]], [[TMP23]]
+; CHECK-NEXT:    store <2 x double> [[TMP24]], ptr [[RES_I_SROA_9_0_OUT2_I_SROA_IDX12]], align 8
 ; CHECK-NEXT:    ret void
 ;
   %temp = load double, ptr %A, align 8

diff  --git a/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll
index 5bdbfbbfe96df2..70cdd08548b2df 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll
@@ -654,18 +654,17 @@ bb23:
 define void @single_membound(ptr %arg, ptr %arg1, double %x) {
 ; CHECK-LABEL: @single_membound(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP:%.*]] = fsub double [[X:%.*]], 9.900000e+01
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds double, ptr [[ARG:%.*]], i64 1
+; CHECK-NEXT:    [[TMP:%.*]] = fsub double [[X:%.*]], 9.900000e+01
 ; CHECK-NEXT:    store double [[TMP]], ptr [[TMP9]], align 8
 ; CHECK-NEXT:    [[TMP12:%.*]] = load double, ptr [[ARG1:%.*]], align 8
 ; CHECK-NEXT:    [[TMP13:%.*]] = fsub double 1.000000e+00, [[TMP12]]
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds double, ptr [[ARG]], i64 2
 ; CHECK-NEXT:    br label [[BB15:%.*]]
 ; CHECK:       bb15:
-; CHECK-NEXT:    [[TMP16:%.*]] = fmul double [[TMP]], 2.000000e+01
-; CHECK-NEXT:    store double [[TMP16]], ptr [[TMP9]], align 8
-; CHECK-NEXT:    [[TMP17:%.*]] = fmul double [[TMP13]], 3.000000e+01
-; CHECK-NEXT:    store double [[TMP17]], ptr [[TMP14]], align 8
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> poison, double [[TMP]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[TMP13]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = fmul <2 x double> [[TMP1]], <double 2.000000e+01, double 3.000000e+01>
+; CHECK-NEXT:    store <2 x double> [[TMP2]], ptr [[TMP9]], align 8
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -1232,27 +1231,25 @@ define void @crash_no_tracked_instructions(ptr %arg, ptr %arg.2, ptr %arg.3, i1
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[T19:%.*]] = load ptr, ptr [[ARG:%.*]], align 8
 ; CHECK-NEXT:    [[T20:%.*]] = load float, ptr [[ARG_3:%.*]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x float> <float 0.000000e+00, float poison>, float [[T20]], i32 1
 ; CHECK-NEXT:    br i1 [[C:%.*]], label [[BB22:%.*]], label [[BB30:%.*]]
 ; CHECK:       bb22:
 ; CHECK-NEXT:    [[T23:%.*]] = fmul float [[T20]], 9.900000e+01
-; CHECK-NEXT:    [[T24:%.*]] = fmul float [[T23]], 9.900000e+01
 ; CHECK-NEXT:    [[T25:%.*]] = getelementptr inbounds float, ptr [[T19]], i64 2
-; CHECK-NEXT:    [[T26:%.*]] = fmul float [[T23]], 1.000000e+01
-; CHECK-NEXT:    store float [[T26]], ptr [[T25]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> poison, float [[T23]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = fmul <2 x float> [[TMP2]], <float 9.900000e+01, float 1.000000e+01>
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
+; CHECK-NEXT:    store float [[TMP4]], ptr [[T25]], align 4
 ; CHECK-NEXT:    [[T27:%.*]] = load float, ptr [[ARG_2:%.*]], align 8
-; CHECK-NEXT:    [[T28:%.*]] = fadd float [[T24]], 2.000000e+01
-; CHECK-NEXT:    [[T29:%.*]] = fadd float [[T26]], 2.000000e+01
+; CHECK-NEXT:    [[TMP5:%.*]] = fadd <2 x float> [[TMP3]], <float 2.000000e+01, float 2.000000e+01>
 ; CHECK-NEXT:    br label [[BB30]]
 ; CHECK:       bb30:
-; CHECK-NEXT:    [[T31:%.*]] = phi float [ [[T28]], [[BB22]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[T32:%.*]] = phi float [ [[T29]], [[BB22]] ], [ [[T20]], [[ENTRY]] ]
+; CHECK-NEXT:    [[TMP6:%.*]] = phi <2 x float> [ [[TMP5]], [[BB22]] ], [ [[TMP0]], [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    br label [[BB36:%.*]]
 ; CHECK:       bb36:
-; CHECK-NEXT:    [[T37:%.*]] = fmul float [[T31]], 3.000000e+00
-; CHECK-NEXT:    store float [[T37]], ptr [[ARG_3]], align 4
-; CHECK-NEXT:    [[T39:%.*]] = fmul float [[T32]], 3.000000e+00
-; CHECK-NEXT:    [[T40:%.*]] = getelementptr inbounds float, ptr [[ARG_3]], i64 1
-; CHECK-NEXT:    store float [[T39]], ptr [[T40]], align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = fmul <2 x float> [[TMP6]], <float 3.000000e+00, float 3.000000e+00>
+; CHECK-NEXT:    store <2 x float> [[TMP7]], ptr [[ARG_3]], align 4
 ; CHECK-NEXT:    br label [[BB41:%.*]]
 ; CHECK:       bb41:
 ; CHECK-NEXT:    ret void

diff  --git a/llvm/test/Transforms/SLPVectorizer/AArch64/multiple_reduction.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/multiple_reduction.ll
index c9e79c84dbf8e6..f85f658fed4d5c 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/multiple_reduction.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/multiple_reduction.ll
@@ -14,389 +14,168 @@ define i64 @straight(ptr nocapture noundef readonly %p, i32 noundef %st) {
 ; CHECK-LABEL: @straight(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[IDX_EXT:%.*]] = sext i32 [[ST:%.*]] to i64
-; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[P:%.*]], align 2
-; CHECK-NEXT:    [[CONV:%.*]] = zext i16 [[TMP0]] to i32
-; CHECK-NEXT:    [[MUL:%.*]] = mul nuw nsw i32 [[CONV]], [[CONV]]
-; CHECK-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 1
-; CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[ARRAYIDX_1]], align 2
-; CHECK-NEXT:    [[CONV_1:%.*]] = zext i16 [[TMP1]] to i32
-; CHECK-NEXT:    [[ADD_1:%.*]] = add nuw nsw i32 [[CONV]], [[CONV_1]]
-; CHECK-NEXT:    [[MUL_1:%.*]] = mul nuw nsw i32 [[CONV_1]], [[CONV_1]]
-; CHECK-NEXT:    [[ADD11_1:%.*]] = add nuw i32 [[MUL_1]], [[MUL]]
-; CHECK-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 2
-; CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr [[ARRAYIDX_2]], align 2
-; CHECK-NEXT:    [[CONV_2:%.*]] = zext i16 [[TMP2]] to i32
-; CHECK-NEXT:    [[ADD_2:%.*]] = add nuw nsw i32 [[ADD_1]], [[CONV_2]]
-; CHECK-NEXT:    [[MUL_2:%.*]] = mul nuw nsw i32 [[CONV_2]], [[CONV_2]]
-; CHECK-NEXT:    [[ADD11_2:%.*]] = add i32 [[MUL_2]], [[ADD11_1]]
-; CHECK-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 3
-; CHECK-NEXT:    [[TMP3:%.*]] = load i16, ptr [[ARRAYIDX_3]], align 2
-; CHECK-NEXT:    [[CONV_3:%.*]] = zext i16 [[TMP3]] to i32
-; CHECK-NEXT:    [[ADD_3:%.*]] = add nuw nsw i32 [[ADD_2]], [[CONV_3]]
-; CHECK-NEXT:    [[MUL_3:%.*]] = mul nuw nsw i32 [[CONV_3]], [[CONV_3]]
-; CHECK-NEXT:    [[ADD11_3:%.*]] = add i32 [[MUL_3]], [[ADD11_2]]
-; CHECK-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 4
-; CHECK-NEXT:    [[TMP4:%.*]] = load i16, ptr [[ARRAYIDX_4]], align 2
-; CHECK-NEXT:    [[CONV_4:%.*]] = zext i16 [[TMP4]] to i32
-; CHECK-NEXT:    [[ADD_4:%.*]] = add nuw nsw i32 [[ADD_3]], [[CONV_4]]
-; CHECK-NEXT:    [[MUL_4:%.*]] = mul nuw nsw i32 [[CONV_4]], [[CONV_4]]
-; CHECK-NEXT:    [[ADD11_4:%.*]] = add i32 [[MUL_4]], [[ADD11_3]]
-; CHECK-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 5
-; CHECK-NEXT:    [[TMP5:%.*]] = load i16, ptr [[ARRAYIDX_5]], align 2
-; CHECK-NEXT:    [[CONV_5:%.*]] = zext i16 [[TMP5]] to i32
-; CHECK-NEXT:    [[ADD_5:%.*]] = add nuw nsw i32 [[ADD_4]], [[CONV_5]]
-; CHECK-NEXT:    [[MUL_5:%.*]] = mul nuw nsw i32 [[CONV_5]], [[CONV_5]]
-; CHECK-NEXT:    [[ADD11_5:%.*]] = add i32 [[MUL_5]], [[ADD11_4]]
-; CHECK-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 6
-; CHECK-NEXT:    [[TMP6:%.*]] = load i16, ptr [[ARRAYIDX_6]], align 2
-; CHECK-NEXT:    [[CONV_6:%.*]] = zext i16 [[TMP6]] to i32
-; CHECK-NEXT:    [[ADD_6:%.*]] = add nuw nsw i32 [[ADD_5]], [[CONV_6]]
-; CHECK-NEXT:    [[MUL_6:%.*]] = mul nuw nsw i32 [[CONV_6]], [[CONV_6]]
-; CHECK-NEXT:    [[ADD11_6:%.*]] = add i32 [[MUL_6]], [[ADD11_5]]
-; CHECK-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 7
-; CHECK-NEXT:    [[TMP7:%.*]] = load i16, ptr [[ARRAYIDX_7]], align 2
-; CHECK-NEXT:    [[CONV_7:%.*]] = zext i16 [[TMP7]] to i32
-; CHECK-NEXT:    [[ADD_7:%.*]] = add nuw nsw i32 [[ADD_6]], [[CONV_7]]
-; CHECK-NEXT:    [[MUL_7:%.*]] = mul nuw nsw i32 [[CONV_7]], [[CONV_7]]
-; CHECK-NEXT:    [[ADD11_7:%.*]] = add i32 [[MUL_7]], [[ADD11_6]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i16>, ptr [[P:%.*]], align 2
 ; CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[IDX_EXT]]
-; CHECK-NEXT:    [[TMP8:%.*]] = load i16, ptr [[ADD_PTR]], align 2
-; CHECK-NEXT:    [[CONV_140:%.*]] = zext i16 [[TMP8]] to i32
-; CHECK-NEXT:    [[ADD_141:%.*]] = add nuw nsw i32 [[ADD_7]], [[CONV_140]]
-; CHECK-NEXT:    [[MUL_142:%.*]] = mul nuw nsw i32 [[CONV_140]], [[CONV_140]]
-; CHECK-NEXT:    [[ADD11_143:%.*]] = add i32 [[MUL_142]], [[ADD11_7]]
-; CHECK-NEXT:    [[ARRAYIDX_1_1:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR]], i64 1
-; CHECK-NEXT:    [[TMP9:%.*]] = load i16, ptr [[ARRAYIDX_1_1]], align 2
-; CHECK-NEXT:    [[CONV_1_1:%.*]] = zext i16 [[TMP9]] to i32
-; CHECK-NEXT:    [[ADD_1_1:%.*]] = add nuw nsw i32 [[ADD_141]], [[CONV_1_1]]
-; CHECK-NEXT:    [[MUL_1_1:%.*]] = mul nuw nsw i32 [[CONV_1_1]], [[CONV_1_1]]
-; CHECK-NEXT:    [[ADD11_1_1:%.*]] = add i32 [[MUL_1_1]], [[ADD11_143]]
-; CHECK-NEXT:    [[ARRAYIDX_2_1:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR]], i64 2
-; CHECK-NEXT:    [[TMP10:%.*]] = load i16, ptr [[ARRAYIDX_2_1]], align 2
-; CHECK-NEXT:    [[CONV_2_1:%.*]] = zext i16 [[TMP10]] to i32
-; CHECK-NEXT:    [[ADD_2_1:%.*]] = add nuw nsw i32 [[ADD_1_1]], [[CONV_2_1]]
-; CHECK-NEXT:    [[MUL_2_1:%.*]] = mul nuw nsw i32 [[CONV_2_1]], [[CONV_2_1]]
-; CHECK-NEXT:    [[ADD11_2_1:%.*]] = add i32 [[MUL_2_1]], [[ADD11_1_1]]
-; CHECK-NEXT:    [[ARRAYIDX_3_1:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR]], i64 3
-; CHECK-NEXT:    [[TMP11:%.*]] = load i16, ptr [[ARRAYIDX_3_1]], align 2
-; CHECK-NEXT:    [[CONV_3_1:%.*]] = zext i16 [[TMP11]] to i32
-; CHECK-NEXT:    [[ADD_3_1:%.*]] = add nuw nsw i32 [[ADD_2_1]], [[CONV_3_1]]
-; CHECK-NEXT:    [[MUL_3_1:%.*]] = mul nuw nsw i32 [[CONV_3_1]], [[CONV_3_1]]
-; CHECK-NEXT:    [[ADD11_3_1:%.*]] = add i32 [[MUL_3_1]], [[ADD11_2_1]]
-; CHECK-NEXT:    [[ARRAYIDX_4_1:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR]], i64 4
-; CHECK-NEXT:    [[TMP12:%.*]] = load i16, ptr [[ARRAYIDX_4_1]], align 2
-; CHECK-NEXT:    [[CONV_4_1:%.*]] = zext i16 [[TMP12]] to i32
-; CHECK-NEXT:    [[ADD_4_1:%.*]] = add nuw nsw i32 [[ADD_3_1]], [[CONV_4_1]]
-; CHECK-NEXT:    [[MUL_4_1:%.*]] = mul nuw nsw i32 [[CONV_4_1]], [[CONV_4_1]]
-; CHECK-NEXT:    [[ADD11_4_1:%.*]] = add i32 [[MUL_4_1]], [[ADD11_3_1]]
-; CHECK-NEXT:    [[ARRAYIDX_5_1:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR]], i64 5
-; CHECK-NEXT:    [[TMP13:%.*]] = load i16, ptr [[ARRAYIDX_5_1]], align 2
-; CHECK-NEXT:    [[CONV_5_1:%.*]] = zext i16 [[TMP13]] to i32
-; CHECK-NEXT:    [[ADD_5_1:%.*]] = add nuw nsw i32 [[ADD_4_1]], [[CONV_5_1]]
-; CHECK-NEXT:    [[MUL_5_1:%.*]] = mul nuw nsw i32 [[CONV_5_1]], [[CONV_5_1]]
-; CHECK-NEXT:    [[ADD11_5_1:%.*]] = add i32 [[MUL_5_1]], [[ADD11_4_1]]
-; CHECK-NEXT:    [[ARRAYIDX_6_1:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR]], i64 6
-; CHECK-NEXT:    [[TMP14:%.*]] = load i16, ptr [[ARRAYIDX_6_1]], align 2
-; CHECK-NEXT:    [[CONV_6_1:%.*]] = zext i16 [[TMP14]] to i32
-; CHECK-NEXT:    [[ADD_6_1:%.*]] = add nuw nsw i32 [[ADD_5_1]], [[CONV_6_1]]
-; CHECK-NEXT:    [[MUL_6_1:%.*]] = mul nuw nsw i32 [[CONV_6_1]], [[CONV_6_1]]
-; CHECK-NEXT:    [[ADD11_6_1:%.*]] = add i32 [[MUL_6_1]], [[ADD11_5_1]]
-; CHECK-NEXT:    [[ARRAYIDX_7_1:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR]], i64 7
-; CHECK-NEXT:    [[TMP15:%.*]] = load i16, ptr [[ARRAYIDX_7_1]], align 2
-; CHECK-NEXT:    [[CONV_7_1:%.*]] = zext i16 [[TMP15]] to i32
-; CHECK-NEXT:    [[ADD_7_1:%.*]] = add nuw nsw i32 [[ADD_6_1]], [[CONV_7_1]]
-; CHECK-NEXT:    [[MUL_7_1:%.*]] = mul nuw nsw i32 [[CONV_7_1]], [[CONV_7_1]]
-; CHECK-NEXT:    [[ADD11_7_1:%.*]] = add i32 [[MUL_7_1]], [[ADD11_6_1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr [[ADD_PTR]], align 2
 ; CHECK-NEXT:    [[ADD_PTR_1:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR]], i64 [[IDX_EXT]]
-; CHECK-NEXT:    [[TMP16:%.*]] = load i16, ptr [[ADD_PTR_1]], align 2
-; CHECK-NEXT:    [[CONV_244:%.*]] = zext i16 [[TMP16]] to i32
-; CHECK-NEXT:    [[ADD_245:%.*]] = add nuw nsw i32 [[ADD_7_1]], [[CONV_244]]
-; CHECK-NEXT:    [[MUL_246:%.*]] = mul nuw nsw i32 [[CONV_244]], [[CONV_244]]
-; CHECK-NEXT:    [[ADD11_247:%.*]] = add i32 [[MUL_246]], [[ADD11_7_1]]
-; CHECK-NEXT:    [[ARRAYIDX_1_2:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_1]], i64 1
-; CHECK-NEXT:    [[TMP17:%.*]] = load i16, ptr [[ARRAYIDX_1_2]], align 2
-; CHECK-NEXT:    [[CONV_1_2:%.*]] = zext i16 [[TMP17]] to i32
-; CHECK-NEXT:    [[ADD_1_2:%.*]] = add nuw nsw i32 [[ADD_245]], [[CONV_1_2]]
-; CHECK-NEXT:    [[MUL_1_2:%.*]] = mul nuw nsw i32 [[CONV_1_2]], [[CONV_1_2]]
-; CHECK-NEXT:    [[ADD11_1_2:%.*]] = add i32 [[MUL_1_2]], [[ADD11_247]]
-; CHECK-NEXT:    [[ARRAYIDX_2_2:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_1]], i64 2
-; CHECK-NEXT:    [[TMP18:%.*]] = load i16, ptr [[ARRAYIDX_2_2]], align 2
-; CHECK-NEXT:    [[CONV_2_2:%.*]] = zext i16 [[TMP18]] to i32
-; CHECK-NEXT:    [[ADD_2_2:%.*]] = add nuw nsw i32 [[ADD_1_2]], [[CONV_2_2]]
-; CHECK-NEXT:    [[MUL_2_2:%.*]] = mul nuw nsw i32 [[CONV_2_2]], [[CONV_2_2]]
-; CHECK-NEXT:    [[ADD11_2_2:%.*]] = add i32 [[MUL_2_2]], [[ADD11_1_2]]
-; CHECK-NEXT:    [[ARRAYIDX_3_2:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_1]], i64 3
-; CHECK-NEXT:    [[TMP19:%.*]] = load i16, ptr [[ARRAYIDX_3_2]], align 2
-; CHECK-NEXT:    [[CONV_3_2:%.*]] = zext i16 [[TMP19]] to i32
-; CHECK-NEXT:    [[ADD_3_2:%.*]] = add nuw nsw i32 [[ADD_2_2]], [[CONV_3_2]]
-; CHECK-NEXT:    [[MUL_3_2:%.*]] = mul nuw nsw i32 [[CONV_3_2]], [[CONV_3_2]]
-; CHECK-NEXT:    [[ADD11_3_2:%.*]] = add i32 [[MUL_3_2]], [[ADD11_2_2]]
-; CHECK-NEXT:    [[ARRAYIDX_4_2:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_1]], i64 4
-; CHECK-NEXT:    [[TMP20:%.*]] = load i16, ptr [[ARRAYIDX_4_2]], align 2
-; CHECK-NEXT:    [[CONV_4_2:%.*]] = zext i16 [[TMP20]] to i32
-; CHECK-NEXT:    [[ADD_4_2:%.*]] = add nuw nsw i32 [[ADD_3_2]], [[CONV_4_2]]
-; CHECK-NEXT:    [[MUL_4_2:%.*]] = mul nuw nsw i32 [[CONV_4_2]], [[CONV_4_2]]
-; CHECK-NEXT:    [[ADD11_4_2:%.*]] = add i32 [[MUL_4_2]], [[ADD11_3_2]]
-; CHECK-NEXT:    [[ARRAYIDX_5_2:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_1]], i64 5
-; CHECK-NEXT:    [[TMP21:%.*]] = load i16, ptr [[ARRAYIDX_5_2]], align 2
-; CHECK-NEXT:    [[CONV_5_2:%.*]] = zext i16 [[TMP21]] to i32
-; CHECK-NEXT:    [[ADD_5_2:%.*]] = add nuw nsw i32 [[ADD_4_2]], [[CONV_5_2]]
-; CHECK-NEXT:    [[MUL_5_2:%.*]] = mul nuw nsw i32 [[CONV_5_2]], [[CONV_5_2]]
-; CHECK-NEXT:    [[ADD11_5_2:%.*]] = add i32 [[MUL_5_2]], [[ADD11_4_2]]
-; CHECK-NEXT:    [[ARRAYIDX_6_2:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_1]], i64 6
-; CHECK-NEXT:    [[TMP22:%.*]] = load i16, ptr [[ARRAYIDX_6_2]], align 2
-; CHECK-NEXT:    [[CONV_6_2:%.*]] = zext i16 [[TMP22]] to i32
-; CHECK-NEXT:    [[ADD_6_2:%.*]] = add nuw nsw i32 [[ADD_5_2]], [[CONV_6_2]]
-; CHECK-NEXT:    [[MUL_6_2:%.*]] = mul nuw nsw i32 [[CONV_6_2]], [[CONV_6_2]]
-; CHECK-NEXT:    [[ADD11_6_2:%.*]] = add i32 [[MUL_6_2]], [[ADD11_5_2]]
-; CHECK-NEXT:    [[ARRAYIDX_7_2:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_1]], i64 7
-; CHECK-NEXT:    [[TMP23:%.*]] = load i16, ptr [[ARRAYIDX_7_2]], align 2
-; CHECK-NEXT:    [[CONV_7_2:%.*]] = zext i16 [[TMP23]] to i32
-; CHECK-NEXT:    [[ADD_7_2:%.*]] = add nuw nsw i32 [[ADD_6_2]], [[CONV_7_2]]
-; CHECK-NEXT:    [[MUL_7_2:%.*]] = mul nuw nsw i32 [[CONV_7_2]], [[CONV_7_2]]
-; CHECK-NEXT:    [[ADD11_7_2:%.*]] = add i32 [[MUL_7_2]], [[ADD11_6_2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr [[ADD_PTR_1]], align 2
 ; CHECK-NEXT:    [[ADD_PTR_2:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_1]], i64 [[IDX_EXT]]
-; CHECK-NEXT:    [[TMP24:%.*]] = load i16, ptr [[ADD_PTR_2]], align 2
-; CHECK-NEXT:    [[CONV_348:%.*]] = zext i16 [[TMP24]] to i32
-; CHECK-NEXT:    [[ADD_349:%.*]] = add nuw nsw i32 [[ADD_7_2]], [[CONV_348]]
-; CHECK-NEXT:    [[MUL_350:%.*]] = mul nuw nsw i32 [[CONV_348]], [[CONV_348]]
-; CHECK-NEXT:    [[ADD11_351:%.*]] = add i32 [[MUL_350]], [[ADD11_7_2]]
-; CHECK-NEXT:    [[ARRAYIDX_1_3:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_2]], i64 1
-; CHECK-NEXT:    [[TMP25:%.*]] = load i16, ptr [[ARRAYIDX_1_3]], align 2
-; CHECK-NEXT:    [[CONV_1_3:%.*]] = zext i16 [[TMP25]] to i32
-; CHECK-NEXT:    [[ADD_1_3:%.*]] = add nuw nsw i32 [[ADD_349]], [[CONV_1_3]]
-; CHECK-NEXT:    [[MUL_1_3:%.*]] = mul nuw nsw i32 [[CONV_1_3]], [[CONV_1_3]]
-; CHECK-NEXT:    [[ADD11_1_3:%.*]] = add i32 [[MUL_1_3]], [[ADD11_351]]
-; CHECK-NEXT:    [[ARRAYIDX_2_3:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_2]], i64 2
-; CHECK-NEXT:    [[TMP26:%.*]] = load i16, ptr [[ARRAYIDX_2_3]], align 2
-; CHECK-NEXT:    [[CONV_2_3:%.*]] = zext i16 [[TMP26]] to i32
-; CHECK-NEXT:    [[ADD_2_3:%.*]] = add nuw nsw i32 [[ADD_1_3]], [[CONV_2_3]]
-; CHECK-NEXT:    [[MUL_2_3:%.*]] = mul nuw nsw i32 [[CONV_2_3]], [[CONV_2_3]]
-; CHECK-NEXT:    [[ADD11_2_3:%.*]] = add i32 [[MUL_2_3]], [[ADD11_1_3]]
-; CHECK-NEXT:    [[ARRAYIDX_3_3:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_2]], i64 3
-; CHECK-NEXT:    [[TMP27:%.*]] = load i16, ptr [[ARRAYIDX_3_3]], align 2
-; CHECK-NEXT:    [[CONV_3_3:%.*]] = zext i16 [[TMP27]] to i32
-; CHECK-NEXT:    [[ADD_3_3:%.*]] = add nuw nsw i32 [[ADD_2_3]], [[CONV_3_3]]
-; CHECK-NEXT:    [[MUL_3_3:%.*]] = mul nuw nsw i32 [[CONV_3_3]], [[CONV_3_3]]
-; CHECK-NEXT:    [[ADD11_3_3:%.*]] = add i32 [[MUL_3_3]], [[ADD11_2_3]]
-; CHECK-NEXT:    [[ARRAYIDX_4_3:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_2]], i64 4
-; CHECK-NEXT:    [[TMP28:%.*]] = load i16, ptr [[ARRAYIDX_4_3]], align 2
-; CHECK-NEXT:    [[CONV_4_3:%.*]] = zext i16 [[TMP28]] to i32
-; CHECK-NEXT:    [[ADD_4_3:%.*]] = add nuw nsw i32 [[ADD_3_3]], [[CONV_4_3]]
-; CHECK-NEXT:    [[MUL_4_3:%.*]] = mul nuw nsw i32 [[CONV_4_3]], [[CONV_4_3]]
-; CHECK-NEXT:    [[ADD11_4_3:%.*]] = add i32 [[MUL_4_3]], [[ADD11_3_3]]
-; CHECK-NEXT:    [[ARRAYIDX_5_3:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_2]], i64 5
-; CHECK-NEXT:    [[TMP29:%.*]] = load i16, ptr [[ARRAYIDX_5_3]], align 2
-; CHECK-NEXT:    [[CONV_5_3:%.*]] = zext i16 [[TMP29]] to i32
-; CHECK-NEXT:    [[ADD_5_3:%.*]] = add nuw nsw i32 [[ADD_4_3]], [[CONV_5_3]]
-; CHECK-NEXT:    [[MUL_5_3:%.*]] = mul nuw nsw i32 [[CONV_5_3]], [[CONV_5_3]]
-; CHECK-NEXT:    [[ADD11_5_3:%.*]] = add i32 [[MUL_5_3]], [[ADD11_4_3]]
-; CHECK-NEXT:    [[ARRAYIDX_6_3:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_2]], i64 6
-; CHECK-NEXT:    [[TMP30:%.*]] = load i16, ptr [[ARRAYIDX_6_3]], align 2
-; CHECK-NEXT:    [[CONV_6_3:%.*]] = zext i16 [[TMP30]] to i32
-; CHECK-NEXT:    [[ADD_6_3:%.*]] = add nuw nsw i32 [[ADD_5_3]], [[CONV_6_3]]
-; CHECK-NEXT:    [[MUL_6_3:%.*]] = mul nuw nsw i32 [[CONV_6_3]], [[CONV_6_3]]
-; CHECK-NEXT:    [[ADD11_6_3:%.*]] = add i32 [[MUL_6_3]], [[ADD11_5_3]]
-; CHECK-NEXT:    [[ARRAYIDX_7_3:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_2]], i64 7
-; CHECK-NEXT:    [[TMP31:%.*]] = load i16, ptr [[ARRAYIDX_7_3]], align 2
-; CHECK-NEXT:    [[CONV_7_3:%.*]] = zext i16 [[TMP31]] to i32
-; CHECK-NEXT:    [[ADD_7_3:%.*]] = add nuw nsw i32 [[ADD_6_3]], [[CONV_7_3]]
-; CHECK-NEXT:    [[MUL_7_3:%.*]] = mul nuw nsw i32 [[CONV_7_3]], [[CONV_7_3]]
-; CHECK-NEXT:    [[ADD11_7_3:%.*]] = add i32 [[MUL_7_3]], [[ADD11_6_3]]
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i16>, ptr [[ADD_PTR_2]], align 2
 ; CHECK-NEXT:    [[ADD_PTR_3:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_2]], i64 [[IDX_EXT]]
-; CHECK-NEXT:    [[TMP32:%.*]] = load i16, ptr [[ADD_PTR_3]], align 2
-; CHECK-NEXT:    [[CONV_452:%.*]] = zext i16 [[TMP32]] to i32
-; CHECK-NEXT:    [[ADD_453:%.*]] = add nuw nsw i32 [[ADD_7_3]], [[CONV_452]]
-; CHECK-NEXT:    [[MUL_454:%.*]] = mul nuw nsw i32 [[CONV_452]], [[CONV_452]]
-; CHECK-NEXT:    [[ADD11_455:%.*]] = add i32 [[MUL_454]], [[ADD11_7_3]]
-; CHECK-NEXT:    [[ARRAYIDX_1_4:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_3]], i64 1
-; CHECK-NEXT:    [[TMP33:%.*]] = load i16, ptr [[ARRAYIDX_1_4]], align 2
-; CHECK-NEXT:    [[CONV_1_4:%.*]] = zext i16 [[TMP33]] to i32
-; CHECK-NEXT:    [[ADD_1_4:%.*]] = add nuw nsw i32 [[ADD_453]], [[CONV_1_4]]
-; CHECK-NEXT:    [[MUL_1_4:%.*]] = mul nuw nsw i32 [[CONV_1_4]], [[CONV_1_4]]
-; CHECK-NEXT:    [[ADD11_1_4:%.*]] = add i32 [[MUL_1_4]], [[ADD11_455]]
-; CHECK-NEXT:    [[ARRAYIDX_2_4:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_3]], i64 2
-; CHECK-NEXT:    [[TMP34:%.*]] = load i16, ptr [[ARRAYIDX_2_4]], align 2
-; CHECK-NEXT:    [[CONV_2_4:%.*]] = zext i16 [[TMP34]] to i32
-; CHECK-NEXT:    [[ADD_2_4:%.*]] = add nuw nsw i32 [[ADD_1_4]], [[CONV_2_4]]
-; CHECK-NEXT:    [[MUL_2_4:%.*]] = mul nuw nsw i32 [[CONV_2_4]], [[CONV_2_4]]
-; CHECK-NEXT:    [[ADD11_2_4:%.*]] = add i32 [[MUL_2_4]], [[ADD11_1_4]]
-; CHECK-NEXT:    [[ARRAYIDX_3_4:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_3]], i64 3
-; CHECK-NEXT:    [[TMP35:%.*]] = load i16, ptr [[ARRAYIDX_3_4]], align 2
-; CHECK-NEXT:    [[CONV_3_4:%.*]] = zext i16 [[TMP35]] to i32
-; CHECK-NEXT:    [[ADD_3_4:%.*]] = add nuw nsw i32 [[ADD_2_4]], [[CONV_3_4]]
-; CHECK-NEXT:    [[MUL_3_4:%.*]] = mul nuw nsw i32 [[CONV_3_4]], [[CONV_3_4]]
-; CHECK-NEXT:    [[ADD11_3_4:%.*]] = add i32 [[MUL_3_4]], [[ADD11_2_4]]
-; CHECK-NEXT:    [[ARRAYIDX_4_4:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_3]], i64 4
-; CHECK-NEXT:    [[TMP36:%.*]] = load i16, ptr [[ARRAYIDX_4_4]], align 2
-; CHECK-NEXT:    [[CONV_4_4:%.*]] = zext i16 [[TMP36]] to i32
-; CHECK-NEXT:    [[ADD_4_4:%.*]] = add nuw nsw i32 [[ADD_3_4]], [[CONV_4_4]]
-; CHECK-NEXT:    [[MUL_4_4:%.*]] = mul nuw nsw i32 [[CONV_4_4]], [[CONV_4_4]]
-; CHECK-NEXT:    [[ADD11_4_4:%.*]] = add i32 [[MUL_4_4]], [[ADD11_3_4]]
-; CHECK-NEXT:    [[ARRAYIDX_5_4:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_3]], i64 5
-; CHECK-NEXT:    [[TMP37:%.*]] = load i16, ptr [[ARRAYIDX_5_4]], align 2
-; CHECK-NEXT:    [[CONV_5_4:%.*]] = zext i16 [[TMP37]] to i32
-; CHECK-NEXT:    [[ADD_5_4:%.*]] = add nuw nsw i32 [[ADD_4_4]], [[CONV_5_4]]
-; CHECK-NEXT:    [[MUL_5_4:%.*]] = mul nuw nsw i32 [[CONV_5_4]], [[CONV_5_4]]
-; CHECK-NEXT:    [[ADD11_5_4:%.*]] = add i32 [[MUL_5_4]], [[ADD11_4_4]]
-; CHECK-NEXT:    [[ARRAYIDX_6_4:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_3]], i64 6
-; CHECK-NEXT:    [[TMP38:%.*]] = load i16, ptr [[ARRAYIDX_6_4]], align 2
-; CHECK-NEXT:    [[CONV_6_4:%.*]] = zext i16 [[TMP38]] to i32
-; CHECK-NEXT:    [[ADD_6_4:%.*]] = add nuw nsw i32 [[ADD_5_4]], [[CONV_6_4]]
-; CHECK-NEXT:    [[MUL_6_4:%.*]] = mul nuw nsw i32 [[CONV_6_4]], [[CONV_6_4]]
-; CHECK-NEXT:    [[ADD11_6_4:%.*]] = add i32 [[MUL_6_4]], [[ADD11_5_4]]
-; CHECK-NEXT:    [[ARRAYIDX_7_4:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_3]], i64 7
-; CHECK-NEXT:    [[TMP39:%.*]] = load i16, ptr [[ARRAYIDX_7_4]], align 2
-; CHECK-NEXT:    [[CONV_7_4:%.*]] = zext i16 [[TMP39]] to i32
-; CHECK-NEXT:    [[ADD_7_4:%.*]] = add nuw nsw i32 [[ADD_6_4]], [[CONV_7_4]]
-; CHECK-NEXT:    [[MUL_7_4:%.*]] = mul nuw nsw i32 [[CONV_7_4]], [[CONV_7_4]]
-; CHECK-NEXT:    [[ADD11_7_4:%.*]] = add i32 [[MUL_7_4]], [[ADD11_6_4]]
+; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i16>, ptr [[ADD_PTR_3]], align 2
 ; CHECK-NEXT:    [[ADD_PTR_4:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_3]], i64 [[IDX_EXT]]
-; CHECK-NEXT:    [[TMP40:%.*]] = load i16, ptr [[ADD_PTR_4]], align 2
-; CHECK-NEXT:    [[CONV_556:%.*]] = zext i16 [[TMP40]] to i32
-; CHECK-NEXT:    [[ADD_557:%.*]] = add nuw nsw i32 [[ADD_7_4]], [[CONV_556]]
-; CHECK-NEXT:    [[MUL_558:%.*]] = mul nuw nsw i32 [[CONV_556]], [[CONV_556]]
-; CHECK-NEXT:    [[ADD11_559:%.*]] = add i32 [[MUL_558]], [[ADD11_7_4]]
-; CHECK-NEXT:    [[ARRAYIDX_1_5:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_4]], i64 1
-; CHECK-NEXT:    [[TMP41:%.*]] = load i16, ptr [[ARRAYIDX_1_5]], align 2
-; CHECK-NEXT:    [[CONV_1_5:%.*]] = zext i16 [[TMP41]] to i32
-; CHECK-NEXT:    [[ADD_1_5:%.*]] = add nuw nsw i32 [[ADD_557]], [[CONV_1_5]]
-; CHECK-NEXT:    [[MUL_1_5:%.*]] = mul nuw nsw i32 [[CONV_1_5]], [[CONV_1_5]]
-; CHECK-NEXT:    [[ADD11_1_5:%.*]] = add i32 [[MUL_1_5]], [[ADD11_559]]
-; CHECK-NEXT:    [[ARRAYIDX_2_5:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_4]], i64 2
-; CHECK-NEXT:    [[TMP42:%.*]] = load i16, ptr [[ARRAYIDX_2_5]], align 2
-; CHECK-NEXT:    [[CONV_2_5:%.*]] = zext i16 [[TMP42]] to i32
-; CHECK-NEXT:    [[ADD_2_5:%.*]] = add nuw nsw i32 [[ADD_1_5]], [[CONV_2_5]]
-; CHECK-NEXT:    [[MUL_2_5:%.*]] = mul nuw nsw i32 [[CONV_2_5]], [[CONV_2_5]]
-; CHECK-NEXT:    [[ADD11_2_5:%.*]] = add i32 [[MUL_2_5]], [[ADD11_1_5]]
-; CHECK-NEXT:    [[ARRAYIDX_3_5:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_4]], i64 3
-; CHECK-NEXT:    [[TMP43:%.*]] = load i16, ptr [[ARRAYIDX_3_5]], align 2
-; CHECK-NEXT:    [[CONV_3_5:%.*]] = zext i16 [[TMP43]] to i32
-; CHECK-NEXT:    [[ADD_3_5:%.*]] = add nuw nsw i32 [[ADD_2_5]], [[CONV_3_5]]
-; CHECK-NEXT:    [[MUL_3_5:%.*]] = mul nuw nsw i32 [[CONV_3_5]], [[CONV_3_5]]
-; CHECK-NEXT:    [[ADD11_3_5:%.*]] = add i32 [[MUL_3_5]], [[ADD11_2_5]]
-; CHECK-NEXT:    [[ARRAYIDX_4_5:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_4]], i64 4
-; CHECK-NEXT:    [[TMP44:%.*]] = load i16, ptr [[ARRAYIDX_4_5]], align 2
-; CHECK-NEXT:    [[CONV_4_5:%.*]] = zext i16 [[TMP44]] to i32
-; CHECK-NEXT:    [[ADD_4_5:%.*]] = add nuw nsw i32 [[ADD_3_5]], [[CONV_4_5]]
-; CHECK-NEXT:    [[MUL_4_5:%.*]] = mul nuw nsw i32 [[CONV_4_5]], [[CONV_4_5]]
-; CHECK-NEXT:    [[ADD11_4_5:%.*]] = add i32 [[MUL_4_5]], [[ADD11_3_5]]
-; CHECK-NEXT:    [[ARRAYIDX_5_5:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_4]], i64 5
-; CHECK-NEXT:    [[TMP45:%.*]] = load i16, ptr [[ARRAYIDX_5_5]], align 2
-; CHECK-NEXT:    [[CONV_5_5:%.*]] = zext i16 [[TMP45]] to i32
-; CHECK-NEXT:    [[ADD_5_5:%.*]] = add nuw nsw i32 [[ADD_4_5]], [[CONV_5_5]]
-; CHECK-NEXT:    [[MUL_5_5:%.*]] = mul nuw nsw i32 [[CONV_5_5]], [[CONV_5_5]]
-; CHECK-NEXT:    [[ADD11_5_5:%.*]] = add i32 [[MUL_5_5]], [[ADD11_4_5]]
-; CHECK-NEXT:    [[ARRAYIDX_6_5:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_4]], i64 6
-; CHECK-NEXT:    [[TMP46:%.*]] = load i16, ptr [[ARRAYIDX_6_5]], align 2
-; CHECK-NEXT:    [[CONV_6_5:%.*]] = zext i16 [[TMP46]] to i32
-; CHECK-NEXT:    [[ADD_6_5:%.*]] = add nuw nsw i32 [[ADD_5_5]], [[CONV_6_5]]
-; CHECK-NEXT:    [[MUL_6_5:%.*]] = mul nuw nsw i32 [[CONV_6_5]], [[CONV_6_5]]
-; CHECK-NEXT:    [[ADD11_6_5:%.*]] = add i32 [[MUL_6_5]], [[ADD11_5_5]]
-; CHECK-NEXT:    [[ARRAYIDX_7_5:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_4]], i64 7
-; CHECK-NEXT:    [[TMP47:%.*]] = load i16, ptr [[ARRAYIDX_7_5]], align 2
-; CHECK-NEXT:    [[CONV_7_5:%.*]] = zext i16 [[TMP47]] to i32
-; CHECK-NEXT:    [[ADD_7_5:%.*]] = add nuw nsw i32 [[ADD_6_5]], [[CONV_7_5]]
-; CHECK-NEXT:    [[MUL_7_5:%.*]] = mul nuw nsw i32 [[CONV_7_5]], [[CONV_7_5]]
-; CHECK-NEXT:    [[ADD11_7_5:%.*]] = add i32 [[MUL_7_5]], [[ADD11_6_5]]
+; CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i16>, ptr [[ADD_PTR_4]], align 2
 ; CHECK-NEXT:    [[ADD_PTR_5:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_4]], i64 [[IDX_EXT]]
-; CHECK-NEXT:    [[TMP48:%.*]] = load i16, ptr [[ADD_PTR_5]], align 2
-; CHECK-NEXT:    [[CONV_660:%.*]] = zext i16 [[TMP48]] to i32
-; CHECK-NEXT:    [[ADD_661:%.*]] = add nuw nsw i32 [[ADD_7_5]], [[CONV_660]]
-; CHECK-NEXT:    [[MUL_662:%.*]] = mul nuw nsw i32 [[CONV_660]], [[CONV_660]]
-; CHECK-NEXT:    [[ADD11_663:%.*]] = add i32 [[MUL_662]], [[ADD11_7_5]]
-; CHECK-NEXT:    [[ARRAYIDX_1_6:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_5]], i64 1
-; CHECK-NEXT:    [[TMP49:%.*]] = load i16, ptr [[ARRAYIDX_1_6]], align 2
-; CHECK-NEXT:    [[CONV_1_6:%.*]] = zext i16 [[TMP49]] to i32
-; CHECK-NEXT:    [[ADD_1_6:%.*]] = add nuw nsw i32 [[ADD_661]], [[CONV_1_6]]
-; CHECK-NEXT:    [[MUL_1_6:%.*]] = mul nuw nsw i32 [[CONV_1_6]], [[CONV_1_6]]
-; CHECK-NEXT:    [[ADD11_1_6:%.*]] = add i32 [[MUL_1_6]], [[ADD11_663]]
-; CHECK-NEXT:    [[ARRAYIDX_2_6:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_5]], i64 2
-; CHECK-NEXT:    [[TMP50:%.*]] = load i16, ptr [[ARRAYIDX_2_6]], align 2
-; CHECK-NEXT:    [[CONV_2_6:%.*]] = zext i16 [[TMP50]] to i32
-; CHECK-NEXT:    [[ADD_2_6:%.*]] = add nuw nsw i32 [[ADD_1_6]], [[CONV_2_6]]
-; CHECK-NEXT:    [[MUL_2_6:%.*]] = mul nuw nsw i32 [[CONV_2_6]], [[CONV_2_6]]
-; CHECK-NEXT:    [[ADD11_2_6:%.*]] = add i32 [[MUL_2_6]], [[ADD11_1_6]]
-; CHECK-NEXT:    [[ARRAYIDX_3_6:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_5]], i64 3
-; CHECK-NEXT:    [[TMP51:%.*]] = load i16, ptr [[ARRAYIDX_3_6]], align 2
-; CHECK-NEXT:    [[CONV_3_6:%.*]] = zext i16 [[TMP51]] to i32
-; CHECK-NEXT:    [[ADD_3_6:%.*]] = add nuw nsw i32 [[ADD_2_6]], [[CONV_3_6]]
-; CHECK-NEXT:    [[MUL_3_6:%.*]] = mul nuw nsw i32 [[CONV_3_6]], [[CONV_3_6]]
-; CHECK-NEXT:    [[ADD11_3_6:%.*]] = add i32 [[MUL_3_6]], [[ADD11_2_6]]
-; CHECK-NEXT:    [[ARRAYIDX_4_6:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_5]], i64 4
-; CHECK-NEXT:    [[TMP52:%.*]] = load i16, ptr [[ARRAYIDX_4_6]], align 2
-; CHECK-NEXT:    [[CONV_4_6:%.*]] = zext i16 [[TMP52]] to i32
-; CHECK-NEXT:    [[ADD_4_6:%.*]] = add nuw nsw i32 [[ADD_3_6]], [[CONV_4_6]]
-; CHECK-NEXT:    [[MUL_4_6:%.*]] = mul nuw nsw i32 [[CONV_4_6]], [[CONV_4_6]]
-; CHECK-NEXT:    [[ADD11_4_6:%.*]] = add i32 [[MUL_4_6]], [[ADD11_3_6]]
-; CHECK-NEXT:    [[ARRAYIDX_5_6:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_5]], i64 5
-; CHECK-NEXT:    [[TMP53:%.*]] = load i16, ptr [[ARRAYIDX_5_6]], align 2
-; CHECK-NEXT:    [[CONV_5_6:%.*]] = zext i16 [[TMP53]] to i32
-; CHECK-NEXT:    [[ADD_5_6:%.*]] = add nuw nsw i32 [[ADD_4_6]], [[CONV_5_6]]
-; CHECK-NEXT:    [[MUL_5_6:%.*]] = mul nuw nsw i32 [[CONV_5_6]], [[CONV_5_6]]
-; CHECK-NEXT:    [[ADD11_5_6:%.*]] = add i32 [[MUL_5_6]], [[ADD11_4_6]]
-; CHECK-NEXT:    [[ARRAYIDX_6_6:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_5]], i64 6
-; CHECK-NEXT:    [[TMP54:%.*]] = load i16, ptr [[ARRAYIDX_6_6]], align 2
-; CHECK-NEXT:    [[CONV_6_6:%.*]] = zext i16 [[TMP54]] to i32
-; CHECK-NEXT:    [[ADD_6_6:%.*]] = add nuw nsw i32 [[ADD_5_6]], [[CONV_6_6]]
-; CHECK-NEXT:    [[MUL_6_6:%.*]] = mul nuw nsw i32 [[CONV_6_6]], [[CONV_6_6]]
-; CHECK-NEXT:    [[ADD11_6_6:%.*]] = add i32 [[MUL_6_6]], [[ADD11_5_6]]
-; CHECK-NEXT:    [[ARRAYIDX_7_6:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_5]], i64 7
-; CHECK-NEXT:    [[TMP55:%.*]] = load i16, ptr [[ARRAYIDX_7_6]], align 2
-; CHECK-NEXT:    [[CONV_7_6:%.*]] = zext i16 [[TMP55]] to i32
-; CHECK-NEXT:    [[ADD_7_6:%.*]] = add nuw nsw i32 [[ADD_6_6]], [[CONV_7_6]]
-; CHECK-NEXT:    [[MUL_7_6:%.*]] = mul nuw nsw i32 [[CONV_7_6]], [[CONV_7_6]]
-; CHECK-NEXT:    [[ADD11_7_6:%.*]] = add i32 [[MUL_7_6]], [[ADD11_6_6]]
+; CHECK-NEXT:    [[TMP6:%.*]] = load <8 x i16>, ptr [[ADD_PTR_5]], align 2
 ; CHECK-NEXT:    [[ADD_PTR_6:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_5]], i64 [[IDX_EXT]]
-; CHECK-NEXT:    [[TMP56:%.*]] = load i16, ptr [[ADD_PTR_6]], align 2
-; CHECK-NEXT:    [[CONV_764:%.*]] = zext i16 [[TMP56]] to i32
-; CHECK-NEXT:    [[ADD_765:%.*]] = add nuw nsw i32 [[ADD_7_6]], [[CONV_764]]
-; CHECK-NEXT:    [[MUL_766:%.*]] = mul nuw nsw i32 [[CONV_764]], [[CONV_764]]
-; CHECK-NEXT:    [[ADD11_767:%.*]] = add i32 [[MUL_766]], [[ADD11_7_6]]
-; CHECK-NEXT:    [[ARRAYIDX_1_7:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_6]], i64 1
-; CHECK-NEXT:    [[TMP57:%.*]] = load i16, ptr [[ARRAYIDX_1_7]], align 2
-; CHECK-NEXT:    [[CONV_1_7:%.*]] = zext i16 [[TMP57]] to i32
-; CHECK-NEXT:    [[ADD_1_7:%.*]] = add nuw nsw i32 [[ADD_765]], [[CONV_1_7]]
-; CHECK-NEXT:    [[MUL_1_7:%.*]] = mul nuw nsw i32 [[CONV_1_7]], [[CONV_1_7]]
-; CHECK-NEXT:    [[ADD11_1_7:%.*]] = add i32 [[MUL_1_7]], [[ADD11_767]]
-; CHECK-NEXT:    [[ARRAYIDX_2_7:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_6]], i64 2
-; CHECK-NEXT:    [[TMP58:%.*]] = load i16, ptr [[ARRAYIDX_2_7]], align 2
-; CHECK-NEXT:    [[CONV_2_7:%.*]] = zext i16 [[TMP58]] to i32
-; CHECK-NEXT:    [[ADD_2_7:%.*]] = add nuw nsw i32 [[ADD_1_7]], [[CONV_2_7]]
-; CHECK-NEXT:    [[MUL_2_7:%.*]] = mul nuw nsw i32 [[CONV_2_7]], [[CONV_2_7]]
-; CHECK-NEXT:    [[ADD11_2_7:%.*]] = add i32 [[MUL_2_7]], [[ADD11_1_7]]
-; CHECK-NEXT:    [[ARRAYIDX_3_7:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_6]], i64 3
-; CHECK-NEXT:    [[TMP59:%.*]] = load i16, ptr [[ARRAYIDX_3_7]], align 2
-; CHECK-NEXT:    [[CONV_3_7:%.*]] = zext i16 [[TMP59]] to i32
-; CHECK-NEXT:    [[ADD_3_7:%.*]] = add nuw nsw i32 [[ADD_2_7]], [[CONV_3_7]]
-; CHECK-NEXT:    [[MUL_3_7:%.*]] = mul nuw nsw i32 [[CONV_3_7]], [[CONV_3_7]]
-; CHECK-NEXT:    [[ADD11_3_7:%.*]] = add i32 [[MUL_3_7]], [[ADD11_2_7]]
-; CHECK-NEXT:    [[ARRAYIDX_4_7:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_6]], i64 4
-; CHECK-NEXT:    [[TMP60:%.*]] = load i16, ptr [[ARRAYIDX_4_7]], align 2
-; CHECK-NEXT:    [[CONV_4_7:%.*]] = zext i16 [[TMP60]] to i32
-; CHECK-NEXT:    [[ADD_4_7:%.*]] = add nuw nsw i32 [[ADD_3_7]], [[CONV_4_7]]
-; CHECK-NEXT:    [[MUL_4_7:%.*]] = mul nuw nsw i32 [[CONV_4_7]], [[CONV_4_7]]
-; CHECK-NEXT:    [[ADD11_4_7:%.*]] = add i32 [[MUL_4_7]], [[ADD11_3_7]]
-; CHECK-NEXT:    [[ARRAYIDX_5_7:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_6]], i64 5
-; CHECK-NEXT:    [[TMP61:%.*]] = load i16, ptr [[ARRAYIDX_5_7]], align 2
-; CHECK-NEXT:    [[CONV_5_7:%.*]] = zext i16 [[TMP61]] to i32
-; CHECK-NEXT:    [[ADD_5_7:%.*]] = add nuw nsw i32 [[ADD_4_7]], [[CONV_5_7]]
-; CHECK-NEXT:    [[MUL_5_7:%.*]] = mul nuw nsw i32 [[CONV_5_7]], [[CONV_5_7]]
-; CHECK-NEXT:    [[ADD11_5_7:%.*]] = add i32 [[MUL_5_7]], [[ADD11_4_7]]
-; CHECK-NEXT:    [[ARRAYIDX_6_7:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_6]], i64 6
-; CHECK-NEXT:    [[TMP62:%.*]] = load i16, ptr [[ARRAYIDX_6_7]], align 2
-; CHECK-NEXT:    [[CONV_6_7:%.*]] = zext i16 [[TMP62]] to i32
-; CHECK-NEXT:    [[ADD_6_7:%.*]] = add nuw nsw i32 [[ADD_5_7]], [[CONV_6_7]]
-; CHECK-NEXT:    [[MUL_6_7:%.*]] = mul nuw nsw i32 [[CONV_6_7]], [[CONV_6_7]]
-; CHECK-NEXT:    [[ADD11_6_7:%.*]] = add i32 [[MUL_6_7]], [[ADD11_5_7]]
-; CHECK-NEXT:    [[ARRAYIDX_7_7:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_6]], i64 7
-; CHECK-NEXT:    [[TMP63:%.*]] = load i16, ptr [[ARRAYIDX_7_7]], align 2
-; CHECK-NEXT:    [[CONV_7_7:%.*]] = zext i16 [[TMP63]] to i32
-; CHECK-NEXT:    [[ADD_7_7:%.*]] = add nuw nsw i32 [[ADD_6_7]], [[CONV_7_7]]
-; CHECK-NEXT:    [[MUL_7_7:%.*]] = mul nuw nsw i32 [[CONV_7_7]], [[CONV_7_7]]
-; CHECK-NEXT:    [[ADD11_7_7:%.*]] = add i32 [[MUL_7_7]], [[ADD11_6_7]]
+; CHECK-NEXT:    [[TMP7:%.*]] = load <8 x i16>, ptr [[ADD_PTR_6]], align 2
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <64 x i16> [[TMP8]], <64 x i16> [[TMP9]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <64 x i16> [[TMP10]], <64 x i16> [[TMP11]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <64 x i16> [[TMP12]], <64 x i16> [[TMP13]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <64 x i16> [[TMP14]], <64 x i16> [[TMP15]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP18:%.*]] = shufflevector <64 x i16> [[TMP16]], <64 x i16> [[TMP17]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <8 x i16> [[TMP6]], <8 x i16> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP20:%.*]] = shufflevector <64 x i16> [[TMP18]], <64 x i16> [[TMP19]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; CHECK-NEXT:    [[TMP21:%.*]] = shufflevector <8 x i16> [[TMP7]], <8 x i16> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP22:%.*]] = shufflevector <64 x i16> [[TMP20]], <64 x i16> [[TMP21]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71>
+; CHECK-NEXT:    [[TMP23:%.*]] = zext <64 x i16> [[TMP22]] to <64 x i32>
+; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <64 x i32> [[TMP23]], i32 0
+; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <64 x i32> [[TMP23]], i32 1
+; CHECK-NEXT:    [[ADD_1:%.*]] = add nuw nsw i32 [[TMP24]], [[TMP25]]
+; CHECK-NEXT:    [[TMP26:%.*]] = mul nuw nsw <64 x i32> [[TMP23]], [[TMP23]]
+; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <64 x i32> [[TMP23]], i32 2
+; CHECK-NEXT:    [[ADD_2:%.*]] = add nuw nsw i32 [[ADD_1]], [[TMP27]]
+; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <64 x i32> [[TMP23]], i32 3
+; CHECK-NEXT:    [[ADD_3:%.*]] = add nuw nsw i32 [[ADD_2]], [[TMP28]]
+; CHECK-NEXT:    [[TMP29:%.*]] = extractelement <64 x i32> [[TMP23]], i32 4
+; CHECK-NEXT:    [[ADD_4:%.*]] = add nuw nsw i32 [[ADD_3]], [[TMP29]]
+; CHECK-NEXT:    [[TMP30:%.*]] = extractelement <64 x i32> [[TMP23]], i32 5
+; CHECK-NEXT:    [[ADD_5:%.*]] = add nuw nsw i32 [[ADD_4]], [[TMP30]]
+; CHECK-NEXT:    [[TMP31:%.*]] = extractelement <64 x i32> [[TMP23]], i32 6
+; CHECK-NEXT:    [[ADD_6:%.*]] = add nuw nsw i32 [[ADD_5]], [[TMP31]]
+; CHECK-NEXT:    [[TMP32:%.*]] = extractelement <64 x i32> [[TMP23]], i32 7
+; CHECK-NEXT:    [[ADD_7:%.*]] = add nuw nsw i32 [[ADD_6]], [[TMP32]]
+; CHECK-NEXT:    [[TMP33:%.*]] = extractelement <64 x i32> [[TMP23]], i32 8
+; CHECK-NEXT:    [[ADD_141:%.*]] = add nuw nsw i32 [[ADD_7]], [[TMP33]]
+; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <64 x i32> [[TMP23]], i32 9
+; CHECK-NEXT:    [[ADD_1_1:%.*]] = add nuw nsw i32 [[ADD_141]], [[TMP34]]
+; CHECK-NEXT:    [[TMP35:%.*]] = extractelement <64 x i32> [[TMP23]], i32 10
+; CHECK-NEXT:    [[ADD_2_1:%.*]] = add nuw nsw i32 [[ADD_1_1]], [[TMP35]]
+; CHECK-NEXT:    [[TMP36:%.*]] = extractelement <64 x i32> [[TMP23]], i32 11
+; CHECK-NEXT:    [[ADD_3_1:%.*]] = add nuw nsw i32 [[ADD_2_1]], [[TMP36]]
+; CHECK-NEXT:    [[TMP37:%.*]] = extractelement <64 x i32> [[TMP23]], i32 12
+; CHECK-NEXT:    [[ADD_4_1:%.*]] = add nuw nsw i32 [[ADD_3_1]], [[TMP37]]
+; CHECK-NEXT:    [[TMP38:%.*]] = extractelement <64 x i32> [[TMP23]], i32 13
+; CHECK-NEXT:    [[ADD_5_1:%.*]] = add nuw nsw i32 [[ADD_4_1]], [[TMP38]]
+; CHECK-NEXT:    [[TMP39:%.*]] = extractelement <64 x i32> [[TMP23]], i32 14
+; CHECK-NEXT:    [[ADD_6_1:%.*]] = add nuw nsw i32 [[ADD_5_1]], [[TMP39]]
+; CHECK-NEXT:    [[TMP40:%.*]] = extractelement <64 x i32> [[TMP23]], i32 15
+; CHECK-NEXT:    [[ADD_7_1:%.*]] = add nuw nsw i32 [[ADD_6_1]], [[TMP40]]
+; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <64 x i32> [[TMP23]], i32 16
+; CHECK-NEXT:    [[ADD_245:%.*]] = add nuw nsw i32 [[ADD_7_1]], [[TMP41]]
+; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <64 x i32> [[TMP23]], i32 17
+; CHECK-NEXT:    [[ADD_1_2:%.*]] = add nuw nsw i32 [[ADD_245]], [[TMP42]]
+; CHECK-NEXT:    [[TMP43:%.*]] = extractelement <64 x i32> [[TMP23]], i32 18
+; CHECK-NEXT:    [[ADD_2_2:%.*]] = add nuw nsw i32 [[ADD_1_2]], [[TMP43]]
+; CHECK-NEXT:    [[TMP44:%.*]] = extractelement <64 x i32> [[TMP23]], i32 19
+; CHECK-NEXT:    [[ADD_3_2:%.*]] = add nuw nsw i32 [[ADD_2_2]], [[TMP44]]
+; CHECK-NEXT:    [[TMP45:%.*]] = extractelement <64 x i32> [[TMP23]], i32 20
+; CHECK-NEXT:    [[ADD_4_2:%.*]] = add nuw nsw i32 [[ADD_3_2]], [[TMP45]]
+; CHECK-NEXT:    [[TMP46:%.*]] = extractelement <64 x i32> [[TMP23]], i32 21
+; CHECK-NEXT:    [[ADD_5_2:%.*]] = add nuw nsw i32 [[ADD_4_2]], [[TMP46]]
+; CHECK-NEXT:    [[TMP47:%.*]] = extractelement <64 x i32> [[TMP23]], i32 22
+; CHECK-NEXT:    [[ADD_6_2:%.*]] = add nuw nsw i32 [[ADD_5_2]], [[TMP47]]
+; CHECK-NEXT:    [[TMP48:%.*]] = extractelement <64 x i32> [[TMP23]], i32 23
+; CHECK-NEXT:    [[ADD_7_2:%.*]] = add nuw nsw i32 [[ADD_6_2]], [[TMP48]]
+; CHECK-NEXT:    [[TMP49:%.*]] = extractelement <64 x i32> [[TMP23]], i32 24
+; CHECK-NEXT:    [[ADD_349:%.*]] = add nuw nsw i32 [[ADD_7_2]], [[TMP49]]
+; CHECK-NEXT:    [[TMP50:%.*]] = extractelement <64 x i32> [[TMP23]], i32 25
+; CHECK-NEXT:    [[ADD_1_3:%.*]] = add nuw nsw i32 [[ADD_349]], [[TMP50]]
+; CHECK-NEXT:    [[TMP51:%.*]] = extractelement <64 x i32> [[TMP23]], i32 26
+; CHECK-NEXT:    [[ADD_2_3:%.*]] = add nuw nsw i32 [[ADD_1_3]], [[TMP51]]
+; CHECK-NEXT:    [[TMP52:%.*]] = extractelement <64 x i32> [[TMP23]], i32 27
+; CHECK-NEXT:    [[ADD_3_3:%.*]] = add nuw nsw i32 [[ADD_2_3]], [[TMP52]]
+; CHECK-NEXT:    [[TMP53:%.*]] = extractelement <64 x i32> [[TMP23]], i32 28
+; CHECK-NEXT:    [[ADD_4_3:%.*]] = add nuw nsw i32 [[ADD_3_3]], [[TMP53]]
+; CHECK-NEXT:    [[TMP54:%.*]] = extractelement <64 x i32> [[TMP23]], i32 29
+; CHECK-NEXT:    [[ADD_5_3:%.*]] = add nuw nsw i32 [[ADD_4_3]], [[TMP54]]
+; CHECK-NEXT:    [[TMP55:%.*]] = extractelement <64 x i32> [[TMP23]], i32 30
+; CHECK-NEXT:    [[ADD_6_3:%.*]] = add nuw nsw i32 [[ADD_5_3]], [[TMP55]]
+; CHECK-NEXT:    [[TMP56:%.*]] = extractelement <64 x i32> [[TMP23]], i32 31
+; CHECK-NEXT:    [[ADD_7_3:%.*]] = add nuw nsw i32 [[ADD_6_3]], [[TMP56]]
+; CHECK-NEXT:    [[TMP57:%.*]] = extractelement <64 x i32> [[TMP23]], i32 32
+; CHECK-NEXT:    [[ADD_453:%.*]] = add nuw nsw i32 [[ADD_7_3]], [[TMP57]]
+; CHECK-NEXT:    [[TMP58:%.*]] = extractelement <64 x i32> [[TMP23]], i32 33
+; CHECK-NEXT:    [[ADD_1_4:%.*]] = add nuw nsw i32 [[ADD_453]], [[TMP58]]
+; CHECK-NEXT:    [[TMP59:%.*]] = extractelement <64 x i32> [[TMP23]], i32 34
+; CHECK-NEXT:    [[ADD_2_4:%.*]] = add nuw nsw i32 [[ADD_1_4]], [[TMP59]]
+; CHECK-NEXT:    [[TMP60:%.*]] = extractelement <64 x i32> [[TMP23]], i32 35
+; CHECK-NEXT:    [[ADD_3_4:%.*]] = add nuw nsw i32 [[ADD_2_4]], [[TMP60]]
+; CHECK-NEXT:    [[TMP61:%.*]] = extractelement <64 x i32> [[TMP23]], i32 36
+; CHECK-NEXT:    [[ADD_4_4:%.*]] = add nuw nsw i32 [[ADD_3_4]], [[TMP61]]
+; CHECK-NEXT:    [[TMP62:%.*]] = extractelement <64 x i32> [[TMP23]], i32 37
+; CHECK-NEXT:    [[ADD_5_4:%.*]] = add nuw nsw i32 [[ADD_4_4]], [[TMP62]]
+; CHECK-NEXT:    [[TMP63:%.*]] = extractelement <64 x i32> [[TMP23]], i32 38
+; CHECK-NEXT:    [[ADD_6_4:%.*]] = add nuw nsw i32 [[ADD_5_4]], [[TMP63]]
+; CHECK-NEXT:    [[TMP64:%.*]] = extractelement <64 x i32> [[TMP23]], i32 39
+; CHECK-NEXT:    [[ADD_7_4:%.*]] = add nuw nsw i32 [[ADD_6_4]], [[TMP64]]
+; CHECK-NEXT:    [[TMP65:%.*]] = extractelement <64 x i32> [[TMP23]], i32 40
+; CHECK-NEXT:    [[ADD_557:%.*]] = add nuw nsw i32 [[ADD_7_4]], [[TMP65]]
+; CHECK-NEXT:    [[TMP66:%.*]] = extractelement <64 x i32> [[TMP23]], i32 41
+; CHECK-NEXT:    [[ADD_1_5:%.*]] = add nuw nsw i32 [[ADD_557]], [[TMP66]]
+; CHECK-NEXT:    [[TMP67:%.*]] = extractelement <64 x i32> [[TMP23]], i32 42
+; CHECK-NEXT:    [[ADD_2_5:%.*]] = add nuw nsw i32 [[ADD_1_5]], [[TMP67]]
+; CHECK-NEXT:    [[TMP68:%.*]] = extractelement <64 x i32> [[TMP23]], i32 43
+; CHECK-NEXT:    [[ADD_3_5:%.*]] = add nuw nsw i32 [[ADD_2_5]], [[TMP68]]
+; CHECK-NEXT:    [[TMP69:%.*]] = extractelement <64 x i32> [[TMP23]], i32 44
+; CHECK-NEXT:    [[ADD_4_5:%.*]] = add nuw nsw i32 [[ADD_3_5]], [[TMP69]]
+; CHECK-NEXT:    [[TMP70:%.*]] = extractelement <64 x i32> [[TMP23]], i32 45
+; CHECK-NEXT:    [[ADD_5_5:%.*]] = add nuw nsw i32 [[ADD_4_5]], [[TMP70]]
+; CHECK-NEXT:    [[TMP71:%.*]] = extractelement <64 x i32> [[TMP23]], i32 46
+; CHECK-NEXT:    [[ADD_6_5:%.*]] = add nuw nsw i32 [[ADD_5_5]], [[TMP71]]
+; CHECK-NEXT:    [[TMP72:%.*]] = extractelement <64 x i32> [[TMP23]], i32 47
+; CHECK-NEXT:    [[ADD_7_5:%.*]] = add nuw nsw i32 [[ADD_6_5]], [[TMP72]]
+; CHECK-NEXT:    [[TMP73:%.*]] = extractelement <64 x i32> [[TMP23]], i32 48
+; CHECK-NEXT:    [[ADD_661:%.*]] = add nuw nsw i32 [[ADD_7_5]], [[TMP73]]
+; CHECK-NEXT:    [[TMP74:%.*]] = extractelement <64 x i32> [[TMP23]], i32 49
+; CHECK-NEXT:    [[ADD_1_6:%.*]] = add nuw nsw i32 [[ADD_661]], [[TMP74]]
+; CHECK-NEXT:    [[TMP75:%.*]] = extractelement <64 x i32> [[TMP23]], i32 50
+; CHECK-NEXT:    [[ADD_2_6:%.*]] = add nuw nsw i32 [[ADD_1_6]], [[TMP75]]
+; CHECK-NEXT:    [[TMP76:%.*]] = extractelement <64 x i32> [[TMP23]], i32 51
+; CHECK-NEXT:    [[ADD_3_6:%.*]] = add nuw nsw i32 [[ADD_2_6]], [[TMP76]]
+; CHECK-NEXT:    [[TMP77:%.*]] = extractelement <64 x i32> [[TMP23]], i32 52
+; CHECK-NEXT:    [[ADD_4_6:%.*]] = add nuw nsw i32 [[ADD_3_6]], [[TMP77]]
+; CHECK-NEXT:    [[TMP78:%.*]] = extractelement <64 x i32> [[TMP23]], i32 53
+; CHECK-NEXT:    [[ADD_5_6:%.*]] = add nuw nsw i32 [[ADD_4_6]], [[TMP78]]
+; CHECK-NEXT:    [[TMP79:%.*]] = extractelement <64 x i32> [[TMP23]], i32 54
+; CHECK-NEXT:    [[ADD_6_6:%.*]] = add nuw nsw i32 [[ADD_5_6]], [[TMP79]]
+; CHECK-NEXT:    [[TMP80:%.*]] = extractelement <64 x i32> [[TMP23]], i32 55
+; CHECK-NEXT:    [[ADD_7_6:%.*]] = add nuw nsw i32 [[ADD_6_6]], [[TMP80]]
+; CHECK-NEXT:    [[TMP81:%.*]] = extractelement <64 x i32> [[TMP23]], i32 56
+; CHECK-NEXT:    [[ADD_765:%.*]] = add nuw nsw i32 [[ADD_7_6]], [[TMP81]]
+; CHECK-NEXT:    [[TMP82:%.*]] = extractelement <64 x i32> [[TMP23]], i32 57
+; CHECK-NEXT:    [[ADD_1_7:%.*]] = add nuw nsw i32 [[ADD_765]], [[TMP82]]
+; CHECK-NEXT:    [[TMP83:%.*]] = extractelement <64 x i32> [[TMP23]], i32 58
+; CHECK-NEXT:    [[ADD_2_7:%.*]] = add nuw nsw i32 [[ADD_1_7]], [[TMP83]]
+; CHECK-NEXT:    [[TMP84:%.*]] = extractelement <64 x i32> [[TMP23]], i32 59
+; CHECK-NEXT:    [[ADD_3_7:%.*]] = add nuw nsw i32 [[ADD_2_7]], [[TMP84]]
+; CHECK-NEXT:    [[TMP85:%.*]] = extractelement <64 x i32> [[TMP23]], i32 60
+; CHECK-NEXT:    [[ADD_4_7:%.*]] = add nuw nsw i32 [[ADD_3_7]], [[TMP85]]
+; CHECK-NEXT:    [[TMP86:%.*]] = extractelement <64 x i32> [[TMP23]], i32 61
+; CHECK-NEXT:    [[ADD_5_7:%.*]] = add nuw nsw i32 [[ADD_4_7]], [[TMP86]]
+; CHECK-NEXT:    [[TMP87:%.*]] = extractelement <64 x i32> [[TMP23]], i32 62
+; CHECK-NEXT:    [[ADD_6_7:%.*]] = add nuw nsw i32 [[ADD_5_7]], [[TMP87]]
+; CHECK-NEXT:    [[TMP88:%.*]] = extractelement <64 x i32> [[TMP23]], i32 63
+; CHECK-NEXT:    [[ADD_7_7:%.*]] = add nuw nsw i32 [[ADD_6_7]], [[TMP88]]
+; CHECK-NEXT:    [[TMP89:%.*]] = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> [[TMP26]])
 ; CHECK-NEXT:    [[CONV15:%.*]] = zext i32 [[ADD_7_7]] to i64
-; CHECK-NEXT:    [[CONV16:%.*]] = zext i32 [[ADD11_7_7]] to i64
+; CHECK-NEXT:    [[CONV16:%.*]] = zext i32 [[TMP89]] to i64
 ; CHECK-NEXT:    [[SHL:%.*]] = shl nuw i64 [[CONV16]], 32
 ; CHECK-NEXT:    [[ADD17:%.*]] = or i64 [[SHL]], [[CONV15]]
 ; CHECK-NEXT:    ret i64 [[ADD17]]
@@ -798,111 +577,23 @@ define i64 @looped(ptr nocapture noundef readonly %p, i32 noundef %st) {
 ; CHECK-NEXT:    br label [[FOR_COND1_PREHEADER:%.*]]
 ; CHECK:       for.cond1.preheader:
 ; CHECK-NEXT:    [[Y_038:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC13:%.*]], [[FOR_COND1_PREHEADER]] ]
-; CHECK-NEXT:    [[SQ_037:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[ADD11_15:%.*]], [[FOR_COND1_PREHEADER]] ]
-; CHECK-NEXT:    [[SM_036:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[ADD_15:%.*]], [[FOR_COND1_PREHEADER]] ]
+; CHECK-NEXT:    [[SQ_037:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[OP_RDX:%.*]], [[FOR_COND1_PREHEADER]] ]
+; CHECK-NEXT:    [[SM_036:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[OP_RDX1:%.*]], [[FOR_COND1_PREHEADER]] ]
 ; CHECK-NEXT:    [[P_ADDR_035:%.*]] = phi ptr [ [[P:%.*]], [[ENTRY]] ], [ [[ADD_PTR:%.*]], [[FOR_COND1_PREHEADER]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[P_ADDR_035]], align 2
-; CHECK-NEXT:    [[CONV:%.*]] = zext i16 [[TMP0]] to i32
-; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[SM_036]], [[CONV]]
-; CHECK-NEXT:    [[MUL:%.*]] = mul nuw nsw i32 [[CONV]], [[CONV]]
-; CHECK-NEXT:    [[ADD11:%.*]] = add i32 [[MUL]], [[SQ_037]]
-; CHECK-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 1
-; CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[ARRAYIDX_1]], align 2
-; CHECK-NEXT:    [[CONV_1:%.*]] = zext i16 [[TMP1]] to i32
-; CHECK-NEXT:    [[ADD_1:%.*]] = add i32 [[ADD]], [[CONV_1]]
-; CHECK-NEXT:    [[MUL_1:%.*]] = mul nuw nsw i32 [[CONV_1]], [[CONV_1]]
-; CHECK-NEXT:    [[ADD11_1:%.*]] = add i32 [[MUL_1]], [[ADD11]]
-; CHECK-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 2
-; CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr [[ARRAYIDX_2]], align 2
-; CHECK-NEXT:    [[CONV_2:%.*]] = zext i16 [[TMP2]] to i32
-; CHECK-NEXT:    [[ADD_2:%.*]] = add i32 [[ADD_1]], [[CONV_2]]
-; CHECK-NEXT:    [[MUL_2:%.*]] = mul nuw nsw i32 [[CONV_2]], [[CONV_2]]
-; CHECK-NEXT:    [[ADD11_2:%.*]] = add i32 [[MUL_2]], [[ADD11_1]]
-; CHECK-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 3
-; CHECK-NEXT:    [[TMP3:%.*]] = load i16, ptr [[ARRAYIDX_3]], align 2
-; CHECK-NEXT:    [[CONV_3:%.*]] = zext i16 [[TMP3]] to i32
-; CHECK-NEXT:    [[ADD_3:%.*]] = add i32 [[ADD_2]], [[CONV_3]]
-; CHECK-NEXT:    [[MUL_3:%.*]] = mul nuw nsw i32 [[CONV_3]], [[CONV_3]]
-; CHECK-NEXT:    [[ADD11_3:%.*]] = add i32 [[MUL_3]], [[ADD11_2]]
-; CHECK-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 4
-; CHECK-NEXT:    [[TMP4:%.*]] = load i16, ptr [[ARRAYIDX_4]], align 2
-; CHECK-NEXT:    [[CONV_4:%.*]] = zext i16 [[TMP4]] to i32
-; CHECK-NEXT:    [[ADD_4:%.*]] = add i32 [[ADD_3]], [[CONV_4]]
-; CHECK-NEXT:    [[MUL_4:%.*]] = mul nuw nsw i32 [[CONV_4]], [[CONV_4]]
-; CHECK-NEXT:    [[ADD11_4:%.*]] = add i32 [[MUL_4]], [[ADD11_3]]
-; CHECK-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 5
-; CHECK-NEXT:    [[TMP5:%.*]] = load i16, ptr [[ARRAYIDX_5]], align 2
-; CHECK-NEXT:    [[CONV_5:%.*]] = zext i16 [[TMP5]] to i32
-; CHECK-NEXT:    [[ADD_5:%.*]] = add i32 [[ADD_4]], [[CONV_5]]
-; CHECK-NEXT:    [[MUL_5:%.*]] = mul nuw nsw i32 [[CONV_5]], [[CONV_5]]
-; CHECK-NEXT:    [[ADD11_5:%.*]] = add i32 [[MUL_5]], [[ADD11_4]]
-; CHECK-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 6
-; CHECK-NEXT:    [[TMP6:%.*]] = load i16, ptr [[ARRAYIDX_6]], align 2
-; CHECK-NEXT:    [[CONV_6:%.*]] = zext i16 [[TMP6]] to i32
-; CHECK-NEXT:    [[ADD_6:%.*]] = add i32 [[ADD_5]], [[CONV_6]]
-; CHECK-NEXT:    [[MUL_6:%.*]] = mul nuw nsw i32 [[CONV_6]], [[CONV_6]]
-; CHECK-NEXT:    [[ADD11_6:%.*]] = add i32 [[MUL_6]], [[ADD11_5]]
-; CHECK-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 7
-; CHECK-NEXT:    [[TMP7:%.*]] = load i16, ptr [[ARRAYIDX_7]], align 2
-; CHECK-NEXT:    [[CONV_7:%.*]] = zext i16 [[TMP7]] to i32
-; CHECK-NEXT:    [[ADD_7:%.*]] = add i32 [[ADD_6]], [[CONV_7]]
-; CHECK-NEXT:    [[MUL_7:%.*]] = mul nuw nsw i32 [[CONV_7]], [[CONV_7]]
-; CHECK-NEXT:    [[ADD11_7:%.*]] = add i32 [[MUL_7]], [[ADD11_6]]
-; CHECK-NEXT:    [[ARRAYIDX_8:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 8
-; CHECK-NEXT:    [[TMP8:%.*]] = load i16, ptr [[ARRAYIDX_8]], align 2
-; CHECK-NEXT:    [[CONV_8:%.*]] = zext i16 [[TMP8]] to i32
-; CHECK-NEXT:    [[ADD_8:%.*]] = add i32 [[ADD_7]], [[CONV_8]]
-; CHECK-NEXT:    [[MUL_8:%.*]] = mul nuw nsw i32 [[CONV_8]], [[CONV_8]]
-; CHECK-NEXT:    [[ADD11_8:%.*]] = add i32 [[MUL_8]], [[ADD11_7]]
-; CHECK-NEXT:    [[ARRAYIDX_9:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 9
-; CHECK-NEXT:    [[TMP9:%.*]] = load i16, ptr [[ARRAYIDX_9]], align 2
-; CHECK-NEXT:    [[CONV_9:%.*]] = zext i16 [[TMP9]] to i32
-; CHECK-NEXT:    [[ADD_9:%.*]] = add i32 [[ADD_8]], [[CONV_9]]
-; CHECK-NEXT:    [[MUL_9:%.*]] = mul nuw nsw i32 [[CONV_9]], [[CONV_9]]
-; CHECK-NEXT:    [[ADD11_9:%.*]] = add i32 [[MUL_9]], [[ADD11_8]]
-; CHECK-NEXT:    [[ARRAYIDX_10:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 10
-; CHECK-NEXT:    [[TMP10:%.*]] = load i16, ptr [[ARRAYIDX_10]], align 2
-; CHECK-NEXT:    [[CONV_10:%.*]] = zext i16 [[TMP10]] to i32
-; CHECK-NEXT:    [[ADD_10:%.*]] = add i32 [[ADD_9]], [[CONV_10]]
-; CHECK-NEXT:    [[MUL_10:%.*]] = mul nuw nsw i32 [[CONV_10]], [[CONV_10]]
-; CHECK-NEXT:    [[ADD11_10:%.*]] = add i32 [[MUL_10]], [[ADD11_9]]
-; CHECK-NEXT:    [[ARRAYIDX_11:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 11
-; CHECK-NEXT:    [[TMP11:%.*]] = load i16, ptr [[ARRAYIDX_11]], align 2
-; CHECK-NEXT:    [[CONV_11:%.*]] = zext i16 [[TMP11]] to i32
-; CHECK-NEXT:    [[ADD_11:%.*]] = add i32 [[ADD_10]], [[CONV_11]]
-; CHECK-NEXT:    [[MUL_11:%.*]] = mul nuw nsw i32 [[CONV_11]], [[CONV_11]]
-; CHECK-NEXT:    [[ADD11_11:%.*]] = add i32 [[MUL_11]], [[ADD11_10]]
-; CHECK-NEXT:    [[ARRAYIDX_12:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 12
-; CHECK-NEXT:    [[TMP12:%.*]] = load i16, ptr [[ARRAYIDX_12]], align 2
-; CHECK-NEXT:    [[CONV_12:%.*]] = zext i16 [[TMP12]] to i32
-; CHECK-NEXT:    [[ADD_12:%.*]] = add i32 [[ADD_11]], [[CONV_12]]
-; CHECK-NEXT:    [[MUL_12:%.*]] = mul nuw nsw i32 [[CONV_12]], [[CONV_12]]
-; CHECK-NEXT:    [[ADD11_12:%.*]] = add i32 [[MUL_12]], [[ADD11_11]]
-; CHECK-NEXT:    [[ARRAYIDX_13:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 13
-; CHECK-NEXT:    [[TMP13:%.*]] = load i16, ptr [[ARRAYIDX_13]], align 2
-; CHECK-NEXT:    [[CONV_13:%.*]] = zext i16 [[TMP13]] to i32
-; CHECK-NEXT:    [[ADD_13:%.*]] = add i32 [[ADD_12]], [[CONV_13]]
-; CHECK-NEXT:    [[MUL_13:%.*]] = mul nuw nsw i32 [[CONV_13]], [[CONV_13]]
-; CHECK-NEXT:    [[ADD11_13:%.*]] = add i32 [[MUL_13]], [[ADD11_12]]
-; CHECK-NEXT:    [[ARRAYIDX_14:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 14
-; CHECK-NEXT:    [[TMP14:%.*]] = load i16, ptr [[ARRAYIDX_14]], align 2
-; CHECK-NEXT:    [[CONV_14:%.*]] = zext i16 [[TMP14]] to i32
-; CHECK-NEXT:    [[ADD_14:%.*]] = add i32 [[ADD_13]], [[CONV_14]]
-; CHECK-NEXT:    [[MUL_14:%.*]] = mul nuw nsw i32 [[CONV_14]], [[CONV_14]]
-; CHECK-NEXT:    [[ADD11_14:%.*]] = add i32 [[MUL_14]], [[ADD11_13]]
-; CHECK-NEXT:    [[ARRAYIDX_15:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 15
-; CHECK-NEXT:    [[TMP15:%.*]] = load i16, ptr [[ARRAYIDX_15]], align 2
-; CHECK-NEXT:    [[CONV_15:%.*]] = zext i16 [[TMP15]] to i32
-; CHECK-NEXT:    [[ADD_15]] = add i32 [[ADD_14]], [[CONV_15]]
-; CHECK-NEXT:    [[MUL_15:%.*]] = mul nuw nsw i32 [[CONV_15]], [[CONV_15]]
-; CHECK-NEXT:    [[ADD11_15]] = add i32 [[MUL_15]], [[ADD11_14]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <16 x i16>, ptr [[P_ADDR_035]], align 2
+; CHECK-NEXT:    [[TMP1:%.*]] = zext <16 x i16> [[TMP0]] to <16 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = mul nuw nsw <16 x i32> [[TMP1]], [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP1]])
+; CHECK-NEXT:    [[OP_RDX1]] = add i32 [[TMP3]], [[SM_036]]
+; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP2]])
+; CHECK-NEXT:    [[OP_RDX]] = add i32 [[TMP4]], [[SQ_037]]
 ; CHECK-NEXT:    [[ADD_PTR]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 [[IDX_EXT]]
 ; CHECK-NEXT:    [[INC13]] = add nuw nsw i32 [[Y_038]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC13]], 16
 ; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND1_PREHEADER]]
 ; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    [[CONV15:%.*]] = zext i32 [[ADD_15]] to i64
-; CHECK-NEXT:    [[CONV16:%.*]] = zext i32 [[ADD11_15]] to i64
+; CHECK-NEXT:    [[CONV15:%.*]] = zext i32 [[OP_RDX1]] to i64
+; CHECK-NEXT:    [[CONV16:%.*]] = zext i32 [[OP_RDX]] to i64
 ; CHECK-NEXT:    [[SHL:%.*]] = shl nuw i64 [[CONV16]], 32
 ; CHECK-NEXT:    [[ADD17:%.*]] = or i64 [[SHL]], [[CONV15]]
 ; CHECK-NEXT:    ret i64 [[ADD17]]

diff  --git a/llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll
index 4214ecf0bb7871..0b26c53ca45030 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll
@@ -206,22 +206,22 @@ define void @slp_profitable_missing_fmf_nnans_only(ptr %A, ptr %B) {
 define float @slp_not_profitable_in_loop(float %x, ptr %A) {
 ; CHECK-LABEL: @slp_not_profitable_in_loop(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[GEP_A_1:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 1
-; CHECK-NEXT:    [[L_0:%.*]] = load float, ptr [[GEP_A_1]], align 4
-; CHECK-NEXT:    [[GEP_A_2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 2
+; CHECK-NEXT:    [[GEP_A_2:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 2
 ; CHECK-NEXT:    [[L_1:%.*]] = load float, ptr [[GEP_A_2]], align 4
-; CHECK-NEXT:    [[L_2:%.*]] = load float, ptr [[A]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x float>, ptr [[A]], align 4
 ; CHECK-NEXT:    [[L_3:%.*]] = load float, ptr [[A]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> <float poison, float 3.000000e+00>, float [[X:%.*]], i32 0
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
 ; CHECK-NEXT:    [[RED:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[RED_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[MUL11:%.*]] = fmul fast float 3.000000e+00, [[L_0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP0]]
 ; CHECK-NEXT:    [[MUL12:%.*]] = fmul fast float 3.000000e+00, [[L_1]]
-; CHECK-NEXT:    [[MUL14:%.*]] = fmul fast float [[X:%.*]], [[L_2]]
 ; CHECK-NEXT:    [[MUL16:%.*]] = fmul fast float 3.000000e+00, [[L_3]]
-; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[MUL12]], [[MUL11]]
-; CHECK-NEXT:    [[ADD13:%.*]] = fadd fast float [[ADD]], [[MUL14]]
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
+; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[MUL12]], [[TMP3]]
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 0
+; CHECK-NEXT:    [[ADD13:%.*]] = fadd fast float [[ADD]], [[TMP4]]
 ; CHECK-NEXT:    [[RED_NEXT]] = fadd fast float [[ADD13]], [[MUL16]]
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i64 [[IV]], 10

diff  --git a/llvm/test/Transforms/VectorCombine/AArch64/load-extractelement-scalarization.ll b/llvm/test/Transforms/VectorCombine/AArch64/load-extractelement-scalarization.ll
index 8ea96918335c78..1ed1476f6e2345 100644
--- a/llvm/test/Transforms/VectorCombine/AArch64/load-extractelement-scalarization.ll
+++ b/llvm/test/Transforms/VectorCombine/AArch64/load-extractelement-scalarization.ll
@@ -505,9 +505,9 @@ define i31 @load_with_non_power_of_2_element_type(ptr %x) {
 define i32 @load_multiple_extracts_with_constant_idx(ptr %x) {
 ; CHECK-LABEL: @load_multiple_extracts_with_constant_idx(
 ; CHECK-NEXT:    [[LV:%.*]] = load <4 x i32>, ptr [[X:%.*]], align 16
-; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x i32> [[LV]], <4 x i32> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP1:%.*]] = add <4 x i32> [[LV]], [[SHIFT]]
-; CHECK-NEXT:    [[RES:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0
+; CHECK-NEXT:    [[E_0:%.*]] = extractelement <4 x i32> [[LV]], i32 0
+; CHECK-NEXT:    [[E_1:%.*]] = extractelement <4 x i32> [[LV]], i32 1
+; CHECK-NEXT:    [[RES:%.*]] = add i32 [[E_0]], [[E_1]]
 ; CHECK-NEXT:    ret i32 [[RES]]
 ;
   %lv = load <4 x i32>, ptr %x
@@ -521,10 +521,9 @@ define i32 @load_multiple_extracts_with_constant_idx(ptr %x) {
 ; because the vector large vector requires 2 vector registers.
 define i32 @load_multiple_extracts_with_constant_idx_profitable(ptr %x) {
 ; CHECK-LABEL: @load_multiple_extracts_with_constant_idx_profitable(
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <8 x i32>, ptr [[X:%.*]], i32 0, i32 0
-; CHECK-NEXT:    [[E_0:%.*]] = load i32, ptr [[TMP1]], align 16
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds <8 x i32>, ptr [[X]], i32 0, i32 6
-; CHECK-NEXT:    [[E_1:%.*]] = load i32, ptr [[TMP2]], align 8
+; CHECK-NEXT:    [[LV:%.*]] = load <8 x i32>, ptr [[X:%.*]], align 16
+; CHECK-NEXT:    [[E_0:%.*]] = extractelement <8 x i32> [[LV]], i32 0
+; CHECK-NEXT:    [[E_1:%.*]] = extractelement <8 x i32> [[LV]], i32 6
 ; CHECK-NEXT:    [[RES:%.*]] = add i32 [[E_0]], [[E_1]]
 ; CHECK-NEXT:    ret i32 [[RES]]
 ;