[llvm] d137f12 - [X86][LV] X86 does *not* prefer vectorized addressing

Roman Lebedev via llvm-commits llvm-commits at lists.llvm.org
Sat Oct 16 02:39:58 PDT 2021


Author: Roman Lebedev
Date: 2021-10-16T12:32:18+03:00
New Revision: d137f1288e2c2169b53a1baef0d5cd94a4bb3999

URL: https://github.com/llvm/llvm-project/commit/d137f1288e2c2169b53a1baef0d5cd94a4bb3999
DIFF: https://github.com/llvm/llvm-project/commit/d137f1288e2c2169b53a1baef0d5cd94a4bb3999.diff

LOG: [X86][LV] X86 does *not* prefer vectorized addressing

This is another attempt to start untangling the ball of threads around gathers.
There's the `TTI::prefersVectorizedAddressing()` hook, which confusingly defaults to `true`
and tells LV to try to vectorize the addresses that lead to loads.
However, X86 generally cannot deal with vectors of addresses:
the only instructions that support them are GATHER/SCATTER,
and even those aren't available until AVX2, and aren't really usable until AVX512.

This specializes the hook for X86 to return true only if we have AVX512, or AVX2 with fast gather.
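
For reference, this is roughly the shape of the new overrides (a condensed sketch mirroring the
diff below; the authoritative definitions are in llvm/lib/Target/X86/X86TargetTransformInfo.cpp):

  bool X86TTIImpl::supportsGather() const {
    // Some CPUs have better gather performance than others.
    return ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2());
  }

  bool X86TTIImpl::prefersVectorizedAddressing() const {
    // Only ask LV to vectorize addressing when a reasonably fast gather exists.
    return supportsGather();
  }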

Reviewed By: RKSimon

Differential Revision: https://reviews.llvm.org/D111546

Added: 
    

Modified: 
    llvm/lib/Target/X86/X86TargetTransformInfo.cpp
    llvm/lib/Target/X86/X86TargetTransformInfo.h
    llvm/test/Analysis/CostModel/X86/gather-i16-with-i8-index.ll
    llvm/test/Analysis/CostModel/X86/gather-i32-with-i8-index.ll
    llvm/test/Analysis/CostModel/X86/gather-i64-with-i8-index.ll
    llvm/test/Analysis/CostModel/X86/gather-i8-with-i8-index.ll
    llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-3.ll
    llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-4.ll
    llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-2.ll
    llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-4.ll
    llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-5.ll
    llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-6.ll
    llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-2-indices-0u.ll
    llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3-indices-01u.ll
    llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3-indices-0uu.ll
    llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3.ll
    llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-012u.ll
    llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-01uu.ll
    llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-0uuu.ll
    llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4.ll
    llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-2.ll
    llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-4.ll
    llvm/test/Analysis/CostModel/X86/interleaved-store-f32-stride-3.ll
    llvm/test/Analysis/CostModel/X86/interleaved-store-f32-stride-4.ll
    llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-2.ll
    llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-4.ll
    llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-5.ll
    llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-6.ll
    llvm/test/Analysis/CostModel/X86/interleaved-store-i32-stride-3.ll
    llvm/test/Analysis/CostModel/X86/interleaved-store-i32-stride-4.ll
    llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-2.ll
    llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-4.ll
    llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
    llvm/test/Transforms/LoopVectorize/X86/parallel-loops.ll
    llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 93ddf20b29aab..f1c8d1edae33a 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -3422,7 +3422,7 @@ InstructionCost X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
   if (Index == -1U && (Opcode == Instruction::ExtractElement ||
                        Opcode == Instruction::InsertElement)) {
     // TODO: On some SSE41+ targets, we expand to cmp+splat+select patterns:
-    // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0. 
+    // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
 
     // TODO: Move this to BasicTTIImpl.h? We'd need better gep + index handling.
     assert(isa<FixedVectorType>(Val) && "Fixed vector type expected");
@@ -4878,11 +4878,15 @@ bool X86TTIImpl::isLegalMaskedCompressStore(Type *DataTy) {
   return isLegalMaskedExpandLoad(DataTy);
 }
 
-bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, Align Alignment) {
+bool X86TTIImpl::supportsGather() const {
   // Some CPUs have better gather performance than others.
   // TODO: Remove the explicit ST->hasAVX512()?, That would mean we would only
   // enable gather with a -march.
-  if (!(ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2())))
+  return ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2());
+}
+
+bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, Align Alignment) {
+  if (!supportsGather())
     return false;
 
   // This function is called now in two cases: from the Loop Vectorizer
@@ -5006,6 +5010,14 @@ X86TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
   return Options;
 }
 
+bool X86TTIImpl::prefersVectorizedAddressing() const {
+  return supportsGather();
+}
+
+bool X86TTIImpl::supportsEfficientVectorElementLoadStore() const {
+  return false;
+}
+
 bool X86TTIImpl::enableInterleavedAccessVectorization() {
   // TODO: We expect this to be beneficial regardless of arch,
   // but there are currently some unexplained performance artifacts on Atom.

diff  --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
index f31f575badfb0..c382cca25b2dc 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -235,9 +235,12 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
                                     SmallPtrSetImpl<Argument *> &Args) const;
   TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize,
                                                     bool IsZeroCmp) const;
+  bool prefersVectorizedAddressing() const;
+  bool supportsEfficientVectorElementLoadStore() const;
   bool enableInterleavedAccessVectorization();
 
 private:
+  bool supportsGather() const;
   InstructionCost getGSScalarCost(unsigned Opcode, Type *DataTy,
                                   bool VariableMask, Align Alignment,
                                   unsigned AddressSpace);

diff  --git a/llvm/test/Analysis/CostModel/X86/gather-i16-with-i8-index.ll b/llvm/test/Analysis/CostModel/X86/gather-i16-with-i8-index.ll
index 6d856aaf45d41..942bad54eb3d9 100644
--- a/llvm/test/Analysis/CostModel/X86/gather-i16-with-i8-index.ll
+++ b/llvm/test/Analysis/CostModel/X86/gather-i16-with-i8-index.ll
@@ -17,30 +17,30 @@ target triple = "x86_64-unknown-linux-gnu"
 ; CHECK: LV: Checking a loop in "test"
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %valB = load i16, i16* %inB, align 2
-; SSE2: LV: Found an estimated cost of 28 for VF 2 For instruction:   %valB = load i16, i16* %inB, align 2
-; SSE2: LV: Found an estimated cost of 56 for VF 4 For instruction:   %valB = load i16, i16* %inB, align 2
-; SSE2: LV: Found an estimated cost of 112 for VF 8 For instruction:   %valB = load i16, i16* %inB, align 2
-; SSE2: LV: Found an estimated cost of 224 for VF 16 For instruction:   %valB = load i16, i16* %inB, align 2
+; SSE2: LV: Found an estimated cost of 24 for VF 2 For instruction:   %valB = load i16, i16* %inB, align 2
+; SSE2: LV: Found an estimated cost of 48 for VF 4 For instruction:   %valB = load i16, i16* %inB, align 2
+; SSE2: LV: Found an estimated cost of 96 for VF 8 For instruction:   %valB = load i16, i16* %inB, align 2
+; SSE2: LV: Found an estimated cost of 192 for VF 16 For instruction:   %valB = load i16, i16* %inB, align 2
 ;
 ; SSE42: LV: Found an estimated cost of 1 for VF 1 For instruction:   %valB = load i16, i16* %inB, align 2
-; SSE42: LV: Found an estimated cost of 28 for VF 2 For instruction:   %valB = load i16, i16* %inB, align 2
-; SSE42: LV: Found an estimated cost of 56 for VF 4 For instruction:   %valB = load i16, i16* %inB, align 2
-; SSE42: LV: Found an estimated cost of 112 for VF 8 For instruction:   %valB = load i16, i16* %inB, align 2
-; SSE42: LV: Found an estimated cost of 224 for VF 16 For instruction:   %valB = load i16, i16* %inB, align 2
+; SSE42: LV: Found an estimated cost of 24 for VF 2 For instruction:   %valB = load i16, i16* %inB, align 2
+; SSE42: LV: Found an estimated cost of 48 for VF 4 For instruction:   %valB = load i16, i16* %inB, align 2
+; SSE42: LV: Found an estimated cost of 96 for VF 8 For instruction:   %valB = load i16, i16* %inB, align 2
+; SSE42: LV: Found an estimated cost of 192 for VF 16 For instruction:   %valB = load i16, i16* %inB, align 2
 ;
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   %valB = load i16, i16* %inB, align 2
-; AVX1: LV: Found an estimated cost of 26 for VF 2 For instruction:   %valB = load i16, i16* %inB, align 2
-; AVX1: LV: Found an estimated cost of 54 for VF 4 For instruction:   %valB = load i16, i16* %inB, align 2
-; AVX1: LV: Found an estimated cost of 108 for VF 8 For instruction:   %valB = load i16, i16* %inB, align 2
-; AVX1: LV: Found an estimated cost of 218 for VF 16 For instruction:   %valB = load i16, i16* %inB, align 2
-; AVX1: LV: Found an estimated cost of 436 for VF 32 For instruction:   %valB = load i16, i16* %inB, align 2
+; AVX1: LV: Found an estimated cost of 24 for VF 2 For instruction:   %valB = load i16, i16* %inB, align 2
+; AVX1: LV: Found an estimated cost of 48 for VF 4 For instruction:   %valB = load i16, i16* %inB, align 2
+; AVX1: LV: Found an estimated cost of 96 for VF 8 For instruction:   %valB = load i16, i16* %inB, align 2
+; AVX1: LV: Found an estimated cost of 194 for VF 16 For instruction:   %valB = load i16, i16* %inB, align 2
+; AVX1: LV: Found an estimated cost of 388 for VF 32 For instruction:   %valB = load i16, i16* %inB, align 2
 ;
 ; AVX2-SLOWGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction:   %valB = load i16, i16* %inB, align 2
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 26 for VF 2 For instruction:   %valB = load i16, i16* %inB, align 2
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 54 for VF 4 For instruction:   %valB = load i16, i16* %inB, align 2
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 108 for VF 8 For instruction:   %valB = load i16, i16* %inB, align 2
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 218 for VF 16 For instruction:   %valB = load i16, i16* %inB, align 2
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 436 for VF 32 For instruction:   %valB = load i16, i16* %inB, align 2
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 24 for VF 2 For instruction:   %valB = load i16, i16* %inB, align 2
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 48 for VF 4 For instruction:   %valB = load i16, i16* %inB, align 2
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 96 for VF 8 For instruction:   %valB = load i16, i16* %inB, align 2
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 194 for VF 16 For instruction:   %valB = load i16, i16* %inB, align 2
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 388 for VF 32 For instruction:   %valB = load i16, i16* %inB, align 2
 ;
 ; AVX2-FASTGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction:   %valB = load i16, i16* %inB, align 2
 ; AVX2-FASTGATHER: LV: Found an estimated cost of 26 for VF 2 For instruction:   %valB = load i16, i16* %inB, align 2

diff  --git a/llvm/test/Analysis/CostModel/X86/gather-i32-with-i8-index.ll b/llvm/test/Analysis/CostModel/X86/gather-i32-with-i8-index.ll
index bc82e61775a14..a74a80775d9e7 100644
--- a/llvm/test/Analysis/CostModel/X86/gather-i32-with-i8-index.ll
+++ b/llvm/test/Analysis/CostModel/X86/gather-i32-with-i8-index.ll
@@ -17,30 +17,30 @@ target triple = "x86_64-unknown-linux-gnu"
 ; CHECK: LV: Checking a loop in "test"
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %valB = load i32, i32* %inB, align 4
-; SSE2: LV: Found an estimated cost of 29 for VF 2 For instruction:   %valB = load i32, i32* %inB, align 4
-; SSE2: LV: Found an estimated cost of 59 for VF 4 For instruction:   %valB = load i32, i32* %inB, align 4
-; SSE2: LV: Found an estimated cost of 118 for VF 8 For instruction:   %valB = load i32, i32* %inB, align 4
-; SSE2: LV: Found an estimated cost of 236 for VF 16 For instruction:   %valB = load i32, i32* %inB, align 4
+; SSE2: LV: Found an estimated cost of 25 for VF 2 For instruction:   %valB = load i32, i32* %inB, align 4
+; SSE2: LV: Found an estimated cost of 51 for VF 4 For instruction:   %valB = load i32, i32* %inB, align 4
+; SSE2: LV: Found an estimated cost of 102 for VF 8 For instruction:   %valB = load i32, i32* %inB, align 4
+; SSE2: LV: Found an estimated cost of 204 for VF 16 For instruction:   %valB = load i32, i32* %inB, align 4
 ;
 ; SSE42: LV: Found an estimated cost of 1 for VF 1 For instruction:   %valB = load i32, i32* %inB, align 4
-; SSE42: LV: Found an estimated cost of 29 for VF 2 For instruction:   %valB = load i32, i32* %inB, align 4
-; SSE42: LV: Found an estimated cost of 59 for VF 4 For instruction:   %valB = load i32, i32* %inB, align 4
-; SSE42: LV: Found an estimated cost of 118 for VF 8 For instruction:   %valB = load i32, i32* %inB, align 4
-; SSE42: LV: Found an estimated cost of 236 for VF 16 For instruction:   %valB = load i32, i32* %inB, align 4
+; SSE42: LV: Found an estimated cost of 25 for VF 2 For instruction:   %valB = load i32, i32* %inB, align 4
+; SSE42: LV: Found an estimated cost of 51 for VF 4 For instruction:   %valB = load i32, i32* %inB, align 4
+; SSE42: LV: Found an estimated cost of 102 for VF 8 For instruction:   %valB = load i32, i32* %inB, align 4
+; SSE42: LV: Found an estimated cost of 204 for VF 16 For instruction:   %valB = load i32, i32* %inB, align 4
 ;
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   %valB = load i32, i32* %inB, align 4
-; AVX1: LV: Found an estimated cost of 26 for VF 2 For instruction:   %valB = load i32, i32* %inB, align 4
-; AVX1: LV: Found an estimated cost of 54 for VF 4 For instruction:   %valB = load i32, i32* %inB, align 4
-; AVX1: LV: Found an estimated cost of 110 for VF 8 For instruction:   %valB = load i32, i32* %inB, align 4
-; AVX1: LV: Found an estimated cost of 220 for VF 16 For instruction:   %valB = load i32, i32* %inB, align 4
-; AVX1: LV: Found an estimated cost of 440 for VF 32 For instruction:   %valB = load i32, i32* %inB, align 4
+; AVX1: LV: Found an estimated cost of 24 for VF 2 For instruction:   %valB = load i32, i32* %inB, align 4
+; AVX1: LV: Found an estimated cost of 48 for VF 4 For instruction:   %valB = load i32, i32* %inB, align 4
+; AVX1: LV: Found an estimated cost of 98 for VF 8 For instruction:   %valB = load i32, i32* %inB, align 4
+; AVX1: LV: Found an estimated cost of 196 for VF 16 For instruction:   %valB = load i32, i32* %inB, align 4
+; AVX1: LV: Found an estimated cost of 392 for VF 32 For instruction:   %valB = load i32, i32* %inB, align 4
 ;
 ; AVX2-SLOWGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction:   %valB = load i32, i32* %inB, align 4
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 26 for VF 2 For instruction:   %valB = load i32, i32* %inB, align 4
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 54 for VF 4 For instruction:   %valB = load i32, i32* %inB, align 4
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 110 for VF 8 For instruction:   %valB = load i32, i32* %inB, align 4
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 220 for VF 16 For instruction:   %valB = load i32, i32* %inB, align 4
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 440 for VF 32 For instruction:   %valB = load i32, i32* %inB, align 4
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 24 for VF 2 For instruction:   %valB = load i32, i32* %inB, align 4
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 48 for VF 4 For instruction:   %valB = load i32, i32* %inB, align 4
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 98 for VF 8 For instruction:   %valB = load i32, i32* %inB, align 4
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 196 for VF 16 For instruction:   %valB = load i32, i32* %inB, align 4
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 392 for VF 32 For instruction:   %valB = load i32, i32* %inB, align 4
 ;
 ; AVX2-FASTGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction:   %valB = load i32, i32* %inB, align 4
 ; AVX2-FASTGATHER: LV: Found an estimated cost of 4 for VF 2 For instruction:   %valB = load i32, i32* %inB, align 4

diff  --git a/llvm/test/Analysis/CostModel/X86/gather-i64-with-i8-index.ll b/llvm/test/Analysis/CostModel/X86/gather-i64-with-i8-index.ll
index 4408baa5a5dc8..6cca9cf32fc82 100644
--- a/llvm/test/Analysis/CostModel/X86/gather-i64-with-i8-index.ll
+++ b/llvm/test/Analysis/CostModel/X86/gather-i64-with-i8-index.ll
@@ -17,30 +17,30 @@ target triple = "x86_64-unknown-linux-gnu"
 ; CHECK: LV: Checking a loop in "test"
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %valB = load i64, i64* %inB, align 8
-; SSE2: LV: Found an estimated cost of 29 for VF 2 For instruction:   %valB = load i64, i64* %inB, align 8
-; SSE2: LV: Found an estimated cost of 58 for VF 4 For instruction:   %valB = load i64, i64* %inB, align 8
-; SSE2: LV: Found an estimated cost of 116 for VF 8 For instruction:   %valB = load i64, i64* %inB, align 8
-; SSE2: LV: Found an estimated cost of 232 for VF 16 For instruction:   %valB = load i64, i64* %inB, align 8
+; SSE2: LV: Found an estimated cost of 25 for VF 2 For instruction:   %valB = load i64, i64* %inB, align 8
+; SSE2: LV: Found an estimated cost of 50 for VF 4 For instruction:   %valB = load i64, i64* %inB, align 8
+; SSE2: LV: Found an estimated cost of 100 for VF 8 For instruction:   %valB = load i64, i64* %inB, align 8
+; SSE2: LV: Found an estimated cost of 200 for VF 16 For instruction:   %valB = load i64, i64* %inB, align 8
 ;
 ; SSE42: LV: Found an estimated cost of 1 for VF 1 For instruction:   %valB = load i64, i64* %inB, align 8
-; SSE42: LV: Found an estimated cost of 29 for VF 2 For instruction:   %valB = load i64, i64* %inB, align 8
-; SSE42: LV: Found an estimated cost of 58 for VF 4 For instruction:   %valB = load i64, i64* %inB, align 8
-; SSE42: LV: Found an estimated cost of 116 for VF 8 For instruction:   %valB = load i64, i64* %inB, align 8
-; SSE42: LV: Found an estimated cost of 232 for VF 16 For instruction:   %valB = load i64, i64* %inB, align 8
+; SSE42: LV: Found an estimated cost of 25 for VF 2 For instruction:   %valB = load i64, i64* %inB, align 8
+; SSE42: LV: Found an estimated cost of 51 for VF 4 For instruction:   %valB = load i64, i64* %inB, align 8
+; SSE42: LV: Found an estimated cost of 102 for VF 8 For instruction:   %valB = load i64, i64* %inB, align 8
+; SSE42: LV: Found an estimated cost of 204 for VF 16 For instruction:   %valB = load i64, i64* %inB, align 8
 ;
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   %valB = load i64, i64* %inB, align 8
-; AVX1: LV: Found an estimated cost of 26 for VF 2 For instruction:   %valB = load i64, i64* %inB, align 8
-; AVX1: LV: Found an estimated cost of 56 for VF 4 For instruction:   %valB = load i64, i64* %inB, align 8
-; AVX1: LV: Found an estimated cost of 112 for VF 8 For instruction:   %valB = load i64, i64* %inB, align 8
-; AVX1: LV: Found an estimated cost of 224 for VF 16 For instruction:   %valB = load i64, i64* %inB, align 8
-; AVX1: LV: Found an estimated cost of 448 for VF 32 For instruction:   %valB = load i64, i64* %inB, align 8
+; AVX1: LV: Found an estimated cost of 24 for VF 2 For instruction:   %valB = load i64, i64* %inB, align 8
+; AVX1: LV: Found an estimated cost of 50 for VF 4 For instruction:   %valB = load i64, i64* %inB, align 8
+; AVX1: LV: Found an estimated cost of 100 for VF 8 For instruction:   %valB = load i64, i64* %inB, align 8
+; AVX1: LV: Found an estimated cost of 200 for VF 16 For instruction:   %valB = load i64, i64* %inB, align 8
+; AVX1: LV: Found an estimated cost of 400 for VF 32 For instruction:   %valB = load i64, i64* %inB, align 8
 ;
 ; AVX2-SLOWGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction:   %valB = load i64, i64* %inB, align 8
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 26 for VF 2 For instruction:   %valB = load i64, i64* %inB, align 8
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 56 for VF 4 For instruction:   %valB = load i64, i64* %inB, align 8
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 112 for VF 8 For instruction:   %valB = load i64, i64* %inB, align 8
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 224 for VF 16 For instruction:   %valB = load i64, i64* %inB, align 8
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 448 for VF 32 For instruction:   %valB = load i64, i64* %inB, align 8
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 24 for VF 2 For instruction:   %valB = load i64, i64* %inB, align 8
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 50 for VF 4 For instruction:   %valB = load i64, i64* %inB, align 8
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 100 for VF 8 For instruction:   %valB = load i64, i64* %inB, align 8
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 200 for VF 16 For instruction:   %valB = load i64, i64* %inB, align 8
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 400 for VF 32 For instruction:   %valB = load i64, i64* %inB, align 8
 ;
 ; AVX2-FASTGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction:   %valB = load i64, i64* %inB, align 8
 ; AVX2-FASTGATHER: LV: Found an estimated cost of 4 for VF 2 For instruction:   %valB = load i64, i64* %inB, align 8

diff  --git a/llvm/test/Analysis/CostModel/X86/gather-i8-with-i8-index.ll b/llvm/test/Analysis/CostModel/X86/gather-i8-with-i8-index.ll
index e33d2373a19b4..d7a21f5880c10 100644
--- a/llvm/test/Analysis/CostModel/X86/gather-i8-with-i8-index.ll
+++ b/llvm/test/Analysis/CostModel/X86/gather-i8-with-i8-index.ll
@@ -17,30 +17,30 @@ target triple = "x86_64-unknown-linux-gnu"
 ; CHECK: LV: Checking a loop in "test"
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %valB = load i8, i8* %inB, align 1
-; SSE2: LV: Found an estimated cost of 29 for VF 2 For instruction:   %valB = load i8, i8* %inB, align 1
-; SSE2: LV: Found an estimated cost of 59 for VF 4 For instruction:   %valB = load i8, i8* %inB, align 1
-; SSE2: LV: Found an estimated cost of 119 for VF 8 For instruction:   %valB = load i8, i8* %inB, align 1
-; SSE2: LV: Found an estimated cost of 239 for VF 16 For instruction:   %valB = load i8, i8* %inB, align 1
+; SSE2: LV: Found an estimated cost of 25 for VF 2 For instruction:   %valB = load i8, i8* %inB, align 1
+; SSE2: LV: Found an estimated cost of 51 for VF 4 For instruction:   %valB = load i8, i8* %inB, align 1
+; SSE2: LV: Found an estimated cost of 103 for VF 8 For instruction:   %valB = load i8, i8* %inB, align 1
+; SSE2: LV: Found an estimated cost of 207 for VF 16 For instruction:   %valB = load i8, i8* %inB, align 1
 ;
 ; SSE42: LV: Found an estimated cost of 1 for VF 1 For instruction:   %valB = load i8, i8* %inB, align 1
-; SSE42: LV: Found an estimated cost of 29 for VF 2 For instruction:   %valB = load i8, i8* %inB, align 1
-; SSE42: LV: Found an estimated cost of 59 for VF 4 For instruction:   %valB = load i8, i8* %inB, align 1
-; SSE42: LV: Found an estimated cost of 119 for VF 8 For instruction:   %valB = load i8, i8* %inB, align 1
-; SSE42: LV: Found an estimated cost of 239 for VF 16 For instruction:   %valB = load i8, i8* %inB, align 1
+; SSE42: LV: Found an estimated cost of 25 for VF 2 For instruction:   %valB = load i8, i8* %inB, align 1
+; SSE42: LV: Found an estimated cost of 51 for VF 4 For instruction:   %valB = load i8, i8* %inB, align 1
+; SSE42: LV: Found an estimated cost of 103 for VF 8 For instruction:   %valB = load i8, i8* %inB, align 1
+; SSE42: LV: Found an estimated cost of 207 for VF 16 For instruction:   %valB = load i8, i8* %inB, align 1
 ;
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   %valB = load i8, i8* %inB, align 1
-; AVX1: LV: Found an estimated cost of 26 for VF 2 For instruction:   %valB = load i8, i8* %inB, align 1
-; AVX1: LV: Found an estimated cost of 54 for VF 4 For instruction:   %valB = load i8, i8* %inB, align 1
-; AVX1: LV: Found an estimated cost of 108 for VF 8 For instruction:   %valB = load i8, i8* %inB, align 1
-; AVX1: LV: Found an estimated cost of 216 for VF 16 For instruction:   %valB = load i8, i8* %inB, align 1
-; AVX1: LV: Found an estimated cost of 434 for VF 32 For instruction:   %valB = load i8, i8* %inB, align 1
+; AVX1: LV: Found an estimated cost of 24 for VF 2 For instruction:   %valB = load i8, i8* %inB, align 1
+; AVX1: LV: Found an estimated cost of 48 for VF 4 For instruction:   %valB = load i8, i8* %inB, align 1
+; AVX1: LV: Found an estimated cost of 96 for VF 8 For instruction:   %valB = load i8, i8* %inB, align 1
+; AVX1: LV: Found an estimated cost of 192 for VF 16 For instruction:   %valB = load i8, i8* %inB, align 1
+; AVX1: LV: Found an estimated cost of 386 for VF 32 For instruction:   %valB = load i8, i8* %inB, align 1
 ;
 ; AVX2-SLOWGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction:   %valB = load i8, i8* %inB, align 1
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 26 for VF 2 For instruction:   %valB = load i8, i8* %inB, align 1
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 54 for VF 4 For instruction:   %valB = load i8, i8* %inB, align 1
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 108 for VF 8 For instruction:   %valB = load i8, i8* %inB, align 1
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 216 for VF 16 For instruction:   %valB = load i8, i8* %inB, align 1
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 434 for VF 32 For instruction:   %valB = load i8, i8* %inB, align 1
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 24 for VF 2 For instruction:   %valB = load i8, i8* %inB, align 1
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 48 for VF 4 For instruction:   %valB = load i8, i8* %inB, align 1
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 96 for VF 8 For instruction:   %valB = load i8, i8* %inB, align 1
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 192 for VF 16 For instruction:   %valB = load i8, i8* %inB, align 1
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 386 for VF 32 For instruction:   %valB = load i8, i8* %inB, align 1
 ;
 ; AVX2-FASTGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction:   %valB = load i8, i8* %inB, align 1
 ; AVX2-FASTGATHER: LV: Found an estimated cost of 26 for VF 2 For instruction:   %valB = load i8, i8* %inB, align 1

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-3.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-3.ll
index a519014b28f85..80a9215b87cca 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-3.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-3.ll
@@ -16,19 +16,22 @@ target triple = "x86_64-unknown-linux-gnu"
 ; SSE2: LV: Found an estimated cost of 9 for VF 2 For instruction:   %v0 = load float, float* %in0, align 4
 ; SSE2: LV: Found an estimated cost of 21 for VF 4 For instruction:   %v0 = load float, float* %in0, align 4
 ; SSE2: LV: Found an estimated cost of 42 for VF 8 For instruction:   %v0 = load float, float* %in0, align 4
+; SSE2: LV: Found an estimated cost of 84 for VF 16 For instruction:   %v0 = load float, float* %in0, align 4
 ;
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load float, float* %in0, align 4
 ; AVX1: LV: Found an estimated cost of 12 for VF 2 For instruction:   %v0 = load float, float* %in0, align 4
 ; AVX1: LV: Found an estimated cost of 24 for VF 4 For instruction:   %v0 = load float, float* %in0, align 4
 ; AVX1: LV: Found an estimated cost of 57 for VF 8 For instruction:   %v0 = load float, float* %in0, align 4
 ; AVX1: LV: Found an estimated cost of 114 for VF 16 For instruction:   %v0 = load float, float* %in0, align 4
+; AVX1: LV: Found an estimated cost of 228 for VF 32 For instruction:   %v0 = load float, float* %in0, align 4
 ;
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load float, float* %in0, align 4
 ; AVX2: LV: Found an estimated cost of 6 for VF 2 For instruction:   %v0 = load float, float* %in0, align 4
 ; AVX2: LV: Found an estimated cost of 5 for VF 4 For instruction:   %v0 = load float, float* %in0, align 4
 ; AVX2: LV: Found an estimated cost of 10 for VF 8 For instruction:   %v0 = load float, float* %in0, align 4
 ; AVX2: LV: Found an estimated cost of 20 for VF 16 For instruction:   %v0 = load float, float* %in0, align 4
-;
+; AVX2: LV: Found an estimated cost of 228 for VF 32 For instruction:   %v0 = load float, float* %in0, align 4
+
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load float, float* %in0, align 4
 ; AVX512: LV: Found an estimated cost of 4 for VF 2 For instruction:   %v0 = load float, float* %in0, align 4
 ; AVX512: LV: Found an estimated cost of 4 for VF 4 For instruction:   %v0 = load float, float* %in0, align 4

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-4.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-4.ll
index 850d5cb5353cd..4970d17942258 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-4.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-4.ll
@@ -16,18 +16,21 @@ target triple = "x86_64-unknown-linux-gnu"
 ; SSE2: LV: Found an estimated cost of 12 for VF 2 For instruction:   %v0 = load float, float* %in0, align 4
 ; SSE2: LV: Found an estimated cost of 28 for VF 4 For instruction:   %v0 = load float, float* %in0, align 4
 ; SSE2: LV: Found an estimated cost of 56 for VF 8 For instruction:   %v0 = load float, float* %in0, align 4
+; SSE2: LV: Found an estimated cost of 112 for VF 16 For instruction:   %v0 = load float, float* %in0, align 4
 ;
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load float, float* %in0, align 4
 ; AVX1: LV: Found an estimated cost of 15 for VF 2 For instruction:   %v0 = load float, float* %in0, align 4
 ; AVX1: LV: Found an estimated cost of 34 for VF 4 For instruction:   %v0 = load float, float* %in0, align 4
 ; AVX1: LV: Found an estimated cost of 76 for VF 8 For instruction:   %v0 = load float, float* %in0, align 4
 ; AVX1: LV: Found an estimated cost of 152 for VF 16 For instruction:   %v0 = load float, float* %in0, align 4
+; AVX1: LV: Found an estimated cost of 304 for VF 32 For instruction:   %v0 = load float, float* %in0, align 4
 ;
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load float, float* %in0, align 4
 ; AVX2: LV: Found an estimated cost of 5 for VF 2 For instruction:   %v0 = load float, float* %in0, align 4
 ; AVX2: LV: Found an estimated cost of 10 for VF 4 For instruction:   %v0 = load float, float* %in0, align 4
 ; AVX2: LV: Found an estimated cost of 20 for VF 8 For instruction:   %v0 = load float, float* %in0, align 4
 ; AVX2: LV: Found an estimated cost of 40 for VF 16 For instruction:   %v0 = load float, float* %in0, align 4
+; AVX2: LV: Found an estimated cost of 304 for VF 32 For instruction:   %v0 = load float, float* %in0, align 4
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load float, float* %in0, align 4
 ; AVX512: LV: Found an estimated cost of 5 for VF 2 For instruction:   %v0 = load float, float* %in0, align 4

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-2.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-2.ll
index fb9d96e2ac05a..703f62bc9b375 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-2.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-2.ll
@@ -16,18 +16,21 @@ target triple = "x86_64-unknown-linux-gnu"
 ; SSE2: LV: Found an estimated cost of 6 for VF 2 For instruction:   %v0 = load double, double* %in0, align 8
 ; SSE2: LV: Found an estimated cost of 12 for VF 4 For instruction:   %v0 = load double, double* %in0, align 8
 ; SSE2: LV: Found an estimated cost of 24 for VF 8 For instruction:   %v0 = load double, double* %in0, align 8
+; SSE2: LV: Found an estimated cost of 48 for VF 16 For instruction:   %v0 = load double, double* %in0, align 8
 ;
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load double, double* %in0, align 8
 ; AVX1: LV: Found an estimated cost of 7 for VF 2 For instruction:   %v0 = load double, double* %in0, align 8
 ; AVX1: LV: Found an estimated cost of 16 for VF 4 For instruction:   %v0 = load double, double* %in0, align 8
 ; AVX1: LV: Found an estimated cost of 32 for VF 8 For instruction:   %v0 = load double, double* %in0, align 8
 ; AVX1: LV: Found an estimated cost of 64 for VF 16 For instruction:   %v0 = load double, double* %in0, align 8
+; AVX1: LV: Found an estimated cost of 128 for VF 32 For instruction:   %v0 = load double, double* %in0, align 8
 ;;
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load double, double* %in0, align 8
 ; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction:   %v0 = load double, double* %in0, align 8
 ; AVX2: LV: Found an estimated cost of 6 for VF 4 For instruction:   %v0 = load double, double* %in0, align 8
 ; AVX2: LV: Found an estimated cost of 12 for VF 8 For instruction:   %v0 = load double, double* %in0, align 8
 ; AVX2: LV: Found an estimated cost of 24 for VF 16 For instruction:   %v0 = load double, double* %in0, align 8
+; AVX2: LV: Found an estimated cost of 128 for VF 32 For instruction:   %v0 = load double, double* %in0, align 8
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load double, double* %in0, align 8
 ; AVX512: LV: Found an estimated cost of 3 for VF 2 For instruction:   %v0 = load double, double* %in0, align 8

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-4.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-4.ll
index 066f79d67fd1a..31b7c415a9a32 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-4.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-4.ll
@@ -15,16 +15,19 @@ target triple = "x86_64-unknown-linux-gnu"
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load double, double* %in0, align 8
 ; SSE2: LV: Found an estimated cost of 12 for VF 2 For instruction:   %v0 = load double, double* %in0, align 8
 ; SSE2: LV: Found an estimated cost of 24 for VF 4 For instruction:   %v0 = load double, double* %in0, align 8
+; SSE2: LV: Found an estimated cost of 48 for VF 8 For instruction:   %v0 = load double, double* %in0, align 8
 ;
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load double, double* %in0, align 8
 ; AVX1: LV: Found an estimated cost of 14 for VF 2 For instruction:   %v0 = load double, double* %in0, align 8
 ; AVX1: LV: Found an estimated cost of 32 for VF 4 For instruction:   %v0 = load double, double* %in0, align 8
 ; AVX1: LV: Found an estimated cost of 64 for VF 8 For instruction:   %v0 = load double, double* %in0, align 8
+; AVX1: LV: Found an estimated cost of 128 for VF 16 For instruction:   %v0 = load double, double* %in0, align 8
 ;
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load double, double* %in0, align 8
 ; AVX2: LV: Found an estimated cost of 8 for VF 2 For instruction:   %v0 = load double, double* %in0, align 8
 ; AVX2: LV: Found an estimated cost of 12 for VF 4 For instruction:   %v0 = load double, double* %in0, align 8
 ; AVX2: LV: Found an estimated cost of 28 for VF 8 For instruction:   %v0 = load double, double* %in0, align 8
+; AVX2: LV: Found an estimated cost of 128 for VF 16 For instruction:   %v0 = load double, double* %in0, align 8
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load double, double* %in0, align 8
 ; AVX512: LV: Found an estimated cost of 5 for VF 2 For instruction:   %v0 = load double, double* %in0, align 8

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-5.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-5.ll
index 8868b6baa0ce1..9ebb546f06366 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-5.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-5.ll
@@ -16,18 +16,21 @@ target triple = "x86_64-unknown-linux-gnu"
 ; SSE2: LV: Found an estimated cost of 22 for VF 2 For instruction:   %v0 = load i16, i16* %in0, align 2
 ; SSE2: LV: Found an estimated cost of 43 for VF 4 For instruction:   %v0 = load i16, i16* %in0, align 2
 ; SSE2: LV: Found an estimated cost of 85 for VF 8 For instruction:   %v0 = load i16, i16* %in0, align 2
+; SSE2: LV: Found an estimated cost of 170 for VF 16 For instruction:   %v0 = load i16, i16* %in0, align 2
 ;
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i16, i16* %in0, align 2
 ; AVX1: LV: Found an estimated cost of 26 for VF 2 For instruction:   %v0 = load i16, i16* %in0, align 2
 ; AVX1: LV: Found an estimated cost of 50 for VF 4 For instruction:   %v0 = load i16, i16* %in0, align 2
 ; AVX1: LV: Found an estimated cost of 99 for VF 8 For instruction:   %v0 = load i16, i16* %in0, align 2
 ; AVX1: LV: Found an estimated cost of 215 for VF 16 For instruction:   %v0 = load i16, i16* %in0, align 2
+; AVX1: LV: Found an estimated cost of 430 for VF 32 For instruction:   %v0 = load i16, i16* %in0, align 2
 ;
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i16, i16* %in0, align 2
 ; AVX2: LV: Found an estimated cost of 26 for VF 2 For instruction:   %v0 = load i16, i16* %in0, align 2
 ; AVX2: LV: Found an estimated cost of 50 for VF 4 For instruction:   %v0 = load i16, i16* %in0, align 2
 ; AVX2: LV: Found an estimated cost of 99 for VF 8 For instruction:   %v0 = load i16, i16* %in0, align 2
 ; AVX2: LV: Found an estimated cost of 215 for VF 16 For instruction:   %v0 = load i16, i16* %in0, align 2
+; AVX2: LV: Found an estimated cost of 430 for VF 32 For instruction:   %v0 = load i16, i16* %in0, align 2
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i16, i16* %in0, align 2
 ; AVX512: LV: Found an estimated cost of 11 for VF 2 For instruction:   %v0 = load i16, i16* %in0, align 2

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-6.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-6.ll
index 0a980370b4e54..7a9a052e87866 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-6.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-6.ll
@@ -16,18 +16,21 @@ target triple = "x86_64-unknown-linux-gnu"
 ; SSE2: LV: Found an estimated cost of 26 for VF 2 For instruction:   %v0 = load i16, i16* %in0, align 2
 ; SSE2: LV: Found an estimated cost of 51 for VF 4 For instruction:   %v0 = load i16, i16* %in0, align 2
 ; SSE2: LV: Found an estimated cost of 102 for VF 8 For instruction:   %v0 = load i16, i16* %in0, align 2
+; SSE2: LV: Found an estimated cost of 204 for VF 16 For instruction:   %v0 = load i16, i16* %in0, align 2
 ;
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i16, i16* %in0, align 2
 ; AVX1: LV: Found an estimated cost of 31 for VF 2 For instruction:   %v0 = load i16, i16* %in0, align 2
 ; AVX1: LV: Found an estimated cost of 58 for VF 4 For instruction:   %v0 = load i16, i16* %in0, align 2
 ; AVX1: LV: Found an estimated cost of 123 for VF 8 For instruction:   %v0 = load i16, i16* %in0, align 2
 ; AVX1: LV: Found an estimated cost of 258 for VF 16 For instruction:   %v0 = load i16, i16* %in0, align 2
+; AVX1: LV: Found an estimated cost of 516 for VF 32 For instruction:   %v0 = load i16, i16* %in0, align 2
 ;
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i16, i16* %in0, align 2
 ; AVX2: LV: Found an estimated cost of 16 for VF 2 For instruction:   %v0 = load i16, i16* %in0, align 2
 ; AVX2: LV: Found an estimated cost of 11 for VF 4 For instruction:   %v0 = load i16, i16* %in0, align 2
 ; AVX2: LV: Found an estimated cost of 42 for VF 8 For instruction:   %v0 = load i16, i16* %in0, align 2
 ; AVX2: LV: Found an estimated cost of 112 for VF 16 For instruction:   %v0 = load i16, i16* %in0, align 2
+; AVX2: LV: Found an estimated cost of 516 for VF 32 For instruction:   %v0 = load i16, i16* %in0, align 2
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i16, i16* %in0, align 2
 ; AVX512: LV: Found an estimated cost of 13 for VF 2 For instruction:   %v0 = load i16, i16* %in0, align 2

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-2-indices-0u.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-2-indices-0u.ll
index 8e2ee4e7aaf65..19f76d9a9499b 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-2-indices-0u.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-2-indices-0u.ll
@@ -16,18 +16,21 @@ target triple = "x86_64-unknown-linux-gnu"
 ; SSE2: LV: Found an estimated cost of 7 for VF 2 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; SSE2: LV: Found an estimated cost of 15 for VF 4 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; SSE2: LV: Found an estimated cost of 30 for VF 8 For instruction:   %v0 = load i32, i32* %in0, align 4
+; SSE2: LV: Found an estimated cost of 60 for VF 16 For instruction:   %v0 = load i32, i32* %in0, align 4
 ;
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; AVX1: LV: Found an estimated cost of 5 for VF 2 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; AVX1: LV: Found an estimated cost of 11 for VF 4 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; AVX1: LV: Found an estimated cost of 24 for VF 8 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; AVX1: LV: Found an estimated cost of 48 for VF 16 For instruction:   %v0 = load i32, i32* %in0, align 4
+; AVX1: LV: Found an estimated cost of 96 for VF 32 For instruction:   %v0 = load i32, i32* %in0, align 4
 ;
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; AVX2: LV: Found an estimated cost of 3 for VF 4 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; AVX2: LV: Found an estimated cost of 6 for VF 8 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; AVX2: LV: Found an estimated cost of 12 for VF 16 For instruction:   %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 24 for VF 32 For instruction:   %v0 = load i32, i32* %in0, align 4
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; AVX512: LV: Found an estimated cost of 1 for VF 2 For instruction:   %v0 = load i32, i32* %in0, align 4

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3-indices-01u.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3-indices-01u.ll
index 3dbd7aa6b154c..caa98c9160b6a 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3-indices-01u.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3-indices-01u.ll
@@ -16,18 +16,21 @@ target triple = "x86_64-unknown-linux-gnu"
 ; SSE2: LV: Found an estimated cost of 14 for VF 2 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; SSE2: LV: Found an estimated cost of 31 for VF 4 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; SSE2: LV: Found an estimated cost of 62 for VF 8 For instruction:   %v0 = load i32, i32* %in0, align 4
+; SSE2: LV: Found an estimated cost of 124 for VF 16 For instruction:   %v0 = load i32, i32* %in0, align 4
 ;
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; AVX1: LV: Found an estimated cost of 12 for VF 2 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; AVX1: LV: Found an estimated cost of 21 for VF 4 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; AVX1: LV: Found an estimated cost of 47 for VF 8 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; AVX1: LV: Found an estimated cost of 94 for VF 16 For instruction:   %v0 = load i32, i32* %in0, align 4
+; AVX1: LV: Found an estimated cost of 188 for VF 32 For instruction:   %v0 = load i32, i32* %in0, align 4
 ;
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; AVX2: LV: Found an estimated cost of 6 for VF 2 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; AVX2: LV: Found an estimated cost of 5 for VF 4 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; AVX2: LV: Found an estimated cost of 10 for VF 8 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; AVX2: LV: Found an estimated cost of 20 for VF 16 For instruction:   %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 188 for VF 32 For instruction:   %v0 = load i32, i32* %in0, align 4
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; AVX512: LV: Found an estimated cost of 3 for VF 2 For instruction:   %v0 = load i32, i32* %in0, align 4

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3-indices-0uu.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3-indices-0uu.ll
index fc948808fc6ba..2eab36508045e 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3-indices-0uu.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3-indices-0uu.ll
@@ -16,18 +16,21 @@ target triple = "x86_64-unknown-linux-gnu"
 ; SSE2: LV: Found an estimated cost of 8 for VF 2 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; SSE2: LV: Found an estimated cost of 17 for VF 4 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; SSE2: LV: Found an estimated cost of 34 for VF 8 For instruction:   %v0 = load i32, i32* %in0, align 4
+; SSE2: LV: Found an estimated cost of 68 for VF 16 For instruction:   %v0 = load i32, i32* %in0, align 4
 ;
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; AVX1: LV: Found an estimated cost of 7 for VF 2 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; AVX1: LV: Found an estimated cost of 11 for VF 4 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; AVX1: LV: Found an estimated cost of 25 for VF 8 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; AVX1: LV: Found an estimated cost of 50 for VF 16 For instruction:   %v0 = load i32, i32* %in0, align 4
+; AVX1: LV: Found an estimated cost of 100 for VF 32 For instruction:   %v0 = load i32, i32* %in0, align 4
 ;
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; AVX2: LV: Found an estimated cost of 6 for VF 2 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; AVX2: LV: Found an estimated cost of 5 for VF 4 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; AVX2: LV: Found an estimated cost of 10 for VF 8 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; AVX2: LV: Found an estimated cost of 20 for VF 16 For instruction:   %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 100 for VF 32 For instruction:   %v0 = load i32, i32* %in0, align 4
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; AVX512: LV: Found an estimated cost of 1 for VF 2 For instruction:   %v0 = load i32, i32* %in0, align 4

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3.ll
index 836bd0383f4c2..d827effecc383 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3.ll
@@ -16,18 +16,21 @@ target triple = "x86_64-unknown-linux-gnu"
 ; SSE2: LV: Found an estimated cost of 21 for VF 2 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; SSE2: LV: Found an estimated cost of 45 for VF 4 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; SSE2: LV: Found an estimated cost of 90 for VF 8 For instruction:   %v0 = load i32, i32* %in0, align 4
+; SSE2: LV: Found an estimated cost of 180 for VF 16 For instruction:   %v0 = load i32, i32* %in0, align 4
 ;
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; AVX1: LV: Found an estimated cost of 17 for VF 2 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; AVX1: LV: Found an estimated cost of 30 for VF 4 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; AVX1: LV: Found an estimated cost of 69 for VF 8 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; AVX1: LV: Found an estimated cost of 138 for VF 16 For instruction:   %v0 = load i32, i32* %in0, align 4
+; AVX1: LV: Found an estimated cost of 276 for VF 32 For instruction:   %v0 = load i32, i32* %in0, align 4
 ;
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; AVX2: LV: Found an estimated cost of 6 for VF 2 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; AVX2: LV: Found an estimated cost of 5 for VF 4 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; AVX2: LV: Found an estimated cost of 10 for VF 8 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; AVX2: LV: Found an estimated cost of 20 for VF 16 For instruction:   %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 276 for VF 32 For instruction:   %v0 = load i32, i32* %in0, align 4
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; AVX512: LV: Found an estimated cost of 4 for VF 2 For instruction:   %v0 = load i32, i32* %in0, align 4

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-012u.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-012u.ll
index f4349baaf075f..78c209fa10614 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-012u.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-012u.ll
@@ -16,18 +16,21 @@ target triple = "x86_64-unknown-linux-gnu"
 ; SSE2: LV: Found an estimated cost of 21 for VF 2 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; SSE2: LV: Found an estimated cost of 45 for VF 4 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; SSE2: LV: Found an estimated cost of 90 for VF 8 For instruction:   %v0 = load i32, i32* %in0, align 4
+; SSE2: LV: Found an estimated cost of 180 for VF 16 For instruction:   %v0 = load i32, i32* %in0, align 4
 ;
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; AVX1: LV: Found an estimated cost of 16 for VF 2 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; AVX1: LV: Found an estimated cost of 32 for VF 4 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; AVX1: LV: Found an estimated cost of 70 for VF 8 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; AVX1: LV: Found an estimated cost of 140 for VF 16 For instruction:   %v0 = load i32, i32* %in0, align 4
+; AVX1: LV: Found an estimated cost of 280 for VF 32 For instruction:   %v0 = load i32, i32* %in0, align 4
 ;
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; AVX2: LV: Found an estimated cost of 5 for VF 2 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; AVX2: LV: Found an estimated cost of 10 for VF 4 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; AVX2: LV: Found an estimated cost of 20 for VF 8 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; AVX2: LV: Found an estimated cost of 40 for VF 16 For instruction:   %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 280 for VF 32 For instruction:   %v0 = load i32, i32* %in0, align 4
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; AVX512: LV: Found an estimated cost of 4 for VF 2 For instruction:   %v0 = load i32, i32* %in0, align 4

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-01uu.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-01uu.ll
index 25ee9f5c31a81..2a3cabfdf97ad 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-01uu.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-01uu.ll
@@ -16,18 +16,21 @@ target triple = "x86_64-unknown-linux-gnu"
 ; SSE2: LV: Found an estimated cost of 14 for VF 2 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; SSE2: LV: Found an estimated cost of 30 for VF 4 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; SSE2: LV: Found an estimated cost of 60 for VF 8 For instruction:   %v0 = load i32, i32* %in0, align 4
+; SSE2: LV: Found an estimated cost of 120 for VF 16 For instruction:   %v0 = load i32, i32* %in0, align 4
 ;
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; AVX1: LV: Found an estimated cost of 11 for VF 2 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; AVX1: LV: Found an estimated cost of 22 for VF 4 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; AVX1: LV: Found an estimated cost of 48 for VF 8 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; AVX1: LV: Found an estimated cost of 96 for VF 16 For instruction:   %v0 = load i32, i32* %in0, align 4
+; AVX1: LV: Found an estimated cost of 192 for VF 32 For instruction:   %v0 = load i32, i32* %in0, align 4
 ;
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; AVX2: LV: Found an estimated cost of 5 for VF 2 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; AVX2: LV: Found an estimated cost of 10 for VF 4 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; AVX2: LV: Found an estimated cost of 20 for VF 8 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; AVX2: LV: Found an estimated cost of 40 for VF 16 For instruction:   %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 192 for VF 32 For instruction:   %v0 = load i32, i32* %in0, align 4
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; AVX512: LV: Found an estimated cost of 3 for VF 2 For instruction:   %v0 = load i32, i32* %in0, align 4

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-0uuu.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-0uuu.ll
index b0b1165d4d806..d1660b54645df 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-0uuu.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-0uuu.ll
@@ -16,18 +16,21 @@ target triple = "x86_64-unknown-linux-gnu"
 ; SSE2: LV: Found an estimated cost of 7 for VF 2 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; SSE2: LV: Found an estimated cost of 15 for VF 4 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; SSE2: LV: Found an estimated cost of 30 for VF 8 For instruction:   %v0 = load i32, i32* %in0, align 4
+; SSE2: LV: Found an estimated cost of 60 for VF 16 For instruction:   %v0 = load i32, i32* %in0, align 4
 ;
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; AVX1: LV: Found an estimated cost of 6 for VF 2 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; AVX1: LV: Found an estimated cost of 12 for VF 4 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; AVX1: LV: Found an estimated cost of 26 for VF 8 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; AVX1: LV: Found an estimated cost of 52 for VF 16 For instruction:   %v0 = load i32, i32* %in0, align 4
+; AVX1: LV: Found an estimated cost of 104 for VF 32 For instruction:   %v0 = load i32, i32* %in0, align 4
 ;
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; AVX2: LV: Found an estimated cost of 5 for VF 2 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; AVX2: LV: Found an estimated cost of 10 for VF 4 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; AVX2: LV: Found an estimated cost of 20 for VF 8 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; AVX2: LV: Found an estimated cost of 40 for VF 16 For instruction:   %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 104 for VF 32 For instruction:   %v0 = load i32, i32* %in0, align 4
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; AVX512: LV: Found an estimated cost of 1 for VF 2 For instruction:   %v0 = load i32, i32* %in0, align 4

diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4.ll
index c9abb108ce5ad..c1e591bed1ce2 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4.ll
@@ -16,18 +16,21 @@ target triple = "x86_64-unknown-linux-gnu"
 ; SSE2: LV: Found an estimated cost of 28 for VF 2 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; SSE2: LV: Found an estimated cost of 60 for VF 4 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; SSE2: LV: Found an estimated cost of 120 for VF 8 For instruction:   %v0 = load i32, i32* %in0, align 4
+; SSE2: LV: Found an estimated cost of 240 for VF 16 For instruction:   %v0 = load i32, i32* %in0, align 4
 ;
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; AVX1: LV: Found an estimated cost of 21 for VF 2 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; AVX1: LV: Found an estimated cost of 42 for VF 4 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; AVX1: LV: Found an estimated cost of 92 for VF 8 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; AVX1: LV: Found an estimated cost of 184 for VF 16 For instruction:   %v0 = load i32, i32* %in0, align 4
+; AVX1: LV: Found an estimated cost of 368 for VF 32 For instruction:   %v0 = load i32, i32* %in0, align 4
 ;
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; AVX2: LV: Found an estimated cost of 5 for VF 2 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; AVX2: LV: Found an estimated cost of 10 for VF 4 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; AVX2: LV: Found an estimated cost of 20 for VF 8 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; AVX2: LV: Found an estimated cost of 40 for VF 16 For instruction:   %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 368 for VF 32 For instruction:   %v0 = load i32, i32* %in0, align 4
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; AVX512: LV: Found an estimated cost of 5 for VF 2 For instruction:   %v0 = load i32, i32* %in0, align 4

diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-2.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-2.ll
index c0dfd2c109947..adfe1088534e9 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-2.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-2.ll
@@ -16,18 +16,21 @@ target triple = "x86_64-unknown-linux-gnu"
 ; SSE2: LV: Found an estimated cost of 14 for VF 2 For instruction:   %v0 = load i64, i64* %in0, align 8
 ; SSE2: LV: Found an estimated cost of 28 for VF 4 For instruction:   %v0 = load i64, i64* %in0, align 8
 ; SSE2: LV: Found an estimated cost of 56 for VF 8 For instruction:   %v0 = load i64, i64* %in0, align 8
+; SSE2: LV: Found an estimated cost of 112 for VF 16 For instruction:   %v0 = load i64, i64* %in0, align 8
 ;
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i64, i64* %in0, align 8
 ; AVX1: LV: Found an estimated cost of 11 for VF 2 For instruction:   %v0 = load i64, i64* %in0, align 8
 ; AVX1: LV: Found an estimated cost of 26 for VF 4 For instruction:   %v0 = load i64, i64* %in0, align 8
 ; AVX1: LV: Found an estimated cost of 52 for VF 8 For instruction:   %v0 = load i64, i64* %in0, align 8
 ; AVX1: LV: Found an estimated cost of 104 for VF 16 For instruction:   %v0 = load i64, i64* %in0, align 8
+; AVX1: LV: Found an estimated cost of 208 for VF 32 For instruction:   %v0 = load i64, i64* %in0, align 8
 ;
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i64, i64* %in0, align 8
 ; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction:   %v0 = load i64, i64* %in0, align 8
 ; AVX2: LV: Found an estimated cost of 6 for VF 4 For instruction:   %v0 = load i64, i64* %in0, align 8
 ; AVX2: LV: Found an estimated cost of 12 for VF 8 For instruction:   %v0 = load i64, i64* %in0, align 8
 ; AVX2: LV: Found an estimated cost of 24 for VF 16 For instruction:   %v0 = load i64, i64* %in0, align 8
+; AVX2: LV: Found an estimated cost of 208 for VF 32 For instruction:   %v0 = load i64, i64* %in0, align 8
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i64, i64* %in0, align 8
 ; AVX512: LV: Found an estimated cost of 3 for VF 2 For instruction:   %v0 = load i64, i64* %in0, align 8

diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-4.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-4.ll
index 0f993b22bf7c3..9f6e3807d2d55 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-4.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-4.ll
@@ -15,16 +15,19 @@ target triple = "x86_64-unknown-linux-gnu"
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i64, i64* %in0, align 8
 ; SSE2: LV: Found an estimated cost of 28 for VF 2 For instruction:   %v0 = load i64, i64* %in0, align 8
 ; SSE2: LV: Found an estimated cost of 56 for VF 4 For instruction:   %v0 = load i64, i64* %in0, align 8
+; SSE2: LV: Found an estimated cost of 112 for VF 8 For instruction:   %v0 = load i64, i64* %in0, align 8
 ;
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i64, i64* %in0, align 8
 ; AVX1: LV: Found an estimated cost of 22 for VF 2 For instruction:   %v0 = load i64, i64* %in0, align 8
 ; AVX1: LV: Found an estimated cost of 52 for VF 4 For instruction:   %v0 = load i64, i64* %in0, align 8
 ; AVX1: LV: Found an estimated cost of 104 for VF 8 For instruction:   %v0 = load i64, i64* %in0, align 8
+; AVX1: LV: Found an estimated cost of 208 for VF 16 For instruction:   %v0 = load i64, i64* %in0, align 8
 ;
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i64, i64* %in0, align 8
 ; AVX2: LV: Found an estimated cost of 8 for VF 2 For instruction:   %v0 = load i64, i64* %in0, align 8
 ; AVX2: LV: Found an estimated cost of 12 for VF 4 For instruction:   %v0 = load i64, i64* %in0, align 8
 ; AVX2: LV: Found an estimated cost of 28 for VF 8 For instruction:   %v0 = load i64, i64* %in0, align 8
+; AVX2: LV: Found an estimated cost of 208 for VF 16 For instruction:   %v0 = load i64, i64* %in0, align 8
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i64, i64* %in0, align 8
 ; AVX512: LV: Found an estimated cost of 5 for VF 2 For instruction:   %v0 = load i64, i64* %in0, align 8

diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-f32-stride-3.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-f32-stride-3.ll
index 2d80ea5194b99..875639f212593 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-store-f32-stride-3.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-f32-stride-3.ll
@@ -16,18 +16,21 @@ target triple = "x86_64-unknown-linux-gnu"
 ; SSE2: LV: Found an estimated cost of 11 for VF 2 For instruction:   store float %v2, float* %out2, align 4
 ; SSE2: LV: Found an estimated cost of 24 for VF 4 For instruction:   store float %v2, float* %out2, align 4
 ; SSE2: LV: Found an estimated cost of 48 for VF 8 For instruction:   store float %v2, float* %out2, align 4
+; SSE2: LV: Found an estimated cost of 96 for VF 16 For instruction:   store float %v2, float* %out2, align 4
 ;
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   store float %v2, float* %out2, align 4
 ; AVX1: LV: Found an estimated cost of 13 for VF 2 For instruction:   store float %v2, float* %out2, align 4
 ; AVX1: LV: Found an estimated cost of 23 for VF 4 For instruction:   store float %v2, float* %out2, align 4
 ; AVX1: LV: Found an estimated cost of 57 for VF 8 For instruction:   store float %v2, float* %out2, align 4
 ; AVX1: LV: Found an estimated cost of 114 for VF 16 For instruction:   store float %v2, float* %out2, align 4
+; AVX1: LV: Found an estimated cost of 228 for VF 32 For instruction:   store float %v2, float* %out2, align 4
 ;
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store float %v2, float* %out2, align 4
 ; AVX2: LV: Found an estimated cost of 7 for VF 2 For instruction:   store float %v2, float* %out2, align 4
 ; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction:   store float %v2, float* %out2, align 4
 ; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction:   store float %v2, float* %out2, align 4
 ; AVX2: LV: Found an estimated cost of 28 for VF 16 For instruction:   store float %v2, float* %out2, align 4
+; AVX2: LV: Found an estimated cost of 228 for VF 32 For instruction:   store float %v2, float* %out2, align 4
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   store float %v2, float* %out2, align 4
 ; AVX512: LV: Found an estimated cost of 4 for VF 2 For instruction:   store float %v2, float* %out2, align 4

diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-f32-stride-4.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-f32-stride-4.ll
index 6b520191af60f..d77ab5609549d 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-store-f32-stride-4.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-f32-stride-4.ll
@@ -16,18 +16,21 @@ target triple = "x86_64-unknown-linux-gnu"
 ; SSE2: LV: Found an estimated cost of 12 for VF 2 For instruction:   store float %v3, float* %out3, align 4
 ; SSE2: LV: Found an estimated cost of 28 for VF 4 For instruction:   store float %v3, float* %out3, align 4
 ; SSE2: LV: Found an estimated cost of 56 for VF 8 For instruction:   store float %v3, float* %out3, align 4
+; SSE2: LV: Found an estimated cost of 112 for VF 16 For instruction:   store float %v3, float* %out3, align 4
 ;
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   store float %v3, float* %out3, align 4
 ; AVX1: LV: Found an estimated cost of 13 for VF 2 For instruction:   store float %v3, float* %out3, align 4
 ; AVX1: LV: Found an estimated cost of 30 for VF 4 For instruction:   store float %v3, float* %out3, align 4
 ; AVX1: LV: Found an estimated cost of 76 for VF 8 For instruction:   store float %v3, float* %out3, align 4
 ; AVX1: LV: Found an estimated cost of 152 for VF 16 For instruction:   store float %v3, float* %out3, align 4
+; AVX1: LV: Found an estimated cost of 304 for VF 32 For instruction:   store float %v3, float* %out3, align 4
 ;
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store float %v3, float* %out3, align 4
 ; AVX2: LV: Found an estimated cost of 6 for VF 2 For instruction:   store float %v3, float* %out3, align 4
 ; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction:   store float %v3, float* %out3, align 4
 ; AVX2: LV: Found an estimated cost of 20 for VF 8 For instruction:   store float %v3, float* %out3, align 4
 ; AVX2: LV: Found an estimated cost of 40 for VF 16 For instruction:   store float %v3, float* %out3, align 4
+; AVX2: LV: Found an estimated cost of 304 for VF 32 For instruction:   store float %v3, float* %out3, align 4
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   store float %v3, float* %out3, align 4
 ; AVX512: LV: Found an estimated cost of 5 for VF 2 For instruction:   store float %v3, float* %out3, align 4

diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-2.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-2.ll
index 88f739525b932..186cd4a5186ec 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-2.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-2.ll
@@ -16,18 +16,21 @@ target triple = "x86_64-unknown-linux-gnu"
 ; SSE2: LV: Found an estimated cost of 6 for VF 2 For instruction:   store double %v1, double* %out1, align 8
 ; SSE2: LV: Found an estimated cost of 12 for VF 4 For instruction:   store double %v1, double* %out1, align 8
 ; SSE2: LV: Found an estimated cost of 24 for VF 8 For instruction:   store double %v1, double* %out1, align 8
+; SSE2: LV: Found an estimated cost of 48 for VF 16 For instruction:   store double %v1, double* %out1, align 8
 ;
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   store double %v1, double* %out1, align 8
 ; AVX1: LV: Found an estimated cost of 6 for VF 2 For instruction:   store double %v1, double* %out1, align 8
 ; AVX1: LV: Found an estimated cost of 16 for VF 4 For instruction:   store double %v1, double* %out1, align 8
 ; AVX1: LV: Found an estimated cost of 32 for VF 8 For instruction:   store double %v1, double* %out1, align 8
 ; AVX1: LV: Found an estimated cost of 64 for VF 16 For instruction:   store double %v1, double* %out1, align 8
+; AVX1: LV: Found an estimated cost of 128 for VF 32 For instruction:   store double %v1, double* %out1, align 8
 ;
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store double %v1, double* %out1, align 8
 ; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction:   store double %v1, double* %out1, align 8
 ; AVX2: LV: Found an estimated cost of 6 for VF 4 For instruction:   store double %v1, double* %out1, align 8
 ; AVX2: LV: Found an estimated cost of 12 for VF 8 For instruction:   store double %v1, double* %out1, align 8
 ; AVX2: LV: Found an estimated cost of 24 for VF 16 For instruction:   store double %v1, double* %out1, align 8
+; AVX2: LV: Found an estimated cost of 128 for VF 32 For instruction:   store double %v1, double* %out1, align 8
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   store double %v1, double* %out1, align 8
 ; AVX512: LV: Found an estimated cost of 2 for VF 2 For instruction:   store double %v1, double* %out1, align 8

diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-4.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-4.ll
index 1d6cad9fade2b..0e9773e0d08c4 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-4.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-4.ll
@@ -15,16 +15,19 @@ target triple = "x86_64-unknown-linux-gnu"
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store double %v3, double* %out3, align 8
 ; SSE2: LV: Found an estimated cost of 12 for VF 2 For instruction:   store double %v3, double* %out3, align 8
 ; SSE2: LV: Found an estimated cost of 24 for VF 4 For instruction:   store double %v3, double* %out3, align 8
+; SSE2: LV: Found an estimated cost of 48 for VF 8 For instruction:   store double %v3, double* %out3, align 8
 ;
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   store double %v3, double* %out3, align 8
 ; AVX1: LV: Found an estimated cost of 12 for VF 2 For instruction:   store double %v3, double* %out3, align 8
 ; AVX1: LV: Found an estimated cost of 32 for VF 4 For instruction:   store double %v3, double* %out3, align 8
 ; AVX1: LV: Found an estimated cost of 64 for VF 8 For instruction:   store double %v3, double* %out3, align 8
+; AVX1: LV: Found an estimated cost of 128 for VF 16 For instruction:   store double %v3, double* %out3, align 8
 ;
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store double %v3, double* %out3, align 8
 ; AVX2: LV: Found an estimated cost of 8 for VF 2 For instruction:   store double %v3, double* %out3, align 8
 ; AVX2: LV: Found an estimated cost of 12 for VF 4 For instruction:   store double %v3, double* %out3, align 8
 ; AVX2: LV: Found an estimated cost of 28 for VF 8 For instruction:   store double %v3, double* %out3, align 8
+; AVX2: LV: Found an estimated cost of 128 for VF 16 For instruction:   store double %v3, double* %out3, align 8
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   store double %v3, double* %out3, align 8
 ; AVX512: LV: Found an estimated cost of 5 for VF 2 For instruction:   store double %v3, double* %out3, align 8

diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-5.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-5.ll
index d530664146acd..3bd9f91f73511 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-5.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-5.ll
@@ -16,18 +16,21 @@ target triple = "x86_64-unknown-linux-gnu"
 ; SSE2: LV: Found an estimated cost of 22 for VF 2 For instruction:   store i16 %v4, i16* %out4, align 2
 ; SSE2: LV: Found an estimated cost of 43 for VF 4 For instruction:   store i16 %v4, i16* %out4, align 2
 ; SSE2: LV: Found an estimated cost of 85 for VF 8 For instruction:   store i16 %v4, i16* %out4, align 2
+; SSE2: LV: Found an estimated cost of 170 for VF 16 For instruction:   store i16 %v4, i16* %out4, align 2
 ;
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i16 %v4, i16* %out4, align 2
 ; AVX1: LV: Found an estimated cost of 27 for VF 2 For instruction:   store i16 %v4, i16* %out4, align 2
 ; AVX1: LV: Found an estimated cost of 45 for VF 4 For instruction:   store i16 %v4, i16* %out4, align 2
 ; AVX1: LV: Found an estimated cost of 88 for VF 8 For instruction:   store i16 %v4, i16* %out4, align 2
 ; AVX1: LV: Found an estimated cost of 215 for VF 16 For instruction:   store i16 %v4, i16* %out4, align 2
+; AVX1: LV: Found an estimated cost of 430 for VF 32 For instruction:   store i16 %v4, i16* %out4, align 2
 ;
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i16 %v4, i16* %out4, align 2
 ; AVX2: LV: Found an estimated cost of 27 for VF 2 For instruction:   store i16 %v4, i16* %out4, align 2
 ; AVX2: LV: Found an estimated cost of 45 for VF 4 For instruction:   store i16 %v4, i16* %out4, align 2
 ; AVX2: LV: Found an estimated cost of 88 for VF 8 For instruction:   store i16 %v4, i16* %out4, align 2
 ; AVX2: LV: Found an estimated cost of 215 for VF 16 For instruction:   store i16 %v4, i16* %out4, align 2
+; AVX2: LV: Found an estimated cost of 430 for VF 32 For instruction:   store i16 %v4, i16* %out4, align 2
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i16 %v4, i16* %out4, align 2
 ; AVX512: LV: Found an estimated cost of 11 for VF 2 For instruction:   store i16 %v4, i16* %out4, align 2

diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-6.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-6.ll
index 6df7dd58dfad1..b4693d1c3916a 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-6.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-6.ll
@@ -16,18 +16,21 @@ target triple = "x86_64-unknown-linux-gnu"
 ; SSE2: LV: Found an estimated cost of 26 for VF 2 For instruction:   store i16 %v5, i16* %out5, align 2
 ; SSE2: LV: Found an estimated cost of 51 for VF 4 For instruction:   store i16 %v5, i16* %out5, align 2
 ; SSE2: LV: Found an estimated cost of 102 for VF 8 For instruction:   store i16 %v5, i16* %out5, align 2
+; SSE2: LV: Found an estimated cost of 204 for VF 16 For instruction:   store i16 %v5, i16* %out5, align 2
 ;
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i16 %v5, i16* %out5, align 2
 ; AVX1: LV: Found an estimated cost of 30 for VF 2 For instruction:   store i16 %v5, i16* %out5, align 2
 ; AVX1: LV: Found an estimated cost of 53 for VF 4 For instruction:   store i16 %v5, i16* %out5, align 2
 ; AVX1: LV: Found an estimated cost of 105 for VF 8 For instruction:   store i16 %v5, i16* %out5, align 2
 ; AVX1: LV: Found an estimated cost of 258 for VF 16 For instruction:   store i16 %v5, i16* %out5, align 2
+; AVX1: LV: Found an estimated cost of 516 for VF 32 For instruction:   store i16 %v5, i16* %out5, align 2
 ;
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i16 %v5, i16* %out5, align 2
 ; AVX2: LV: Found an estimated cost of 13 for VF 2 For instruction:   store i16 %v5, i16* %out5, align 2
 ; AVX2: LV: Found an estimated cost of 17 for VF 4 For instruction:   store i16 %v5, i16* %out5, align 2
 ; AVX2: LV: Found an estimated cost of 24 for VF 8 For instruction:   store i16 %v5, i16* %out5, align 2
 ; AVX2: LV: Found an estimated cost of 64 for VF 16 For instruction:   store i16 %v5, i16* %out5, align 2
+; AVX2: LV: Found an estimated cost of 516 for VF 32 For instruction:   store i16 %v5, i16* %out5, align 2
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i16 %v5, i16* %out5, align 2
 ; AVX512: LV: Found an estimated cost of 13 for VF 2 For instruction:   store i16 %v5, i16* %out5, align 2

diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-i32-stride-3.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-i32-stride-3.ll
index e90b171ca42fc..80b6531f9d38d 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-store-i32-stride-3.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-i32-stride-3.ll
@@ -16,18 +16,21 @@ target triple = "x86_64-unknown-linux-gnu"
 ; SSE2: LV: Found an estimated cost of 23 for VF 2 For instruction:   store i32 %v2, i32* %out2, align 4
 ; SSE2: LV: Found an estimated cost of 48 for VF 4 For instruction:   store i32 %v2, i32* %out2, align 4
 ; SSE2: LV: Found an estimated cost of 96 for VF 8 For instruction:   store i32 %v2, i32* %out2, align 4
+; SSE2: LV: Found an estimated cost of 192 for VF 16 For instruction:   store i32 %v2, i32* %out2, align 4
 ;
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i32 %v2, i32* %out2, align 4
 ; AVX1: LV: Found an estimated cost of 18 for VF 2 For instruction:   store i32 %v2, i32* %out2, align 4
 ; AVX1: LV: Found an estimated cost of 29 for VF 4 For instruction:   store i32 %v2, i32* %out2, align 4
 ; AVX1: LV: Found an estimated cost of 69 for VF 8 For instruction:   store i32 %v2, i32* %out2, align 4
 ; AVX1: LV: Found an estimated cost of 138 for VF 16 For instruction:   store i32 %v2, i32* %out2, align 4
+; AVX1: LV: Found an estimated cost of 276 for VF 32 For instruction:   store i32 %v2, i32* %out2, align 4
 ;
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i32 %v2, i32* %out2, align 4
 ; AVX2: LV: Found an estimated cost of 7 for VF 2 For instruction:   store i32 %v2, i32* %out2, align 4
 ; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction:   store i32 %v2, i32* %out2, align 4
 ; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction:   store i32 %v2, i32* %out2, align 4
 ; AVX2: LV: Found an estimated cost of 28 for VF 16 For instruction:   store i32 %v2, i32* %out2, align 4
+; AVX2: LV: Found an estimated cost of 276 for VF 32 For instruction:   store i32 %v2, i32* %out2, align 4
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i32 %v2, i32* %out2, align 4
 ; AVX512: LV: Found an estimated cost of 4 for VF 2 For instruction:   store i32 %v2, i32* %out2, align 4

diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-i32-stride-4.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-i32-stride-4.ll
index c9ad8a1505199..a5e2645bf1e97 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-store-i32-stride-4.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-i32-stride-4.ll
@@ -16,18 +16,21 @@ target triple = "x86_64-unknown-linux-gnu"
 ; SSE2: LV: Found an estimated cost of 28 for VF 2 For instruction:   store i32 %v3, i32* %out3, align 4
 ; SSE2: LV: Found an estimated cost of 60 for VF 4 For instruction:   store i32 %v3, i32* %out3, align 4
 ; SSE2: LV: Found an estimated cost of 120 for VF 8 For instruction:   store i32 %v3, i32* %out3, align 4
+; SSE2: LV: Found an estimated cost of 240 for VF 16 For instruction:   store i32 %v3, i32* %out3, align 4
 ;
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i32 %v3, i32* %out3, align 4
 ; AVX1: LV: Found an estimated cost of 19 for VF 2 For instruction:   store i32 %v3, i32* %out3, align 4
 ; AVX1: LV: Found an estimated cost of 38 for VF 4 For instruction:   store i32 %v3, i32* %out3, align 4
 ; AVX1: LV: Found an estimated cost of 92 for VF 8 For instruction:   store i32 %v3, i32* %out3, align 4
 ; AVX1: LV: Found an estimated cost of 184 for VF 16 For instruction:   store i32 %v3, i32* %out3, align 4
+; AVX1: LV: Found an estimated cost of 368 for VF 32 For instruction:   store i32 %v3, i32* %out3, align 4
 ;
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i32 %v3, i32* %out3, align 4
 ; AVX2: LV: Found an estimated cost of 6 for VF 2 For instruction:   store i32 %v3, i32* %out3, align 4
 ; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction:   store i32 %v3, i32* %out3, align 4
 ; AVX2: LV: Found an estimated cost of 20 for VF 8 For instruction:   store i32 %v3, i32* %out3, align 4
 ; AVX2: LV: Found an estimated cost of 40 for VF 16 For instruction:   store i32 %v3, i32* %out3, align 4
+; AVX2: LV: Found an estimated cost of 368 for VF 32 For instruction:   store i32 %v3, i32* %out3, align 4
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i32 %v3, i32* %out3, align 4
 ; AVX512: LV: Found an estimated cost of 5 for VF 2 For instruction:   store i32 %v3, i32* %out3, align 4

diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-2.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-2.ll
index 56e86b433e7ba..06d1c6e0c8629 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-2.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-2.ll
@@ -16,18 +16,21 @@ target triple = "x86_64-unknown-linux-gnu"
 ; SSE2: LV: Found an estimated cost of 14 for VF 2 For instruction:   store i64 %v1, i64* %out1, align 8
 ; SSE2: LV: Found an estimated cost of 28 for VF 4 For instruction:   store i64 %v1, i64* %out1, align 8
 ; SSE2: LV: Found an estimated cost of 56 for VF 8 For instruction:   store i64 %v1, i64* %out1, align 8
+; SSE2: LV: Found an estimated cost of 112 for VF 16 For instruction:   store i64 %v1, i64* %out1, align 8
 ;
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i64 %v1, i64* %out1, align 8
 ; AVX1: LV: Found an estimated cost of 11 for VF 2 For instruction:   store i64 %v1, i64* %out1, align 8
 ; AVX1: LV: Found an estimated cost of 26 for VF 4 For instruction:   store i64 %v1, i64* %out1, align 8
 ; AVX1: LV: Found an estimated cost of 52 for VF 8 For instruction:   store i64 %v1, i64* %out1, align 8
 ; AVX1: LV: Found an estimated cost of 104 for VF 16 For instruction:   store i64 %v1, i64* %out1, align 8
+; AVX1: LV: Found an estimated cost of 208 for VF 32 For instruction:   store i64 %v1, i64* %out1, align 8
 ;
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i64 %v1, i64* %out1, align 8
 ; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction:   store i64 %v1, i64* %out1, align 8
 ; AVX2: LV: Found an estimated cost of 6 for VF 4 For instruction:   store i64 %v1, i64* %out1, align 8
 ; AVX2: LV: Found an estimated cost of 12 for VF 8 For instruction:   store i64 %v1, i64* %out1, align 8
 ; AVX2: LV: Found an estimated cost of 24 for VF 16 For instruction:   store i64 %v1, i64* %out1, align 8
+; AVX2: LV: Found an estimated cost of 208 for VF 32 For instruction:   store i64 %v1, i64* %out1, align 8
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i64 %v1, i64* %out1, align 8
 ; AVX512: LV: Found an estimated cost of 2 for VF 2 For instruction:   store i64 %v1, i64* %out1, align 8

diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-4.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-4.ll
index a638d07a76493..c48b1d7aa41f3 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-4.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-4.ll
@@ -15,16 +15,19 @@ target triple = "x86_64-unknown-linux-gnu"
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i64 %v3, i64* %out3, align 8
 ; SSE2: LV: Found an estimated cost of 28 for VF 2 For instruction:   store i64 %v3, i64* %out3, align 8
 ; SSE2: LV: Found an estimated cost of 56 for VF 4 For instruction:   store i64 %v3, i64* %out3, align 8
+; SSE2: LV: Found an estimated cost of 112 for VF 8 For instruction:   store i64 %v3, i64* %out3, align 8
 ;
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i64 %v3, i64* %out3, align 8
 ; AVX1: LV: Found an estimated cost of 22 for VF 2 For instruction:   store i64 %v3, i64* %out3, align 8
 ; AVX1: LV: Found an estimated cost of 52 for VF 4 For instruction:   store i64 %v3, i64* %out3, align 8
 ; AVX1: LV: Found an estimated cost of 104 for VF 8 For instruction:   store i64 %v3, i64* %out3, align 8
+; AVX1: LV: Found an estimated cost of 208 for VF 16 For instruction:   store i64 %v3, i64* %out3, align 8
 ;
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i64 %v3, i64* %out3, align 8
 ; AVX2: LV: Found an estimated cost of 8 for VF 2 For instruction:   store i64 %v3, i64* %out3, align 8
 ; AVX2: LV: Found an estimated cost of 12 for VF 4 For instruction:   store i64 %v3, i64* %out3, align 8
 ; AVX2: LV: Found an estimated cost of 28 for VF 8 For instruction:   store i64 %v3, i64* %out3, align 8
+; AVX2: LV: Found an estimated cost of 208 for VF 16 For instruction:   store i64 %v3, i64* %out3, align 8
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i64 %v3, i64* %out3, align 8
 ; AVX512: LV: Found an estimated cost of 5 for VF 2 For instruction:   store i64 %v3, i64* %out3, align 8

diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
index 3196dd057d93c..fee0dfa856119 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
@@ -60,8 +60,9 @@ for.end:                                          ; preds = %for.body
 }
 
 ; This function uses a stride that is generally too big to benefit from vectorization without
-; really good support for a gather load. We were not computing an accurate cost for the
-; vectorization and subsequent scalarization of the pointer induction variables.
+; really good support for a gather load. But if we don't vectorize the pointer induction,
+; then we don't need to extract the pointers out of a vector of pointers,
+; and the vectorization becomes profitable.
 
 define float @PR27826(float* nocapture readonly %a, float* nocapture readonly %b, i32 %n) {
 ; CHECK-LABEL: @PR27826(
@@ -70,21 +71,171 @@ define float @PR27826(float* nocapture readonly %a, float* nocapture readonly %b
 ; CHECK-NEXT:    br i1 [[CMP]], label [[PREHEADER:%.*]], label [[FOR_END:%.*]]
 ; CHECK:       preheader:
 ; CHECK-NEXT:    [[T0:%.*]] = sext i32 [[N]] to i64
+; CHECK-NEXT:    [[TMP0:%.*]] = add nsw i64 [[T0]], -1
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr i64 [[TMP0]], 5
+; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 16
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 16
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 32
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP119:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP120:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP121:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP122:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 32
+; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 32
+; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 64
+; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 96
+; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 128
+; CHECK-NEXT:    [[TMP8:%.*]] = add i64 [[OFFSET_IDX]], 160
+; CHECK-NEXT:    [[TMP9:%.*]] = add i64 [[OFFSET_IDX]], 192
+; CHECK-NEXT:    [[TMP10:%.*]] = add i64 [[OFFSET_IDX]], 224
+; CHECK-NEXT:    [[TMP11:%.*]] = add i64 [[OFFSET_IDX]], 256
+; CHECK-NEXT:    [[TMP12:%.*]] = add i64 [[OFFSET_IDX]], 288
+; CHECK-NEXT:    [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 320
+; CHECK-NEXT:    [[TMP14:%.*]] = add i64 [[OFFSET_IDX]], 352
+; CHECK-NEXT:    [[TMP15:%.*]] = add i64 [[OFFSET_IDX]], 384
+; CHECK-NEXT:    [[TMP16:%.*]] = add i64 [[OFFSET_IDX]], 416
+; CHECK-NEXT:    [[TMP17:%.*]] = add i64 [[OFFSET_IDX]], 448
+; CHECK-NEXT:    [[TMP18:%.*]] = add i64 [[OFFSET_IDX]], 480
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP8]]
+; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP9]]
+; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP10]]
+; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP11]]
+; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP13]]
+; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP14]]
+; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP15]]
+; CHECK-NEXT:    [[TMP32:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP16]]
+; CHECK-NEXT:    [[TMP33:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP17]]
+; CHECK-NEXT:    [[TMP34:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP18]]
+; CHECK-NEXT:    [[TMP35:%.*]] = load float, float* [[TMP19]], align 4
+; CHECK-NEXT:    [[TMP36:%.*]] = load float, float* [[TMP20]], align 4
+; CHECK-NEXT:    [[TMP37:%.*]] = load float, float* [[TMP21]], align 4
+; CHECK-NEXT:    [[TMP38:%.*]] = load float, float* [[TMP22]], align 4
+; CHECK-NEXT:    [[TMP39:%.*]] = insertelement <4 x float> poison, float [[TMP35]], i32 0
+; CHECK-NEXT:    [[TMP40:%.*]] = insertelement <4 x float> [[TMP39]], float [[TMP36]], i32 1
+; CHECK-NEXT:    [[TMP41:%.*]] = insertelement <4 x float> [[TMP40]], float [[TMP37]], i32 2
+; CHECK-NEXT:    [[TMP42:%.*]] = insertelement <4 x float> [[TMP41]], float [[TMP38]], i32 3
+; CHECK-NEXT:    [[TMP43:%.*]] = load float, float* [[TMP23]], align 4
+; CHECK-NEXT:    [[TMP44:%.*]] = load float, float* [[TMP24]], align 4
+; CHECK-NEXT:    [[TMP45:%.*]] = load float, float* [[TMP25]], align 4
+; CHECK-NEXT:    [[TMP46:%.*]] = load float, float* [[TMP26]], align 4
+; CHECK-NEXT:    [[TMP47:%.*]] = insertelement <4 x float> poison, float [[TMP43]], i32 0
+; CHECK-NEXT:    [[TMP48:%.*]] = insertelement <4 x float> [[TMP47]], float [[TMP44]], i32 1
+; CHECK-NEXT:    [[TMP49:%.*]] = insertelement <4 x float> [[TMP48]], float [[TMP45]], i32 2
+; CHECK-NEXT:    [[TMP50:%.*]] = insertelement <4 x float> [[TMP49]], float [[TMP46]], i32 3
+; CHECK-NEXT:    [[TMP51:%.*]] = load float, float* [[TMP27]], align 4
+; CHECK-NEXT:    [[TMP52:%.*]] = load float, float* [[TMP28]], align 4
+; CHECK-NEXT:    [[TMP53:%.*]] = load float, float* [[TMP29]], align 4
+; CHECK-NEXT:    [[TMP54:%.*]] = load float, float* [[TMP30]], align 4
+; CHECK-NEXT:    [[TMP55:%.*]] = insertelement <4 x float> poison, float [[TMP51]], i32 0
+; CHECK-NEXT:    [[TMP56:%.*]] = insertelement <4 x float> [[TMP55]], float [[TMP52]], i32 1
+; CHECK-NEXT:    [[TMP57:%.*]] = insertelement <4 x float> [[TMP56]], float [[TMP53]], i32 2
+; CHECK-NEXT:    [[TMP58:%.*]] = insertelement <4 x float> [[TMP57]], float [[TMP54]], i32 3
+; CHECK-NEXT:    [[TMP59:%.*]] = load float, float* [[TMP31]], align 4
+; CHECK-NEXT:    [[TMP60:%.*]] = load float, float* [[TMP32]], align 4
+; CHECK-NEXT:    [[TMP61:%.*]] = load float, float* [[TMP33]], align 4
+; CHECK-NEXT:    [[TMP62:%.*]] = load float, float* [[TMP34]], align 4
+; CHECK-NEXT:    [[TMP63:%.*]] = insertelement <4 x float> poison, float [[TMP59]], i32 0
+; CHECK-NEXT:    [[TMP64:%.*]] = insertelement <4 x float> [[TMP63]], float [[TMP60]], i32 1
+; CHECK-NEXT:    [[TMP65:%.*]] = insertelement <4 x float> [[TMP64]], float [[TMP61]], i32 2
+; CHECK-NEXT:    [[TMP66:%.*]] = insertelement <4 x float> [[TMP65]], float [[TMP62]], i32 3
+; CHECK-NEXT:    [[TMP67:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP68:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP69:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP70:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP71:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP72:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP8]]
+; CHECK-NEXT:    [[TMP73:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP9]]
+; CHECK-NEXT:    [[TMP74:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP10]]
+; CHECK-NEXT:    [[TMP75:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP11]]
+; CHECK-NEXT:    [[TMP76:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP77:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP13]]
+; CHECK-NEXT:    [[TMP78:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP14]]
+; CHECK-NEXT:    [[TMP79:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP15]]
+; CHECK-NEXT:    [[TMP80:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP16]]
+; CHECK-NEXT:    [[TMP81:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP17]]
+; CHECK-NEXT:    [[TMP82:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP18]]
+; CHECK-NEXT:    [[TMP83:%.*]] = load float, float* [[TMP67]], align 4
+; CHECK-NEXT:    [[TMP84:%.*]] = load float, float* [[TMP68]], align 4
+; CHECK-NEXT:    [[TMP85:%.*]] = load float, float* [[TMP69]], align 4
+; CHECK-NEXT:    [[TMP86:%.*]] = load float, float* [[TMP70]], align 4
+; CHECK-NEXT:    [[TMP87:%.*]] = insertelement <4 x float> poison, float [[TMP83]], i32 0
+; CHECK-NEXT:    [[TMP88:%.*]] = insertelement <4 x float> [[TMP87]], float [[TMP84]], i32 1
+; CHECK-NEXT:    [[TMP89:%.*]] = insertelement <4 x float> [[TMP88]], float [[TMP85]], i32 2
+; CHECK-NEXT:    [[TMP90:%.*]] = insertelement <4 x float> [[TMP89]], float [[TMP86]], i32 3
+; CHECK-NEXT:    [[TMP91:%.*]] = load float, float* [[TMP71]], align 4
+; CHECK-NEXT:    [[TMP92:%.*]] = load float, float* [[TMP72]], align 4
+; CHECK-NEXT:    [[TMP93:%.*]] = load float, float* [[TMP73]], align 4
+; CHECK-NEXT:    [[TMP94:%.*]] = load float, float* [[TMP74]], align 4
+; CHECK-NEXT:    [[TMP95:%.*]] = insertelement <4 x float> poison, float [[TMP91]], i32 0
+; CHECK-NEXT:    [[TMP96:%.*]] = insertelement <4 x float> [[TMP95]], float [[TMP92]], i32 1
+; CHECK-NEXT:    [[TMP97:%.*]] = insertelement <4 x float> [[TMP96]], float [[TMP93]], i32 2
+; CHECK-NEXT:    [[TMP98:%.*]] = insertelement <4 x float> [[TMP97]], float [[TMP94]], i32 3
+; CHECK-NEXT:    [[TMP99:%.*]] = load float, float* [[TMP75]], align 4
+; CHECK-NEXT:    [[TMP100:%.*]] = load float, float* [[TMP76]], align 4
+; CHECK-NEXT:    [[TMP101:%.*]] = load float, float* [[TMP77]], align 4
+; CHECK-NEXT:    [[TMP102:%.*]] = load float, float* [[TMP78]], align 4
+; CHECK-NEXT:    [[TMP103:%.*]] = insertelement <4 x float> poison, float [[TMP99]], i32 0
+; CHECK-NEXT:    [[TMP104:%.*]] = insertelement <4 x float> [[TMP103]], float [[TMP100]], i32 1
+; CHECK-NEXT:    [[TMP105:%.*]] = insertelement <4 x float> [[TMP104]], float [[TMP101]], i32 2
+; CHECK-NEXT:    [[TMP106:%.*]] = insertelement <4 x float> [[TMP105]], float [[TMP102]], i32 3
+; CHECK-NEXT:    [[TMP107:%.*]] = load float, float* [[TMP79]], align 4
+; CHECK-NEXT:    [[TMP108:%.*]] = load float, float* [[TMP80]], align 4
+; CHECK-NEXT:    [[TMP109:%.*]] = load float, float* [[TMP81]], align 4
+; CHECK-NEXT:    [[TMP110:%.*]] = load float, float* [[TMP82]], align 4
+; CHECK-NEXT:    [[TMP111:%.*]] = insertelement <4 x float> poison, float [[TMP107]], i32 0
+; CHECK-NEXT:    [[TMP112:%.*]] = insertelement <4 x float> [[TMP111]], float [[TMP108]], i32 1
+; CHECK-NEXT:    [[TMP113:%.*]] = insertelement <4 x float> [[TMP112]], float [[TMP109]], i32 2
+; CHECK-NEXT:    [[TMP114:%.*]] = insertelement <4 x float> [[TMP113]], float [[TMP110]], i32 3
+; CHECK-NEXT:    [[TMP115:%.*]] = fadd fast <4 x float> [[TMP42]], [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP116:%.*]] = fadd fast <4 x float> [[TMP50]], [[VEC_PHI1]]
+; CHECK-NEXT:    [[TMP117:%.*]] = fadd fast <4 x float> [[TMP58]], [[VEC_PHI2]]
+; CHECK-NEXT:    [[TMP118:%.*]] = fadd fast <4 x float> [[TMP66]], [[VEC_PHI3]]
+; CHECK-NEXT:    [[TMP119]] = fadd fast <4 x float> [[TMP115]], [[TMP90]]
+; CHECK-NEXT:    [[TMP120]] = fadd fast <4 x float> [[TMP116]], [[TMP98]]
+; CHECK-NEXT:    [[TMP121]] = fadd fast <4 x float> [[TMP117]], [[TMP106]]
+; CHECK-NEXT:    [[TMP122]] = fadd fast <4 x float> [[TMP118]], [[TMP114]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-NEXT:    [[TMP123:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP123]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP120]], [[TMP119]]
+; CHECK-NEXT:    [[BIN_RDX4:%.*]] = fadd fast <4 x float> [[TMP121]], [[BIN_RDX]]
+; CHECK-NEXT:    [[BIN_RDX5:%.*]] = fadd fast <4 x float> [[TMP122]], [[BIN_RDX4]]
+; CHECK-NEXT:    [[TMP124:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[BIN_RDX5]])
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[PREHEADER]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[PREHEADER]] ], [ [[TMP124]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR:%.*]]
 ; CHECK:       for:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR]] ]
-; CHECK-NEXT:    [[S_02:%.*]] = phi float [ 0.000000e+00, [[PREHEADER]] ], [ [[ADD4:%.*]], [[FOR]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR]] ]
+; CHECK-NEXT:    [[S_02:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD4:%.*]], [[FOR]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
 ; CHECK-NEXT:    [[T1:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]]
 ; CHECK-NEXT:    [[T2:%.*]] = load float, float* [[ARRAYIDX3]], align 4
 ; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[T1]], [[S_02]]
 ; CHECK-NEXT:    [[ADD4]] = fadd fast float [[ADD]], [[T2]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 32
 ; CHECK-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV_NEXT]], [[T0]]
-; CHECK-NEXT:    br i1 [[CMP1]], label [[FOR]], label [[LOOPEXIT:%.*]]
+; CHECK-NEXT:    br i1 [[CMP1]], label [[FOR]], label [[LOOPEXIT]], !llvm.loop [[LOOP2:![0-9]+]]
 ; CHECK:       loopexit:
-; CHECK-NEXT:    [[ADD4_LCSSA:%.*]] = phi float [ [[ADD4]], [[FOR]] ]
+; CHECK-NEXT:    [[ADD4_LCSSA:%.*]] = phi float [ [[ADD4]], [[FOR]] ], [ [[TMP124]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_END]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    [[S_0_LCSSA:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[ADD4_LCSSA]], [[LOOPEXIT]] ]
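
For context on the updated comment in cost-model.ll above: the two addressing shapes differ roughly as sketched below. This is an illustrative IR fragment only, not taken from the test or the patch (names such as %a, %offsets and %off0 are made up). With vectorized addressing the lane addresses are materialized as a vector of pointers and each lane has to be extracted before its scalar load; with scalar addressing each lane's address is an ordinary scalar GEP, and only the loaded values are assembled into a vector, as in the CHECK lines above.

  ; vectorized addressing: a vector GEP yields <4 x float*>, and every lane
  ; must be pulled out with extractelement before it can feed a scalar load
  %addrs = getelementptr inbounds float, float* %a, <4 x i64> %offsets
  %p0    = extractelement <4 x float*> %addrs, i32 0
  %v0    = load float, float* %p0, align 4
  ; ...lanes 1-3 likewise...

  ; scalar addressing: plain scalar GEPs, no vector of pointers to unpack
  %q0    = getelementptr inbounds float, float* %a, i64 %off0
  %w0    = load float, float* %q0, align 4
  ; ...the loaded scalars are then combined with insertelement, as checked above...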

diff --git a/llvm/test/Transforms/LoopVectorize/X86/parallel-loops.ll b/llvm/test/Transforms/LoopVectorize/X86/parallel-loops.ll
index c02d61ff228f6..108c0677bdc80 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/parallel-loops.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/parallel-loops.ll
@@ -71,38 +71,45 @@ define void @parallel_loop(i32* nocapture %a, i32* nocapture %b) nounwind uwtabl
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4, !llvm.access.group !0
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4, !llvm.access.group !0
-; CHECK-NEXT:    [[TMP4:%.*]] = sext <4 x i32> [[WIDE_LOAD1]] to <4 x i64>
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP4]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP4]], i32 1
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP7]]
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x i64> [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP9]]
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <4 x i64> [[TMP4]], i32 3
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP11]]
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 0
-; CHECK-NEXT:    store i32 [[TMP13]], i32* [[TMP6]], align 4, !llvm.access.group !1
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 1
-; CHECK-NEXT:    store i32 [[TMP14]], i32* [[TMP8]], align 4, !llvm.access.group !1
-; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 2
-; CHECK-NEXT:    store i32 [[TMP15]], i32* [[TMP10]], align 4, !llvm.access.group !1
-; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 3
-; CHECK-NEXT:    store i32 [[TMP16]], i32* [[TMP12]], align 4, !llvm.access.group !1
-; CHECK-NEXT:    [[TMP17:%.*]] = or i64 [[INDEX]], 1
-; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP17]]
-; CHECK-NEXT:    [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4, !llvm.access.group !0
-; CHECK-NEXT:    [[TMP20:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[WIDE_LOAD2]], <4 x i32>* [[TMP20]], align 4, !llvm.access.group !0
+; CHECK-NEXT:    [[TMP0:%.*]] = or i64 [[INDEX]], 1
+; CHECK-NEXT:    [[TMP1:%.*]] = or i64 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP2:%.*]] = or i64 [[INDEX]], 3
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4, !llvm.access.group !0
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = load i32, i32* [[TMP5]], align 4, !llvm.access.group !0
+; CHECK-NEXT:    [[TMP10:%.*]] = load i32, i32* [[TMP6]], align 4, !llvm.access.group !0
+; CHECK-NEXT:    [[TMP11:%.*]] = load i32, i32* [[TMP7]], align 4, !llvm.access.group !0
+; CHECK-NEXT:    [[TMP12:%.*]] = load i32, i32* [[TMP8]], align 4, !llvm.access.group !0
+; CHECK-NEXT:    [[TMP13:%.*]] = sext i32 [[TMP9]] to i64
+; CHECK-NEXT:    [[TMP14:%.*]] = sext i32 [[TMP10]] to i64
+; CHECK-NEXT:    [[TMP15:%.*]] = sext i32 [[TMP11]] to i64
+; CHECK-NEXT:    [[TMP16:%.*]] = sext i32 [[TMP12]] to i64
+; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP13]]
+; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP14]]
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP15]]
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP16]]
+; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 0
+; CHECK-NEXT:    store i32 [[TMP21]], i32* [[TMP17]], align 4, !llvm.access.group !1
+; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 1
+; CHECK-NEXT:    store i32 [[TMP22]], i32* [[TMP18]], align 4, !llvm.access.group !1
+; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 2
+; CHECK-NEXT:    store i32 [[TMP23]], i32* [[TMP19]], align 4, !llvm.access.group !1
+; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 3
+; CHECK-NEXT:    store i32 [[TMP24]], i32* [[TMP20]], align 4, !llvm.access.group !1
+; CHECK-NEXT:    [[TMP25:%.*]] = or i64 [[INDEX]], 1
+; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP25]]
+; CHECK-NEXT:    [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>*
+; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP27]], align 4, !llvm.access.group !0
+; CHECK-NEXT:    [[TMP28:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[WIDE_LOAD1]], <4 x i32>* [[TMP28]], align 4, !llvm.access.group !0
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512
-; CHECK-NEXT:    br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-NEXT:    [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512
+; CHECK-NEXT:    br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:

diff --git a/llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll b/llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll
index 163c761f00ff0..81eadef0536fa 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll
@@ -480,8 +480,8 @@ define i32 @test_count_bits(i8* %test_base) {
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP52:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP53:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP50:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP51:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 1
@@ -491,64 +491,62 @@ define i32 @test_count_bits(i8* %test_base) {
 ; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 5
 ; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 6
 ; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], 7
-; CHECK-NEXT:    [[TMP8:%.*]] = udiv <4 x i64> [[VEC_IND]], <i64 8, i64 8, i64 8, i64 8>
-; CHECK-NEXT:    [[TMP9:%.*]] = udiv <4 x i64> [[STEP_ADD]], <i64 8, i64 8, i64 8, i64 8>
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x i64> [[TMP8]], i32 0
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, i8* [[TEST_BASE:%.*]], i64 [[TMP10]]
-; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <4 x i64> [[TMP8]], i32 1
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, i8* [[TEST_BASE]], i64 [[TMP12]]
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <4 x i64> [[TMP8]], i32 2
-; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, i8* [[TEST_BASE]], i64 [[TMP14]]
-; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <4 x i64> [[TMP8]], i32 3
-; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, i8* [[TEST_BASE]], i64 [[TMP16]]
-; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <4 x i64> [[TMP9]], i32 0
-; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i8, i8* [[TEST_BASE]], i64 [[TMP18]]
-; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <4 x i64> [[TMP9]], i32 1
-; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i8, i8* [[TEST_BASE]], i64 [[TMP20]]
-; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <4 x i64> [[TMP9]], i32 2
-; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i8, i8* [[TEST_BASE]], i64 [[TMP22]]
-; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <4 x i64> [[TMP9]], i32 3
-; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i8, i8* [[TEST_BASE]], i64 [[TMP24]]
-; CHECK-NEXT:    [[TMP26:%.*]] = load i8, i8* [[TMP11]], align 1
-; CHECK-NEXT:    [[TMP27:%.*]] = load i8, i8* [[TMP13]], align 1
-; CHECK-NEXT:    [[TMP28:%.*]] = load i8, i8* [[TMP15]], align 1
-; CHECK-NEXT:    [[TMP29:%.*]] = load i8, i8* [[TMP17]], align 1
-; CHECK-NEXT:    [[TMP30:%.*]] = insertelement <4 x i8> poison, i8 [[TMP26]], i32 0
-; CHECK-NEXT:    [[TMP31:%.*]] = insertelement <4 x i8> [[TMP30]], i8 [[TMP27]], i32 1
-; CHECK-NEXT:    [[TMP32:%.*]] = insertelement <4 x i8> [[TMP31]], i8 [[TMP28]], i32 2
-; CHECK-NEXT:    [[TMP33:%.*]] = insertelement <4 x i8> [[TMP32]], i8 [[TMP29]], i32 3
-; CHECK-NEXT:    [[TMP34:%.*]] = load i8, i8* [[TMP19]], align 1
-; CHECK-NEXT:    [[TMP35:%.*]] = load i8, i8* [[TMP21]], align 1
-; CHECK-NEXT:    [[TMP36:%.*]] = load i8, i8* [[TMP23]], align 1
-; CHECK-NEXT:    [[TMP37:%.*]] = load i8, i8* [[TMP25]], align 1
-; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <4 x i8> poison, i8 [[TMP34]], i32 0
-; CHECK-NEXT:    [[TMP39:%.*]] = insertelement <4 x i8> [[TMP38]], i8 [[TMP35]], i32 1
-; CHECK-NEXT:    [[TMP40:%.*]] = insertelement <4 x i8> [[TMP39]], i8 [[TMP36]], i32 2
-; CHECK-NEXT:    [[TMP41:%.*]] = insertelement <4 x i8> [[TMP40]], i8 [[TMP37]], i32 3
-; CHECK-NEXT:    [[TMP42:%.*]] = urem <4 x i64> [[VEC_IND]], <i64 8, i64 8, i64 8, i64 8>
-; CHECK-NEXT:    [[TMP43:%.*]] = urem <4 x i64> [[STEP_ADD]], <i64 8, i64 8, i64 8, i64 8>
-; CHECK-NEXT:    [[TMP44:%.*]] = trunc <4 x i64> [[TMP42]] to <4 x i8>
-; CHECK-NEXT:    [[TMP45:%.*]] = trunc <4 x i64> [[TMP43]] to <4 x i8>
-; CHECK-NEXT:    [[TMP46:%.*]] = lshr <4 x i8> [[TMP33]], [[TMP44]]
-; CHECK-NEXT:    [[TMP47:%.*]] = lshr <4 x i8> [[TMP41]], [[TMP45]]
-; CHECK-NEXT:    [[TMP48:%.*]] = and <4 x i8> [[TMP46]], <i8 1, i8 1, i8 1, i8 1>
-; CHECK-NEXT:    [[TMP49:%.*]] = and <4 x i8> [[TMP47]], <i8 1, i8 1, i8 1, i8 1>
-; CHECK-NEXT:    [[TMP50:%.*]] = zext <4 x i8> [[TMP48]] to <4 x i32>
-; CHECK-NEXT:    [[TMP51:%.*]] = zext <4 x i8> [[TMP49]] to <4 x i32>
-; CHECK-NEXT:    [[TMP52]] = add <4 x i32> [[VEC_PHI]], [[TMP50]]
-; CHECK-NEXT:    [[TMP53]] = add <4 x i32> [[VEC_PHI2]], [[TMP51]]
+; CHECK-NEXT:    [[TMP8:%.*]] = udiv i64 [[TMP0]], 8
+; CHECK-NEXT:    [[TMP9:%.*]] = udiv i64 [[TMP1]], 8
+; CHECK-NEXT:    [[TMP10:%.*]] = udiv i64 [[TMP2]], 8
+; CHECK-NEXT:    [[TMP11:%.*]] = udiv i64 [[TMP3]], 8
+; CHECK-NEXT:    [[TMP12:%.*]] = udiv i64 [[TMP4]], 8
+; CHECK-NEXT:    [[TMP13:%.*]] = udiv i64 [[TMP5]], 8
+; CHECK-NEXT:    [[TMP14:%.*]] = udiv i64 [[TMP6]], 8
+; CHECK-NEXT:    [[TMP15:%.*]] = udiv i64 [[TMP7]], 8
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, i8* [[TEST_BASE:%.*]], i64 [[TMP8]]
+; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, i8* [[TEST_BASE]], i64 [[TMP9]]
+; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, i8* [[TEST_BASE]], i64 [[TMP10]]
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i8, i8* [[TEST_BASE]], i64 [[TMP11]]
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i8, i8* [[TEST_BASE]], i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i8, i8* [[TEST_BASE]], i64 [[TMP13]]
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i8, i8* [[TEST_BASE]], i64 [[TMP14]]
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i8, i8* [[TEST_BASE]], i64 [[TMP15]]
+; CHECK-NEXT:    [[TMP24:%.*]] = load i8, i8* [[TMP16]], align 1
+; CHECK-NEXT:    [[TMP25:%.*]] = load i8, i8* [[TMP17]], align 1
+; CHECK-NEXT:    [[TMP26:%.*]] = load i8, i8* [[TMP18]], align 1
+; CHECK-NEXT:    [[TMP27:%.*]] = load i8, i8* [[TMP19]], align 1
+; CHECK-NEXT:    [[TMP28:%.*]] = insertelement <4 x i8> poison, i8 [[TMP24]], i32 0
+; CHECK-NEXT:    [[TMP29:%.*]] = insertelement <4 x i8> [[TMP28]], i8 [[TMP25]], i32 1
+; CHECK-NEXT:    [[TMP30:%.*]] = insertelement <4 x i8> [[TMP29]], i8 [[TMP26]], i32 2
+; CHECK-NEXT:    [[TMP31:%.*]] = insertelement <4 x i8> [[TMP30]], i8 [[TMP27]], i32 3
+; CHECK-NEXT:    [[TMP32:%.*]] = load i8, i8* [[TMP20]], align 1
+; CHECK-NEXT:    [[TMP33:%.*]] = load i8, i8* [[TMP21]], align 1
+; CHECK-NEXT:    [[TMP34:%.*]] = load i8, i8* [[TMP22]], align 1
+; CHECK-NEXT:    [[TMP35:%.*]] = load i8, i8* [[TMP23]], align 1
+; CHECK-NEXT:    [[TMP36:%.*]] = insertelement <4 x i8> poison, i8 [[TMP32]], i32 0
+; CHECK-NEXT:    [[TMP37:%.*]] = insertelement <4 x i8> [[TMP36]], i8 [[TMP33]], i32 1
+; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <4 x i8> [[TMP37]], i8 [[TMP34]], i32 2
+; CHECK-NEXT:    [[TMP39:%.*]] = insertelement <4 x i8> [[TMP38]], i8 [[TMP35]], i32 3
+; CHECK-NEXT:    [[TMP40:%.*]] = urem <4 x i64> [[VEC_IND]], <i64 8, i64 8, i64 8, i64 8>
+; CHECK-NEXT:    [[TMP41:%.*]] = urem <4 x i64> [[STEP_ADD]], <i64 8, i64 8, i64 8, i64 8>
+; CHECK-NEXT:    [[TMP42:%.*]] = trunc <4 x i64> [[TMP40]] to <4 x i8>
+; CHECK-NEXT:    [[TMP43:%.*]] = trunc <4 x i64> [[TMP41]] to <4 x i8>
+; CHECK-NEXT:    [[TMP44:%.*]] = lshr <4 x i8> [[TMP31]], [[TMP42]]
+; CHECK-NEXT:    [[TMP45:%.*]] = lshr <4 x i8> [[TMP39]], [[TMP43]]
+; CHECK-NEXT:    [[TMP46:%.*]] = and <4 x i8> [[TMP44]], <i8 1, i8 1, i8 1, i8 1>
+; CHECK-NEXT:    [[TMP47:%.*]] = and <4 x i8> [[TMP45]], <i8 1, i8 1, i8 1, i8 1>
+; CHECK-NEXT:    [[TMP48:%.*]] = zext <4 x i8> [[TMP46]] to <4 x i32>
+; CHECK-NEXT:    [[TMP49:%.*]] = zext <4 x i8> [[TMP47]] to <4 x i32>
+; CHECK-NEXT:    [[TMP50]] = add <4 x i32> [[VEC_PHI]], [[TMP48]]
+; CHECK-NEXT:    [[TMP51]] = add <4 x i32> [[VEC_PHI2]], [[TMP49]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD]], <i64 4, i64 4, i64 4, i64 4>
-; CHECK-NEXT:    [[TMP54:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
-; CHECK-NEXT:    br i1 [[TMP54]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
+; CHECK-NEXT:    [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+; CHECK-NEXT:    br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[TMP53]], [[TMP52]]
-; CHECK-NEXT:    [[TMP55:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[TMP51]], [[TMP50]]
+; CHECK-NEXT:    [[TMP53:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 4096, 4096
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP55]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP53]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
@@ -566,7 +564,7 @@ define i32 @test_count_bits(i8* %test_base) {
 ; CHECK-NEXT:    [[EXIT:%.*]] = icmp ugt i64 [[IV]], 4094
 ; CHECK-NEXT:    br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP20:![0-9]+]]
 ; CHECK:       loop_exit:
-; CHECK-NEXT:    [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LOOP]] ], [ [[TMP55]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LOOP]] ], [ [[TMP53]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret i32 [[ACCUM_NEXT_LCSSA]]
 ;
 entry: