[llvm] 168a44a - [CostModel][X86] Improve extract/insert element costs (PR43605)

Thu Feb 27 07:54:35 PST 2020

Author: Simon Pilgrim
Date: 2020-02-27T15:54:13Z
New Revision: 168a44a70eb562f9a88a2e25b6ffd9cca3b25052

URL: https://github.com/llvm/llvm-project/commit/168a44a70eb562f9a88a2e25b6ffd9cca3b25052
DIFF: https://github.com/llvm/llvm-project/commit/168a44a70eb562f9a88a2e25b6ffd9cca3b25052.diff

LOG: [CostModel][X86] Improve extract/insert element costs (PR43605)

This tries to improve the accuracy of extract/insert element costs by accounting for subvector extraction/insertion for >128-bit vectors and the shuffling of elements to/from the 0'th index.

It also adds INSERTPS for f32 types and PINSR/PEXTR costs for integer types (at the moment we assume the same cost as MOVD/MOVQ - which isn't always true).

Differential Revision: https://reviews.llvm.org/D74976

Added: 
    

Modified: 
    llvm/lib/Target/X86/X86TargetTransformInfo.cpp
    llvm/test/Analysis/CostModel/X86/arith-fp.ll
    llvm/test/Analysis/CostModel/X86/extend.ll
    llvm/test/Analysis/CostModel/X86/fptosi.ll
    llvm/test/Analysis/CostModel/X86/fptoui.ll
    llvm/test/Analysis/CostModel/X86/fround.ll
    llvm/test/Analysis/CostModel/X86/intrinsic-cost.ll
    llvm/test/Analysis/CostModel/X86/load_store.ll
    llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll
    llvm/test/Analysis/CostModel/X86/shuffle-extract_subvector.ll
    llvm/test/Analysis/CostModel/X86/shuffle-insert_subvector.ll
    llvm/test/Analysis/CostModel/X86/sitofp.ll
    llvm/test/Analysis/CostModel/X86/vector-extract.ll
    llvm/test/Analysis/CostModel/X86/vector-insert.ll
    llvm/test/Analysis/CostModel/X86/vshift-ashr-cost.ll
    llvm/test/Analysis/CostModel/X86/vshift-lshr-cost.ll
    llvm/test/Analysis/CostModel/X86/vshift-shl-cost.ll
    llvm/test/Transforms/LoopVectorize/X86/interleaving.ll
    llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll
    llvm/test/Transforms/SLPVectorizer/X86/PR35865.ll
    llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll
    llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll
    llvm/test/Transforms/SLPVectorizer/X86/hadd.ll
    llvm/test/Transforms/SLPVectorizer/X86/hsub.ll
    llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
    llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll
    llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll
    llvm/test/Transforms/SLPVectorizer/X86/remark_horcost.ll
    llvm/test/Transforms/SLPVectorizer/X86/resched.ll
    llvm/test/Transforms/SLPVectorizer/X86/sext.ll
    llvm/test/Transforms/SLPVectorizer/X86/zext.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 7483d8f44179..59e93303046e 100644

--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -2393,10 +2393,11 @@ int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
    };
 
   assert(Val->isVectorTy() && "This must be a vector type");
-
   Type *ScalarType = Val->getScalarType();
+  int RegisterFileMoveCost = 0;
 
-  if (Index != -1U) {
+  if (Index != -1U && (Opcode == Instruction::ExtractElement ||
+                       Opcode == Instruction::InsertElement)) {
     // Legalize the type.
     std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Val);
 
@@ -2405,17 +2406,32 @@ int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
       return 0;
 
     // The type may be split. Normalize the index to the new type.
-    unsigned Width = LT.second.getVectorNumElements();
-    Index = Index % Width;
+    unsigned NumElts = LT.second.getVectorNumElements();
+    unsigned SubNumElts = NumElts;
+    Index = Index % NumElts;
+
+    // For >128-bit vectors, we need to extract higher 128-bit subvectors.
+    // For inserts, we also need to insert the subvector back.
+    if (LT.second.getSizeInBits() > 128) {
+      assert((LT.second.getSizeInBits() % 128) == 0 && "Illegal vector");
+      unsigned NumSubVecs = LT.second.getSizeInBits() / 128;
+      SubNumElts = NumElts / NumSubVecs;
+      if (SubNumElts <= Index) {
+        RegisterFileMoveCost += (Opcode == Instruction::InsertElement ? 2 : 1);
+        Index %= SubNumElts;
+      }
+    }
 
     if (Index == 0) {
       // Floating point scalars are already located in index #0.
+      // Many insertions to #0 can fold away for scalar fp-ops, so let's assume
+      // true for all.
       if (ScalarType->isFloatingPointTy())
-        return 0;
+        return RegisterFileMoveCost;
 
-      // Assume movd/movq XMM <-> GPR is relatively cheap on all targets.
-      if (ScalarType->isIntegerTy())
-        return 1;
+      // Assume movd/movq XMM -> GPR is relatively cheap on all targets.
+      if (ScalarType->isIntegerTy() && Opcode == Instruction::ExtractElement)
+        return 1 + RegisterFileMoveCost;
     }
 
     int ISD = TLI->InstructionOpcodeToISD(Opcode);
@@ -2423,14 +2439,36 @@ int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
     MVT MScalarTy = LT.second.getScalarType();
     if (ST->isSLM())
       if (auto *Entry = CostTableLookup(SLMCostTbl, ISD, MScalarTy))
-        return Entry->Cost;
+        return Entry->Cost + RegisterFileMoveCost;
+
+    // Assume pinsr/pextr XMM <-> GPR is relatively cheap on all targets.
+    if ((MScalarTy == MVT::i16 && ST->hasSSE2()) ||
+        (MScalarTy.isInteger() && ST->hasSSE41()))
+      return 1 + RegisterFileMoveCost;
+
+    // Assume insertps is relatively cheap on all targets.
+    if (MScalarTy == MVT::f32 && ST->hasSSE41() &&
+        Opcode == Instruction::InsertElement)
+      return 1 + RegisterFileMoveCost;
+
+    // For extractions we just need to shuffle the element to index 0, which
+    // should be very cheap (assume cost = 1). For insertions we need to shuffle
+    // the elements to its destination. In both cases we must handle the
+    // subvector move(s).
+    // TODO: Under what circumstances should we shuffle using the full width?
+    int ShuffleCost = 1;
+    if (Opcode == Instruction::InsertElement) {
+      Type *SubTy = VectorType::get(Val->getVectorElementType(), SubNumElts);
+      ShuffleCost = getShuffleCost(TTI::SK_PermuteTwoSrc, SubTy, 0, SubTy);
+    }
+    int IntOrFpCost = ScalarType->isFloatingPointTy() ? 0 : 1;
+    return ShuffleCost + IntOrFpCost + RegisterFileMoveCost;
   }
 
   // Add to the base cost if we know that the extracted element of a vector is
   // destined to be moved to and used in the integer register file.
-  int RegisterFileMoveCost = 0;
   if (Opcode == Instruction::ExtractElement && ScalarType->isPointerTy())
-    RegisterFileMoveCost = 1;
+    RegisterFileMoveCost += 1;
 
   return BaseT::getVectorInstrCost(Opcode, Val, Index) + RegisterFileMoveCost;
 }

diff  --git a/llvm/test/Analysis/CostModel/X86/arith-fp.ll b/llvm/test/Analysis/CostModel/X86/arith-fp.ll
index 2fb6b53ba342..2c3b1844ca28 100644
--- a/llvm/test/Analysis/CostModel/X86/arith-fp.ll
+++ b/llvm/test/Analysis/CostModel/X86/arith-fp.ll
@@ -673,9 +673,9 @@ define i32 @fdiv(i32 %arg) {
 define i32 @frem(i32 %arg) {
 ; SSE1-LABEL: 'frem'
 ; SSE1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F32 = frem float undef, undef
-; SSE1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4F32 = frem <4 x float> undef, undef
-; SSE1-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8F32 = frem <8 x float> undef, undef
-; SSE1-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V16F32 = frem <16 x float> undef, undef
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4F32 = frem <4 x float> undef, undef
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V8F32 = frem <8 x float> undef, undef
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V16F32 = frem <16 x float> undef, undef
 ; SSE1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F64 = frem double undef, undef
 ; SSE1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = frem <2 x double> undef, undef
 ; SSE1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = frem <4 x double> undef, undef
@@ -684,9 +684,9 @@ define i32 @frem(i32 %arg) {
 ;
 ; SSE2-LABEL: 'frem'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F32 = frem float undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4F32 = frem <4 x float> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8F32 = frem <8 x float> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V16F32 = frem <16 x float> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4F32 = frem <4 x float> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V8F32 = frem <8 x float> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V16F32 = frem <16 x float> undef, undef
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F64 = frem double undef, undef
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = frem <2 x double> undef, undef
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4F64 = frem <4 x double> undef, undef
@@ -707,23 +707,23 @@ define i32 @frem(i32 %arg) {
 ; AVX-LABEL: 'frem'
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F32 = frem float undef, undef
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4F32 = frem <4 x float> undef, undef
-; AVX-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V8F32 = frem <8 x float> undef, undef
-; AVX-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V16F32 = frem <16 x float> undef, undef
+; AVX-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8F32 = frem <8 x float> undef, undef
+; AVX-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16F32 = frem <16 x float> undef, undef
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F64 = frem double undef, undef
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = frem <2 x double> undef, undef
-; AVX-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4F64 = frem <4 x double> undef, undef
-; AVX-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8F64 = frem <8 x double> undef, undef
+; AVX-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4F64 = frem <4 x double> undef, undef
+; AVX-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8F64 = frem <8 x double> undef, undef
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512-LABEL: 'frem'
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F32 = frem float undef, undef
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4F32 = frem <4 x float> undef, undef
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V8F32 = frem <8 x float> undef, undef
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %V16F32 = frem <16 x float> undef, undef
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8F32 = frem <8 x float> undef, undef
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %V16F32 = frem <16 x float> undef, undef
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F64 = frem double undef, undef
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = frem <2 x double> undef, undef
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4F64 = frem <4 x double> undef, undef
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V8F64 = frem <8 x double> undef, undef
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4F64 = frem <4 x double> undef, undef
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V8F64 = frem <8 x double> undef, undef
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SLM-LABEL: 'frem'
@@ -1059,9 +1059,9 @@ define i32 @fcopysign(i32 %arg) {
 define i32 @fma(i32 %arg) {
 ; SSE1-LABEL: 'fma'
 ; SSE1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %F32 = call float @llvm.fma.f32(float undef, float undef, float undef)
-; SSE1-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V4F32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef)
-; SSE1-NEXT:  Cost Model: Found an estimated cost of 86 for instruction: %V8F32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef)
-; SSE1-NEXT:  Cost Model: Found an estimated cost of 172 for instruction: %V16F32 = call <16 x float> @llvm.fma.v16f32(<16 x float> undef, <16 x float> undef, <16 x float> undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V4F32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %V8F32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 184 for instruction: %V16F32 = call <16 x float> @llvm.fma.v16f32(<16 x float> undef, <16 x float> undef, <16 x float> undef)
 ; SSE1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %F64 = call double @llvm.fma.f64(double undef, double undef, double undef)
 ; SSE1-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V2F64 = call <2 x double> @llvm.fma.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef)
 ; SSE1-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V4F64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef)
@@ -1070,9 +1070,9 @@ define i32 @fma(i32 %arg) {
 ;
 ; SSE2-LABEL: 'fma'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %F32 = call float @llvm.fma.f32(float undef, float undef, float undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V4F32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 86 for instruction: %V8F32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 172 for instruction: %V16F32 = call <16 x float> @llvm.fma.v16f32(<16 x float> undef, <16 x float> undef, <16 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V4F32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %V8F32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 184 for instruction: %V16F32 = call <16 x float> @llvm.fma.v16f32(<16 x float> undef, <16 x float> undef, <16 x float> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %F64 = call double @llvm.fma.f64(double undef, double undef, double undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V2F64 = call <2 x double> @llvm.fma.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V4F64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef)
@@ -1093,12 +1093,12 @@ define i32 @fma(i32 %arg) {
 ; AVX-LABEL: 'fma'
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %F32 = call float @llvm.fma.f32(float undef, float undef, float undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V4F32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 87 for instruction: %V8F32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 174 for instruction: %V16F32 = call <16 x float> @llvm.fma.v16f32(<16 x float> undef, <16 x float> undef, <16 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 94 for instruction: %V8F32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 188 for instruction: %V16F32 = call <16 x float> @llvm.fma.v16f32(<16 x float> undef, <16 x float> undef, <16 x float> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %F64 = call double @llvm.fma.f64(double undef, double undef, double undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V2F64 = call <2 x double> @llvm.fma.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V4F64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 86 for instruction: %V8F64 = call <8 x double> @llvm.fma.v8f64(<8 x double> undef, <8 x double> undef, <8 x double> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V4F64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %V8F64 = call <8 x double> @llvm.fma.v8f64(<8 x double> undef, <8 x double> undef, <8 x double> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512-LABEL: 'fma'

diff  --git a/llvm/test/Analysis/CostModel/X86/extend.ll b/llvm/test/Analysis/CostModel/X86/extend.ll
index 03cc5e9eb923..1b2e254f821f 100644
--- a/llvm/test/Analysis/CostModel/X86/extend.ll
+++ b/llvm/test/Analysis/CostModel/X86/extend.ll
@@ -374,8 +374,8 @@ define i32 @zext_vXi1() {
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = zext <2 x i1> undef to <2 x i16>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i16 = zext <4 x i1> undef to <4 x i16>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = zext <8 x i1> undef to <8 x i16>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16i16 = zext <16 x i1> undef to <16 x i16>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 97 for instruction: %V32i16 = zext <32 x i1> undef to <32 x i16>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V16i16 = zext <16 x i1> undef to <16 x i16>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 145 for instruction: %V32i16 = zext <32 x i1> undef to <32 x i16>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = zext i1 undef to i8
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i8 = zext <2 x i1> undef to <2 x i8>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i8 = zext <4 x i1> undef to <4 x i8>
@@ -399,8 +399,8 @@ define i32 @zext_vXi1() {
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = zext <2 x i1> undef to <2 x i16>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i16 = zext <4 x i1> undef to <4 x i16>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = zext <8 x i1> undef to <8 x i16>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16i16 = zext <16 x i1> undef to <16 x i16>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 97 for instruction: %V32i16 = zext <32 x i1> undef to <32 x i16>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V16i16 = zext <16 x i1> undef to <16 x i16>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 145 for instruction: %V32i16 = zext <32 x i1> undef to <32 x i16>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = zext i1 undef to i8
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i8 = zext <2 x i1> undef to <2 x i8>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i8 = zext <4 x i1> undef to <4 x i8>
@@ -414,7 +414,7 @@ define i32 @zext_vXi1() {
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = zext i1 undef to i64
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2i64 = zext <2 x i1> undef to <2 x i64>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4i64 = zext <4 x i1> undef to <4 x i64>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8i64 = zext <8 x i1> undef to <8 x i64>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V8i64 = zext <8 x i1> undef to <8 x i64>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = zext i1 undef to i32
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2i32 = zext <2 x i1> undef to <2 x i32>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4i32 = zext <4 x i1> undef to <4 x i32>
@@ -424,8 +424,8 @@ define i32 @zext_vXi1() {
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2i16 = zext <2 x i1> undef to <2 x i16>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4i16 = zext <4 x i1> undef to <4 x i16>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8i16 = zext <8 x i1> undef to <8 x i16>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16i16 = zext <16 x i1> undef to <16 x i16>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 97 for instruction: %V32i16 = zext <32 x i1> undef to <32 x i16>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V16i16 = zext <16 x i1> undef to <16 x i16>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 145 for instruction: %V32i16 = zext <32 x i1> undef to <32 x i16>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = zext i1 undef to i8
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2i8 = zext <2 x i1> undef to <2 x i8>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4i8 = zext <4 x i1> undef to <4 x i8>
@@ -439,7 +439,7 @@ define i32 @zext_vXi1() {
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = zext i1 undef to i64
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2i64 = zext <2 x i1> undef to <2 x i64>
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4i64 = zext <4 x i1> undef to <4 x i64>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8i64 = zext <8 x i1> undef to <8 x i64>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V8i64 = zext <8 x i1> undef to <8 x i64>
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = zext i1 undef to i32
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2i32 = zext <2 x i1> undef to <2 x i32>
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4i32 = zext <4 x i1> undef to <4 x i32>
@@ -474,8 +474,8 @@ define i32 @zext_vXi1() {
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = zext <2 x i1> undef to <2 x i16>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i16 = zext <4 x i1> undef to <4 x i16>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = zext <8 x i1> undef to <8 x i16>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16i16 = zext <16 x i1> undef to <16 x i16>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 97 for instruction: %V32i16 = zext <32 x i1> undef to <32 x i16>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V16i16 = zext <16 x i1> undef to <16 x i16>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 145 for instruction: %V32i16 = zext <32 x i1> undef to <32 x i16>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = zext i1 undef to i8
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i8 = zext <2 x i1> undef to <2 x i8>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i8 = zext <4 x i1> undef to <4 x i8>
@@ -877,8 +877,8 @@ define i32 @sext_vXi1() {
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i16 = sext <2 x i1> undef to <2 x i16>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4i16 = sext <4 x i1> undef to <4 x i16>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8i16 = sext <8 x i1> undef to <8 x i16>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16i16 = sext <16 x i1> undef to <16 x i16>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 97 for instruction: %V32i16 = sext <32 x i1> undef to <32 x i16>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V16i16 = sext <16 x i1> undef to <16 x i16>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 145 for instruction: %V32i16 = sext <32 x i1> undef to <32 x i16>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = sext i1 undef to i8
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i8 = sext <2 x i1> undef to <2 x i8>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4i8 = sext <4 x i1> undef to <4 x i8>
@@ -902,8 +902,8 @@ define i32 @sext_vXi1() {
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i16 = sext <2 x i1> undef to <2 x i16>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4i16 = sext <4 x i1> undef to <4 x i16>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8i16 = sext <8 x i1> undef to <8 x i16>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16i16 = sext <16 x i1> undef to <16 x i16>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 97 for instruction: %V32i16 = sext <32 x i1> undef to <32 x i16>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V16i16 = sext <16 x i1> undef to <16 x i16>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 145 for instruction: %V32i16 = sext <32 x i1> undef to <32 x i16>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = sext i1 undef to i8
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i8 = sext <2 x i1> undef to <2 x i8>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4i8 = sext <4 x i1> undef to <4 x i8>
@@ -917,7 +917,7 @@ define i32 @sext_vXi1() {
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = sext i1 undef to i64
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2i64 = sext <2 x i1> undef to <2 x i64>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4i64 = sext <4 x i1> undef to <4 x i64>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8i64 = sext <8 x i1> undef to <8 x i64>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V8i64 = sext <8 x i1> undef to <8 x i64>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = sext i1 undef to i32
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2i32 = sext <2 x i1> undef to <2 x i32>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4i32 = sext <4 x i1> undef to <4 x i32>
@@ -927,8 +927,8 @@ define i32 @sext_vXi1() {
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2i16 = sext <2 x i1> undef to <2 x i16>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4i16 = sext <4 x i1> undef to <4 x i16>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8i16 = sext <8 x i1> undef to <8 x i16>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16i16 = sext <16 x i1> undef to <16 x i16>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 97 for instruction: %V32i16 = sext <32 x i1> undef to <32 x i16>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V16i16 = sext <16 x i1> undef to <16 x i16>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 145 for instruction: %V32i16 = sext <32 x i1> undef to <32 x i16>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = sext i1 undef to i8
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2i8 = sext <2 x i1> undef to <2 x i8>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4i8 = sext <4 x i1> undef to <4 x i8>
@@ -942,7 +942,7 @@ define i32 @sext_vXi1() {
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = sext i1 undef to i64
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2i64 = sext <2 x i1> undef to <2 x i64>
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4i64 = sext <4 x i1> undef to <4 x i64>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8i64 = sext <8 x i1> undef to <8 x i64>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V8i64 = sext <8 x i1> undef to <8 x i64>
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = sext i1 undef to i32
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2i32 = sext <2 x i1> undef to <2 x i32>
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4i32 = sext <4 x i1> undef to <4 x i32>
@@ -977,8 +977,8 @@ define i32 @sext_vXi1() {
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i16 = sext <2 x i1> undef to <2 x i16>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4i16 = sext <4 x i1> undef to <4 x i16>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8i16 = sext <8 x i1> undef to <8 x i16>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16i16 = sext <16 x i1> undef to <16 x i16>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 97 for instruction: %V32i16 = sext <32 x i1> undef to <32 x i16>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V16i16 = sext <16 x i1> undef to <16 x i16>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 145 for instruction: %V32i16 = sext <32 x i1> undef to <32 x i16>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = sext i1 undef to i8
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i8 = sext <2 x i1> undef to <2 x i8>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4i8 = sext <4 x i1> undef to <4 x i8>

diff  --git a/llvm/test/Analysis/CostModel/X86/fptosi.ll b/llvm/test/Analysis/CostModel/X86/fptosi.ll
index 0cd148df7be8..f4413449f58f 100644
--- a/llvm/test/Analysis/CostModel/X86/fptosi.ll
+++ b/llvm/test/Analysis/CostModel/X86/fptosi.ll
@@ -13,9 +13,9 @@
 define i32 @fptosi_double_i64(i32 %arg) {
 ; SSE2-LABEL: 'fptosi_double_i64'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptosi double undef to i64
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = fptosi <2 x double> undef to <2 x i64>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = fptosi <4 x double> undef to <4 x i64>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V8I64 = fptosi <8 x double> undef to <8 x i64>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = fptosi <2 x double> undef to <2 x i64>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4I64 = fptosi <4 x double> undef to <4 x i64>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V8I64 = fptosi <8 x double> undef to <8 x i64>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'fptosi_double_i64'
@@ -28,15 +28,15 @@ define i32 @fptosi_double_i64(i32 %arg) {
 ; AVX-LABEL: 'fptosi_double_i64'
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptosi double undef to i64
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = fptosi <2 x double> undef to <2 x i64>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = fptosi <4 x double> undef to <4 x i64>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8I64 = fptosi <8 x double> undef to <8 x i64>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I64 = fptosi <4 x double> undef to <4 x i64>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V8I64 = fptosi <8 x double> undef to <8 x i64>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'fptosi_double_i64'
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptosi double undef to i64
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = fptosi <2 x double> undef to <2 x i64>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = fptosi <4 x double> undef to <4 x i64>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = fptosi <8 x double> undef to <8 x i64>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I64 = fptosi <4 x double> undef to <4 x i64>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V8I64 = fptosi <8 x double> undef to <8 x i64>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'fptosi_double_i64'
@@ -121,9 +121,9 @@ define i32 @fptosi_double_i16(i32 %arg) {
 define i32 @fptosi_double_i8(i32 %arg) {
 ; SSE2-LABEL: 'fptosi_double_i8'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptosi double undef to i8
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I8 = fptosi <2 x double> undef to <2 x i8>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I8 = fptosi <4 x double> undef to <4 x i8>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V8I8 = fptosi <8 x double> undef to <8 x i8>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V2I8 = fptosi <2 x double> undef to <2 x i8>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %V4I8 = fptosi <4 x double> undef to <4 x i8>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %V8I8 = fptosi <8 x double> undef to <8 x i8>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'fptosi_double_i8'
@@ -164,10 +164,10 @@ define i32 @fptosi_double_i8(i32 %arg) {
 define i32 @fptosi_float_i64(i32 %arg) {
 ; SSE2-LABEL: 'fptosi_float_i64'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptosi float undef to i64
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = fptosi <2 x float> undef to <2 x i64>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = fptosi <4 x float> undef to <4 x i64>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V8I64 = fptosi <8 x float> undef to <8 x i64>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %V16I64 = fptosi <16 x float> undef to <16 x i64>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = fptosi <2 x float> undef to <2 x i64>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4I64 = fptosi <4 x float> undef to <4 x i64>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V8I64 = fptosi <8 x float> undef to <8 x i64>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %V16I64 = fptosi <16 x float> undef to <16 x i64>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'fptosi_float_i64'
@@ -181,17 +181,17 @@ define i32 @fptosi_float_i64(i32 %arg) {
 ; AVX-LABEL: 'fptosi_float_i64'
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptosi float undef to i64
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = fptosi <2 x float> undef to <2 x i64>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = fptosi <4 x float> undef to <4 x i64>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8I64 = fptosi <8 x float> undef to <8 x i64>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V16I64 = fptosi <16 x float> undef to <16 x i64>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I64 = fptosi <4 x float> undef to <4 x i64>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V8I64 = fptosi <8 x float> undef to <8 x i64>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 75 for instruction: %V16I64 = fptosi <16 x float> undef to <16 x i64>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'fptosi_float_i64'
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptosi float undef to i64
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = fptosi <2 x float> undef to <2 x i64>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = fptosi <4 x float> undef to <4 x i64>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = fptosi <8 x float> undef to <8 x i64>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V16I64 = fptosi <16 x float> undef to <16 x i64>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I64 = fptosi <4 x float> undef to <4 x i64>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V8I64 = fptosi <8 x float> undef to <8 x i64>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 85 for instruction: %V16I64 = fptosi <16 x float> undef to <16 x i64>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'fptosi_float_i64'
@@ -265,9 +265,9 @@ define i32 @fptosi_float_i16(i32 %arg) {
 define i32 @fptosi_float_i8(i32 %arg) {
 ; SSE2-LABEL: 'fptosi_float_i8'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptosi float undef to i8
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I8 = fptosi <4 x float> undef to <4 x i8>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8I8 = fptosi <8 x float> undef to <8 x i8>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V16I8 = fptosi <16 x float> undef to <16 x i8>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %V4I8 = fptosi <4 x float> undef to <4 x i8>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %V8I8 = fptosi <8 x float> undef to <8 x i8>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 271 for instruction: %V16I8 = fptosi <16 x float> undef to <16 x i8>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'fptosi_float_i8'

diff  --git a/llvm/test/Analysis/CostModel/X86/fptoui.ll b/llvm/test/Analysis/CostModel/X86/fptoui.ll
index b7af158bf5ed..33bd5438609d 100644
--- a/llvm/test/Analysis/CostModel/X86/fptoui.ll
+++ b/llvm/test/Analysis/CostModel/X86/fptoui.ll
@@ -13,9 +13,9 @@
 define i32 @fptoui_double_i64(i32 %arg) {
 ; SSE2-LABEL: 'fptoui_double_i64'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = fptoui double undef to i64
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = fptoui <2 x double> undef to <2 x i64>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V4I64 = fptoui <4 x double> undef to <4 x i64>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V8I64 = fptoui <8 x double> undef to <8 x i64>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V2I64 = fptoui <2 x double> undef to <2 x i64>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V4I64 = fptoui <4 x double> undef to <4 x i64>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %V8I64 = fptoui <8 x double> undef to <8 x i64>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'fptoui_double_i64'
@@ -28,15 +28,15 @@ define i32 @fptoui_double_i64(i32 %arg) {
 ; AVX-LABEL: 'fptoui_double_i64'
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = fptoui double undef to i64
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = fptoui <2 x double> undef to <2 x i64>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4I64 = fptoui <4 x double> undef to <4 x i64>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V8I64 = fptoui <8 x double> undef to <8 x i64>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V4I64 = fptoui <4 x double> undef to <4 x i64>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V8I64 = fptoui <8 x double> undef to <8 x i64>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'fptoui_double_i64'
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptoui double undef to i64
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = fptoui <2 x double> undef to <2 x i64>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = fptoui <4 x double> undef to <4 x i64>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = fptoui <8 x double> undef to <8 x i64>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I64 = fptoui <4 x double> undef to <4 x i64>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V8I64 = fptoui <8 x double> undef to <8 x i64>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'fptoui_double_i64'
@@ -63,9 +63,9 @@ define i32 @fptoui_double_i64(i32 %arg) {
 define i32 @fptoui_double_i32(i32 %arg) {
 ; SSE2-LABEL: 'fptoui_double_i32'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui double undef to i32
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = fptoui <2 x double> undef to <2 x i32>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I32 = fptoui <4 x double> undef to <4 x i32>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V8I32 = fptoui <8 x double> undef to <8 x i32>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2I32 = fptoui <2 x double> undef to <2 x i32>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V4I32 = fptoui <4 x double> undef to <4 x i32>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V8I32 = fptoui <8 x double> undef to <8 x i32>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'fptoui_double_i32'
@@ -149,9 +149,9 @@ define i32 @fptoui_double_i16(i32 %arg) {
 define i32 @fptoui_double_i8(i32 %arg) {
 ; SSE2-LABEL: 'fptoui_double_i8'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptoui double undef to i8
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I8 = fptoui <2 x double> undef to <2 x i8>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I8 = fptoui <4 x double> undef to <4 x i8>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V8I8 = fptoui <8 x double> undef to <8 x i8>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V2I8 = fptoui <2 x double> undef to <2 x i8>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %V4I8 = fptoui <4 x double> undef to <4 x i8>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %V8I8 = fptoui <8 x double> undef to <8 x i8>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'fptoui_double_i8'
@@ -192,10 +192,10 @@ define i32 @fptoui_double_i8(i32 %arg) {
 define i32 @fptoui_float_i64(i32 %arg) {
 ; SSE2-LABEL: 'fptoui_float_i64'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = fptoui float undef to i64
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = fptoui <2 x float> undef to <2 x i64>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V4I64 = fptoui <4 x float> undef to <4 x i64>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V8I64 = fptoui <8 x float> undef to <8 x i64>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %V16I64 = fptoui <16 x float> undef to <16 x i64>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V2I64 = fptoui <2 x float> undef to <2 x i64>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V4I64 = fptoui <4 x float> undef to <4 x i64>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %V8I64 = fptoui <8 x float> undef to <8 x i64>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 127 for instruction: %V16I64 = fptoui <16 x float> undef to <16 x i64>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'fptoui_float_i64'
@@ -209,17 +209,17 @@ define i32 @fptoui_float_i64(i32 %arg) {
 ; AVX-LABEL: 'fptoui_float_i64'
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = fptoui float undef to i64
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = fptoui <2 x float> undef to <2 x i64>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4I64 = fptoui <4 x float> undef to <4 x i64>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V8I64 = fptoui <8 x float> undef to <8 x i64>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 99 for instruction: %V16I64 = fptoui <16 x float> undef to <16 x i64>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V4I64 = fptoui <4 x float> undef to <4 x i64>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V8I64 = fptoui <8 x float> undef to <8 x i64>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 123 for instruction: %V16I64 = fptoui <16 x float> undef to <16 x i64>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'fptoui_float_i64'
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptoui float undef to i64
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = fptoui <2 x float> undef to <2 x i64>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = fptoui <4 x float> undef to <4 x i64>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = fptoui <8 x float> undef to <8 x i64>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V16I64 = fptoui <16 x float> undef to <16 x i64>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I64 = fptoui <4 x float> undef to <4 x i64>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V8I64 = fptoui <8 x float> undef to <8 x i64>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 85 for instruction: %V16I64 = fptoui <16 x float> undef to <16 x i64>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'fptoui_float_i64'
@@ -249,9 +249,9 @@ define i32 @fptoui_float_i64(i32 %arg) {
 define i32 @fptoui_float_i32(i32 %arg) {
 ; SSE2-LABEL: 'fptoui_float_i32'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui float undef to i32
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = fptoui <4 x float> undef to <4 x i32>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8I32 = fptoui <8 x float> undef to <8 x i32>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V16I32 = fptoui <16 x float> undef to <16 x i32>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V4I32 = fptoui <4 x float> undef to <4 x i32>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V8I32 = fptoui <8 x float> undef to <8 x i32>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %V16I32 = fptoui <16 x float> undef to <16 x i32>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'fptoui_float_i32'
@@ -335,9 +335,9 @@ define i32 @fptoui_float_i16(i32 %arg) {
 define i32 @fptoui_float_i8(i32 %arg) {
 ; SSE2-LABEL: 'fptoui_float_i8'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptoui float undef to i8
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I8 = fptoui <4 x float> undef to <4 x i8>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8I8 = fptoui <8 x float> undef to <8 x i8>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V16I8 = fptoui <16 x float> undef to <16 x i8>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %V4I8 = fptoui <4 x float> undef to <4 x i8>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %V8I8 = fptoui <8 x float> undef to <8 x i8>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 271 for instruction: %V16I8 = fptoui <16 x float> undef to <16 x i8>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'fptoui_float_i8'

diff  --git a/llvm/test/Analysis/CostModel/X86/fround.ll b/llvm/test/Analysis/CostModel/X86/fround.ll
index 5e2247356956..ef2eca180312 100644
--- a/llvm/test/Analysis/CostModel/X86/fround.ll
+++ b/llvm/test/Analysis/CostModel/X86/fround.ll
@@ -16,9 +16,9 @@ target triple = "x86_64-apple-macosx10.8.0"
 define i32 @ceil(i32 %arg) {
 ; SSE2-LABEL: 'ceil'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %F32 = call float @llvm.ceil.f32(float undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V4F32 = call <4 x float> @llvm.ceil.v4f32(<4 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 86 for instruction: %V8F32 = call <8 x float> @llvm.ceil.v8f32(<8 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 172 for instruction: %V16F32 = call <16 x float> @llvm.ceil.v16f32(<16 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V4F32 = call <4 x float> @llvm.ceil.v4f32(<4 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %V8F32 = call <8 x float> @llvm.ceil.v8f32(<8 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 184 for instruction: %V16F32 = call <16 x float> @llvm.ceil.v16f32(<16 x float> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %F64 = call double @llvm.ceil.f64(double undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V2F64 = call <2 x double> @llvm.ceil.v2f64(<2 x double> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V4F64 = call <4 x double> @llvm.ceil.v4f64(<4 x double> undef)
@@ -96,9 +96,9 @@ define i32 @ceil(i32 %arg) {
 define i32 @floor(i32 %arg) {
 ; SSE2-LABEL: 'floor'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %F32 = call float @llvm.floor.f32(float undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V4F32 = call <4 x float> @llvm.floor.v4f32(<4 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 86 for instruction: %V8F32 = call <8 x float> @llvm.floor.v8f32(<8 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 172 for instruction: %V16F32 = call <16 x float> @llvm.floor.v16f32(<16 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V4F32 = call <4 x float> @llvm.floor.v4f32(<4 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %V8F32 = call <8 x float> @llvm.floor.v8f32(<8 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 184 for instruction: %V16F32 = call <16 x float> @llvm.floor.v16f32(<16 x float> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %F64 = call double @llvm.floor.f64(double undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V2F64 = call <2 x double> @llvm.floor.v2f64(<2 x double> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V4F64 = call <4 x double> @llvm.floor.v4f64(<4 x double> undef)
@@ -176,9 +176,9 @@ define i32 @floor(i32 %arg) {
 define i32 @nearbyint(i32 %arg) {
 ; SSE2-LABEL: 'nearbyint'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %F32 = call float @llvm.nearbyint.f32(float undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V4F32 = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 86 for instruction: %V8F32 = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 172 for instruction: %V16F32 = call <16 x float> @llvm.nearbyint.v16f32(<16 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V4F32 = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %V8F32 = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 184 for instruction: %V16F32 = call <16 x float> @llvm.nearbyint.v16f32(<16 x float> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %F64 = call double @llvm.nearbyint.f64(double undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V2F64 = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V4F64 = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> undef)
@@ -256,9 +256,9 @@ define i32 @nearbyint(i32 %arg) {
 define i32 @rint(i32 %arg) {
 ; SSE2-LABEL: 'rint'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %F32 = call float @llvm.rint.f32(float undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V4F32 = call <4 x float> @llvm.rint.v4f32(<4 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 86 for instruction: %V8F32 = call <8 x float> @llvm.rint.v8f32(<8 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 172 for instruction: %V16F32 = call <16 x float> @llvm.rint.v16f32(<16 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V4F32 = call <4 x float> @llvm.rint.v4f32(<4 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %V8F32 = call <8 x float> @llvm.rint.v8f32(<8 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 184 for instruction: %V16F32 = call <16 x float> @llvm.rint.v16f32(<16 x float> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %F64 = call double @llvm.rint.f64(double undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V2F64 = call <2 x double> @llvm.rint.v2f64(<2 x double> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V4F64 = call <4 x double> @llvm.rint.v4f64(<4 x double> undef)
@@ -336,9 +336,9 @@ define i32 @rint(i32 %arg) {
 define i32 @trunc(i32 %arg) {
 ; SSE2-LABEL: 'trunc'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %F32 = call float @llvm.trunc.f32(float undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V4F32 = call <4 x float> @llvm.trunc.v4f32(<4 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 86 for instruction: %V8F32 = call <8 x float> @llvm.trunc.v8f32(<8 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 172 for instruction: %V16F32 = call <16 x float> @llvm.trunc.v16f32(<16 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V4F32 = call <4 x float> @llvm.trunc.v4f32(<4 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %V8F32 = call <8 x float> @llvm.trunc.v8f32(<8 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 184 for instruction: %V16F32 = call <16 x float> @llvm.trunc.v16f32(<16 x float> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %F64 = call double @llvm.trunc.f64(double undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V2F64 = call <2 x double> @llvm.trunc.v2f64(<2 x double> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V4F64 = call <4 x double> @llvm.trunc.v4f64(<4 x double> undef)

diff  --git a/llvm/test/Analysis/CostModel/X86/intrinsic-cost.ll b/llvm/test/Analysis/CostModel/X86/intrinsic-cost.ll
index 2a5950760b9a..afa4079dce76 100644
--- a/llvm/test/Analysis/CostModel/X86/intrinsic-cost.ll
+++ b/llvm/test/Analysis/CostModel/X86/intrinsic-cost.ll
@@ -22,7 +22,7 @@ for.end:                                          ; preds = %vector.body
   ret void
 
 ; CORE2: Printing analysis 'Cost Model Analysis' for function 'test1':
-; CORE2: Cost Model: Found an estimated cost of 46 for instruction:   %2 = call <4 x float> @llvm.ceil.v4f32(<4 x float> %wide.load)
+; CORE2: Cost Model: Found an estimated cost of 49 for instruction:   %2 = call <4 x float> @llvm.ceil.v4f32(<4 x float> %wide.load)
 
 ; COREI7: Printing analysis 'Cost Model Analysis' for function 'test1':
 ; COREI7: Cost Model: Found an estimated cost of 1 for instruction:   %2 = call <4 x float> @llvm.ceil.v4f32(<4 x float> %wide.load)
@@ -50,7 +50,7 @@ for.end:                                          ; preds = %vector.body
   ret void
 
 ; CORE2: Printing analysis 'Cost Model Analysis' for function 'test2':
-; CORE2: Cost Model: Found an estimated cost of 46 for instruction:   %2 = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %wide.load)
+; CORE2: Cost Model: Found an estimated cost of 49 for instruction:   %2 = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %wide.load)
 
 ; COREI7: Printing analysis 'Cost Model Analysis' for function 'test2':
 ; COREI7: Cost Model: Found an estimated cost of 1 for instruction:   %2 = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %wide.load)

diff  --git a/llvm/test/Analysis/CostModel/X86/load_store.ll b/llvm/test/Analysis/CostModel/X86/load_store.ll
index ccf110a3a371..5d6a5975c596 100644
--- a/llvm/test/Analysis/CostModel/X86/load_store.ll
+++ b/llvm/test/Analysis/CostModel/X86/load_store.ll
@@ -72,10 +72,10 @@ define i32 @loads(i32 %arg) {
   ;CHECK: cost of 3 {{.*}} load
   load <3 x i64>, <3 x i64>* undef, align 4
 
-  ;CHECK: cost of 10 {{.*}} load
+  ;CHECK: cost of 12 {{.*}} load
   load <5 x i32>, <5 x i32>* undef, align 4
 
-  ;CHECK: cost of 10 {{.*}} load
+  ;CHECK: cost of 14 {{.*}} load
   load <5 x i64>, <5 x i64>* undef, align 4
 
   ret i32 undef

diff  --git a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll
index 076bfa3c7061..e2cc0af9d5b5 100644
--- a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll
+++ b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll
@@ -9,32 +9,59 @@
 ; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -mcpu=skx -cost-model -analyze | FileCheck %s --check-prefixes=CHECK,AVX512,SKX
 
 define i32 @masked_load() {
-; SSE-LABEL: 'masked_load'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* undef, i32 1, <8 x i1> undef, <8 x double> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* undef, i32 1, <4 x i1> undef, <4 x double> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* undef, i32 1, <2 x i1> undef, <2 x double> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0v1f64(<1 x double>* undef, i32 1, <1 x i1> undef, <1 x double> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* undef, i32 1, <16 x i1> undef, <16 x float> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* undef, i32 1, <8 x i1> undef, <8 x float> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* undef, i32 1, <4 x i1> undef, <4 x float> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* undef, i32 1, <2 x i1> undef, <2 x float> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0v8i64(<8 x i64>* undef, i32 1, <8 x i1> undef, <8 x i64> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* undef, i32 1, <4 x i1> undef, <4 x i64> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* undef, i32 1, <2 x i1> undef, <2 x i64> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0v1i64(<1 x i64>* undef, i32 1, <1 x i1> undef, <1 x i64> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* undef, i32 1, <16 x i1> undef, <16 x i32> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* undef, i32 1, <8 x i1> undef, <8 x i32> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* undef, i32 1, <4 x i1> undef, <4 x i32> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* undef, i32 1, <2 x i1> undef, <2 x i32> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0v32i16(<32 x i16>* undef, i32 1, <32 x i1> undef, <32 x i16> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0v16i16(<16 x i16>* undef, i32 1, <16 x i1> undef, <16 x i16> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* undef, i32 1, <8 x i1> undef, <8 x i16> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* undef, i32 1, <4 x i1> undef, <4 x i16> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>* undef, i32 1, <64 x i1> undef, <64 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0v32i8(<32 x i8>* undef, i32 1, <32 x i1> undef, <32 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* undef, i32 1, <16 x i1> undef, <16 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* undef, i32 1, <8 x i1> undef, <8 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
+; SSE2-LABEL: 'masked_load'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0v1f64(<1 x double>* undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 87 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0v8i64(<8 x i64>* undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0v1i64(<1 x i64>* undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 111 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 158 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0v32i16(<32 x i16>* undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0v16i16(<16 x i16>* undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1148 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>* undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 574 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0v32i8(<32 x i8>* undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 287 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 143 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
+;
+; SSE42-LABEL: 'masked_load'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0v1f64(<1 x double>* undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0v8i64(<8 x i64>* undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0v1i64(<1 x i64>* undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0v32i16(<32 x i16>* undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0v16i16(<16 x i16>* undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>* undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0v32i8(<32 x i8>* undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; AVX-LABEL: 'masked_load'
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* undef, i32 1, <8 x i1> undef, <8 x double> undef)
@@ -53,12 +80,12 @@ define i32 @masked_load() {
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* undef, i32 1, <8 x i1> undef, <8 x i32> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* undef, i32 1, <4 x i1> undef, <4 x i32> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* undef, i32 1, <2 x i1> undef, <2 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0v32i16(<32 x i16>* undef, i32 1, <32 x i1> undef, <32 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0v16i16(<16 x i16>* undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 176 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0v32i16(<32 x i16>* undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0v16i16(<16 x i16>* undef, i32 1, <16 x i1> undef, <16 x i16> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* undef, i32 1, <8 x i1> undef, <8 x i16> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* undef, i32 1, <4 x i1> undef, <4 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>* undef, i32 1, <64 x i1> undef, <64 x i8> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0v32i8(<32 x i8>* undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 352 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>* undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 176 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0v32i8(<32 x i8>* undef, i32 1, <32 x i1> undef, <32 x i8> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* undef, i32 1, <16 x i1> undef, <16 x i8> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* undef, i32 1, <8 x i1> undef, <8 x i8> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
@@ -80,12 +107,12 @@ define i32 @masked_load() {
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* undef, i32 1, <8 x i1> undef, <8 x i32> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* undef, i32 1, <4 x i1> undef, <4 x i32> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* undef, i32 1, <2 x i1> undef, <2 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0v32i16(<32 x i16>* undef, i32 1, <32 x i1> undef, <32 x i16> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0v16i16(<16 x i16>* undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 176 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0v32i16(<32 x i16>* undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0v16i16(<16 x i16>* undef, i32 1, <16 x i1> undef, <16 x i16> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* undef, i32 1, <8 x i1> undef, <8 x i16> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* undef, i32 1, <4 x i1> undef, <4 x i16> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>* undef, i32 1, <64 x i1> undef, <64 x i8> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0v32i8(<32 x i8>* undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 352 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>* undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 176 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0v32i8(<32 x i8>* undef, i32 1, <32 x i1> undef, <32 x i8> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* undef, i32 1, <16 x i1> undef, <16 x i8> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* undef, i32 1, <8 x i1> undef, <8 x i8> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
@@ -151,32 +178,59 @@ define i32 @masked_load() {
 }
 
 define i32 @masked_store() {
-; SSE-LABEL: 'masked_store'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> undef, <8 x double>* undef, i32 1, <8 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> undef, <4 x double>* undef, i32 1, <4 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.store.v2f64.p0v2f64(<2 x double> undef, <2 x double>* undef, i32 1, <2 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1f64.p0v1f64(<1 x double> undef, <1 x double>* undef, i32 1, <1 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> undef, <16 x float>* undef, i32 1, <16 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> undef, <8 x float>* undef, i32 1, <8 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> undef, <4 x float>* undef, i32 1, <4 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> undef, <2 x float>* undef, i32 1, <2 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v8i64.p0v8i64(<8 x i64> undef, <8 x i64>* undef, i32 1, <8 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v4i64.p0v4i64(<4 x i64> undef, <4 x i64>* undef, i32 1, <4 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2i64.p0v2i64(<2 x i64> undef, <2 x i64>* undef, i32 1, <2 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1i64.p0v1i64(<1 x i64> undef, <1 x i64>* undef, i32 1, <1 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> undef, <16 x i32>* undef, i32 1, <16 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> undef, <8 x i32>* undef, i32 1, <8 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> undef, <4 x i32>* undef, i32 1, <4 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32> undef, <2 x i32>* undef, i32 1, <2 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: call void @llvm.masked.store.v32i16.p0v32i16(<32 x i16> undef, <32 x i16>* undef, i32 1, <32 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.store.v16i16.p0v16i16(<16 x i16> undef, <16 x i16>* undef, i32 1, <16 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> undef, <8 x i16>* undef, i32 1, <8 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> undef, <4 x i16>* undef, i32 1, <4 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: call void @llvm.masked.store.v64i8.p0v64i8(<64 x i8> undef, <64 x i8>* undef, i32 1, <64 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: call void @llvm.masked.store.v32i8.p0v32i8(<32 x i8> undef, <32 x i8>* undef, i32 1, <32 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> undef, <16 x i8>* undef, i32 1, <16 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> undef, <8 x i8>* undef, i32 1, <8 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
+; SSE2-LABEL: 'masked_store'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> undef, <8 x double>* undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> undef, <4 x double>* undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2f64.p0v2f64(<2 x double> undef, <2 x double>* undef, i32 1, <2 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1f64.p0v1f64(<1 x double> undef, <1 x double>* undef, i32 1, <1 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 75 for instruction: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> undef, <16 x float>* undef, i32 1, <16 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> undef, <8 x float>* undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> undef, <4 x float>* undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> undef, <2 x float>* undef, i32 1, <2 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: call void @llvm.masked.store.v8i64.p0v8i64(<8 x i64> undef, <8 x i64>* undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.store.v4i64.p0v4i64(<4 x i64> undef, <4 x i64>* undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2i64.p0v2i64(<2 x i64> undef, <2 x i64>* undef, i32 1, <2 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1i64.p0v1i64(<1 x i64> undef, <1 x i64>* undef, i32 1, <1 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 91 for instruction: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> undef, <16 x i32>* undef, i32 1, <16 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> undef, <8 x i32>* undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> undef, <4 x i32>* undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32> undef, <2 x i32>* undef, i32 1, <2 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 158 for instruction: call void @llvm.masked.store.v32i16.p0v32i16(<32 x i16> undef, <32 x i16>* undef, i32 1, <32 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: call void @llvm.masked.store.v16i16.p0v16i16(<16 x i16> undef, <16 x i16>* undef, i32 1, <16 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> undef, <8 x i16>* undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> undef, <4 x i16>* undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 376 for instruction: call void @llvm.masked.store.v64i8.p0v64i8(<64 x i8> undef, <64 x i8>* undef, i32 1, <64 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 188 for instruction: call void @llvm.masked.store.v32i8.p0v32i8(<32 x i8> undef, <32 x i8>* undef, i32 1, <32 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 94 for instruction: call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> undef, <16 x i8>* undef, i32 1, <16 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> undef, <8 x i8>* undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
+;
+; SSE42-LABEL: 'masked_store'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> undef, <8 x double>* undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> undef, <4 x double>* undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.store.v2f64.p0v2f64(<2 x double> undef, <2 x double>* undef, i32 1, <2 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1f64.p0v1f64(<1 x double> undef, <1 x double>* undef, i32 1, <1 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> undef, <16 x float>* undef, i32 1, <16 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> undef, <8 x float>* undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> undef, <4 x float>* undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> undef, <2 x float>* undef, i32 1, <2 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v8i64.p0v8i64(<8 x i64> undef, <8 x i64>* undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v4i64.p0v4i64(<4 x i64> undef, <4 x i64>* undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2i64.p0v2i64(<2 x i64> undef, <2 x i64>* undef, i32 1, <2 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1i64.p0v1i64(<1 x i64> undef, <1 x i64>* undef, i32 1, <1 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> undef, <16 x i32>* undef, i32 1, <16 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> undef, <8 x i32>* undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> undef, <4 x i32>* undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32> undef, <2 x i32>* undef, i32 1, <2 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: call void @llvm.masked.store.v32i16.p0v32i16(<32 x i16> undef, <32 x i16>* undef, i32 1, <32 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.store.v16i16.p0v16i16(<16 x i16> undef, <16 x i16>* undef, i32 1, <16 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> undef, <8 x i16>* undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> undef, <4 x i16>* undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: call void @llvm.masked.store.v64i8.p0v64i8(<64 x i8> undef, <64 x i8>* undef, i32 1, <64 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: call void @llvm.masked.store.v32i8.p0v32i8(<32 x i8> undef, <32 x i8>* undef, i32 1, <32 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> undef, <16 x i8>* undef, i32 1, <16 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> undef, <8 x i8>* undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; AVX-LABEL: 'masked_store'
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> undef, <8 x double>* undef, i32 1, <8 x i1> undef)
@@ -195,12 +249,12 @@ define i32 @masked_store() {
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> undef, <8 x i32>* undef, i32 1, <8 x i1> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> undef, <4 x i32>* undef, i32 1, <4 x i1> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32> undef, <2 x i32>* undef, i32 1, <2 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: call void @llvm.masked.store.v32i16.p0v32i16(<32 x i16> undef, <32 x i16>* undef, i32 1, <32 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.store.v16i16.p0v16i16(<16 x i16> undef, <16 x i16>* undef, i32 1, <16 x i1> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.store.v32i16.p0v32i16(<32 x i16> undef, <32 x i16>* undef, i32 1, <32 x i1> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: call void @llvm.masked.store.v16i16.p0v16i16(<16 x i16> undef, <16 x i16>* undef, i32 1, <16 x i1> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> undef, <8 x i16>* undef, i32 1, <8 x i1> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> undef, <4 x i16>* undef, i32 1, <4 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: call void @llvm.masked.store.v64i8.p0v64i8(<64 x i8> undef, <64 x i8>* undef, i32 1, <64 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: call void @llvm.masked.store.v32i8.p0v32i8(<32 x i8> undef, <32 x i8>* undef, i32 1, <32 x i1> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: call void @llvm.masked.store.v64i8.p0v64i8(<64 x i8> undef, <64 x i8>* undef, i32 1, <64 x i1> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.store.v32i8.p0v32i8(<32 x i8> undef, <32 x i8>* undef, i32 1, <32 x i1> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> undef, <16 x i8>* undef, i32 1, <16 x i1> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> undef, <8 x i8>* undef, i32 1, <8 x i1> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
@@ -222,12 +276,12 @@ define i32 @masked_store() {
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> undef, <8 x i32>* undef, i32 1, <8 x i1> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> undef, <4 x i32>* undef, i32 1, <4 x i1> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32> undef, <2 x i32>* undef, i32 1, <2 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: call void @llvm.masked.store.v32i16.p0v32i16(<32 x i16> undef, <32 x i16>* undef, i32 1, <32 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.store.v16i16.p0v16i16(<16 x i16> undef, <16 x i16>* undef, i32 1, <16 x i1> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.store.v32i16.p0v32i16(<32 x i16> undef, <32 x i16>* undef, i32 1, <32 x i1> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: call void @llvm.masked.store.v16i16.p0v16i16(<16 x i16> undef, <16 x i16>* undef, i32 1, <16 x i1> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> undef, <8 x i16>* undef, i32 1, <8 x i1> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> undef, <4 x i16>* undef, i32 1, <4 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: call void @llvm.masked.store.v64i8.p0v64i8(<64 x i8> undef, <64 x i8>* undef, i32 1, <64 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: call void @llvm.masked.store.v32i8.p0v32i8(<32 x i8> undef, <32 x i8>* undef, i32 1, <32 x i1> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: call void @llvm.masked.store.v64i8.p0v64i8(<64 x i8> undef, <64 x i8>* undef, i32 1, <64 x i1> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.store.v32i8.p0v32i8(<32 x i8> undef, <32 x i8>* undef, i32 1, <32 x i1> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> undef, <16 x i8>* undef, i32 1, <16 x i1> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> undef, <8 x i8>* undef, i32 1, <8 x i1> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
@@ -293,83 +347,110 @@ define i32 @masked_store() {
 }
 
 define i32 @masked_gather() {
-; SSE-LABEL: 'masked_gather'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> undef, i32 1, <8 x i1> undef, <8 x double> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*> undef, i32 1, <4 x i1> undef, <4 x double> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> undef, i32 1, <2 x i1> undef, <2 x double> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0f64(<1 x double*> undef, i32 1, <1 x i1> undef, <1 x double> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> undef, i32 1, <16 x i1> undef, <16 x float> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> undef, i32 1, <8 x i1> undef, <8 x float> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> undef, i32 1, <4 x i1> undef, <4 x float> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> undef, i32 1, <2 x i1> undef, <2 x float> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0i64(<8 x i64*> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0i64(<2 x i64*> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0i64(<1 x i64*> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0i16(<32 x i16*> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0i16(<16 x i16*> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0i8(<64 x i8*> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
+; SSE2-LABEL: 'masked_gather'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*> undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0f64(<1 x double*> undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0i64(<8 x i64*> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0i64(<2 x i64*> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0i64(<1 x i64*> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0i16(<32 x i16*> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0i16(<16 x i16*> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 960 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0i8(<64 x i8*> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 480 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 240 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 120 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
+;
+; SSE42-LABEL: 'masked_gather'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*> undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0f64(<1 x double*> undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0i64(<8 x i64*> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0i64(<2 x i64*> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0i64(<1 x i64*> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0i16(<32 x i16*> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0i16(<16 x i16*> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0i8(<64 x i8*> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; AVX1-LABEL: 'masked_gather'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> undef, i32 1, <8 x i1> undef, <8 x double> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*> undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*> undef, i32 1, <4 x i1> undef, <4 x double> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> undef, i32 1, <2 x i1> undef, <2 x double> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0f64(<1 x double*> undef, i32 1, <1 x i1> undef, <1 x double> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> undef, i32 1, <16 x i1> undef, <16 x float> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> undef, i32 1, <8 x i1> undef, <8 x float> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> undef, i32 1, <4 x i1> undef, <4 x float> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> undef, i32 1, <2 x i1> undef, <2 x float> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0i64(<8 x i64*> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0i64(<8 x i64*> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0i64(<2 x i64*> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0i64(<1 x i64*> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0i16(<32 x i16*> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0i16(<16 x i16*> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0i16(<32 x i16*> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0i16(<16 x i16*> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0i8(<64 x i8*> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0i8(<64 x i8*> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; AVX2-LABEL: 'masked_gather'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> undef, i32 1, <8 x i1> undef, <8 x double> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*> undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*> undef, i32 1, <4 x i1> undef, <4 x double> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> undef, i32 1, <2 x i1> undef, <2 x double> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0f64(<1 x double*> undef, i32 1, <1 x i1> undef, <1 x double> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> undef, i32 1, <16 x i1> undef, <16 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> undef, i32 1, <8 x i1> undef, <8 x float> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> undef, i32 1, <4 x i1> undef, <4 x float> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> undef, i32 1, <2 x i1> undef, <2 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0i64(<8 x i64*> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0i64(<8 x i64*> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0i64(<2 x i64*> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0i64(<1 x i64*> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0i16(<32 x i16*> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0i16(<16 x i16*> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0i16(<32 x i16*> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0i16(<16 x i16*> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0i8(<64 x i8*> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0i8(<64 x i8*> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
@@ -391,19 +472,19 @@ define i32 @masked_gather() {
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0i16(<32 x i16*> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0i16(<16 x i16*> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0i16(<32 x i16*> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0i16(<16 x i16*> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0i8(<64 x i8*> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0i8(<64 x i8*> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; KNL-LABEL: 'masked_gather'
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> undef, i32 1, <8 x i1> undef, <8 x double> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*> undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*> undef, i32 1, <4 x i1> undef, <4 x double> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> undef, i32 1, <2 x i1> undef, <2 x double> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0f64(<1 x double*> undef, i32 1, <1 x i1> undef, <1 x double> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> undef, i32 1, <16 x i1> undef, <16 x float> undef)
@@ -411,19 +492,19 @@ define i32 @masked_gather() {
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> undef, i32 1, <4 x i1> undef, <4 x float> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> undef, i32 1, <2 x i1> undef, <2 x float> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0i64(<8 x i64*> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0i64(<2 x i64*> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0i64(<1 x i64*> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0i16(<32 x i16*> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0i16(<16 x i16*> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0i16(<32 x i16*> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0i16(<16 x i16*> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0i8(<64 x i8*> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0i8(<64 x i8*> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
@@ -445,12 +526,12 @@ define i32 @masked_gather() {
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0i16(<32 x i16*> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0i16(<16 x i16*> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0i16(<32 x i16*> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0i16(<16 x i16*> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0i8(<64 x i8*> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0i8(<64 x i8*> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
@@ -489,63 +570,90 @@ define i32 @masked_gather() {
 }
 
 define i32 @masked_scatter() {
-; SSE-LABEL: 'masked_scatter'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v8f64.v8p0f64(<8 x double> undef, <8 x double*> undef, i32 1, <8 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v4f64.v4p0f64(<4 x double> undef, <4 x double*> undef, i32 1, <4 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v2f64.v2p0f64(<2 x double> undef, <2 x double*> undef, i32 1, <2 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v1f64.v1p0f64(<1 x double> undef, <1 x double*> undef, i32 1, <1 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> undef, <16 x float*> undef, i32 1, <16 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.scatter.v8f32.v8p0f32(<8 x float> undef, <8 x float*> undef, i32 1, <8 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> undef, <4 x float*> undef, i32 1, <4 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> undef, <2 x float*> undef, i32 1, <2 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v8i64.v8p0i64(<8 x i64> undef, <8 x i64*> undef, i32 1, <8 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4i64.v4p0i64(<4 x i64> undef, <4 x i64*> undef, i32 1, <4 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v2i64.v2p0i64(<2 x i64> undef, <2 x i64*> undef, i32 1, <2 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v1i64.v1p0i64(<1 x i64> undef, <1 x i64*> undef, i32 1, <1 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> undef, <16 x i32*> undef, i32 1, <16 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> undef, <8 x i32*> undef, i32 1, <8 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> undef, <4 x i32*> undef, i32 1, <4 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> undef, <2 x i32*> undef, i32 1, <2 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.v32i16.v32p0i16(<32 x i16> undef, <32 x i16*> undef, i32 1, <32 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.v16i16.v16p0i16(<16 x i16> undef, <16 x i16*> undef, i32 1, <16 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> undef, <8 x i16*> undef, i32 1, <8 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> undef, <4 x i16*> undef, i32 1, <4 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: call void @llvm.masked.scatter.v64i8.v64p0i8(<64 x i8> undef, <64 x i8*> undef, i32 1, <64 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.v32i8.v32p0i8(<32 x i8> undef, <32 x i8*> undef, i32 1, <32 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> undef, <16 x i8*> undef, i32 1, <16 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v8i8.v8p0i8(<8 x i8> undef, <8 x i8*> undef, i32 1, <8 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
+; SSE2-LABEL: 'masked_scatter'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v8f64.v8p0f64(<8 x double> undef, <8 x double*> undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v4f64.v4p0f64(<4 x double> undef, <4 x double*> undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v2f64.v2p0f64(<2 x double> undef, <2 x double*> undef, i32 1, <2 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v1f64.v1p0f64(<1 x double> undef, <1 x double*> undef, i32 1, <1 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> undef, <16 x float*> undef, i32 1, <16 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.scatter.v8f32.v8p0f32(<8 x float> undef, <8 x float*> undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> undef, <4 x float*> undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> undef, <2 x float*> undef, i32 1, <2 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v8i64.v8p0i64(<8 x i64> undef, <8 x i64*> undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v4i64.v4p0i64(<4 x i64> undef, <4 x i64*> undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2i64.v2p0i64(<2 x i64> undef, <2 x i64*> undef, i32 1, <2 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v1i64.v1p0i64(<1 x i64> undef, <1 x i64*> undef, i32 1, <1 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> undef, <16 x i32*> undef, i32 1, <16 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> undef, <8 x i32*> undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> undef, <4 x i32*> undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> undef, <2 x i32*> undef, i32 1, <2 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.v32i16.v32p0i16(<32 x i16> undef, <32 x i16*> undef, i32 1, <32 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.v16i16.v16p0i16(<16 x i16> undef, <16 x i16*> undef, i32 1, <16 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> undef, <8 x i16*> undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> undef, <4 x i16*> undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 188 for instruction: call void @llvm.masked.scatter.v64i8.v64p0i8(<64 x i8> undef, <64 x i8*> undef, i32 1, <64 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 94 for instruction: call void @llvm.masked.scatter.v32i8.v32p0i8(<32 x i8> undef, <32 x i8*> undef, i32 1, <32 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> undef, <16 x i8*> undef, i32 1, <16 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.scatter.v8i8.v8p0i8(<8 x i8> undef, <8 x i8*> undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
+;
+; SSE42-LABEL: 'masked_scatter'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v8f64.v8p0f64(<8 x double> undef, <8 x double*> undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v4f64.v4p0f64(<4 x double> undef, <4 x double*> undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v2f64.v2p0f64(<2 x double> undef, <2 x double*> undef, i32 1, <2 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v1f64.v1p0f64(<1 x double> undef, <1 x double*> undef, i32 1, <1 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> undef, <16 x float*> undef, i32 1, <16 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.scatter.v8f32.v8p0f32(<8 x float> undef, <8 x float*> undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> undef, <4 x float*> undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> undef, <2 x float*> undef, i32 1, <2 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v8i64.v8p0i64(<8 x i64> undef, <8 x i64*> undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4i64.v4p0i64(<4 x i64> undef, <4 x i64*> undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v2i64.v2p0i64(<2 x i64> undef, <2 x i64*> undef, i32 1, <2 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v1i64.v1p0i64(<1 x i64> undef, <1 x i64*> undef, i32 1, <1 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> undef, <16 x i32*> undef, i32 1, <16 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> undef, <8 x i32*> undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> undef, <4 x i32*> undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> undef, <2 x i32*> undef, i32 1, <2 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.v32i16.v32p0i16(<32 x i16> undef, <32 x i16*> undef, i32 1, <32 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.v16i16.v16p0i16(<16 x i16> undef, <16 x i16*> undef, i32 1, <16 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> undef, <8 x i16*> undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> undef, <4 x i16*> undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: call void @llvm.masked.scatter.v64i8.v64p0i8(<64 x i8> undef, <64 x i8*> undef, i32 1, <64 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.v32i8.v32p0i8(<32 x i8> undef, <32 x i8*> undef, i32 1, <32 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> undef, <16 x i8*> undef, i32 1, <16 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v8i8.v8p0i8(<8 x i8> undef, <8 x i8*> undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; AVX-LABEL: 'masked_scatter'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.scatter.v8f64.v8p0f64(<8 x double> undef, <8 x double*> undef, i32 1, <8 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v4f64.v4p0f64(<4 x double> undef, <4 x double*> undef, i32 1, <4 x i1> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v8f64.v8p0f64(<8 x double> undef, <8 x double*> undef, i32 1, <8 x i1> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4f64.v4p0f64(<4 x double> undef, <4 x double*> undef, i32 1, <4 x i1> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v2f64.v2p0f64(<2 x double> undef, <2 x double*> undef, i32 1, <2 x i1> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v1f64.v1p0f64(<1 x double> undef, <1 x double*> undef, i32 1, <1 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> undef, <16 x float*> undef, i32 1, <16 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.scatter.v8f32.v8p0f32(<8 x float> undef, <8 x float*> undef, i32 1, <8 x i1> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> undef, <16 x float*> undef, i32 1, <16 x i1> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.scatter.v8f32.v8p0f32(<8 x float> undef, <8 x float*> undef, i32 1, <8 x i1> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> undef, <4 x float*> undef, i32 1, <4 x i1> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> undef, <2 x float*> undef, i32 1, <2 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v8i64.v8p0i64(<8 x i64> undef, <8 x i64*> undef, i32 1, <8 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4i64.v4p0i64(<4 x i64> undef, <4 x i64*> undef, i32 1, <4 x i1> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v8i64.v8p0i64(<8 x i64> undef, <8 x i64*> undef, i32 1, <8 x i1> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v4i64.v4p0i64(<4 x i64> undef, <4 x i64*> undef, i32 1, <4 x i1> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v2i64.v2p0i64(<2 x i64> undef, <2 x i64*> undef, i32 1, <2 x i1> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v1i64.v1p0i64(<1 x i64> undef, <1 x i64*> undef, i32 1, <1 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> undef, <16 x i32*> undef, i32 1, <16 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> undef, <8 x i32*> undef, i32 1, <8 x i1> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> undef, <16 x i32*> undef, i32 1, <16 x i1> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> undef, <8 x i32*> undef, i32 1, <8 x i1> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> undef, <4 x i32*> undef, i32 1, <4 x i1> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> undef, <2 x i32*> undef, i32 1, <2 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.v32i16.v32p0i16(<32 x i16> undef, <32 x i16*> undef, i32 1, <32 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.v16i16.v16p0i16(<16 x i16> undef, <16 x i16*> undef, i32 1, <16 x i1> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.scatter.v32i16.v32p0i16(<32 x i16> undef, <32 x i16*> undef, i32 1, <32 x i1> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.v16i16.v16p0i16(<16 x i16> undef, <16 x i16*> undef, i32 1, <16 x i1> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> undef, <8 x i16*> undef, i32 1, <8 x i1> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> undef, <4 x i16*> undef, i32 1, <4 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: call void @llvm.masked.scatter.v64i8.v64p0i8(<64 x i8> undef, <64 x i8*> undef, i32 1, <64 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.v32i8.v32p0i8(<32 x i8> undef, <32 x i8*> undef, i32 1, <32 x i1> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.scatter.v64i8.v64p0i8(<64 x i8> undef, <64 x i8*> undef, i32 1, <64 x i1> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.scatter.v32i8.v32p0i8(<32 x i8> undef, <32 x i8*> undef, i32 1, <32 x i1> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> undef, <16 x i8*> undef, i32 1, <16 x i1> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v8i8.v8p0i8(<8 x i8> undef, <8 x i8*> undef, i32 1, <8 x i1> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; KNL-LABEL: 'masked_scatter'
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8f64.v8p0f64(<8 x double> undef, <8 x double*> undef, i32 1, <8 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v4f64.v4p0f64(<4 x double> undef, <4 x double*> undef, i32 1, <4 x i1> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4f64.v4p0f64(<4 x double> undef, <4 x double*> undef, i32 1, <4 x i1> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v2f64.v2p0f64(<2 x double> undef, <2 x double*> undef, i32 1, <2 x i1> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v1f64.v1p0f64(<1 x double> undef, <1 x double*> undef, i32 1, <1 x i1> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> undef, <16 x float*> undef, i32 1, <16 x i1> undef)
@@ -553,19 +661,19 @@ define i32 @masked_scatter() {
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> undef, <4 x float*> undef, i32 1, <4 x i1> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> undef, <2 x float*> undef, i32 1, <2 x i1> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8i64.v8p0i64(<8 x i64> undef, <8 x i64*> undef, i32 1, <8 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4i64.v4p0i64(<4 x i64> undef, <4 x i64*> undef, i32 1, <4 x i1> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v4i64.v4p0i64(<4 x i64> undef, <4 x i64*> undef, i32 1, <4 x i1> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v2i64.v2p0i64(<2 x i64> undef, <2 x i64*> undef, i32 1, <2 x i1> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v1i64.v1p0i64(<1 x i64> undef, <1 x i64*> undef, i32 1, <1 x i1> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> undef, <16 x i32*> undef, i32 1, <16 x i1> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> undef, <8 x i32*> undef, i32 1, <8 x i1> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> undef, <4 x i32*> undef, i32 1, <4 x i1> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> undef, <2 x i32*> undef, i32 1, <2 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.v32i16.v32p0i16(<32 x i16> undef, <32 x i16*> undef, i32 1, <32 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.v16i16.v16p0i16(<16 x i16> undef, <16 x i16*> undef, i32 1, <16 x i1> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.scatter.v32i16.v32p0i16(<32 x i16> undef, <32 x i16*> undef, i32 1, <32 x i1> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.v16i16.v16p0i16(<16 x i16> undef, <16 x i16*> undef, i32 1, <16 x i1> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> undef, <8 x i16*> undef, i32 1, <8 x i1> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> undef, <4 x i16*> undef, i32 1, <4 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: call void @llvm.masked.scatter.v64i8.v64p0i8(<64 x i8> undef, <64 x i8*> undef, i32 1, <64 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.v32i8.v32p0i8(<32 x i8> undef, <32 x i8*> undef, i32 1, <32 x i1> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.scatter.v64i8.v64p0i8(<64 x i8> undef, <64 x i8*> undef, i32 1, <64 x i1> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.scatter.v32i8.v32p0i8(<32 x i8> undef, <32 x i8*> undef, i32 1, <32 x i1> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> undef, <16 x i8*> undef, i32 1, <16 x i1> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v8i8.v8p0i8(<8 x i8> undef, <8 x i8*> undef, i32 1, <8 x i1> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
@@ -587,12 +695,12 @@ define i32 @masked_scatter() {
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> undef, <8 x i32*> undef, i32 1, <8 x i1> undef)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> undef, <4 x i32*> undef, i32 1, <4 x i1> undef)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> undef, <2 x i32*> undef, i32 1, <2 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.v32i16.v32p0i16(<32 x i16> undef, <32 x i16*> undef, i32 1, <32 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.v16i16.v16p0i16(<16 x i16> undef, <16 x i16*> undef, i32 1, <16 x i1> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: call void @llvm.masked.scatter.v32i16.v32p0i16(<32 x i16> undef, <32 x i16*> undef, i32 1, <32 x i1> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.v16i16.v16p0i16(<16 x i16> undef, <16 x i16*> undef, i32 1, <16 x i1> undef)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> undef, <8 x i16*> undef, i32 1, <8 x i1> undef)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> undef, <4 x i16*> undef, i32 1, <4 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: call void @llvm.masked.scatter.v64i8.v64p0i8(<64 x i8> undef, <64 x i8*> undef, i32 1, <64 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.v32i8.v32p0i8(<32 x i8> undef, <32 x i8*> undef, i32 1, <32 x i1> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 176 for instruction: call void @llvm.masked.scatter.v64i8.v64p0i8(<64 x i8> undef, <64 x i8*> undef, i32 1, <64 x i1> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.scatter.v32i8.v32p0i8(<32 x i8> undef, <32 x i8*> undef, i32 1, <32 x i1> undef)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> undef, <16 x i8*> undef, i32 1, <16 x i1> undef)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v8i8.v8p0i8(<8 x i8> undef, <8 x i8*> undef, i32 1, <8 x i1> undef)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
@@ -631,86 +739,140 @@ define i32 @masked_scatter() {
 }
 
 define i32 @masked_expandload() {
-; SSE-LABEL: 'masked_expandload'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(double* undef, <8 x i1> undef, <8 x double> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(double* undef, <4 x i1> undef, <4 x double> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(double* undef, <2 x i1> undef, <2 x double> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(double* undef, <1 x i1> undef, <1 x double> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(float* undef, <16 x i1> undef, <16 x float> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(float* undef, <8 x i1> undef, <8 x float> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(float* undef, <4 x i1> undef, <4 x float> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(float* undef, <2 x i1> undef, <2 x float> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(i64* undef, <8 x i1> undef, <8 x i64> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(i64* undef, <4 x i1> undef, <4 x i64> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(i64* undef, <2 x i1> undef, <2 x i64> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(i64* undef, <1 x i1> undef, <1 x i64> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(i32* undef, <16 x i1> undef, <16 x i32> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(i32* undef, <8 x i1> undef, <8 x i32> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(i32* undef, <4 x i1> undef, <4 x i32> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(i32* undef, <2 x i1> undef, <2 x i32> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(i16* undef, <32 x i1> undef, <32 x i16> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(i16* undef, <16 x i1> undef, <16 x i16> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(i16* undef, <8 x i1> undef, <8 x i16> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(i16* undef, <4 x i1> undef, <4 x i16> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(i8* undef, <64 x i1> undef, <64 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(i8* undef, <32 x i1> undef, <32 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(i8* undef, <16 x i1> undef, <16 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(i8* undef, <8 x i1> undef, <8 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
+; SSE2-LABEL: 'masked_expandload'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(double* undef, <8 x i1> undef, <8 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(double* undef, <4 x i1> undef, <4 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(double* undef, <2 x i1> undef, <2 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(double* undef, <1 x i1> undef, <1 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(float* undef, <16 x i1> undef, <16 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(float* undef, <8 x i1> undef, <8 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(float* undef, <4 x i1> undef, <4 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(float* undef, <2 x i1> undef, <2 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(i64* undef, <8 x i1> undef, <8 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(i64* undef, <4 x i1> undef, <4 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(i64* undef, <2 x i1> undef, <2 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(i64* undef, <1 x i1> undef, <1 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(i32* undef, <16 x i1> undef, <16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(i32* undef, <8 x i1> undef, <8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(i32* undef, <4 x i1> undef, <4 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(i32* undef, <2 x i1> undef, <2 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(i16* undef, <32 x i1> undef, <32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(i16* undef, <16 x i1> undef, <16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(i16* undef, <8 x i1> undef, <8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(i16* undef, <4 x i1> undef, <4 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 960 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(i8* undef, <64 x i1> undef, <64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 480 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(i8* undef, <32 x i1> undef, <32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 240 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(i8* undef, <16 x i1> undef, <16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 120 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(i8* undef, <8 x i1> undef, <8 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
+;
+; SSE42-LABEL: 'masked_expandload'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(double* undef, <8 x i1> undef, <8 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(double* undef, <4 x i1> undef, <4 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(double* undef, <2 x i1> undef, <2 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(double* undef, <1 x i1> undef, <1 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(float* undef, <16 x i1> undef, <16 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(float* undef, <8 x i1> undef, <8 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(float* undef, <4 x i1> undef, <4 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(float* undef, <2 x i1> undef, <2 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(i64* undef, <8 x i1> undef, <8 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(i64* undef, <4 x i1> undef, <4 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(i64* undef, <2 x i1> undef, <2 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(i64* undef, <1 x i1> undef, <1 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(i32* undef, <16 x i1> undef, <16 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(i32* undef, <8 x i1> undef, <8 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(i32* undef, <4 x i1> undef, <4 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(i32* undef, <2 x i1> undef, <2 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(i16* undef, <32 x i1> undef, <32 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(i16* undef, <16 x i1> undef, <16 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(i16* undef, <8 x i1> undef, <8 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(i16* undef, <4 x i1> undef, <4 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(i8* undef, <64 x i1> undef, <64 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(i8* undef, <32 x i1> undef, <32 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(i8* undef, <16 x i1> undef, <16 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(i8* undef, <8 x i1> undef, <8 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; AVX-LABEL: 'masked_expandload'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(double* undef, <8 x i1> undef, <8 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(double* undef, <4 x i1> undef, <4 x double> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(double* undef, <8 x i1> undef, <8 x double> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(double* undef, <4 x i1> undef, <4 x double> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(double* undef, <2 x i1> undef, <2 x double> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(double* undef, <1 x i1> undef, <1 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(float* undef, <16 x i1> undef, <16 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(float* undef, <8 x i1> undef, <8 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(float* undef, <16 x i1> undef, <16 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(float* undef, <8 x i1> undef, <8 x float> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(float* undef, <4 x i1> undef, <4 x float> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(float* undef, <2 x i1> undef, <2 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(i64* undef, <8 x i1> undef, <8 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(i64* undef, <4 x i1> undef, <4 x i64> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(i64* undef, <8 x i1> undef, <8 x i64> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(i64* undef, <4 x i1> undef, <4 x i64> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(i64* undef, <2 x i1> undef, <2 x i64> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(i64* undef, <1 x i1> undef, <1 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(i32* undef, <16 x i1> undef, <16 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(i32* undef, <8 x i1> undef, <8 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(i32* undef, <16 x i1> undef, <16 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(i32* undef, <8 x i1> undef, <8 x i32> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(i32* undef, <4 x i1> undef, <4 x i32> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(i32* undef, <2 x i1> undef, <2 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(i16* undef, <32 x i1> undef, <32 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(i16* undef, <16 x i1> undef, <16 x i16> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(i16* undef, <32 x i1> undef, <32 x i16> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(i16* undef, <16 x i1> undef, <16 x i16> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(i16* undef, <8 x i1> undef, <8 x i16> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(i16* undef, <4 x i1> undef, <4 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(i8* undef, <64 x i1> undef, <64 x i8> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(i8* undef, <32 x i1> undef, <32 x i8> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(i8* undef, <64 x i1> undef, <64 x i8> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(i8* undef, <32 x i1> undef, <32 x i8> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(i8* undef, <16 x i1> undef, <16 x i8> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(i8* undef, <8 x i1> undef, <8 x i8> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
-; AVX512-LABEL: 'masked_expandload'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(double* undef, <8 x i1> undef, <8 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(double* undef, <4 x i1> undef, <4 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(double* undef, <2 x i1> undef, <2 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(double* undef, <1 x i1> undef, <1 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(float* undef, <16 x i1> undef, <16 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(float* undef, <8 x i1> undef, <8 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(float* undef, <4 x i1> undef, <4 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(float* undef, <2 x i1> undef, <2 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(i64* undef, <8 x i1> undef, <8 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(i64* undef, <4 x i1> undef, <4 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(i64* undef, <2 x i1> undef, <2 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(i64* undef, <1 x i1> undef, <1 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(i32* undef, <16 x i1> undef, <16 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(i32* undef, <8 x i1> undef, <8 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(i32* undef, <4 x i1> undef, <4 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(i32* undef, <2 x i1> undef, <2 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(i16* undef, <32 x i1> undef, <32 x i16> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(i16* undef, <16 x i1> undef, <16 x i16> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(i16* undef, <8 x i1> undef, <8 x i16> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(i16* undef, <4 x i1> undef, <4 x i16> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(i8* undef, <64 x i1> undef, <64 x i8> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(i8* undef, <32 x i1> undef, <32 x i8> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(i8* undef, <16 x i1> undef, <16 x i8> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(i8* undef, <8 x i1> undef, <8 x i8> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
+; KNL-LABEL: 'masked_expandload'
+; KNL-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(double* undef, <8 x i1> undef, <8 x double> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(double* undef, <4 x i1> undef, <4 x double> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(double* undef, <2 x i1> undef, <2 x double> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(double* undef, <1 x i1> undef, <1 x double> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(float* undef, <16 x i1> undef, <16 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(float* undef, <8 x i1> undef, <8 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(float* undef, <4 x i1> undef, <4 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(float* undef, <2 x i1> undef, <2 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(i64* undef, <8 x i1> undef, <8 x i64> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(i64* undef, <4 x i1> undef, <4 x i64> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(i64* undef, <2 x i1> undef, <2 x i64> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(i64* undef, <1 x i1> undef, <1 x i64> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(i32* undef, <16 x i1> undef, <16 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(i32* undef, <8 x i1> undef, <8 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(i32* undef, <4 x i1> undef, <4 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(i32* undef, <2 x i1> undef, <2 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(i16* undef, <32 x i1> undef, <32 x i16> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(i16* undef, <16 x i1> undef, <16 x i16> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(i16* undef, <8 x i1> undef, <8 x i16> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(i16* undef, <4 x i1> undef, <4 x i16> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(i8* undef, <64 x i1> undef, <64 x i8> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(i8* undef, <32 x i1> undef, <32 x i8> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(i8* undef, <16 x i1> undef, <16 x i8> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(i8* undef, <8 x i1> undef, <8 x i8> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
+;
+; SKX-LABEL: 'masked_expandload'
+; SKX-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(double* undef, <8 x i1> undef, <8 x double> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(double* undef, <4 x i1> undef, <4 x double> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(double* undef, <2 x i1> undef, <2 x double> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(double* undef, <1 x i1> undef, <1 x double> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(float* undef, <16 x i1> undef, <16 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(float* undef, <8 x i1> undef, <8 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(float* undef, <4 x i1> undef, <4 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(float* undef, <2 x i1> undef, <2 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(i64* undef, <8 x i1> undef, <8 x i64> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(i64* undef, <4 x i1> undef, <4 x i64> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(i64* undef, <2 x i1> undef, <2 x i64> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(i64* undef, <1 x i1> undef, <1 x i64> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(i32* undef, <16 x i1> undef, <16 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(i32* undef, <8 x i1> undef, <8 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(i32* undef, <4 x i1> undef, <4 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(i32* undef, <2 x i1> undef, <2 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(i16* undef, <32 x i1> undef, <32 x i16> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(i16* undef, <16 x i1> undef, <16 x i16> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(i16* undef, <8 x i1> undef, <8 x i16> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(i16* undef, <4 x i1> undef, <4 x i16> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(i8* undef, <64 x i1> undef, <64 x i8> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(i8* undef, <32 x i1> undef, <32 x i8> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(i8* undef, <16 x i1> undef, <16 x i8> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(i8* undef, <8 x i1> undef, <8 x i8> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
   %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(double* undef, <8 x i1> undef, <8 x double> undef)
   %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(double* undef, <4 x i1> undef, <4 x double> undef)
@@ -746,86 +908,140 @@ define i32 @masked_expandload() {
 }
 
 define i32 @masked_compressstore() {
-; SSE-LABEL: 'masked_compressstore'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, double* undef, <8 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, double* undef, <4 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, double* undef, <2 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, double* undef, <1 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, float* undef, <16 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, float* undef, <8 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, float* undef, <4 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, float* undef, <2 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, i64* undef, <8 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, i64* undef, <4 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, i64* undef, <2 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, i64* undef, <1 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, i32* undef, <16 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, i32* undef, <8 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, i32* undef, <4 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, i32* undef, <2 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, i16* undef, <32 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, i16* undef, <16 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, i16* undef, <8 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, i16* undef, <4 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 191 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, i8* undef, <64 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, i8* undef, <32 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, i8* undef, <16 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, i8* undef, <8 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
+; SSE2-LABEL: 'masked_compressstore'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, double* undef, <8 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, double* undef, <4 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, double* undef, <2 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, double* undef, <1 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, float* undef, <16 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, float* undef, <8 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, float* undef, <4 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, float* undef, <2 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, i64* undef, <8 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, i64* undef, <4 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, i64* undef, <2 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, i64* undef, <1 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, i32* undef, <16 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, i32* undef, <8 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, i32* undef, <4 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, i32* undef, <2 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 125 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, i16* undef, <32 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, i16* undef, <16 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, i16* undef, <8 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, i16* undef, <4 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 311 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, i8* undef, <64 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 155 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, i8* undef, <32 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, i8* undef, <16 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, i8* undef, <8 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
+;
+; SSE42-LABEL: 'masked_compressstore'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, double* undef, <8 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, double* undef, <4 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, double* undef, <2 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, double* undef, <1 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, float* undef, <16 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, float* undef, <8 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, float* undef, <4 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, float* undef, <2 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, i64* undef, <8 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, i64* undef, <4 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, i64* undef, <2 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, i64* undef, <1 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, i32* undef, <16 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, i32* undef, <8 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, i32* undef, <4 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, i32* undef, <2 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, i16* undef, <32 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, i16* undef, <16 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, i16* undef, <8 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, i16* undef, <4 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 191 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, i8* undef, <64 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, i8* undef, <32 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, i8* undef, <16 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, i8* undef, <8 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; AVX-LABEL: 'masked_compressstore'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, double* undef, <8 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, double* undef, <4 x i1> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, double* undef, <8 x i1> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, double* undef, <4 x i1> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, double* undef, <2 x i1> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, double* undef, <1 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, float* undef, <16 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, float* undef, <8 x i1> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, float* undef, <16 x i1> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, float* undef, <8 x i1> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, float* undef, <4 x i1> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, float* undef, <2 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, i64* undef, <8 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, i64* undef, <4 x i1> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, i64* undef, <8 x i1> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, i64* undef, <4 x i1> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, i64* undef, <2 x i1> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, i64* undef, <1 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, i32* undef, <16 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, i32* undef, <8 x i1> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, i32* undef, <16 x i1> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, i32* undef, <8 x i1> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, i32* undef, <4 x i1> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, i32* undef, <2 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, i16* undef, <32 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, i16* undef, <16 x i1> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 127 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, i16* undef, <32 x i1> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, i16* undef, <16 x i1> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, i16* undef, <8 x i1> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, i16* undef, <4 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 191 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, i8* undef, <64 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, i8* undef, <32 x i1> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 255 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, i8* undef, <64 x i1> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 127 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, i8* undef, <32 x i1> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, i8* undef, <16 x i1> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, i8* undef, <8 x i1> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
-; AVX512-LABEL: 'masked_compressstore'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, double* undef, <8 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, double* undef, <4 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, double* undef, <2 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, double* undef, <1 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, float* undef, <16 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, float* undef, <8 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, float* undef, <4 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, float* undef, <2 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, i64* undef, <8 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, i64* undef, <4 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, i64* undef, <2 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, i64* undef, <1 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, i32* undef, <16 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, i32* undef, <8 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, i32* undef, <4 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, i32* undef, <2 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, i16* undef, <32 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, i16* undef, <16 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, i16* undef, <8 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, i16* undef, <4 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 191 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, i8* undef, <64 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, i8* undef, <32 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, i8* undef, <16 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, i8* undef, <8 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
+; KNL-LABEL: 'masked_compressstore'
+; KNL-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, double* undef, <8 x i1> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, double* undef, <4 x i1> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, double* undef, <2 x i1> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, double* undef, <1 x i1> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, float* undef, <16 x i1> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, float* undef, <8 x i1> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, float* undef, <4 x i1> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, float* undef, <2 x i1> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, i64* undef, <8 x i1> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, i64* undef, <4 x i1> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, i64* undef, <2 x i1> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, i64* undef, <1 x i1> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 59 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, i32* undef, <16 x i1> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, i32* undef, <8 x i1> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, i32* undef, <4 x i1> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, i32* undef, <2 x i1> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 111 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, i16* undef, <32 x i1> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, i16* undef, <16 x i1> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, i16* undef, <8 x i1> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, i16* undef, <4 x i1> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 223 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, i8* undef, <64 x i1> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 111 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, i8* undef, <32 x i1> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, i8* undef, <16 x i1> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, i8* undef, <8 x i1> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
+;
+; SKX-LABEL: 'masked_compressstore'
+; SKX-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, double* undef, <8 x i1> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, double* undef, <4 x i1> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, double* undef, <2 x i1> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, double* undef, <1 x i1> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, float* undef, <16 x i1> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, float* undef, <8 x i1> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, float* undef, <4 x i1> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, float* undef, <2 x i1> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, i64* undef, <8 x i1> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, i64* undef, <4 x i1> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, i64* undef, <2 x i1> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, i64* undef, <1 x i1> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 59 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, i32* undef, <16 x i1> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, i32* undef, <8 x i1> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, i32* undef, <4 x i1> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, i32* undef, <2 x i1> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 119 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, i16* undef, <32 x i1> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, i16* undef, <16 x i1> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, i16* undef, <8 x i1> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, i16* undef, <4 x i1> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 239 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, i8* undef, <64 x i1> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 111 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, i8* undef, <32 x i1> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, i8* undef, <16 x i1> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, i8* undef, <8 x i1> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
   call void @llvm.masked.compressstore.v8f64(<8 x double> undef, double* undef, <8 x i1> undef)
   call void @llvm.masked.compressstore.v4f64(<4 x double> undef, double* undef, <4 x i1> undef)
@@ -863,7 +1079,7 @@ define i32 @masked_compressstore() {
 define <2 x double> @test1(<2 x i64> %trigger, <2 x double>* %addr, <2 x double> %dst) {
 ; SSE2-LABEL: 'test1'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %mask = icmp eq <2 x i64> %trigger, zeroinitializer
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %res = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %addr, i32 4, <2 x i1> %mask, <2 x double> %dst)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %res = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %addr, i32 4, <2 x i1> %mask, <2 x double> %dst)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x double> %res
 ;
 ; SSE42-LABEL: 'test1'
@@ -887,10 +1103,15 @@ define <2 x double> @test1(<2 x i64> %trigger, <2 x double>* %addr, <2 x double>
 }
 
 define <4 x i32> @test2(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %dst) {
-; SSE-LABEL: 'test2'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
-; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %res = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %res
+; SSE2-LABEL: 'test2'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %res = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %res
+;
+; SSE42-LABEL: 'test2'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %res = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %res
 ;
 ; AVX-LABEL: 'test2'
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
@@ -908,10 +1129,15 @@ define <4 x i32> @test2(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %dst) {
 }
 
 define void @test3(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %val) {
-; SSE-LABEL: 'test3'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
-; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %val, <4 x i32>* %addr, i32 4, <4 x i1> %mask)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; SSE2-LABEL: 'test3'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %val, <4 x i32>* %addr, i32 4, <4 x i1> %mask)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SSE42-LABEL: 'test3'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %val, <4 x i32>* %addr, i32 4, <4 x i1> %mask)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX-LABEL: 'test3'
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
@@ -929,10 +1155,15 @@ define void @test3(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %val) {
 }
 
 define <8 x float> @test4(<8 x i32> %trigger, <8 x float>* %addr, <8 x float> %dst) {
-; SSE-LABEL: 'test4'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer
-; SSE-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* %addr, i32 4, <8 x i1> %mask, <8 x float> %dst)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x float> %res
+; SSE2-LABEL: 'test4'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* %addr, i32 4, <8 x i1> %mask, <8 x float> %dst)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x float> %res
+;
+; SSE42-LABEL: 'test4'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* %addr, i32 4, <8 x i1> %mask, <8 x float> %dst)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x float> %res
 ;
 ; AVX1-LABEL: 'test4'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer
@@ -960,10 +1191,15 @@ define <8 x float> @test4(<8 x i32> %trigger, <8 x float>* %addr, <8 x float> %d
 }
 
 define void @test5(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %val) {
-; SSE-LABEL: 'test5'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
-; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> %val, <2 x float>* %addr, i32 4, <2 x i1> %mask)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; SSE2-LABEL: 'test5'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> %val, <2 x float>* %addr, i32 4, <2 x i1> %mask)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SSE42-LABEL: 'test5'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> %val, <2 x float>* %addr, i32 4, <2 x i1> %mask)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX-LABEL: 'test5'
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
@@ -981,10 +1217,15 @@ define void @test5(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %val) {
 }
 
 define void @test6(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %val) {
-; SSE-LABEL: 'test6'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
-; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32> %val, <2 x i32>* %addr, i32 4, <2 x i1> %mask)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; SSE2-LABEL: 'test6'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32> %val, <2 x i32>* %addr, i32 4, <2 x i1> %mask)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SSE42-LABEL: 'test6'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32> %val, <2 x i32>* %addr, i32 4, <2 x i1> %mask)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX-LABEL: 'test6'
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
@@ -1002,10 +1243,15 @@ define void @test6(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %val) {
 }
 
 define <2 x float> @test7(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %dst) {
-; SSE-LABEL: 'test7'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
-; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1> %mask, <2 x float> %dst)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x float> %res
+; SSE2-LABEL: 'test7'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1> %mask, <2 x float> %dst)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x float> %res
+;
+; SSE42-LABEL: 'test7'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1> %mask, <2 x float> %dst)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x float> %res
 ;
 ; AVX-LABEL: 'test7'
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
@@ -1023,10 +1269,15 @@ define <2 x float> @test7(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %d
 }
 
 define <2 x i32> @test8(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %dst) {
-; SSE-LABEL: 'test8'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
-; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %res
+; SSE2-LABEL: 'test8'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %res
+;
+; SSE42-LABEL: 'test8'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %res
 ;
 ; AVX-LABEL: 'test8'
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
@@ -1044,9 +1295,13 @@ define <2 x i32> @test8(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %dst) {
 }
 
 define <2 x double> @test_gather_2f64(<2 x double*> %ptrs, <2 x i1> %mask, <2 x double> %src0)  {
-; SSE-LABEL: 'test_gather_2f64'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x double> %res
+; SSE2-LABEL: 'test_gather_2f64'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x double> %res
+;
+; SSE42-LABEL: 'test_gather_2f64'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x double> %res
 ;
 ; AVX1-LABEL: 'test_gather_2f64'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
@@ -1069,9 +1324,13 @@ define <2 x double> @test_gather_2f64(<2 x double*> %ptrs, <2 x i1> %mask, <2 x
 }
 
 define <4 x i32> @test_gather_4i32(<4 x i32*> %ptrs, <4 x i1> %mask, <4 x i32> %src0)  {
-; SSE-LABEL: 'test_gather_4i32'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %res
+; SSE2-LABEL: 'test_gather_4i32'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %res
+;
+; SSE42-LABEL: 'test_gather_4i32'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %res
 ;
 ; AVX1-LABEL: 'test_gather_4i32'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
@@ -1098,9 +1357,13 @@ define <4 x i32> @test_gather_4i32(<4 x i32*> %ptrs, <4 x i1> %mask, <4 x i32> %
 }
 
 define <4 x i32> @test_gather_4i32_const_mask(<4 x i32*> %ptrs, <4 x i32> %src0)  {
-; SSE-LABEL: 'test_gather_4i32_const_mask'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %res
+; SSE2-LABEL: 'test_gather_4i32_const_mask'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %res
+;
+; SSE42-LABEL: 'test_gather_4i32_const_mask'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %res
 ;
 ; AVX1-LABEL: 'test_gather_4i32_const_mask'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
@@ -1130,7 +1393,7 @@ define <16 x float> @test_gather_16f32_const_mask(float* %base, <16 x i32> %ind)
 ; SSE2-LABEL: 'test_gather_16f32_const_mask'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, float* %base, <16 x i64> %sext_ind
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res
 ;
 ; SSE42-LABEL: 'test_gather_16f32_const_mask'
@@ -1142,13 +1405,13 @@ define <16 x float> @test_gather_16f32_const_mask(float* %base, <16 x i32> %ind)
 ; AVX1-LABEL: 'test_gather_16f32_const_mask'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, float* %base, <16 x i64> %sext_ind
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res
 ;
 ; AVX2-LABEL: 'test_gather_16f32_const_mask'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, float* %base, <16 x i64> %sext_ind
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res
 ;
 ; SKL-LABEL: 'test_gather_16f32_const_mask'
@@ -1174,7 +1437,7 @@ define <16 x float> @test_gather_16f32_var_mask(float* %base, <16 x i32> %ind, <
 ; SSE2-LABEL: 'test_gather_16f32_var_mask'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, float* %base, <16 x i64> %sext_ind
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 87 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res
 ;
 ; SSE42-LABEL: 'test_gather_16f32_var_mask'
@@ -1186,13 +1449,13 @@ define <16 x float> @test_gather_16f32_var_mask(float* %base, <16 x i32> %ind, <
 ; AVX1-LABEL: 'test_gather_16f32_var_mask'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, float* %base, <16 x i64> %sext_ind
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res
 ;
 ; AVX2-LABEL: 'test_gather_16f32_var_mask'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, float* %base, <16 x i64> %sext_ind
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res
 ;
 ; SKL-LABEL: 'test_gather_16f32_var_mask'
@@ -1218,7 +1481,7 @@ define <16 x float> @test_gather_16f32_ra_var_mask(<16 x float*> %ptrs, <16 x i3
 ; SSE2-LABEL: 'test_gather_16f32_ra_var_mask'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x float*> %ptrs, <16 x i64> %sext_ind
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 87 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res
 ;
 ; SSE42-LABEL: 'test_gather_16f32_ra_var_mask'
@@ -1230,13 +1493,13 @@ define <16 x float> @test_gather_16f32_ra_var_mask(<16 x float*> %ptrs, <16 x i3
 ; AVX1-LABEL: 'test_gather_16f32_ra_var_mask'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x float*> %ptrs, <16 x i64> %sext_ind
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res
 ;
 ; AVX2-LABEL: 'test_gather_16f32_ra_var_mask'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x float*> %ptrs, <16 x i64> %sext_ind
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res
 ;
 ; SKL-LABEL: 'test_gather_16f32_ra_var_mask'
@@ -1260,11 +1523,11 @@ define <16 x float> @test_gather_16f32_ra_var_mask(<16 x float*> %ptrs, <16 x i3
 
 define <16 x float> @test_gather_16f32_const_mask2(float* %base, <16 x i32> %ind) {
 ; SSE2-LABEL: 'test_gather_16f32_const_mask2'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res
 ;
 ; SSE42-LABEL: 'test_gather_16f32_const_mask2'
@@ -1280,7 +1543,7 @@ define <16 x float> @test_gather_16f32_const_mask2(float* %base, <16 x i32> %ind
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res
 ;
 ; AVX2-LABEL: 'test_gather_16f32_const_mask2'
@@ -1288,7 +1551,7 @@ define <16 x float> @test_gather_16f32_const_mask2(float* %base, <16 x i32> %ind
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res
 ;
 ; SKL-LABEL: 'test_gather_16f32_const_mask2'
@@ -1318,20 +1581,28 @@ define <16 x float> @test_gather_16f32_const_mask2(float* %base, <16 x i32> %ind
 }
 
 define void @test_scatter_16i32(i32* %base, <16 x i32> %ind, i16 %mask, <16 x i32>%val) {
-; SSE-LABEL: 'test_scatter_16i32'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i32> %ind
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> %val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; SSE2-LABEL: 'test_scatter_16i32'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i32> %ind
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 91 for instruction: call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> %val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SSE42-LABEL: 'test_scatter_16i32'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i32> %ind
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> %val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_scatter_16i32'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i32> %ind
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> %val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> %val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_scatter_16i32'
@@ -1339,7 +1610,7 @@ define void @test_scatter_16i32(i32* %base, <16 x i32> %ind, i16 %mask, <16 x i3
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i32> %ind
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> %val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> %val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SKL-LABEL: 'test_scatter_16i32'
@@ -1347,7 +1618,7 @@ define void @test_scatter_16i32(i32* %base, <16 x i32> %ind, i16 %mask, <16 x i3
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i32> %ind
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1>
-; SKL-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> %val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> %val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_scatter_16i32'
@@ -1368,12 +1639,16 @@ define void @test_scatter_16i32(i32* %base, <16 x i32> %ind, i16 %mask, <16 x i3
 }
 
 define void @test_scatter_8i32(<8 x i32>%a1, <8 x i32*> %ptr, <8 x i1>%mask) {
-; SSE-LABEL: 'test_scatter_8i32'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %a1, <8 x i32*> %ptr, i32 4, <8 x i1> %mask)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; SSE2-LABEL: 'test_scatter_8i32'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %a1, <8 x i32*> %ptr, i32 4, <8 x i1> %mask)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SSE42-LABEL: 'test_scatter_8i32'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %a1, <8 x i32*> %ptr, i32 4, <8 x i1> %mask)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX-LABEL: 'test_scatter_8i32'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %a1, <8 x i32*> %ptr, i32 4, <8 x i1> %mask)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %a1, <8 x i32*> %ptr, i32 4, <8 x i1> %mask)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_scatter_8i32'
@@ -1385,9 +1660,13 @@ define void @test_scatter_8i32(<8 x i32>%a1, <8 x i32*> %ptr, <8 x i1>%mask) {
 }
 
 define void @test_scatter_4i32(<4 x i32>%a1, <4 x i32*> %ptr, <4 x i1>%mask) {
-; SSE-LABEL: 'test_scatter_4i32'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %a1, <4 x i32*> %ptr, i32 4, <4 x i1> %mask)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; SSE2-LABEL: 'test_scatter_4i32'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %a1, <4 x i32*> %ptr, i32 4, <4 x i1> %mask)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SSE42-LABEL: 'test_scatter_4i32'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %a1, <4 x i32*> %ptr, i32 4, <4 x i1> %mask)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX-LABEL: 'test_scatter_4i32'
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %a1, <4 x i32*> %ptr, i32 4, <4 x i1> %mask)
@@ -1409,7 +1688,7 @@ define <4 x float> @test_gather_4f32(float* %ptr, <4 x i32> %ind, <4 x i1>%mask)
 ; SSE2-LABEL: 'test_gather_4f32'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, float* %ptr, <4 x i64> %sext_ind
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x float> %res
 ;
 ; SSE42-LABEL: 'test_gather_4f32'
@@ -1459,7 +1738,7 @@ define <4 x float> @test_gather_4f32_const_mask(float* %ptr, <4 x i32> %ind) {
 ; SSE2-LABEL: 'test_gather_4f32_const_mask'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, float* %ptr, <4 x i64> %sext_ind
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x float> %res
 ;
 ; SSE42-LABEL: 'test_gather_4f32_const_mask'

diff  --git a/llvm/test/Analysis/CostModel/X86/shuffle-extract_subvector.ll b/llvm/test/Analysis/CostModel/X86/shuffle-extract_subvector.ll
index 39ad7746b350..7a5ff281b759 100644
--- a/llvm/test/Analysis/CostModel/X86/shuffle-extract_subvector.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-extract_subvector.ll
@@ -38,7 +38,7 @@ define void @test_vXf64(<4 x double> %src256, <8 x double> %src512) {
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_45 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> <i32 4, i32 5>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_67 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> <i32 6, i32 7>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V512_2345 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V512_2345 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 ; AVX-NEXT:  Cost Model: Unknown cost for instruction: %V512_567u = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 5, i32 6, i32 7, i32 undef>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
@@ -51,7 +51,7 @@ define void @test_vXf64(<4 x double> %src256, <8 x double> %src512) {
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> <i32 4, i32 5>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_67 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> <i32 6, i32 7>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V512_2345 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V512_2345 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_4567 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 ; AVX512-NEXT:  Cost Model: Unknown cost for instruction: %V512_567u = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 5, i32 6, i32 7, i32 undef>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
@@ -90,7 +90,7 @@ define void @test_vXi64(<4 x i64> %src256, <8 x i64> %src512) {
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_45 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> <i32 4, i32 5>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_67 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> <i32 6, i32 7>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_2345 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V512_2345 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
@@ -102,7 +102,7 @@ define void @test_vXi64(<4 x i64> %src256, <8 x i64> %src512) {
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> <i32 4, i32 5>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_67 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> <i32 6, i32 7>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_2345 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V512_2345 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_4567 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
@@ -358,7 +358,7 @@ define void @test_vXi16(<4 x i16> %src64, <8 x i16> %src128, <16 x i16> %src256,
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -382,7 +382,7 @@ define void @test_vXi16(<4 x i16> %src64, <8 x i16> %src128, <16 x i16> %src256,
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
@@ -417,7 +417,7 @@ define void @test_vXi16(<4 x i16> %src64, <8 x i16> %src128, <16 x i16> %src256,
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -441,7 +441,7 @@ define void @test_vXi16(<4 x i16> %src64, <8 x i16> %src128, <16 x i16> %src256,
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
@@ -476,7 +476,7 @@ define void @test_vXi16(<4 x i16> %src64, <8 x i16> %src128, <16 x i16> %src256,
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -500,7 +500,7 @@ define void @test_vXi16(<4 x i16> %src64, <8 x i16> %src128, <16 x i16> %src256,
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
@@ -709,9 +709,9 @@ define void @test_vXi8(<8 x i8> %src64, <16 x i8> %src128, <32 x i8> %src256, <6
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_CD = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 12, i32 13>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_EF = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V128_2345 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V128_2345 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_89AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_CDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V128_01234567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -733,9 +733,9 @@ define void @test_vXi8(<8 x i8> %src64, <16 x i8> %src128, <32 x i8> %src256, <6
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_1C_1D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 28, i32 29>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 30, i32 31>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_10_11_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
@@ -828,9 +828,9 @@ define void @test_vXi8(<8 x i8> %src64, <16 x i8> %src128, <32 x i8> %src256, <6
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_CD = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 12, i32 13>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_EF = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V128_2345 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V128_2345 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_89AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_CDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V128_01234567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -852,9 +852,9 @@ define void @test_vXi8(<8 x i8> %src64, <16 x i8> %src128, <32 x i8> %src256, <6
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_1C_1D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 28, i32 29>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 30, i32 31>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_10_11_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>

diff  --git a/llvm/test/Analysis/CostModel/X86/shuffle-insert_subvector.ll b/llvm/test/Analysis/CostModel/X86/shuffle-insert_subvector.ll
index c0e6787236ec..cf60c36eb63e 100644
--- a/llvm/test/Analysis/CostModel/X86/shuffle-insert_subvector.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-insert_subvector.ll
@@ -51,11 +51,11 @@ define void @test_vXf64(<2 x double> %src128, <4 x double> %src256, <8 x double>
 ; AVX512-NEXT:  Cost Model: Unknown cost for instruction: %src256_512 = shufflevector <4 x double> %src256, <4 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <4 x double> %src256, <4 x double> %src128_256, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <4 x double> %src256, <4 x double> %src128_256, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V512_01 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V512_01 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_23 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_67 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V512_0123 = shufflevector <8 x double> %src512, <8 x double> %src256_512, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V512_0123 = shufflevector <8 x double> %src512, <8 x double> %src256_512, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_4567 = shufflevector <8 x double> %src512, <8 x double> %src256_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
@@ -109,11 +109,11 @@ define void @test_vXi64(<2 x i64> %src128, <4 x i64> %src256, <8 x i64> %src512)
 ; AVX512-NEXT:  Cost Model: Unknown cost for instruction: %src256_512 = shufflevector <4 x i64> %src256, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <4 x i64> %src256, <4 x i64> %src128_256, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <4 x i64> %src256, <4 x i64> %src128_256, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V512_01 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V512_01 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_23 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_67 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V512_0123 = shufflevector <8 x i64> %src512, <8 x i64> %src256_512, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V512_0123 = shufflevector <8 x i64> %src512, <8 x i64> %src256_512, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_4567 = shufflevector <8 x i64> %src512, <8 x i64> %src256_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;

diff  --git a/llvm/test/Analysis/CostModel/X86/sitofp.ll b/llvm/test/Analysis/CostModel/X86/sitofp.ll
index b6eb4762ba9c..5799e636c54b 100644
--- a/llvm/test/Analysis/CostModel/X86/sitofp.ll
+++ b/llvm/test/Analysis/CostModel/X86/sitofp.ll
@@ -116,7 +116,7 @@ define i32 @sitofp_i64_double() {
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i64_f64 = sitofp i64 undef to double
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %cvt_v2i64_v2f64 = sitofp <2 x i64> undef to <2 x double>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %cvt_v4i64_v4f64 = sitofp <4 x i64> undef to <4 x double>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %cvt_v8i64_v8f64 = sitofp <8 x i64> undef to <8 x double>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %cvt_v8i64_v8f64 = sitofp <8 x i64> undef to <8 x double>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'sitofp_i64_double'
@@ -241,8 +241,8 @@ define i32 @sitofp_i64_float() {
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i64_f32 = sitofp i64 undef to float
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i64_v2f32 = sitofp <2 x i64> undef to <2 x float>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %cvt_v4i64_v4f32 = sitofp <4 x i64> undef to <4 x float>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %cvt_v8i64_v8f32 = sitofp <8 x i64> undef to <8 x float>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %cvt_v16i64_v16f32 = sitofp <16 x i64> undef to <16 x float>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %cvt_v8i64_v8f32 = sitofp <8 x i64> undef to <8 x float>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %cvt_v16i64_v16f32 = sitofp <16 x i64> undef to <16 x float>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'sitofp_i64_float'

diff  --git a/llvm/test/Analysis/CostModel/X86/vector-extract.ll b/llvm/test/Analysis/CostModel/X86/vector-extract.ll
index 104f9c828695..ad552e452a96 100644
--- a/llvm/test/Analysis/CostModel/X86/vector-extract.ll
+++ b/llvm/test/Analysis/CostModel/X86/vector-extract.ll
@@ -34,12 +34,12 @@ define i32 @extract_double(i32 %arg) {
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f64_1 = extractelement <2 x double> undef, i32 1
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f64_a = extractelement <4 x double> undef, i32 %arg
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4f64_0 = extractelement <4 x double> undef, i32 0
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f64_3 = extractelement <4 x double> undef, i32 3
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f64_3 = extractelement <4 x double> undef, i32 3
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f64_a = extractelement <8 x double> undef, i32 %arg
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8f64_0 = extractelement <8 x double> undef, i32 0
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f64_3 = extractelement <8 x double> undef, i32 3
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f64_3 = extractelement <8 x double> undef, i32 3
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8f64_4 = extractelement <8 x double> undef, i32 4
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f64_7 = extractelement <8 x double> undef, i32 7
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f64_7 = extractelement <8 x double> undef, i32 7
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512-LABEL: 'extract_double'
@@ -48,12 +48,12 @@ define i32 @extract_double(i32 %arg) {
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f64_1 = extractelement <2 x double> undef, i32 1
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f64_a = extractelement <4 x double> undef, i32 %arg
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4f64_0 = extractelement <4 x double> undef, i32 0
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f64_3 = extractelement <4 x double> undef, i32 3
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f64_3 = extractelement <4 x double> undef, i32 3
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f64_a = extractelement <8 x double> undef, i32 %arg
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8f64_0 = extractelement <8 x double> undef, i32 0
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f64_3 = extractelement <8 x double> undef, i32 3
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f64_3 = extractelement <8 x double> undef, i32 3
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f64_4 = extractelement <8 x double> undef, i32 4
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f64_7 = extractelement <8 x double> undef, i32 7
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f64_7 = extractelement <8 x double> undef, i32 7
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %v2f64_a = extractelement <2 x double> undef, i32 %arg
@@ -104,12 +104,12 @@ define i32 @extract_float(i32 %arg) {
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8f32_0 = extractelement <8 x float> undef, i32 0
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_3 = extractelement <8 x float> undef, i32 3
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_4 = extractelement <8 x float> undef, i32 4
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_7 = extractelement <8 x float> undef, i32 7
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_7 = extractelement <8 x float> undef, i32 7
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f32_a = extractelement <16 x float> undef, i32 %arg
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16f32_0 = extractelement <16 x float> undef, i32 0
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f32_3 = extractelement <16 x float> undef, i32 3
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16f32_8 = extractelement <16 x float> undef, i32 8
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f32_15 = extractelement <16 x float> undef, i32 15
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f32_15 = extractelement <16 x float> undef, i32 15
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512-LABEL: 'extract_float'
@@ -123,12 +123,12 @@ define i32 @extract_float(i32 %arg) {
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8f32_0 = extractelement <8 x float> undef, i32 0
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_3 = extractelement <8 x float> undef, i32 3
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_4 = extractelement <8 x float> undef, i32 4
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_7 = extractelement <8 x float> undef, i32 7
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_7 = extractelement <8 x float> undef, i32 7
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f32_a = extractelement <16 x float> undef, i32 %arg
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16f32_0 = extractelement <16 x float> undef, i32 0
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f32_3 = extractelement <16 x float> undef, i32 3
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f32_8 = extractelement <16 x float> undef, i32 8
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f32_15 = extractelement <16 x float> undef, i32 15
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f32_15 = extractelement <16 x float> undef, i32 15
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %v2f32_a = extractelement <2 x float> undef, i32 %arg
@@ -158,43 +158,43 @@ define i32 @extract_i64(i32 %arg) {
 ; SSE2-LABEL: 'extract_i64'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_a = extractelement <2 x i64> undef, i32 %arg
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_0 = extractelement <2 x i64> undef, i32 0
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_1 = extractelement <2 x i64> undef, i32 1
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i64_1 = extractelement <2 x i64> undef, i32 1
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_a = extractelement <4 x i64> undef, i32 %arg
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_0 = extractelement <4 x i64> undef, i32 0
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_3 = extractelement <4 x i64> undef, i32 3
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i64_3 = extractelement <4 x i64> undef, i32 3
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_a = extractelement <8 x i64> undef, i32 %arg
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_0 = extractelement <8 x i64> undef, i32 0
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_3 = extractelement <8 x i64> undef, i32 3
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i64_3 = extractelement <8 x i64> undef, i32 3
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_4 = extractelement <8 x i64> undef, i32 4
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_7 = extractelement <8 x i64> undef, i32 7
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i64_7 = extractelement <8 x i64> undef, i32 7
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE3-LABEL: 'extract_i64'
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_a = extractelement <2 x i64> undef, i32 %arg
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_0 = extractelement <2 x i64> undef, i32 0
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_1 = extractelement <2 x i64> undef, i32 1
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i64_1 = extractelement <2 x i64> undef, i32 1
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_a = extractelement <4 x i64> undef, i32 %arg
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_0 = extractelement <4 x i64> undef, i32 0
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_3 = extractelement <4 x i64> undef, i32 3
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i64_3 = extractelement <4 x i64> undef, i32 3
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_a = extractelement <8 x i64> undef, i32 %arg
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_0 = extractelement <8 x i64> undef, i32 0
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_3 = extractelement <8 x i64> undef, i32 3
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i64_3 = extractelement <8 x i64> undef, i32 3
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_4 = extractelement <8 x i64> undef, i32 4
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_7 = extractelement <8 x i64> undef, i32 7
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i64_7 = extractelement <8 x i64> undef, i32 7
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'extract_i64'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_a = extractelement <2 x i64> undef, i32 %arg
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_0 = extractelement <2 x i64> undef, i32 0
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_1 = extractelement <2 x i64> undef, i32 1
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i64_1 = extractelement <2 x i64> undef, i32 1
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_a = extractelement <4 x i64> undef, i32 %arg
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_0 = extractelement <4 x i64> undef, i32 0
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_3 = extractelement <4 x i64> undef, i32 3
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i64_3 = extractelement <4 x i64> undef, i32 3
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_a = extractelement <8 x i64> undef, i32 %arg
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_0 = extractelement <8 x i64> undef, i32 0
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_3 = extractelement <8 x i64> undef, i32 3
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i64_3 = extractelement <8 x i64> undef, i32 3
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_4 = extractelement <8 x i64> undef, i32 4
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_7 = extractelement <8 x i64> undef, i32 7
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i64_7 = extractelement <8 x i64> undef, i32 7
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE4-LABEL: 'extract_i64'
@@ -217,12 +217,12 @@ define i32 @extract_i64(i32 %arg) {
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_1 = extractelement <2 x i64> undef, i32 1
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_a = extractelement <4 x i64> undef, i32 %arg
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_0 = extractelement <4 x i64> undef, i32 0
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_3 = extractelement <4 x i64> undef, i32 3
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i64_3 = extractelement <4 x i64> undef, i32 3
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_a = extractelement <8 x i64> undef, i32 %arg
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_0 = extractelement <8 x i64> undef, i32 0
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_3 = extractelement <8 x i64> undef, i32 3
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i64_3 = extractelement <8 x i64> undef, i32 3
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_4 = extractelement <8 x i64> undef, i32 4
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_7 = extractelement <8 x i64> undef, i32 7
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i64_7 = extractelement <8 x i64> undef, i32 7
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512-LABEL: 'extract_i64'
@@ -231,12 +231,12 @@ define i32 @extract_i64(i32 %arg) {
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_1 = extractelement <2 x i64> undef, i32 1
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_a = extractelement <4 x i64> undef, i32 %arg
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_0 = extractelement <4 x i64> undef, i32 0
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_3 = extractelement <4 x i64> undef, i32 3
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i64_3 = extractelement <4 x i64> undef, i32 3
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_a = extractelement <8 x i64> undef, i32 %arg
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_0 = extractelement <8 x i64> undef, i32 0
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_3 = extractelement <8 x i64> undef, i32 3
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_4 = extractelement <8 x i64> undef, i32 4
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_7 = extractelement <8 x i64> undef, i32 7
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i64_3 = extractelement <8 x i64> undef, i32 3
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i64_4 = extractelement <8 x i64> undef, i32 4
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i64_7 = extractelement <8 x i64> undef, i32 7
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SLM-LABEL: 'extract_i64'
@@ -288,58 +288,58 @@ define i32 @extract_i32(i32 %arg) {
 ; SSE2-LABEL: 'extract_i32'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_a = extractelement <2 x i32> undef, i32 %arg
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_0 = extractelement <2 x i32> undef, i32 0
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_1 = extractelement <2 x i32> undef, i32 1
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i32_1 = extractelement <2 x i32> undef, i32 1
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_a = extractelement <4 x i32> undef, i32 %arg
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_0 = extractelement <4 x i32> undef, i32 0
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_3 = extractelement <4 x i32> undef, i32 3
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i32_3 = extractelement <4 x i32> undef, i32 3
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_a = extractelement <8 x i32> undef, i32 %arg
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_0 = extractelement <8 x i32> undef, i32 0
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_3 = extractelement <8 x i32> undef, i32 3
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i32_3 = extractelement <8 x i32> undef, i32 3
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_4 = extractelement <8 x i32> undef, i32 4
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_7 = extractelement <8 x i32> undef, i32 7
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i32_7 = extractelement <8 x i32> undef, i32 7
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_a = extractelement <16 x i32> undef, i32 %arg
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_0 = extractelement <16 x i32> undef, i32 0
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_3 = extractelement <16 x i32> undef, i32 3
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i32_3 = extractelement <16 x i32> undef, i32 3
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_8 = extractelement <16 x i32> undef, i32 8
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_15 = extractelement <16 x i32> undef, i32 15
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i32_15 = extractelement <16 x i32> undef, i32 15
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE3-LABEL: 'extract_i32'
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_a = extractelement <2 x i32> undef, i32 %arg
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_0 = extractelement <2 x i32> undef, i32 0
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_1 = extractelement <2 x i32> undef, i32 1
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i32_1 = extractelement <2 x i32> undef, i32 1
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_a = extractelement <4 x i32> undef, i32 %arg
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_0 = extractelement <4 x i32> undef, i32 0
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_3 = extractelement <4 x i32> undef, i32 3
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i32_3 = extractelement <4 x i32> undef, i32 3
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_a = extractelement <8 x i32> undef, i32 %arg
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_0 = extractelement <8 x i32> undef, i32 0
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_3 = extractelement <8 x i32> undef, i32 3
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i32_3 = extractelement <8 x i32> undef, i32 3
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_4 = extractelement <8 x i32> undef, i32 4
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_7 = extractelement <8 x i32> undef, i32 7
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i32_7 = extractelement <8 x i32> undef, i32 7
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_a = extractelement <16 x i32> undef, i32 %arg
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_0 = extractelement <16 x i32> undef, i32 0
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_3 = extractelement <16 x i32> undef, i32 3
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i32_3 = extractelement <16 x i32> undef, i32 3
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_8 = extractelement <16 x i32> undef, i32 8
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_15 = extractelement <16 x i32> undef, i32 15
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i32_15 = extractelement <16 x i32> undef, i32 15
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'extract_i32'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_a = extractelement <2 x i32> undef, i32 %arg
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_0 = extractelement <2 x i32> undef, i32 0
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_1 = extractelement <2 x i32> undef, i32 1
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i32_1 = extractelement <2 x i32> undef, i32 1
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_a = extractelement <4 x i32> undef, i32 %arg
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_0 = extractelement <4 x i32> undef, i32 0
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_3 = extractelement <4 x i32> undef, i32 3
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i32_3 = extractelement <4 x i32> undef, i32 3
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_a = extractelement <8 x i32> undef, i32 %arg
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_0 = extractelement <8 x i32> undef, i32 0
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_3 = extractelement <8 x i32> undef, i32 3
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i32_3 = extractelement <8 x i32> undef, i32 3
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_4 = extractelement <8 x i32> undef, i32 4
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_7 = extractelement <8 x i32> undef, i32 7
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i32_7 = extractelement <8 x i32> undef, i32 7
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_a = extractelement <16 x i32> undef, i32 %arg
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_0 = extractelement <16 x i32> undef, i32 0
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_3 = extractelement <16 x i32> undef, i32 3
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i32_3 = extractelement <16 x i32> undef, i32 3
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_8 = extractelement <16 x i32> undef, i32 8
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_15 = extractelement <16 x i32> undef, i32 15
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i32_15 = extractelement <16 x i32> undef, i32 15
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE4-LABEL: 'extract_i32'
@@ -371,13 +371,13 @@ define i32 @extract_i32(i32 %arg) {
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_a = extractelement <8 x i32> undef, i32 %arg
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_0 = extractelement <8 x i32> undef, i32 0
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_3 = extractelement <8 x i32> undef, i32 3
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_4 = extractelement <8 x i32> undef, i32 4
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_7 = extractelement <8 x i32> undef, i32 7
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i32_4 = extractelement <8 x i32> undef, i32 4
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i32_7 = extractelement <8 x i32> undef, i32 7
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_a = extractelement <16 x i32> undef, i32 %arg
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_0 = extractelement <16 x i32> undef, i32 0
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_3 = extractelement <16 x i32> undef, i32 3
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_8 = extractelement <16 x i32> undef, i32 8
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_15 = extractelement <16 x i32> undef, i32 15
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i32_15 = extractelement <16 x i32> undef, i32 15
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512-LABEL: 'extract_i32'
@@ -390,13 +390,13 @@ define i32 @extract_i32(i32 %arg) {
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_a = extractelement <8 x i32> undef, i32 %arg
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_0 = extractelement <8 x i32> undef, i32 0
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_3 = extractelement <8 x i32> undef, i32 3
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_4 = extractelement <8 x i32> undef, i32 4
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_7 = extractelement <8 x i32> undef, i32 7
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i32_4 = extractelement <8 x i32> undef, i32 4
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i32_7 = extractelement <8 x i32> undef, i32 7
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_a = extractelement <16 x i32> undef, i32 %arg
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_0 = extractelement <16 x i32> undef, i32 0
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_3 = extractelement <16 x i32> undef, i32 3
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_8 = extractelement <16 x i32> undef, i32 8
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_15 = extractelement <16 x i32> undef, i32 15
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i32_8 = extractelement <16 x i32> undef, i32 8
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i32_15 = extractelement <16 x i32> undef, i32 15
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SLM-LABEL: 'extract_i32'
@@ -544,36 +544,55 @@ define i32 @extract_i16(i32 %arg) {
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_a = extractelement <16 x i16> undef, i32 %arg
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_0 = extractelement <16 x i16> undef, i32 0
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_7 = extractelement <16 x i16> undef, i32 7
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_8 = extractelement <16 x i16> undef, i32 8
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_15 = extractelement <16 x i16> undef, i32 15
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i16_8 = extractelement <16 x i16> undef, i32 8
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i16_15 = extractelement <16 x i16> undef, i32 15
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_a = extractelement <32 x i16> undef, i32 %arg
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_0 = extractelement <32 x i16> undef, i32 0
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_7 = extractelement <32 x i16> undef, i32 7
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_8 = extractelement <32 x i16> undef, i32 8
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_15 = extractelement <32 x i16> undef, i32 15
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i16_8 = extractelement <32 x i16> undef, i32 8
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i16_15 = extractelement <32 x i16> undef, i32 15
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_16 = extractelement <32 x i16> undef, i32 16
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_24 = extractelement <32 x i16> undef, i32 24
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_31 = extractelement <32 x i16> undef, i32 31
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i16_24 = extractelement <32 x i16> undef, i32 24
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i16_31 = extractelement <32 x i16> undef, i32 31
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-; AVX512-LABEL: 'extract_i16'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_a = extractelement <8 x i16> undef, i32 %arg
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_0 = extractelement <8 x i16> undef, i32 0
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_7 = extractelement <8 x i16> undef, i32 7
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_a = extractelement <16 x i16> undef, i32 %arg
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_0 = extractelement <16 x i16> undef, i32 0
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_7 = extractelement <16 x i16> undef, i32 7
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_8 = extractelement <16 x i16> undef, i32 8
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_15 = extractelement <16 x i16> undef, i32 15
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_a = extractelement <32 x i16> undef, i32 %arg
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_0 = extractelement <32 x i16> undef, i32 0
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_7 = extractelement <32 x i16> undef, i32 7
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_8 = extractelement <32 x i16> undef, i32 8
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_15 = extractelement <32 x i16> undef, i32 15
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_16 = extractelement <32 x i16> undef, i32 16
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_24 = extractelement <32 x i16> undef, i32 24
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_31 = extractelement <32 x i16> undef, i32 31
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512F-LABEL: 'extract_i16'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_a = extractelement <8 x i16> undef, i32 %arg
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_0 = extractelement <8 x i16> undef, i32 0
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_7 = extractelement <8 x i16> undef, i32 7
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_a = extractelement <16 x i16> undef, i32 %arg
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_0 = extractelement <16 x i16> undef, i32 0
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_7 = extractelement <16 x i16> undef, i32 7
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i16_8 = extractelement <16 x i16> undef, i32 8
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i16_15 = extractelement <16 x i16> undef, i32 15
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_a = extractelement <32 x i16> undef, i32 %arg
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_0 = extractelement <32 x i16> undef, i32 0
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_7 = extractelement <32 x i16> undef, i32 7
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i16_8 = extractelement <32 x i16> undef, i32 8
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i16_15 = extractelement <32 x i16> undef, i32 15
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_16 = extractelement <32 x i16> undef, i32 16
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i16_24 = extractelement <32 x i16> undef, i32 24
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i16_31 = extractelement <32 x i16> undef, i32 31
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512BW-LABEL: 'extract_i16'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_a = extractelement <8 x i16> undef, i32 %arg
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_0 = extractelement <8 x i16> undef, i32 0
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_7 = extractelement <8 x i16> undef, i32 7
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_a = extractelement <16 x i16> undef, i32 %arg
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_0 = extractelement <16 x i16> undef, i32 0
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_7 = extractelement <16 x i16> undef, i32 7
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i16_8 = extractelement <16 x i16> undef, i32 8
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i16_15 = extractelement <16 x i16> undef, i32 15
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_a = extractelement <32 x i16> undef, i32 %arg
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_0 = extractelement <32 x i16> undef, i32 0
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_7 = extractelement <32 x i16> undef, i32 7
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i16_8 = extractelement <32 x i16> undef, i32 8
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i16_15 = extractelement <32 x i16> undef, i32 15
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i16_16 = extractelement <32 x i16> undef, i32 16
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i16_24 = extractelement <32 x i16> undef, i32 24
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i16_31 = extractelement <32 x i16> undef, i32 31
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SLM-LABEL: 'extract_i16'
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_a = extractelement <8 x i16> undef, i32 %arg
@@ -639,73 +658,73 @@ define i32 @extract_i8(i32 %arg) {
 ; SSE2-LABEL: 'extract_i8'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_a = extractelement <16 x i8> undef, i32 %arg
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_0 = extractelement <16 x i8> undef, i32 0
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_8 = extractelement <16 x i8> undef, i32 8
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_15 = extractelement <16 x i8> undef, i32 15
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i8_8 = extractelement <16 x i8> undef, i32 8
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i8_15 = extractelement <16 x i8> undef, i32 15
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_a = extractelement <32 x i8> undef, i32 %arg
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_0 = extractelement <32 x i8> undef, i32 0
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_7 = extractelement <32 x i8> undef, i32 7
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_8 = extractelement <32 x i8> undef, i32 8
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_15 = extractelement <32 x i8> undef, i32 15
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_24 = extractelement <32 x i8> undef, i32 24
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_31 = extractelement <32 x i8> undef, i32 31
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i8_7 = extractelement <32 x i8> undef, i32 7
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i8_8 = extractelement <32 x i8> undef, i32 8
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i8_15 = extractelement <32 x i8> undef, i32 15
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i8_24 = extractelement <32 x i8> undef, i32 24
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i8_31 = extractelement <32 x i8> undef, i32 31
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_a = extractelement <64 x i8> undef, i32 %arg
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_0 = extractelement <64 x i8> undef, i32 0
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_7 = extractelement <64 x i8> undef, i32 7
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_8 = extractelement <64 x i8> undef, i32 8
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_15 = extractelement <64 x i8> undef, i32 15
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_24 = extractelement <64 x i8> undef, i32 24
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_31 = extractelement <64 x i8> undef, i32 31
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v64i8_7 = extractelement <64 x i8> undef, i32 7
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v64i8_8 = extractelement <64 x i8> undef, i32 8
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v64i8_15 = extractelement <64 x i8> undef, i32 15
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v64i8_24 = extractelement <64 x i8> undef, i32 24
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v64i8_31 = extractelement <64 x i8> undef, i32 31
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_32 = extractelement <64 x i8> undef, i32 32
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_48 = extractelement <64 x i8> undef, i32 48
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_63 = extractelement <64 x i8> undef, i32 63
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v64i8_63 = extractelement <64 x i8> undef, i32 63
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE3-LABEL: 'extract_i8'
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_a = extractelement <16 x i8> undef, i32 %arg
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_0 = extractelement <16 x i8> undef, i32 0
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_8 = extractelement <16 x i8> undef, i32 8
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_15 = extractelement <16 x i8> undef, i32 15
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i8_8 = extractelement <16 x i8> undef, i32 8
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i8_15 = extractelement <16 x i8> undef, i32 15
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_a = extractelement <32 x i8> undef, i32 %arg
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_0 = extractelement <32 x i8> undef, i32 0
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_7 = extractelement <32 x i8> undef, i32 7
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_8 = extractelement <32 x i8> undef, i32 8
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_15 = extractelement <32 x i8> undef, i32 15
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_24 = extractelement <32 x i8> undef, i32 24
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_31 = extractelement <32 x i8> undef, i32 31
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i8_7 = extractelement <32 x i8> undef, i32 7
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i8_8 = extractelement <32 x i8> undef, i32 8
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i8_15 = extractelement <32 x i8> undef, i32 15
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i8_24 = extractelement <32 x i8> undef, i32 24
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i8_31 = extractelement <32 x i8> undef, i32 31
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_a = extractelement <64 x i8> undef, i32 %arg
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_0 = extractelement <64 x i8> undef, i32 0
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_7 = extractelement <64 x i8> undef, i32 7
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_8 = extractelement <64 x i8> undef, i32 8
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_15 = extractelement <64 x i8> undef, i32 15
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_24 = extractelement <64 x i8> undef, i32 24
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_31 = extractelement <64 x i8> undef, i32 31
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v64i8_7 = extractelement <64 x i8> undef, i32 7
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v64i8_8 = extractelement <64 x i8> undef, i32 8
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v64i8_15 = extractelement <64 x i8> undef, i32 15
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v64i8_24 = extractelement <64 x i8> undef, i32 24
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v64i8_31 = extractelement <64 x i8> undef, i32 31
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_32 = extractelement <64 x i8> undef, i32 32
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_48 = extractelement <64 x i8> undef, i32 48
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_63 = extractelement <64 x i8> undef, i32 63
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v64i8_63 = extractelement <64 x i8> undef, i32 63
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'extract_i8'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_a = extractelement <16 x i8> undef, i32 %arg
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_0 = extractelement <16 x i8> undef, i32 0
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_8 = extractelement <16 x i8> undef, i32 8
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_15 = extractelement <16 x i8> undef, i32 15
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i8_8 = extractelement <16 x i8> undef, i32 8
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i8_15 = extractelement <16 x i8> undef, i32 15
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_a = extractelement <32 x i8> undef, i32 %arg
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_0 = extractelement <32 x i8> undef, i32 0
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_7 = extractelement <32 x i8> undef, i32 7
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_8 = extractelement <32 x i8> undef, i32 8
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_15 = extractelement <32 x i8> undef, i32 15
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_24 = extractelement <32 x i8> undef, i32 24
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_31 = extractelement <32 x i8> undef, i32 31
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i8_7 = extractelement <32 x i8> undef, i32 7
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i8_8 = extractelement <32 x i8> undef, i32 8
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i8_15 = extractelement <32 x i8> undef, i32 15
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i8_24 = extractelement <32 x i8> undef, i32 24
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i8_31 = extractelement <32 x i8> undef, i32 31
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_a = extractelement <64 x i8> undef, i32 %arg
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_0 = extractelement <64 x i8> undef, i32 0
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_7 = extractelement <64 x i8> undef, i32 7
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_8 = extractelement <64 x i8> undef, i32 8
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_15 = extractelement <64 x i8> undef, i32 15
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_24 = extractelement <64 x i8> undef, i32 24
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_31 = extractelement <64 x i8> undef, i32 31
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v64i8_7 = extractelement <64 x i8> undef, i32 7
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v64i8_8 = extractelement <64 x i8> undef, i32 8
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v64i8_15 = extractelement <64 x i8> undef, i32 15
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v64i8_24 = extractelement <64 x i8> undef, i32 24
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v64i8_31 = extractelement <64 x i8> undef, i32 31
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_32 = extractelement <64 x i8> undef, i32 32
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_48 = extractelement <64 x i8> undef, i32 48
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_63 = extractelement <64 x i8> undef, i32 63
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v64i8_63 = extractelement <64 x i8> undef, i32 63
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE4-LABEL: 'extract_i8'
@@ -742,43 +761,67 @@ define i32 @extract_i8(i32 %arg) {
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_7 = extractelement <32 x i8> undef, i32 7
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_8 = extractelement <32 x i8> undef, i32 8
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_15 = extractelement <32 x i8> undef, i32 15
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_24 = extractelement <32 x i8> undef, i32 24
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_31 = extractelement <32 x i8> undef, i32 31
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i8_24 = extractelement <32 x i8> undef, i32 24
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i8_31 = extractelement <32 x i8> undef, i32 31
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_a = extractelement <64 x i8> undef, i32 %arg
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_0 = extractelement <64 x i8> undef, i32 0
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_7 = extractelement <64 x i8> undef, i32 7
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_8 = extractelement <64 x i8> undef, i32 8
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_15 = extractelement <64 x i8> undef, i32 15
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_24 = extractelement <64 x i8> undef, i32 24
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_31 = extractelement <64 x i8> undef, i32 31
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v64i8_24 = extractelement <64 x i8> undef, i32 24
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v64i8_31 = extractelement <64 x i8> undef, i32 31
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_32 = extractelement <64 x i8> undef, i32 32
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_48 = extractelement <64 x i8> undef, i32 48
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_63 = extractelement <64 x i8> undef, i32 63
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v64i8_48 = extractelement <64 x i8> undef, i32 48
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v64i8_63 = extractelement <64 x i8> undef, i32 63
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-; AVX512-LABEL: 'extract_i8'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_a = extractelement <16 x i8> undef, i32 %arg
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_0 = extractelement <16 x i8> undef, i32 0
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_8 = extractelement <16 x i8> undef, i32 8
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_15 = extractelement <16 x i8> undef, i32 15
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_a = extractelement <32 x i8> undef, i32 %arg
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_0 = extractelement <32 x i8> undef, i32 0
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_7 = extractelement <32 x i8> undef, i32 7
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_8 = extractelement <32 x i8> undef, i32 8
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_15 = extractelement <32 x i8> undef, i32 15
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_24 = extractelement <32 x i8> undef, i32 24
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_31 = extractelement <32 x i8> undef, i32 31
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_a = extractelement <64 x i8> undef, i32 %arg
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_0 = extractelement <64 x i8> undef, i32 0
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_7 = extractelement <64 x i8> undef, i32 7
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_8 = extractelement <64 x i8> undef, i32 8
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_15 = extractelement <64 x i8> undef, i32 15
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_24 = extractelement <64 x i8> undef, i32 24
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_31 = extractelement <64 x i8> undef, i32 31
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_32 = extractelement <64 x i8> undef, i32 32
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_48 = extractelement <64 x i8> undef, i32 48
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_63 = extractelement <64 x i8> undef, i32 63
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512F-LABEL: 'extract_i8'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_a = extractelement <16 x i8> undef, i32 %arg
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_0 = extractelement <16 x i8> undef, i32 0
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_8 = extractelement <16 x i8> undef, i32 8
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_15 = extractelement <16 x i8> undef, i32 15
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_a = extractelement <32 x i8> undef, i32 %arg
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_0 = extractelement <32 x i8> undef, i32 0
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_7 = extractelement <32 x i8> undef, i32 7
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_8 = extractelement <32 x i8> undef, i32 8
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_15 = extractelement <32 x i8> undef, i32 15
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i8_24 = extractelement <32 x i8> undef, i32 24
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i8_31 = extractelement <32 x i8> undef, i32 31
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_a = extractelement <64 x i8> undef, i32 %arg
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_0 = extractelement <64 x i8> undef, i32 0
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_7 = extractelement <64 x i8> undef, i32 7
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_8 = extractelement <64 x i8> undef, i32 8
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_15 = extractelement <64 x i8> undef, i32 15
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v64i8_24 = extractelement <64 x i8> undef, i32 24
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v64i8_31 = extractelement <64 x i8> undef, i32 31
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_32 = extractelement <64 x i8> undef, i32 32
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v64i8_48 = extractelement <64 x i8> undef, i32 48
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v64i8_63 = extractelement <64 x i8> undef, i32 63
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512BW-LABEL: 'extract_i8'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_a = extractelement <16 x i8> undef, i32 %arg
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_0 = extractelement <16 x i8> undef, i32 0
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_8 = extractelement <16 x i8> undef, i32 8
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_15 = extractelement <16 x i8> undef, i32 15
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_a = extractelement <32 x i8> undef, i32 %arg
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_0 = extractelement <32 x i8> undef, i32 0
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_7 = extractelement <32 x i8> undef, i32 7
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_8 = extractelement <32 x i8> undef, i32 8
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_15 = extractelement <32 x i8> undef, i32 15
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i8_24 = extractelement <32 x i8> undef, i32 24
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i8_31 = extractelement <32 x i8> undef, i32 31
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_a = extractelement <64 x i8> undef, i32 %arg
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_0 = extractelement <64 x i8> undef, i32 0
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_7 = extractelement <64 x i8> undef, i32 7
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_8 = extractelement <64 x i8> undef, i32 8
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_15 = extractelement <64 x i8> undef, i32 15
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v64i8_24 = extractelement <64 x i8> undef, i32 24
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v64i8_31 = extractelement <64 x i8> undef, i32 31
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v64i8_32 = extractelement <64 x i8> undef, i32 32
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v64i8_48 = extractelement <64 x i8> undef, i32 48
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v64i8_63 = extractelement <64 x i8> undef, i32 63
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SLM-LABEL: 'extract_i8'
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_a = extractelement <16 x i8> undef, i32 %arg

diff  --git a/llvm/test/Analysis/CostModel/X86/vector-insert.ll b/llvm/test/Analysis/CostModel/X86/vector-insert.ll
index 78729213b0d4..6802fd2a7b6b 100644
--- a/llvm/test/Analysis/CostModel/X86/vector-insert.ll
+++ b/llvm/test/Analysis/CostModel/X86/vector-insert.ll
@@ -9,8 +9,8 @@
 ; RUN: opt < %s -mtriple=x86_64-apple-macosx10.8.0 -cost-model -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
 ; RUN: opt < %s -mtriple=x86_64-apple-macosx10.8.0 -cost-model -analyze -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW
 ;
-; RUN: opt < %s -mtriple=x86_64-apple-macosx10.8.0 -cost-model -analyze -mcpu=slm | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
-; RUN: opt < %s -mtriple=x86_64-apple-macosx10.8.0 -cost-model -analyze -mcpu=goldmont | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
+; RUN: opt < %s -mtriple=x86_64-apple-macosx10.8.0 -cost-model -analyze -mcpu=slm | FileCheck %s --check-prefixes=CHECK,SSE,SLM
+; RUN: opt < %s -mtriple=x86_64-apple-macosx10.8.0 -cost-model -analyze -mcpu=goldmont | FileCheck %s --check-prefixes=CHECK,SSE,GLM
 ; RUN: opt < %s -mtriple=x86_64-apple-macosx10.8.0 -cost-model -analyze -mcpu=btver2 | FileCheck %s --check-prefixes=CHECK,AVX,BTVER2
 
 define i32 @insert_double(i32 %arg) {
@@ -34,12 +34,12 @@ define i32 @insert_double(i32 %arg) {
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f64_1 = insertelement <2 x double> undef, double undef, i32 1
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f64_a = insertelement <4 x double> undef, double undef, i32 %arg
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4f64_0 = insertelement <4 x double> undef, double undef, i32 0
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f64_3 = insertelement <4 x double> undef, double undef, i32 3
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f64_3 = insertelement <4 x double> undef, double undef, i32 3
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f64_a = insertelement <8 x double> undef, double undef, i32 %arg
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8f64_0 = insertelement <8 x double> undef, double undef, i32 0
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f64_3 = insertelement <8 x double> undef, double undef, i32 3
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f64_3 = insertelement <8 x double> undef, double undef, i32 3
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8f64_4 = insertelement <8 x double> undef, double undef, i32 4
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f64_7 = insertelement <8 x double> undef, double undef, i32 7
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f64_7 = insertelement <8 x double> undef, double undef, i32 7
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512-LABEL: 'insert_double'
@@ -48,12 +48,12 @@ define i32 @insert_double(i32 %arg) {
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f64_1 = insertelement <2 x double> undef, double undef, i32 1
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f64_a = insertelement <4 x double> undef, double undef, i32 %arg
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4f64_0 = insertelement <4 x double> undef, double undef, i32 0
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f64_3 = insertelement <4 x double> undef, double undef, i32 3
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f64_3 = insertelement <4 x double> undef, double undef, i32 3
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f64_a = insertelement <8 x double> undef, double undef, i32 %arg
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8f64_0 = insertelement <8 x double> undef, double undef, i32 0
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f64_3 = insertelement <8 x double> undef, double undef, i32 3
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f64_4 = insertelement <8 x double> undef, double undef, i32 4
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f64_7 = insertelement <8 x double> undef, double undef, i32 7
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f64_3 = insertelement <8 x double> undef, double undef, i32 3
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f64_4 = insertelement <8 x double> undef, double undef, i32 4
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f64_7 = insertelement <8 x double> undef, double undef, i32 7
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %v2f64_a = insertelement <2 x double> undef, double undef, i32 %arg
@@ -74,24 +74,81 @@ define i32 @insert_double(i32 %arg) {
 }
 
 define i32 @insert_float(i32 %arg) {
-; SSE-LABEL: 'insert_float'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32_a = insertelement <2 x float> undef, float undef, i32 %arg
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2f32_0 = insertelement <2 x float> undef, float undef, i32 0
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32_1 = insertelement <2 x float> undef, float undef, i32 1
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32_a = insertelement <4 x float> undef, float undef, i32 %arg
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4f32_0 = insertelement <4 x float> undef, float undef, i32 0
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32_3 = insertelement <4 x float> undef, float undef, i32 3
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_a = insertelement <8 x float> undef, float undef, i32 %arg
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8f32_0 = insertelement <8 x float> undef, float undef, i32 0
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_3 = insertelement <8 x float> undef, float undef, i32 3
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8f32_4 = insertelement <8 x float> undef, float undef, i32 4
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_7 = insertelement <8 x float> undef, float undef, i32 7
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f32_a = insertelement <16 x float> undef, float undef, i32 %arg
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16f32_0 = insertelement <16 x float> undef, float undef, i32 0
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f32_3 = insertelement <16 x float> undef, float undef, i32 3
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16f32_8 = insertelement <16 x float> undef, float undef, i32 8
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f32_15 = insertelement <16 x float> undef, float undef, i32 15
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE2-LABEL: 'insert_float'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32_a = insertelement <2 x float> undef, float undef, i32 %arg
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2f32_0 = insertelement <2 x float> undef, float undef, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32_1 = insertelement <2 x float> undef, float undef, i32 1
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32_a = insertelement <4 x float> undef, float undef, i32 %arg
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4f32_0 = insertelement <4 x float> undef, float undef, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f32_3 = insertelement <4 x float> undef, float undef, i32 3
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_a = insertelement <8 x float> undef, float undef, i32 %arg
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8f32_0 = insertelement <8 x float> undef, float undef, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_3 = insertelement <8 x float> undef, float undef, i32 3
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8f32_4 = insertelement <8 x float> undef, float undef, i32 4
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_7 = insertelement <8 x float> undef, float undef, i32 7
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f32_a = insertelement <16 x float> undef, float undef, i32 %arg
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16f32_0 = insertelement <16 x float> undef, float undef, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f32_3 = insertelement <16 x float> undef, float undef, i32 3
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16f32_8 = insertelement <16 x float> undef, float undef, i32 8
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f32_15 = insertelement <16 x float> undef, float undef, i32 15
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE3-LABEL: 'insert_float'
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32_a = insertelement <2 x float> undef, float undef, i32 %arg
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2f32_0 = insertelement <2 x float> undef, float undef, i32 0
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32_1 = insertelement <2 x float> undef, float undef, i32 1
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32_a = insertelement <4 x float> undef, float undef, i32 %arg
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4f32_0 = insertelement <4 x float> undef, float undef, i32 0
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f32_3 = insertelement <4 x float> undef, float undef, i32 3
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_a = insertelement <8 x float> undef, float undef, i32 %arg
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8f32_0 = insertelement <8 x float> undef, float undef, i32 0
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_3 = insertelement <8 x float> undef, float undef, i32 3
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8f32_4 = insertelement <8 x float> undef, float undef, i32 4
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_7 = insertelement <8 x float> undef, float undef, i32 7
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f32_a = insertelement <16 x float> undef, float undef, i32 %arg
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16f32_0 = insertelement <16 x float> undef, float undef, i32 0
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f32_3 = insertelement <16 x float> undef, float undef, i32 3
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16f32_8 = insertelement <16 x float> undef, float undef, i32 8
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f32_15 = insertelement <16 x float> undef, float undef, i32 15
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'insert_float'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32_a = insertelement <2 x float> undef, float undef, i32 %arg
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2f32_0 = insertelement <2 x float> undef, float undef, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32_1 = insertelement <2 x float> undef, float undef, i32 1
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32_a = insertelement <4 x float> undef, float undef, i32 %arg
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4f32_0 = insertelement <4 x float> undef, float undef, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f32_3 = insertelement <4 x float> undef, float undef, i32 3
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_a = insertelement <8 x float> undef, float undef, i32 %arg
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8f32_0 = insertelement <8 x float> undef, float undef, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_3 = insertelement <8 x float> undef, float undef, i32 3
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8f32_4 = insertelement <8 x float> undef, float undef, i32 4
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_7 = insertelement <8 x float> undef, float undef, i32 7
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f32_a = insertelement <16 x float> undef, float undef, i32 %arg
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16f32_0 = insertelement <16 x float> undef, float undef, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f32_3 = insertelement <16 x float> undef, float undef, i32 3
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16f32_8 = insertelement <16 x float> undef, float undef, i32 8
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f32_15 = insertelement <16 x float> undef, float undef, i32 15
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE4-LABEL: 'insert_float'
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32_a = insertelement <2 x float> undef, float undef, i32 %arg
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2f32_0 = insertelement <2 x float> undef, float undef, i32 0
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32_1 = insertelement <2 x float> undef, float undef, i32 1
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32_a = insertelement <4 x float> undef, float undef, i32 %arg
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4f32_0 = insertelement <4 x float> undef, float undef, i32 0
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32_3 = insertelement <4 x float> undef, float undef, i32 3
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_a = insertelement <8 x float> undef, float undef, i32 %arg
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8f32_0 = insertelement <8 x float> undef, float undef, i32 0
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_3 = insertelement <8 x float> undef, float undef, i32 3
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8f32_4 = insertelement <8 x float> undef, float undef, i32 4
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_7 = insertelement <8 x float> undef, float undef, i32 7
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f32_a = insertelement <16 x float> undef, float undef, i32 %arg
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16f32_0 = insertelement <16 x float> undef, float undef, i32 0
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f32_3 = insertelement <16 x float> undef, float undef, i32 3
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16f32_8 = insertelement <16 x float> undef, float undef, i32 8
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f32_15 = insertelement <16 x float> undef, float undef, i32 15
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX-LABEL: 'insert_float'
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32_a = insertelement <2 x float> undef, float undef, i32 %arg
@@ -103,13 +160,13 @@ define i32 @insert_float(i32 %arg) {
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_a = insertelement <8 x float> undef, float undef, i32 %arg
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8f32_0 = insertelement <8 x float> undef, float undef, i32 0
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_3 = insertelement <8 x float> undef, float undef, i32 3
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_4 = insertelement <8 x float> undef, float undef, i32 4
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_7 = insertelement <8 x float> undef, float undef, i32 7
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_4 = insertelement <8 x float> undef, float undef, i32 4
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f32_7 = insertelement <8 x float> undef, float undef, i32 7
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f32_a = insertelement <16 x float> undef, float undef, i32 %arg
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16f32_0 = insertelement <16 x float> undef, float undef, i32 0
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f32_3 = insertelement <16 x float> undef, float undef, i32 3
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16f32_8 = insertelement <16 x float> undef, float undef, i32 8
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f32_15 = insertelement <16 x float> undef, float undef, i32 15
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16f32_15 = insertelement <16 x float> undef, float undef, i32 15
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512-LABEL: 'insert_float'
@@ -122,14 +179,52 @@ define i32 @insert_float(i32 %arg) {
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_a = insertelement <8 x float> undef, float undef, i32 %arg
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8f32_0 = insertelement <8 x float> undef, float undef, i32 0
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_3 = insertelement <8 x float> undef, float undef, i32 3
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_4 = insertelement <8 x float> undef, float undef, i32 4
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_7 = insertelement <8 x float> undef, float undef, i32 7
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_4 = insertelement <8 x float> undef, float undef, i32 4
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f32_7 = insertelement <8 x float> undef, float undef, i32 7
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f32_a = insertelement <16 x float> undef, float undef, i32 %arg
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16f32_0 = insertelement <16 x float> undef, float undef, i32 0
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f32_3 = insertelement <16 x float> undef, float undef, i32 3
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f32_8 = insertelement <16 x float> undef, float undef, i32 8
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f32_15 = insertelement <16 x float> undef, float undef, i32 15
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f32_8 = insertelement <16 x float> undef, float undef, i32 8
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16f32_15 = insertelement <16 x float> undef, float undef, i32 15
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SLM-LABEL: 'insert_float'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32_a = insertelement <2 x float> undef, float undef, i32 %arg
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2f32_0 = insertelement <2 x float> undef, float undef, i32 0
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32_1 = insertelement <2 x float> undef, float undef, i32 1
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32_a = insertelement <4 x float> undef, float undef, i32 %arg
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4f32_0 = insertelement <4 x float> undef, float undef, i32 0
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32_3 = insertelement <4 x float> undef, float undef, i32 3
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_a = insertelement <8 x float> undef, float undef, i32 %arg
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8f32_0 = insertelement <8 x float> undef, float undef, i32 0
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_3 = insertelement <8 x float> undef, float undef, i32 3
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8f32_4 = insertelement <8 x float> undef, float undef, i32 4
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_7 = insertelement <8 x float> undef, float undef, i32 7
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f32_a = insertelement <16 x float> undef, float undef, i32 %arg
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16f32_0 = insertelement <16 x float> undef, float undef, i32 0
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f32_3 = insertelement <16 x float> undef, float undef, i32 3
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16f32_8 = insertelement <16 x float> undef, float undef, i32 8
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f32_15 = insertelement <16 x float> undef, float undef, i32 15
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; GLM-LABEL: 'insert_float'
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32_a = insertelement <2 x float> undef, float undef, i32 %arg
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2f32_0 = insertelement <2 x float> undef, float undef, i32 0
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32_1 = insertelement <2 x float> undef, float undef, i32 1
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32_a = insertelement <4 x float> undef, float undef, i32 %arg
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4f32_0 = insertelement <4 x float> undef, float undef, i32 0
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32_3 = insertelement <4 x float> undef, float undef, i32 3
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_a = insertelement <8 x float> undef, float undef, i32 %arg
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8f32_0 = insertelement <8 x float> undef, float undef, i32 0
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_3 = insertelement <8 x float> undef, float undef, i32 3
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8f32_4 = insertelement <8 x float> undef, float undef, i32 4
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_7 = insertelement <8 x float> undef, float undef, i32 7
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f32_a = insertelement <16 x float> undef, float undef, i32 %arg
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16f32_0 = insertelement <16 x float> undef, float undef, i32 0
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f32_3 = insertelement <16 x float> undef, float undef, i32 3
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16f32_8 = insertelement <16 x float> undef, float undef, i32 8
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f32_15 = insertelement <16 x float> undef, float undef, i32 15
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %v2f32_a = insertelement <2 x float> undef, float undef, i32 %arg
   %v2f32_0 = insertelement <2 x float> undef, float undef, i32 0
@@ -155,19 +250,117 @@ define i32 @insert_float(i32 %arg) {
 }
 
 define i32 @insert_i64(i32 %arg) {
-; CHECK-LABEL: 'insert_i64'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_a = insertelement <2 x i64> undef, i64 undef, i32 %arg
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_0 = insertelement <2 x i64> undef, i64 undef, i32 0
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_1 = insertelement <2 x i64> undef, i64 undef, i32 1
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_a = insertelement <4 x i64> undef, i64 undef, i32 %arg
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_0 = insertelement <4 x i64> undef, i64 undef, i32 0
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_3 = insertelement <4 x i64> undef, i64 undef, i32 3
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_a = insertelement <8 x i64> undef, i64 undef, i32 %arg
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_0 = insertelement <8 x i64> undef, i64 undef, i32 0
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_3 = insertelement <8 x i64> undef, i64 undef, i32 3
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_4 = insertelement <8 x i64> undef, i64 undef, i32 4
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_7 = insertelement <8 x i64> undef, i64 undef, i32 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE2-LABEL: 'insert_i64'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_a = insertelement <2 x i64> undef, i64 undef, i32 %arg
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i64_0 = insertelement <2 x i64> undef, i64 undef, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i64_1 = insertelement <2 x i64> undef, i64 undef, i32 1
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_a = insertelement <4 x i64> undef, i64 undef, i32 %arg
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i64_0 = insertelement <4 x i64> undef, i64 undef, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i64_3 = insertelement <4 x i64> undef, i64 undef, i32 3
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_a = insertelement <8 x i64> undef, i64 undef, i32 %arg
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i64_0 = insertelement <8 x i64> undef, i64 undef, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i64_3 = insertelement <8 x i64> undef, i64 undef, i32 3
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i64_4 = insertelement <8 x i64> undef, i64 undef, i32 4
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i64_7 = insertelement <8 x i64> undef, i64 undef, i32 7
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE3-LABEL: 'insert_i64'
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_a = insertelement <2 x i64> undef, i64 undef, i32 %arg
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i64_0 = insertelement <2 x i64> undef, i64 undef, i32 0
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i64_1 = insertelement <2 x i64> undef, i64 undef, i32 1
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_a = insertelement <4 x i64> undef, i64 undef, i32 %arg
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i64_0 = insertelement <4 x i64> undef, i64 undef, i32 0
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i64_3 = insertelement <4 x i64> undef, i64 undef, i32 3
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_a = insertelement <8 x i64> undef, i64 undef, i32 %arg
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i64_0 = insertelement <8 x i64> undef, i64 undef, i32 0
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i64_3 = insertelement <8 x i64> undef, i64 undef, i32 3
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i64_4 = insertelement <8 x i64> undef, i64 undef, i32 4
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i64_7 = insertelement <8 x i64> undef, i64 undef, i32 7
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'insert_i64'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_a = insertelement <2 x i64> undef, i64 undef, i32 %arg
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i64_0 = insertelement <2 x i64> undef, i64 undef, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i64_1 = insertelement <2 x i64> undef, i64 undef, i32 1
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_a = insertelement <4 x i64> undef, i64 undef, i32 %arg
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i64_0 = insertelement <4 x i64> undef, i64 undef, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i64_3 = insertelement <4 x i64> undef, i64 undef, i32 3
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_a = insertelement <8 x i64> undef, i64 undef, i32 %arg
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i64_0 = insertelement <8 x i64> undef, i64 undef, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i64_3 = insertelement <8 x i64> undef, i64 undef, i32 3
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i64_4 = insertelement <8 x i64> undef, i64 undef, i32 4
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i64_7 = insertelement <8 x i64> undef, i64 undef, i32 7
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE4-LABEL: 'insert_i64'
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_a = insertelement <2 x i64> undef, i64 undef, i32 %arg
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_0 = insertelement <2 x i64> undef, i64 undef, i32 0
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_1 = insertelement <2 x i64> undef, i64 undef, i32 1
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_a = insertelement <4 x i64> undef, i64 undef, i32 %arg
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_0 = insertelement <4 x i64> undef, i64 undef, i32 0
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_3 = insertelement <4 x i64> undef, i64 undef, i32 3
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_a = insertelement <8 x i64> undef, i64 undef, i32 %arg
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_0 = insertelement <8 x i64> undef, i64 undef, i32 0
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_3 = insertelement <8 x i64> undef, i64 undef, i32 3
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_4 = insertelement <8 x i64> undef, i64 undef, i32 4
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_7 = insertelement <8 x i64> undef, i64 undef, i32 7
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX-LABEL: 'insert_i64'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_a = insertelement <2 x i64> undef, i64 undef, i32 %arg
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_0 = insertelement <2 x i64> undef, i64 undef, i32 0
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_1 = insertelement <2 x i64> undef, i64 undef, i32 1
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_a = insertelement <4 x i64> undef, i64 undef, i32 %arg
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_0 = insertelement <4 x i64> undef, i64 undef, i32 0
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i64_3 = insertelement <4 x i64> undef, i64 undef, i32 3
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_a = insertelement <8 x i64> undef, i64 undef, i32 %arg
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_0 = insertelement <8 x i64> undef, i64 undef, i32 0
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i64_3 = insertelement <8 x i64> undef, i64 undef, i32 3
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_4 = insertelement <8 x i64> undef, i64 undef, i32 4
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i64_7 = insertelement <8 x i64> undef, i64 undef, i32 7
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'insert_i64'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_a = insertelement <2 x i64> undef, i64 undef, i32 %arg
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_0 = insertelement <2 x i64> undef, i64 undef, i32 0
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_1 = insertelement <2 x i64> undef, i64 undef, i32 1
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_a = insertelement <4 x i64> undef, i64 undef, i32 %arg
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_0 = insertelement <4 x i64> undef, i64 undef, i32 0
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i64_3 = insertelement <4 x i64> undef, i64 undef, i32 3
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_a = insertelement <8 x i64> undef, i64 undef, i32 %arg
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_0 = insertelement <8 x i64> undef, i64 undef, i32 0
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i64_3 = insertelement <8 x i64> undef, i64 undef, i32 3
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i64_4 = insertelement <8 x i64> undef, i64 undef, i32 4
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i64_7 = insertelement <8 x i64> undef, i64 undef, i32 7
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SLM-LABEL: 'insert_i64'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_a = insertelement <2 x i64> undef, i64 undef, i32 %arg
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_0 = insertelement <2 x i64> undef, i64 undef, i32 0
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_1 = insertelement <2 x i64> undef, i64 undef, i32 1
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_a = insertelement <4 x i64> undef, i64 undef, i32 %arg
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_0 = insertelement <4 x i64> undef, i64 undef, i32 0
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_3 = insertelement <4 x i64> undef, i64 undef, i32 3
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_a = insertelement <8 x i64> undef, i64 undef, i32 %arg
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_0 = insertelement <8 x i64> undef, i64 undef, i32 0
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_3 = insertelement <8 x i64> undef, i64 undef, i32 3
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_4 = insertelement <8 x i64> undef, i64 undef, i32 4
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_7 = insertelement <8 x i64> undef, i64 undef, i32 7
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; GLM-LABEL: 'insert_i64'
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_a = insertelement <2 x i64> undef, i64 undef, i32 %arg
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_0 = insertelement <2 x i64> undef, i64 undef, i32 0
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_1 = insertelement <2 x i64> undef, i64 undef, i32 1
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_a = insertelement <4 x i64> undef, i64 undef, i32 %arg
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_0 = insertelement <4 x i64> undef, i64 undef, i32 0
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_3 = insertelement <4 x i64> undef, i64 undef, i32 3
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_a = insertelement <8 x i64> undef, i64 undef, i32 %arg
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_0 = insertelement <8 x i64> undef, i64 undef, i32 0
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_3 = insertelement <8 x i64> undef, i64 undef, i32 3
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_4 = insertelement <8 x i64> undef, i64 undef, i32 4
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_7 = insertelement <8 x i64> undef, i64 undef, i32 7
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %v2i64_a = insertelement <2 x i64> undef, i64 undef, i32 %arg
   %v2i64_0 = insertelement <2 x i64> undef, i64 undef, i32 0
@@ -187,24 +380,157 @@ define i32 @insert_i64(i32 %arg) {
 }
 
 define i32 @insert_i32(i32 %arg) {
-; CHECK-LABEL: 'insert_i32'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_a = insertelement <2 x i32> undef, i32 undef, i32 %arg
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_0 = insertelement <2 x i32> undef, i32 undef, i32 0
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_1 = insertelement <2 x i32> undef, i32 undef, i32 1
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_a = insertelement <4 x i32> undef, i32 undef, i32 %arg
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_0 = insertelement <4 x i32> undef, i32 undef, i32 0
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_3 = insertelement <4 x i32> undef, i32 undef, i32 3
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_a = insertelement <8 x i32> undef, i32 undef, i32 %arg
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_0 = insertelement <8 x i32> undef, i32 undef, i32 0
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_3 = insertelement <8 x i32> undef, i32 undef, i32 3
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_4 = insertelement <8 x i32> undef, i32 undef, i32 4
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_7 = insertelement <8 x i32> undef, i32 undef, i32 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_a = insertelement <16 x i32> undef, i32 undef, i32 %arg
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_0 = insertelement <16 x i32> undef, i32 undef, i32 0
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_3 = insertelement <16 x i32> undef, i32 undef, i32 3
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_8 = insertelement <16 x i32> undef, i32 undef, i32 8
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_15 = insertelement <16 x i32> undef, i32 undef, i32 15
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE2-LABEL: 'insert_i32'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_a = insertelement <2 x i32> undef, i32 undef, i32 %arg
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i32_0 = insertelement <2 x i32> undef, i32 undef, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i32_1 = insertelement <2 x i32> undef, i32 undef, i32 1
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_a = insertelement <4 x i32> undef, i32 undef, i32 %arg
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i32_0 = insertelement <4 x i32> undef, i32 undef, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i32_3 = insertelement <4 x i32> undef, i32 undef, i32 3
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_a = insertelement <8 x i32> undef, i32 undef, i32 %arg
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i32_0 = insertelement <8 x i32> undef, i32 undef, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i32_3 = insertelement <8 x i32> undef, i32 undef, i32 3
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i32_4 = insertelement <8 x i32> undef, i32 undef, i32 4
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i32_7 = insertelement <8 x i32> undef, i32 undef, i32 7
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_a = insertelement <16 x i32> undef, i32 undef, i32 %arg
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i32_0 = insertelement <16 x i32> undef, i32 undef, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i32_3 = insertelement <16 x i32> undef, i32 undef, i32 3
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i32_8 = insertelement <16 x i32> undef, i32 undef, i32 8
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i32_15 = insertelement <16 x i32> undef, i32 undef, i32 15
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE3-LABEL: 'insert_i32'
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_a = insertelement <2 x i32> undef, i32 undef, i32 %arg
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i32_0 = insertelement <2 x i32> undef, i32 undef, i32 0
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i32_1 = insertelement <2 x i32> undef, i32 undef, i32 1
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_a = insertelement <4 x i32> undef, i32 undef, i32 %arg
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i32_0 = insertelement <4 x i32> undef, i32 undef, i32 0
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i32_3 = insertelement <4 x i32> undef, i32 undef, i32 3
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_a = insertelement <8 x i32> undef, i32 undef, i32 %arg
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i32_0 = insertelement <8 x i32> undef, i32 undef, i32 0
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i32_3 = insertelement <8 x i32> undef, i32 undef, i32 3
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i32_4 = insertelement <8 x i32> undef, i32 undef, i32 4
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i32_7 = insertelement <8 x i32> undef, i32 undef, i32 7
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_a = insertelement <16 x i32> undef, i32 undef, i32 %arg
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i32_0 = insertelement <16 x i32> undef, i32 undef, i32 0
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i32_3 = insertelement <16 x i32> undef, i32 undef, i32 3
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i32_8 = insertelement <16 x i32> undef, i32 undef, i32 8
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i32_15 = insertelement <16 x i32> undef, i32 undef, i32 15
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'insert_i32'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_a = insertelement <2 x i32> undef, i32 undef, i32 %arg
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i32_0 = insertelement <2 x i32> undef, i32 undef, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i32_1 = insertelement <2 x i32> undef, i32 undef, i32 1
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_a = insertelement <4 x i32> undef, i32 undef, i32 %arg
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i32_0 = insertelement <4 x i32> undef, i32 undef, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i32_3 = insertelement <4 x i32> undef, i32 undef, i32 3
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_a = insertelement <8 x i32> undef, i32 undef, i32 %arg
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i32_0 = insertelement <8 x i32> undef, i32 undef, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i32_3 = insertelement <8 x i32> undef, i32 undef, i32 3
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i32_4 = insertelement <8 x i32> undef, i32 undef, i32 4
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i32_7 = insertelement <8 x i32> undef, i32 undef, i32 7
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_a = insertelement <16 x i32> undef, i32 undef, i32 %arg
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i32_0 = insertelement <16 x i32> undef, i32 undef, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i32_3 = insertelement <16 x i32> undef, i32 undef, i32 3
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i32_8 = insertelement <16 x i32> undef, i32 undef, i32 8
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i32_15 = insertelement <16 x i32> undef, i32 undef, i32 15
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE4-LABEL: 'insert_i32'
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_a = insertelement <2 x i32> undef, i32 undef, i32 %arg
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_0 = insertelement <2 x i32> undef, i32 undef, i32 0
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_1 = insertelement <2 x i32> undef, i32 undef, i32 1
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_a = insertelement <4 x i32> undef, i32 undef, i32 %arg
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_0 = insertelement <4 x i32> undef, i32 undef, i32 0
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_3 = insertelement <4 x i32> undef, i32 undef, i32 3
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_a = insertelement <8 x i32> undef, i32 undef, i32 %arg
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_0 = insertelement <8 x i32> undef, i32 undef, i32 0
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_3 = insertelement <8 x i32> undef, i32 undef, i32 3
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_4 = insertelement <8 x i32> undef, i32 undef, i32 4
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_7 = insertelement <8 x i32> undef, i32 undef, i32 7
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_a = insertelement <16 x i32> undef, i32 undef, i32 %arg
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_0 = insertelement <16 x i32> undef, i32 undef, i32 0
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_3 = insertelement <16 x i32> undef, i32 undef, i32 3
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_8 = insertelement <16 x i32> undef, i32 undef, i32 8
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_15 = insertelement <16 x i32> undef, i32 undef, i32 15
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX-LABEL: 'insert_i32'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_a = insertelement <2 x i32> undef, i32 undef, i32 %arg
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_0 = insertelement <2 x i32> undef, i32 undef, i32 0
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_1 = insertelement <2 x i32> undef, i32 undef, i32 1
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_a = insertelement <4 x i32> undef, i32 undef, i32 %arg
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_0 = insertelement <4 x i32> undef, i32 undef, i32 0
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_3 = insertelement <4 x i32> undef, i32 undef, i32 3
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_a = insertelement <8 x i32> undef, i32 undef, i32 %arg
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_0 = insertelement <8 x i32> undef, i32 undef, i32 0
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_3 = insertelement <8 x i32> undef, i32 undef, i32 3
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i32_4 = insertelement <8 x i32> undef, i32 undef, i32 4
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i32_7 = insertelement <8 x i32> undef, i32 undef, i32 7
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_a = insertelement <16 x i32> undef, i32 undef, i32 %arg
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_0 = insertelement <16 x i32> undef, i32 undef, i32 0
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_3 = insertelement <16 x i32> undef, i32 undef, i32 3
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_8 = insertelement <16 x i32> undef, i32 undef, i32 8
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i32_15 = insertelement <16 x i32> undef, i32 undef, i32 15
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'insert_i32'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_a = insertelement <2 x i32> undef, i32 undef, i32 %arg
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_0 = insertelement <2 x i32> undef, i32 undef, i32 0
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_1 = insertelement <2 x i32> undef, i32 undef, i32 1
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_a = insertelement <4 x i32> undef, i32 undef, i32 %arg
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_0 = insertelement <4 x i32> undef, i32 undef, i32 0
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_3 = insertelement <4 x i32> undef, i32 undef, i32 3
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_a = insertelement <8 x i32> undef, i32 undef, i32 %arg
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_0 = insertelement <8 x i32> undef, i32 undef, i32 0
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_3 = insertelement <8 x i32> undef, i32 undef, i32 3
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i32_4 = insertelement <8 x i32> undef, i32 undef, i32 4
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i32_7 = insertelement <8 x i32> undef, i32 undef, i32 7
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_a = insertelement <16 x i32> undef, i32 undef, i32 %arg
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_0 = insertelement <16 x i32> undef, i32 undef, i32 0
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_3 = insertelement <16 x i32> undef, i32 undef, i32 3
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i32_8 = insertelement <16 x i32> undef, i32 undef, i32 8
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i32_15 = insertelement <16 x i32> undef, i32 undef, i32 15
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SLM-LABEL: 'insert_i32'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_a = insertelement <2 x i32> undef, i32 undef, i32 %arg
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_0 = insertelement <2 x i32> undef, i32 undef, i32 0
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_1 = insertelement <2 x i32> undef, i32 undef, i32 1
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_a = insertelement <4 x i32> undef, i32 undef, i32 %arg
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_0 = insertelement <4 x i32> undef, i32 undef, i32 0
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_3 = insertelement <4 x i32> undef, i32 undef, i32 3
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_a = insertelement <8 x i32> undef, i32 undef, i32 %arg
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_0 = insertelement <8 x i32> undef, i32 undef, i32 0
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_3 = insertelement <8 x i32> undef, i32 undef, i32 3
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_4 = insertelement <8 x i32> undef, i32 undef, i32 4
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_7 = insertelement <8 x i32> undef, i32 undef, i32 7
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_a = insertelement <16 x i32> undef, i32 undef, i32 %arg
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_0 = insertelement <16 x i32> undef, i32 undef, i32 0
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_3 = insertelement <16 x i32> undef, i32 undef, i32 3
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_8 = insertelement <16 x i32> undef, i32 undef, i32 8
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_15 = insertelement <16 x i32> undef, i32 undef, i32 15
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; GLM-LABEL: 'insert_i32'
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_a = insertelement <2 x i32> undef, i32 undef, i32 %arg
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_0 = insertelement <2 x i32> undef, i32 undef, i32 0
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_1 = insertelement <2 x i32> undef, i32 undef, i32 1
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_a = insertelement <4 x i32> undef, i32 undef, i32 %arg
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_0 = insertelement <4 x i32> undef, i32 undef, i32 0
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_3 = insertelement <4 x i32> undef, i32 undef, i32 3
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_a = insertelement <8 x i32> undef, i32 undef, i32 %arg
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_0 = insertelement <8 x i32> undef, i32 undef, i32 0
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_3 = insertelement <8 x i32> undef, i32 undef, i32 3
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_4 = insertelement <8 x i32> undef, i32 undef, i32 4
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_7 = insertelement <8 x i32> undef, i32 undef, i32 7
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_a = insertelement <16 x i32> undef, i32 undef, i32 %arg
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_0 = insertelement <16 x i32> undef, i32 undef, i32 0
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_3 = insertelement <16 x i32> undef, i32 undef, i32 3
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_8 = insertelement <16 x i32> undef, i32 undef, i32 8
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_15 = insertelement <16 x i32> undef, i32 undef, i32 15
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %v2i32_a = insertelement <2 x i32> undef, i32 undef, i32 %arg
   %v2i32_0 = insertelement <2 x i32> undef, i32 undef, i32 0
@@ -230,24 +556,81 @@ define i32 @insert_i32(i32 %arg) {
 }
 
 define i32 @insert_i16(i32 %arg) {
-; CHECK-LABEL: 'insert_i16'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_a = insertelement <8 x i16> undef, i16 undef, i32 %arg
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_0 = insertelement <8 x i16> undef, i16 undef, i32 0
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_7 = insertelement <8 x i16> undef, i16 undef, i32 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_a = insertelement <16 x i16> undef, i16 undef, i32 %arg
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_0 = insertelement <16 x i16> undef, i16 undef, i32 0
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_7 = insertelement <16 x i16> undef, i16 undef, i32 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_8 = insertelement <16 x i16> undef, i16 undef, i32 8
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_15 = insertelement <16 x i16> undef, i16 undef, i32 15
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_a = insertelement <32 x i16> undef, i16 undef, i32 %arg
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_0 = insertelement <32 x i16> undef, i16 undef, i32 0
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_7 = insertelement <32 x i16> undef, i16 undef, i32 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_8 = insertelement <32 x i16> undef, i16 undef, i32 8
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_15 = insertelement <32 x i16> undef, i16 undef, i32 15
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_16 = insertelement <32 x i16> undef, i16 undef, i32 16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_24 = insertelement <32 x i16> undef, i16 undef, i32 24
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_31 = insertelement <32 x i16> undef, i16 undef, i32 31
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE-LABEL: 'insert_i16'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_a = insertelement <8 x i16> undef, i16 undef, i32 %arg
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_0 = insertelement <8 x i16> undef, i16 undef, i32 0
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_7 = insertelement <8 x i16> undef, i16 undef, i32 7
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_a = insertelement <16 x i16> undef, i16 undef, i32 %arg
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_0 = insertelement <16 x i16> undef, i16 undef, i32 0
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_7 = insertelement <16 x i16> undef, i16 undef, i32 7
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_8 = insertelement <16 x i16> undef, i16 undef, i32 8
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_15 = insertelement <16 x i16> undef, i16 undef, i32 15
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_a = insertelement <32 x i16> undef, i16 undef, i32 %arg
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_0 = insertelement <32 x i16> undef, i16 undef, i32 0
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_7 = insertelement <32 x i16> undef, i16 undef, i32 7
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_8 = insertelement <32 x i16> undef, i16 undef, i32 8
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_15 = insertelement <32 x i16> undef, i16 undef, i32 15
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_16 = insertelement <32 x i16> undef, i16 undef, i32 16
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_24 = insertelement <32 x i16> undef, i16 undef, i32 24
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_31 = insertelement <32 x i16> undef, i16 undef, i32 31
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX-LABEL: 'insert_i16'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_a = insertelement <8 x i16> undef, i16 undef, i32 %arg
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_0 = insertelement <8 x i16> undef, i16 undef, i32 0
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_7 = insertelement <8 x i16> undef, i16 undef, i32 7
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_a = insertelement <16 x i16> undef, i16 undef, i32 %arg
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_0 = insertelement <16 x i16> undef, i16 undef, i32 0
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_7 = insertelement <16 x i16> undef, i16 undef, i32 7
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i16_8 = insertelement <16 x i16> undef, i16 undef, i32 8
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i16_15 = insertelement <16 x i16> undef, i16 undef, i32 15
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_a = insertelement <32 x i16> undef, i16 undef, i32 %arg
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_0 = insertelement <32 x i16> undef, i16 undef, i32 0
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_7 = insertelement <32 x i16> undef, i16 undef, i32 7
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v32i16_8 = insertelement <32 x i16> undef, i16 undef, i32 8
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v32i16_15 = insertelement <32 x i16> undef, i16 undef, i32 15
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_16 = insertelement <32 x i16> undef, i16 undef, i32 16
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v32i16_24 = insertelement <32 x i16> undef, i16 undef, i32 24
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v32i16_31 = insertelement <32 x i16> undef, i16 undef, i32 31
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512F-LABEL: 'insert_i16'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_a = insertelement <8 x i16> undef, i16 undef, i32 %arg
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_0 = insertelement <8 x i16> undef, i16 undef, i32 0
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_7 = insertelement <8 x i16> undef, i16 undef, i32 7
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_a = insertelement <16 x i16> undef, i16 undef, i32 %arg
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_0 = insertelement <16 x i16> undef, i16 undef, i32 0
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_7 = insertelement <16 x i16> undef, i16 undef, i32 7
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i16_8 = insertelement <16 x i16> undef, i16 undef, i32 8
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i16_15 = insertelement <16 x i16> undef, i16 undef, i32 15
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_a = insertelement <32 x i16> undef, i16 undef, i32 %arg
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_0 = insertelement <32 x i16> undef, i16 undef, i32 0
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_7 = insertelement <32 x i16> undef, i16 undef, i32 7
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v32i16_8 = insertelement <32 x i16> undef, i16 undef, i32 8
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v32i16_15 = insertelement <32 x i16> undef, i16 undef, i32 15
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_16 = insertelement <32 x i16> undef, i16 undef, i32 16
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v32i16_24 = insertelement <32 x i16> undef, i16 undef, i32 24
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v32i16_31 = insertelement <32 x i16> undef, i16 undef, i32 31
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512BW-LABEL: 'insert_i16'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_a = insertelement <8 x i16> undef, i16 undef, i32 %arg
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_0 = insertelement <8 x i16> undef, i16 undef, i32 0
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_7 = insertelement <8 x i16> undef, i16 undef, i32 7
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_a = insertelement <16 x i16> undef, i16 undef, i32 %arg
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_0 = insertelement <16 x i16> undef, i16 undef, i32 0
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_7 = insertelement <16 x i16> undef, i16 undef, i32 7
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i16_8 = insertelement <16 x i16> undef, i16 undef, i32 8
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i16_15 = insertelement <16 x i16> undef, i16 undef, i32 15
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_a = insertelement <32 x i16> undef, i16 undef, i32 %arg
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_0 = insertelement <32 x i16> undef, i16 undef, i32 0
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_7 = insertelement <32 x i16> undef, i16 undef, i32 7
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v32i16_8 = insertelement <32 x i16> undef, i16 undef, i32 8
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v32i16_15 = insertelement <32 x i16> undef, i16 undef, i32 15
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v32i16_16 = insertelement <32 x i16> undef, i16 undef, i32 16
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v32i16_24 = insertelement <32 x i16> undef, i16 undef, i32 24
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v32i16_31 = insertelement <32 x i16> undef, i16 undef, i32 31
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %v8i16_a = insertelement <8 x i16> undef, i16 undef, i32 %arg
   %v8i16_0 = insertelement <8 x i16> undef, i16 undef, i32 0
@@ -272,29 +655,221 @@ define i32 @insert_i16(i32 %arg) {
 }
 
 define i32 @insert_i8(i32 %arg) {
-; CHECK-LABEL: 'insert_i8'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_a = insertelement <16 x i8> undef, i8 undef, i32 %arg
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_0 = insertelement <16 x i8> undef, i8 undef, i32 0
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_8 = insertelement <16 x i8> undef, i8 undef, i32 8
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_15 = insertelement <16 x i8> undef, i8 undef, i32 15
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_a = insertelement <32 x i8> undef, i8 undef, i32 %arg
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_0 = insertelement <32 x i8> undef, i8 undef, i32 0
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_7 = insertelement <32 x i8> undef, i8 undef, i32 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_8 = insertelement <32 x i8> undef, i8 undef, i32 8
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_15 = insertelement <32 x i8> undef, i8 undef, i32 15
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_24 = insertelement <32 x i8> undef, i8 undef, i32 24
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_31 = insertelement <32 x i8> undef, i8 undef, i32 31
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_a = insertelement <64 x i8> undef, i8 undef, i32 %arg
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_0 = insertelement <64 x i8> undef, i8 undef, i32 0
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_7 = insertelement <64 x i8> undef, i8 undef, i32 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_8 = insertelement <64 x i8> undef, i8 undef, i32 8
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_15 = insertelement <64 x i8> undef, i8 undef, i32 15
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_24 = insertelement <64 x i8> undef, i8 undef, i32 24
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_31 = insertelement <64 x i8> undef, i8 undef, i32 31
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_32 = insertelement <64 x i8> undef, i8 undef, i32 32
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_48 = insertelement <64 x i8> undef, i8 undef, i32 48
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_63 = insertelement <64 x i8> undef, i8 undef, i32 63
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE2-LABEL: 'insert_i8'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_a = insertelement <16 x i8> undef, i8 undef, i32 %arg
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v16i8_0 = insertelement <16 x i8> undef, i8 undef, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v16i8_8 = insertelement <16 x i8> undef, i8 undef, i32 8
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v16i8_15 = insertelement <16 x i8> undef, i8 undef, i32 15
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_a = insertelement <32 x i8> undef, i8 undef, i32 %arg
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v32i8_0 = insertelement <32 x i8> undef, i8 undef, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v32i8_7 = insertelement <32 x i8> undef, i8 undef, i32 7
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v32i8_8 = insertelement <32 x i8> undef, i8 undef, i32 8
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v32i8_15 = insertelement <32 x i8> undef, i8 undef, i32 15
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v32i8_24 = insertelement <32 x i8> undef, i8 undef, i32 24
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v32i8_31 = insertelement <32 x i8> undef, i8 undef, i32 31
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_a = insertelement <64 x i8> undef, i8 undef, i32 %arg
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v64i8_0 = insertelement <64 x i8> undef, i8 undef, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v64i8_7 = insertelement <64 x i8> undef, i8 undef, i32 7
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v64i8_8 = insertelement <64 x i8> undef, i8 undef, i32 8
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v64i8_15 = insertelement <64 x i8> undef, i8 undef, i32 15
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v64i8_24 = insertelement <64 x i8> undef, i8 undef, i32 24
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v64i8_31 = insertelement <64 x i8> undef, i8 undef, i32 31
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v64i8_32 = insertelement <64 x i8> undef, i8 undef, i32 32
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v64i8_48 = insertelement <64 x i8> undef, i8 undef, i32 48
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v64i8_63 = insertelement <64 x i8> undef, i8 undef, i32 63
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE3-LABEL: 'insert_i8'
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_a = insertelement <16 x i8> undef, i8 undef, i32 %arg
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v16i8_0 = insertelement <16 x i8> undef, i8 undef, i32 0
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v16i8_8 = insertelement <16 x i8> undef, i8 undef, i32 8
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v16i8_15 = insertelement <16 x i8> undef, i8 undef, i32 15
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_a = insertelement <32 x i8> undef, i8 undef, i32 %arg
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v32i8_0 = insertelement <32 x i8> undef, i8 undef, i32 0
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v32i8_7 = insertelement <32 x i8> undef, i8 undef, i32 7
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v32i8_8 = insertelement <32 x i8> undef, i8 undef, i32 8
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v32i8_15 = insertelement <32 x i8> undef, i8 undef, i32 15
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v32i8_24 = insertelement <32 x i8> undef, i8 undef, i32 24
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v32i8_31 = insertelement <32 x i8> undef, i8 undef, i32 31
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_a = insertelement <64 x i8> undef, i8 undef, i32 %arg
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v64i8_0 = insertelement <64 x i8> undef, i8 undef, i32 0
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v64i8_7 = insertelement <64 x i8> undef, i8 undef, i32 7
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v64i8_8 = insertelement <64 x i8> undef, i8 undef, i32 8
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v64i8_15 = insertelement <64 x i8> undef, i8 undef, i32 15
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v64i8_24 = insertelement <64 x i8> undef, i8 undef, i32 24
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v64i8_31 = insertelement <64 x i8> undef, i8 undef, i32 31
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v64i8_32 = insertelement <64 x i8> undef, i8 undef, i32 32
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v64i8_48 = insertelement <64 x i8> undef, i8 undef, i32 48
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v64i8_63 = insertelement <64 x i8> undef, i8 undef, i32 63
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'insert_i8'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_a = insertelement <16 x i8> undef, i8 undef, i32 %arg
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i8_0 = insertelement <16 x i8> undef, i8 undef, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i8_8 = insertelement <16 x i8> undef, i8 undef, i32 8
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i8_15 = insertelement <16 x i8> undef, i8 undef, i32 15
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_a = insertelement <32 x i8> undef, i8 undef, i32 %arg
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32i8_0 = insertelement <32 x i8> undef, i8 undef, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32i8_7 = insertelement <32 x i8> undef, i8 undef, i32 7
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32i8_8 = insertelement <32 x i8> undef, i8 undef, i32 8
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32i8_15 = insertelement <32 x i8> undef, i8 undef, i32 15
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32i8_24 = insertelement <32 x i8> undef, i8 undef, i32 24
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32i8_31 = insertelement <32 x i8> undef, i8 undef, i32 31
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_a = insertelement <64 x i8> undef, i8 undef, i32 %arg
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v64i8_0 = insertelement <64 x i8> undef, i8 undef, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v64i8_7 = insertelement <64 x i8> undef, i8 undef, i32 7
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v64i8_8 = insertelement <64 x i8> undef, i8 undef, i32 8
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v64i8_15 = insertelement <64 x i8> undef, i8 undef, i32 15
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v64i8_24 = insertelement <64 x i8> undef, i8 undef, i32 24
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v64i8_31 = insertelement <64 x i8> undef, i8 undef, i32 31
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v64i8_32 = insertelement <64 x i8> undef, i8 undef, i32 32
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v64i8_48 = insertelement <64 x i8> undef, i8 undef, i32 48
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v64i8_63 = insertelement <64 x i8> undef, i8 undef, i32 63
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE4-LABEL: 'insert_i8'
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_a = insertelement <16 x i8> undef, i8 undef, i32 %arg
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_0 = insertelement <16 x i8> undef, i8 undef, i32 0
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_8 = insertelement <16 x i8> undef, i8 undef, i32 8
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_15 = insertelement <16 x i8> undef, i8 undef, i32 15
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_a = insertelement <32 x i8> undef, i8 undef, i32 %arg
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_0 = insertelement <32 x i8> undef, i8 undef, i32 0
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_7 = insertelement <32 x i8> undef, i8 undef, i32 7
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_8 = insertelement <32 x i8> undef, i8 undef, i32 8
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_15 = insertelement <32 x i8> undef, i8 undef, i32 15
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_24 = insertelement <32 x i8> undef, i8 undef, i32 24
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_31 = insertelement <32 x i8> undef, i8 undef, i32 31
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_a = insertelement <64 x i8> undef, i8 undef, i32 %arg
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_0 = insertelement <64 x i8> undef, i8 undef, i32 0
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_7 = insertelement <64 x i8> undef, i8 undef, i32 7
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_8 = insertelement <64 x i8> undef, i8 undef, i32 8
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_15 = insertelement <64 x i8> undef, i8 undef, i32 15
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_24 = insertelement <64 x i8> undef, i8 undef, i32 24
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_31 = insertelement <64 x i8> undef, i8 undef, i32 31
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_32 = insertelement <64 x i8> undef, i8 undef, i32 32
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_48 = insertelement <64 x i8> undef, i8 undef, i32 48
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_63 = insertelement <64 x i8> undef, i8 undef, i32 63
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX-LABEL: 'insert_i8'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_a = insertelement <16 x i8> undef, i8 undef, i32 %arg
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_0 = insertelement <16 x i8> undef, i8 undef, i32 0
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_8 = insertelement <16 x i8> undef, i8 undef, i32 8
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_15 = insertelement <16 x i8> undef, i8 undef, i32 15
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_a = insertelement <32 x i8> undef, i8 undef, i32 %arg
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_0 = insertelement <32 x i8> undef, i8 undef, i32 0
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_7 = insertelement <32 x i8> undef, i8 undef, i32 7
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_8 = insertelement <32 x i8> undef, i8 undef, i32 8
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_15 = insertelement <32 x i8> undef, i8 undef, i32 15
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v32i8_24 = insertelement <32 x i8> undef, i8 undef, i32 24
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v32i8_31 = insertelement <32 x i8> undef, i8 undef, i32 31
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_a = insertelement <64 x i8> undef, i8 undef, i32 %arg
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_0 = insertelement <64 x i8> undef, i8 undef, i32 0
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_7 = insertelement <64 x i8> undef, i8 undef, i32 7
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_8 = insertelement <64 x i8> undef, i8 undef, i32 8
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_15 = insertelement <64 x i8> undef, i8 undef, i32 15
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v64i8_24 = insertelement <64 x i8> undef, i8 undef, i32 24
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v64i8_31 = insertelement <64 x i8> undef, i8 undef, i32 31
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_32 = insertelement <64 x i8> undef, i8 undef, i32 32
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v64i8_48 = insertelement <64 x i8> undef, i8 undef, i32 48
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v64i8_63 = insertelement <64 x i8> undef, i8 undef, i32 63
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512F-LABEL: 'insert_i8'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_a = insertelement <16 x i8> undef, i8 undef, i32 %arg
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_0 = insertelement <16 x i8> undef, i8 undef, i32 0
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_8 = insertelement <16 x i8> undef, i8 undef, i32 8
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_15 = insertelement <16 x i8> undef, i8 undef, i32 15
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_a = insertelement <32 x i8> undef, i8 undef, i32 %arg
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_0 = insertelement <32 x i8> undef, i8 undef, i32 0
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_7 = insertelement <32 x i8> undef, i8 undef, i32 7
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_8 = insertelement <32 x i8> undef, i8 undef, i32 8
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_15 = insertelement <32 x i8> undef, i8 undef, i32 15
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v32i8_24 = insertelement <32 x i8> undef, i8 undef, i32 24
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v32i8_31 = insertelement <32 x i8> undef, i8 undef, i32 31
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_a = insertelement <64 x i8> undef, i8 undef, i32 %arg
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_0 = insertelement <64 x i8> undef, i8 undef, i32 0
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_7 = insertelement <64 x i8> undef, i8 undef, i32 7
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_8 = insertelement <64 x i8> undef, i8 undef, i32 8
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_15 = insertelement <64 x i8> undef, i8 undef, i32 15
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v64i8_24 = insertelement <64 x i8> undef, i8 undef, i32 24
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v64i8_31 = insertelement <64 x i8> undef, i8 undef, i32 31
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_32 = insertelement <64 x i8> undef, i8 undef, i32 32
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v64i8_48 = insertelement <64 x i8> undef, i8 undef, i32 48
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v64i8_63 = insertelement <64 x i8> undef, i8 undef, i32 63
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512BW-LABEL: 'insert_i8'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_a = insertelement <16 x i8> undef, i8 undef, i32 %arg
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_0 = insertelement <16 x i8> undef, i8 undef, i32 0
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_8 = insertelement <16 x i8> undef, i8 undef, i32 8
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_15 = insertelement <16 x i8> undef, i8 undef, i32 15
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_a = insertelement <32 x i8> undef, i8 undef, i32 %arg
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_0 = insertelement <32 x i8> undef, i8 undef, i32 0
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_7 = insertelement <32 x i8> undef, i8 undef, i32 7
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_8 = insertelement <32 x i8> undef, i8 undef, i32 8
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_15 = insertelement <32 x i8> undef, i8 undef, i32 15
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v32i8_24 = insertelement <32 x i8> undef, i8 undef, i32 24
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v32i8_31 = insertelement <32 x i8> undef, i8 undef, i32 31
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_a = insertelement <64 x i8> undef, i8 undef, i32 %arg
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_0 = insertelement <64 x i8> undef, i8 undef, i32 0
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_7 = insertelement <64 x i8> undef, i8 undef, i32 7
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_8 = insertelement <64 x i8> undef, i8 undef, i32 8
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_15 = insertelement <64 x i8> undef, i8 undef, i32 15
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v64i8_24 = insertelement <64 x i8> undef, i8 undef, i32 24
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v64i8_31 = insertelement <64 x i8> undef, i8 undef, i32 31
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v64i8_32 = insertelement <64 x i8> undef, i8 undef, i32 32
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v64i8_48 = insertelement <64 x i8> undef, i8 undef, i32 48
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v64i8_63 = insertelement <64 x i8> undef, i8 undef, i32 63
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SLM-LABEL: 'insert_i8'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_a = insertelement <16 x i8> undef, i8 undef, i32 %arg
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_0 = insertelement <16 x i8> undef, i8 undef, i32 0
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_8 = insertelement <16 x i8> undef, i8 undef, i32 8
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_15 = insertelement <16 x i8> undef, i8 undef, i32 15
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_a = insertelement <32 x i8> undef, i8 undef, i32 %arg
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_0 = insertelement <32 x i8> undef, i8 undef, i32 0
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_7 = insertelement <32 x i8> undef, i8 undef, i32 7
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_8 = insertelement <32 x i8> undef, i8 undef, i32 8
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_15 = insertelement <32 x i8> undef, i8 undef, i32 15
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_24 = insertelement <32 x i8> undef, i8 undef, i32 24
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_31 = insertelement <32 x i8> undef, i8 undef, i32 31
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_a = insertelement <64 x i8> undef, i8 undef, i32 %arg
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_0 = insertelement <64 x i8> undef, i8 undef, i32 0
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_7 = insertelement <64 x i8> undef, i8 undef, i32 7
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_8 = insertelement <64 x i8> undef, i8 undef, i32 8
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_15 = insertelement <64 x i8> undef, i8 undef, i32 15
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_24 = insertelement <64 x i8> undef, i8 undef, i32 24
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_31 = insertelement <64 x i8> undef, i8 undef, i32 31
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_32 = insertelement <64 x i8> undef, i8 undef, i32 32
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_48 = insertelement <64 x i8> undef, i8 undef, i32 48
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_63 = insertelement <64 x i8> undef, i8 undef, i32 63
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; GLM-LABEL: 'insert_i8'
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_a = insertelement <16 x i8> undef, i8 undef, i32 %arg
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_0 = insertelement <16 x i8> undef, i8 undef, i32 0
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_8 = insertelement <16 x i8> undef, i8 undef, i32 8
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_15 = insertelement <16 x i8> undef, i8 undef, i32 15
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_a = insertelement <32 x i8> undef, i8 undef, i32 %arg
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_0 = insertelement <32 x i8> undef, i8 undef, i32 0
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_7 = insertelement <32 x i8> undef, i8 undef, i32 7
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_8 = insertelement <32 x i8> undef, i8 undef, i32 8
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_15 = insertelement <32 x i8> undef, i8 undef, i32 15
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_24 = insertelement <32 x i8> undef, i8 undef, i32 24
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_31 = insertelement <32 x i8> undef, i8 undef, i32 31
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_a = insertelement <64 x i8> undef, i8 undef, i32 %arg
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_0 = insertelement <64 x i8> undef, i8 undef, i32 0
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_7 = insertelement <64 x i8> undef, i8 undef, i32 7
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_8 = insertelement <64 x i8> undef, i8 undef, i32 8
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_15 = insertelement <64 x i8> undef, i8 undef, i32 15
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_24 = insertelement <64 x i8> undef, i8 undef, i32 24
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_31 = insertelement <64 x i8> undef, i8 undef, i32 31
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_32 = insertelement <64 x i8> undef, i8 undef, i32 32
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_48 = insertelement <64 x i8> undef, i8 undef, i32 48
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_63 = insertelement <64 x i8> undef, i8 undef, i32 63
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %v16i8_a  = insertelement <16 x i8> undef, i8 undef, i32 %arg
   %v16i8_0  = insertelement <16 x i8> undef, i8 undef, i32 0

diff  --git a/llvm/test/Analysis/CostModel/X86/vshift-ashr-cost.ll b/llvm/test/Analysis/CostModel/X86/vshift-ashr-cost.ll
index e9c1260387f9..8b8d0bfd4de3 100644
--- a/llvm/test/Analysis/CostModel/X86/vshift-ashr-cost.ll
+++ b/llvm/test/Analysis/CostModel/X86/vshift-ashr-cost.ll
@@ -455,11 +455,17 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) {
 ;
 
 define <2 x i64> @splatvar_shift_v2i64(<2 x i64> %a, i64 %b) {
-; SSE-LABEL: 'splatvar_shift_v2i64'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <2 x i64> undef, i64 %b, i32 0
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <2 x i64> %insert, <2 x i64> undef, <2 x i32> zeroinitializer
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shift = ashr <2 x i64> %a, %splat
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %shift
+; SSE2-LABEL: 'splatvar_shift_v2i64'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %insert = insertelement <2 x i64> undef, i64 %b, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <2 x i64> %insert, <2 x i64> undef, <2 x i32> zeroinitializer
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shift = ashr <2 x i64> %a, %splat
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %shift
+;
+; SSE42-LABEL: 'splatvar_shift_v2i64'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <2 x i64> undef, i64 %b, i32 0
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <2 x i64> %insert, <2 x i64> undef, <2 x i32> zeroinitializer
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shift = ashr <2 x i64> %a, %splat
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %shift
 ;
 ; AVX-LABEL: 'splatvar_shift_v2i64'
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <2 x i64> undef, i64 %b, i32 0
@@ -486,11 +492,17 @@ define <2 x i64> @splatvar_shift_v2i64(<2 x i64> %a, i64 %b) {
 }
 
 define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, i64 %b) {
-; SSE-LABEL: 'splatvar_shift_v4i64'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <4 x i64> undef, i64 %b, i32 0
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <4 x i64> %insert, <4 x i64> undef, <4 x i32> zeroinitializer
-; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %shift = ashr <4 x i64> %a, %splat
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %shift
+; SSE2-LABEL: 'splatvar_shift_v4i64'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %insert = insertelement <4 x i64> undef, i64 %b, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <4 x i64> %insert, <4 x i64> undef, <4 x i32> zeroinitializer
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %shift = ashr <4 x i64> %a, %splat
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %shift
+;
+; SSE42-LABEL: 'splatvar_shift_v4i64'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <4 x i64> undef, i64 %b, i32 0
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <4 x i64> %insert, <4 x i64> undef, <4 x i32> zeroinitializer
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %shift = ashr <4 x i64> %a, %splat
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %shift
 ;
 ; AVX1-LABEL: 'splatvar_shift_v4i64'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <4 x i64> undef, i64 %b, i32 0
@@ -535,11 +547,17 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, i64 %b) {
 }
 
 define <8 x i64> @splatvar_shift_v8i64(<8 x i64> %a, i64 %b) {
-; SSE-LABEL: 'splatvar_shift_v8i64'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <8 x i64> undef, i64 %b, i32 0
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <8 x i64> %insert, <8 x i64> undef, <8 x i32> zeroinitializer
-; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %shift = ashr <8 x i64> %a, %splat
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %shift
+; SSE2-LABEL: 'splatvar_shift_v8i64'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %insert = insertelement <8 x i64> undef, i64 %b, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <8 x i64> %insert, <8 x i64> undef, <8 x i32> zeroinitializer
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %shift = ashr <8 x i64> %a, %splat
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %shift
+;
+; SSE42-LABEL: 'splatvar_shift_v8i64'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <8 x i64> undef, i64 %b, i32 0
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <8 x i64> %insert, <8 x i64> undef, <8 x i32> zeroinitializer
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %shift = ashr <8 x i64> %a, %splat
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %shift
 ;
 ; AVX1-LABEL: 'splatvar_shift_v8i64'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <8 x i64> undef, i64 %b, i32 0
@@ -584,11 +602,35 @@ define <8 x i64> @splatvar_shift_v8i64(<8 x i64> %a, i64 %b) {
 }
 
 define <4 x i32> @splatvar_shift_v4i32(<4 x i32> %a, i32 %b) {
-; CHECK-LABEL: 'splatvar_shift_v4i32'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <4 x i32> undef, i32 %b, i32 0
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <4 x i32> %insert, <4 x i32> undef, <4 x i32> zeroinitializer
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <4 x i32> %a, %splat
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %shift
+; SSE2-LABEL: 'splatvar_shift_v4i32'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %insert = insertelement <4 x i32> undef, i32 %b, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <4 x i32> %insert, <4 x i32> undef, <4 x i32> zeroinitializer
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <4 x i32> %a, %splat
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %shift
+;
+; SSE42-LABEL: 'splatvar_shift_v4i32'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <4 x i32> undef, i32 %b, i32 0
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <4 x i32> %insert, <4 x i32> undef, <4 x i32> zeroinitializer
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <4 x i32> %a, %splat
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %shift
+;
+; AVX-LABEL: 'splatvar_shift_v4i32'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <4 x i32> undef, i32 %b, i32 0
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <4 x i32> %insert, <4 x i32> undef, <4 x i32> zeroinitializer
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <4 x i32> %a, %splat
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %shift
+;
+; XOP-LABEL: 'splatvar_shift_v4i32'
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <4 x i32> undef, i32 %b, i32 0
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <4 x i32> %insert, <4 x i32> undef, <4 x i32> zeroinitializer
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <4 x i32> %a, %splat
+; XOP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %shift
+;
+; AVX512-LABEL: 'splatvar_shift_v4i32'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <4 x i32> undef, i32 %b, i32 0
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <4 x i32> %insert, <4 x i32> undef, <4 x i32> zeroinitializer
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <4 x i32> %a, %splat
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %shift
 ;
   %insert = insertelement <4 x i32> undef, i32 %b, i32 0
   %splat = shufflevector <4 x i32> %insert, <4 x i32> undef, <4 x i32> zeroinitializer
@@ -597,11 +639,17 @@ define <4 x i32> @splatvar_shift_v4i32(<4 x i32> %a, i32 %b) {
 }
 
 define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, i32 %b) {
-; SSE-LABEL: 'splatvar_shift_v8i32'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <8 x i32> undef, i32 %b, i32 0
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <8 x i32> %insert, <8 x i32> undef, <8 x i32> zeroinitializer
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shift = ashr <8 x i32> %a, %splat
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %shift
+; SSE2-LABEL: 'splatvar_shift_v8i32'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %insert = insertelement <8 x i32> undef, i32 %b, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <8 x i32> %insert, <8 x i32> undef, <8 x i32> zeroinitializer
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shift = ashr <8 x i32> %a, %splat
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %shift
+;
+; SSE42-LABEL: 'splatvar_shift_v8i32'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <8 x i32> undef, i32 %b, i32 0
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <8 x i32> %insert, <8 x i32> undef, <8 x i32> zeroinitializer
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shift = ashr <8 x i32> %a, %splat
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %shift
 ;
 ; AVX1-LABEL: 'splatvar_shift_v8i32'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <8 x i32> undef, i32 %b, i32 0
@@ -646,11 +694,17 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, i32 %b) {
 }
 
 define <16 x i32> @splatvar_shift_v16i32(<16 x i32> %a, i32 %b) {
-; SSE-LABEL: 'splatvar_shift_v16i32'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <16 x i32> undef, i32 %b, i32 0
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <16 x i32> %insert, <16 x i32> undef, <16 x i32> zeroinitializer
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shift = ashr <16 x i32> %a, %splat
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %shift
+; SSE2-LABEL: 'splatvar_shift_v16i32'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %insert = insertelement <16 x i32> undef, i32 %b, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <16 x i32> %insert, <16 x i32> undef, <16 x i32> zeroinitializer
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shift = ashr <16 x i32> %a, %splat
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %shift
+;
+; SSE42-LABEL: 'splatvar_shift_v16i32'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <16 x i32> undef, i32 %b, i32 0
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <16 x i32> %insert, <16 x i32> undef, <16 x i32> zeroinitializer
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shift = ashr <16 x i32> %a, %splat
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %shift
 ;
 ; AVX1-LABEL: 'splatvar_shift_v16i32'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <16 x i32> undef, i32 %b, i32 0
@@ -861,7 +915,7 @@ define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, i16 %b) {
 
 define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, i8 %b) {
 ; SSE2-LABEL: 'splatvar_shift_v16i8'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <16 x i8> undef, i8 %b, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %insert = insertelement <16 x i8> undef, i8 %b, i32 0
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %shift = ashr <16 x i8> %a, %splat
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %shift
@@ -898,7 +952,7 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, i8 %b) {
 
 define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, i8 %b) {
 ; SSE2-LABEL: 'splatvar_shift_v32i8'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <32 x i8> undef, i8 %b, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %insert = insertelement <32 x i8> undef, i8 %b, i32 0
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %shift = ashr <32 x i8> %a, %splat
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift
@@ -953,7 +1007,7 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, i8 %b) {
 
 define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, i8 %b) {
 ; SSE2-LABEL: 'splatvar_shift_v64i8'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <64 x i8> undef, i8 %b, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %insert = insertelement <64 x i8> undef, i8 %b, i32 0
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 216 for instruction: %shift = ashr <64 x i8> %a, %splat
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift

diff  --git a/llvm/test/Analysis/CostModel/X86/vshift-lshr-cost.ll b/llvm/test/Analysis/CostModel/X86/vshift-lshr-cost.ll
index 14aeb7907179..d474b99e987a 100644
--- a/llvm/test/Analysis/CostModel/X86/vshift-lshr-cost.ll
+++ b/llvm/test/Analysis/CostModel/X86/vshift-lshr-cost.ll
@@ -467,11 +467,35 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) {
 ;
 
 define <2 x i64> @splatvar_shift_v2i64(<2 x i64> %a, i64 %b) {
-; CHECK-LABEL: 'splatvar_shift_v2i64'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <2 x i64> undef, i64 %b, i32 0
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <2 x i64> %insert, <2 x i64> undef, <2 x i32> zeroinitializer
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <2 x i64> %a, %splat
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %shift
+; SSE2-LABEL: 'splatvar_shift_v2i64'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %insert = insertelement <2 x i64> undef, i64 %b, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <2 x i64> %insert, <2 x i64> undef, <2 x i32> zeroinitializer
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <2 x i64> %a, %splat
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %shift
+;
+; SSE42-LABEL: 'splatvar_shift_v2i64'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <2 x i64> undef, i64 %b, i32 0
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <2 x i64> %insert, <2 x i64> undef, <2 x i32> zeroinitializer
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <2 x i64> %a, %splat
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %shift
+;
+; AVX-LABEL: 'splatvar_shift_v2i64'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <2 x i64> undef, i64 %b, i32 0
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <2 x i64> %insert, <2 x i64> undef, <2 x i32> zeroinitializer
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <2 x i64> %a, %splat
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %shift
+;
+; XOP-LABEL: 'splatvar_shift_v2i64'
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <2 x i64> undef, i64 %b, i32 0
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <2 x i64> %insert, <2 x i64> undef, <2 x i32> zeroinitializer
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <2 x i64> %a, %splat
+; XOP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %shift
+;
+; AVX512-LABEL: 'splatvar_shift_v2i64'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <2 x i64> undef, i64 %b, i32 0
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <2 x i64> %insert, <2 x i64> undef, <2 x i32> zeroinitializer
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <2 x i64> %a, %splat
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %shift
 ;
   %insert = insertelement <2 x i64> undef, i64 %b, i32 0
   %splat = shufflevector <2 x i64> %insert, <2 x i64> undef, <2 x i32> zeroinitializer
@@ -480,11 +504,17 @@ define <2 x i64> @splatvar_shift_v2i64(<2 x i64> %a, i64 %b) {
 }
 
 define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, i64 %b) {
-; SSE-LABEL: 'splatvar_shift_v4i64'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <4 x i64> undef, i64 %b, i32 0
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <4 x i64> %insert, <4 x i64> undef, <4 x i32> zeroinitializer
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shift = lshr <4 x i64> %a, %splat
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %shift
+; SSE2-LABEL: 'splatvar_shift_v4i64'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %insert = insertelement <4 x i64> undef, i64 %b, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <4 x i64> %insert, <4 x i64> undef, <4 x i32> zeroinitializer
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shift = lshr <4 x i64> %a, %splat
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %shift
+;
+; SSE42-LABEL: 'splatvar_shift_v4i64'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <4 x i64> undef, i64 %b, i32 0
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <4 x i64> %insert, <4 x i64> undef, <4 x i32> zeroinitializer
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shift = lshr <4 x i64> %a, %splat
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %shift
 ;
 ; AVX1-LABEL: 'splatvar_shift_v4i64'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <4 x i64> undef, i64 %b, i32 0
@@ -529,11 +559,17 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, i64 %b) {
 }
 
 define <8 x i64> @splatvar_shift_v8i64(<8 x i64> %a, i64 %b) {
-; SSE-LABEL: 'splatvar_shift_v8i64'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <8 x i64> undef, i64 %b, i32 0
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <8 x i64> %insert, <8 x i64> undef, <8 x i32> zeroinitializer
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shift = lshr <8 x i64> %a, %splat
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %shift
+; SSE2-LABEL: 'splatvar_shift_v8i64'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %insert = insertelement <8 x i64> undef, i64 %b, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <8 x i64> %insert, <8 x i64> undef, <8 x i32> zeroinitializer
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shift = lshr <8 x i64> %a, %splat
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %shift
+;
+; SSE42-LABEL: 'splatvar_shift_v8i64'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <8 x i64> undef, i64 %b, i32 0
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <8 x i64> %insert, <8 x i64> undef, <8 x i32> zeroinitializer
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shift = lshr <8 x i64> %a, %splat
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %shift
 ;
 ; AVX1-LABEL: 'splatvar_shift_v8i64'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <8 x i64> undef, i64 %b, i32 0
@@ -578,11 +614,35 @@ define <8 x i64> @splatvar_shift_v8i64(<8 x i64> %a, i64 %b) {
 }
 
 define <4 x i32> @splatvar_shift_v4i32(<4 x i32> %a, i32 %b) {
-; CHECK-LABEL: 'splatvar_shift_v4i32'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <4 x i32> undef, i32 %b, i32 0
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <4 x i32> %insert, <4 x i32> undef, <4 x i32> zeroinitializer
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <4 x i32> %a, %splat
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %shift
+; SSE2-LABEL: 'splatvar_shift_v4i32'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %insert = insertelement <4 x i32> undef, i32 %b, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <4 x i32> %insert, <4 x i32> undef, <4 x i32> zeroinitializer
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <4 x i32> %a, %splat
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %shift
+;
+; SSE42-LABEL: 'splatvar_shift_v4i32'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <4 x i32> undef, i32 %b, i32 0
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <4 x i32> %insert, <4 x i32> undef, <4 x i32> zeroinitializer
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <4 x i32> %a, %splat
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %shift
+;
+; AVX-LABEL: 'splatvar_shift_v4i32'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <4 x i32> undef, i32 %b, i32 0
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <4 x i32> %insert, <4 x i32> undef, <4 x i32> zeroinitializer
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <4 x i32> %a, %splat
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %shift
+;
+; XOP-LABEL: 'splatvar_shift_v4i32'
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <4 x i32> undef, i32 %b, i32 0
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <4 x i32> %insert, <4 x i32> undef, <4 x i32> zeroinitializer
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <4 x i32> %a, %splat
+; XOP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %shift
+;
+; AVX512-LABEL: 'splatvar_shift_v4i32'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <4 x i32> undef, i32 %b, i32 0
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <4 x i32> %insert, <4 x i32> undef, <4 x i32> zeroinitializer
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <4 x i32> %a, %splat
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %shift
 ;
   %insert = insertelement <4 x i32> undef, i32 %b, i32 0
   %splat = shufflevector <4 x i32> %insert, <4 x i32> undef, <4 x i32> zeroinitializer
@@ -591,11 +651,17 @@ define <4 x i32> @splatvar_shift_v4i32(<4 x i32> %a, i32 %b) {
 }
 
 define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, i32 %b) {
-; SSE-LABEL: 'splatvar_shift_v8i32'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <8 x i32> undef, i32 %b, i32 0
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <8 x i32> %insert, <8 x i32> undef, <8 x i32> zeroinitializer
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shift = lshr <8 x i32> %a, %splat
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %shift
+; SSE2-LABEL: 'splatvar_shift_v8i32'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %insert = insertelement <8 x i32> undef, i32 %b, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <8 x i32> %insert, <8 x i32> undef, <8 x i32> zeroinitializer
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shift = lshr <8 x i32> %a, %splat
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %shift
+;
+; SSE42-LABEL: 'splatvar_shift_v8i32'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <8 x i32> undef, i32 %b, i32 0
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <8 x i32> %insert, <8 x i32> undef, <8 x i32> zeroinitializer
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shift = lshr <8 x i32> %a, %splat
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %shift
 ;
 ; AVX1-LABEL: 'splatvar_shift_v8i32'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <8 x i32> undef, i32 %b, i32 0
@@ -640,11 +706,17 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, i32 %b) {
 }
 
 define <16 x i32> @splatvar_shift_v16i32(<16 x i32> %a, i32 %b) {
-; SSE-LABEL: 'splatvar_shift_v16i32'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <16 x i32> undef, i32 %b, i32 0
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <16 x i32> %insert, <16 x i32> undef, <16 x i32> zeroinitializer
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shift = lshr <16 x i32> %a, %splat
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %shift
+; SSE2-LABEL: 'splatvar_shift_v16i32'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %insert = insertelement <16 x i32> undef, i32 %b, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <16 x i32> %insert, <16 x i32> undef, <16 x i32> zeroinitializer
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shift = lshr <16 x i32> %a, %splat
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %shift
+;
+; SSE42-LABEL: 'splatvar_shift_v16i32'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <16 x i32> undef, i32 %b, i32 0
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <16 x i32> %insert, <16 x i32> undef, <16 x i32> zeroinitializer
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shift = lshr <16 x i32> %a, %splat
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %shift
 ;
 ; AVX1-LABEL: 'splatvar_shift_v16i32'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <16 x i32> undef, i32 %b, i32 0
@@ -855,7 +927,7 @@ define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, i16 %b) {
 
 define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, i8 %b) {
 ; SSE2-LABEL: 'splatvar_shift_v16i8'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <16 x i8> undef, i8 %b, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %insert = insertelement <16 x i8> undef, i8 %b, i32 0
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %shift = lshr <16 x i8> %a, %splat
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %shift
@@ -892,7 +964,7 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, i8 %b) {
 
 define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, i8 %b) {
 ; SSE2-LABEL: 'splatvar_shift_v32i8'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <32 x i8> undef, i8 %b, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %insert = insertelement <32 x i8> undef, i8 %b, i32 0
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %shift = lshr <32 x i8> %a, %splat
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift
@@ -947,7 +1019,7 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, i8 %b) {
 
 define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, i8 %b) {
 ; SSE2-LABEL: 'splatvar_shift_v64i8'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <64 x i8> undef, i8 %b, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %insert = insertelement <64 x i8> undef, i8 %b, i32 0
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %shift = lshr <64 x i8> %a, %splat
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift

diff  --git a/llvm/test/Analysis/CostModel/X86/vshift-shl-cost.ll b/llvm/test/Analysis/CostModel/X86/vshift-shl-cost.ll
index a38f1f18cc84..145c7e71efa1 100644
--- a/llvm/test/Analysis/CostModel/X86/vshift-shl-cost.ll
+++ b/llvm/test/Analysis/CostModel/X86/vshift-shl-cost.ll
@@ -459,11 +459,35 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) {
 ;
 
 define <2 x i64> @splatvar_shift_v2i64(<2 x i64> %a, i64 %b) {
-; CHECK-LABEL: 'splatvar_shift_v2i64'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <2 x i64> undef, i64 %b, i32 0
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <2 x i64> %insert, <2 x i64> undef, <2 x i32> zeroinitializer
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <2 x i64> %a, %splat
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %shift
+; SSE2-LABEL: 'splatvar_shift_v2i64'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %insert = insertelement <2 x i64> undef, i64 %b, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <2 x i64> %insert, <2 x i64> undef, <2 x i32> zeroinitializer
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <2 x i64> %a, %splat
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %shift
+;
+; SSE42-LABEL: 'splatvar_shift_v2i64'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <2 x i64> undef, i64 %b, i32 0
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <2 x i64> %insert, <2 x i64> undef, <2 x i32> zeroinitializer
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <2 x i64> %a, %splat
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %shift
+;
+; AVX-LABEL: 'splatvar_shift_v2i64'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <2 x i64> undef, i64 %b, i32 0
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <2 x i64> %insert, <2 x i64> undef, <2 x i32> zeroinitializer
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <2 x i64> %a, %splat
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %shift
+;
+; XOP-LABEL: 'splatvar_shift_v2i64'
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <2 x i64> undef, i64 %b, i32 0
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <2 x i64> %insert, <2 x i64> undef, <2 x i32> zeroinitializer
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <2 x i64> %a, %splat
+; XOP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %shift
+;
+; AVX512-LABEL: 'splatvar_shift_v2i64'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <2 x i64> undef, i64 %b, i32 0
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <2 x i64> %insert, <2 x i64> undef, <2 x i32> zeroinitializer
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <2 x i64> %a, %splat
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %shift
 ;
   %insert = insertelement <2 x i64> undef, i64 %b, i32 0
   %splat = shufflevector <2 x i64> %insert, <2 x i64> undef, <2 x i32> zeroinitializer
@@ -472,11 +496,17 @@ define <2 x i64> @splatvar_shift_v2i64(<2 x i64> %a, i64 %b) {
 }
 
 define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, i64 %b) {
-; SSE-LABEL: 'splatvar_shift_v4i64'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <4 x i64> undef, i64 %b, i32 0
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <4 x i64> %insert, <4 x i64> undef, <4 x i32> zeroinitializer
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shift = shl <4 x i64> %a, %splat
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %shift
+; SSE2-LABEL: 'splatvar_shift_v4i64'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %insert = insertelement <4 x i64> undef, i64 %b, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <4 x i64> %insert, <4 x i64> undef, <4 x i32> zeroinitializer
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shift = shl <4 x i64> %a, %splat
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %shift
+;
+; SSE42-LABEL: 'splatvar_shift_v4i64'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <4 x i64> undef, i64 %b, i32 0
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <4 x i64> %insert, <4 x i64> undef, <4 x i32> zeroinitializer
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shift = shl <4 x i64> %a, %splat
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %shift
 ;
 ; AVX1-LABEL: 'splatvar_shift_v4i64'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <4 x i64> undef, i64 %b, i32 0
@@ -521,11 +551,17 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, i64 %b) {
 }
 
 define <8 x i64> @splatvar_shift_v8i64(<8 x i64> %a, i64 %b) {
-; SSE-LABEL: 'splatvar_shift_v8i64'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <8 x i64> undef, i64 %b, i32 0
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <8 x i64> %insert, <8 x i64> undef, <8 x i32> zeroinitializer
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shift = shl <8 x i64> %a, %splat
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %shift
+; SSE2-LABEL: 'splatvar_shift_v8i64'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %insert = insertelement <8 x i64> undef, i64 %b, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <8 x i64> %insert, <8 x i64> undef, <8 x i32> zeroinitializer
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shift = shl <8 x i64> %a, %splat
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %shift
+;
+; SSE42-LABEL: 'splatvar_shift_v8i64'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <8 x i64> undef, i64 %b, i32 0
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <8 x i64> %insert, <8 x i64> undef, <8 x i32> zeroinitializer
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shift = shl <8 x i64> %a, %splat
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %shift
 ;
 ; AVX1-LABEL: 'splatvar_shift_v8i64'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <8 x i64> undef, i64 %b, i32 0
@@ -570,11 +606,35 @@ define <8 x i64> @splatvar_shift_v8i64(<8 x i64> %a, i64 %b) {
 }
 
 define <4 x i32> @splatvar_shift_v4i32(<4 x i32> %a, i32 %b) {
-; CHECK-LABEL: 'splatvar_shift_v4i32'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <4 x i32> undef, i32 %b, i32 0
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <4 x i32> %insert, <4 x i32> undef, <4 x i32> zeroinitializer
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <4 x i32> %a, %splat
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %shift
+; SSE2-LABEL: 'splatvar_shift_v4i32'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %insert = insertelement <4 x i32> undef, i32 %b, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <4 x i32> %insert, <4 x i32> undef, <4 x i32> zeroinitializer
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <4 x i32> %a, %splat
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %shift
+;
+; SSE42-LABEL: 'splatvar_shift_v4i32'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <4 x i32> undef, i32 %b, i32 0
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <4 x i32> %insert, <4 x i32> undef, <4 x i32> zeroinitializer
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <4 x i32> %a, %splat
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %shift
+;
+; AVX-LABEL: 'splatvar_shift_v4i32'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <4 x i32> undef, i32 %b, i32 0
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <4 x i32> %insert, <4 x i32> undef, <4 x i32> zeroinitializer
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <4 x i32> %a, %splat
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %shift
+;
+; XOP-LABEL: 'splatvar_shift_v4i32'
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <4 x i32> undef, i32 %b, i32 0
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <4 x i32> %insert, <4 x i32> undef, <4 x i32> zeroinitializer
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <4 x i32> %a, %splat
+; XOP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %shift
+;
+; AVX512-LABEL: 'splatvar_shift_v4i32'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <4 x i32> undef, i32 %b, i32 0
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <4 x i32> %insert, <4 x i32> undef, <4 x i32> zeroinitializer
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <4 x i32> %a, %splat
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %shift
 ;
   %insert = insertelement <4 x i32> undef, i32 %b, i32 0
   %splat = shufflevector <4 x i32> %insert, <4 x i32> undef, <4 x i32> zeroinitializer
@@ -583,11 +643,17 @@ define <4 x i32> @splatvar_shift_v4i32(<4 x i32> %a, i32 %b) {
 }
 
 define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, i32 %b) {
-; SSE-LABEL: 'splatvar_shift_v8i32'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <8 x i32> undef, i32 %b, i32 0
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <8 x i32> %insert, <8 x i32> undef, <8 x i32> zeroinitializer
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shift = shl <8 x i32> %a, %splat
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %shift
+; SSE2-LABEL: 'splatvar_shift_v8i32'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %insert = insertelement <8 x i32> undef, i32 %b, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <8 x i32> %insert, <8 x i32> undef, <8 x i32> zeroinitializer
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shift = shl <8 x i32> %a, %splat
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %shift
+;
+; SSE42-LABEL: 'splatvar_shift_v8i32'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <8 x i32> undef, i32 %b, i32 0
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <8 x i32> %insert, <8 x i32> undef, <8 x i32> zeroinitializer
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shift = shl <8 x i32> %a, %splat
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %shift
 ;
 ; AVX1-LABEL: 'splatvar_shift_v8i32'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <8 x i32> undef, i32 %b, i32 0
@@ -632,11 +698,17 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, i32 %b) {
 }
 
 define <16 x i32> @splatvar_shift_v16i32(<16 x i32> %a, i32 %b) {
-; SSE-LABEL: 'splatvar_shift_v16i32'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <16 x i32> undef, i32 %b, i32 0
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <16 x i32> %insert, <16 x i32> undef, <16 x i32> zeroinitializer
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shift = shl <16 x i32> %a, %splat
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %shift
+; SSE2-LABEL: 'splatvar_shift_v16i32'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %insert = insertelement <16 x i32> undef, i32 %b, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <16 x i32> %insert, <16 x i32> undef, <16 x i32> zeroinitializer
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shift = shl <16 x i32> %a, %splat
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %shift
+;
+; SSE42-LABEL: 'splatvar_shift_v16i32'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <16 x i32> undef, i32 %b, i32 0
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <16 x i32> %insert, <16 x i32> undef, <16 x i32> zeroinitializer
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shift = shl <16 x i32> %a, %splat
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %shift
 ;
 ; AVX1-LABEL: 'splatvar_shift_v16i32'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <16 x i32> undef, i32 %b, i32 0
@@ -847,7 +919,7 @@ define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, i16 %b) {
 
 define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, i8 %b) {
 ; SSE2-LABEL: 'splatvar_shift_v16i8'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <16 x i8> undef, i8 %b, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %insert = insertelement <16 x i8> undef, i8 %b, i32 0
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %shift = shl <16 x i8> %a, %splat
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %shift
@@ -884,7 +956,7 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, i8 %b) {
 
 define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, i8 %b) {
 ; SSE2-LABEL: 'splatvar_shift_v32i8'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <32 x i8> undef, i8 %b, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %insert = insertelement <32 x i8> undef, i8 %b, i32 0
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %shift = shl <32 x i8> %a, %splat
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift
@@ -939,7 +1011,7 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, i8 %b) {
 
 define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, i8 %b) {
 ; SSE2-LABEL: 'splatvar_shift_v64i8'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <64 x i8> undef, i8 %b, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %insert = insertelement <64 x i8> undef, i8 %b, i32 0
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %shift = shl <64 x i8> %a, %splat
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift

diff  --git a/llvm/test/Transforms/LoopVectorize/X86/interleaving.ll b/llvm/test/Transforms/LoopVectorize/X86/interleaving.ll
index c8f7c22c410a..f8bf35cfcfaa 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/interleaving.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/interleaving.ll
@@ -1,37 +1,59 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -S -mtriple=x86_64-pc_linux -loop-vectorize -instcombine < %s | FileCheck %s --check-prefix=NORMAL
+; RUN: opt -S -mtriple=x86_64-pc_linux -loop-vectorize -instcombine -mcpu=sandybridge < %s | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: opt -S -mtriple=x86_64-pc_linux -loop-vectorize -instcombine -mcpu=haswell < %s | FileCheck %s --check-prefixes=AVX,AVX2
 ; RUN: opt -S -mtriple=x86_64-pc_linux -loop-vectorize -instcombine -mcpu=slm < %s | FileCheck %s --check-prefix=SLOW
 ; RUN: opt -S -mtriple=x86_64-pc_linux -loop-vectorize -instcombine -mcpu=atom < %s | FileCheck %s --check-prefix=SLOW
 
 define void @foo(i32* noalias nocapture %a, i32* noalias nocapture readonly %b) {
 ; NORMAL-LABEL: @foo(
 ; NORMAL-NEXT:  entry:
-; NORMAL-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; NORMAL:       vector.ph:
-; NORMAL-NEXT:    br label [[VECTOR_BODY:%.*]]
-; NORMAL:       vector.body:
-; NORMAL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; NORMAL-NEXT:    [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1
-; NORMAL-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP0]]
-; NORMAL-NEXT:    [[TMP2:%.*]] = bitcast i32* [[TMP1]] to <8 x i32>*
-; NORMAL-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP2]], align 4
-; NORMAL-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; NORMAL-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; NORMAL-NEXT:    [[TMP3:%.*]] = add nsw <4 x i32> [[STRIDED_VEC1]], [[STRIDED_VEC]]
-; NORMAL-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
-; NORMAL-NEXT:    [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>*
-; NORMAL-NEXT:    store <4 x i32> [[TMP3]], <4 x i32>* [[TMP5]], align 4
-; NORMAL-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
-; NORMAL-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; NORMAL-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
-; NORMAL:       middle.block:
-; NORMAL-NEXT:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
-; NORMAL:       scalar.ph:
 ; NORMAL-NEXT:    br label [[FOR_BODY:%.*]]
 ; NORMAL:       for.cond.cleanup:
 ; NORMAL-NEXT:    ret void
 ; NORMAL:       for.body:
-; NORMAL-NEXT:    br i1 undef, label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop !2
+; NORMAL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NORMAL-NEXT:    [[TMP0:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1
+; NORMAL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP0]]
+; NORMAL-NEXT:    [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; NORMAL-NEXT:    [[TMP2:%.*]] = or i64 [[TMP0]], 1
+; NORMAL-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP2]]
+; NORMAL-NEXT:    [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4
+; NORMAL-NEXT:    [[ADD4:%.*]] = add nsw i32 [[TMP3]], [[TMP1]]
+; NORMAL-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]]
+; NORMAL-NEXT:    store i32 [[ADD4]], i32* [[ARRAYIDX6]], align 4
+; NORMAL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; NORMAL-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1024
+; NORMAL-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
+;
+; AVX-LABEL: @foo(
+; AVX-NEXT:  entry:
+; AVX-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; AVX:       vector.ph:
+; AVX-NEXT:    br label [[VECTOR_BODY:%.*]]
+; AVX:       vector.body:
+; AVX-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; AVX-NEXT:    [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1
+; AVX-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP0]]
+; AVX-NEXT:    [[TMP2:%.*]] = bitcast i32* [[TMP1]] to <8 x i32>*
+; AVX-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP2]], align 4
+; AVX-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; AVX-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; AVX-NEXT:    [[TMP3:%.*]] = add nsw <4 x i32> [[STRIDED_VEC1]], [[STRIDED_VEC]]
+; AVX-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
+; AVX-NEXT:    [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>*
+; AVX-NEXT:    store <4 x i32> [[TMP3]], <4 x i32>* [[TMP5]], align 4
+; AVX-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; AVX-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; AVX-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
+; AVX:       middle.block:
+; AVX-NEXT:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; AVX:       scalar.ph:
+; AVX-NEXT:    br label [[FOR_BODY:%.*]]
+; AVX:       for.cond.cleanup:
+; AVX-NEXT:    ret void
+; AVX:       for.body:
+; AVX-NEXT:    br i1 undef, label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop !2
 ;
 ; SLOW-LABEL: @foo(
 ; SLOW-NEXT:  entry:

diff  --git a/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll b/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll
index b406b3f91fa2..c173bd9f8fad 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll
@@ -18,77 +18,59 @@ define i32 @matrix_row_col([100 x i32]* nocapture readonly %data, i32 %i, i32 %j
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP37:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> undef, i64 [[INDEX]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> undef, <8 x i32> zeroinitializer
-; CHECK-NEXT:    [[INDUCTION:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> undef, i64 [[INDEX]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3>
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 3
-; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 5
-; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 6
-; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], 7
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[DATA:%.*]], i64 [[IDXPROM]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 0
-; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <8 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP10]], align 4, !tbaa !1
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[DATA]], i64 [[TMP0]], i64 [[IDXPROM5]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[DATA]], i64 [[TMP1]], i64 [[IDXPROM5]]
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[DATA]], i64 [[TMP2]], i64 [[IDXPROM5]]
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[DATA]], i64 [[TMP3]], i64 [[IDXPROM5]]
-; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[DATA]], i64 [[TMP4]], i64 [[IDXPROM5]]
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[DATA]], i64 [[TMP5]], i64 [[IDXPROM5]]
-; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[DATA]], i64 [[TMP6]], i64 [[IDXPROM5]]
-; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[DATA]], i64 [[TMP7]], i64 [[IDXPROM5]]
-; CHECK-NEXT:    [[TMP19:%.*]] = load i32, i32* [[TMP11]], align 4, !tbaa !1
-; CHECK-NEXT:    [[TMP20:%.*]] = load i32, i32* [[TMP12]], align 4, !tbaa !1
-; CHECK-NEXT:    [[TMP21:%.*]] = load i32, i32* [[TMP13]], align 4, !tbaa !1
-; CHECK-NEXT:    [[TMP22:%.*]] = load i32, i32* [[TMP14]], align 4, !tbaa !1
-; CHECK-NEXT:    [[TMP23:%.*]] = load i32, i32* [[TMP15]], align 4, !tbaa !1
-; CHECK-NEXT:    [[TMP24:%.*]] = load i32, i32* [[TMP16]], align 4, !tbaa !1
-; CHECK-NEXT:    [[TMP25:%.*]] = load i32, i32* [[TMP17]], align 4, !tbaa !1
-; CHECK-NEXT:    [[TMP26:%.*]] = load i32, i32* [[TMP18]], align 4, !tbaa !1
-; CHECK-NEXT:    [[TMP27:%.*]] = insertelement <8 x i32> undef, i32 [[TMP19]], i32 0
-; CHECK-NEXT:    [[TMP28:%.*]] = insertelement <8 x i32> [[TMP27]], i32 [[TMP20]], i32 1
-; CHECK-NEXT:    [[TMP29:%.*]] = insertelement <8 x i32> [[TMP28]], i32 [[TMP21]], i32 2
-; CHECK-NEXT:    [[TMP30:%.*]] = insertelement <8 x i32> [[TMP29]], i32 [[TMP22]], i32 3
-; CHECK-NEXT:    [[TMP31:%.*]] = insertelement <8 x i32> [[TMP30]], i32 [[TMP23]], i32 4
-; CHECK-NEXT:    [[TMP32:%.*]] = insertelement <8 x i32> [[TMP31]], i32 [[TMP24]], i32 5
-; CHECK-NEXT:    [[TMP33:%.*]] = insertelement <8 x i32> [[TMP32]], i32 [[TMP25]], i32 6
-; CHECK-NEXT:    [[TMP34:%.*]] = insertelement <8 x i32> [[TMP33]], i32 [[TMP26]], i32 7
-; CHECK-NEXT:    [[TMP35:%.*]] = mul nsw <8 x i32> [[TMP34]], [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP36:%.*]] = add <8 x i32> [[VEC_PHI]], <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP37]] = add <8 x i32> [[TMP36]], [[TMP35]]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 8
-; CHECK-NEXT:    [[TMP38:%.*]] = icmp eq i64 [[INDEX_NEXT]], 96
-; CHECK-NEXT:    br i1 [[TMP38]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !5
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[DATA:%.*]], i64 [[IDXPROM]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP6]], align 4, !tbaa !1
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[DATA]], i64 [[TMP0]], i64 [[IDXPROM5]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[DATA]], i64 [[TMP1]], i64 [[IDXPROM5]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[DATA]], i64 [[TMP2]], i64 [[IDXPROM5]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[DATA]], i64 [[TMP3]], i64 [[IDXPROM5]]
+; CHECK-NEXT:    [[TMP11:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa !1
+; CHECK-NEXT:    [[TMP12:%.*]] = load i32, i32* [[TMP8]], align 4, !tbaa !1
+; CHECK-NEXT:    [[TMP13:%.*]] = load i32, i32* [[TMP9]], align 4, !tbaa !1
+; CHECK-NEXT:    [[TMP14:%.*]] = load i32, i32* [[TMP10]], align 4, !tbaa !1
+; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <4 x i32> undef, i32 [[TMP11]], i32 0
+; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <4 x i32> [[TMP15]], i32 [[TMP12]], i32 1
+; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <4 x i32> [[TMP16]], i32 [[TMP13]], i32 2
+; CHECK-NEXT:    [[TMP18:%.*]] = insertelement <4 x i32> [[TMP17]], i32 [[TMP14]], i32 3
+; CHECK-NEXT:    [[TMP19:%.*]] = mul nsw <4 x i32> [[TMP18]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP20:%.*]] = add <4 x i32> [[VEC_PHI]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK-NEXT:    [[TMP21]] = add <4 x i32> [[TMP20]], [[TMP19]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
+; CHECK-NEXT:    br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !5
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP37]], <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <8 x i32> [[TMP37]], [[RDX_SHUF]]
-; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[BIN_RDX2:%.*]] = add <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
-; CHECK-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[BIN_RDX4:%.*]] = add <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
-; CHECK-NEXT:    [[TMP39:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 100, 96
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP21]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[TMP21]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX2:%.*]] = add <4 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <4 x i32> [[BIN_RDX2]], i32 0
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 100, 100
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 96, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP39]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    [[ADD7_LCSSA:%.*]] = phi i32 [ [[ADD7:%.*]], [[FOR_BODY]] ], [ [[TMP39]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[ADD7_LCSSA:%.*]] = phi i32 [ [[ADD7:%.*]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret i32 [[ADD7_LCSSA]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[SUM_015:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD7]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[DATA]], i64 [[IDXPROM]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP40:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4, !tbaa !1
+; CHECK-NEXT:    [[TMP24:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4, !tbaa !1
 ; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[DATA]], i64 [[INDVARS_IV]], i64 [[IDXPROM5]]
-; CHECK-NEXT:    [[TMP41:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4, !tbaa !1
-; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP41]], [[TMP40]]
+; CHECK-NEXT:    [[TMP25:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4, !tbaa !1
+; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP25]], [[TMP24]]
 ; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[SUM_015]], 4
 ; CHECK-NEXT:    [[ADD7]] = add i32 [[ADD]], [[MUL]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/PR35865.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR35865.ll
index b022dd7d9155..992b01f20ef3 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/PR35865.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/PR35865.ll
@@ -5,13 +5,15 @@ define void @_Z10fooConvertPDv4_xS0_S0_PKS_() {
 ; CHECK-LABEL: @_Z10fooConvertPDv4_xS0_S0_PKS_(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <16 x half> undef, i32 4
-; CHECK-NEXT:    [[CONV_I_4_I:%.*]] = fpext half [[TMP0]] to float
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float [[CONV_I_4_I]] to i32
-; CHECK-NEXT:    [[VECINS_I_4_I:%.*]] = insertelement <8 x i32> undef, i32 [[TMP1]], i32 4
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <16 x half> undef, i32 5
-; CHECK-NEXT:    [[CONV_I_5_I:%.*]] = fpext half [[TMP2]] to float
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float [[CONV_I_5_I]] to i32
-; CHECK-NEXT:    [[VECINS_I_5_I:%.*]] = insertelement <8 x i32> [[VECINS_I_4_I]], i32 [[TMP3]], i32 5
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <16 x half> undef, i32 5
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x half> undef, half [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x half> [[TMP2]], half [[TMP1]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = fpext <2 x half> [[TMP3]] to <2 x float>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <2 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i32> [[TMP5]], i32 0
+; CHECK-NEXT:    [[VECINS_I_4_I:%.*]] = insertelement <8 x i32> undef, i32 [[TMP6]], i32 4
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i32> [[TMP5]], i32 1
+; CHECK-NEXT:    [[VECINS_I_5_I:%.*]] = insertelement <8 x i32> [[VECINS_I_4_I]], i32 [[TMP7]], i32 5
 ; CHECK-NEXT:    ret void
 ;
 entry:

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll
index 23d1634fdb6c..085b7137f2b2 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll
@@ -1,10 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basicaa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=SLM
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basicaa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -basicaa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basicaa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefixes=CHECK,SSE,SLM
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basicaa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -basicaa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW
 
 define <8 x i32> @add_sub_v8i32(<8 x i32> %a, <8 x i32> %b) {
 ; CHECK-LABEL: @add_sub_v8i32(
@@ -101,44 +101,11 @@ define <4 x i32> @add_mul_v4i32(<4 x i32> %a, <4 x i32> %b) {
 }
 
 define <8 x i32> @ashr_shl_v8i32(<8 x i32> %a, <8 x i32> %b) {
-; SSE-LABEL: @ashr_shl_v8i32(
-; SSE-NEXT:    [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0
-; SSE-NEXT:    [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1
-; SSE-NEXT:    [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2
-; SSE-NEXT:    [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3
-; SSE-NEXT:    [[B0:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 0
-; SSE-NEXT:    [[B1:%.*]] = extractelement <8 x i32> [[B]], i32 1
-; SSE-NEXT:    [[B2:%.*]] = extractelement <8 x i32> [[B]], i32 2
-; SSE-NEXT:    [[B3:%.*]] = extractelement <8 x i32> [[B]], i32 3
-; SSE-NEXT:    [[AB0:%.*]] = ashr i32 [[A0]], [[B0]]
-; SSE-NEXT:    [[AB1:%.*]] = ashr i32 [[A1]], [[B1]]
-; SSE-NEXT:    [[AB2:%.*]] = ashr i32 [[A2]], [[B2]]
-; SSE-NEXT:    [[AB3:%.*]] = ashr i32 [[A3]], [[B3]]
-; SSE-NEXT:    [[TMP1:%.*]] = shl <8 x i32> [[A]], [[B]]
-; SSE-NEXT:    [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0
-; SSE-NEXT:    [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1
-; SSE-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2
-; SSE-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3
-; SSE-NEXT:    [[R7:%.*]] = shufflevector <8 x i32> [[R3]], <8 x i32> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; SSE-NEXT:    ret <8 x i32> [[R7]]
-;
-; SLM-LABEL: @ashr_shl_v8i32(
-; SLM-NEXT:    [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]]
-; SLM-NEXT:    [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]]
-; SLM-NEXT:    [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; SLM-NEXT:    ret <8 x i32> [[R7]]
-;
-; AVX-LABEL: @ashr_shl_v8i32(
-; AVX-NEXT:    [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]]
-; AVX-NEXT:    [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]]
-; AVX-NEXT:    [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; AVX-NEXT:    ret <8 x i32> [[R7]]
-;
-; AVX512-LABEL: @ashr_shl_v8i32(
-; AVX512-NEXT:    [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]]
-; AVX512-NEXT:    [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]]
-; AVX512-NEXT:    [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT:    ret <8 x i32> [[R7]]
+; CHECK-LABEL: @ashr_shl_v8i32(
+; CHECK-NEXT:    [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]]
+; CHECK-NEXT:    [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    ret <8 x i32> [[R7]]
 ;
   %a0 = extractelement <8 x i32> %a, i32 0
   %a1 = extractelement <8 x i32> %a, i32 1
@@ -184,14 +151,6 @@ define <8 x i32> @ashr_shl_v8i32_const(<8 x i32> %a) {
 ; SSE-NEXT:    [[R7:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; SSE-NEXT:    ret <8 x i32> [[R7]]
 ;
-; SLM-LABEL: @ashr_shl_v8i32_const(
-; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SLM-NEXT:    [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], <i32 2, i32 2, i32 2, i32 2>
-; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SLM-NEXT:    [[TMP4:%.*]] = shl <4 x i32> [[TMP3]], <i32 3, i32 3, i32 3, i32 3>
-; SLM-NEXT:    [[R7:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SLM-NEXT:    ret <8 x i32> [[R7]]
-;
 ; AVX1-LABEL: @ashr_shl_v8i32_const(
 ; AVX1-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; AVX1-NEXT:    [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], <i32 2, i32 2, i32 2, i32 2>
@@ -243,66 +202,31 @@ define <8 x i32> @ashr_lshr_shl_v8i32(<8 x i32> %a, <8 x i32> %b) {
 ; SSE-LABEL: @ashr_lshr_shl_v8i32(
 ; SSE-NEXT:    [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0
 ; SSE-NEXT:    [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1
-; SSE-NEXT:    [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2
-; SSE-NEXT:    [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3
-; SSE-NEXT:    [[A4:%.*]] = extractelement <8 x i32> [[A]], i32 4
-; SSE-NEXT:    [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5
 ; SSE-NEXT:    [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6
 ; SSE-NEXT:    [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7
 ; SSE-NEXT:    [[B0:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 0
 ; SSE-NEXT:    [[B1:%.*]] = extractelement <8 x i32> [[B]], i32 1
-; SSE-NEXT:    [[B2:%.*]] = extractelement <8 x i32> [[B]], i32 2
-; SSE-NEXT:    [[B3:%.*]] = extractelement <8 x i32> [[B]], i32 3
-; SSE-NEXT:    [[B4:%.*]] = extractelement <8 x i32> [[B]], i32 4
-; SSE-NEXT:    [[B5:%.*]] = extractelement <8 x i32> [[B]], i32 5
 ; SSE-NEXT:    [[B6:%.*]] = extractelement <8 x i32> [[B]], i32 6
 ; SSE-NEXT:    [[B7:%.*]] = extractelement <8 x i32> [[B]], i32 7
 ; SSE-NEXT:    [[AB0:%.*]] = ashr i32 [[A0]], [[B0]]
 ; SSE-NEXT:    [[AB1:%.*]] = ashr i32 [[A1]], [[B1]]
-; SSE-NEXT:    [[AB2:%.*]] = lshr i32 [[A2]], [[B2]]
-; SSE-NEXT:    [[AB3:%.*]] = lshr i32 [[A3]], [[B3]]
-; SSE-NEXT:    [[AB4:%.*]] = lshr i32 [[A4]], [[B4]]
-; SSE-NEXT:    [[AB5:%.*]] = lshr i32 [[A5]], [[B5]]
+; SSE-NEXT:    [[TMP1:%.*]] = lshr <8 x i32> [[A]], [[B]]
 ; SSE-NEXT:    [[AB6:%.*]] = shl i32 [[A6]], [[B6]]
 ; SSE-NEXT:    [[AB7:%.*]] = shl i32 [[A7]], [[B7]]
 ; SSE-NEXT:    [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0
 ; SSE-NEXT:    [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1
-; SSE-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2
-; SSE-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3
-; SSE-NEXT:    [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4
-; SSE-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5
+; SSE-NEXT:    [[TMP2:%.*]] = extractelement <8 x i32> [[TMP1]], i32 2
+; SSE-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP2]], i32 2
+; SSE-NEXT:    [[TMP3:%.*]] = extractelement <8 x i32> [[TMP1]], i32 3
+; SSE-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP3]], i32 3
+; SSE-NEXT:    [[TMP4:%.*]] = extractelement <8 x i32> [[TMP1]], i32 4
+; SSE-NEXT:    [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[TMP4]], i32 4
+; SSE-NEXT:    [[TMP5:%.*]] = extractelement <8 x i32> [[TMP1]], i32 5
+; SSE-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[TMP5]], i32 5
 ; SSE-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6
 ; SSE-NEXT:    [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
 ; SSE-NEXT:    ret <8 x i32> [[R7]]
 ;
-; SLM-LABEL: @ashr_lshr_shl_v8i32(
-; SLM-NEXT:    [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0
-; SLM-NEXT:    [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1
-; SLM-NEXT:    [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6
-; SLM-NEXT:    [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7
-; SLM-NEXT:    [[B0:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 0
-; SLM-NEXT:    [[B1:%.*]] = extractelement <8 x i32> [[B]], i32 1
-; SLM-NEXT:    [[B6:%.*]] = extractelement <8 x i32> [[B]], i32 6
-; SLM-NEXT:    [[B7:%.*]] = extractelement <8 x i32> [[B]], i32 7
-; SLM-NEXT:    [[AB0:%.*]] = ashr i32 [[A0]], [[B0]]
-; SLM-NEXT:    [[AB1:%.*]] = ashr i32 [[A1]], [[B1]]
-; SLM-NEXT:    [[TMP1:%.*]] = lshr <8 x i32> [[A]], [[B]]
-; SLM-NEXT:    [[AB6:%.*]] = shl i32 [[A6]], [[B6]]
-; SLM-NEXT:    [[AB7:%.*]] = shl i32 [[A7]], [[B7]]
-; SLM-NEXT:    [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0
-; SLM-NEXT:    [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1
-; SLM-NEXT:    [[TMP2:%.*]] = extractelement <8 x i32> [[TMP1]], i32 2
-; SLM-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP2]], i32 2
-; SLM-NEXT:    [[TMP3:%.*]] = extractelement <8 x i32> [[TMP1]], i32 3
-; SLM-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP3]], i32 3
-; SLM-NEXT:    [[TMP4:%.*]] = extractelement <8 x i32> [[TMP1]], i32 4
-; SLM-NEXT:    [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[TMP4]], i32 4
-; SLM-NEXT:    [[TMP5:%.*]] = extractelement <8 x i32> [[TMP1]], i32 5
-; SLM-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[TMP5]], i32 5
-; SLM-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6
-; SLM-NEXT:    [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
-; SLM-NEXT:    ret <8 x i32> [[R7]]
-;
 ; AVX1-LABEL: @ashr_lshr_shl_v8i32(
 ; AVX1-NEXT:    [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0
 ; AVX1-NEXT:    [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll b/llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll
index ad566cb3411e..06008d82fb73 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll
@@ -1,15 +1,12 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -slp-vectorizer < %s -S | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64-apple-macosx10.11.0 -slp-vectorizer -S -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -mtriple=x86_64-apple-macosx10.11.0 -slp-vectorizer -S -mattr=+avx  | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s -mtriple=x86_64-apple-macosx10.11.0 -slp-vectorizer -S -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
 
 ; Verify that the SLP vectorizer is able to figure out that commutativity
 ; offers the possibility to splat/broadcast %c and thus make it profitable
 ; to vectorize this case
 
-
-; ModuleID = 'bugpoint-reduced-simplified.bc'
-target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-apple-macosx10.11.0"
-
 @cle = external unnamed_addr global [32 x i8], align 16
 @cle32 = external unnamed_addr global [32 x i32], align 16
 
@@ -18,29 +15,64 @@ target triple = "x86_64-apple-macosx10.11.0"
 ; commutativity property of `xor`.
 
 define void @splat(i8 %a, i8 %b, i8 %c) {
-; CHECK-LABEL: @splat(
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <16 x i8> undef, i8 [[C:%.*]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <16 x i8> [[TMP1]], i8 [[C]], i32 1
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <16 x i8> [[TMP2]], i8 [[C]], i32 2
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <16 x i8> [[TMP3]], i8 [[C]], i32 3
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <16 x i8> [[TMP4]], i8 [[C]], i32 4
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <16 x i8> [[TMP5]], i8 [[C]], i32 5
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <16 x i8> [[TMP6]], i8 [[C]], i32 6
-; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <16 x i8> [[TMP7]], i8 [[C]], i32 7
-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <16 x i8> [[TMP8]], i8 [[C]], i32 8
-; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <16 x i8> [[TMP9]], i8 [[C]], i32 9
-; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <16 x i8> [[TMP10]], i8 [[C]], i32 10
-; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <16 x i8> [[TMP11]], i8 [[C]], i32 11
-; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <16 x i8> [[TMP12]], i8 [[C]], i32 12
-; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <16 x i8> [[TMP13]], i8 [[C]], i32 13
-; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <16 x i8> [[TMP14]], i8 [[C]], i32 14
-; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <16 x i8> [[TMP15]], i8 [[C]], i32 15
-; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <2 x i8> undef, i8 [[A:%.*]], i32 0
-; CHECK-NEXT:    [[TMP18:%.*]] = insertelement <2 x i8> [[TMP17]], i8 [[B:%.*]], i32 1
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i8> [[TMP18]], <2 x i8> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
-; CHECK-NEXT:    [[TMP19:%.*]] = xor <16 x i8> [[TMP16]], [[SHUFFLE]]
-; CHECK-NEXT:    store <16 x i8> [[TMP19]], <16 x i8>* bitcast ([32 x i8]* @cle to <16 x i8>*), align 16
-; CHECK-NEXT:    ret void
+; SSE-LABEL: @splat(
+; SSE-NEXT:    [[TMP1:%.*]] = xor i8 [[C:%.*]], [[A:%.*]]
+; SSE-NEXT:    store i8 [[TMP1]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 0), align 16
+; SSE-NEXT:    [[TMP2:%.*]] = xor i8 [[A]], [[C]]
+; SSE-NEXT:    store i8 [[TMP2]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 1)
+; SSE-NEXT:    [[TMP3:%.*]] = xor i8 [[A]], [[C]]
+; SSE-NEXT:    store i8 [[TMP3]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 2)
+; SSE-NEXT:    [[TMP4:%.*]] = xor i8 [[A]], [[C]]
+; SSE-NEXT:    store i8 [[TMP4]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 3)
+; SSE-NEXT:    [[TMP5:%.*]] = xor i8 [[C]], [[A]]
+; SSE-NEXT:    store i8 [[TMP5]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 4)
+; SSE-NEXT:    [[TMP6:%.*]] = xor i8 [[C]], [[B:%.*]]
+; SSE-NEXT:    store i8 [[TMP6]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 5)
+; SSE-NEXT:    [[TMP7:%.*]] = xor i8 [[C]], [[A]]
+; SSE-NEXT:    store i8 [[TMP7]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 6)
+; SSE-NEXT:    [[TMP8:%.*]] = xor i8 [[C]], [[B]]
+; SSE-NEXT:    store i8 [[TMP8]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 7)
+; SSE-NEXT:    [[TMP9:%.*]] = xor i8 [[A]], [[C]]
+; SSE-NEXT:    store i8 [[TMP9]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 8)
+; SSE-NEXT:    [[TMP10:%.*]] = xor i8 [[A]], [[C]]
+; SSE-NEXT:    store i8 [[TMP10]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 9)
+; SSE-NEXT:    [[TMP11:%.*]] = xor i8 [[A]], [[C]]
+; SSE-NEXT:    store i8 [[TMP11]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 10)
+; SSE-NEXT:    [[TMP12:%.*]] = xor i8 [[A]], [[C]]
+; SSE-NEXT:    store i8 [[TMP12]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 11)
+; SSE-NEXT:    [[TMP13:%.*]] = xor i8 [[A]], [[C]]
+; SSE-NEXT:    store i8 [[TMP13]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 12)
+; SSE-NEXT:    [[TMP14:%.*]] = xor i8 [[A]], [[C]]
+; SSE-NEXT:    store i8 [[TMP14]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 13)
+; SSE-NEXT:    [[TMP15:%.*]] = xor i8 [[A]], [[C]]
+; SSE-NEXT:    store i8 [[TMP15]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 14)
+; SSE-NEXT:    [[TMP16:%.*]] = xor i8 [[A]], [[C]]
+; SSE-NEXT:    store i8 [[TMP16]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 15)
+; SSE-NEXT:    ret void
+;
+; AVX-LABEL: @splat(
+; AVX-NEXT:    [[TMP1:%.*]] = insertelement <16 x i8> undef, i8 [[C:%.*]], i32 0
+; AVX-NEXT:    [[TMP2:%.*]] = insertelement <16 x i8> [[TMP1]], i8 [[C]], i32 1
+; AVX-NEXT:    [[TMP3:%.*]] = insertelement <16 x i8> [[TMP2]], i8 [[C]], i32 2
+; AVX-NEXT:    [[TMP4:%.*]] = insertelement <16 x i8> [[TMP3]], i8 [[C]], i32 3
+; AVX-NEXT:    [[TMP5:%.*]] = insertelement <16 x i8> [[TMP4]], i8 [[C]], i32 4
+; AVX-NEXT:    [[TMP6:%.*]] = insertelement <16 x i8> [[TMP5]], i8 [[C]], i32 5
+; AVX-NEXT:    [[TMP7:%.*]] = insertelement <16 x i8> [[TMP6]], i8 [[C]], i32 6
+; AVX-NEXT:    [[TMP8:%.*]] = insertelement <16 x i8> [[TMP7]], i8 [[C]], i32 7
+; AVX-NEXT:    [[TMP9:%.*]] = insertelement <16 x i8> [[TMP8]], i8 [[C]], i32 8
+; AVX-NEXT:    [[TMP10:%.*]] = insertelement <16 x i8> [[TMP9]], i8 [[C]], i32 9
+; AVX-NEXT:    [[TMP11:%.*]] = insertelement <16 x i8> [[TMP10]], i8 [[C]], i32 10
+; AVX-NEXT:    [[TMP12:%.*]] = insertelement <16 x i8> [[TMP11]], i8 [[C]], i32 11
+; AVX-NEXT:    [[TMP13:%.*]] = insertelement <16 x i8> [[TMP12]], i8 [[C]], i32 12
+; AVX-NEXT:    [[TMP14:%.*]] = insertelement <16 x i8> [[TMP13]], i8 [[C]], i32 13
+; AVX-NEXT:    [[TMP15:%.*]] = insertelement <16 x i8> [[TMP14]], i8 [[C]], i32 14
+; AVX-NEXT:    [[TMP16:%.*]] = insertelement <16 x i8> [[TMP15]], i8 [[C]], i32 15
+; AVX-NEXT:    [[TMP17:%.*]] = insertelement <2 x i8> undef, i8 [[A:%.*]], i32 0
+; AVX-NEXT:    [[TMP18:%.*]] = insertelement <2 x i8> [[TMP17]], i8 [[B:%.*]], i32 1
+; AVX-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i8> [[TMP18]], <2 x i8> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+; AVX-NEXT:    [[TMP19:%.*]] = xor <16 x i8> [[TMP16]], [[SHUFFLE]]
+; AVX-NEXT:    store <16 x i8> [[TMP19]], <16 x i8>* bitcast ([32 x i8]* @cle to <16 x i8>*), align 16
+; AVX-NEXT:    ret void
 ;
   %1 = xor i8 %c, %a
   store i8 %1, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 0), align 16
@@ -83,22 +115,37 @@ define void @splat(i8 %a, i8 %b, i8 %c) {
 ; leveraging the commutativity property of `xor`.
 
 define void @same_opcode_on_one_side(i32 %a, i32 %b, i32 %c) {
-; CHECK-LABEL: @same_opcode_on_one_side(
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> undef, i32 [[C:%.*]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[C]], i32 1
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[C]], i32 2
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[C]], i32 3
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x i32> undef, i32 [[A:%.*]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[A]], i32 1
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[A]], i32 2
-; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[A]], i32 3
-; CHECK-NEXT:    [[TMP9:%.*]] = add <4 x i32> [[TMP4]], [[TMP8]]
-; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[B:%.*]], i32 1
-; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[C]], i32 2
-; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[A]], i32 3
-; CHECK-NEXT:    [[TMP13:%.*]] = xor <4 x i32> [[TMP9]], [[TMP12]]
-; CHECK-NEXT:    store <4 x i32> [[TMP13]], <4 x i32>* bitcast ([32 x i32]* @cle32 to <4 x i32>*), align 16
-; CHECK-NEXT:    ret void
+; SSE-LABEL: @same_opcode_on_one_side(
+; SSE-NEXT:    [[ADD1:%.*]] = add i32 [[C:%.*]], [[A:%.*]]
+; SSE-NEXT:    [[ADD2:%.*]] = add i32 [[C]], [[A]]
+; SSE-NEXT:    [[ADD3:%.*]] = add i32 [[A]], [[C]]
+; SSE-NEXT:    [[ADD4:%.*]] = add i32 [[C]], [[A]]
+; SSE-NEXT:    [[TMP1:%.*]] = xor i32 [[ADD1]], [[A]]
+; SSE-NEXT:    store i32 [[TMP1]], i32* getelementptr inbounds ([32 x i32], [32 x i32]* @cle32, i64 0, i64 0), align 16
+; SSE-NEXT:    [[TMP2:%.*]] = xor i32 [[B:%.*]], [[ADD2]]
+; SSE-NEXT:    store i32 [[TMP2]], i32* getelementptr inbounds ([32 x i32], [32 x i32]* @cle32, i64 0, i64 1)
+; SSE-NEXT:    [[TMP3:%.*]] = xor i32 [[C]], [[ADD3]]
+; SSE-NEXT:    store i32 [[TMP3]], i32* getelementptr inbounds ([32 x i32], [32 x i32]* @cle32, i64 0, i64 2)
+; SSE-NEXT:    [[TMP4:%.*]] = xor i32 [[A]], [[ADD4]]
+; SSE-NEXT:    store i32 [[TMP4]], i32* getelementptr inbounds ([32 x i32], [32 x i32]* @cle32, i64 0, i64 3)
+; SSE-NEXT:    ret void
+;
+; AVX-LABEL: @same_opcode_on_one_side(
+; AVX-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> undef, i32 [[C:%.*]], i32 0
+; AVX-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[C]], i32 1
+; AVX-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[C]], i32 2
+; AVX-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[C]], i32 3
+; AVX-NEXT:    [[TMP5:%.*]] = insertelement <4 x i32> undef, i32 [[A:%.*]], i32 0
+; AVX-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[A]], i32 1
+; AVX-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[A]], i32 2
+; AVX-NEXT:    [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[A]], i32 3
+; AVX-NEXT:    [[TMP9:%.*]] = add <4 x i32> [[TMP4]], [[TMP8]]
+; AVX-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[B:%.*]], i32 1
+; AVX-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[C]], i32 2
+; AVX-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[A]], i32 3
+; AVX-NEXT:    [[TMP13:%.*]] = xor <4 x i32> [[TMP9]], [[TMP12]]
+; AVX-NEXT:    store <4 x i32> [[TMP13]], <4 x i32>* bitcast ([32 x i32]* @cle32 to <4 x i32>*), align 16
+; AVX-NEXT:    ret void
 ;
   %add1 = add i32 %c, %a
   %add2 = add i32 %c, %a

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll b/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll
index 71f72a93075b..ae79a7eb0e80 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll
@@ -221,14 +221,10 @@ define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b) {
 
 define <8 x float> @test_v8f32(<8 x float> %a, <8 x float> %b) {
 ; SSE-LABEL: @test_v8f32(
-; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
-; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
-; SSE-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
-; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
-; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
-; SSE-NEXT:    [[TMP6:%.*]] = fadd <4 x float> [[TMP4]], [[TMP5]]
-; SSE-NEXT:    [[R07:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SSE-NEXT:    ret <8 x float> [[R07]]
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
+; SSE-NEXT:    [[TMP3:%.*]] = fadd <8 x float> [[TMP1]], [[TMP2]]
+; SSE-NEXT:    ret <8 x float> [[TMP3]]
 ;
 ; SLM-LABEL: @test_v8f32(
 ; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
@@ -288,33 +284,11 @@ define <8 x float> @test_v8f32(<8 x float> %a, <8 x float> %b) {
 }
 
 define <4 x i64> @test_v4i64(<4 x i64> %a, <4 x i64> %b) {
-; SSE-LABEL: @test_v4i64(
-; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32> <i32 0, i32 4>
-; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 1, i32 5>
-; SSE-NEXT:    [[TMP3:%.*]] = add <2 x i64> [[TMP1]], [[TMP2]]
-; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 2, i32 6>
-; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 3, i32 7>
-; SSE-NEXT:    [[TMP6:%.*]] = add <2 x i64> [[TMP4]], [[TMP5]]
-; SSE-NEXT:    [[R03:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSE-NEXT:    ret <4 x i64> [[R03]]
-;
-; SLM-LABEL: @test_v4i64(
-; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-; SLM-NEXT:    [[TMP3:%.*]] = add <4 x i64> [[TMP1]], [[TMP2]]
-; SLM-NEXT:    ret <4 x i64> [[TMP3]]
-;
-; AVX-LABEL: @test_v4i64(
-; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-; AVX-NEXT:    [[TMP3:%.*]] = add <4 x i64> [[TMP1]], [[TMP2]]
-; AVX-NEXT:    ret <4 x i64> [[TMP3]]
-;
-; AVX512-LABEL: @test_v4i64(
-; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-; AVX512-NEXT:    [[TMP3:%.*]] = add <4 x i64> [[TMP1]], [[TMP2]]
-; AVX512-NEXT:    ret <4 x i64> [[TMP3]]
+; CHECK-LABEL: @test_v4i64(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+; CHECK-NEXT:    [[TMP3:%.*]] = add <4 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x i64> [[TMP3]]
 ;
   %a0 = extractelement <4 x i64> %a, i32 0
   %a1 = extractelement <4 x i64> %a, i32 1
@@ -336,33 +310,11 @@ define <4 x i64> @test_v4i64(<4 x i64> %a, <4 x i64> %b) {
 }
 
 define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) {
-; SSE-LABEL: @test_v8i32(
-; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
-; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
-; SSE-NEXT:    [[TMP3:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]]
-; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
-; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
-; SSE-NEXT:    [[TMP6:%.*]] = add <4 x i32> [[TMP4]], [[TMP5]]
-; SSE-NEXT:    [[R07:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SSE-NEXT:    ret <8 x i32> [[R07]]
-;
-; SLM-LABEL: @test_v8i32(
-; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
-; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
-; SLM-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[TMP1]], [[TMP2]]
-; SLM-NEXT:    ret <8 x i32> [[TMP3]]
-;
-; AVX-LABEL: @test_v8i32(
-; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
-; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
-; AVX-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[TMP1]], [[TMP2]]
-; AVX-NEXT:    ret <8 x i32> [[TMP3]]
-;
-; AVX512-LABEL: @test_v8i32(
-; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
-; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
-; AVX512-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[TMP1]], [[TMP2]]
-; AVX512-NEXT:    ret <8 x i32> [[TMP3]]
+; CHECK-LABEL: @test_v8i32(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
+; CHECK-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <8 x i32> [[TMP3]]
 ;
   %a0 = extractelement <8 x i32> %a, i32 0
   %a1 = extractelement <8 x i32> %a, i32 1

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll b/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll
index b7e487eed9eb..c1c2ac544b15 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll
@@ -221,14 +221,10 @@ define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b) {
 
 define <8 x float> @test_v8f32(<8 x float> %a, <8 x float> %b) {
 ; SSE-LABEL: @test_v8f32(
-; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
-; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
-; SSE-NEXT:    [[TMP3:%.*]] = fsub <4 x float> [[TMP1]], [[TMP2]]
-; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
-; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
-; SSE-NEXT:    [[TMP6:%.*]] = fsub <4 x float> [[TMP4]], [[TMP5]]
-; SSE-NEXT:    [[R07:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SSE-NEXT:    ret <8 x float> [[R07]]
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
+; SSE-NEXT:    [[TMP3:%.*]] = fsub <8 x float> [[TMP1]], [[TMP2]]
+; SSE-NEXT:    ret <8 x float> [[TMP3]]
 ;
 ; SLM-LABEL: @test_v8f32(
 ; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
@@ -288,33 +284,11 @@ define <8 x float> @test_v8f32(<8 x float> %a, <8 x float> %b) {
 }
 
 define <4 x i64> @test_v4i64(<4 x i64> %a, <4 x i64> %b) {
-; SSE-LABEL: @test_v4i64(
-; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32> <i32 0, i32 4>
-; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 1, i32 5>
-; SSE-NEXT:    [[TMP3:%.*]] = sub <2 x i64> [[TMP1]], [[TMP2]]
-; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 2, i32 6>
-; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 3, i32 7>
-; SSE-NEXT:    [[TMP6:%.*]] = sub <2 x i64> [[TMP4]], [[TMP5]]
-; SSE-NEXT:    [[R03:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSE-NEXT:    ret <4 x i64> [[R03]]
-;
-; SLM-LABEL: @test_v4i64(
-; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-; SLM-NEXT:    [[TMP3:%.*]] = sub <4 x i64> [[TMP1]], [[TMP2]]
-; SLM-NEXT:    ret <4 x i64> [[TMP3]]
-;
-; AVX-LABEL: @test_v4i64(
-; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-; AVX-NEXT:    [[TMP3:%.*]] = sub <4 x i64> [[TMP1]], [[TMP2]]
-; AVX-NEXT:    ret <4 x i64> [[TMP3]]
-;
-; AVX512-LABEL: @test_v4i64(
-; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-; AVX512-NEXT:    [[TMP3:%.*]] = sub <4 x i64> [[TMP1]], [[TMP2]]
-; AVX512-NEXT:    ret <4 x i64> [[TMP3]]
+; CHECK-LABEL: @test_v4i64(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+; CHECK-NEXT:    [[TMP3:%.*]] = sub <4 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x i64> [[TMP3]]
 ;
   %a0 = extractelement <4 x i64> %a, i32 0
   %a1 = extractelement <4 x i64> %a, i32 1
@@ -336,33 +310,11 @@ define <4 x i64> @test_v4i64(<4 x i64> %a, <4 x i64> %b) {
 }
 
 define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) {
-; SSE-LABEL: @test_v8i32(
-; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
-; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
-; SSE-NEXT:    [[TMP3:%.*]] = sub <4 x i32> [[TMP1]], [[TMP2]]
-; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
-; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
-; SSE-NEXT:    [[TMP6:%.*]] = sub <4 x i32> [[TMP4]], [[TMP5]]
-; SSE-NEXT:    [[R07:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SSE-NEXT:    ret <8 x i32> [[R07]]
-;
-; SLM-LABEL: @test_v8i32(
-; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
-; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
-; SLM-NEXT:    [[TMP3:%.*]] = sub <8 x i32> [[TMP1]], [[TMP2]]
-; SLM-NEXT:    ret <8 x i32> [[TMP3]]
-;
-; AVX-LABEL: @test_v8i32(
-; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
-; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
-; AVX-NEXT:    [[TMP3:%.*]] = sub <8 x i32> [[TMP1]], [[TMP2]]
-; AVX-NEXT:    ret <8 x i32> [[TMP3]]
-;
-; AVX512-LABEL: @test_v8i32(
-; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
-; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
-; AVX512-NEXT:    [[TMP3:%.*]] = sub <8 x i32> [[TMP1]], [[TMP2]]
-; AVX512-NEXT:    ret <8 x i32> [[TMP3]]
+; CHECK-LABEL: @test_v8i32(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
+; CHECK-NEXT:    [[TMP3:%.*]] = sub <8 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <8 x i32> [[TMP3]]
 ;
   %a0 = extractelement <8 x i32> %a, i32 0
   %a1 = extractelement <8 x i32> %a, i32 1

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
index 711347e62368..632bec58d5c0 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
@@ -214,77 +214,44 @@ define <4 x float> @simple_select_users(<4 x float> %a, <4 x float> %b, <4 x i32
 
 ; Unused insertelement
 define <4 x float> @simple_select_no_users(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
-; BIGTHRESH-LABEL: @simple_select_no_users(
-; BIGTHRESH-NEXT:    [[C0:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 0
-; BIGTHRESH-NEXT:    [[C1:%.*]] = extractelement <4 x i32> [[C]], i32 1
-; BIGTHRESH-NEXT:    [[C2:%.*]] = extractelement <4 x i32> [[C]], i32 2
-; BIGTHRESH-NEXT:    [[C3:%.*]] = extractelement <4 x i32> [[C]], i32 3
-; BIGTHRESH-NEXT:    [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
-; BIGTHRESH-NEXT:    [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
-; BIGTHRESH-NEXT:    [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2
-; BIGTHRESH-NEXT:    [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3
-; BIGTHRESH-NEXT:    [[B0:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0
-; BIGTHRESH-NEXT:    [[B1:%.*]] = extractelement <4 x float> [[B]], i32 1
-; BIGTHRESH-NEXT:    [[B2:%.*]] = extractelement <4 x float> [[B]], i32 2
-; BIGTHRESH-NEXT:    [[B3:%.*]] = extractelement <4 x float> [[B]], i32 3
-; BIGTHRESH-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> undef, i32 [[C0]], i32 0
-; BIGTHRESH-NEXT:    [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[C1]], i32 1
-; BIGTHRESH-NEXT:    [[TMP3:%.*]] = icmp ne <2 x i32> [[TMP2]], zeroinitializer
-; BIGTHRESH-NEXT:    [[TMP4:%.*]] = insertelement <2 x i32> undef, i32 [[C2]], i32 0
-; BIGTHRESH-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> [[TMP4]], i32 [[C3]], i32 1
-; BIGTHRESH-NEXT:    [[TMP6:%.*]] = icmp ne <2 x i32> [[TMP5]], zeroinitializer
-; BIGTHRESH-NEXT:    [[TMP7:%.*]] = insertelement <2 x float> undef, float [[A0]], i32 0
-; BIGTHRESH-NEXT:    [[TMP8:%.*]] = insertelement <2 x float> [[TMP7]], float [[A1]], i32 1
-; BIGTHRESH-NEXT:    [[TMP9:%.*]] = insertelement <2 x float> undef, float [[B0]], i32 0
-; BIGTHRESH-NEXT:    [[TMP10:%.*]] = insertelement <2 x float> [[TMP9]], float [[B1]], i32 1
-; BIGTHRESH-NEXT:    [[TMP11:%.*]] = select <2 x i1> [[TMP3]], <2 x float> [[TMP8]], <2 x float> [[TMP10]]
-; BIGTHRESH-NEXT:    [[TMP12:%.*]] = insertelement <2 x float> undef, float [[A2]], i32 0
-; BIGTHRESH-NEXT:    [[TMP13:%.*]] = insertelement <2 x float> [[TMP12]], float [[A3]], i32 1
-; BIGTHRESH-NEXT:    [[TMP14:%.*]] = insertelement <2 x float> undef, float [[B2]], i32 0
-; BIGTHRESH-NEXT:    [[TMP15:%.*]] = insertelement <2 x float> [[TMP14]], float [[B3]], i32 1
-; BIGTHRESH-NEXT:    [[TMP16:%.*]] = select <2 x i1> [[TMP6]], <2 x float> [[TMP13]], <2 x float> [[TMP15]]
-; BIGTHRESH-NEXT:    [[TMP17:%.*]] = extractelement <2 x float> [[TMP11]], i32 0
-; BIGTHRESH-NEXT:    [[RA:%.*]] = insertelement <4 x float> undef, float [[TMP17]], i32 0
-; BIGTHRESH-NEXT:    [[TMP18:%.*]] = extractelement <2 x float> [[TMP11]], i32 1
-; BIGTHRESH-NEXT:    [[RB:%.*]] = insertelement <4 x float> [[RA]], float [[TMP18]], i32 1
-; BIGTHRESH-NEXT:    [[TMP19:%.*]] = extractelement <2 x float> [[TMP16]], i32 0
-; BIGTHRESH-NEXT:    [[RC:%.*]] = insertelement <4 x float> undef, float [[TMP19]], i32 2
-; BIGTHRESH-NEXT:    [[TMP20:%.*]] = extractelement <2 x float> [[TMP16]], i32 1
-; BIGTHRESH-NEXT:    [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[TMP20]], i32 3
-; BIGTHRESH-NEXT:    ret <4 x float> [[RD]]
-;
-; ZEROTHRESH-LABEL: @simple_select_no_users(
-; ZEROTHRESH-NEXT:    [[C0:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 0
-; ZEROTHRESH-NEXT:    [[C1:%.*]] = extractelement <4 x i32> [[C]], i32 1
-; ZEROTHRESH-NEXT:    [[C2:%.*]] = extractelement <4 x i32> [[C]], i32 2
-; ZEROTHRESH-NEXT:    [[C3:%.*]] = extractelement <4 x i32> [[C]], i32 3
-; ZEROTHRESH-NEXT:    [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
-; ZEROTHRESH-NEXT:    [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
-; ZEROTHRESH-NEXT:    [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2
-; ZEROTHRESH-NEXT:    [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3
-; ZEROTHRESH-NEXT:    [[B0:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0
-; ZEROTHRESH-NEXT:    [[B1:%.*]] = extractelement <4 x float> [[B]], i32 1
-; ZEROTHRESH-NEXT:    [[B2:%.*]] = extractelement <4 x float> [[B]], i32 2
-; ZEROTHRESH-NEXT:    [[B3:%.*]] = extractelement <4 x float> [[B]], i32 3
-; ZEROTHRESH-NEXT:    [[CMP0:%.*]] = icmp ne i32 [[C0]], 0
-; ZEROTHRESH-NEXT:    [[CMP1:%.*]] = icmp ne i32 [[C1]], 0
-; ZEROTHRESH-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> undef, i32 [[C2]], i32 0
-; ZEROTHRESH-NEXT:    [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[C3]], i32 1
-; ZEROTHRESH-NEXT:    [[TMP3:%.*]] = icmp ne <2 x i32> [[TMP2]], zeroinitializer
-; ZEROTHRESH-NEXT:    [[S0:%.*]] = select i1 [[CMP0]], float [[A0]], float [[B0]]
-; ZEROTHRESH-NEXT:    [[S1:%.*]] = select i1 [[CMP1]], float [[A1]], float [[B1]]
-; ZEROTHRESH-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> undef, float [[A2]], i32 0
-; ZEROTHRESH-NEXT:    [[TMP5:%.*]] = insertelement <2 x float> [[TMP4]], float [[A3]], i32 1
-; ZEROTHRESH-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> undef, float [[B2]], i32 0
-; ZEROTHRESH-NEXT:    [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[B3]], i32 1
-; ZEROTHRESH-NEXT:    [[TMP8:%.*]] = select <2 x i1> [[TMP3]], <2 x float> [[TMP5]], <2 x float> [[TMP7]]
-; ZEROTHRESH-NEXT:    [[RA:%.*]] = insertelement <4 x float> undef, float [[S0]], i32 0
-; ZEROTHRESH-NEXT:    [[RB:%.*]] = insertelement <4 x float> [[RA]], float [[S1]], i32 1
-; ZEROTHRESH-NEXT:    [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0
-; ZEROTHRESH-NEXT:    [[RC:%.*]] = insertelement <4 x float> undef, float [[TMP9]], i32 2
-; ZEROTHRESH-NEXT:    [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1
-; ZEROTHRESH-NEXT:    [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[TMP10]], i32 3
-; ZEROTHRESH-NEXT:    ret <4 x float> [[RD]]
+; ANY-LABEL: @simple_select_no_users(
+; ANY-NEXT:    [[C0:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 0
+; ANY-NEXT:    [[C1:%.*]] = extractelement <4 x i32> [[C]], i32 1
+; ANY-NEXT:    [[C2:%.*]] = extractelement <4 x i32> [[C]], i32 2
+; ANY-NEXT:    [[C3:%.*]] = extractelement <4 x i32> [[C]], i32 3
+; ANY-NEXT:    [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
+; ANY-NEXT:    [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
+; ANY-NEXT:    [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2
+; ANY-NEXT:    [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3
+; ANY-NEXT:    [[B0:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0
+; ANY-NEXT:    [[B1:%.*]] = extractelement <4 x float> [[B]], i32 1
+; ANY-NEXT:    [[B2:%.*]] = extractelement <4 x float> [[B]], i32 2
+; ANY-NEXT:    [[B3:%.*]] = extractelement <4 x float> [[B]], i32 3
+; ANY-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> undef, i32 [[C0]], i32 0
+; ANY-NEXT:    [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[C1]], i32 1
+; ANY-NEXT:    [[TMP3:%.*]] = icmp ne <2 x i32> [[TMP2]], zeroinitializer
+; ANY-NEXT:    [[TMP4:%.*]] = insertelement <2 x i32> undef, i32 [[C2]], i32 0
+; ANY-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> [[TMP4]], i32 [[C3]], i32 1
+; ANY-NEXT:    [[TMP6:%.*]] = icmp ne <2 x i32> [[TMP5]], zeroinitializer
+; ANY-NEXT:    [[TMP7:%.*]] = insertelement <2 x float> undef, float [[A0]], i32 0
+; ANY-NEXT:    [[TMP8:%.*]] = insertelement <2 x float> [[TMP7]], float [[A1]], i32 1
+; ANY-NEXT:    [[TMP9:%.*]] = insertelement <2 x float> undef, float [[B0]], i32 0
+; ANY-NEXT:    [[TMP10:%.*]] = insertelement <2 x float> [[TMP9]], float [[B1]], i32 1
+; ANY-NEXT:    [[TMP11:%.*]] = select <2 x i1> [[TMP3]], <2 x float> [[TMP8]], <2 x float> [[TMP10]]
+; ANY-NEXT:    [[TMP12:%.*]] = insertelement <2 x float> undef, float [[A2]], i32 0
+; ANY-NEXT:    [[TMP13:%.*]] = insertelement <2 x float> [[TMP12]], float [[A3]], i32 1
+; ANY-NEXT:    [[TMP14:%.*]] = insertelement <2 x float> undef, float [[B2]], i32 0
+; ANY-NEXT:    [[TMP15:%.*]] = insertelement <2 x float> [[TMP14]], float [[B3]], i32 1
+; ANY-NEXT:    [[TMP16:%.*]] = select <2 x i1> [[TMP6]], <2 x float> [[TMP13]], <2 x float> [[TMP15]]
+; ANY-NEXT:    [[TMP17:%.*]] = extractelement <2 x float> [[TMP11]], i32 0
+; ANY-NEXT:    [[RA:%.*]] = insertelement <4 x float> undef, float [[TMP17]], i32 0
+; ANY-NEXT:    [[TMP18:%.*]] = extractelement <2 x float> [[TMP11]], i32 1
+; ANY-NEXT:    [[RB:%.*]] = insertelement <4 x float> [[RA]], float [[TMP18]], i32 1
+; ANY-NEXT:    [[TMP19:%.*]] = extractelement <2 x float> [[TMP16]], i32 0
+; ANY-NEXT:    [[RC:%.*]] = insertelement <4 x float> undef, float [[TMP19]], i32 2
+; ANY-NEXT:    [[TMP20:%.*]] = extractelement <2 x float> [[TMP16]], i32 1
+; ANY-NEXT:    [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[TMP20]], i32 3
+; ANY-NEXT:    ret <4 x float> [[RD]]
 ;
   %c0 = extractelement <4 x i32> %c, i32 0
   %c1 = extractelement <4 x i32> %c, i32 1

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll b/llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll
index 4241cb922ea2..1a8d2bcec55f 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll
@@ -1,5 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -slp-threshold=-6 -slp-vectorizer -instcombine < %s | FileCheck %s
+; RUN: opt < %s -slp-threshold=-6 -slp-vectorizer -instcombine -mattr=+sse2 -S | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -slp-threshold=-6 -slp-vectorizer -instcombine -mattr=+avx  -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s -slp-threshold=-6 -slp-vectorizer -instcombine -mattr=+avx2 -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
@@ -11,21 +13,34 @@ target triple = "x86_64-unknown-linux-gnu"
 ; zero-extend the roots back to their original sizes.
 ;
 define i8 @PR31243_zext(i8 %v0, i8 %v1, i8 %v2, i8 %v3, i8* %ptr) {
-; CHECK-LABEL: @PR31243_zext(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i8> undef, i8 [[V0:%.*]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i8> [[TMP0]], i8 [[V1:%.*]], i32 1
-; CHECK-NEXT:    [[TMP2:%.*]] = or <2 x i8> [[TMP1]], <i8 1, i8 1>
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i8> [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = zext i8 [[TMP3]] to i64
-; CHECK-NEXT:    [[TMPE4:%.*]] = getelementptr inbounds i8, i8* [[PTR:%.*]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i8> [[TMP2]], i32 1
-; CHECK-NEXT:    [[TMP6:%.*]] = zext i8 [[TMP5]] to i64
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[PTR]], i64 [[TMP6]]
-; CHECK-NEXT:    [[TMP6:%.*]] = load i8, i8* [[TMPE4]], align 1
-; CHECK-NEXT:    [[TMP7:%.*]] = load i8, i8* [[TMP5]], align 1
-; CHECK-NEXT:    [[TMP8:%.*]] = add i8 [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    ret i8 [[TMP8]]
+; SSE-LABEL: @PR31243_zext(
+; SSE-NEXT:  entry:
+; SSE-NEXT:    [[TMP0:%.*]] = or i8 [[V0:%.*]], 1
+; SSE-NEXT:    [[TMP1:%.*]] = or i8 [[V1:%.*]], 1
+; SSE-NEXT:    [[TMP2:%.*]] = zext i8 [[TMP0]] to i64
+; SSE-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[PTR:%.*]], i64 [[TMP2]]
+; SSE-NEXT:    [[TMP3:%.*]] = zext i8 [[TMP1]] to i64
+; SSE-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[PTR]], i64 [[TMP3]]
+; SSE-NEXT:    [[TMP6:%.*]] = load i8, i8* [[TMP4]], align 1
+; SSE-NEXT:    [[TMP7:%.*]] = load i8, i8* [[TMP5]], align 1
+; SSE-NEXT:    [[TMP8:%.*]] = add i8 [[TMP6]], [[TMP7]]
+; SSE-NEXT:    ret i8 [[TMP8]]
+;
+; AVX-LABEL: @PR31243_zext(
+; AVX-NEXT:  entry:
+; AVX-NEXT:    [[TMP0:%.*]] = insertelement <2 x i8> undef, i8 [[V0:%.*]], i32 0
+; AVX-NEXT:    [[TMP1:%.*]] = insertelement <2 x i8> [[TMP0]], i8 [[V1:%.*]], i32 1
+; AVX-NEXT:    [[TMP2:%.*]] = or <2 x i8> [[TMP1]], <i8 1, i8 1>
+; AVX-NEXT:    [[TMP3:%.*]] = extractelement <2 x i8> [[TMP2]], i32 0
+; AVX-NEXT:    [[TMP4:%.*]] = zext i8 [[TMP3]] to i64
+; AVX-NEXT:    [[TMPE4:%.*]] = getelementptr inbounds i8, i8* [[PTR:%.*]], i64 [[TMP4]]
+; AVX-NEXT:    [[TMP5:%.*]] = extractelement <2 x i8> [[TMP2]], i32 1
+; AVX-NEXT:    [[TMP6:%.*]] = zext i8 [[TMP5]] to i64
+; AVX-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[PTR]], i64 [[TMP6]]
+; AVX-NEXT:    [[TMP6:%.*]] = load i8, i8* [[TMPE4]], align 1
+; AVX-NEXT:    [[TMP7:%.*]] = load i8, i8* [[TMP5]], align 1
+; AVX-NEXT:    [[TMP8:%.*]] = add i8 [[TMP6]], [[TMP7]]
+; AVX-NEXT:    ret i8 [[TMP8]]
 ;
 entry:
   %tmp0 = zext i8 %v0 to i32
@@ -54,22 +69,35 @@ entry:
 ;        ensure correctness.
 ;
 define i8 @PR31243_sext(i8 %v0, i8 %v1, i8 %v2, i8 %v3, i8* %ptr) {
-; CHECK-LABEL: @PR31243_sext(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i8> undef, i8 [[V0:%.*]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i8> [[TMP0]], i8 [[V1:%.*]], i32 1
-; CHECK-NEXT:    [[TMP2:%.*]] = or <2 x i8> [[TMP1]], <i8 1, i8 1>
-; CHECK-NEXT:    [[TMP3:%.*]] = sext <2 x i8> [[TMP2]] to <2 x i16>
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i16> [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = sext i16 [[TMP4]] to i64
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[PTR:%.*]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i16> [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP7:%.*]] = sext i16 [[TMP6]] to i64
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[PTR]], i64 [[TMP7]]
-; CHECK-NEXT:    [[TMP6:%.*]] = load i8, i8* [[TMP4]], align 1
-; CHECK-NEXT:    [[TMP7:%.*]] = load i8, i8* [[TMP5]], align 1
-; CHECK-NEXT:    [[TMP8:%.*]] = add i8 [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    ret i8 [[TMP8]]
+; SSE-LABEL: @PR31243_sext(
+; SSE-NEXT:  entry:
+; SSE-NEXT:    [[TMP0:%.*]] = or i8 [[V0:%.*]], 1
+; SSE-NEXT:    [[TMP1:%.*]] = or i8 [[V1:%.*]], 1
+; SSE-NEXT:    [[TMP2:%.*]] = sext i8 [[TMP0]] to i64
+; SSE-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[PTR:%.*]], i64 [[TMP2]]
+; SSE-NEXT:    [[TMP3:%.*]] = sext i8 [[TMP1]] to i64
+; SSE-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[PTR]], i64 [[TMP3]]
+; SSE-NEXT:    [[TMP6:%.*]] = load i8, i8* [[TMP4]], align 1
+; SSE-NEXT:    [[TMP7:%.*]] = load i8, i8* [[TMP5]], align 1
+; SSE-NEXT:    [[TMP8:%.*]] = add i8 [[TMP6]], [[TMP7]]
+; SSE-NEXT:    ret i8 [[TMP8]]
+;
+; AVX-LABEL: @PR31243_sext(
+; AVX-NEXT:  entry:
+; AVX-NEXT:    [[TMP0:%.*]] = insertelement <2 x i8> undef, i8 [[V0:%.*]], i32 0
+; AVX-NEXT:    [[TMP1:%.*]] = insertelement <2 x i8> [[TMP0]], i8 [[V1:%.*]], i32 1
+; AVX-NEXT:    [[TMP2:%.*]] = or <2 x i8> [[TMP1]], <i8 1, i8 1>
+; AVX-NEXT:    [[TMP3:%.*]] = sext <2 x i8> [[TMP2]] to <2 x i16>
+; AVX-NEXT:    [[TMP4:%.*]] = extractelement <2 x i16> [[TMP3]], i32 0
+; AVX-NEXT:    [[TMP5:%.*]] = sext i16 [[TMP4]] to i64
+; AVX-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[PTR:%.*]], i64 [[TMP5]]
+; AVX-NEXT:    [[TMP6:%.*]] = extractelement <2 x i16> [[TMP3]], i32 1
+; AVX-NEXT:    [[TMP7:%.*]] = sext i16 [[TMP6]] to i64
+; AVX-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[PTR]], i64 [[TMP7]]
+; AVX-NEXT:    [[TMP6:%.*]] = load i8, i8* [[TMP4]], align 1
+; AVX-NEXT:    [[TMP7:%.*]] = load i8, i8* [[TMP5]], align 1
+; AVX-NEXT:    [[TMP8:%.*]] = add i8 [[TMP6]], [[TMP7]]
+; AVX-NEXT:    ret i8 [[TMP8]]
 ;
 entry:
   %tmp0 = sext i8 %v0 to i32

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll
index 2c53b1afbd2a..e8dc343af6b3 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll
@@ -1,5 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -slp-vectorizer -slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -slp-vectorizer -slp-vectorizer -mattr=+sse2 -S | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -slp-vectorizer -slp-vectorizer -mattr=+avx  -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -slp-vectorizer -slp-vectorizer -mattr=+avx2 -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
 
 %class.1 = type { %class.2 }
 %class.2 = type { %"class.3" }
@@ -10,23 +12,40 @@ $_ZN1C10SwitchModeEv = comdat any
 
 ; Function Attrs: uwtable
 define void @_ZN1C10SwitchModeEv() local_unnamed_addr #0 comdat align 2 {
-; CHECK-LABEL: @_ZN1C10SwitchModeEv(
-; CHECK-NEXT:  for.body.lr.ph.i:
-; CHECK-NEXT:    [[OR_1:%.*]] = or i64 undef, 1
-; CHECK-NEXT:    store i64 [[OR_1]], i64* undef, align 8
-; CHECK-NEXT:    [[FOO_1:%.*]] = getelementptr inbounds [[CLASS_1:%.*]], %class.1* undef, i64 0, i32 0, i32 0, i32 0, i32 0, i64 0
-; CHECK-NEXT:    [[FOO_2:%.*]] = getelementptr inbounds [[CLASS_1]], %class.1* undef, i64 0, i32 0, i32 0, i32 0, i32 0, i64 1
-; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i64* [[FOO_1]] to <2 x i64>*
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[TMP0]], align 8
-; CHECK-NEXT:    [[BAR5:%.*]] = load i64, i64* undef, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i64> undef, i64 [[OR_1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[BAR5]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = and <2 x i64> [[TMP3]], [[TMP1]]
-; CHECK-NEXT:    [[BAR3:%.*]] = getelementptr inbounds [[CLASS_2:%.*]], %class.2* undef, i64 0, i32 0, i32 0, i32 0, i64 0
-; CHECK-NEXT:    [[BAR4:%.*]] = getelementptr inbounds [[CLASS_2]], %class.2* undef, i64 0, i32 0, i32 0, i32 0, i64 1
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i64* [[BAR3]] to <2 x i64>*
-; CHECK-NEXT:    store <2 x i64> [[TMP4]], <2 x i64>* [[TMP5]], align 8
-; CHECK-NEXT:    ret void
+; SSE-LABEL: @_ZN1C10SwitchModeEv(
+; SSE-NEXT:  for.body.lr.ph.i:
+; SSE-NEXT:    [[OR_1:%.*]] = or i64 undef, 1
+; SSE-NEXT:    store i64 [[OR_1]], i64* undef, align 8
+; SSE-NEXT:    [[FOO_1:%.*]] = getelementptr inbounds [[CLASS_1:%.*]], %class.1* undef, i64 0, i32 0, i32 0, i32 0, i32 0, i64 0
+; SSE-NEXT:    [[FOO_3:%.*]] = load i64, i64* [[FOO_1]], align 8
+; SSE-NEXT:    [[FOO_2:%.*]] = getelementptr inbounds [[CLASS_1]], %class.1* undef, i64 0, i32 0, i32 0, i32 0, i32 0, i64 1
+; SSE-NEXT:    [[FOO_4:%.*]] = load i64, i64* [[FOO_2]], align 8
+; SSE-NEXT:    [[BAR5:%.*]] = load i64, i64* undef, align 8
+; SSE-NEXT:    [[AND_2:%.*]] = and i64 [[OR_1]], [[FOO_3]]
+; SSE-NEXT:    [[AND_1:%.*]] = and i64 [[BAR5]], [[FOO_4]]
+; SSE-NEXT:    [[BAR3:%.*]] = getelementptr inbounds [[CLASS_2:%.*]], %class.2* undef, i64 0, i32 0, i32 0, i32 0, i64 0
+; SSE-NEXT:    store i64 [[AND_2]], i64* [[BAR3]], align 8
+; SSE-NEXT:    [[BAR4:%.*]] = getelementptr inbounds [[CLASS_2]], %class.2* undef, i64 0, i32 0, i32 0, i32 0, i64 1
+; SSE-NEXT:    store i64 [[AND_1]], i64* [[BAR4]], align 8
+; SSE-NEXT:    ret void
+;
+; AVX-LABEL: @_ZN1C10SwitchModeEv(
+; AVX-NEXT:  for.body.lr.ph.i:
+; AVX-NEXT:    [[OR_1:%.*]] = or i64 undef, 1
+; AVX-NEXT:    store i64 [[OR_1]], i64* undef, align 8
+; AVX-NEXT:    [[FOO_1:%.*]] = getelementptr inbounds [[CLASS_1:%.*]], %class.1* undef, i64 0, i32 0, i32 0, i32 0, i32 0, i64 0
+; AVX-NEXT:    [[FOO_2:%.*]] = getelementptr inbounds [[CLASS_1]], %class.1* undef, i64 0, i32 0, i32 0, i32 0, i32 0, i64 1
+; AVX-NEXT:    [[TMP0:%.*]] = bitcast i64* [[FOO_1]] to <2 x i64>*
+; AVX-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[TMP0]], align 8
+; AVX-NEXT:    [[BAR5:%.*]] = load i64, i64* undef, align 8
+; AVX-NEXT:    [[TMP2:%.*]] = insertelement <2 x i64> undef, i64 [[OR_1]], i32 0
+; AVX-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[BAR5]], i32 1
+; AVX-NEXT:    [[TMP4:%.*]] = and <2 x i64> [[TMP3]], [[TMP1]]
+; AVX-NEXT:    [[BAR3:%.*]] = getelementptr inbounds [[CLASS_2:%.*]], %class.2* undef, i64 0, i32 0, i32 0, i32 0, i64 0
+; AVX-NEXT:    [[BAR4:%.*]] = getelementptr inbounds [[CLASS_2]], %class.2* undef, i64 0, i32 0, i32 0, i32 0, i64 1
+; AVX-NEXT:    [[TMP5:%.*]] = bitcast i64* [[BAR3]] to <2 x i64>*
+; AVX-NEXT:    store <2 x i64> [[TMP4]], <2 x i64>* [[TMP5]], align 8
+; AVX-NEXT:    ret void
 ;
 for.body.lr.ph.i:
   %or.1 = or i64 undef, 1
@@ -47,34 +66,65 @@ for.body.lr.ph.i:
 
 ; Function Attrs: norecurse nounwind uwtable
 define void @pr35497() local_unnamed_addr #0 {
-; CHECK-LABEL: @pr35497(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i64, i64* undef, align 1
-; CHECK-NEXT:    [[ADD:%.*]] = add i64 undef, undef
-; CHECK-NEXT:    store i64 [[ADD]], i64* undef, align 1
-; CHECK-NEXT:    [[ARRAYIDX2_1:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 5
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i64> undef, i64 [[TMP0]], i32 1
-; CHECK-NEXT:    [[TMP2:%.*]] = shl <2 x i64> [[TMP1]], <i64 2, i64 2>
-; CHECK-NEXT:    [[TMP3:%.*]] = and <2 x i64> [[TMP2]], <i64 20, i64 20>
-; CHECK-NEXT:    [[ARRAYIDX2_2:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 4
-; CHECK-NEXT:    [[TMP4:%.*]] = add nuw nsw <2 x i64> [[TMP3]], zeroinitializer
-; CHECK-NEXT:    [[ARRAYIDX2_5:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 1
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> undef, i64 [[TMP5]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x i64> [[TMP6]], i64 [[ADD]], i32 1
-; CHECK-NEXT:    [[TMP8:%.*]] = shl <2 x i64> [[TMP7]], <i64 2, i64 2>
-; CHECK-NEXT:    [[TMP9:%.*]] = and <2 x i64> [[TMP8]], <i64 20, i64 20>
-; CHECK-NEXT:    [[ARRAYIDX2_6:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 0
-; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i64* [[ARRAYIDX2_6]] to <2 x i64>*
-; CHECK-NEXT:    store <2 x i64> [[TMP4]], <2 x i64>* [[TMP10]], align 1
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0
-; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <2 x i64> undef, i64 [[TMP11]], i32 0
-; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <2 x i64> [[TMP12]], i64 [[TMP5]], i32 1
-; CHECK-NEXT:    [[TMP14:%.*]] = lshr <2 x i64> [[TMP13]], <i64 6, i64 6>
-; CHECK-NEXT:    [[TMP15:%.*]] = add nuw nsw <2 x i64> [[TMP9]], [[TMP14]]
-; CHECK-NEXT:    [[TMP16:%.*]] = bitcast i64* [[ARRAYIDX2_2]] to <2 x i64>*
-; CHECK-NEXT:    store <2 x i64> [[TMP15]], <2 x i64>* [[TMP16]], align 1
-; CHECK-NEXT:    ret void
+; SSE-LABEL: @pr35497(
+; SSE-NEXT:  entry:
+; SSE-NEXT:    [[TMP0:%.*]] = load i64, i64* undef, align 1
+; SSE-NEXT:    [[AND:%.*]] = shl i64 [[TMP0]], 2
+; SSE-NEXT:    [[SHL:%.*]] = and i64 [[AND]], 20
+; SSE-NEXT:    [[ADD:%.*]] = add i64 undef, undef
+; SSE-NEXT:    store i64 [[ADD]], i64* undef, align 1
+; SSE-NEXT:    [[ARRAYIDX2_1:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 5
+; SSE-NEXT:    [[AND_1:%.*]] = shl i64 undef, 2
+; SSE-NEXT:    [[SHL_1:%.*]] = and i64 [[AND_1]], 20
+; SSE-NEXT:    [[SHR_1:%.*]] = lshr i64 undef, 6
+; SSE-NEXT:    [[ADD_1:%.*]] = add nuw nsw i64 [[SHL]], [[SHR_1]]
+; SSE-NEXT:    [[ARRAYIDX2_2:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 4
+; SSE-NEXT:    [[SHR_2:%.*]] = lshr i64 undef, 6
+; SSE-NEXT:    [[ADD_2:%.*]] = add nuw nsw i64 [[SHL_1]], [[SHR_2]]
+; SSE-NEXT:    [[AND_4:%.*]] = shl i64 [[ADD]], 2
+; SSE-NEXT:    [[SHL_4:%.*]] = and i64 [[AND_4]], 20
+; SSE-NEXT:    [[ARRAYIDX2_5:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 1
+; SSE-NEXT:    store i64 [[ADD_1]], i64* [[ARRAYIDX2_5]], align 1
+; SSE-NEXT:    [[AND_5:%.*]] = shl nuw nsw i64 [[ADD_1]], 2
+; SSE-NEXT:    [[SHL_5:%.*]] = and i64 [[AND_5]], 20
+; SSE-NEXT:    [[SHR_5:%.*]] = lshr i64 [[ADD_1]], 6
+; SSE-NEXT:    [[ADD_5:%.*]] = add nuw nsw i64 [[SHL_4]], [[SHR_5]]
+; SSE-NEXT:    store i64 [[ADD_5]], i64* [[ARRAYIDX2_1]], align 1
+; SSE-NEXT:    [[ARRAYIDX2_6:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 0
+; SSE-NEXT:    store i64 [[ADD_2]], i64* [[ARRAYIDX2_6]], align 1
+; SSE-NEXT:    [[SHR_6:%.*]] = lshr i64 [[ADD_2]], 6
+; SSE-NEXT:    [[ADD_6:%.*]] = add nuw nsw i64 [[SHL_5]], [[SHR_6]]
+; SSE-NEXT:    store i64 [[ADD_6]], i64* [[ARRAYIDX2_2]], align 1
+; SSE-NEXT:    ret void
+;
+; AVX-LABEL: @pr35497(
+; AVX-NEXT:  entry:
+; AVX-NEXT:    [[TMP0:%.*]] = load i64, i64* undef, align 1
+; AVX-NEXT:    [[ADD:%.*]] = add i64 undef, undef
+; AVX-NEXT:    store i64 [[ADD]], i64* undef, align 1
+; AVX-NEXT:    [[ARRAYIDX2_1:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 5
+; AVX-NEXT:    [[TMP1:%.*]] = insertelement <2 x i64> undef, i64 [[TMP0]], i32 1
+; AVX-NEXT:    [[TMP2:%.*]] = shl <2 x i64> [[TMP1]], <i64 2, i64 2>
+; AVX-NEXT:    [[TMP3:%.*]] = and <2 x i64> [[TMP2]], <i64 20, i64 20>
+; AVX-NEXT:    [[ARRAYIDX2_2:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 4
+; AVX-NEXT:    [[TMP4:%.*]] = add nuw nsw <2 x i64> [[TMP3]], zeroinitializer
+; AVX-NEXT:    [[ARRAYIDX2_5:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 1
+; AVX-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1
+; AVX-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> undef, i64 [[TMP5]], i32 0
+; AVX-NEXT:    [[TMP7:%.*]] = insertelement <2 x i64> [[TMP6]], i64 [[ADD]], i32 1
+; AVX-NEXT:    [[TMP8:%.*]] = shl <2 x i64> [[TMP7]], <i64 2, i64 2>
+; AVX-NEXT:    [[TMP9:%.*]] = and <2 x i64> [[TMP8]], <i64 20, i64 20>
+; AVX-NEXT:    [[ARRAYIDX2_6:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 0
+; AVX-NEXT:    [[TMP10:%.*]] = bitcast i64* [[ARRAYIDX2_6]] to <2 x i64>*
+; AVX-NEXT:    store <2 x i64> [[TMP4]], <2 x i64>* [[TMP10]], align 1
+; AVX-NEXT:    [[TMP11:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0
+; AVX-NEXT:    [[TMP12:%.*]] = insertelement <2 x i64> undef, i64 [[TMP11]], i32 0
+; AVX-NEXT:    [[TMP13:%.*]] = insertelement <2 x i64> [[TMP12]], i64 [[TMP5]], i32 1
+; AVX-NEXT:    [[TMP14:%.*]] = lshr <2 x i64> [[TMP13]], <i64 6, i64 6>
+; AVX-NEXT:    [[TMP15:%.*]] = add nuw nsw <2 x i64> [[TMP9]], [[TMP14]]
+; AVX-NEXT:    [[TMP16:%.*]] = bitcast i64* [[ARRAYIDX2_2]] to <2 x i64>*
+; AVX-NEXT:    store <2 x i64> [[TMP15]], <2 x i64>* [[TMP16]], align 1
+; AVX-NEXT:    ret void
 ;
 entry:
   %0 = load i64, i64* undef, align 1

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/remark_horcost.ll b/llvm/test/Transforms/SLPVectorizer/X86/remark_horcost.ll
index 2d71b6efed07..ff46b4bbe7c0 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/remark_horcost.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/remark_horcost.ll
@@ -110,7 +110,7 @@ for.body:                                         ; preds = %for.body, %entry
   ; YAML-NEXT: Function:        foo
   ; YAML-NEXT: Args:
   ; YAML-NEXT:   - String:          'Stores SLP vectorized with cost '
-  ; YAML-NEXT:   - Cost:            '-8'
+  ; YAML-NEXT:   - Cost:            '-5'
   ; YAML-NEXT:   - String:          ' and with tree size '
   ; YAML-NEXT:   - TreeSize:        '4'
 
@@ -120,7 +120,7 @@ for.body:                                         ; preds = %for.body, %entry
   ; YAML-NEXT: Function:        foo
   ; YAML-NEXT: Args:
   ; YAML-NEXT:   - String:          'Vectorized horizontal reduction with cost '
-  ; YAML-NEXT:   - Cost:            '-4'
+  ; YAML-NEXT:   - Cost:            '-7'
   ; YAML-NEXT:   - String:          ' and with tree size '
   ; YAML-NEXT:   - TreeSize:        '1'
 

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/resched.ll b/llvm/test/Transforms/SLPVectorizer/X86/resched.ll
index 28bc95e2f4ca..50b4e8a1c404 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/resched.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/resched.ll
@@ -12,70 +12,84 @@ define fastcc void @_ZN12_GLOBAL__N_127PolynomialMultiplyRecognize9recognizeEv()
 ; CHECK-NEXT:    [[SUB_I:%.*]] = add nsw i32 undef, -1
 ; CHECK-NEXT:    [[CONV31_I:%.*]] = and i32 undef, [[SUB_I]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 0
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc i32 [[SUB_I]] to i8
+; CHECK-NEXT:    [[CONV_I_I1199:%.*]] = and i8 [[TMP1]], 1
+; CHECK-NEXT:    store i8 [[CONV_I_I1199]], i8* [[TMP0]], align 1
+; CHECK-NEXT:    [[SHR_I_I:%.*]] = lshr i32 [[CONV31_I]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[SHR_I_I]] to i8
+; CHECK-NEXT:    [[CONV_1_I_I:%.*]] = and i8 [[TMP2]], 1
 ; CHECK-NEXT:    [[ARRAYIDX_I_I7_1_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 1
+; CHECK-NEXT:    store i8 [[CONV_1_I_I]], i8* [[ARRAYIDX_I_I7_1_I_I]], align 1
+; CHECK-NEXT:    [[SHR_1_I_I:%.*]] = lshr i32 [[CONV31_I]], 2
+; CHECK-NEXT:    [[TMP3:%.*]] = trunc i32 [[SHR_1_I_I]] to i8
+; CHECK-NEXT:    [[CONV_2_I_I:%.*]] = and i8 [[TMP3]], 1
 ; CHECK-NEXT:    [[ARRAYIDX_I_I7_2_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 2
+; CHECK-NEXT:    store i8 [[CONV_2_I_I]], i8* [[ARRAYIDX_I_I7_2_I_I]], align 1
+; CHECK-NEXT:    [[SHR_2_I_I:%.*]] = lshr i32 [[CONV31_I]], 3
+; CHECK-NEXT:    [[TMP4:%.*]] = trunc i32 [[SHR_2_I_I]] to i8
+; CHECK-NEXT:    [[CONV_3_I_I:%.*]] = and i8 [[TMP4]], 1
 ; CHECK-NEXT:    [[ARRAYIDX_I_I7_3_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 3
+; CHECK-NEXT:    store i8 [[CONV_3_I_I]], i8* [[ARRAYIDX_I_I7_3_I_I]], align 1
+; CHECK-NEXT:    [[SHR_3_I_I:%.*]] = lshr i32 [[CONV31_I]], 4
+; CHECK-NEXT:    [[TMP5:%.*]] = trunc i32 [[SHR_3_I_I]] to i8
+; CHECK-NEXT:    [[CONV_4_I_I:%.*]] = and i8 [[TMP5]], 1
 ; CHECK-NEXT:    [[ARRAYIDX_I_I7_4_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 4
+; CHECK-NEXT:    store i8 [[CONV_4_I_I]], i8* [[ARRAYIDX_I_I7_4_I_I]], align 1
+; CHECK-NEXT:    [[SHR_4_I_I:%.*]] = lshr i32 [[CONV31_I]], 5
+; CHECK-NEXT:    [[TMP6:%.*]] = trunc i32 [[SHR_4_I_I]] to i8
+; CHECK-NEXT:    [[CONV_5_I_I:%.*]] = and i8 [[TMP6]], 1
 ; CHECK-NEXT:    [[ARRAYIDX_I_I7_5_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 5
+; CHECK-NEXT:    store i8 [[CONV_5_I_I]], i8* [[ARRAYIDX_I_I7_5_I_I]], align 1
+; CHECK-NEXT:    [[SHR_5_I_I:%.*]] = lshr i32 [[CONV31_I]], 6
+; CHECK-NEXT:    [[TMP7:%.*]] = trunc i32 [[SHR_5_I_I]] to i8
+; CHECK-NEXT:    [[CONV_6_I_I:%.*]] = and i8 [[TMP7]], 1
 ; CHECK-NEXT:    [[ARRAYIDX_I_I7_6_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 6
+; CHECK-NEXT:    store i8 [[CONV_6_I_I]], i8* [[ARRAYIDX_I_I7_6_I_I]], align 1
+; CHECK-NEXT:    [[SHR_6_I_I:%.*]] = lshr i32 [[CONV31_I]], 7
+; CHECK-NEXT:    [[TMP8:%.*]] = trunc i32 [[SHR_6_I_I]] to i8
+; CHECK-NEXT:    [[CONV_7_I_I:%.*]] = and i8 [[TMP8]], 1
 ; CHECK-NEXT:    [[ARRAYIDX_I_I7_7_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 7
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x i32> undef, i32 [[CONV31_I]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[CONV31_I]], i32 1
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[CONV31_I]], i32 2
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[CONV31_I]], i32 3
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[CONV31_I]], i32 4
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[CONV31_I]], i32 5
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[CONV31_I]], i32 6
-; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[CONV31_I]], i32 7
-; CHECK-NEXT:    [[TMP9:%.*]] = lshr <8 x i32> [[TMP8]], <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+; CHECK-NEXT:    store i8 [[CONV_7_I_I]], i8* [[ARRAYIDX_I_I7_7_I_I]], align 1
+; CHECK-NEXT:    [[SHR_7_I_I:%.*]] = lshr i32 [[CONV31_I]], 8
+; CHECK-NEXT:    [[TMP9:%.*]] = trunc i32 [[SHR_7_I_I]] to i8
+; CHECK-NEXT:    [[CONV_8_I_I:%.*]] = and i8 [[TMP9]], 1
 ; CHECK-NEXT:    [[ARRAYIDX_I_I7_8_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 8
+; CHECK-NEXT:    store i8 [[CONV_8_I_I]], i8* [[ARRAYIDX_I_I7_8_I_I]], align 1
+; CHECK-NEXT:    [[SHR_8_I_I:%.*]] = lshr i32 [[CONV31_I]], 9
+; CHECK-NEXT:    [[TMP10:%.*]] = trunc i32 [[SHR_8_I_I]] to i8
+; CHECK-NEXT:    [[CONV_9_I_I:%.*]] = and i8 [[TMP10]], 1
 ; CHECK-NEXT:    [[ARRAYIDX_I_I7_9_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 9
+; CHECK-NEXT:    store i8 [[CONV_9_I_I]], i8* [[ARRAYIDX_I_I7_9_I_I]], align 1
+; CHECK-NEXT:    [[SHR_9_I_I:%.*]] = lshr i32 [[CONV31_I]], 10
+; CHECK-NEXT:    [[TMP11:%.*]] = trunc i32 [[SHR_9_I_I]] to i8
+; CHECK-NEXT:    [[CONV_10_I_I:%.*]] = and i8 [[TMP11]], 1
 ; CHECK-NEXT:    [[ARRAYIDX_I_I7_10_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 10
+; CHECK-NEXT:    store i8 [[CONV_10_I_I]], i8* [[ARRAYIDX_I_I7_10_I_I]], align 1
+; CHECK-NEXT:    [[SHR_10_I_I:%.*]] = lshr i32 [[CONV31_I]], 11
+; CHECK-NEXT:    [[TMP12:%.*]] = trunc i32 [[SHR_10_I_I]] to i8
+; CHECK-NEXT:    [[CONV_11_I_I:%.*]] = and i8 [[TMP12]], 1
 ; CHECK-NEXT:    [[ARRAYIDX_I_I7_11_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 11
-; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> undef, i32 [[CONV31_I]], i32 0
-; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[CONV31_I]], i32 1
-; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[CONV31_I]], i32 2
-; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[CONV31_I]], i32 3
-; CHECK-NEXT:    [[TMP14:%.*]] = lshr <4 x i32> [[TMP13]], <i32 9, i32 10, i32 11, i32 12>
+; CHECK-NEXT:    store i8 [[CONV_11_I_I]], i8* [[ARRAYIDX_I_I7_11_I_I]], align 1
+; CHECK-NEXT:    [[SHR_11_I_I:%.*]] = lshr i32 [[CONV31_I]], 12
+; CHECK-NEXT:    [[TMP13:%.*]] = trunc i32 [[SHR_11_I_I]] to i8
+; CHECK-NEXT:    [[CONV_12_I_I:%.*]] = and i8 [[TMP13]], 1
 ; CHECK-NEXT:    [[ARRAYIDX_I_I7_12_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 12
+; CHECK-NEXT:    store i8 [[CONV_12_I_I]], i8* [[ARRAYIDX_I_I7_12_I_I]], align 1
 ; CHECK-NEXT:    [[SHR_12_I_I:%.*]] = lshr i32 [[CONV31_I]], 13
+; CHECK-NEXT:    [[TMP14:%.*]] = trunc i32 [[SHR_12_I_I]] to i8
+; CHECK-NEXT:    [[CONV_13_I_I:%.*]] = and i8 [[TMP14]], 1
 ; CHECK-NEXT:    [[ARRAYIDX_I_I7_13_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 13
+; CHECK-NEXT:    store i8 [[CONV_13_I_I]], i8* [[ARRAYIDX_I_I7_13_I_I]], align 1
 ; CHECK-NEXT:    [[SHR_13_I_I:%.*]] = lshr i32 [[CONV31_I]], 14
+; CHECK-NEXT:    [[TMP15:%.*]] = trunc i32 [[SHR_13_I_I]] to i8
+; CHECK-NEXT:    [[CONV_14_I_I:%.*]] = and i8 [[TMP15]], 1
 ; CHECK-NEXT:    [[ARRAYIDX_I_I7_14_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 14
+; CHECK-NEXT:    store i8 [[CONV_14_I_I]], i8* [[ARRAYIDX_I_I7_14_I_I]], align 1
 ; CHECK-NEXT:    [[SHR_14_I_I:%.*]] = lshr i32 [[CONV31_I]], 15
-; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <16 x i32> undef, i32 [[SUB_I]], i32 0
-; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <8 x i32> [[TMP9]], i32 0
-; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <16 x i32> [[TMP15]], i32 [[TMP16]], i32 1
-; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <8 x i32> [[TMP9]], i32 1
-; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <16 x i32> [[TMP17]], i32 [[TMP18]], i32 2
-; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <8 x i32> [[TMP9]], i32 2
-; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <16 x i32> [[TMP19]], i32 [[TMP20]], i32 3
-; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <8 x i32> [[TMP9]], i32 3
-; CHECK-NEXT:    [[TMP23:%.*]] = insertelement <16 x i32> [[TMP21]], i32 [[TMP22]], i32 4
-; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <8 x i32> [[TMP9]], i32 4
-; CHECK-NEXT:    [[TMP25:%.*]] = insertelement <16 x i32> [[TMP23]], i32 [[TMP24]], i32 5
-; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <8 x i32> [[TMP9]], i32 5
-; CHECK-NEXT:    [[TMP27:%.*]] = insertelement <16 x i32> [[TMP25]], i32 [[TMP26]], i32 6
-; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <8 x i32> [[TMP9]], i32 6
-; CHECK-NEXT:    [[TMP29:%.*]] = insertelement <16 x i32> [[TMP27]], i32 [[TMP28]], i32 7
-; CHECK-NEXT:    [[TMP30:%.*]] = extractelement <8 x i32> [[TMP9]], i32 7
-; CHECK-NEXT:    [[TMP31:%.*]] = insertelement <16 x i32> [[TMP29]], i32 [[TMP30]], i32 8
-; CHECK-NEXT:    [[TMP32:%.*]] = extractelement <4 x i32> [[TMP14]], i32 0
-; CHECK-NEXT:    [[TMP33:%.*]] = insertelement <16 x i32> [[TMP31]], i32 [[TMP32]], i32 9
-; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <4 x i32> [[TMP14]], i32 1
-; CHECK-NEXT:    [[TMP35:%.*]] = insertelement <16 x i32> [[TMP33]], i32 [[TMP34]], i32 10
-; CHECK-NEXT:    [[TMP36:%.*]] = extractelement <4 x i32> [[TMP14]], i32 2
-; CHECK-NEXT:    [[TMP37:%.*]] = insertelement <16 x i32> [[TMP35]], i32 [[TMP36]], i32 11
-; CHECK-NEXT:    [[TMP38:%.*]] = extractelement <4 x i32> [[TMP14]], i32 3
-; CHECK-NEXT:    [[TMP39:%.*]] = insertelement <16 x i32> [[TMP37]], i32 [[TMP38]], i32 12
-; CHECK-NEXT:    [[TMP40:%.*]] = insertelement <16 x i32> [[TMP39]], i32 [[SHR_12_I_I]], i32 13
-; CHECK-NEXT:    [[TMP41:%.*]] = insertelement <16 x i32> [[TMP40]], i32 [[SHR_13_I_I]], i32 14
-; CHECK-NEXT:    [[TMP42:%.*]] = insertelement <16 x i32> [[TMP41]], i32 [[SHR_14_I_I]], i32 15
-; CHECK-NEXT:    [[TMP43:%.*]] = trunc <16 x i32> [[TMP42]] to <16 x i8>
-; CHECK-NEXT:    [[TMP44:%.*]] = and <16 x i8> [[TMP43]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+; CHECK-NEXT:    [[TMP16:%.*]] = trunc i32 [[SHR_14_I_I]] to i8
+; CHECK-NEXT:    [[CONV_15_I_I:%.*]] = and i8 [[TMP16]], 1
 ; CHECK-NEXT:    [[ARRAYIDX_I_I7_15_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 15
-; CHECK-NEXT:    [[TMP45:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
-; CHECK-NEXT:    store <16 x i8> [[TMP44]], <16 x i8>* [[TMP45]], align 1
+; CHECK-NEXT:    store i8 [[CONV_15_I_I]], i8* [[ARRAYIDX_I_I7_15_I_I]], align 1
 ; CHECK-NEXT:    unreachable
 ; CHECK:       if.end50.i:
 ; CHECK-NEXT:    ret void

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/sext.ll b/llvm/test/Transforms/SLPVectorizer/X86/sext.ll
index c3eba4701e99..69459788fbf2 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/sext.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/sext.ll
@@ -43,23 +43,40 @@ define <2 x i64> @loadext_2i8_to_2i64(i8* %p0) {
 }
 
 define <4 x i32> @loadext_4i8_to_4i32(i8* %p0) {
-; SSE-LABEL: @loadext_4i8_to_4i32(
-; SSE-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
-; SSE-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
-; SSE-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
-; SSE-NEXT:    [[I0:%.*]] = load i8, i8* [[P0]], align 1
-; SSE-NEXT:    [[I1:%.*]] = load i8, i8* [[P1]], align 1
-; SSE-NEXT:    [[I2:%.*]] = load i8, i8* [[P2]], align 1
-; SSE-NEXT:    [[I3:%.*]] = load i8, i8* [[P3]], align 1
-; SSE-NEXT:    [[X0:%.*]] = sext i8 [[I0]] to i32
-; SSE-NEXT:    [[X1:%.*]] = sext i8 [[I1]] to i32
-; SSE-NEXT:    [[X2:%.*]] = sext i8 [[I2]] to i32
-; SSE-NEXT:    [[X3:%.*]] = sext i8 [[I3]] to i32
-; SSE-NEXT:    [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[X0]], i32 0
-; SSE-NEXT:    [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[X1]], i32 1
-; SSE-NEXT:    [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[X2]], i32 2
-; SSE-NEXT:    [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[X3]], i32 3
-; SSE-NEXT:    ret <4 x i32> [[V3]]
+; SSE2-LABEL: @loadext_4i8_to_4i32(
+; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; SSE2-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; SSE2-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; SSE2-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
+; SSE2-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
+; SSE2-NEXT:    [[TMP3:%.*]] = sext <4 x i8> [[TMP2]] to <4 x i32>
+; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
+; SSE2-NEXT:    [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
+; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
+; SSE2-NEXT:    [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1
+; SSE2-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
+; SSE2-NEXT:    [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2
+; SSE2-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
+; SSE2-NEXT:    [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3
+; SSE2-NEXT:    ret <4 x i32> [[V3]]
+;
+; SLM-LABEL: @loadext_4i8_to_4i32(
+; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; SLM-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; SLM-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; SLM-NEXT:    [[I0:%.*]] = load i8, i8* [[P0]], align 1
+; SLM-NEXT:    [[I1:%.*]] = load i8, i8* [[P1]], align 1
+; SLM-NEXT:    [[I2:%.*]] = load i8, i8* [[P2]], align 1
+; SLM-NEXT:    [[I3:%.*]] = load i8, i8* [[P3]], align 1
+; SLM-NEXT:    [[X0:%.*]] = sext i8 [[I0]] to i32
+; SLM-NEXT:    [[X1:%.*]] = sext i8 [[I1]] to i32
+; SLM-NEXT:    [[X2:%.*]] = sext i8 [[I2]] to i32
+; SLM-NEXT:    [[X3:%.*]] = sext i8 [[I3]] to i32
+; SLM-NEXT:    [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[X0]], i32 0
+; SLM-NEXT:    [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[X1]], i32 1
+; SLM-NEXT:    [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[X2]], i32 2
+; SLM-NEXT:    [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[X3]], i32 3
+; SLM-NEXT:    ret <4 x i32> [[V3]]
 ;
 ; AVX-LABEL: @loadext_4i8_to_4i32(
 ; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
@@ -115,58 +132,22 @@ define <4 x i64> @loadext_4i8_to_4i64(i8* %p0) {
 ; SSE-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
 ; SSE-NEXT:    ret <4 x i64> [[V3]]
 ;
-; AVX1-LABEL: @loadext_4i8_to_4i64(
-; AVX1-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
-; AVX1-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
-; AVX1-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
-; AVX1-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <2 x i8>*
-; AVX1-NEXT:    [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 1
-; AVX1-NEXT:    [[I2:%.*]] = load i8, i8* [[P2]], align 1
-; AVX1-NEXT:    [[I3:%.*]] = load i8, i8* [[P3]], align 1
-; AVX1-NEXT:    [[TMP3:%.*]] = sext <2 x i8> [[TMP2]] to <2 x i64>
-; AVX1-NEXT:    [[X2:%.*]] = sext i8 [[I2]] to i64
-; AVX1-NEXT:    [[X3:%.*]] = sext i8 [[I3]] to i64
-; AVX1-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
-; AVX1-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
-; AVX1-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
-; AVX1-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
-; AVX1-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
-; AVX1-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
-; AVX1-NEXT:    ret <4 x i64> [[V3]]
-;
-; AVX2-LABEL: @loadext_4i8_to_4i64(
-; AVX2-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
-; AVX2-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
-; AVX2-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
-; AVX2-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
-; AVX2-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
-; AVX2-NEXT:    [[TMP3:%.*]] = sext <4 x i8> [[TMP2]] to <4 x i64>
-; AVX2-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
-; AVX2-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
-; AVX2-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
-; AVX2-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
-; AVX2-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
-; AVX2-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
-; AVX2-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
-; AVX2-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
-; AVX2-NEXT:    ret <4 x i64> [[V3]]
-;
-; AVX512-LABEL: @loadext_4i8_to_4i64(
-; AVX512-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
-; AVX512-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
-; AVX512-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
-; AVX512-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
-; AVX512-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
-; AVX512-NEXT:    [[TMP3:%.*]] = sext <4 x i8> [[TMP2]] to <4 x i64>
-; AVX512-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
-; AVX512-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
-; AVX512-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
-; AVX512-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
-; AVX512-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
-; AVX512-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
-; AVX512-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
-; AVX512-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
-; AVX512-NEXT:    ret <4 x i64> [[V3]]
+; AVX-LABEL: @loadext_4i8_to_4i64(
+; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; AVX-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; AVX-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; AVX-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
+; AVX-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
+; AVX-NEXT:    [[TMP3:%.*]] = sext <4 x i8> [[TMP2]] to <4 x i64>
+; AVX-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
+; AVX-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
+; AVX-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
+; AVX-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
+; AVX-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
+; AVX-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
+; AVX-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
+; AVX-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
+; AVX-NEXT:    ret <4 x i64> [[V3]]
 ;
   %p1 = getelementptr inbounds i8, i8* %p0, i64 1
   %p2 = getelementptr inbounds i8, i8* %p0, i64 2
@@ -805,58 +786,22 @@ define <4 x i64> @loadext_4i16_to_4i64(i16* %p0) {
 ; SSE-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
 ; SSE-NEXT:    ret <4 x i64> [[V3]]
 ;
-; AVX1-LABEL: @loadext_4i16_to_4i64(
-; AVX1-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
-; AVX1-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
-; AVX1-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
-; AVX1-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <2 x i16>*
-; AVX1-NEXT:    [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[TMP1]], align 1
-; AVX1-NEXT:    [[I2:%.*]] = load i16, i16* [[P2]], align 1
-; AVX1-NEXT:    [[I3:%.*]] = load i16, i16* [[P3]], align 1
-; AVX1-NEXT:    [[TMP3:%.*]] = sext <2 x i16> [[TMP2]] to <2 x i64>
-; AVX1-NEXT:    [[X2:%.*]] = sext i16 [[I2]] to i64
-; AVX1-NEXT:    [[X3:%.*]] = sext i16 [[I3]] to i64
-; AVX1-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
-; AVX1-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
-; AVX1-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
-; AVX1-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
-; AVX1-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
-; AVX1-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
-; AVX1-NEXT:    ret <4 x i64> [[V3]]
-;
-; AVX2-LABEL: @loadext_4i16_to_4i64(
-; AVX2-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
-; AVX2-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
-; AVX2-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
-; AVX2-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>*
-; AVX2-NEXT:    [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1
-; AVX2-NEXT:    [[TMP3:%.*]] = sext <4 x i16> [[TMP2]] to <4 x i64>
-; AVX2-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
-; AVX2-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
-; AVX2-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
-; AVX2-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
-; AVX2-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
-; AVX2-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
-; AVX2-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
-; AVX2-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
-; AVX2-NEXT:    ret <4 x i64> [[V3]]
-;
-; AVX512-LABEL: @loadext_4i16_to_4i64(
-; AVX512-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
-; AVX512-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
-; AVX512-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
-; AVX512-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>*
-; AVX512-NEXT:    [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1
-; AVX512-NEXT:    [[TMP3:%.*]] = sext <4 x i16> [[TMP2]] to <4 x i64>
-; AVX512-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
-; AVX512-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
-; AVX512-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
-; AVX512-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
-; AVX512-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
-; AVX512-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
-; AVX512-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
-; AVX512-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
-; AVX512-NEXT:    ret <4 x i64> [[V3]]
+; AVX-LABEL: @loadext_4i16_to_4i64(
+; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; AVX-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
+; AVX-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
+; AVX-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>*
+; AVX-NEXT:    [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1
+; AVX-NEXT:    [[TMP3:%.*]] = sext <4 x i16> [[TMP2]] to <4 x i64>
+; AVX-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
+; AVX-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
+; AVX-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
+; AVX-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
+; AVX-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
+; AVX-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
+; AVX-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
+; AVX-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
+; AVX-NEXT:    ret <4 x i64> [[V3]]
 ;
   %p1 = getelementptr inbounds i16, i16* %p0, i64 1
   %p2 = getelementptr inbounds i16, i16* %p0, i64 2
@@ -1058,58 +1003,22 @@ define <4 x i64> @loadext_4i32_to_4i64(i32* %p0) {
 ; SSE-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
 ; SSE-NEXT:    ret <4 x i64> [[V3]]
 ;
-; AVX1-LABEL: @loadext_4i32_to_4i64(
-; AVX1-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
-; AVX1-NEXT:    [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2
-; AVX1-NEXT:    [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3
-; AVX1-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P0]] to <2 x i32>*
-; AVX1-NEXT:    [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 1
-; AVX1-NEXT:    [[I2:%.*]] = load i32, i32* [[P2]], align 1
-; AVX1-NEXT:    [[I3:%.*]] = load i32, i32* [[P3]], align 1
-; AVX1-NEXT:    [[TMP3:%.*]] = sext <2 x i32> [[TMP2]] to <2 x i64>
-; AVX1-NEXT:    [[X2:%.*]] = sext i32 [[I2]] to i64
-; AVX1-NEXT:    [[X3:%.*]] = sext i32 [[I3]] to i64
-; AVX1-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
-; AVX1-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
-; AVX1-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
-; AVX1-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
-; AVX1-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
-; AVX1-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
-; AVX1-NEXT:    ret <4 x i64> [[V3]]
-;
-; AVX2-LABEL: @loadext_4i32_to_4i64(
-; AVX2-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
-; AVX2-NEXT:    [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2
-; AVX2-NEXT:    [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3
-; AVX2-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P0]] to <4 x i32>*
-; AVX2-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1
-; AVX2-NEXT:    [[TMP3:%.*]] = sext <4 x i32> [[TMP2]] to <4 x i64>
-; AVX2-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
-; AVX2-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
-; AVX2-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
-; AVX2-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
-; AVX2-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
-; AVX2-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
-; AVX2-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
-; AVX2-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
-; AVX2-NEXT:    ret <4 x i64> [[V3]]
-;
-; AVX512-LABEL: @loadext_4i32_to_4i64(
-; AVX512-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
-; AVX512-NEXT:    [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2
-; AVX512-NEXT:    [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3
-; AVX512-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P0]] to <4 x i32>*
-; AVX512-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1
-; AVX512-NEXT:    [[TMP3:%.*]] = sext <4 x i32> [[TMP2]] to <4 x i64>
-; AVX512-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
-; AVX512-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
-; AVX512-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
-; AVX512-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
-; AVX512-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
-; AVX512-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
-; AVX512-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
-; AVX512-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
-; AVX512-NEXT:    ret <4 x i64> [[V3]]
+; AVX-LABEL: @loadext_4i32_to_4i64(
+; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
+; AVX-NEXT:    [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2
+; AVX-NEXT:    [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3
+; AVX-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P0]] to <4 x i32>*
+; AVX-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1
+; AVX-NEXT:    [[TMP3:%.*]] = sext <4 x i32> [[TMP2]] to <4 x i64>
+; AVX-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
+; AVX-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
+; AVX-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
+; AVX-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
+; AVX-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
+; AVX-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
+; AVX-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
+; AVX-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
+; AVX-NEXT:    ret <4 x i64> [[V3]]
 ;
   %p1 = getelementptr inbounds i32, i32* %p0, i64 1
   %p2 = getelementptr inbounds i32, i32* %p0, i64 2

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/zext.ll b/llvm/test/Transforms/SLPVectorizer/X86/zext.ll
index ead4ffdeb0fc..45e5d7034575 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/zext.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/zext.ll
@@ -11,15 +11,26 @@
 ;
 
 define <2 x i64> @loadext_2i8_to_2i64(i8* %p0) {
-; SSE-LABEL: @loadext_2i8_to_2i64(
-; SSE-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
-; SSE-NEXT:    [[I0:%.*]] = load i8, i8* [[P0]], align 1
-; SSE-NEXT:    [[I1:%.*]] = load i8, i8* [[P1]], align 1
-; SSE-NEXT:    [[X0:%.*]] = zext i8 [[I0]] to i64
-; SSE-NEXT:    [[X1:%.*]] = zext i8 [[I1]] to i64
-; SSE-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0
-; SSE-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1
-; SSE-NEXT:    ret <2 x i64> [[V1]]
+; SSE2-LABEL: @loadext_2i8_to_2i64(
+; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; SSE2-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <2 x i8>*
+; SSE2-NEXT:    [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 1
+; SSE2-NEXT:    [[TMP3:%.*]] = zext <2 x i8> [[TMP2]] to <2 x i64>
+; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
+; SSE2-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
+; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; SSE2-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
+; SSE2-NEXT:    ret <2 x i64> [[V1]]
+;
+; SLM-LABEL: @loadext_2i8_to_2i64(
+; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; SLM-NEXT:    [[I0:%.*]] = load i8, i8* [[P0]], align 1
+; SLM-NEXT:    [[I1:%.*]] = load i8, i8* [[P1]], align 1
+; SLM-NEXT:    [[X0:%.*]] = zext i8 [[I0]] to i64
+; SLM-NEXT:    [[X1:%.*]] = zext i8 [[I1]] to i64
+; SLM-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0
+; SLM-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1
+; SLM-NEXT:    ret <2 x i64> [[V1]]
 ;
 ; AVX-LABEL: @loadext_2i8_to_2i64(
 ; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
@@ -114,76 +125,57 @@ define <4 x i32> @loadext_4i8_to_4i32(i8* %p0) {
 }
 
 define <4 x i64> @loadext_4i8_to_4i64(i8* %p0) {
-; SSE-LABEL: @loadext_4i8_to_4i64(
-; SSE-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
-; SSE-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
-; SSE-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
-; SSE-NEXT:    [[I0:%.*]] = load i8, i8* [[P0]], align 1
-; SSE-NEXT:    [[I1:%.*]] = load i8, i8* [[P1]], align 1
-; SSE-NEXT:    [[I2:%.*]] = load i8, i8* [[P2]], align 1
-; SSE-NEXT:    [[I3:%.*]] = load i8, i8* [[P3]], align 1
-; SSE-NEXT:    [[X0:%.*]] = zext i8 [[I0]] to i64
-; SSE-NEXT:    [[X1:%.*]] = zext i8 [[I1]] to i64
-; SSE-NEXT:    [[X2:%.*]] = zext i8 [[I2]] to i64
-; SSE-NEXT:    [[X3:%.*]] = zext i8 [[I3]] to i64
-; SSE-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0
-; SSE-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1
-; SSE-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
-; SSE-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
-; SSE-NEXT:    ret <4 x i64> [[V3]]
-;
-; AVX1-LABEL: @loadext_4i8_to_4i64(
-; AVX1-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
-; AVX1-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
-; AVX1-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
-; AVX1-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <2 x i8>*
-; AVX1-NEXT:    [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 1
-; AVX1-NEXT:    [[I2:%.*]] = load i8, i8* [[P2]], align 1
-; AVX1-NEXT:    [[I3:%.*]] = load i8, i8* [[P3]], align 1
-; AVX1-NEXT:    [[TMP3:%.*]] = zext <2 x i8> [[TMP2]] to <2 x i64>
-; AVX1-NEXT:    [[X2:%.*]] = zext i8 [[I2]] to i64
-; AVX1-NEXT:    [[X3:%.*]] = zext i8 [[I3]] to i64
-; AVX1-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
-; AVX1-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
-; AVX1-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
-; AVX1-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
-; AVX1-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
-; AVX1-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
-; AVX1-NEXT:    ret <4 x i64> [[V3]]
+; SSE2-LABEL: @loadext_4i8_to_4i64(
+; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; SSE2-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; SSE2-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; SSE2-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
+; SSE2-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
+; SSE2-NEXT:    [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i64>
+; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
+; SSE2-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
+; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
+; SSE2-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
+; SSE2-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
+; SSE2-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
+; SSE2-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
+; SSE2-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
+; SSE2-NEXT:    ret <4 x i64> [[V3]]
 ;
-; AVX2-LABEL: @loadext_4i8_to_4i64(
-; AVX2-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
-; AVX2-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
-; AVX2-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
-; AVX2-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
-; AVX2-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
-; AVX2-NEXT:    [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i64>
-; AVX2-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
-; AVX2-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
-; AVX2-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
-; AVX2-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
-; AVX2-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
-; AVX2-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
-; AVX2-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
-; AVX2-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
-; AVX2-NEXT:    ret <4 x i64> [[V3]]
+; SLM-LABEL: @loadext_4i8_to_4i64(
+; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; SLM-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; SLM-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; SLM-NEXT:    [[I0:%.*]] = load i8, i8* [[P0]], align 1
+; SLM-NEXT:    [[I1:%.*]] = load i8, i8* [[P1]], align 1
+; SLM-NEXT:    [[I2:%.*]] = load i8, i8* [[P2]], align 1
+; SLM-NEXT:    [[I3:%.*]] = load i8, i8* [[P3]], align 1
+; SLM-NEXT:    [[X0:%.*]] = zext i8 [[I0]] to i64
+; SLM-NEXT:    [[X1:%.*]] = zext i8 [[I1]] to i64
+; SLM-NEXT:    [[X2:%.*]] = zext i8 [[I2]] to i64
+; SLM-NEXT:    [[X3:%.*]] = zext i8 [[I3]] to i64
+; SLM-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0
+; SLM-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1
+; SLM-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
+; SLM-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
+; SLM-NEXT:    ret <4 x i64> [[V3]]
 ;
-; AVX512-LABEL: @loadext_4i8_to_4i64(
-; AVX512-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
-; AVX512-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
-; AVX512-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
-; AVX512-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
-; AVX512-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
-; AVX512-NEXT:    [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i64>
-; AVX512-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
-; AVX512-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
-; AVX512-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
-; AVX512-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
-; AVX512-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
-; AVX512-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
-; AVX512-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
-; AVX512-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
-; AVX512-NEXT:    ret <4 x i64> [[V3]]
+; AVX-LABEL: @loadext_4i8_to_4i64(
+; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; AVX-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; AVX-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; AVX-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
+; AVX-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
+; AVX-NEXT:    [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i64>
+; AVX-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
+; AVX-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
+; AVX-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
+; AVX-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
+; AVX-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
+; AVX-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
+; AVX-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
+; AVX-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
+; AVX-NEXT:    ret <4 x i64> [[V3]]
 ;
   %p1 = getelementptr inbounds i8, i8* %p0, i64 1
   %p2 = getelementptr inbounds i8, i8* %p0, i64 2
@@ -701,15 +693,26 @@ define <16 x i16> @loadext_16i8_to_16i16(i8* %p0) {
 ;
 
 define <2 x i64> @loadext_2i16_to_2i64(i16* %p0) {
-; SSE-LABEL: @loadext_2i16_to_2i64(
-; SSE-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
-; SSE-NEXT:    [[I0:%.*]] = load i16, i16* [[P0]], align 1
-; SSE-NEXT:    [[I1:%.*]] = load i16, i16* [[P1]], align 1
-; SSE-NEXT:    [[X0:%.*]] = zext i16 [[I0]] to i64
-; SSE-NEXT:    [[X1:%.*]] = zext i16 [[I1]] to i64
-; SSE-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0
-; SSE-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1
-; SSE-NEXT:    ret <2 x i64> [[V1]]
+; SSE2-LABEL: @loadext_2i16_to_2i64(
+; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; SSE2-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <2 x i16>*
+; SSE2-NEXT:    [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[TMP1]], align 1
+; SSE2-NEXT:    [[TMP3:%.*]] = zext <2 x i16> [[TMP2]] to <2 x i64>
+; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
+; SSE2-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
+; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; SSE2-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
+; SSE2-NEXT:    ret <2 x i64> [[V1]]
+;
+; SLM-LABEL: @loadext_2i16_to_2i64(
+; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; SLM-NEXT:    [[I0:%.*]] = load i16, i16* [[P0]], align 1
+; SLM-NEXT:    [[I1:%.*]] = load i16, i16* [[P1]], align 1
+; SLM-NEXT:    [[X0:%.*]] = zext i16 [[I0]] to i64
+; SLM-NEXT:    [[X1:%.*]] = zext i16 [[I1]] to i64
+; SLM-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0
+; SLM-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1
+; SLM-NEXT:    ret <2 x i64> [[V1]]
 ;
 ; AVX-LABEL: @loadext_2i16_to_2i64(
 ; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
@@ -804,76 +807,57 @@ define <4 x i32> @loadext_4i16_to_4i32(i16* %p0) {
 }
 
 define <4 x i64> @loadext_4i16_to_4i64(i16* %p0) {
-; SSE-LABEL: @loadext_4i16_to_4i64(
-; SSE-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
-; SSE-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
-; SSE-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
-; SSE-NEXT:    [[I0:%.*]] = load i16, i16* [[P0]], align 1
-; SSE-NEXT:    [[I1:%.*]] = load i16, i16* [[P1]], align 1
-; SSE-NEXT:    [[I2:%.*]] = load i16, i16* [[P2]], align 1
-; SSE-NEXT:    [[I3:%.*]] = load i16, i16* [[P3]], align 1
-; SSE-NEXT:    [[X0:%.*]] = zext i16 [[I0]] to i64
-; SSE-NEXT:    [[X1:%.*]] = zext i16 [[I1]] to i64
-; SSE-NEXT:    [[X2:%.*]] = zext i16 [[I2]] to i64
-; SSE-NEXT:    [[X3:%.*]] = zext i16 [[I3]] to i64
-; SSE-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0
-; SSE-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1
-; SSE-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
-; SSE-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
-; SSE-NEXT:    ret <4 x i64> [[V3]]
-;
-; AVX1-LABEL: @loadext_4i16_to_4i64(
-; AVX1-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
-; AVX1-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
-; AVX1-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
-; AVX1-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <2 x i16>*
-; AVX1-NEXT:    [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[TMP1]], align 1
-; AVX1-NEXT:    [[I2:%.*]] = load i16, i16* [[P2]], align 1
-; AVX1-NEXT:    [[I3:%.*]] = load i16, i16* [[P3]], align 1
-; AVX1-NEXT:    [[TMP3:%.*]] = zext <2 x i16> [[TMP2]] to <2 x i64>
-; AVX1-NEXT:    [[X2:%.*]] = zext i16 [[I2]] to i64
-; AVX1-NEXT:    [[X3:%.*]] = zext i16 [[I3]] to i64
-; AVX1-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
-; AVX1-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
-; AVX1-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
-; AVX1-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
-; AVX1-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
-; AVX1-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
-; AVX1-NEXT:    ret <4 x i64> [[V3]]
+; SSE2-LABEL: @loadext_4i16_to_4i64(
+; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; SSE2-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
+; SSE2-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
+; SSE2-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>*
+; SSE2-NEXT:    [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1
+; SSE2-NEXT:    [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i64>
+; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
+; SSE2-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
+; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
+; SSE2-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
+; SSE2-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
+; SSE2-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
+; SSE2-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
+; SSE2-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
+; SSE2-NEXT:    ret <4 x i64> [[V3]]
 ;
-; AVX2-LABEL: @loadext_4i16_to_4i64(
-; AVX2-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
-; AVX2-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
-; AVX2-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
-; AVX2-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>*
-; AVX2-NEXT:    [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1
-; AVX2-NEXT:    [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i64>
-; AVX2-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
-; AVX2-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
-; AVX2-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
-; AVX2-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
-; AVX2-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
-; AVX2-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
-; AVX2-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
-; AVX2-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
-; AVX2-NEXT:    ret <4 x i64> [[V3]]
+; SLM-LABEL: @loadext_4i16_to_4i64(
+; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; SLM-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
+; SLM-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
+; SLM-NEXT:    [[I0:%.*]] = load i16, i16* [[P0]], align 1
+; SLM-NEXT:    [[I1:%.*]] = load i16, i16* [[P1]], align 1
+; SLM-NEXT:    [[I2:%.*]] = load i16, i16* [[P2]], align 1
+; SLM-NEXT:    [[I3:%.*]] = load i16, i16* [[P3]], align 1
+; SLM-NEXT:    [[X0:%.*]] = zext i16 [[I0]] to i64
+; SLM-NEXT:    [[X1:%.*]] = zext i16 [[I1]] to i64
+; SLM-NEXT:    [[X2:%.*]] = zext i16 [[I2]] to i64
+; SLM-NEXT:    [[X3:%.*]] = zext i16 [[I3]] to i64
+; SLM-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0
+; SLM-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1
+; SLM-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
+; SLM-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
+; SLM-NEXT:    ret <4 x i64> [[V3]]
 ;
-; AVX512-LABEL: @loadext_4i16_to_4i64(
-; AVX512-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
-; AVX512-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
-; AVX512-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
-; AVX512-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>*
-; AVX512-NEXT:    [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1
-; AVX512-NEXT:    [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i64>
-; AVX512-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
-; AVX512-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
-; AVX512-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
-; AVX512-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
-; AVX512-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
-; AVX512-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
-; AVX512-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
-; AVX512-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
-; AVX512-NEXT:    ret <4 x i64> [[V3]]
+; AVX-LABEL: @loadext_4i16_to_4i64(
+; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; AVX-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
+; AVX-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
+; AVX-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>*
+; AVX-NEXT:    [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1
+; AVX-NEXT:    [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i64>
+; AVX-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
+; AVX-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
+; AVX-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
+; AVX-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
+; AVX-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
+; AVX-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
+; AVX-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
+; AVX-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
+; AVX-NEXT:    ret <4 x i64> [[V3]]
 ;
   %p1 = getelementptr inbounds i16, i16* %p0, i64 1
   %p2 = getelementptr inbounds i16, i16* %p0, i64 2
@@ -1025,15 +1009,26 @@ define <8 x i32> @loadext_8i16_to_8i32(i16* %p0) {
 ;
 
 define <2 x i64> @loadext_2i32_to_2i64(i32* %p0) {
-; SSE-LABEL: @loadext_2i32_to_2i64(
-; SSE-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
-; SSE-NEXT:    [[I0:%.*]] = load i32, i32* [[P0]], align 1
-; SSE-NEXT:    [[I1:%.*]] = load i32, i32* [[P1]], align 1
-; SSE-NEXT:    [[X0:%.*]] = zext i32 [[I0]] to i64
-; SSE-NEXT:    [[X1:%.*]] = zext i32 [[I1]] to i64
-; SSE-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0
-; SSE-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1
-; SSE-NEXT:    ret <2 x i64> [[V1]]
+; SSE2-LABEL: @loadext_2i32_to_2i64(
+; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
+; SSE2-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P0]] to <2 x i32>*
+; SSE2-NEXT:    [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 1
+; SSE2-NEXT:    [[TMP3:%.*]] = zext <2 x i32> [[TMP2]] to <2 x i64>
+; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
+; SSE2-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
+; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; SSE2-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
+; SSE2-NEXT:    ret <2 x i64> [[V1]]
+;
+; SLM-LABEL: @loadext_2i32_to_2i64(
+; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
+; SLM-NEXT:    [[I0:%.*]] = load i32, i32* [[P0]], align 1
+; SLM-NEXT:    [[I1:%.*]] = load i32, i32* [[P1]], align 1
+; SLM-NEXT:    [[X0:%.*]] = zext i32 [[I0]] to i64
+; SLM-NEXT:    [[X1:%.*]] = zext i32 [[I1]] to i64
+; SLM-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0
+; SLM-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1
+; SLM-NEXT:    ret <2 x i64> [[V1]]
 ;
 ; AVX-LABEL: @loadext_2i32_to_2i64(
 ; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
@@ -1057,76 +1052,57 @@ define <2 x i64> @loadext_2i32_to_2i64(i32* %p0) {
 }
 
 define <4 x i64> @loadext_4i32_to_4i64(i32* %p0) {
-; SSE-LABEL: @loadext_4i32_to_4i64(
-; SSE-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
-; SSE-NEXT:    [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2
-; SSE-NEXT:    [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3
-; SSE-NEXT:    [[I0:%.*]] = load i32, i32* [[P0]], align 1
-; SSE-NEXT:    [[I1:%.*]] = load i32, i32* [[P1]], align 1
-; SSE-NEXT:    [[I2:%.*]] = load i32, i32* [[P2]], align 1
-; SSE-NEXT:    [[I3:%.*]] = load i32, i32* [[P3]], align 1
-; SSE-NEXT:    [[X0:%.*]] = zext i32 [[I0]] to i64
-; SSE-NEXT:    [[X1:%.*]] = zext i32 [[I1]] to i64
-; SSE-NEXT:    [[X2:%.*]] = zext i32 [[I2]] to i64
-; SSE-NEXT:    [[X3:%.*]] = zext i32 [[I3]] to i64
-; SSE-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0
-; SSE-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1
-; SSE-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
-; SSE-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
-; SSE-NEXT:    ret <4 x i64> [[V3]]
-;
-; AVX1-LABEL: @loadext_4i32_to_4i64(
-; AVX1-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
-; AVX1-NEXT:    [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2
-; AVX1-NEXT:    [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3
-; AVX1-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P0]] to <2 x i32>*
-; AVX1-NEXT:    [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 1
-; AVX1-NEXT:    [[I2:%.*]] = load i32, i32* [[P2]], align 1
-; AVX1-NEXT:    [[I3:%.*]] = load i32, i32* [[P3]], align 1
-; AVX1-NEXT:    [[TMP3:%.*]] = zext <2 x i32> [[TMP2]] to <2 x i64>
-; AVX1-NEXT:    [[X2:%.*]] = zext i32 [[I2]] to i64
-; AVX1-NEXT:    [[X3:%.*]] = zext i32 [[I3]] to i64
-; AVX1-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
-; AVX1-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
-; AVX1-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
-; AVX1-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
-; AVX1-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
-; AVX1-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
-; AVX1-NEXT:    ret <4 x i64> [[V3]]
+; SSE2-LABEL: @loadext_4i32_to_4i64(
+; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
+; SSE2-NEXT:    [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2
+; SSE2-NEXT:    [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3
+; SSE2-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P0]] to <4 x i32>*
+; SSE2-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1
+; SSE2-NEXT:    [[TMP3:%.*]] = zext <4 x i32> [[TMP2]] to <4 x i64>
+; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
+; SSE2-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
+; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
+; SSE2-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
+; SSE2-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
+; SSE2-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
+; SSE2-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
+; SSE2-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
+; SSE2-NEXT:    ret <4 x i64> [[V3]]
 ;
-; AVX2-LABEL: @loadext_4i32_to_4i64(
-; AVX2-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
-; AVX2-NEXT:    [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2
-; AVX2-NEXT:    [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3
-; AVX2-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P0]] to <4 x i32>*
-; AVX2-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1
-; AVX2-NEXT:    [[TMP3:%.*]] = zext <4 x i32> [[TMP2]] to <4 x i64>
-; AVX2-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
-; AVX2-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
-; AVX2-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
-; AVX2-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
-; AVX2-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
-; AVX2-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
-; AVX2-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
-; AVX2-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
-; AVX2-NEXT:    ret <4 x i64> [[V3]]
+; SLM-LABEL: @loadext_4i32_to_4i64(
+; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
+; SLM-NEXT:    [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2
+; SLM-NEXT:    [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3
+; SLM-NEXT:    [[I0:%.*]] = load i32, i32* [[P0]], align 1
+; SLM-NEXT:    [[I1:%.*]] = load i32, i32* [[P1]], align 1
+; SLM-NEXT:    [[I2:%.*]] = load i32, i32* [[P2]], align 1
+; SLM-NEXT:    [[I3:%.*]] = load i32, i32* [[P3]], align 1
+; SLM-NEXT:    [[X0:%.*]] = zext i32 [[I0]] to i64
+; SLM-NEXT:    [[X1:%.*]] = zext i32 [[I1]] to i64
+; SLM-NEXT:    [[X2:%.*]] = zext i32 [[I2]] to i64
+; SLM-NEXT:    [[X3:%.*]] = zext i32 [[I3]] to i64
+; SLM-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0
+; SLM-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1
+; SLM-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
+; SLM-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
+; SLM-NEXT:    ret <4 x i64> [[V3]]
 ;
-; AVX512-LABEL: @loadext_4i32_to_4i64(
-; AVX512-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
-; AVX512-NEXT:    [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2
-; AVX512-NEXT:    [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3
-; AVX512-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P0]] to <4 x i32>*
-; AVX512-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1
-; AVX512-NEXT:    [[TMP3:%.*]] = zext <4 x i32> [[TMP2]] to <4 x i64>
-; AVX512-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
-; AVX512-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
-; AVX512-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
-; AVX512-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
-; AVX512-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
-; AVX512-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
-; AVX512-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
-; AVX512-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
-; AVX512-NEXT:    ret <4 x i64> [[V3]]
+; AVX-LABEL: @loadext_4i32_to_4i64(
+; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
+; AVX-NEXT:    [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2
+; AVX-NEXT:    [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3
+; AVX-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P0]] to <4 x i32>*
+; AVX-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1
+; AVX-NEXT:    [[TMP3:%.*]] = zext <4 x i32> [[TMP2]] to <4 x i64>
+; AVX-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
+; AVX-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
+; AVX-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
+; AVX-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
+; AVX-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
+; AVX-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
+; AVX-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
+; AVX-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
+; AVX-NEXT:    ret <4 x i64> [[V3]]
 ;
   %p1 = getelementptr inbounds i32, i32* %p0, i64 1
   %p2 = getelementptr inbounds i32, i32* %p0, i64 2