[llvm] 5c166f1 - [x86] make SLM extract vector element more expensive than default

Sanjay Patel via llvm-commits llvm-commits at lists.llvm.org
Wed Nov 27 11:10:23 PST 2019


Author: Sanjay Patel
Date: 2019-11-27T14:08:56-05:00
New Revision: 5c166f1d1969e9c1e5b72aa672add429b9c22b53

URL: https://github.com/llvm/llvm-project/commit/5c166f1d1969e9c1e5b72aa672add429b9c22b53
DIFF: https://github.com/llvm/llvm-project/commit/5c166f1d1969e9c1e5b72aa672add429b9c22b53.diff

LOG: [x86] make SLM extract vector element more expensive than default

I'm not sure what the effect of this change will be on all of the affected
tests or a larger benchmark, but it fixes the horizontal add/sub problems
noted here:
https://reviews.llvm.org/D59710?vs=227972&id=228095&whitespace=ignore-most#toc

The costs are based on reciprocal throughput numbers in Agner's tables for
PEXTR*; these appear to be very slow ops on Silvermont.

This is a small step towards the larger motivation discussed in PR43605:
https://bugs.llvm.org/show_bug.cgi?id=43605

Also, it seems likely that insert/extract is the source of perf regressions on
other CPUs (up to 30%) that were cited as part of the reason to revert D59710,
so maybe we'll extend the table-based approach to other subtargets.

Differential Revision: https://reviews.llvm.org/D70607

Added: 
    

Modified: 
    llvm/lib/Target/X86/X86TargetTransformInfo.cpp
    llvm/test/Analysis/CostModel/X86/fptosi.ll
    llvm/test/Analysis/CostModel/X86/fptoui.ll
    llvm/test/Analysis/CostModel/X86/shuffle-extract_subvector.ll
    llvm/test/Analysis/CostModel/X86/vector-extract.ll
    llvm/test/Transforms/LoopVectorize/X86/interleaving.ll
    llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll
    llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll
    llvm/test/Transforms/SLPVectorizer/X86/hadd.ll
    llvm/test/Transforms/SLPVectorizer/X86/hsub.ll
    llvm/test/Transforms/SLPVectorizer/X86/sext.ll
    llvm/test/Transforms/SLPVectorizer/X86/zext.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 0b3a5319baac..f64fedd8cbb6 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -2377,6 +2377,13 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
 }
 
 int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
+  static const CostTblEntry SLMCostTbl[] = {
+     { ISD::EXTRACT_VECTOR_ELT,       MVT::i8,      4 },
+     { ISD::EXTRACT_VECTOR_ELT,       MVT::i16,     4 },
+     { ISD::EXTRACT_VECTOR_ELT,       MVT::i32,     4 },
+     { ISD::EXTRACT_VECTOR_ELT,       MVT::i64,     7 }
+   };
+
   assert(Val->isVectorTy() && "This must be a vector type");
 
   Type *ScalarType = Val->getScalarType();
@@ -2396,6 +2403,13 @@ int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
     // Floating point scalars are already located in index #0.
     if (ScalarType->isFloatingPointTy() && Index == 0)
       return 0;
+
+    int ISD = TLI->InstructionOpcodeToISD(Opcode);
+    assert(ISD && "Unexpected vector opcode");
+    MVT MScalarTy = LT.second.getScalarType();
+    if (ST->isSLM())
+      if (auto *Entry = CostTableLookup(SLMCostTbl, ISD, MScalarTy))
+        return LT.first * Entry->Cost;
   }
 
   // Add to the base cost if we know that the extracted element of a vector is

diff  --git a/llvm/test/Analysis/CostModel/X86/fptosi.ll b/llvm/test/Analysis/CostModel/X86/fptosi.ll
index 7583d6e60c80..bb03b56e48f6 100644
--- a/llvm/test/Analysis/CostModel/X86/fptosi.ll
+++ b/llvm/test/Analysis/CostModel/X86/fptosi.ll
@@ -6,7 +6,7 @@
 ; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
 ; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ
 ;
-; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mcpu=slm | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
+; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mcpu=slm | FileCheck %s --check-prefixes=SLM
 ; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mcpu=goldmont | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
 ; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mcpu=btver2 | FileCheck %s --check-prefixes=BTVER2
 
@@ -39,6 +39,13 @@ define i32 @fptosi_double_i64(i32 %arg) {
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = fptosi <8 x double> undef to <8 x i64>
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
+; SLM-LABEL: 'fptosi_double_i64'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptosi double undef to i64
+; SLM-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V2I64 = fptosi <2 x double> undef to <2 x i64>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V4I64 = fptosi <4 x double> undef to <4 x i64>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 75 for instruction: %V8I64 = fptosi <8 x double> undef to <8 x i64>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
 ; BTVER2-LABEL: 'fptosi_double_i64'
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptosi double undef to i64
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = fptosi <2 x double> undef to <2 x i64>
@@ -75,6 +82,13 @@ define i32 @fptosi_double_i32(i32 %arg) {
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = fptosi <8 x double> undef to <8 x i32>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
+; SLM-LABEL: 'fptosi_double_i32'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptosi double undef to i32
+; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I32 = fptosi <2 x double> undef to <2 x i32>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = fptosi <4 x double> undef to <4 x i32>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8I32 = fptosi <8 x double> undef to <8 x i32>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
 ; BTVER2-LABEL: 'fptosi_double_i32'
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptosi double undef to i32
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I32 = fptosi <2 x double> undef to <2 x i32>
@@ -111,6 +125,13 @@ define i32 @fptosi_double_i16(i32 %arg) {
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = fptosi <8 x double> undef to <8 x i16>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
+; SLM-LABEL: 'fptosi_double_i16'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptosi double undef to i16
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I16 = fptosi <2 x double> undef to <2 x i16>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I16 = fptosi <4 x double> undef to <4 x i16>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8I16 = fptosi <8 x double> undef to <8 x i16>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
 ; BTVER2-LABEL: 'fptosi_double_i16'
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptosi double undef to i16
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I16 = fptosi <2 x double> undef to <2 x i16>
@@ -147,6 +168,13 @@ define i32 @fptosi_double_i8(i32 %arg) {
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I8 = fptosi <8 x double> undef to <8 x i8>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
+; SLM-LABEL: 'fptosi_double_i8'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptosi double undef to i8
+; SLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I8 = fptosi <2 x double> undef to <2 x i8>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V4I8 = fptosi <4 x double> undef to <4 x i8>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V8I8 = fptosi <8 x double> undef to <8 x i8>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
 ; BTVER2-LABEL: 'fptosi_double_i8'
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptosi double undef to i8
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I8 = fptosi <2 x double> undef to <2 x i8>
@@ -194,6 +222,14 @@ define i32 @fptosi_float_i64(i32 %arg) {
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I64 = fptosi <16 x float> undef to <16 x i64>
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
+; SLM-LABEL: 'fptosi_float_i64'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptosi float undef to i64
+; SLM-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V2I64 = fptosi <2 x float> undef to <2 x i64>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V4I64 = fptosi <4 x float> undef to <4 x i64>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 75 for instruction: %V8I64 = fptosi <8 x float> undef to <8 x i64>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 151 for instruction: %V16I64 = fptosi <16 x float> undef to <16 x i64>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
 ; BTVER2-LABEL: 'fptosi_float_i64'
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptosi float undef to i64
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = fptosi <2 x float> undef to <2 x i64>
@@ -218,6 +254,13 @@ define i32 @fptosi_float_i32(i32 %arg) {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = fptosi <16 x float> undef to <16 x i32>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
+; SLM-LABEL: 'fptosi_float_i32'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptosi float undef to i32
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = fptosi <4 x float> undef to <4 x i32>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = fptosi <8 x float> undef to <8 x i32>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = fptosi <16 x float> undef to <16 x i32>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
 ; BTVER2-LABEL: 'fptosi_float_i32'
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptosi float undef to i32
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = fptosi <4 x float> undef to <4 x i32>
@@ -254,6 +297,13 @@ define i32 @fptosi_float_i16(i32 %arg) {
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = fptosi <16 x float> undef to <16 x i16>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
+; SLM-LABEL: 'fptosi_float_i16'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptosi float undef to i16
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = fptosi <4 x float> undef to <4 x i16>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = fptosi <8 x float> undef to <8 x i16>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V16I16 = fptosi <16 x float> undef to <16 x i16>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
 ; BTVER2-LABEL: 'fptosi_float_i16'
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptosi float undef to i16
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = fptosi <4 x float> undef to <4 x i16>
@@ -290,6 +340,13 @@ define i32 @fptosi_float_i8(i32 %arg) {
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = fptosi <16 x float> undef to <16 x i8>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
+; SLM-LABEL: 'fptosi_float_i8'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptosi float undef to i8
+; SLM-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4I8 = fptosi <4 x float> undef to <4 x i8>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V8I8 = fptosi <8 x float> undef to <8 x i8>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 99 for instruction: %V16I8 = fptosi <16 x float> undef to <16 x i8>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
 ; BTVER2-LABEL: 'fptosi_float_i8'
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptosi float undef to i8
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I8 = fptosi <4 x float> undef to <4 x i8>

diff  --git a/llvm/test/Analysis/CostModel/X86/fptoui.ll b/llvm/test/Analysis/CostModel/X86/fptoui.ll
index 078b21ba7203..cdb3e5486604 100644
--- a/llvm/test/Analysis/CostModel/X86/fptoui.ll
+++ b/llvm/test/Analysis/CostModel/X86/fptoui.ll
@@ -6,7 +6,7 @@
 ; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
 ; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ
 ;
-; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mcpu=slm | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
+; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mcpu=slm | FileCheck %s --check-prefixes=SLM
 ; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mcpu=goldmont | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
 ; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mcpu=btver2 | FileCheck %s --check-prefixes=BTVER2
 
@@ -39,6 +39,13 @@ define i32 @fptoui_double_i64(i32 %arg) {
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = fptoui <8 x double> undef to <8 x i64>
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
+; SLM-LABEL: 'fptoui_double_i64'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = fptoui double undef to i64
+; SLM-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V2I64 = fptoui <2 x double> undef to <2 x i64>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V4I64 = fptoui <4 x double> undef to <4 x i64>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 99 for instruction: %V8I64 = fptoui <8 x double> undef to <8 x i64>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
 ; BTVER2-LABEL: 'fptoui_double_i64'
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = fptoui double undef to i64
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = fptoui <2 x double> undef to <2 x i64>
@@ -75,6 +82,13 @@ define i32 @fptoui_double_i32(i32 %arg) {
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = fptoui <8 x double> undef to <8 x i32>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
+; SLM-LABEL: 'fptoui_double_i32'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui double undef to i32
+; SLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I32 = fptoui <2 x double> undef to <2 x i32>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V4I32 = fptoui <4 x double> undef to <4 x i32>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V8I32 = fptoui <8 x double> undef to <8 x i32>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
 ; BTVER2-LABEL: 'fptoui_double_i32'
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui double undef to i32
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = fptoui <2 x double> undef to <2 x i32>
@@ -111,6 +125,13 @@ define i32 @fptoui_double_i16(i32 %arg) {
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = fptoui <8 x double> undef to <8 x i16>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
+; SLM-LABEL: 'fptoui_double_i16'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptoui double undef to i16
+; SLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I16 = fptoui <2 x double> undef to <2 x i16>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V4I16 = fptoui <4 x double> undef to <4 x i16>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V8I16 = fptoui <8 x double> undef to <8 x i16>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
 ; BTVER2-LABEL: 'fptoui_double_i16'
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptoui double undef to i16
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I16 = fptoui <2 x double> undef to <2 x i16>
@@ -147,6 +168,13 @@ define i32 @fptoui_double_i8(i32 %arg) {
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I8 = fptoui <8 x double> undef to <8 x i8>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
+; SLM-LABEL: 'fptoui_double_i8'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptoui double undef to i8
+; SLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I8 = fptoui <2 x double> undef to <2 x i8>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V4I8 = fptoui <4 x double> undef to <4 x i8>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V8I8 = fptoui <8 x double> undef to <8 x i8>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
 ; BTVER2-LABEL: 'fptoui_double_i8'
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptoui double undef to i8
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I8 = fptoui <2 x double> undef to <2 x i8>
@@ -194,6 +222,14 @@ define i32 @fptoui_float_i64(i32 %arg) {
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I64 = fptoui <16 x float> undef to <16 x i64>
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
+; SLM-LABEL: 'fptoui_float_i64'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = fptoui float undef to i64
+; SLM-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V2I64 = fptoui <2 x float> undef to <2 x i64>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V4I64 = fptoui <4 x float> undef to <4 x i64>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 99 for instruction: %V8I64 = fptoui <8 x float> undef to <8 x i64>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 199 for instruction: %V16I64 = fptoui <16 x float> undef to <16 x i64>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
 ; BTVER2-LABEL: 'fptoui_float_i64'
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = fptoui float undef to i64
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = fptoui <2 x float> undef to <2 x i64>
@@ -232,6 +268,13 @@ define i32 @fptoui_float_i32(i32 %arg) {
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = fptoui <16 x float> undef to <16 x i32>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
+; SLM-LABEL: 'fptoui_float_i32'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui float undef to i32
+; SLM-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4I32 = fptoui <4 x float> undef to <4 x i32>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V8I32 = fptoui <8 x float> undef to <8 x i32>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 99 for instruction: %V16I32 = fptoui <16 x float> undef to <16 x i32>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
 ; BTVER2-LABEL: 'fptoui_float_i32'
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui float undef to i32
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = fptoui <4 x float> undef to <4 x i32>
@@ -268,6 +311,13 @@ define i32 @fptoui_float_i16(i32 %arg) {
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = fptoui <16 x float> undef to <16 x i16>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
+; SLM-LABEL: 'fptoui_float_i16'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptoui float undef to i16
+; SLM-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4I16 = fptoui <4 x float> undef to <4 x i16>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V8I16 = fptoui <8 x float> undef to <8 x i16>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 99 for instruction: %V16I16 = fptoui <16 x float> undef to <16 x i16>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
 ; BTVER2-LABEL: 'fptoui_float_i16'
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptoui float undef to i16
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I16 = fptoui <4 x float> undef to <4 x i16>
@@ -304,6 +354,13 @@ define i32 @fptoui_float_i8(i32 %arg) {
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = fptoui <16 x float> undef to <16 x i8>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
+; SLM-LABEL: 'fptoui_float_i8'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptoui float undef to i8
+; SLM-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4I8 = fptoui <4 x float> undef to <4 x i8>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V8I8 = fptoui <8 x float> undef to <8 x i8>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 99 for instruction: %V16I8 = fptoui <16 x float> undef to <16 x i8>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
 ; BTVER2-LABEL: 'fptoui_float_i8'
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptoui float undef to i8
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I8 = fptoui <4 x float> undef to <4 x i8>

diff  --git a/llvm/test/Analysis/CostModel/X86/shuffle-extract_subvector.ll b/llvm/test/Analysis/CostModel/X86/shuffle-extract_subvector.ll
index 3ceba32744b6..4ed509ff9db0 100644
--- a/llvm/test/Analysis/CostModel/X86/shuffle-extract_subvector.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-extract_subvector.ll
@@ -8,8 +8,8 @@
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+avx512f,+avx512bw,+avx512vbmi | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW
 ;
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mcpu=slm | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mcpu=goldmont | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mcpu=slm | FileCheck %s --check-prefixes=CHECK,SSE,SSE42,SLM
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mcpu=goldmont | FileCheck %s --check-prefixes=CHECK,SSE,SSE42,GLM
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mcpu=btver2 | FileCheck %s --check-prefixes=BTVER2
 
 ;
@@ -270,64 +270,123 @@ define void @test_vXi32(<4 x i32> %src128, <8 x i32> %src256, <16 x i32> %src512
 }
 
 define void @test_vXi16(<4 x i16> %src64, <8 x i16> %src128, <16 x i16> %src256, <32 x i16> %src512) {
-; SSE-LABEL: 'test_vXi16'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> <i32 0, i32 1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64_23 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> <i32 2, i32 3>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V128_01 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 2, i32 3>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_45 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 4, i32 5>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_67 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 6, i32 7>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <8 x i16> %src128, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <8 x i16> %src128, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 0, i32 1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 2, i32 3>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 4, i32 5>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_67 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 6, i32 7>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_89 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 8, i32 9>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 10, i32 11>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_CD = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 12, i32 13>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_EF = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 14, i32 15>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_89ABCDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 0, i32 1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 2, i32 3>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 4, i32 5>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 6, i32 7>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 8, i32 9>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 10, i32 11>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 12, i32 13>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 14, i32 15>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 16, i32 17>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 18, i32 19>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_14_15 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 20, i32 21>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 22, i32 23>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_18_19 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 24, i32 25>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_1A_1B = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 26, i32 27>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_1C_1D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 28, i32 29>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 30, i32 31>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_08_09_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_14_15_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_18_19_1A_1B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13_14_15_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; SSE2-LABEL: 'test_vXi16'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> <i32 0, i32 1>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64_23 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> <i32 2, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V128_01 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 2, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_45 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 4, i32 5>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_67 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 6, i32 7>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <8 x i16> %src128, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <8 x i16> %src128, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 0, i32 1>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 2, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 4, i32 5>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_67 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 6, i32 7>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_89 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 8, i32 9>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 10, i32 11>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_CD = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 12, i32 13>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_EF = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_89ABCDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 0, i32 1>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 2, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 4, i32 5>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 6, i32 7>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 8, i32 9>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 10, i32 11>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 12, i32 13>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 16, i32 17>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 18, i32 19>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_14_15 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 20, i32 21>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 22, i32 23>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_18_19 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 24, i32 25>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_1A_1B = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 26, i32 27>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_1C_1D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 28, i32 29>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 30, i32 31>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_08_09_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_14_15_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_18_19_1A_1B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13_14_15_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SSSE3-LABEL: 'test_vXi16'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> <i32 0, i32 1>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64_23 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> <i32 2, i32 3>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V128_01 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 2, i32 3>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_45 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 4, i32 5>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_67 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 6, i32 7>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <8 x i16> %src128, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <8 x i16> %src128, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 0, i32 1>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 2, i32 3>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 4, i32 5>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_67 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 6, i32 7>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_89 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 8, i32 9>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 10, i32 11>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_CD = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 12, i32 13>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_EF = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_89ABCDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 0, i32 1>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 2, i32 3>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 4, i32 5>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 6, i32 7>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 8, i32 9>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 10, i32 11>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 12, i32 13>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 16, i32 17>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 18, i32 19>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_14_15 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 20, i32 21>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 22, i32 23>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_18_19 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 24, i32 25>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_1A_1B = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 26, i32 27>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_1C_1D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 28, i32 29>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 30, i32 31>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_08_09_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_14_15_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_18_19_1A_1B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13_14_15_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX-LABEL: 'test_vXi16'
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> <i32 0, i32 1>
@@ -506,6 +565,124 @@ define void @test_vXi16(<4 x i16> %src64, <8 x i16> %src128, <16 x i16> %src256,
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
+; SLM-LABEL: 'test_vXi16'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> <i32 0, i32 1>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64_23 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> <i32 2, i32 3>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V128_01 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 2, i32 3>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_45 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 4, i32 5>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_67 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 6, i32 7>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <8 x i16> %src128, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <8 x i16> %src128, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 0, i32 1>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 2, i32 3>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 4, i32 5>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_67 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 6, i32 7>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_89 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 8, i32 9>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 10, i32 11>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_CD = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 12, i32 13>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_EF = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 14, i32 15>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_89ABCDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 0, i32 1>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 2, i32 3>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 4, i32 5>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 6, i32 7>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 8, i32 9>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 10, i32 11>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 12, i32 13>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 14, i32 15>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 16, i32 17>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 18, i32 19>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_14_15 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 20, i32 21>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 22, i32 23>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_18_19 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 24, i32 25>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_1A_1B = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 26, i32 27>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_1C_1D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 28, i32 29>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 30, i32 31>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_08_09_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_14_15_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_18_19_1A_1B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13_14_15_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; GLM-LABEL: 'test_vXi16'
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> <i32 0, i32 1>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64_23 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> <i32 2, i32 3>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V128_01 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 2, i32 3>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_45 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 4, i32 5>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_67 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 6, i32 7>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <8 x i16> %src128, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <8 x i16> %src128, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 0, i32 1>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 2, i32 3>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 4, i32 5>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_67 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 6, i32 7>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_89 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 8, i32 9>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 10, i32 11>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_CD = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 12, i32 13>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_EF = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 14, i32 15>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_89ABCDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 0, i32 1>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 2, i32 3>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 4, i32 5>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 6, i32 7>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 8, i32 9>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 10, i32 11>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 12, i32 13>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 14, i32 15>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 16, i32 17>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 18, i32 19>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_14_15 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 20, i32 21>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 22, i32 23>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_18_19 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 24, i32 25>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_1A_1B = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 26, i32 27>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_1C_1D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 28, i32 29>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 30, i32 31>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_08_09_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_14_15_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_18_19_1A_1B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13_14_15_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
 ; BTVER2-LABEL: 'test_vXi16'
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> <i32 0, i32 1>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64_23 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> <i32 2, i32 3>
@@ -863,125 +1040,6 @@ define void @test_vXi8(<8 x i8> %src64, <16 x i8> %src128, <32 x i8> %src256, <6
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21_22_23_24_25_26_27_28_29_2A_2B_2C_2D_2E_2F_30_31_32_33_34_35_36_37_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <32 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
-; SSE42-LABEL: 'test_vXi8'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 0, i32 1>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64_23 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 2, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64_45 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 4, i32 5>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64_67 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 6, i32 7>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64_0123 = shufflevector <8 x i8> %src64, <8 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64_4567 = shufflevector <8 x i8> %src64, <8 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V128_01 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 2, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_45 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 4, i32 5>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_67 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 6, i32 7>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_89 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 8, i32 9>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 10, i32 11>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_CD = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 12, i32 13>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_EF = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V128_2345 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_89AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_CDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V128_01234567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_89ABCDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 0, i32 1>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 2, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 4, i32 5>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 6, i32 7>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 8, i32 9>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 10, i32 11>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 12, i32 13>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_10_11 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 16, i32 17>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 18, i32 19>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_14_15 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 20, i32 21>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 22, i32 23>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_18_19 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 24, i32 25>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_1A_1B = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 26, i32 27>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_1C_1D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 28, i32 29>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 30, i32 31>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_10_11_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_14_15_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_18_19_1A_1B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_10_11_12_13_14_15_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 0, i32 1>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 2, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 4, i32 5>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 6, i32 7>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 8, i32 9>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_0A_0B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 10, i32 11>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 12, i32 13>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 16, i32 17>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_12_13 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 18, i32 19>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_14_15 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 20, i32 21>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 22, i32 23>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 24, i32 25>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_1A_1B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 26, i32 27>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_1C_1D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 28, i32 29>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 30, i32 31>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 32, i32 33>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_22_23 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 34, i32 35>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_24_25 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 36, i32 37>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 38, i32 39>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_28_29 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 40, i32 41>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_2A_2B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 42, i32 43>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_2C_2D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 44, i32 45>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 46, i32 47>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_30_31 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 48, i32 49>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_32_33 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 50, i32 51>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_34_35 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 52, i32 53>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 54, i32 55>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_38_39 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 56, i32 57>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_3A_3B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 58, i32 59>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_3C_3D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 60, i32 61>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 62, i32 63>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_14_15_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19_1A_1B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21_22_23 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 32, i32 33, i32 34, i32 35>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_24_25_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 36, i32 37, i32 38, i32 39>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_28_29_2A_2B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 40, i32 41, i32 42, i32 43>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 44, i32 45, i32 46, i32 47>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_30_31_32_33 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 48, i32 49, i32 50, i32 51>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_34_35_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 52, i32 53, i32 54, i32 55>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_38_39_3A_3B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 56, i32 57, i32 58, i32 59>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 60, i32 61, i32 62, i32 63>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13_14_15_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21_22_23_24_25_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_28_29_2A_2B_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_30_31_32_33_34_35_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21_22_23_24_25_26_27_28_29_2A_2B_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_30_31_32_33_34_35_36_37_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21_22_23_24_25_26_27_28_29_2A_2B_2C_2D_2E_2F_30_31_32_33_34_35_36_37_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <32 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
 ; AVX-LABEL: 'test_vXi8'
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 0, i32 1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64_23 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 2, i32 3>
@@ -1339,6 +1397,244 @@ define void @test_vXi8(<8 x i8> %src64, <16 x i8> %src128, <32 x i8> %src256, <6
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21_22_23_24_25_26_27_28_29_2A_2B_2C_2D_2E_2F_30_31_32_33_34_35_36_37_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <32 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
+; SLM-LABEL: 'test_vXi8'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 0, i32 1>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64_23 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 2, i32 3>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64_45 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 4, i32 5>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64_67 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 6, i32 7>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64_0123 = shufflevector <8 x i8> %src64, <8 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64_4567 = shufflevector <8 x i8> %src64, <8 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V128_01 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 2, i32 3>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_45 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 4, i32 5>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_67 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 6, i32 7>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_89 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 8, i32 9>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 10, i32 11>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_CD = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 12, i32 13>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_EF = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 14, i32 15>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V128_2345 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_89AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_CDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V128_01234567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_89ABCDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 0, i32 1>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 2, i32 3>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 4, i32 5>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 6, i32 7>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 8, i32 9>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 10, i32 11>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 12, i32 13>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 14, i32 15>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_10_11 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 16, i32 17>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 18, i32 19>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_14_15 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 20, i32 21>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 22, i32 23>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_18_19 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 24, i32 25>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_1A_1B = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 26, i32 27>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_1C_1D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 28, i32 29>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 30, i32 31>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_10_11_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_14_15_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_18_19_1A_1B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_10_11_12_13_14_15_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 0, i32 1>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 2, i32 3>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 4, i32 5>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 6, i32 7>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 8, i32 9>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_0A_0B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 10, i32 11>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 12, i32 13>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 14, i32 15>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 16, i32 17>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_12_13 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 18, i32 19>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_14_15 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 20, i32 21>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 22, i32 23>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 24, i32 25>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_1A_1B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 26, i32 27>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_1C_1D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 28, i32 29>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 30, i32 31>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 32, i32 33>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_22_23 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 34, i32 35>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_24_25 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 36, i32 37>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 38, i32 39>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_28_29 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 40, i32 41>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_2A_2B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 42, i32 43>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_2C_2D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 44, i32 45>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 46, i32 47>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_30_31 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 48, i32 49>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_32_33 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 50, i32 51>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_34_35 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 52, i32 53>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 54, i32 55>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_38_39 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 56, i32 57>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_3A_3B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 58, i32 59>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_3C_3D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 60, i32 61>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 62, i32 63>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_14_15_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19_1A_1B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21_22_23 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 32, i32 33, i32 34, i32 35>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_24_25_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 36, i32 37, i32 38, i32 39>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_28_29_2A_2B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 40, i32 41, i32 42, i32 43>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 44, i32 45, i32 46, i32 47>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_30_31_32_33 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 48, i32 49, i32 50, i32 51>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_34_35_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 52, i32 53, i32 54, i32 55>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_38_39_3A_3B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 56, i32 57, i32 58, i32 59>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 60, i32 61, i32 62, i32 63>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13_14_15_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21_22_23_24_25_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_28_29_2A_2B_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_30_31_32_33_34_35_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21_22_23_24_25_26_27_28_29_2A_2B_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_30_31_32_33_34_35_36_37_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21_22_23_24_25_26_27_28_29_2A_2B_2C_2D_2E_2F_30_31_32_33_34_35_36_37_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <32 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; GLM-LABEL: 'test_vXi8'
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 0, i32 1>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64_23 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 2, i32 3>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64_45 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 4, i32 5>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64_67 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 6, i32 7>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64_0123 = shufflevector <8 x i8> %src64, <8 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64_4567 = shufflevector <8 x i8> %src64, <8 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V128_01 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 2, i32 3>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_45 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 4, i32 5>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_67 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 6, i32 7>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_89 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 8, i32 9>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 10, i32 11>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_CD = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 12, i32 13>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_EF = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 14, i32 15>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V128_2345 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_89AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_CDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V128_01234567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_89ABCDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 0, i32 1>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 2, i32 3>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 4, i32 5>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 6, i32 7>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 8, i32 9>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 10, i32 11>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 12, i32 13>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 14, i32 15>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_10_11 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 16, i32 17>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 18, i32 19>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_14_15 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 20, i32 21>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 22, i32 23>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_18_19 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 24, i32 25>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_1A_1B = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 26, i32 27>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_1C_1D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 28, i32 29>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 30, i32 31>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_10_11_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_14_15_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_18_19_1A_1B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_10_11_12_13_14_15_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 0, i32 1>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 2, i32 3>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 4, i32 5>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 6, i32 7>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 8, i32 9>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_0A_0B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 10, i32 11>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 12, i32 13>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 14, i32 15>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 16, i32 17>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_12_13 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 18, i32 19>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_14_15 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 20, i32 21>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 22, i32 23>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 24, i32 25>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_1A_1B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 26, i32 27>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_1C_1D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 28, i32 29>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 30, i32 31>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 32, i32 33>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_22_23 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 34, i32 35>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_24_25 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 36, i32 37>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 38, i32 39>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_28_29 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 40, i32 41>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_2A_2B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 42, i32 43>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_2C_2D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 44, i32 45>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 46, i32 47>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_30_31 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 48, i32 49>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_32_33 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 50, i32 51>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_34_35 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 52, i32 53>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 54, i32 55>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_38_39 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 56, i32 57>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_3A_3B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 58, i32 59>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_3C_3D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 60, i32 61>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 62, i32 63>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_14_15_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19_1A_1B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21_22_23 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 32, i32 33, i32 34, i32 35>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_24_25_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 36, i32 37, i32 38, i32 39>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_28_29_2A_2B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 40, i32 41, i32 42, i32 43>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 44, i32 45, i32 46, i32 47>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_30_31_32_33 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 48, i32 49, i32 50, i32 51>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_34_35_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 52, i32 53, i32 54, i32 55>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_38_39_3A_3B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 56, i32 57, i32 58, i32 59>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 60, i32 61, i32 62, i32 63>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13_14_15_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21_22_23_24_25_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_28_29_2A_2B_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_30_31_32_33_34_35_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21_22_23_24_25_26_27_28_29_2A_2B_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_30_31_32_33_34_35_36_37_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21_22_23_24_25_26_27_28_29_2A_2B_2C_2D_2E_2F_30_31_32_33_34_35_36_37_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <32 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
 ; BTVER2-LABEL: 'test_vXi8'
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 0, i32 1>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64_23 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 2, i32 3>

diff  --git a/llvm/test/Analysis/CostModel/X86/vector-extract.ll b/llvm/test/Analysis/CostModel/X86/vector-extract.ll
index 62123c422a8f..ddb3654fbc6a 100644
--- a/llvm/test/Analysis/CostModel/X86/vector-extract.ll
+++ b/llvm/test/Analysis/CostModel/X86/vector-extract.ll
@@ -9,8 +9,8 @@
 ; RUN: opt < %s -mtriple=x86_64-apple-macosx10.8.0 -cost-model -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
 ; RUN: opt < %s -mtriple=x86_64-apple-macosx10.8.0 -cost-model -analyze -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW
 ;
-; RUN: opt < %s -mtriple=x86_64-apple-macosx10.8.0 -cost-model -analyze -mcpu=slm | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
-; RUN: opt < %s -mtriple=x86_64-apple-macosx10.8.0 -cost-model -analyze -mcpu=goldmont | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
+; RUN: opt < %s -mtriple=x86_64-apple-macosx10.8.0 -cost-model -analyze -mcpu=slm | FileCheck %s --check-prefixes=CHECK,SSE,SSE42,SLM
+; RUN: opt < %s -mtriple=x86_64-apple-macosx10.8.0 -cost-model -analyze -mcpu=goldmont | FileCheck %s --check-prefixes=CHECK,SSE,SSE42,GLM
 ; RUN: opt < %s -mtriple=x86_64-apple-macosx10.8.0 -cost-model -analyze -mcpu=btver2 | FileCheck %s --check-prefixes=BTVER2
 
 define i32 @extract_double(i32 %arg) {
@@ -188,19 +188,117 @@ define i32 @extract_float(i32 %arg) {
 }
 
 define i32 @extract_i64(i32 %arg) {
-; CHECK-LABEL: 'extract_i64'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_a = extractelement <2 x i64> undef, i32 %arg
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_0 = extractelement <2 x i64> undef, i32 0
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_1 = extractelement <2 x i64> undef, i32 1
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_a = extractelement <4 x i64> undef, i32 %arg
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_0 = extractelement <4 x i64> undef, i32 0
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_3 = extractelement <4 x i64> undef, i32 3
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_a = extractelement <8 x i64> undef, i32 %arg
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_0 = extractelement <8 x i64> undef, i32 0
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_3 = extractelement <8 x i64> undef, i32 3
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_4 = extractelement <8 x i64> undef, i32 4
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_7 = extractelement <8 x i64> undef, i32 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE2-LABEL: 'extract_i64'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_a = extractelement <2 x i64> undef, i32 %arg
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_0 = extractelement <2 x i64> undef, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_1 = extractelement <2 x i64> undef, i32 1
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_a = extractelement <4 x i64> undef, i32 %arg
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_0 = extractelement <4 x i64> undef, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_3 = extractelement <4 x i64> undef, i32 3
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_a = extractelement <8 x i64> undef, i32 %arg
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_0 = extractelement <8 x i64> undef, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_3 = extractelement <8 x i64> undef, i32 3
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_4 = extractelement <8 x i64> undef, i32 4
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_7 = extractelement <8 x i64> undef, i32 7
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE3-LABEL: 'extract_i64'
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_a = extractelement <2 x i64> undef, i32 %arg
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_0 = extractelement <2 x i64> undef, i32 0
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_1 = extractelement <2 x i64> undef, i32 1
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_a = extractelement <4 x i64> undef, i32 %arg
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_0 = extractelement <4 x i64> undef, i32 0
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_3 = extractelement <4 x i64> undef, i32 3
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_a = extractelement <8 x i64> undef, i32 %arg
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_0 = extractelement <8 x i64> undef, i32 0
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_3 = extractelement <8 x i64> undef, i32 3
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_4 = extractelement <8 x i64> undef, i32 4
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_7 = extractelement <8 x i64> undef, i32 7
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'extract_i64'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_a = extractelement <2 x i64> undef, i32 %arg
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_0 = extractelement <2 x i64> undef, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_1 = extractelement <2 x i64> undef, i32 1
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_a = extractelement <4 x i64> undef, i32 %arg
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_0 = extractelement <4 x i64> undef, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_3 = extractelement <4 x i64> undef, i32 3
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_a = extractelement <8 x i64> undef, i32 %arg
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_0 = extractelement <8 x i64> undef, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_3 = extractelement <8 x i64> undef, i32 3
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_4 = extractelement <8 x i64> undef, i32 4
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_7 = extractelement <8 x i64> undef, i32 7
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE41-LABEL: 'extract_i64'
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_a = extractelement <2 x i64> undef, i32 %arg
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_0 = extractelement <2 x i64> undef, i32 0
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_1 = extractelement <2 x i64> undef, i32 1
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_a = extractelement <4 x i64> undef, i32 %arg
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_0 = extractelement <4 x i64> undef, i32 0
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_3 = extractelement <4 x i64> undef, i32 3
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_a = extractelement <8 x i64> undef, i32 %arg
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_0 = extractelement <8 x i64> undef, i32 0
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_3 = extractelement <8 x i64> undef, i32 3
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_4 = extractelement <8 x i64> undef, i32 4
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_7 = extractelement <8 x i64> undef, i32 7
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX-LABEL: 'extract_i64'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_a = extractelement <2 x i64> undef, i32 %arg
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_0 = extractelement <2 x i64> undef, i32 0
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_1 = extractelement <2 x i64> undef, i32 1
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_a = extractelement <4 x i64> undef, i32 %arg
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_0 = extractelement <4 x i64> undef, i32 0
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_3 = extractelement <4 x i64> undef, i32 3
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_a = extractelement <8 x i64> undef, i32 %arg
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_0 = extractelement <8 x i64> undef, i32 0
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_3 = extractelement <8 x i64> undef, i32 3
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_4 = extractelement <8 x i64> undef, i32 4
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_7 = extractelement <8 x i64> undef, i32 7
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'extract_i64'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_a = extractelement <2 x i64> undef, i32 %arg
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_0 = extractelement <2 x i64> undef, i32 0
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_1 = extractelement <2 x i64> undef, i32 1
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_a = extractelement <4 x i64> undef, i32 %arg
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_0 = extractelement <4 x i64> undef, i32 0
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_3 = extractelement <4 x i64> undef, i32 3
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_a = extractelement <8 x i64> undef, i32 %arg
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_0 = extractelement <8 x i64> undef, i32 0
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_3 = extractelement <8 x i64> undef, i32 3
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_4 = extractelement <8 x i64> undef, i32 4
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_7 = extractelement <8 x i64> undef, i32 7
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SLM-LABEL: 'extract_i64'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_a = extractelement <2 x i64> undef, i32 %arg
+; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v2i64_0 = extractelement <2 x i64> undef, i32 0
+; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v2i64_1 = extractelement <2 x i64> undef, i32 1
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_a = extractelement <4 x i64> undef, i32 %arg
+; SLM-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v4i64_0 = extractelement <4 x i64> undef, i32 0
+; SLM-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v4i64_3 = extractelement <4 x i64> undef, i32 3
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_a = extractelement <8 x i64> undef, i32 %arg
+; SLM-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %v8i64_0 = extractelement <8 x i64> undef, i32 0
+; SLM-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %v8i64_3 = extractelement <8 x i64> undef, i32 3
+; SLM-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %v8i64_4 = extractelement <8 x i64> undef, i32 4
+; SLM-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %v8i64_7 = extractelement <8 x i64> undef, i32 7
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; GLM-LABEL: 'extract_i64'
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_a = extractelement <2 x i64> undef, i32 %arg
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_0 = extractelement <2 x i64> undef, i32 0
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_1 = extractelement <2 x i64> undef, i32 1
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_a = extractelement <4 x i64> undef, i32 %arg
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_0 = extractelement <4 x i64> undef, i32 0
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_3 = extractelement <4 x i64> undef, i32 3
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_a = extractelement <8 x i64> undef, i32 %arg
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_0 = extractelement <8 x i64> undef, i32 0
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_3 = extractelement <8 x i64> undef, i32 3
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_4 = extractelement <8 x i64> undef, i32 4
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_7 = extractelement <8 x i64> undef, i32 7
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; BTVER2-LABEL: 'extract_i64'
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_a = extractelement <2 x i64> undef, i32 %arg
@@ -234,24 +332,157 @@ define i32 @extract_i64(i32 %arg) {
 }
 
 define i32 @extract_i32(i32 %arg) {
-; CHECK-LABEL: 'extract_i32'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_a = extractelement <2 x i32> undef, i32 %arg
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_0 = extractelement <2 x i32> undef, i32 0
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_1 = extractelement <2 x i32> undef, i32 1
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_a = extractelement <4 x i32> undef, i32 %arg
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_0 = extractelement <4 x i32> undef, i32 0
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_3 = extractelement <4 x i32> undef, i32 3
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_a = extractelement <8 x i32> undef, i32 %arg
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_0 = extractelement <8 x i32> undef, i32 0
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_3 = extractelement <8 x i32> undef, i32 3
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_4 = extractelement <8 x i32> undef, i32 4
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_7 = extractelement <8 x i32> undef, i32 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_a = extractelement <16 x i32> undef, i32 %arg
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_0 = extractelement <16 x i32> undef, i32 0
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_3 = extractelement <16 x i32> undef, i32 3
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_8 = extractelement <16 x i32> undef, i32 8
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_15 = extractelement <16 x i32> undef, i32 15
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE2-LABEL: 'extract_i32'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_a = extractelement <2 x i32> undef, i32 %arg
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_0 = extractelement <2 x i32> undef, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_1 = extractelement <2 x i32> undef, i32 1
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_a = extractelement <4 x i32> undef, i32 %arg
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_0 = extractelement <4 x i32> undef, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_3 = extractelement <4 x i32> undef, i32 3
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_a = extractelement <8 x i32> undef, i32 %arg
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_0 = extractelement <8 x i32> undef, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_3 = extractelement <8 x i32> undef, i32 3
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_4 = extractelement <8 x i32> undef, i32 4
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_7 = extractelement <8 x i32> undef, i32 7
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_a = extractelement <16 x i32> undef, i32 %arg
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_0 = extractelement <16 x i32> undef, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_3 = extractelement <16 x i32> undef, i32 3
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_8 = extractelement <16 x i32> undef, i32 8
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_15 = extractelement <16 x i32> undef, i32 15
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE3-LABEL: 'extract_i32'
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_a = extractelement <2 x i32> undef, i32 %arg
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_0 = extractelement <2 x i32> undef, i32 0
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_1 = extractelement <2 x i32> undef, i32 1
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_a = extractelement <4 x i32> undef, i32 %arg
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_0 = extractelement <4 x i32> undef, i32 0
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_3 = extractelement <4 x i32> undef, i32 3
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_a = extractelement <8 x i32> undef, i32 %arg
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_0 = extractelement <8 x i32> undef, i32 0
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_3 = extractelement <8 x i32> undef, i32 3
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_4 = extractelement <8 x i32> undef, i32 4
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_7 = extractelement <8 x i32> undef, i32 7
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_a = extractelement <16 x i32> undef, i32 %arg
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_0 = extractelement <16 x i32> undef, i32 0
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_3 = extractelement <16 x i32> undef, i32 3
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_8 = extractelement <16 x i32> undef, i32 8
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_15 = extractelement <16 x i32> undef, i32 15
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'extract_i32'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_a = extractelement <2 x i32> undef, i32 %arg
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_0 = extractelement <2 x i32> undef, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_1 = extractelement <2 x i32> undef, i32 1
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_a = extractelement <4 x i32> undef, i32 %arg
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_0 = extractelement <4 x i32> undef, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_3 = extractelement <4 x i32> undef, i32 3
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_a = extractelement <8 x i32> undef, i32 %arg
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_0 = extractelement <8 x i32> undef, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_3 = extractelement <8 x i32> undef, i32 3
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_4 = extractelement <8 x i32> undef, i32 4
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_7 = extractelement <8 x i32> undef, i32 7
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_a = extractelement <16 x i32> undef, i32 %arg
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_0 = extractelement <16 x i32> undef, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_3 = extractelement <16 x i32> undef, i32 3
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_8 = extractelement <16 x i32> undef, i32 8
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_15 = extractelement <16 x i32> undef, i32 15
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE41-LABEL: 'extract_i32'
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_a = extractelement <2 x i32> undef, i32 %arg
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_0 = extractelement <2 x i32> undef, i32 0
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_1 = extractelement <2 x i32> undef, i32 1
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_a = extractelement <4 x i32> undef, i32 %arg
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_0 = extractelement <4 x i32> undef, i32 0
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_3 = extractelement <4 x i32> undef, i32 3
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_a = extractelement <8 x i32> undef, i32 %arg
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_0 = extractelement <8 x i32> undef, i32 0
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_3 = extractelement <8 x i32> undef, i32 3
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_4 = extractelement <8 x i32> undef, i32 4
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_7 = extractelement <8 x i32> undef, i32 7
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_a = extractelement <16 x i32> undef, i32 %arg
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_0 = extractelement <16 x i32> undef, i32 0
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_3 = extractelement <16 x i32> undef, i32 3
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_8 = extractelement <16 x i32> undef, i32 8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_15 = extractelement <16 x i32> undef, i32 15
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX-LABEL: 'extract_i32'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_a = extractelement <2 x i32> undef, i32 %arg
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_0 = extractelement <2 x i32> undef, i32 0
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_1 = extractelement <2 x i32> undef, i32 1
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_a = extractelement <4 x i32> undef, i32 %arg
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_0 = extractelement <4 x i32> undef, i32 0
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_3 = extractelement <4 x i32> undef, i32 3
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_a = extractelement <8 x i32> undef, i32 %arg
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_0 = extractelement <8 x i32> undef, i32 0
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_3 = extractelement <8 x i32> undef, i32 3
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_4 = extractelement <8 x i32> undef, i32 4
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_7 = extractelement <8 x i32> undef, i32 7
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_a = extractelement <16 x i32> undef, i32 %arg
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_0 = extractelement <16 x i32> undef, i32 0
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_3 = extractelement <16 x i32> undef, i32 3
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_8 = extractelement <16 x i32> undef, i32 8
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_15 = extractelement <16 x i32> undef, i32 15
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'extract_i32'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_a = extractelement <2 x i32> undef, i32 %arg
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_0 = extractelement <2 x i32> undef, i32 0
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_1 = extractelement <2 x i32> undef, i32 1
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_a = extractelement <4 x i32> undef, i32 %arg
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_0 = extractelement <4 x i32> undef, i32 0
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_3 = extractelement <4 x i32> undef, i32 3
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_a = extractelement <8 x i32> undef, i32 %arg
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_0 = extractelement <8 x i32> undef, i32 0
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_3 = extractelement <8 x i32> undef, i32 3
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_4 = extractelement <8 x i32> undef, i32 4
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_7 = extractelement <8 x i32> undef, i32 7
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_a = extractelement <16 x i32> undef, i32 %arg
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_0 = extractelement <16 x i32> undef, i32 0
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_3 = extractelement <16 x i32> undef, i32 3
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_8 = extractelement <16 x i32> undef, i32 8
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_15 = extractelement <16 x i32> undef, i32 15
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SLM-LABEL: 'extract_i32'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_a = extractelement <2 x i32> undef, i32 %arg
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2i32_0 = extractelement <2 x i32> undef, i32 0
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2i32_1 = extractelement <2 x i32> undef, i32 1
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_a = extractelement <4 x i32> undef, i32 %arg
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i32_0 = extractelement <4 x i32> undef, i32 0
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i32_3 = extractelement <4 x i32> undef, i32 3
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_a = extractelement <8 x i32> undef, i32 %arg
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i32_0 = extractelement <8 x i32> undef, i32 0
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i32_3 = extractelement <8 x i32> undef, i32 3
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i32_4 = extractelement <8 x i32> undef, i32 4
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i32_7 = extractelement <8 x i32> undef, i32 7
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_a = extractelement <16 x i32> undef, i32 %arg
+; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16i32_0 = extractelement <16 x i32> undef, i32 0
+; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16i32_3 = extractelement <16 x i32> undef, i32 3
+; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16i32_8 = extractelement <16 x i32> undef, i32 8
+; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16i32_15 = extractelement <16 x i32> undef, i32 15
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; GLM-LABEL: 'extract_i32'
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_a = extractelement <2 x i32> undef, i32 %arg
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_0 = extractelement <2 x i32> undef, i32 0
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_1 = extractelement <2 x i32> undef, i32 1
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_a = extractelement <4 x i32> undef, i32 %arg
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_0 = extractelement <4 x i32> undef, i32 0
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_3 = extractelement <4 x i32> undef, i32 3
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_a = extractelement <8 x i32> undef, i32 %arg
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_0 = extractelement <8 x i32> undef, i32 0
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_3 = extractelement <8 x i32> undef, i32 3
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_4 = extractelement <8 x i32> undef, i32 4
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_7 = extractelement <8 x i32> undef, i32 7
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_a = extractelement <16 x i32> undef, i32 %arg
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_0 = extractelement <16 x i32> undef, i32 0
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_3 = extractelement <16 x i32> undef, i32 3
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_8 = extractelement <16 x i32> undef, i32 8
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_15 = extractelement <16 x i32> undef, i32 15
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; BTVER2-LABEL: 'extract_i32'
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_a = extractelement <2 x i32> undef, i32 %arg
@@ -296,24 +527,157 @@ define i32 @extract_i32(i32 %arg) {
 }
 
 define i32 @extract_i16(i32 %arg) {
-; CHECK-LABEL: 'extract_i16'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_a = extractelement <8 x i16> undef, i32 %arg
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_0 = extractelement <8 x i16> undef, i32 0
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_7 = extractelement <8 x i16> undef, i32 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_a = extractelement <16 x i16> undef, i32 %arg
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_0 = extractelement <16 x i16> undef, i32 0
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_7 = extractelement <16 x i16> undef, i32 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_8 = extractelement <16 x i16> undef, i32 8
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_15 = extractelement <16 x i16> undef, i32 15
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_a = extractelement <32 x i16> undef, i32 %arg
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_0 = extractelement <32 x i16> undef, i32 0
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_7 = extractelement <32 x i16> undef, i32 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_8 = extractelement <32 x i16> undef, i32 8
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_15 = extractelement <32 x i16> undef, i32 15
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_16 = extractelement <32 x i16> undef, i32 16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_24 = extractelement <32 x i16> undef, i32 24
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_31 = extractelement <32 x i16> undef, i32 31
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE2-LABEL: 'extract_i16'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_a = extractelement <8 x i16> undef, i32 %arg
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_0 = extractelement <8 x i16> undef, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_7 = extractelement <8 x i16> undef, i32 7
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_a = extractelement <16 x i16> undef, i32 %arg
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_0 = extractelement <16 x i16> undef, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_7 = extractelement <16 x i16> undef, i32 7
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_8 = extractelement <16 x i16> undef, i32 8
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_15 = extractelement <16 x i16> undef, i32 15
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_a = extractelement <32 x i16> undef, i32 %arg
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_0 = extractelement <32 x i16> undef, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_7 = extractelement <32 x i16> undef, i32 7
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_8 = extractelement <32 x i16> undef, i32 8
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_15 = extractelement <32 x i16> undef, i32 15
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_16 = extractelement <32 x i16> undef, i32 16
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_24 = extractelement <32 x i16> undef, i32 24
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_31 = extractelement <32 x i16> undef, i32 31
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE3-LABEL: 'extract_i16'
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_a = extractelement <8 x i16> undef, i32 %arg
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_0 = extractelement <8 x i16> undef, i32 0
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_7 = extractelement <8 x i16> undef, i32 7
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_a = extractelement <16 x i16> undef, i32 %arg
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_0 = extractelement <16 x i16> undef, i32 0
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_7 = extractelement <16 x i16> undef, i32 7
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_8 = extractelement <16 x i16> undef, i32 8
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_15 = extractelement <16 x i16> undef, i32 15
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_a = extractelement <32 x i16> undef, i32 %arg
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_0 = extractelement <32 x i16> undef, i32 0
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_7 = extractelement <32 x i16> undef, i32 7
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_8 = extractelement <32 x i16> undef, i32 8
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_15 = extractelement <32 x i16> undef, i32 15
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_16 = extractelement <32 x i16> undef, i32 16
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_24 = extractelement <32 x i16> undef, i32 24
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_31 = extractelement <32 x i16> undef, i32 31
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'extract_i16'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_a = extractelement <8 x i16> undef, i32 %arg
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_0 = extractelement <8 x i16> undef, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_7 = extractelement <8 x i16> undef, i32 7
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_a = extractelement <16 x i16> undef, i32 %arg
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_0 = extractelement <16 x i16> undef, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_7 = extractelement <16 x i16> undef, i32 7
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_8 = extractelement <16 x i16> undef, i32 8
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_15 = extractelement <16 x i16> undef, i32 15
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_a = extractelement <32 x i16> undef, i32 %arg
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_0 = extractelement <32 x i16> undef, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_7 = extractelement <32 x i16> undef, i32 7
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_8 = extractelement <32 x i16> undef, i32 8
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_15 = extractelement <32 x i16> undef, i32 15
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_16 = extractelement <32 x i16> undef, i32 16
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_24 = extractelement <32 x i16> undef, i32 24
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_31 = extractelement <32 x i16> undef, i32 31
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE41-LABEL: 'extract_i16'
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_a = extractelement <8 x i16> undef, i32 %arg
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_0 = extractelement <8 x i16> undef, i32 0
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_7 = extractelement <8 x i16> undef, i32 7
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_a = extractelement <16 x i16> undef, i32 %arg
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_0 = extractelement <16 x i16> undef, i32 0
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_7 = extractelement <16 x i16> undef, i32 7
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_8 = extractelement <16 x i16> undef, i32 8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_15 = extractelement <16 x i16> undef, i32 15
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_a = extractelement <32 x i16> undef, i32 %arg
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_0 = extractelement <32 x i16> undef, i32 0
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_7 = extractelement <32 x i16> undef, i32 7
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_8 = extractelement <32 x i16> undef, i32 8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_15 = extractelement <32 x i16> undef, i32 15
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_16 = extractelement <32 x i16> undef, i32 16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_24 = extractelement <32 x i16> undef, i32 24
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_31 = extractelement <32 x i16> undef, i32 31
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX-LABEL: 'extract_i16'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_a = extractelement <8 x i16> undef, i32 %arg
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_0 = extractelement <8 x i16> undef, i32 0
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_7 = extractelement <8 x i16> undef, i32 7
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_a = extractelement <16 x i16> undef, i32 %arg
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_0 = extractelement <16 x i16> undef, i32 0
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_7 = extractelement <16 x i16> undef, i32 7
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_8 = extractelement <16 x i16> undef, i32 8
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_15 = extractelement <16 x i16> undef, i32 15
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_a = extractelement <32 x i16> undef, i32 %arg
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_0 = extractelement <32 x i16> undef, i32 0
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_7 = extractelement <32 x i16> undef, i32 7
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_8 = extractelement <32 x i16> undef, i32 8
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_15 = extractelement <32 x i16> undef, i32 15
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_16 = extractelement <32 x i16> undef, i32 16
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_24 = extractelement <32 x i16> undef, i32 24
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_31 = extractelement <32 x i16> undef, i32 31
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'extract_i16'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_a = extractelement <8 x i16> undef, i32 %arg
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_0 = extractelement <8 x i16> undef, i32 0
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_7 = extractelement <8 x i16> undef, i32 7
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_a = extractelement <16 x i16> undef, i32 %arg
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_0 = extractelement <16 x i16> undef, i32 0
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_7 = extractelement <16 x i16> undef, i32 7
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_8 = extractelement <16 x i16> undef, i32 8
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_15 = extractelement <16 x i16> undef, i32 15
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_a = extractelement <32 x i16> undef, i32 %arg
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_0 = extractelement <32 x i16> undef, i32 0
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_7 = extractelement <32 x i16> undef, i32 7
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_8 = extractelement <32 x i16> undef, i32 8
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_15 = extractelement <32 x i16> undef, i32 15
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_16 = extractelement <32 x i16> undef, i32 16
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_24 = extractelement <32 x i16> undef, i32 24
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_31 = extractelement <32 x i16> undef, i32 31
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SLM-LABEL: 'extract_i16'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_a = extractelement <8 x i16> undef, i32 %arg
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i16_0 = extractelement <8 x i16> undef, i32 0
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i16_7 = extractelement <8 x i16> undef, i32 7
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_a = extractelement <16 x i16> undef, i32 %arg
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i16_0 = extractelement <16 x i16> undef, i32 0
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i16_7 = extractelement <16 x i16> undef, i32 7
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i16_8 = extractelement <16 x i16> undef, i32 8
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i16_15 = extractelement <16 x i16> undef, i32 15
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_a = extractelement <32 x i16> undef, i32 %arg
+; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v32i16_0 = extractelement <32 x i16> undef, i32 0
+; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v32i16_7 = extractelement <32 x i16> undef, i32 7
+; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v32i16_8 = extractelement <32 x i16> undef, i32 8
+; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v32i16_15 = extractelement <32 x i16> undef, i32 15
+; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v32i16_16 = extractelement <32 x i16> undef, i32 16
+; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v32i16_24 = extractelement <32 x i16> undef, i32 24
+; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v32i16_31 = extractelement <32 x i16> undef, i32 31
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; GLM-LABEL: 'extract_i16'
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_a = extractelement <8 x i16> undef, i32 %arg
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_0 = extractelement <8 x i16> undef, i32 0
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_7 = extractelement <8 x i16> undef, i32 7
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_a = extractelement <16 x i16> undef, i32 %arg
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_0 = extractelement <16 x i16> undef, i32 0
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_7 = extractelement <16 x i16> undef, i32 7
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_8 = extractelement <16 x i16> undef, i32 8
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_15 = extractelement <16 x i16> undef, i32 15
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_a = extractelement <32 x i16> undef, i32 %arg
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_0 = extractelement <32 x i16> undef, i32 0
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_7 = extractelement <32 x i16> undef, i32 7
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_8 = extractelement <32 x i16> undef, i32 8
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_15 = extractelement <32 x i16> undef, i32 15
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_16 = extractelement <32 x i16> undef, i32 16
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_24 = extractelement <32 x i16> undef, i32 24
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_31 = extractelement <32 x i16> undef, i32 31
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; BTVER2-LABEL: 'extract_i16'
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_a = extractelement <8 x i16> undef, i32 %arg
@@ -357,29 +721,197 @@ define i32 @extract_i16(i32 %arg) {
 }
 
 define i32 @extract_i8(i32 %arg) {
-; CHECK-LABEL: 'extract_i8'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_a = extractelement <16 x i8> undef, i32 %arg
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_0 = extractelement <16 x i8> undef, i32 0
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_8 = extractelement <16 x i8> undef, i32 8
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_15 = extractelement <16 x i8> undef, i32 15
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_a = extractelement <32 x i8> undef, i32 %arg
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_0 = extractelement <32 x i8> undef, i32 0
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_7 = extractelement <32 x i8> undef, i32 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_8 = extractelement <32 x i8> undef, i32 8
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_15 = extractelement <32 x i8> undef, i32 15
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_24 = extractelement <32 x i8> undef, i32 24
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_31 = extractelement <32 x i8> undef, i32 31
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_a = extractelement <64 x i8> undef, i32 %arg
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_0 = extractelement <64 x i8> undef, i32 0
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_7 = extractelement <64 x i8> undef, i32 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_8 = extractelement <64 x i8> undef, i32 8
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_15 = extractelement <64 x i8> undef, i32 15
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_24 = extractelement <64 x i8> undef, i32 24
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_31 = extractelement <64 x i8> undef, i32 31
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_32 = extractelement <64 x i8> undef, i32 32
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_48 = extractelement <64 x i8> undef, i32 48
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_63 = extractelement <64 x i8> undef, i32 63
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE2-LABEL: 'extract_i8'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_a = extractelement <16 x i8> undef, i32 %arg
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_0 = extractelement <16 x i8> undef, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_8 = extractelement <16 x i8> undef, i32 8
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_15 = extractelement <16 x i8> undef, i32 15
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_a = extractelement <32 x i8> undef, i32 %arg
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_0 = extractelement <32 x i8> undef, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_7 = extractelement <32 x i8> undef, i32 7
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_8 = extractelement <32 x i8> undef, i32 8
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_15 = extractelement <32 x i8> undef, i32 15
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_24 = extractelement <32 x i8> undef, i32 24
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_31 = extractelement <32 x i8> undef, i32 31
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_a = extractelement <64 x i8> undef, i32 %arg
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_0 = extractelement <64 x i8> undef, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_7 = extractelement <64 x i8> undef, i32 7
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_8 = extractelement <64 x i8> undef, i32 8
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_15 = extractelement <64 x i8> undef, i32 15
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_24 = extractelement <64 x i8> undef, i32 24
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_31 = extractelement <64 x i8> undef, i32 31
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_32 = extractelement <64 x i8> undef, i32 32
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_48 = extractelement <64 x i8> undef, i32 48
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_63 = extractelement <64 x i8> undef, i32 63
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE3-LABEL: 'extract_i8'
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_a = extractelement <16 x i8> undef, i32 %arg
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_0 = extractelement <16 x i8> undef, i32 0
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_8 = extractelement <16 x i8> undef, i32 8
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_15 = extractelement <16 x i8> undef, i32 15
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_a = extractelement <32 x i8> undef, i32 %arg
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_0 = extractelement <32 x i8> undef, i32 0
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_7 = extractelement <32 x i8> undef, i32 7
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_8 = extractelement <32 x i8> undef, i32 8
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_15 = extractelement <32 x i8> undef, i32 15
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_24 = extractelement <32 x i8> undef, i32 24
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_31 = extractelement <32 x i8> undef, i32 31
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_a = extractelement <64 x i8> undef, i32 %arg
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_0 = extractelement <64 x i8> undef, i32 0
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_7 = extractelement <64 x i8> undef, i32 7
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_8 = extractelement <64 x i8> undef, i32 8
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_15 = extractelement <64 x i8> undef, i32 15
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_24 = extractelement <64 x i8> undef, i32 24
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_31 = extractelement <64 x i8> undef, i32 31
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_32 = extractelement <64 x i8> undef, i32 32
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_48 = extractelement <64 x i8> undef, i32 48
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_63 = extractelement <64 x i8> undef, i32 63
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'extract_i8'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_a = extractelement <16 x i8> undef, i32 %arg
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_0 = extractelement <16 x i8> undef, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_8 = extractelement <16 x i8> undef, i32 8
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_15 = extractelement <16 x i8> undef, i32 15
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_a = extractelement <32 x i8> undef, i32 %arg
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_0 = extractelement <32 x i8> undef, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_7 = extractelement <32 x i8> undef, i32 7
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_8 = extractelement <32 x i8> undef, i32 8
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_15 = extractelement <32 x i8> undef, i32 15
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_24 = extractelement <32 x i8> undef, i32 24
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_31 = extractelement <32 x i8> undef, i32 31
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_a = extractelement <64 x i8> undef, i32 %arg
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_0 = extractelement <64 x i8> undef, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_7 = extractelement <64 x i8> undef, i32 7
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_8 = extractelement <64 x i8> undef, i32 8
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_15 = extractelement <64 x i8> undef, i32 15
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_24 = extractelement <64 x i8> undef, i32 24
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_31 = extractelement <64 x i8> undef, i32 31
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_32 = extractelement <64 x i8> undef, i32 32
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_48 = extractelement <64 x i8> undef, i32 48
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_63 = extractelement <64 x i8> undef, i32 63
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE41-LABEL: 'extract_i8'
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_a = extractelement <16 x i8> undef, i32 %arg
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_0 = extractelement <16 x i8> undef, i32 0
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_8 = extractelement <16 x i8> undef, i32 8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_15 = extractelement <16 x i8> undef, i32 15
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_a = extractelement <32 x i8> undef, i32 %arg
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_0 = extractelement <32 x i8> undef, i32 0
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_7 = extractelement <32 x i8> undef, i32 7
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_8 = extractelement <32 x i8> undef, i32 8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_15 = extractelement <32 x i8> undef, i32 15
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_24 = extractelement <32 x i8> undef, i32 24
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_31 = extractelement <32 x i8> undef, i32 31
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_a = extractelement <64 x i8> undef, i32 %arg
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_0 = extractelement <64 x i8> undef, i32 0
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_7 = extractelement <64 x i8> undef, i32 7
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_8 = extractelement <64 x i8> undef, i32 8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_15 = extractelement <64 x i8> undef, i32 15
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_24 = extractelement <64 x i8> undef, i32 24
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_31 = extractelement <64 x i8> undef, i32 31
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_32 = extractelement <64 x i8> undef, i32 32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_48 = extractelement <64 x i8> undef, i32 48
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_63 = extractelement <64 x i8> undef, i32 63
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX-LABEL: 'extract_i8'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_a = extractelement <16 x i8> undef, i32 %arg
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_0 = extractelement <16 x i8> undef, i32 0
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_8 = extractelement <16 x i8> undef, i32 8
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_15 = extractelement <16 x i8> undef, i32 15
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_a = extractelement <32 x i8> undef, i32 %arg
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_0 = extractelement <32 x i8> undef, i32 0
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_7 = extractelement <32 x i8> undef, i32 7
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_8 = extractelement <32 x i8> undef, i32 8
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_15 = extractelement <32 x i8> undef, i32 15
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_24 = extractelement <32 x i8> undef, i32 24
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_31 = extractelement <32 x i8> undef, i32 31
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_a = extractelement <64 x i8> undef, i32 %arg
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_0 = extractelement <64 x i8> undef, i32 0
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_7 = extractelement <64 x i8> undef, i32 7
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_8 = extractelement <64 x i8> undef, i32 8
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_15 = extractelement <64 x i8> undef, i32 15
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_24 = extractelement <64 x i8> undef, i32 24
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_31 = extractelement <64 x i8> undef, i32 31
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_32 = extractelement <64 x i8> undef, i32 32
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_48 = extractelement <64 x i8> undef, i32 48
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_63 = extractelement <64 x i8> undef, i32 63
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'extract_i8'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_a = extractelement <16 x i8> undef, i32 %arg
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_0 = extractelement <16 x i8> undef, i32 0
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_8 = extractelement <16 x i8> undef, i32 8
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_15 = extractelement <16 x i8> undef, i32 15
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_a = extractelement <32 x i8> undef, i32 %arg
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_0 = extractelement <32 x i8> undef, i32 0
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_7 = extractelement <32 x i8> undef, i32 7
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_8 = extractelement <32 x i8> undef, i32 8
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_15 = extractelement <32 x i8> undef, i32 15
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_24 = extractelement <32 x i8> undef, i32 24
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_31 = extractelement <32 x i8> undef, i32 31
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_a = extractelement <64 x i8> undef, i32 %arg
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_0 = extractelement <64 x i8> undef, i32 0
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_7 = extractelement <64 x i8> undef, i32 7
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_8 = extractelement <64 x i8> undef, i32 8
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_15 = extractelement <64 x i8> undef, i32 15
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_24 = extractelement <64 x i8> undef, i32 24
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_31 = extractelement <64 x i8> undef, i32 31
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_32 = extractelement <64 x i8> undef, i32 32
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_48 = extractelement <64 x i8> undef, i32 48
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_63 = extractelement <64 x i8> undef, i32 63
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SLM-LABEL: 'extract_i8'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_a = extractelement <16 x i8> undef, i32 %arg
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i8_0 = extractelement <16 x i8> undef, i32 0
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i8_8 = extractelement <16 x i8> undef, i32 8
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i8_15 = extractelement <16 x i8> undef, i32 15
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_a = extractelement <32 x i8> undef, i32 %arg
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i8_0 = extractelement <32 x i8> undef, i32 0
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i8_7 = extractelement <32 x i8> undef, i32 7
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i8_8 = extractelement <32 x i8> undef, i32 8
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i8_15 = extractelement <32 x i8> undef, i32 15
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i8_24 = extractelement <32 x i8> undef, i32 24
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i8_31 = extractelement <32 x i8> undef, i32 31
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_a = extractelement <64 x i8> undef, i32 %arg
+; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v64i8_0 = extractelement <64 x i8> undef, i32 0
+; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v64i8_7 = extractelement <64 x i8> undef, i32 7
+; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v64i8_8 = extractelement <64 x i8> undef, i32 8
+; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v64i8_15 = extractelement <64 x i8> undef, i32 15
+; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v64i8_24 = extractelement <64 x i8> undef, i32 24
+; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v64i8_31 = extractelement <64 x i8> undef, i32 31
+; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v64i8_32 = extractelement <64 x i8> undef, i32 32
+; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v64i8_48 = extractelement <64 x i8> undef, i32 48
+; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v64i8_63 = extractelement <64 x i8> undef, i32 63
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; GLM-LABEL: 'extract_i8'
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_a = extractelement <16 x i8> undef, i32 %arg
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_0 = extractelement <16 x i8> undef, i32 0
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_8 = extractelement <16 x i8> undef, i32 8
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_15 = extractelement <16 x i8> undef, i32 15
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_a = extractelement <32 x i8> undef, i32 %arg
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_0 = extractelement <32 x i8> undef, i32 0
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_7 = extractelement <32 x i8> undef, i32 7
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_8 = extractelement <32 x i8> undef, i32 8
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_15 = extractelement <32 x i8> undef, i32 15
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_24 = extractelement <32 x i8> undef, i32 24
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_31 = extractelement <32 x i8> undef, i32 31
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_a = extractelement <64 x i8> undef, i32 %arg
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_0 = extractelement <64 x i8> undef, i32 0
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_7 = extractelement <64 x i8> undef, i32 7
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_8 = extractelement <64 x i8> undef, i32 8
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_15 = extractelement <64 x i8> undef, i32 15
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_24 = extractelement <64 x i8> undef, i32 24
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_31 = extractelement <64 x i8> undef, i32 31
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_32 = extractelement <64 x i8> undef, i32 32
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_48 = extractelement <64 x i8> undef, i32 48
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_63 = extractelement <64 x i8> undef, i32 63
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; BTVER2-LABEL: 'extract_i8'
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_a = extractelement <16 x i8> undef, i32 %arg

diff  --git a/llvm/test/Transforms/LoopVectorize/X86/interleaving.ll b/llvm/test/Transforms/LoopVectorize/X86/interleaving.ll
index 9294c92b5759..f12f35702156 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/interleaving.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/interleaving.ll
@@ -1,6 +1,6 @@
 ; RUN: opt -S -mtriple=x86_64-pc_linux -loop-vectorize -instcombine < %s | FileCheck %s --check-prefix=NORMAL
-; RUN: opt -S -mtriple=x86_64-pc_linux -loop-vectorize -instcombine -mcpu=slm < %s | FileCheck %s --check-prefix=NORMAL
-; RUN: opt -S -mtriple=x86_64-pc_linux -loop-vectorize -instcombine -mcpu=atom < %s | FileCheck %s --check-prefix=ATOM
+; RUN: opt -S -mtriple=x86_64-pc_linux -loop-vectorize -instcombine -mcpu=slm < %s | FileCheck %s --check-prefix=SLOW
+; RUN: opt -S -mtriple=x86_64-pc_linux -loop-vectorize -instcombine -mcpu=atom < %s | FileCheck %s --check-prefix=SLOW
 
 ; NORMAL-LABEL: foo
 ; NORMAL: %[[WIDE:.*]] = load <8 x i32>, <8 x i32>* %{{.*}}, align 4
@@ -8,10 +8,10 @@
 ; NORMAL: %[[STRIDED2:.*]] = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
 ; NORMAL: add nsw <4 x i32> %[[STRIDED2]], %[[STRIDED1]]
 
-; ATOM-LABEL: foo
-; ATOM: load i32
-; ATOM: load i32
-; ATOM: store i32
+; SLOW-LABEL: foo
+; SLOW: load i32
+; SLOW: load i32
+; SLOW: store i32
 define void @foo(i32* noalias nocapture %a, i32* noalias nocapture readonly %b) {
 entry:
   br label %for.body

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll
index 8f8b1d443da8..9ee016e4331b 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll
@@ -35,30 +35,9 @@ define <8 x float> @sitofp_uitofp(<8 x i32> %a) {
 ; SSE-NEXT:    ret <8 x float> [[R7]]
 ;
 ; SLM-LABEL: @sitofp_uitofp(
-; SLM-NEXT:    [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0
-; SLM-NEXT:    [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1
-; SLM-NEXT:    [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2
-; SLM-NEXT:    [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3
-; SLM-NEXT:    [[A4:%.*]] = extractelement <8 x i32> [[A]], i32 4
-; SLM-NEXT:    [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5
-; SLM-NEXT:    [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6
-; SLM-NEXT:    [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7
-; SLM-NEXT:    [[AB0:%.*]] = sitofp i32 [[A0]] to float
-; SLM-NEXT:    [[AB1:%.*]] = sitofp i32 [[A1]] to float
-; SLM-NEXT:    [[AB2:%.*]] = sitofp i32 [[A2]] to float
-; SLM-NEXT:    [[AB3:%.*]] = sitofp i32 [[A3]] to float
-; SLM-NEXT:    [[AB4:%.*]] = uitofp i32 [[A4]] to float
-; SLM-NEXT:    [[AB5:%.*]] = uitofp i32 [[A5]] to float
-; SLM-NEXT:    [[AB6:%.*]] = uitofp i32 [[A6]] to float
-; SLM-NEXT:    [[AB7:%.*]] = uitofp i32 [[A7]] to float
-; SLM-NEXT:    [[R0:%.*]] = insertelement <8 x float> undef, float [[AB0]], i32 0
-; SLM-NEXT:    [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i32 1
-; SLM-NEXT:    [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i32 2
-; SLM-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i32 3
-; SLM-NEXT:    [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i32 4
-; SLM-NEXT:    [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5
-; SLM-NEXT:    [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6
-; SLM-NEXT:    [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i32 7
+; SLM-NEXT:    [[TMP1:%.*]] = sitofp <8 x i32> [[A:%.*]] to <8 x float>
+; SLM-NEXT:    [[TMP2:%.*]] = uitofp <8 x i32> [[A]] to <8 x float>
+; SLM-NEXT:    [[R7:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
 ; SLM-NEXT:    ret <8 x float> [[R7]]
 ;
 ; AVX-LABEL: @sitofp_uitofp(
@@ -268,11 +247,50 @@ define <8 x float> @fneg_fabs(<8 x float> %a) {
 }
 
 define <8 x i32> @sext_zext(<8 x i16> %a) {
-; CHECK-LABEL: @sext_zext(
-; CHECK-NEXT:    [[TMP1:%.*]] = sext <8 x i16> [[A:%.*]] to <8 x i32>
-; CHECK-NEXT:    [[TMP2:%.*]] = zext <8 x i16> [[A]] to <8 x i32>
-; CHECK-NEXT:    [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    ret <8 x i32> [[R7]]
+; SSE-LABEL: @sext_zext(
+; SSE-NEXT:    [[TMP1:%.*]] = sext <8 x i16> [[A:%.*]] to <8 x i32>
+; SSE-NEXT:    [[TMP2:%.*]] = zext <8 x i16> [[A]] to <8 x i32>
+; SSE-NEXT:    [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:    ret <8 x i32> [[R7]]
+;
+; SLM-LABEL: @sext_zext(
+; SLM-NEXT:    [[A0:%.*]] = extractelement <8 x i16> [[A:%.*]], i32 0
+; SLM-NEXT:    [[A1:%.*]] = extractelement <8 x i16> [[A]], i32 1
+; SLM-NEXT:    [[A2:%.*]] = extractelement <8 x i16> [[A]], i32 2
+; SLM-NEXT:    [[A3:%.*]] = extractelement <8 x i16> [[A]], i32 3
+; SLM-NEXT:    [[A4:%.*]] = extractelement <8 x i16> [[A]], i32 4
+; SLM-NEXT:    [[A5:%.*]] = extractelement <8 x i16> [[A]], i32 5
+; SLM-NEXT:    [[A6:%.*]] = extractelement <8 x i16> [[A]], i32 6
+; SLM-NEXT:    [[A7:%.*]] = extractelement <8 x i16> [[A]], i32 7
+; SLM-NEXT:    [[AB0:%.*]] = sext i16 [[A0]] to i32
+; SLM-NEXT:    [[AB1:%.*]] = sext i16 [[A1]] to i32
+; SLM-NEXT:    [[AB2:%.*]] = sext i16 [[A2]] to i32
+; SLM-NEXT:    [[AB3:%.*]] = sext i16 [[A3]] to i32
+; SLM-NEXT:    [[AB4:%.*]] = zext i16 [[A4]] to i32
+; SLM-NEXT:    [[AB5:%.*]] = zext i16 [[A5]] to i32
+; SLM-NEXT:    [[AB6:%.*]] = zext i16 [[A6]] to i32
+; SLM-NEXT:    [[AB7:%.*]] = zext i16 [[A7]] to i32
+; SLM-NEXT:    [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0
+; SLM-NEXT:    [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1
+; SLM-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2
+; SLM-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3
+; SLM-NEXT:    [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4
+; SLM-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5
+; SLM-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6
+; SLM-NEXT:    [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
+; SLM-NEXT:    ret <8 x i32> [[R7]]
+;
+; AVX-LABEL: @sext_zext(
+; AVX-NEXT:    [[TMP1:%.*]] = sext <8 x i16> [[A:%.*]] to <8 x i32>
+; AVX-NEXT:    [[TMP2:%.*]] = zext <8 x i16> [[A]] to <8 x i32>
+; AVX-NEXT:    [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX-NEXT:    ret <8 x i32> [[R7]]
+;
+; AVX512-LABEL: @sext_zext(
+; AVX512-NEXT:    [[TMP1:%.*]] = sext <8 x i16> [[A:%.*]] to <8 x i32>
+; AVX512-NEXT:    [[TMP2:%.*]] = zext <8 x i16> [[A]] to <8 x i32>
+; AVX512-NEXT:    [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX512-NEXT:    ret <8 x i32> [[R7]]
 ;
   %a0 = extractelement <8 x i16> %a, i32 0
   %a1 = extractelement <8 x i16> %a, i32 1
@@ -383,26 +401,24 @@ define <8 x float> @sitofp_uitofp_4i32_8i16_16i8(<4 x i32> %a, <8 x i16> %b, <16
 ; SSE-NEXT:    ret <8 x float> [[R7]]
 ;
 ; SLM-LABEL: @sitofp_uitofp_4i32_8i16_16i8(
-; SLM-NEXT:    [[A0:%.*]] = extractelement <4 x i32> [[A:%.*]], i32 0
-; SLM-NEXT:    [[A1:%.*]] = extractelement <4 x i32> [[A]], i32 1
-; SLM-NEXT:    [[A2:%.*]] = extractelement <4 x i32> [[A]], i32 2
-; SLM-NEXT:    [[A3:%.*]] = extractelement <4 x i32> [[A]], i32 3
 ; SLM-NEXT:    [[B0:%.*]] = extractelement <8 x i16> [[B:%.*]], i32 0
 ; SLM-NEXT:    [[B1:%.*]] = extractelement <8 x i16> [[B]], i32 1
 ; SLM-NEXT:    [[C0:%.*]] = extractelement <16 x i8> [[C:%.*]], i32 0
 ; SLM-NEXT:    [[C1:%.*]] = extractelement <16 x i8> [[C]], i32 1
-; SLM-NEXT:    [[AB0:%.*]] = sitofp i32 [[A0]] to float
-; SLM-NEXT:    [[AB1:%.*]] = sitofp i32 [[A1]] to float
-; SLM-NEXT:    [[AB2:%.*]] = uitofp i32 [[A2]] to float
-; SLM-NEXT:    [[AB3:%.*]] = uitofp i32 [[A3]] to float
+; SLM-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[A:%.*]] to <4 x float>
+; SLM-NEXT:    [[TMP2:%.*]] = uitofp <4 x i32> [[A]] to <4 x float>
 ; SLM-NEXT:    [[AB4:%.*]] = sitofp i16 [[B0]] to float
 ; SLM-NEXT:    [[AB5:%.*]] = uitofp i16 [[B1]] to float
 ; SLM-NEXT:    [[AB6:%.*]] = sitofp i8 [[C0]] to float
 ; SLM-NEXT:    [[AB7:%.*]] = uitofp i8 [[C1]] to float
-; SLM-NEXT:    [[R0:%.*]] = insertelement <8 x float> undef, float [[AB0]], i32 0
-; SLM-NEXT:    [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i32 1
-; SLM-NEXT:    [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i32 2
-; SLM-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i32 3
+; SLM-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
+; SLM-NEXT:    [[R0:%.*]] = insertelement <8 x float> undef, float [[TMP3]], i32 0
+; SLM-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
+; SLM-NEXT:    [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[TMP4]], i32 1
+; SLM-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP2]], i32 2
+; SLM-NEXT:    [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[TMP5]], i32 2
+; SLM-NEXT:    [[TMP6:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
+; SLM-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[TMP6]], i32 3
 ; SLM-NEXT:    [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i32 4
 ; SLM-NEXT:    [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5
 ; SLM-NEXT:    [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll
index 44729b4a8d5a..23d1634fdb6c 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll
@@ -75,42 +75,11 @@ define <4 x i32> @add_and_v4i32(<4 x i32> %a, <4 x i32> %b) {
 }
 
 define <4 x i32> @add_mul_v4i32(<4 x i32> %a, <4 x i32> %b) {
-; SSE-LABEL: @add_mul_v4i32(
-; SSE-NEXT:    [[TMP1:%.*]] = mul <4 x i32> [[A:%.*]], [[B:%.*]]
-; SSE-NEXT:    [[TMP2:%.*]] = add <4 x i32> [[A]], [[B]]
-; SSE-NEXT:    [[R3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 6, i32 3>
-; SSE-NEXT:    ret <4 x i32> [[R3]]
-;
-; SLM-LABEL: @add_mul_v4i32(
-; SLM-NEXT:    [[A0:%.*]] = extractelement <4 x i32> [[A:%.*]], i32 0
-; SLM-NEXT:    [[A1:%.*]] = extractelement <4 x i32> [[A]], i32 1
-; SLM-NEXT:    [[A2:%.*]] = extractelement <4 x i32> [[A]], i32 2
-; SLM-NEXT:    [[A3:%.*]] = extractelement <4 x i32> [[A]], i32 3
-; SLM-NEXT:    [[B0:%.*]] = extractelement <4 x i32> [[B:%.*]], i32 0
-; SLM-NEXT:    [[B1:%.*]] = extractelement <4 x i32> [[B]], i32 1
-; SLM-NEXT:    [[B2:%.*]] = extractelement <4 x i32> [[B]], i32 2
-; SLM-NEXT:    [[B3:%.*]] = extractelement <4 x i32> [[B]], i32 3
-; SLM-NEXT:    [[AB0:%.*]] = mul i32 [[A0]], [[B0]]
-; SLM-NEXT:    [[AB1:%.*]] = add i32 [[A1]], [[B1]]
-; SLM-NEXT:    [[AB2:%.*]] = add i32 [[A2]], [[B2]]
-; SLM-NEXT:    [[AB3:%.*]] = mul i32 [[A3]], [[B3]]
-; SLM-NEXT:    [[R0:%.*]] = insertelement <4 x i32> undef, i32 [[AB0]], i32 0
-; SLM-NEXT:    [[R1:%.*]] = insertelement <4 x i32> [[R0]], i32 [[AB1]], i32 1
-; SLM-NEXT:    [[R2:%.*]] = insertelement <4 x i32> [[R1]], i32 [[AB2]], i32 2
-; SLM-NEXT:    [[R3:%.*]] = insertelement <4 x i32> [[R2]], i32 [[AB3]], i32 3
-; SLM-NEXT:    ret <4 x i32> [[R3]]
-;
-; AVX-LABEL: @add_mul_v4i32(
-; AVX-NEXT:    [[TMP1:%.*]] = mul <4 x i32> [[A:%.*]], [[B:%.*]]
-; AVX-NEXT:    [[TMP2:%.*]] = add <4 x i32> [[A]], [[B]]
-; AVX-NEXT:    [[R3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 6, i32 3>
-; AVX-NEXT:    ret <4 x i32> [[R3]]
-;
-; AVX512-LABEL: @add_mul_v4i32(
-; AVX512-NEXT:    [[TMP1:%.*]] = mul <4 x i32> [[A:%.*]], [[B:%.*]]
-; AVX512-NEXT:    [[TMP2:%.*]] = add <4 x i32> [[A]], [[B]]
-; AVX512-NEXT:    [[R3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 6, i32 3>
-; AVX512-NEXT:    ret <4 x i32> [[R3]]
+; CHECK-LABEL: @add_mul_v4i32(
+; CHECK-NEXT:    [[TMP1:%.*]] = mul <4 x i32> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = add <4 x i32> [[A]], [[B]]
+; CHECK-NEXT:    [[R3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 6, i32 3>
+; CHECK-NEXT:    ret <4 x i32> [[R3]]
 ;
   %a0 = extractelement <4 x i32> %a, i32 0
   %a1 = extractelement <4 x i32> %a, i32 1

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll b/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll
index b02244f9614b..71f72a93075b 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll
@@ -78,34 +78,11 @@ define <4 x float> @test_v4f32(<4 x float> %a, <4 x float> %b) {
 }
 
 define <2 x i64> @test_v2i64(<2 x i64> %a, <2 x i64> %b) {
-; SSE-LABEL: @test_v2i64(
-; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i32> <i32 0, i32 2>
-; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> <i32 1, i32 3>
-; SSE-NEXT:    [[TMP3:%.*]] = add <2 x i64> [[TMP1]], [[TMP2]]
-; SSE-NEXT:    ret <2 x i64> [[TMP3]]
-;
-; SLM-LABEL: @test_v2i64(
-; SLM-NEXT:    [[A0:%.*]] = extractelement <2 x i64> [[A:%.*]], i32 0
-; SLM-NEXT:    [[A1:%.*]] = extractelement <2 x i64> [[A]], i32 1
-; SLM-NEXT:    [[B0:%.*]] = extractelement <2 x i64> [[B:%.*]], i32 0
-; SLM-NEXT:    [[B1:%.*]] = extractelement <2 x i64> [[B]], i32 1
-; SLM-NEXT:    [[R0:%.*]] = add i64 [[A0]], [[A1]]
-; SLM-NEXT:    [[R1:%.*]] = add i64 [[B0]], [[B1]]
-; SLM-NEXT:    [[R00:%.*]] = insertelement <2 x i64> undef, i64 [[R0]], i32 0
-; SLM-NEXT:    [[R01:%.*]] = insertelement <2 x i64> [[R00]], i64 [[R1]], i32 1
-; SLM-NEXT:    ret <2 x i64> [[R01]]
-;
-; AVX-LABEL: @test_v2i64(
-; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i32> <i32 0, i32 2>
-; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> <i32 1, i32 3>
-; AVX-NEXT:    [[TMP3:%.*]] = add <2 x i64> [[TMP1]], [[TMP2]]
-; AVX-NEXT:    ret <2 x i64> [[TMP3]]
-;
-; AVX512-LABEL: @test_v2i64(
-; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i32> <i32 0, i32 2>
-; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> <i32 1, i32 3>
-; AVX512-NEXT:    [[TMP3:%.*]] = add <2 x i64> [[TMP1]], [[TMP2]]
-; AVX512-NEXT:    ret <2 x i64> [[TMP3]]
+; CHECK-LABEL: @test_v2i64(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP3:%.*]] = add <2 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <2 x i64> [[TMP3]]
 ;
   %a0 = extractelement <2 x i64> %a, i32 0
   %a1 = extractelement <2 x i64> %a, i32 1
@@ -322,14 +299,10 @@ define <4 x i64> @test_v4i64(<4 x i64> %a, <4 x i64> %b) {
 ; SSE-NEXT:    ret <4 x i64> [[R03]]
 ;
 ; SLM-LABEL: @test_v4i64(
-; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32> <i32 0, i32 4>
-; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 1, i32 5>
-; SLM-NEXT:    [[TMP3:%.*]] = add <2 x i64> [[TMP1]], [[TMP2]]
-; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 2, i32 6>
-; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 3, i32 7>
-; SLM-NEXT:    [[TMP6:%.*]] = add <2 x i64> [[TMP4]], [[TMP5]]
-; SLM-NEXT:    [[R03:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SLM-NEXT:    ret <4 x i64> [[R03]]
+; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+; SLM-NEXT:    [[TMP3:%.*]] = add <4 x i64> [[TMP1]], [[TMP2]]
+; SLM-NEXT:    ret <4 x i64> [[TMP3]]
 ;
 ; AVX-LABEL: @test_v4i64(
 ; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -374,14 +347,10 @@ define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) {
 ; SSE-NEXT:    ret <8 x i32> [[R07]]
 ;
 ; SLM-LABEL: @test_v8i32(
-; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
-; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
-; SLM-NEXT:    [[TMP3:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]]
-; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
-; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
-; SLM-NEXT:    [[TMP6:%.*]] = add <4 x i32> [[TMP4]], [[TMP5]]
-; SLM-NEXT:    [[R07:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SLM-NEXT:    ret <8 x i32> [[R07]]
+; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
+; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
+; SLM-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[TMP1]], [[TMP2]]
+; SLM-NEXT:    ret <8 x i32> [[TMP3]]
 ;
 ; AVX-LABEL: @test_v8i32(
 ; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll b/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll
index d6e44aa1d6a0..b7e487eed9eb 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll
@@ -78,34 +78,11 @@ define <4 x float> @test_v4f32(<4 x float> %a, <4 x float> %b) {
 }
 
 define <2 x i64> @test_v2i64(<2 x i64> %a, <2 x i64> %b) {
-; SSE-LABEL: @test_v2i64(
-; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i32> <i32 0, i32 2>
-; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> <i32 1, i32 3>
-; SSE-NEXT:    [[TMP3:%.*]] = sub <2 x i64> [[TMP1]], [[TMP2]]
-; SSE-NEXT:    ret <2 x i64> [[TMP3]]
-;
-; SLM-LABEL: @test_v2i64(
-; SLM-NEXT:    [[A0:%.*]] = extractelement <2 x i64> [[A:%.*]], i32 0
-; SLM-NEXT:    [[A1:%.*]] = extractelement <2 x i64> [[A]], i32 1
-; SLM-NEXT:    [[B0:%.*]] = extractelement <2 x i64> [[B:%.*]], i32 0
-; SLM-NEXT:    [[B1:%.*]] = extractelement <2 x i64> [[B]], i32 1
-; SLM-NEXT:    [[R0:%.*]] = sub i64 [[A0]], [[A1]]
-; SLM-NEXT:    [[R1:%.*]] = sub i64 [[B0]], [[B1]]
-; SLM-NEXT:    [[R00:%.*]] = insertelement <2 x i64> undef, i64 [[R0]], i32 0
-; SLM-NEXT:    [[R01:%.*]] = insertelement <2 x i64> [[R00]], i64 [[R1]], i32 1
-; SLM-NEXT:    ret <2 x i64> [[R01]]
-;
-; AVX-LABEL: @test_v2i64(
-; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i32> <i32 0, i32 2>
-; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> <i32 1, i32 3>
-; AVX-NEXT:    [[TMP3:%.*]] = sub <2 x i64> [[TMP1]], [[TMP2]]
-; AVX-NEXT:    ret <2 x i64> [[TMP3]]
-;
-; AVX512-LABEL: @test_v2i64(
-; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i32> <i32 0, i32 2>
-; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> <i32 1, i32 3>
-; AVX512-NEXT:    [[TMP3:%.*]] = sub <2 x i64> [[TMP1]], [[TMP2]]
-; AVX512-NEXT:    ret <2 x i64> [[TMP3]]
+; CHECK-LABEL: @test_v2i64(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP3:%.*]] = sub <2 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <2 x i64> [[TMP3]]
 ;
   %a0 = extractelement <2 x i64> %a, i32 0
   %a1 = extractelement <2 x i64> %a, i32 1
@@ -322,14 +299,10 @@ define <4 x i64> @test_v4i64(<4 x i64> %a, <4 x i64> %b) {
 ; SSE-NEXT:    ret <4 x i64> [[R03]]
 ;
 ; SLM-LABEL: @test_v4i64(
-; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32> <i32 0, i32 4>
-; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 1, i32 5>
-; SLM-NEXT:    [[TMP3:%.*]] = sub <2 x i64> [[TMP1]], [[TMP2]]
-; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 2, i32 6>
-; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 3, i32 7>
-; SLM-NEXT:    [[TMP6:%.*]] = sub <2 x i64> [[TMP4]], [[TMP5]]
-; SLM-NEXT:    [[R03:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SLM-NEXT:    ret <4 x i64> [[R03]]
+; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+; SLM-NEXT:    [[TMP3:%.*]] = sub <4 x i64> [[TMP1]], [[TMP2]]
+; SLM-NEXT:    ret <4 x i64> [[TMP3]]
 ;
 ; AVX-LABEL: @test_v4i64(
 ; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -374,14 +347,10 @@ define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) {
 ; SSE-NEXT:    ret <8 x i32> [[R07]]
 ;
 ; SLM-LABEL: @test_v8i32(
-; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
-; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
-; SLM-NEXT:    [[TMP3:%.*]] = sub <4 x i32> [[TMP1]], [[TMP2]]
-; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
-; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
-; SLM-NEXT:    [[TMP6:%.*]] = sub <4 x i32> [[TMP4]], [[TMP5]]
-; SLM-NEXT:    [[R07:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SLM-NEXT:    ret <8 x i32> [[R07]]
+; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
+; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
+; SLM-NEXT:    [[TMP3:%.*]] = sub <8 x i32> [[TMP1]], [[TMP2]]
+; SLM-NEXT:    ret <8 x i32> [[TMP3]]
 ;
 ; AVX-LABEL: @test_v8i32(
 ; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/sext.ll b/llvm/test/Transforms/SLPVectorizer/X86/sext.ll
index f3404831e213..c3eba4701e99 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/sext.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/sext.ll
@@ -11,26 +11,15 @@
 ;
 
 define <2 x i64> @loadext_2i8_to_2i64(i8* %p0) {
-; SSE2-LABEL: @loadext_2i8_to_2i64(
-; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
-; SSE2-NEXT:    [[I0:%.*]] = load i8, i8* [[P0]], align 1
-; SSE2-NEXT:    [[I1:%.*]] = load i8, i8* [[P1]], align 1
-; SSE2-NEXT:    [[X0:%.*]] = sext i8 [[I0]] to i64
-; SSE2-NEXT:    [[X1:%.*]] = sext i8 [[I1]] to i64
-; SSE2-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0
-; SSE2-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1
-; SSE2-NEXT:    ret <2 x i64> [[V1]]
-;
-; SLM-LABEL: @loadext_2i8_to_2i64(
-; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
-; SLM-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <2 x i8>*
-; SLM-NEXT:    [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 1
-; SLM-NEXT:    [[TMP3:%.*]] = sext <2 x i8> [[TMP2]] to <2 x i64>
-; SLM-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
-; SLM-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
-; SLM-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
-; SLM-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
-; SLM-NEXT:    ret <2 x i64> [[V1]]
+; SSE-LABEL: @loadext_2i8_to_2i64(
+; SSE-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; SSE-NEXT:    [[I0:%.*]] = load i8, i8* [[P0]], align 1
+; SSE-NEXT:    [[I1:%.*]] = load i8, i8* [[P1]], align 1
+; SSE-NEXT:    [[X0:%.*]] = sext i8 [[I0]] to i64
+; SSE-NEXT:    [[X1:%.*]] = sext i8 [[I1]] to i64
+; SSE-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0
+; SSE-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1
+; SSE-NEXT:    ret <2 x i64> [[V1]]
 ;
 ; AVX-LABEL: @loadext_2i8_to_2i64(
 ; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
@@ -54,40 +43,23 @@ define <2 x i64> @loadext_2i8_to_2i64(i8* %p0) {
 }
 
 define <4 x i32> @loadext_4i8_to_4i32(i8* %p0) {
-; SSE2-LABEL: @loadext_4i8_to_4i32(
-; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
-; SSE2-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
-; SSE2-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
-; SSE2-NEXT:    [[I0:%.*]] = load i8, i8* [[P0]], align 1
-; SSE2-NEXT:    [[I1:%.*]] = load i8, i8* [[P1]], align 1
-; SSE2-NEXT:    [[I2:%.*]] = load i8, i8* [[P2]], align 1
-; SSE2-NEXT:    [[I3:%.*]] = load i8, i8* [[P3]], align 1
-; SSE2-NEXT:    [[X0:%.*]] = sext i8 [[I0]] to i32
-; SSE2-NEXT:    [[X1:%.*]] = sext i8 [[I1]] to i32
-; SSE2-NEXT:    [[X2:%.*]] = sext i8 [[I2]] to i32
-; SSE2-NEXT:    [[X3:%.*]] = sext i8 [[I3]] to i32
-; SSE2-NEXT:    [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[X0]], i32 0
-; SSE2-NEXT:    [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[X1]], i32 1
-; SSE2-NEXT:    [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[X2]], i32 2
-; SSE2-NEXT:    [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[X3]], i32 3
-; SSE2-NEXT:    ret <4 x i32> [[V3]]
-;
-; SLM-LABEL: @loadext_4i8_to_4i32(
-; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
-; SLM-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
-; SLM-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
-; SLM-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
-; SLM-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
-; SLM-NEXT:    [[TMP3:%.*]] = sext <4 x i8> [[TMP2]] to <4 x i32>
-; SLM-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
-; SLM-NEXT:    [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
-; SLM-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
-; SLM-NEXT:    [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1
-; SLM-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
-; SLM-NEXT:    [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2
-; SLM-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
-; SLM-NEXT:    [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3
-; SLM-NEXT:    ret <4 x i32> [[V3]]
+; SSE-LABEL: @loadext_4i8_to_4i32(
+; SSE-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; SSE-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; SSE-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; SSE-NEXT:    [[I0:%.*]] = load i8, i8* [[P0]], align 1
+; SSE-NEXT:    [[I1:%.*]] = load i8, i8* [[P1]], align 1
+; SSE-NEXT:    [[I2:%.*]] = load i8, i8* [[P2]], align 1
+; SSE-NEXT:    [[I3:%.*]] = load i8, i8* [[P3]], align 1
+; SSE-NEXT:    [[X0:%.*]] = sext i8 [[I0]] to i32
+; SSE-NEXT:    [[X1:%.*]] = sext i8 [[I1]] to i32
+; SSE-NEXT:    [[X2:%.*]] = sext i8 [[I2]] to i32
+; SSE-NEXT:    [[X3:%.*]] = sext i8 [[I3]] to i32
+; SSE-NEXT:    [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[X0]], i32 0
+; SSE-NEXT:    [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[X1]], i32 1
+; SSE-NEXT:    [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[X2]], i32 2
+; SSE-NEXT:    [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[X3]], i32 3
+; SSE-NEXT:    ret <4 x i32> [[V3]]
 ;
 ; AVX-LABEL: @loadext_4i8_to_4i32(
 ; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
@@ -125,40 +97,23 @@ define <4 x i32> @loadext_4i8_to_4i32(i8* %p0) {
 }
 
 define <4 x i64> @loadext_4i8_to_4i64(i8* %p0) {
-; SSE2-LABEL: @loadext_4i8_to_4i64(
-; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
-; SSE2-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
-; SSE2-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
-; SSE2-NEXT:    [[I0:%.*]] = load i8, i8* [[P0]], align 1
-; SSE2-NEXT:    [[I1:%.*]] = load i8, i8* [[P1]], align 1
-; SSE2-NEXT:    [[I2:%.*]] = load i8, i8* [[P2]], align 1
-; SSE2-NEXT:    [[I3:%.*]] = load i8, i8* [[P3]], align 1
-; SSE2-NEXT:    [[X0:%.*]] = sext i8 [[I0]] to i64
-; SSE2-NEXT:    [[X1:%.*]] = sext i8 [[I1]] to i64
-; SSE2-NEXT:    [[X2:%.*]] = sext i8 [[I2]] to i64
-; SSE2-NEXT:    [[X3:%.*]] = sext i8 [[I3]] to i64
-; SSE2-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0
-; SSE2-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1
-; SSE2-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
-; SSE2-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
-; SSE2-NEXT:    ret <4 x i64> [[V3]]
-;
-; SLM-LABEL: @loadext_4i8_to_4i64(
-; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
-; SLM-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
-; SLM-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
-; SLM-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
-; SLM-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
-; SLM-NEXT:    [[TMP3:%.*]] = sext <4 x i8> [[TMP2]] to <4 x i64>
-; SLM-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
-; SLM-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
-; SLM-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
-; SLM-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
-; SLM-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
-; SLM-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
-; SLM-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
-; SLM-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
-; SLM-NEXT:    ret <4 x i64> [[V3]]
+; SSE-LABEL: @loadext_4i8_to_4i64(
+; SSE-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; SSE-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; SSE-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; SSE-NEXT:    [[I0:%.*]] = load i8, i8* [[P0]], align 1
+; SSE-NEXT:    [[I1:%.*]] = load i8, i8* [[P1]], align 1
+; SSE-NEXT:    [[I2:%.*]] = load i8, i8* [[P2]], align 1
+; SSE-NEXT:    [[I3:%.*]] = load i8, i8* [[P3]], align 1
+; SSE-NEXT:    [[X0:%.*]] = sext i8 [[I0]] to i64
+; SSE-NEXT:    [[X1:%.*]] = sext i8 [[I1]] to i64
+; SSE-NEXT:    [[X2:%.*]] = sext i8 [[I2]] to i64
+; SSE-NEXT:    [[X3:%.*]] = sext i8 [[I3]] to i64
+; SSE-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0
+; SSE-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1
+; SSE-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
+; SSE-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
+; SSE-NEXT:    ret <4 x i64> [[V3]]
 ;
 ; AVX1-LABEL: @loadext_4i8_to_4i64(
 ; AVX1-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
@@ -232,34 +187,97 @@ define <4 x i64> @loadext_4i8_to_4i64(i8* %p0) {
 }
 
 define <8 x i16> @loadext_8i8_to_8i16(i8* %p0) {
-; CHECK-LABEL: @loadext_8i8_to_8i16(
-; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
-; CHECK-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
-; CHECK-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
-; CHECK-NEXT:    [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
-; CHECK-NEXT:    [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
-; CHECK-NEXT:    [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
-; CHECK-NEXT:    [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>*
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
-; CHECK-NEXT:    [[TMP3:%.*]] = sext <8 x i8> [[TMP2]] to <8 x i16>
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0
-; CHECK-NEXT:    [[V0:%.*]] = insertelement <8 x i16> undef, i16 [[TMP4]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x i16> [[TMP3]], i32 1
-; CHECK-NEXT:    [[V1:%.*]] = insertelement <8 x i16> [[V0]], i16 [[TMP5]], i32 1
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <8 x i16> [[TMP3]], i32 2
-; CHECK-NEXT:    [[V2:%.*]] = insertelement <8 x i16> [[V1]], i16 [[TMP6]], i32 2
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3
-; CHECK-NEXT:    [[V3:%.*]] = insertelement <8 x i16> [[V2]], i16 [[TMP7]], i32 3
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <8 x i16> [[TMP3]], i32 4
-; CHECK-NEXT:    [[V4:%.*]] = insertelement <8 x i16> [[V3]], i16 [[TMP8]], i32 4
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <8 x i16> [[TMP3]], i32 5
-; CHECK-NEXT:    [[V5:%.*]] = insertelement <8 x i16> [[V4]], i16 [[TMP9]], i32 5
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <8 x i16> [[TMP3]], i32 6
-; CHECK-NEXT:    [[V6:%.*]] = insertelement <8 x i16> [[V5]], i16 [[TMP10]], i32 6
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <8 x i16> [[TMP3]], i32 7
-; CHECK-NEXT:    [[V7:%.*]] = insertelement <8 x i16> [[V6]], i16 [[TMP11]], i32 7
-; CHECK-NEXT:    ret <8 x i16> [[V7]]
+; SSE2-LABEL: @loadext_8i8_to_8i16(
+; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; SSE2-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; SSE2-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; SSE2-NEXT:    [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
+; SSE2-NEXT:    [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
+; SSE2-NEXT:    [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
+; SSE2-NEXT:    [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
+; SSE2-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>*
+; SSE2-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
+; SSE2-NEXT:    [[TMP3:%.*]] = sext <8 x i8> [[TMP2]] to <8 x i16>
+; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0
+; SSE2-NEXT:    [[V0:%.*]] = insertelement <8 x i16> undef, i16 [[TMP4]], i32 0
+; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <8 x i16> [[TMP3]], i32 1
+; SSE2-NEXT:    [[V1:%.*]] = insertelement <8 x i16> [[V0]], i16 [[TMP5]], i32 1
+; SSE2-NEXT:    [[TMP6:%.*]] = extractelement <8 x i16> [[TMP3]], i32 2
+; SSE2-NEXT:    [[V2:%.*]] = insertelement <8 x i16> [[V1]], i16 [[TMP6]], i32 2
+; SSE2-NEXT:    [[TMP7:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3
+; SSE2-NEXT:    [[V3:%.*]] = insertelement <8 x i16> [[V2]], i16 [[TMP7]], i32 3
+; SSE2-NEXT:    [[TMP8:%.*]] = extractelement <8 x i16> [[TMP3]], i32 4
+; SSE2-NEXT:    [[V4:%.*]] = insertelement <8 x i16> [[V3]], i16 [[TMP8]], i32 4
+; SSE2-NEXT:    [[TMP9:%.*]] = extractelement <8 x i16> [[TMP3]], i32 5
+; SSE2-NEXT:    [[V5:%.*]] = insertelement <8 x i16> [[V4]], i16 [[TMP9]], i32 5
+; SSE2-NEXT:    [[TMP10:%.*]] = extractelement <8 x i16> [[TMP3]], i32 6
+; SSE2-NEXT:    [[V6:%.*]] = insertelement <8 x i16> [[V5]], i16 [[TMP10]], i32 6
+; SSE2-NEXT:    [[TMP11:%.*]] = extractelement <8 x i16> [[TMP3]], i32 7
+; SSE2-NEXT:    [[V7:%.*]] = insertelement <8 x i16> [[V6]], i16 [[TMP11]], i32 7
+; SSE2-NEXT:    ret <8 x i16> [[V7]]
+;
+; SLM-LABEL: @loadext_8i8_to_8i16(
+; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; SLM-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; SLM-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; SLM-NEXT:    [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
+; SLM-NEXT:    [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
+; SLM-NEXT:    [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
+; SLM-NEXT:    [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
+; SLM-NEXT:    [[I0:%.*]] = load i8, i8* [[P0]], align 1
+; SLM-NEXT:    [[I1:%.*]] = load i8, i8* [[P1]], align 1
+; SLM-NEXT:    [[I2:%.*]] = load i8, i8* [[P2]], align 1
+; SLM-NEXT:    [[I3:%.*]] = load i8, i8* [[P3]], align 1
+; SLM-NEXT:    [[I4:%.*]] = load i8, i8* [[P4]], align 1
+; SLM-NEXT:    [[I5:%.*]] = load i8, i8* [[P5]], align 1
+; SLM-NEXT:    [[I6:%.*]] = load i8, i8* [[P6]], align 1
+; SLM-NEXT:    [[I7:%.*]] = load i8, i8* [[P7]], align 1
+; SLM-NEXT:    [[X0:%.*]] = sext i8 [[I0]] to i16
+; SLM-NEXT:    [[X1:%.*]] = sext i8 [[I1]] to i16
+; SLM-NEXT:    [[X2:%.*]] = sext i8 [[I2]] to i16
+; SLM-NEXT:    [[X3:%.*]] = sext i8 [[I3]] to i16
+; SLM-NEXT:    [[X4:%.*]] = sext i8 [[I4]] to i16
+; SLM-NEXT:    [[X5:%.*]] = sext i8 [[I5]] to i16
+; SLM-NEXT:    [[X6:%.*]] = sext i8 [[I6]] to i16
+; SLM-NEXT:    [[X7:%.*]] = sext i8 [[I7]] to i16
+; SLM-NEXT:    [[V0:%.*]] = insertelement <8 x i16> undef, i16 [[X0]], i32 0
+; SLM-NEXT:    [[V1:%.*]] = insertelement <8 x i16> [[V0]], i16 [[X1]], i32 1
+; SLM-NEXT:    [[V2:%.*]] = insertelement <8 x i16> [[V1]], i16 [[X2]], i32 2
+; SLM-NEXT:    [[V3:%.*]] = insertelement <8 x i16> [[V2]], i16 [[X3]], i32 3
+; SLM-NEXT:    [[V4:%.*]] = insertelement <8 x i16> [[V3]], i16 [[X4]], i32 4
+; SLM-NEXT:    [[V5:%.*]] = insertelement <8 x i16> [[V4]], i16 [[X5]], i32 5
+; SLM-NEXT:    [[V6:%.*]] = insertelement <8 x i16> [[V5]], i16 [[X6]], i32 6
+; SLM-NEXT:    [[V7:%.*]] = insertelement <8 x i16> [[V6]], i16 [[X7]], i32 7
+; SLM-NEXT:    ret <8 x i16> [[V7]]
+;
+; AVX-LABEL: @loadext_8i8_to_8i16(
+; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; AVX-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; AVX-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; AVX-NEXT:    [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
+; AVX-NEXT:    [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
+; AVX-NEXT:    [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
+; AVX-NEXT:    [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
+; AVX-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>*
+; AVX-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
+; AVX-NEXT:    [[TMP3:%.*]] = sext <8 x i8> [[TMP2]] to <8 x i16>
+; AVX-NEXT:    [[TMP4:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0
+; AVX-NEXT:    [[V0:%.*]] = insertelement <8 x i16> undef, i16 [[TMP4]], i32 0
+; AVX-NEXT:    [[TMP5:%.*]] = extractelement <8 x i16> [[TMP3]], i32 1
+; AVX-NEXT:    [[V1:%.*]] = insertelement <8 x i16> [[V0]], i16 [[TMP5]], i32 1
+; AVX-NEXT:    [[TMP6:%.*]] = extractelement <8 x i16> [[TMP3]], i32 2
+; AVX-NEXT:    [[V2:%.*]] = insertelement <8 x i16> [[V1]], i16 [[TMP6]], i32 2
+; AVX-NEXT:    [[TMP7:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3
+; AVX-NEXT:    [[V3:%.*]] = insertelement <8 x i16> [[V2]], i16 [[TMP7]], i32 3
+; AVX-NEXT:    [[TMP8:%.*]] = extractelement <8 x i16> [[TMP3]], i32 4
+; AVX-NEXT:    [[V4:%.*]] = insertelement <8 x i16> [[V3]], i16 [[TMP8]], i32 4
+; AVX-NEXT:    [[TMP9:%.*]] = extractelement <8 x i16> [[TMP3]], i32 5
+; AVX-NEXT:    [[V5:%.*]] = insertelement <8 x i16> [[V4]], i16 [[TMP9]], i32 5
+; AVX-NEXT:    [[TMP10:%.*]] = extractelement <8 x i16> [[TMP3]], i32 6
+; AVX-NEXT:    [[V6:%.*]] = insertelement <8 x i16> [[V5]], i16 [[TMP10]], i32 6
+; AVX-NEXT:    [[TMP11:%.*]] = extractelement <8 x i16> [[TMP3]], i32 7
+; AVX-NEXT:    [[V7:%.*]] = insertelement <8 x i16> [[V6]], i16 [[TMP11]], i32 7
+; AVX-NEXT:    ret <8 x i16> [[V7]]
 ;
   %p1 = getelementptr inbounds i8, i8* %p0, i64 1
   %p2 = getelementptr inbounds i8, i8* %p0, i64 2
@@ -296,34 +314,97 @@ define <8 x i16> @loadext_8i8_to_8i16(i8* %p0) {
 }
 
 define <8 x i32> @loadext_8i8_to_8i32(i8* %p0) {
-; CHECK-LABEL: @loadext_8i8_to_8i32(
-; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
-; CHECK-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
-; CHECK-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
-; CHECK-NEXT:    [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
-; CHECK-NEXT:    [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
-; CHECK-NEXT:    [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
-; CHECK-NEXT:    [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>*
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
-; CHECK-NEXT:    [[TMP3:%.*]] = sext <8 x i8> [[TMP2]] to <8 x i32>
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0
-; CHECK-NEXT:    [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1
-; CHECK-NEXT:    [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2
-; CHECK-NEXT:    [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3
-; CHECK-NEXT:    [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4
-; CHECK-NEXT:    [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5
-; CHECK-NEXT:    [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6
-; CHECK-NEXT:    [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7
-; CHECK-NEXT:    [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7
-; CHECK-NEXT:    ret <8 x i32> [[V7]]
+; SSE2-LABEL: @loadext_8i8_to_8i32(
+; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; SSE2-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; SSE2-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; SSE2-NEXT:    [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
+; SSE2-NEXT:    [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
+; SSE2-NEXT:    [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
+; SSE2-NEXT:    [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
+; SSE2-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>*
+; SSE2-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
+; SSE2-NEXT:    [[TMP3:%.*]] = sext <8 x i8> [[TMP2]] to <8 x i32>
+; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0
+; SSE2-NEXT:    [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0
+; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1
+; SSE2-NEXT:    [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1
+; SSE2-NEXT:    [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2
+; SSE2-NEXT:    [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2
+; SSE2-NEXT:    [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3
+; SSE2-NEXT:    [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3
+; SSE2-NEXT:    [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4
+; SSE2-NEXT:    [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4
+; SSE2-NEXT:    [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5
+; SSE2-NEXT:    [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5
+; SSE2-NEXT:    [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6
+; SSE2-NEXT:    [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6
+; SSE2-NEXT:    [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7
+; SSE2-NEXT:    [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7
+; SSE2-NEXT:    ret <8 x i32> [[V7]]
+;
+; SLM-LABEL: @loadext_8i8_to_8i32(
+; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; SLM-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; SLM-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; SLM-NEXT:    [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
+; SLM-NEXT:    [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
+; SLM-NEXT:    [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
+; SLM-NEXT:    [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
+; SLM-NEXT:    [[I0:%.*]] = load i8, i8* [[P0]], align 1
+; SLM-NEXT:    [[I1:%.*]] = load i8, i8* [[P1]], align 1
+; SLM-NEXT:    [[I2:%.*]] = load i8, i8* [[P2]], align 1
+; SLM-NEXT:    [[I3:%.*]] = load i8, i8* [[P3]], align 1
+; SLM-NEXT:    [[I4:%.*]] = load i8, i8* [[P4]], align 1
+; SLM-NEXT:    [[I5:%.*]] = load i8, i8* [[P5]], align 1
+; SLM-NEXT:    [[I6:%.*]] = load i8, i8* [[P6]], align 1
+; SLM-NEXT:    [[I7:%.*]] = load i8, i8* [[P7]], align 1
+; SLM-NEXT:    [[X0:%.*]] = sext i8 [[I0]] to i32
+; SLM-NEXT:    [[X1:%.*]] = sext i8 [[I1]] to i32
+; SLM-NEXT:    [[X2:%.*]] = sext i8 [[I2]] to i32
+; SLM-NEXT:    [[X3:%.*]] = sext i8 [[I3]] to i32
+; SLM-NEXT:    [[X4:%.*]] = sext i8 [[I4]] to i32
+; SLM-NEXT:    [[X5:%.*]] = sext i8 [[I5]] to i32
+; SLM-NEXT:    [[X6:%.*]] = sext i8 [[I6]] to i32
+; SLM-NEXT:    [[X7:%.*]] = sext i8 [[I7]] to i32
+; SLM-NEXT:    [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[X0]], i32 0
+; SLM-NEXT:    [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[X1]], i32 1
+; SLM-NEXT:    [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[X2]], i32 2
+; SLM-NEXT:    [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[X3]], i32 3
+; SLM-NEXT:    [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[X4]], i32 4
+; SLM-NEXT:    [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[X5]], i32 5
+; SLM-NEXT:    [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[X6]], i32 6
+; SLM-NEXT:    [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[X7]], i32 7
+; SLM-NEXT:    ret <8 x i32> [[V7]]
+;
+; AVX-LABEL: @loadext_8i8_to_8i32(
+; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; AVX-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; AVX-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; AVX-NEXT:    [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
+; AVX-NEXT:    [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
+; AVX-NEXT:    [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
+; AVX-NEXT:    [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
+; AVX-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>*
+; AVX-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
+; AVX-NEXT:    [[TMP3:%.*]] = sext <8 x i8> [[TMP2]] to <8 x i32>
+; AVX-NEXT:    [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0
+; AVX-NEXT:    [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0
+; AVX-NEXT:    [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1
+; AVX-NEXT:    [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1
+; AVX-NEXT:    [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2
+; AVX-NEXT:    [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2
+; AVX-NEXT:    [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3
+; AVX-NEXT:    [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3
+; AVX-NEXT:    [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4
+; AVX-NEXT:    [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4
+; AVX-NEXT:    [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5
+; AVX-NEXT:    [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5
+; AVX-NEXT:    [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6
+; AVX-NEXT:    [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6
+; AVX-NEXT:    [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7
+; AVX-NEXT:    [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7
+; AVX-NEXT:    ret <8 x i32> [[V7]]
 ;
   %p1 = getelementptr inbounds i8, i8* %p0, i64 1
   %p2 = getelementptr inbounds i8, i8* %p0, i64 2
@@ -360,58 +441,177 @@ define <8 x i32> @loadext_8i8_to_8i32(i8* %p0) {
 }
 
 define <16 x i16> @loadext_16i8_to_16i16(i8* %p0) {
-; CHECK-LABEL: @loadext_16i8_to_16i16(
-; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
-; CHECK-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
-; CHECK-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
-; CHECK-NEXT:    [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
-; CHECK-NEXT:    [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
-; CHECK-NEXT:    [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
-; CHECK-NEXT:    [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
-; CHECK-NEXT:    [[P8:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 8
-; CHECK-NEXT:    [[P9:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 9
-; CHECK-NEXT:    [[P10:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 10
-; CHECK-NEXT:    [[P11:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 11
-; CHECK-NEXT:    [[P12:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 12
-; CHECK-NEXT:    [[P13:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 13
-; CHECK-NEXT:    [[P14:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 14
-; CHECK-NEXT:    [[P15:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 15
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <16 x i8>*
-; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[TMP1]], align 1
-; CHECK-NEXT:    [[TMP3:%.*]] = sext <16 x i8> [[TMP2]] to <16 x i16>
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <16 x i16> [[TMP3]], i32 0
-; CHECK-NEXT:    [[V0:%.*]] = insertelement <16 x i16> undef, i16 [[TMP4]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <16 x i16> [[TMP3]], i32 1
-; CHECK-NEXT:    [[V1:%.*]] = insertelement <16 x i16> [[V0]], i16 [[TMP5]], i32 1
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <16 x i16> [[TMP3]], i32 2
-; CHECK-NEXT:    [[V2:%.*]] = insertelement <16 x i16> [[V1]], i16 [[TMP6]], i32 2
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <16 x i16> [[TMP3]], i32 3
-; CHECK-NEXT:    [[V3:%.*]] = insertelement <16 x i16> [[V2]], i16 [[TMP7]], i32 3
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <16 x i16> [[TMP3]], i32 4
-; CHECK-NEXT:    [[V4:%.*]] = insertelement <16 x i16> [[V3]], i16 [[TMP8]], i32 4
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <16 x i16> [[TMP3]], i32 5
-; CHECK-NEXT:    [[V5:%.*]] = insertelement <16 x i16> [[V4]], i16 [[TMP9]], i32 5
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <16 x i16> [[TMP3]], i32 6
-; CHECK-NEXT:    [[V6:%.*]] = insertelement <16 x i16> [[V5]], i16 [[TMP10]], i32 6
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <16 x i16> [[TMP3]], i32 7
-; CHECK-NEXT:    [[V7:%.*]] = insertelement <16 x i16> [[V6]], i16 [[TMP11]], i32 7
-; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <16 x i16> [[TMP3]], i32 8
-; CHECK-NEXT:    [[V8:%.*]] = insertelement <16 x i16> [[V7]], i16 [[TMP12]], i32 8
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <16 x i16> [[TMP3]], i32 9
-; CHECK-NEXT:    [[V9:%.*]] = insertelement <16 x i16> [[V8]], i16 [[TMP13]], i32 9
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <16 x i16> [[TMP3]], i32 10
-; CHECK-NEXT:    [[V10:%.*]] = insertelement <16 x i16> [[V9]], i16 [[TMP14]], i32 10
-; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <16 x i16> [[TMP3]], i32 11
-; CHECK-NEXT:    [[V11:%.*]] = insertelement <16 x i16> [[V10]], i16 [[TMP15]], i32 11
-; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <16 x i16> [[TMP3]], i32 12
-; CHECK-NEXT:    [[V12:%.*]] = insertelement <16 x i16> [[V11]], i16 [[TMP16]], i32 12
-; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <16 x i16> [[TMP3]], i32 13
-; CHECK-NEXT:    [[V13:%.*]] = insertelement <16 x i16> [[V12]], i16 [[TMP17]], i32 13
-; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <16 x i16> [[TMP3]], i32 14
-; CHECK-NEXT:    [[V14:%.*]] = insertelement <16 x i16> [[V13]], i16 [[TMP18]], i32 14
-; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <16 x i16> [[TMP3]], i32 15
-; CHECK-NEXT:    [[V15:%.*]] = insertelement <16 x i16> [[V14]], i16 [[TMP19]], i32 15
-; CHECK-NEXT:    ret <16 x i16> [[V15]]
+; SSE2-LABEL: @loadext_16i8_to_16i16(
+; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; SSE2-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; SSE2-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; SSE2-NEXT:    [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
+; SSE2-NEXT:    [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
+; SSE2-NEXT:    [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
+; SSE2-NEXT:    [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
+; SSE2-NEXT:    [[P8:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 8
+; SSE2-NEXT:    [[P9:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 9
+; SSE2-NEXT:    [[P10:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 10
+; SSE2-NEXT:    [[P11:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 11
+; SSE2-NEXT:    [[P12:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 12
+; SSE2-NEXT:    [[P13:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 13
+; SSE2-NEXT:    [[P14:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 14
+; SSE2-NEXT:    [[P15:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 15
+; SSE2-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <16 x i8>*
+; SSE2-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[TMP1]], align 1
+; SSE2-NEXT:    [[TMP3:%.*]] = sext <16 x i8> [[TMP2]] to <16 x i16>
+; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <16 x i16> [[TMP3]], i32 0
+; SSE2-NEXT:    [[V0:%.*]] = insertelement <16 x i16> undef, i16 [[TMP4]], i32 0
+; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <16 x i16> [[TMP3]], i32 1
+; SSE2-NEXT:    [[V1:%.*]] = insertelement <16 x i16> [[V0]], i16 [[TMP5]], i32 1
+; SSE2-NEXT:    [[TMP6:%.*]] = extractelement <16 x i16> [[TMP3]], i32 2
+; SSE2-NEXT:    [[V2:%.*]] = insertelement <16 x i16> [[V1]], i16 [[TMP6]], i32 2
+; SSE2-NEXT:    [[TMP7:%.*]] = extractelement <16 x i16> [[TMP3]], i32 3
+; SSE2-NEXT:    [[V3:%.*]] = insertelement <16 x i16> [[V2]], i16 [[TMP7]], i32 3
+; SSE2-NEXT:    [[TMP8:%.*]] = extractelement <16 x i16> [[TMP3]], i32 4
+; SSE2-NEXT:    [[V4:%.*]] = insertelement <16 x i16> [[V3]], i16 [[TMP8]], i32 4
+; SSE2-NEXT:    [[TMP9:%.*]] = extractelement <16 x i16> [[TMP3]], i32 5
+; SSE2-NEXT:    [[V5:%.*]] = insertelement <16 x i16> [[V4]], i16 [[TMP9]], i32 5
+; SSE2-NEXT:    [[TMP10:%.*]] = extractelement <16 x i16> [[TMP3]], i32 6
+; SSE2-NEXT:    [[V6:%.*]] = insertelement <16 x i16> [[V5]], i16 [[TMP10]], i32 6
+; SSE2-NEXT:    [[TMP11:%.*]] = extractelement <16 x i16> [[TMP3]], i32 7
+; SSE2-NEXT:    [[V7:%.*]] = insertelement <16 x i16> [[V6]], i16 [[TMP11]], i32 7
+; SSE2-NEXT:    [[TMP12:%.*]] = extractelement <16 x i16> [[TMP3]], i32 8
+; SSE2-NEXT:    [[V8:%.*]] = insertelement <16 x i16> [[V7]], i16 [[TMP12]], i32 8
+; SSE2-NEXT:    [[TMP13:%.*]] = extractelement <16 x i16> [[TMP3]], i32 9
+; SSE2-NEXT:    [[V9:%.*]] = insertelement <16 x i16> [[V8]], i16 [[TMP13]], i32 9
+; SSE2-NEXT:    [[TMP14:%.*]] = extractelement <16 x i16> [[TMP3]], i32 10
+; SSE2-NEXT:    [[V10:%.*]] = insertelement <16 x i16> [[V9]], i16 [[TMP14]], i32 10
+; SSE2-NEXT:    [[TMP15:%.*]] = extractelement <16 x i16> [[TMP3]], i32 11
+; SSE2-NEXT:    [[V11:%.*]] = insertelement <16 x i16> [[V10]], i16 [[TMP15]], i32 11
+; SSE2-NEXT:    [[TMP16:%.*]] = extractelement <16 x i16> [[TMP3]], i32 12
+; SSE2-NEXT:    [[V12:%.*]] = insertelement <16 x i16> [[V11]], i16 [[TMP16]], i32 12
+; SSE2-NEXT:    [[TMP17:%.*]] = extractelement <16 x i16> [[TMP3]], i32 13
+; SSE2-NEXT:    [[V13:%.*]] = insertelement <16 x i16> [[V12]], i16 [[TMP17]], i32 13
+; SSE2-NEXT:    [[TMP18:%.*]] = extractelement <16 x i16> [[TMP3]], i32 14
+; SSE2-NEXT:    [[V14:%.*]] = insertelement <16 x i16> [[V13]], i16 [[TMP18]], i32 14
+; SSE2-NEXT:    [[TMP19:%.*]] = extractelement <16 x i16> [[TMP3]], i32 15
+; SSE2-NEXT:    [[V15:%.*]] = insertelement <16 x i16> [[V14]], i16 [[TMP19]], i32 15
+; SSE2-NEXT:    ret <16 x i16> [[V15]]
+;
+; SLM-LABEL: @loadext_16i8_to_16i16(
+; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; SLM-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; SLM-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; SLM-NEXT:    [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
+; SLM-NEXT:    [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
+; SLM-NEXT:    [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
+; SLM-NEXT:    [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
+; SLM-NEXT:    [[P8:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 8
+; SLM-NEXT:    [[P9:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 9
+; SLM-NEXT:    [[P10:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 10
+; SLM-NEXT:    [[P11:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 11
+; SLM-NEXT:    [[P12:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 12
+; SLM-NEXT:    [[P13:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 13
+; SLM-NEXT:    [[P14:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 14
+; SLM-NEXT:    [[P15:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 15
+; SLM-NEXT:    [[I0:%.*]] = load i8, i8* [[P0]], align 1
+; SLM-NEXT:    [[I1:%.*]] = load i8, i8* [[P1]], align 1
+; SLM-NEXT:    [[I2:%.*]] = load i8, i8* [[P2]], align 1
+; SLM-NEXT:    [[I3:%.*]] = load i8, i8* [[P3]], align 1
+; SLM-NEXT:    [[I4:%.*]] = load i8, i8* [[P4]], align 1
+; SLM-NEXT:    [[I5:%.*]] = load i8, i8* [[P5]], align 1
+; SLM-NEXT:    [[I6:%.*]] = load i8, i8* [[P6]], align 1
+; SLM-NEXT:    [[I7:%.*]] = load i8, i8* [[P7]], align 1
+; SLM-NEXT:    [[I8:%.*]] = load i8, i8* [[P8]], align 1
+; SLM-NEXT:    [[I9:%.*]] = load i8, i8* [[P9]], align 1
+; SLM-NEXT:    [[I10:%.*]] = load i8, i8* [[P10]], align 1
+; SLM-NEXT:    [[I11:%.*]] = load i8, i8* [[P11]], align 1
+; SLM-NEXT:    [[I12:%.*]] = load i8, i8* [[P12]], align 1
+; SLM-NEXT:    [[I13:%.*]] = load i8, i8* [[P13]], align 1
+; SLM-NEXT:    [[I14:%.*]] = load i8, i8* [[P14]], align 1
+; SLM-NEXT:    [[I15:%.*]] = load i8, i8* [[P15]], align 1
+; SLM-NEXT:    [[X0:%.*]] = sext i8 [[I0]] to i16
+; SLM-NEXT:    [[X1:%.*]] = sext i8 [[I1]] to i16
+; SLM-NEXT:    [[X2:%.*]] = sext i8 [[I2]] to i16
+; SLM-NEXT:    [[X3:%.*]] = sext i8 [[I3]] to i16
+; SLM-NEXT:    [[X4:%.*]] = sext i8 [[I4]] to i16
+; SLM-NEXT:    [[X5:%.*]] = sext i8 [[I5]] to i16
+; SLM-NEXT:    [[X6:%.*]] = sext i8 [[I6]] to i16
+; SLM-NEXT:    [[X7:%.*]] = sext i8 [[I7]] to i16
+; SLM-NEXT:    [[X8:%.*]] = sext i8 [[I8]] to i16
+; SLM-NEXT:    [[X9:%.*]] = sext i8 [[I9]] to i16
+; SLM-NEXT:    [[X10:%.*]] = sext i8 [[I10]] to i16
+; SLM-NEXT:    [[X11:%.*]] = sext i8 [[I11]] to i16
+; SLM-NEXT:    [[X12:%.*]] = sext i8 [[I12]] to i16
+; SLM-NEXT:    [[X13:%.*]] = sext i8 [[I13]] to i16
+; SLM-NEXT:    [[X14:%.*]] = sext i8 [[I14]] to i16
+; SLM-NEXT:    [[X15:%.*]] = sext i8 [[I15]] to i16
+; SLM-NEXT:    [[V0:%.*]] = insertelement <16 x i16> undef, i16 [[X0]], i32 0
+; SLM-NEXT:    [[V1:%.*]] = insertelement <16 x i16> [[V0]], i16 [[X1]], i32 1
+; SLM-NEXT:    [[V2:%.*]] = insertelement <16 x i16> [[V1]], i16 [[X2]], i32 2
+; SLM-NEXT:    [[V3:%.*]] = insertelement <16 x i16> [[V2]], i16 [[X3]], i32 3
+; SLM-NEXT:    [[V4:%.*]] = insertelement <16 x i16> [[V3]], i16 [[X4]], i32 4
+; SLM-NEXT:    [[V5:%.*]] = insertelement <16 x i16> [[V4]], i16 [[X5]], i32 5
+; SLM-NEXT:    [[V6:%.*]] = insertelement <16 x i16> [[V5]], i16 [[X6]], i32 6
+; SLM-NEXT:    [[V7:%.*]] = insertelement <16 x i16> [[V6]], i16 [[X7]], i32 7
+; SLM-NEXT:    [[V8:%.*]] = insertelement <16 x i16> [[V7]], i16 [[X8]], i32 8
+; SLM-NEXT:    [[V9:%.*]] = insertelement <16 x i16> [[V8]], i16 [[X9]], i32 9
+; SLM-NEXT:    [[V10:%.*]] = insertelement <16 x i16> [[V9]], i16 [[X10]], i32 10
+; SLM-NEXT:    [[V11:%.*]] = insertelement <16 x i16> [[V10]], i16 [[X11]], i32 11
+; SLM-NEXT:    [[V12:%.*]] = insertelement <16 x i16> [[V11]], i16 [[X12]], i32 12
+; SLM-NEXT:    [[V13:%.*]] = insertelement <16 x i16> [[V12]], i16 [[X13]], i32 13
+; SLM-NEXT:    [[V14:%.*]] = insertelement <16 x i16> [[V13]], i16 [[X14]], i32 14
+; SLM-NEXT:    [[V15:%.*]] = insertelement <16 x i16> [[V14]], i16 [[X15]], i32 15
+; SLM-NEXT:    ret <16 x i16> [[V15]]
+;
+; AVX-LABEL: @loadext_16i8_to_16i16(
+; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; AVX-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; AVX-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; AVX-NEXT:    [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
+; AVX-NEXT:    [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
+; AVX-NEXT:    [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
+; AVX-NEXT:    [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
+; AVX-NEXT:    [[P8:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 8
+; AVX-NEXT:    [[P9:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 9
+; AVX-NEXT:    [[P10:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 10
+; AVX-NEXT:    [[P11:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 11
+; AVX-NEXT:    [[P12:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 12
+; AVX-NEXT:    [[P13:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 13
+; AVX-NEXT:    [[P14:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 14
+; AVX-NEXT:    [[P15:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 15
+; AVX-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <16 x i8>*
+; AVX-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[TMP1]], align 1
+; AVX-NEXT:    [[TMP3:%.*]] = sext <16 x i8> [[TMP2]] to <16 x i16>
+; AVX-NEXT:    [[TMP4:%.*]] = extractelement <16 x i16> [[TMP3]], i32 0
+; AVX-NEXT:    [[V0:%.*]] = insertelement <16 x i16> undef, i16 [[TMP4]], i32 0
+; AVX-NEXT:    [[TMP5:%.*]] = extractelement <16 x i16> [[TMP3]], i32 1
+; AVX-NEXT:    [[V1:%.*]] = insertelement <16 x i16> [[V0]], i16 [[TMP5]], i32 1
+; AVX-NEXT:    [[TMP6:%.*]] = extractelement <16 x i16> [[TMP3]], i32 2
+; AVX-NEXT:    [[V2:%.*]] = insertelement <16 x i16> [[V1]], i16 [[TMP6]], i32 2
+; AVX-NEXT:    [[TMP7:%.*]] = extractelement <16 x i16> [[TMP3]], i32 3
+; AVX-NEXT:    [[V3:%.*]] = insertelement <16 x i16> [[V2]], i16 [[TMP7]], i32 3
+; AVX-NEXT:    [[TMP8:%.*]] = extractelement <16 x i16> [[TMP3]], i32 4
+; AVX-NEXT:    [[V4:%.*]] = insertelement <16 x i16> [[V3]], i16 [[TMP8]], i32 4
+; AVX-NEXT:    [[TMP9:%.*]] = extractelement <16 x i16> [[TMP3]], i32 5
+; AVX-NEXT:    [[V5:%.*]] = insertelement <16 x i16> [[V4]], i16 [[TMP9]], i32 5
+; AVX-NEXT:    [[TMP10:%.*]] = extractelement <16 x i16> [[TMP3]], i32 6
+; AVX-NEXT:    [[V6:%.*]] = insertelement <16 x i16> [[V5]], i16 [[TMP10]], i32 6
+; AVX-NEXT:    [[TMP11:%.*]] = extractelement <16 x i16> [[TMP3]], i32 7
+; AVX-NEXT:    [[V7:%.*]] = insertelement <16 x i16> [[V6]], i16 [[TMP11]], i32 7
+; AVX-NEXT:    [[TMP12:%.*]] = extractelement <16 x i16> [[TMP3]], i32 8
+; AVX-NEXT:    [[V8:%.*]] = insertelement <16 x i16> [[V7]], i16 [[TMP12]], i32 8
+; AVX-NEXT:    [[TMP13:%.*]] = extractelement <16 x i16> [[TMP3]], i32 9
+; AVX-NEXT:    [[V9:%.*]] = insertelement <16 x i16> [[V8]], i16 [[TMP13]], i32 9
+; AVX-NEXT:    [[TMP14:%.*]] = extractelement <16 x i16> [[TMP3]], i32 10
+; AVX-NEXT:    [[V10:%.*]] = insertelement <16 x i16> [[V9]], i16 [[TMP14]], i32 10
+; AVX-NEXT:    [[TMP15:%.*]] = extractelement <16 x i16> [[TMP3]], i32 11
+; AVX-NEXT:    [[V11:%.*]] = insertelement <16 x i16> [[V10]], i16 [[TMP15]], i32 11
+; AVX-NEXT:    [[TMP16:%.*]] = extractelement <16 x i16> [[TMP3]], i32 12
+; AVX-NEXT:    [[V12:%.*]] = insertelement <16 x i16> [[V11]], i16 [[TMP16]], i32 12
+; AVX-NEXT:    [[TMP17:%.*]] = extractelement <16 x i16> [[TMP3]], i32 13
+; AVX-NEXT:    [[V13:%.*]] = insertelement <16 x i16> [[V12]], i16 [[TMP17]], i32 13
+; AVX-NEXT:    [[TMP18:%.*]] = extractelement <16 x i16> [[TMP3]], i32 14
+; AVX-NEXT:    [[V14:%.*]] = insertelement <16 x i16> [[V13]], i16 [[TMP18]], i32 14
+; AVX-NEXT:    [[TMP19:%.*]] = extractelement <16 x i16> [[TMP3]], i32 15
+; AVX-NEXT:    [[V15:%.*]] = insertelement <16 x i16> [[V14]], i16 [[TMP19]], i32 15
+; AVX-NEXT:    ret <16 x i16> [[V15]]
 ;
   %p1  = getelementptr inbounds i8, i8* %p0, i64 1
   %p2  = getelementptr inbounds i8, i8* %p0, i64 2
@@ -484,26 +684,15 @@ define <16 x i16> @loadext_16i8_to_16i16(i8* %p0) {
 ;
 
 define <2 x i64> @loadext_2i16_to_2i64(i16* %p0) {
-; SSE2-LABEL: @loadext_2i16_to_2i64(
-; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
-; SSE2-NEXT:    [[I0:%.*]] = load i16, i16* [[P0]], align 1
-; SSE2-NEXT:    [[I1:%.*]] = load i16, i16* [[P1]], align 1
-; SSE2-NEXT:    [[X0:%.*]] = sext i16 [[I0]] to i64
-; SSE2-NEXT:    [[X1:%.*]] = sext i16 [[I1]] to i64
-; SSE2-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0
-; SSE2-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1
-; SSE2-NEXT:    ret <2 x i64> [[V1]]
-;
-; SLM-LABEL: @loadext_2i16_to_2i64(
-; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
-; SLM-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <2 x i16>*
-; SLM-NEXT:    [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[TMP1]], align 1
-; SLM-NEXT:    [[TMP3:%.*]] = sext <2 x i16> [[TMP2]] to <2 x i64>
-; SLM-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
-; SLM-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
-; SLM-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
-; SLM-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
-; SLM-NEXT:    ret <2 x i64> [[V1]]
+; SSE-LABEL: @loadext_2i16_to_2i64(
+; SSE-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; SSE-NEXT:    [[I0:%.*]] = load i16, i16* [[P0]], align 1
+; SSE-NEXT:    [[I1:%.*]] = load i16, i16* [[P1]], align 1
+; SSE-NEXT:    [[X0:%.*]] = sext i16 [[I0]] to i64
+; SSE-NEXT:    [[X1:%.*]] = sext i16 [[I1]] to i64
+; SSE-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0
+; SSE-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1
+; SSE-NEXT:    ret <2 x i64> [[V1]]
 ;
 ; AVX-LABEL: @loadext_2i16_to_2i64(
 ; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
@@ -527,22 +716,57 @@ define <2 x i64> @loadext_2i16_to_2i64(i16* %p0) {
 }
 
 define <4 x i32> @loadext_4i16_to_4i32(i16* %p0) {
-; CHECK-LABEL: @loadext_4i16_to_4i32(
-; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
-; CHECK-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
-; CHECK-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>*
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1
-; CHECK-NEXT:    [[TMP3:%.*]] = sext <4 x i16> [[TMP2]] to <4 x i32>
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
-; CHECK-NEXT:    [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
-; CHECK-NEXT:    [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
-; CHECK-NEXT:    [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
-; CHECK-NEXT:    [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3
-; CHECK-NEXT:    ret <4 x i32> [[V3]]
+; SSE2-LABEL: @loadext_4i16_to_4i32(
+; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; SSE2-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
+; SSE2-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
+; SSE2-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>*
+; SSE2-NEXT:    [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1
+; SSE2-NEXT:    [[TMP3:%.*]] = sext <4 x i16> [[TMP2]] to <4 x i32>
+; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
+; SSE2-NEXT:    [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
+; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
+; SSE2-NEXT:    [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1
+; SSE2-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
+; SSE2-NEXT:    [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2
+; SSE2-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
+; SSE2-NEXT:    [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3
+; SSE2-NEXT:    ret <4 x i32> [[V3]]
+;
+; SLM-LABEL: @loadext_4i16_to_4i32(
+; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; SLM-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
+; SLM-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
+; SLM-NEXT:    [[I0:%.*]] = load i16, i16* [[P0]], align 1
+; SLM-NEXT:    [[I1:%.*]] = load i16, i16* [[P1]], align 1
+; SLM-NEXT:    [[I2:%.*]] = load i16, i16* [[P2]], align 1
+; SLM-NEXT:    [[I3:%.*]] = load i16, i16* [[P3]], align 1
+; SLM-NEXT:    [[X0:%.*]] = sext i16 [[I0]] to i32
+; SLM-NEXT:    [[X1:%.*]] = sext i16 [[I1]] to i32
+; SLM-NEXT:    [[X2:%.*]] = sext i16 [[I2]] to i32
+; SLM-NEXT:    [[X3:%.*]] = sext i16 [[I3]] to i32
+; SLM-NEXT:    [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[X0]], i32 0
+; SLM-NEXT:    [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[X1]], i32 1
+; SLM-NEXT:    [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[X2]], i32 2
+; SLM-NEXT:    [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[X3]], i32 3
+; SLM-NEXT:    ret <4 x i32> [[V3]]
+;
+; AVX-LABEL: @loadext_4i16_to_4i32(
+; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; AVX-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
+; AVX-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
+; AVX-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>*
+; AVX-NEXT:    [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1
+; AVX-NEXT:    [[TMP3:%.*]] = sext <4 x i16> [[TMP2]] to <4 x i32>
+; AVX-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
+; AVX-NEXT:    [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
+; AVX-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
+; AVX-NEXT:    [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1
+; AVX-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
+; AVX-NEXT:    [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2
+; AVX-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
+; AVX-NEXT:    [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3
+; AVX-NEXT:    ret <4 x i32> [[V3]]
 ;
   %p1 = getelementptr inbounds i16, i16* %p0, i64 1
   %p2 = getelementptr inbounds i16, i16* %p0, i64 2
@@ -563,40 +787,23 @@ define <4 x i32> @loadext_4i16_to_4i32(i16* %p0) {
 }
 
 define <4 x i64> @loadext_4i16_to_4i64(i16* %p0) {
-; SSE2-LABEL: @loadext_4i16_to_4i64(
-; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
-; SSE2-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
-; SSE2-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
-; SSE2-NEXT:    [[I0:%.*]] = load i16, i16* [[P0]], align 1
-; SSE2-NEXT:    [[I1:%.*]] = load i16, i16* [[P1]], align 1
-; SSE2-NEXT:    [[I2:%.*]] = load i16, i16* [[P2]], align 1
-; SSE2-NEXT:    [[I3:%.*]] = load i16, i16* [[P3]], align 1
-; SSE2-NEXT:    [[X0:%.*]] = sext i16 [[I0]] to i64
-; SSE2-NEXT:    [[X1:%.*]] = sext i16 [[I1]] to i64
-; SSE2-NEXT:    [[X2:%.*]] = sext i16 [[I2]] to i64
-; SSE2-NEXT:    [[X3:%.*]] = sext i16 [[I3]] to i64
-; SSE2-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0
-; SSE2-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1
-; SSE2-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
-; SSE2-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
-; SSE2-NEXT:    ret <4 x i64> [[V3]]
-;
-; SLM-LABEL: @loadext_4i16_to_4i64(
-; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
-; SLM-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
-; SLM-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
-; SLM-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>*
-; SLM-NEXT:    [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1
-; SLM-NEXT:    [[TMP3:%.*]] = sext <4 x i16> [[TMP2]] to <4 x i64>
-; SLM-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
-; SLM-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
-; SLM-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
-; SLM-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
-; SLM-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
-; SLM-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
-; SLM-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
-; SLM-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
-; SLM-NEXT:    ret <4 x i64> [[V3]]
+; SSE-LABEL: @loadext_4i16_to_4i64(
+; SSE-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; SSE-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
+; SSE-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
+; SSE-NEXT:    [[I0:%.*]] = load i16, i16* [[P0]], align 1
+; SSE-NEXT:    [[I1:%.*]] = load i16, i16* [[P1]], align 1
+; SSE-NEXT:    [[I2:%.*]] = load i16, i16* [[P2]], align 1
+; SSE-NEXT:    [[I3:%.*]] = load i16, i16* [[P3]], align 1
+; SSE-NEXT:    [[X0:%.*]] = sext i16 [[I0]] to i64
+; SSE-NEXT:    [[X1:%.*]] = sext i16 [[I1]] to i64
+; SSE-NEXT:    [[X2:%.*]] = sext i16 [[I2]] to i64
+; SSE-NEXT:    [[X3:%.*]] = sext i16 [[I3]] to i64
+; SSE-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0
+; SSE-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1
+; SSE-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
+; SSE-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
+; SSE-NEXT:    ret <4 x i64> [[V3]]
 ;
 ; AVX1-LABEL: @loadext_4i16_to_4i64(
 ; AVX1-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
@@ -670,34 +877,97 @@ define <4 x i64> @loadext_4i16_to_4i64(i16* %p0) {
 }
 
 define <8 x i32> @loadext_8i16_to_8i32(i16* %p0) {
-; CHECK-LABEL: @loadext_8i16_to_8i32(
-; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
-; CHECK-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
-; CHECK-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
-; CHECK-NEXT:    [[P4:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 4
-; CHECK-NEXT:    [[P5:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 5
-; CHECK-NEXT:    [[P6:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 6
-; CHECK-NEXT:    [[P7:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 7
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <8 x i16>*
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 1
-; CHECK-NEXT:    [[TMP3:%.*]] = sext <8 x i16> [[TMP2]] to <8 x i32>
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0
-; CHECK-NEXT:    [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1
-; CHECK-NEXT:    [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2
-; CHECK-NEXT:    [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3
-; CHECK-NEXT:    [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4
-; CHECK-NEXT:    [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5
-; CHECK-NEXT:    [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6
-; CHECK-NEXT:    [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7
-; CHECK-NEXT:    [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7
-; CHECK-NEXT:    ret <8 x i32> [[V7]]
+; SSE2-LABEL: @loadext_8i16_to_8i32(
+; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; SSE2-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
+; SSE2-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
+; SSE2-NEXT:    [[P4:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 4
+; SSE2-NEXT:    [[P5:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 5
+; SSE2-NEXT:    [[P6:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 6
+; SSE2-NEXT:    [[P7:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 7
+; SSE2-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <8 x i16>*
+; SSE2-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 1
+; SSE2-NEXT:    [[TMP3:%.*]] = sext <8 x i16> [[TMP2]] to <8 x i32>
+; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0
+; SSE2-NEXT:    [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0
+; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1
+; SSE2-NEXT:    [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1
+; SSE2-NEXT:    [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2
+; SSE2-NEXT:    [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2
+; SSE2-NEXT:    [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3
+; SSE2-NEXT:    [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3
+; SSE2-NEXT:    [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4
+; SSE2-NEXT:    [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4
+; SSE2-NEXT:    [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5
+; SSE2-NEXT:    [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5
+; SSE2-NEXT:    [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6
+; SSE2-NEXT:    [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6
+; SSE2-NEXT:    [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7
+; SSE2-NEXT:    [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7
+; SSE2-NEXT:    ret <8 x i32> [[V7]]
+;
+; SLM-LABEL: @loadext_8i16_to_8i32(
+; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; SLM-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
+; SLM-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
+; SLM-NEXT:    [[P4:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 4
+; SLM-NEXT:    [[P5:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 5
+; SLM-NEXT:    [[P6:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 6
+; SLM-NEXT:    [[P7:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 7
+; SLM-NEXT:    [[I0:%.*]] = load i16, i16* [[P0]], align 1
+; SLM-NEXT:    [[I1:%.*]] = load i16, i16* [[P1]], align 1
+; SLM-NEXT:    [[I2:%.*]] = load i16, i16* [[P2]], align 1
+; SLM-NEXT:    [[I3:%.*]] = load i16, i16* [[P3]], align 1
+; SLM-NEXT:    [[I4:%.*]] = load i16, i16* [[P4]], align 1
+; SLM-NEXT:    [[I5:%.*]] = load i16, i16* [[P5]], align 1
+; SLM-NEXT:    [[I6:%.*]] = load i16, i16* [[P6]], align 1
+; SLM-NEXT:    [[I7:%.*]] = load i16, i16* [[P7]], align 1
+; SLM-NEXT:    [[X0:%.*]] = sext i16 [[I0]] to i32
+; SLM-NEXT:    [[X1:%.*]] = sext i16 [[I1]] to i32
+; SLM-NEXT:    [[X2:%.*]] = sext i16 [[I2]] to i32
+; SLM-NEXT:    [[X3:%.*]] = sext i16 [[I3]] to i32
+; SLM-NEXT:    [[X4:%.*]] = sext i16 [[I4]] to i32
+; SLM-NEXT:    [[X5:%.*]] = sext i16 [[I5]] to i32
+; SLM-NEXT:    [[X6:%.*]] = sext i16 [[I6]] to i32
+; SLM-NEXT:    [[X7:%.*]] = sext i16 [[I7]] to i32
+; SLM-NEXT:    [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[X0]], i32 0
+; SLM-NEXT:    [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[X1]], i32 1
+; SLM-NEXT:    [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[X2]], i32 2
+; SLM-NEXT:    [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[X3]], i32 3
+; SLM-NEXT:    [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[X4]], i32 4
+; SLM-NEXT:    [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[X5]], i32 5
+; SLM-NEXT:    [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[X6]], i32 6
+; SLM-NEXT:    [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[X7]], i32 7
+; SLM-NEXT:    ret <8 x i32> [[V7]]
+;
+; AVX-LABEL: @loadext_8i16_to_8i32(
+; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; AVX-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
+; AVX-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
+; AVX-NEXT:    [[P4:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 4
+; AVX-NEXT:    [[P5:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 5
+; AVX-NEXT:    [[P6:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 6
+; AVX-NEXT:    [[P7:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 7
+; AVX-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <8 x i16>*
+; AVX-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 1
+; AVX-NEXT:    [[TMP3:%.*]] = sext <8 x i16> [[TMP2]] to <8 x i32>
+; AVX-NEXT:    [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0
+; AVX-NEXT:    [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0
+; AVX-NEXT:    [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1
+; AVX-NEXT:    [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1
+; AVX-NEXT:    [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2
+; AVX-NEXT:    [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2
+; AVX-NEXT:    [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3
+; AVX-NEXT:    [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3
+; AVX-NEXT:    [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4
+; AVX-NEXT:    [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4
+; AVX-NEXT:    [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5
+; AVX-NEXT:    [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5
+; AVX-NEXT:    [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6
+; AVX-NEXT:    [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6
+; AVX-NEXT:    [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7
+; AVX-NEXT:    [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7
+; AVX-NEXT:    ret <8 x i32> [[V7]]
 ;
   %p1 = getelementptr inbounds i16, i16* %p0, i64 1
   %p2 = getelementptr inbounds i16, i16* %p0, i64 2
@@ -738,26 +1008,15 @@ define <8 x i32> @loadext_8i16_to_8i32(i16* %p0) {
 ;
 
 define <2 x i64> @loadext_2i32_to_2i64(i32* %p0) {
-; SSE2-LABEL: @loadext_2i32_to_2i64(
-; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
-; SSE2-NEXT:    [[I0:%.*]] = load i32, i32* [[P0]], align 1
-; SSE2-NEXT:    [[I1:%.*]] = load i32, i32* [[P1]], align 1
-; SSE2-NEXT:    [[X0:%.*]] = sext i32 [[I0]] to i64
-; SSE2-NEXT:    [[X1:%.*]] = sext i32 [[I1]] to i64
-; SSE2-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0
-; SSE2-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1
-; SSE2-NEXT:    ret <2 x i64> [[V1]]
-;
-; SLM-LABEL: @loadext_2i32_to_2i64(
-; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
-; SLM-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P0]] to <2 x i32>*
-; SLM-NEXT:    [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 1
-; SLM-NEXT:    [[TMP3:%.*]] = sext <2 x i32> [[TMP2]] to <2 x i64>
-; SLM-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
-; SLM-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
-; SLM-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
-; SLM-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
-; SLM-NEXT:    ret <2 x i64> [[V1]]
+; SSE-LABEL: @loadext_2i32_to_2i64(
+; SSE-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
+; SSE-NEXT:    [[I0:%.*]] = load i32, i32* [[P0]], align 1
+; SSE-NEXT:    [[I1:%.*]] = load i32, i32* [[P1]], align 1
+; SSE-NEXT:    [[X0:%.*]] = sext i32 [[I0]] to i64
+; SSE-NEXT:    [[X1:%.*]] = sext i32 [[I1]] to i64
+; SSE-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0
+; SSE-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1
+; SSE-NEXT:    ret <2 x i64> [[V1]]
 ;
 ; AVX-LABEL: @loadext_2i32_to_2i64(
 ; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
@@ -781,40 +1040,23 @@ define <2 x i64> @loadext_2i32_to_2i64(i32* %p0) {
 }
 
 define <4 x i64> @loadext_4i32_to_4i64(i32* %p0) {
-; SSE2-LABEL: @loadext_4i32_to_4i64(
-; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
-; SSE2-NEXT:    [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2
-; SSE2-NEXT:    [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3
-; SSE2-NEXT:    [[I0:%.*]] = load i32, i32* [[P0]], align 1
-; SSE2-NEXT:    [[I1:%.*]] = load i32, i32* [[P1]], align 1
-; SSE2-NEXT:    [[I2:%.*]] = load i32, i32* [[P2]], align 1
-; SSE2-NEXT:    [[I3:%.*]] = load i32, i32* [[P3]], align 1
-; SSE2-NEXT:    [[X0:%.*]] = sext i32 [[I0]] to i64
-; SSE2-NEXT:    [[X1:%.*]] = sext i32 [[I1]] to i64
-; SSE2-NEXT:    [[X2:%.*]] = sext i32 [[I2]] to i64
-; SSE2-NEXT:    [[X3:%.*]] = sext i32 [[I3]] to i64
-; SSE2-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0
-; SSE2-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1
-; SSE2-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
-; SSE2-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
-; SSE2-NEXT:    ret <4 x i64> [[V3]]
-;
-; SLM-LABEL: @loadext_4i32_to_4i64(
-; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
-; SLM-NEXT:    [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2
-; SLM-NEXT:    [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3
-; SLM-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P0]] to <4 x i32>*
-; SLM-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1
-; SLM-NEXT:    [[TMP3:%.*]] = sext <4 x i32> [[TMP2]] to <4 x i64>
-; SLM-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
-; SLM-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
-; SLM-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
-; SLM-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
-; SLM-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
-; SLM-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
-; SLM-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
-; SLM-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
-; SLM-NEXT:    ret <4 x i64> [[V3]]
+; SSE-LABEL: @loadext_4i32_to_4i64(
+; SSE-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
+; SSE-NEXT:    [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2
+; SSE-NEXT:    [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3
+; SSE-NEXT:    [[I0:%.*]] = load i32, i32* [[P0]], align 1
+; SSE-NEXT:    [[I1:%.*]] = load i32, i32* [[P1]], align 1
+; SSE-NEXT:    [[I2:%.*]] = load i32, i32* [[P2]], align 1
+; SSE-NEXT:    [[I3:%.*]] = load i32, i32* [[P3]], align 1
+; SSE-NEXT:    [[X0:%.*]] = sext i32 [[I0]] to i64
+; SSE-NEXT:    [[X1:%.*]] = sext i32 [[I1]] to i64
+; SSE-NEXT:    [[X2:%.*]] = sext i32 [[I2]] to i64
+; SSE-NEXT:    [[X3:%.*]] = sext i32 [[I3]] to i64
+; SSE-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0
+; SSE-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1
+; SSE-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
+; SSE-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
+; SSE-NEXT:    ret <4 x i64> [[V3]]
 ;
 ; AVX1-LABEL: @loadext_4i32_to_4i64(
 ; AVX1-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/zext.ll b/llvm/test/Transforms/SLPVectorizer/X86/zext.ll
index d82aeb856768..ead4ffdeb0fc 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/zext.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/zext.ll
@@ -11,26 +11,15 @@
 ;
 
 define <2 x i64> @loadext_2i8_to_2i64(i8* %p0) {
-; SSE2-LABEL: @loadext_2i8_to_2i64(
-; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
-; SSE2-NEXT:    [[I0:%.*]] = load i8, i8* [[P0]], align 1
-; SSE2-NEXT:    [[I1:%.*]] = load i8, i8* [[P1]], align 1
-; SSE2-NEXT:    [[X0:%.*]] = zext i8 [[I0]] to i64
-; SSE2-NEXT:    [[X1:%.*]] = zext i8 [[I1]] to i64
-; SSE2-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0
-; SSE2-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1
-; SSE2-NEXT:    ret <2 x i64> [[V1]]
-;
-; SLM-LABEL: @loadext_2i8_to_2i64(
-; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
-; SLM-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <2 x i8>*
-; SLM-NEXT:    [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 1
-; SLM-NEXT:    [[TMP3:%.*]] = zext <2 x i8> [[TMP2]] to <2 x i64>
-; SLM-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
-; SLM-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
-; SLM-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
-; SLM-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
-; SLM-NEXT:    ret <2 x i64> [[V1]]
+; SSE-LABEL: @loadext_2i8_to_2i64(
+; SSE-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; SSE-NEXT:    [[I0:%.*]] = load i8, i8* [[P0]], align 1
+; SSE-NEXT:    [[I1:%.*]] = load i8, i8* [[P1]], align 1
+; SSE-NEXT:    [[X0:%.*]] = zext i8 [[I0]] to i64
+; SSE-NEXT:    [[X1:%.*]] = zext i8 [[I1]] to i64
+; SSE-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0
+; SSE-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1
+; SSE-NEXT:    ret <2 x i64> [[V1]]
 ;
 ; AVX-LABEL: @loadext_2i8_to_2i64(
 ; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
@@ -54,22 +43,57 @@ define <2 x i64> @loadext_2i8_to_2i64(i8* %p0) {
 }
 
 define <4 x i32> @loadext_4i8_to_4i32(i8* %p0) {
-; CHECK-LABEL: @loadext_4i8_to_4i32(
-; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
-; CHECK-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
-; CHECK-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
-; CHECK-NEXT:    [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i32>
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
-; CHECK-NEXT:    [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
-; CHECK-NEXT:    [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
-; CHECK-NEXT:    [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
-; CHECK-NEXT:    [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3
-; CHECK-NEXT:    ret <4 x i32> [[V3]]
+; SSE2-LABEL: @loadext_4i8_to_4i32(
+; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; SSE2-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; SSE2-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; SSE2-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
+; SSE2-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
+; SSE2-NEXT:    [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i32>
+; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
+; SSE2-NEXT:    [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
+; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
+; SSE2-NEXT:    [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1
+; SSE2-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
+; SSE2-NEXT:    [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2
+; SSE2-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
+; SSE2-NEXT:    [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3
+; SSE2-NEXT:    ret <4 x i32> [[V3]]
+;
+; SLM-LABEL: @loadext_4i8_to_4i32(
+; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; SLM-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; SLM-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; SLM-NEXT:    [[I0:%.*]] = load i8, i8* [[P0]], align 1
+; SLM-NEXT:    [[I1:%.*]] = load i8, i8* [[P1]], align 1
+; SLM-NEXT:    [[I2:%.*]] = load i8, i8* [[P2]], align 1
+; SLM-NEXT:    [[I3:%.*]] = load i8, i8* [[P3]], align 1
+; SLM-NEXT:    [[X0:%.*]] = zext i8 [[I0]] to i32
+; SLM-NEXT:    [[X1:%.*]] = zext i8 [[I1]] to i32
+; SLM-NEXT:    [[X2:%.*]] = zext i8 [[I2]] to i32
+; SLM-NEXT:    [[X3:%.*]] = zext i8 [[I3]] to i32
+; SLM-NEXT:    [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[X0]], i32 0
+; SLM-NEXT:    [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[X1]], i32 1
+; SLM-NEXT:    [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[X2]], i32 2
+; SLM-NEXT:    [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[X3]], i32 3
+; SLM-NEXT:    ret <4 x i32> [[V3]]
+;
+; AVX-LABEL: @loadext_4i8_to_4i32(
+; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; AVX-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; AVX-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; AVX-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
+; AVX-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
+; AVX-NEXT:    [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i32>
+; AVX-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
+; AVX-NEXT:    [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
+; AVX-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
+; AVX-NEXT:    [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1
+; AVX-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
+; AVX-NEXT:    [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2
+; AVX-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
+; AVX-NEXT:    [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3
+; AVX-NEXT:    ret <4 x i32> [[V3]]
 ;
   %p1 = getelementptr inbounds i8, i8* %p0, i64 1
   %p2 = getelementptr inbounds i8, i8* %p0, i64 2
@@ -90,40 +114,23 @@ define <4 x i32> @loadext_4i8_to_4i32(i8* %p0) {
 }
 
 define <4 x i64> @loadext_4i8_to_4i64(i8* %p0) {
-; SSE2-LABEL: @loadext_4i8_to_4i64(
-; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
-; SSE2-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
-; SSE2-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
-; SSE2-NEXT:    [[I0:%.*]] = load i8, i8* [[P0]], align 1
-; SSE2-NEXT:    [[I1:%.*]] = load i8, i8* [[P1]], align 1
-; SSE2-NEXT:    [[I2:%.*]] = load i8, i8* [[P2]], align 1
-; SSE2-NEXT:    [[I3:%.*]] = load i8, i8* [[P3]], align 1
-; SSE2-NEXT:    [[X0:%.*]] = zext i8 [[I0]] to i64
-; SSE2-NEXT:    [[X1:%.*]] = zext i8 [[I1]] to i64
-; SSE2-NEXT:    [[X2:%.*]] = zext i8 [[I2]] to i64
-; SSE2-NEXT:    [[X3:%.*]] = zext i8 [[I3]] to i64
-; SSE2-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0
-; SSE2-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1
-; SSE2-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
-; SSE2-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
-; SSE2-NEXT:    ret <4 x i64> [[V3]]
-;
-; SLM-LABEL: @loadext_4i8_to_4i64(
-; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
-; SLM-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
-; SLM-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
-; SLM-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
-; SLM-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
-; SLM-NEXT:    [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i64>
-; SLM-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
-; SLM-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
-; SLM-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
-; SLM-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
-; SLM-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
-; SLM-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
-; SLM-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
-; SLM-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
-; SLM-NEXT:    ret <4 x i64> [[V3]]
+; SSE-LABEL: @loadext_4i8_to_4i64(
+; SSE-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; SSE-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; SSE-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; SSE-NEXT:    [[I0:%.*]] = load i8, i8* [[P0]], align 1
+; SSE-NEXT:    [[I1:%.*]] = load i8, i8* [[P1]], align 1
+; SSE-NEXT:    [[I2:%.*]] = load i8, i8* [[P2]], align 1
+; SSE-NEXT:    [[I3:%.*]] = load i8, i8* [[P3]], align 1
+; SSE-NEXT:    [[X0:%.*]] = zext i8 [[I0]] to i64
+; SSE-NEXT:    [[X1:%.*]] = zext i8 [[I1]] to i64
+; SSE-NEXT:    [[X2:%.*]] = zext i8 [[I2]] to i64
+; SSE-NEXT:    [[X3:%.*]] = zext i8 [[I3]] to i64
+; SSE-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0
+; SSE-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1
+; SSE-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
+; SSE-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
+; SSE-NEXT:    ret <4 x i64> [[V3]]
 ;
 ; AVX1-LABEL: @loadext_4i8_to_4i64(
 ; AVX1-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
@@ -197,34 +204,97 @@ define <4 x i64> @loadext_4i8_to_4i64(i8* %p0) {
 }
 
 define <8 x i16> @loadext_8i8_to_8i16(i8* %p0) {
-; CHECK-LABEL: @loadext_8i8_to_8i16(
-; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
-; CHECK-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
-; CHECK-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
-; CHECK-NEXT:    [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
-; CHECK-NEXT:    [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
-; CHECK-NEXT:    [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
-; CHECK-NEXT:    [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>*
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
-; CHECK-NEXT:    [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i16>
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0
-; CHECK-NEXT:    [[V0:%.*]] = insertelement <8 x i16> undef, i16 [[TMP4]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x i16> [[TMP3]], i32 1
-; CHECK-NEXT:    [[V1:%.*]] = insertelement <8 x i16> [[V0]], i16 [[TMP5]], i32 1
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <8 x i16> [[TMP3]], i32 2
-; CHECK-NEXT:    [[V2:%.*]] = insertelement <8 x i16> [[V1]], i16 [[TMP6]], i32 2
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3
-; CHECK-NEXT:    [[V3:%.*]] = insertelement <8 x i16> [[V2]], i16 [[TMP7]], i32 3
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <8 x i16> [[TMP3]], i32 4
-; CHECK-NEXT:    [[V4:%.*]] = insertelement <8 x i16> [[V3]], i16 [[TMP8]], i32 4
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <8 x i16> [[TMP3]], i32 5
-; CHECK-NEXT:    [[V5:%.*]] = insertelement <8 x i16> [[V4]], i16 [[TMP9]], i32 5
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <8 x i16> [[TMP3]], i32 6
-; CHECK-NEXT:    [[V6:%.*]] = insertelement <8 x i16> [[V5]], i16 [[TMP10]], i32 6
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <8 x i16> [[TMP3]], i32 7
-; CHECK-NEXT:    [[V7:%.*]] = insertelement <8 x i16> [[V6]], i16 [[TMP11]], i32 7
-; CHECK-NEXT:    ret <8 x i16> [[V7]]
+; SSE2-LABEL: @loadext_8i8_to_8i16(
+; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; SSE2-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; SSE2-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; SSE2-NEXT:    [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
+; SSE2-NEXT:    [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
+; SSE2-NEXT:    [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
+; SSE2-NEXT:    [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
+; SSE2-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>*
+; SSE2-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
+; SSE2-NEXT:    [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i16>
+; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0
+; SSE2-NEXT:    [[V0:%.*]] = insertelement <8 x i16> undef, i16 [[TMP4]], i32 0
+; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <8 x i16> [[TMP3]], i32 1
+; SSE2-NEXT:    [[V1:%.*]] = insertelement <8 x i16> [[V0]], i16 [[TMP5]], i32 1
+; SSE2-NEXT:    [[TMP6:%.*]] = extractelement <8 x i16> [[TMP3]], i32 2
+; SSE2-NEXT:    [[V2:%.*]] = insertelement <8 x i16> [[V1]], i16 [[TMP6]], i32 2
+; SSE2-NEXT:    [[TMP7:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3
+; SSE2-NEXT:    [[V3:%.*]] = insertelement <8 x i16> [[V2]], i16 [[TMP7]], i32 3
+; SSE2-NEXT:    [[TMP8:%.*]] = extractelement <8 x i16> [[TMP3]], i32 4
+; SSE2-NEXT:    [[V4:%.*]] = insertelement <8 x i16> [[V3]], i16 [[TMP8]], i32 4
+; SSE2-NEXT:    [[TMP9:%.*]] = extractelement <8 x i16> [[TMP3]], i32 5
+; SSE2-NEXT:    [[V5:%.*]] = insertelement <8 x i16> [[V4]], i16 [[TMP9]], i32 5
+; SSE2-NEXT:    [[TMP10:%.*]] = extractelement <8 x i16> [[TMP3]], i32 6
+; SSE2-NEXT:    [[V6:%.*]] = insertelement <8 x i16> [[V5]], i16 [[TMP10]], i32 6
+; SSE2-NEXT:    [[TMP11:%.*]] = extractelement <8 x i16> [[TMP3]], i32 7
+; SSE2-NEXT:    [[V7:%.*]] = insertelement <8 x i16> [[V6]], i16 [[TMP11]], i32 7
+; SSE2-NEXT:    ret <8 x i16> [[V7]]
+;
+; SLM-LABEL: @loadext_8i8_to_8i16(
+; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; SLM-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; SLM-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; SLM-NEXT:    [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
+; SLM-NEXT:    [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
+; SLM-NEXT:    [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
+; SLM-NEXT:    [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
+; SLM-NEXT:    [[I0:%.*]] = load i8, i8* [[P0]], align 1
+; SLM-NEXT:    [[I1:%.*]] = load i8, i8* [[P1]], align 1
+; SLM-NEXT:    [[I2:%.*]] = load i8, i8* [[P2]], align 1
+; SLM-NEXT:    [[I3:%.*]] = load i8, i8* [[P3]], align 1
+; SLM-NEXT:    [[I4:%.*]] = load i8, i8* [[P4]], align 1
+; SLM-NEXT:    [[I5:%.*]] = load i8, i8* [[P5]], align 1
+; SLM-NEXT:    [[I6:%.*]] = load i8, i8* [[P6]], align 1
+; SLM-NEXT:    [[I7:%.*]] = load i8, i8* [[P7]], align 1
+; SLM-NEXT:    [[X0:%.*]] = zext i8 [[I0]] to i16
+; SLM-NEXT:    [[X1:%.*]] = zext i8 [[I1]] to i16
+; SLM-NEXT:    [[X2:%.*]] = zext i8 [[I2]] to i16
+; SLM-NEXT:    [[X3:%.*]] = zext i8 [[I3]] to i16
+; SLM-NEXT:    [[X4:%.*]] = zext i8 [[I4]] to i16
+; SLM-NEXT:    [[X5:%.*]] = zext i8 [[I5]] to i16
+; SLM-NEXT:    [[X6:%.*]] = zext i8 [[I6]] to i16
+; SLM-NEXT:    [[X7:%.*]] = zext i8 [[I7]] to i16
+; SLM-NEXT:    [[V0:%.*]] = insertelement <8 x i16> undef, i16 [[X0]], i32 0
+; SLM-NEXT:    [[V1:%.*]] = insertelement <8 x i16> [[V0]], i16 [[X1]], i32 1
+; SLM-NEXT:    [[V2:%.*]] = insertelement <8 x i16> [[V1]], i16 [[X2]], i32 2
+; SLM-NEXT:    [[V3:%.*]] = insertelement <8 x i16> [[V2]], i16 [[X3]], i32 3
+; SLM-NEXT:    [[V4:%.*]] = insertelement <8 x i16> [[V3]], i16 [[X4]], i32 4
+; SLM-NEXT:    [[V5:%.*]] = insertelement <8 x i16> [[V4]], i16 [[X5]], i32 5
+; SLM-NEXT:    [[V6:%.*]] = insertelement <8 x i16> [[V5]], i16 [[X6]], i32 6
+; SLM-NEXT:    [[V7:%.*]] = insertelement <8 x i16> [[V6]], i16 [[X7]], i32 7
+; SLM-NEXT:    ret <8 x i16> [[V7]]
+;
+; AVX-LABEL: @loadext_8i8_to_8i16(
+; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; AVX-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; AVX-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; AVX-NEXT:    [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
+; AVX-NEXT:    [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
+; AVX-NEXT:    [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
+; AVX-NEXT:    [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
+; AVX-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>*
+; AVX-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
+; AVX-NEXT:    [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i16>
+; AVX-NEXT:    [[TMP4:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0
+; AVX-NEXT:    [[V0:%.*]] = insertelement <8 x i16> undef, i16 [[TMP4]], i32 0
+; AVX-NEXT:    [[TMP5:%.*]] = extractelement <8 x i16> [[TMP3]], i32 1
+; AVX-NEXT:    [[V1:%.*]] = insertelement <8 x i16> [[V0]], i16 [[TMP5]], i32 1
+; AVX-NEXT:    [[TMP6:%.*]] = extractelement <8 x i16> [[TMP3]], i32 2
+; AVX-NEXT:    [[V2:%.*]] = insertelement <8 x i16> [[V1]], i16 [[TMP6]], i32 2
+; AVX-NEXT:    [[TMP7:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3
+; AVX-NEXT:    [[V3:%.*]] = insertelement <8 x i16> [[V2]], i16 [[TMP7]], i32 3
+; AVX-NEXT:    [[TMP8:%.*]] = extractelement <8 x i16> [[TMP3]], i32 4
+; AVX-NEXT:    [[V4:%.*]] = insertelement <8 x i16> [[V3]], i16 [[TMP8]], i32 4
+; AVX-NEXT:    [[TMP9:%.*]] = extractelement <8 x i16> [[TMP3]], i32 5
+; AVX-NEXT:    [[V5:%.*]] = insertelement <8 x i16> [[V4]], i16 [[TMP9]], i32 5
+; AVX-NEXT:    [[TMP10:%.*]] = extractelement <8 x i16> [[TMP3]], i32 6
+; AVX-NEXT:    [[V6:%.*]] = insertelement <8 x i16> [[V5]], i16 [[TMP10]], i32 6
+; AVX-NEXT:    [[TMP11:%.*]] = extractelement <8 x i16> [[TMP3]], i32 7
+; AVX-NEXT:    [[V7:%.*]] = insertelement <8 x i16> [[V6]], i16 [[TMP11]], i32 7
+; AVX-NEXT:    ret <8 x i16> [[V7]]
 ;
   %p1 = getelementptr inbounds i8, i8* %p0, i64 1
   %p2 = getelementptr inbounds i8, i8* %p0, i64 2
@@ -261,34 +331,97 @@ define <8 x i16> @loadext_8i8_to_8i16(i8* %p0) {
 }
 
 define <8 x i32> @loadext_8i8_to_8i32(i8* %p0) {
-; CHECK-LABEL: @loadext_8i8_to_8i32(
-; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
-; CHECK-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
-; CHECK-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
-; CHECK-NEXT:    [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
-; CHECK-NEXT:    [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
-; CHECK-NEXT:    [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
-; CHECK-NEXT:    [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>*
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
-; CHECK-NEXT:    [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i32>
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0
-; CHECK-NEXT:    [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1
-; CHECK-NEXT:    [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2
-; CHECK-NEXT:    [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3
-; CHECK-NEXT:    [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4
-; CHECK-NEXT:    [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5
-; CHECK-NEXT:    [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6
-; CHECK-NEXT:    [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7
-; CHECK-NEXT:    [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7
-; CHECK-NEXT:    ret <8 x i32> [[V7]]
+; SSE2-LABEL: @loadext_8i8_to_8i32(
+; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; SSE2-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; SSE2-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; SSE2-NEXT:    [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
+; SSE2-NEXT:    [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
+; SSE2-NEXT:    [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
+; SSE2-NEXT:    [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
+; SSE2-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>*
+; SSE2-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
+; SSE2-NEXT:    [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i32>
+; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0
+; SSE2-NEXT:    [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0
+; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1
+; SSE2-NEXT:    [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1
+; SSE2-NEXT:    [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2
+; SSE2-NEXT:    [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2
+; SSE2-NEXT:    [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3
+; SSE2-NEXT:    [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3
+; SSE2-NEXT:    [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4
+; SSE2-NEXT:    [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4
+; SSE2-NEXT:    [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5
+; SSE2-NEXT:    [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5
+; SSE2-NEXT:    [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6
+; SSE2-NEXT:    [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6
+; SSE2-NEXT:    [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7
+; SSE2-NEXT:    [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7
+; SSE2-NEXT:    ret <8 x i32> [[V7]]
+;
+; SLM-LABEL: @loadext_8i8_to_8i32(
+; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; SLM-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; SLM-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; SLM-NEXT:    [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
+; SLM-NEXT:    [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
+; SLM-NEXT:    [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
+; SLM-NEXT:    [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
+; SLM-NEXT:    [[I0:%.*]] = load i8, i8* [[P0]], align 1
+; SLM-NEXT:    [[I1:%.*]] = load i8, i8* [[P1]], align 1
+; SLM-NEXT:    [[I2:%.*]] = load i8, i8* [[P2]], align 1
+; SLM-NEXT:    [[I3:%.*]] = load i8, i8* [[P3]], align 1
+; SLM-NEXT:    [[I4:%.*]] = load i8, i8* [[P4]], align 1
+; SLM-NEXT:    [[I5:%.*]] = load i8, i8* [[P5]], align 1
+; SLM-NEXT:    [[I6:%.*]] = load i8, i8* [[P6]], align 1
+; SLM-NEXT:    [[I7:%.*]] = load i8, i8* [[P7]], align 1
+; SLM-NEXT:    [[X0:%.*]] = zext i8 [[I0]] to i32
+; SLM-NEXT:    [[X1:%.*]] = zext i8 [[I1]] to i32
+; SLM-NEXT:    [[X2:%.*]] = zext i8 [[I2]] to i32
+; SLM-NEXT:    [[X3:%.*]] = zext i8 [[I3]] to i32
+; SLM-NEXT:    [[X4:%.*]] = zext i8 [[I4]] to i32
+; SLM-NEXT:    [[X5:%.*]] = zext i8 [[I5]] to i32
+; SLM-NEXT:    [[X6:%.*]] = zext i8 [[I6]] to i32
+; SLM-NEXT:    [[X7:%.*]] = zext i8 [[I7]] to i32
+; SLM-NEXT:    [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[X0]], i32 0
+; SLM-NEXT:    [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[X1]], i32 1
+; SLM-NEXT:    [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[X2]], i32 2
+; SLM-NEXT:    [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[X3]], i32 3
+; SLM-NEXT:    [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[X4]], i32 4
+; SLM-NEXT:    [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[X5]], i32 5
+; SLM-NEXT:    [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[X6]], i32 6
+; SLM-NEXT:    [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[X7]], i32 7
+; SLM-NEXT:    ret <8 x i32> [[V7]]
+;
+; AVX-LABEL: @loadext_8i8_to_8i32(
+; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; AVX-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; AVX-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; AVX-NEXT:    [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
+; AVX-NEXT:    [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
+; AVX-NEXT:    [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
+; AVX-NEXT:    [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
+; AVX-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>*
+; AVX-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
+; AVX-NEXT:    [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i32>
+; AVX-NEXT:    [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0
+; AVX-NEXT:    [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0
+; AVX-NEXT:    [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1
+; AVX-NEXT:    [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1
+; AVX-NEXT:    [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2
+; AVX-NEXT:    [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2
+; AVX-NEXT:    [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3
+; AVX-NEXT:    [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3
+; AVX-NEXT:    [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4
+; AVX-NEXT:    [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4
+; AVX-NEXT:    [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5
+; AVX-NEXT:    [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5
+; AVX-NEXT:    [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6
+; AVX-NEXT:    [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6
+; AVX-NEXT:    [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7
+; AVX-NEXT:    [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7
+; AVX-NEXT:    ret <8 x i32> [[V7]]
 ;
   %p1 = getelementptr inbounds i8, i8* %p0, i64 1
   %p2 = getelementptr inbounds i8, i8* %p0, i64 2
@@ -325,58 +458,177 @@ define <8 x i32> @loadext_8i8_to_8i32(i8* %p0) {
 }
 
 define <16 x i16> @loadext_16i8_to_16i16(i8* %p0) {
-; CHECK-LABEL: @loadext_16i8_to_16i16(
-; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
-; CHECK-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
-; CHECK-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
-; CHECK-NEXT:    [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
-; CHECK-NEXT:    [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
-; CHECK-NEXT:    [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
-; CHECK-NEXT:    [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
-; CHECK-NEXT:    [[P8:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 8
-; CHECK-NEXT:    [[P9:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 9
-; CHECK-NEXT:    [[P10:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 10
-; CHECK-NEXT:    [[P11:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 11
-; CHECK-NEXT:    [[P12:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 12
-; CHECK-NEXT:    [[P13:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 13
-; CHECK-NEXT:    [[P14:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 14
-; CHECK-NEXT:    [[P15:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 15
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <16 x i8>*
-; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[TMP1]], align 1
-; CHECK-NEXT:    [[TMP3:%.*]] = zext <16 x i8> [[TMP2]] to <16 x i16>
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <16 x i16> [[TMP3]], i32 0
-; CHECK-NEXT:    [[V0:%.*]] = insertelement <16 x i16> undef, i16 [[TMP4]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <16 x i16> [[TMP3]], i32 1
-; CHECK-NEXT:    [[V1:%.*]] = insertelement <16 x i16> [[V0]], i16 [[TMP5]], i32 1
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <16 x i16> [[TMP3]], i32 2
-; CHECK-NEXT:    [[V2:%.*]] = insertelement <16 x i16> [[V1]], i16 [[TMP6]], i32 2
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <16 x i16> [[TMP3]], i32 3
-; CHECK-NEXT:    [[V3:%.*]] = insertelement <16 x i16> [[V2]], i16 [[TMP7]], i32 3
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <16 x i16> [[TMP3]], i32 4
-; CHECK-NEXT:    [[V4:%.*]] = insertelement <16 x i16> [[V3]], i16 [[TMP8]], i32 4
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <16 x i16> [[TMP3]], i32 5
-; CHECK-NEXT:    [[V5:%.*]] = insertelement <16 x i16> [[V4]], i16 [[TMP9]], i32 5
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <16 x i16> [[TMP3]], i32 6
-; CHECK-NEXT:    [[V6:%.*]] = insertelement <16 x i16> [[V5]], i16 [[TMP10]], i32 6
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <16 x i16> [[TMP3]], i32 7
-; CHECK-NEXT:    [[V7:%.*]] = insertelement <16 x i16> [[V6]], i16 [[TMP11]], i32 7
-; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <16 x i16> [[TMP3]], i32 8
-; CHECK-NEXT:    [[V8:%.*]] = insertelement <16 x i16> [[V7]], i16 [[TMP12]], i32 8
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <16 x i16> [[TMP3]], i32 9
-; CHECK-NEXT:    [[V9:%.*]] = insertelement <16 x i16> [[V8]], i16 [[TMP13]], i32 9
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <16 x i16> [[TMP3]], i32 10
-; CHECK-NEXT:    [[V10:%.*]] = insertelement <16 x i16> [[V9]], i16 [[TMP14]], i32 10
-; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <16 x i16> [[TMP3]], i32 11
-; CHECK-NEXT:    [[V11:%.*]] = insertelement <16 x i16> [[V10]], i16 [[TMP15]], i32 11
-; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <16 x i16> [[TMP3]], i32 12
-; CHECK-NEXT:    [[V12:%.*]] = insertelement <16 x i16> [[V11]], i16 [[TMP16]], i32 12
-; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <16 x i16> [[TMP3]], i32 13
-; CHECK-NEXT:    [[V13:%.*]] = insertelement <16 x i16> [[V12]], i16 [[TMP17]], i32 13
-; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <16 x i16> [[TMP3]], i32 14
-; CHECK-NEXT:    [[V14:%.*]] = insertelement <16 x i16> [[V13]], i16 [[TMP18]], i32 14
-; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <16 x i16> [[TMP3]], i32 15
-; CHECK-NEXT:    [[V15:%.*]] = insertelement <16 x i16> [[V14]], i16 [[TMP19]], i32 15
-; CHECK-NEXT:    ret <16 x i16> [[V15]]
+; SSE2-LABEL: @loadext_16i8_to_16i16(
+; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; SSE2-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; SSE2-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; SSE2-NEXT:    [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
+; SSE2-NEXT:    [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
+; SSE2-NEXT:    [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
+; SSE2-NEXT:    [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
+; SSE2-NEXT:    [[P8:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 8
+; SSE2-NEXT:    [[P9:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 9
+; SSE2-NEXT:    [[P10:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 10
+; SSE2-NEXT:    [[P11:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 11
+; SSE2-NEXT:    [[P12:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 12
+; SSE2-NEXT:    [[P13:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 13
+; SSE2-NEXT:    [[P14:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 14
+; SSE2-NEXT:    [[P15:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 15
+; SSE2-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <16 x i8>*
+; SSE2-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[TMP1]], align 1
+; SSE2-NEXT:    [[TMP3:%.*]] = zext <16 x i8> [[TMP2]] to <16 x i16>
+; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <16 x i16> [[TMP3]], i32 0
+; SSE2-NEXT:    [[V0:%.*]] = insertelement <16 x i16> undef, i16 [[TMP4]], i32 0
+; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <16 x i16> [[TMP3]], i32 1
+; SSE2-NEXT:    [[V1:%.*]] = insertelement <16 x i16> [[V0]], i16 [[TMP5]], i32 1
+; SSE2-NEXT:    [[TMP6:%.*]] = extractelement <16 x i16> [[TMP3]], i32 2
+; SSE2-NEXT:    [[V2:%.*]] = insertelement <16 x i16> [[V1]], i16 [[TMP6]], i32 2
+; SSE2-NEXT:    [[TMP7:%.*]] = extractelement <16 x i16> [[TMP3]], i32 3
+; SSE2-NEXT:    [[V3:%.*]] = insertelement <16 x i16> [[V2]], i16 [[TMP7]], i32 3
+; SSE2-NEXT:    [[TMP8:%.*]] = extractelement <16 x i16> [[TMP3]], i32 4
+; SSE2-NEXT:    [[V4:%.*]] = insertelement <16 x i16> [[V3]], i16 [[TMP8]], i32 4
+; SSE2-NEXT:    [[TMP9:%.*]] = extractelement <16 x i16> [[TMP3]], i32 5
+; SSE2-NEXT:    [[V5:%.*]] = insertelement <16 x i16> [[V4]], i16 [[TMP9]], i32 5
+; SSE2-NEXT:    [[TMP10:%.*]] = extractelement <16 x i16> [[TMP3]], i32 6
+; SSE2-NEXT:    [[V6:%.*]] = insertelement <16 x i16> [[V5]], i16 [[TMP10]], i32 6
+; SSE2-NEXT:    [[TMP11:%.*]] = extractelement <16 x i16> [[TMP3]], i32 7
+; SSE2-NEXT:    [[V7:%.*]] = insertelement <16 x i16> [[V6]], i16 [[TMP11]], i32 7
+; SSE2-NEXT:    [[TMP12:%.*]] = extractelement <16 x i16> [[TMP3]], i32 8
+; SSE2-NEXT:    [[V8:%.*]] = insertelement <16 x i16> [[V7]], i16 [[TMP12]], i32 8
+; SSE2-NEXT:    [[TMP13:%.*]] = extractelement <16 x i16> [[TMP3]], i32 9
+; SSE2-NEXT:    [[V9:%.*]] = insertelement <16 x i16> [[V8]], i16 [[TMP13]], i32 9
+; SSE2-NEXT:    [[TMP14:%.*]] = extractelement <16 x i16> [[TMP3]], i32 10
+; SSE2-NEXT:    [[V10:%.*]] = insertelement <16 x i16> [[V9]], i16 [[TMP14]], i32 10
+; SSE2-NEXT:    [[TMP15:%.*]] = extractelement <16 x i16> [[TMP3]], i32 11
+; SSE2-NEXT:    [[V11:%.*]] = insertelement <16 x i16> [[V10]], i16 [[TMP15]], i32 11
+; SSE2-NEXT:    [[TMP16:%.*]] = extractelement <16 x i16> [[TMP3]], i32 12
+; SSE2-NEXT:    [[V12:%.*]] = insertelement <16 x i16> [[V11]], i16 [[TMP16]], i32 12
+; SSE2-NEXT:    [[TMP17:%.*]] = extractelement <16 x i16> [[TMP3]], i32 13
+; SSE2-NEXT:    [[V13:%.*]] = insertelement <16 x i16> [[V12]], i16 [[TMP17]], i32 13
+; SSE2-NEXT:    [[TMP18:%.*]] = extractelement <16 x i16> [[TMP3]], i32 14
+; SSE2-NEXT:    [[V14:%.*]] = insertelement <16 x i16> [[V13]], i16 [[TMP18]], i32 14
+; SSE2-NEXT:    [[TMP19:%.*]] = extractelement <16 x i16> [[TMP3]], i32 15
+; SSE2-NEXT:    [[V15:%.*]] = insertelement <16 x i16> [[V14]], i16 [[TMP19]], i32 15
+; SSE2-NEXT:    ret <16 x i16> [[V15]]
+;
+; SLM-LABEL: @loadext_16i8_to_16i16(
+; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; SLM-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; SLM-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; SLM-NEXT:    [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
+; SLM-NEXT:    [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
+; SLM-NEXT:    [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
+; SLM-NEXT:    [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
+; SLM-NEXT:    [[P8:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 8
+; SLM-NEXT:    [[P9:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 9
+; SLM-NEXT:    [[P10:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 10
+; SLM-NEXT:    [[P11:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 11
+; SLM-NEXT:    [[P12:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 12
+; SLM-NEXT:    [[P13:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 13
+; SLM-NEXT:    [[P14:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 14
+; SLM-NEXT:    [[P15:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 15
+; SLM-NEXT:    [[I0:%.*]] = load i8, i8* [[P0]], align 1
+; SLM-NEXT:    [[I1:%.*]] = load i8, i8* [[P1]], align 1
+; SLM-NEXT:    [[I2:%.*]] = load i8, i8* [[P2]], align 1
+; SLM-NEXT:    [[I3:%.*]] = load i8, i8* [[P3]], align 1
+; SLM-NEXT:    [[I4:%.*]] = load i8, i8* [[P4]], align 1
+; SLM-NEXT:    [[I5:%.*]] = load i8, i8* [[P5]], align 1
+; SLM-NEXT:    [[I6:%.*]] = load i8, i8* [[P6]], align 1
+; SLM-NEXT:    [[I7:%.*]] = load i8, i8* [[P7]], align 1
+; SLM-NEXT:    [[I8:%.*]] = load i8, i8* [[P8]], align 1
+; SLM-NEXT:    [[I9:%.*]] = load i8, i8* [[P9]], align 1
+; SLM-NEXT:    [[I10:%.*]] = load i8, i8* [[P10]], align 1
+; SLM-NEXT:    [[I11:%.*]] = load i8, i8* [[P11]], align 1
+; SLM-NEXT:    [[I12:%.*]] = load i8, i8* [[P12]], align 1
+; SLM-NEXT:    [[I13:%.*]] = load i8, i8* [[P13]], align 1
+; SLM-NEXT:    [[I14:%.*]] = load i8, i8* [[P14]], align 1
+; SLM-NEXT:    [[I15:%.*]] = load i8, i8* [[P15]], align 1
+; SLM-NEXT:    [[X0:%.*]] = zext i8 [[I0]] to i16
+; SLM-NEXT:    [[X1:%.*]] = zext i8 [[I1]] to i16
+; SLM-NEXT:    [[X2:%.*]] = zext i8 [[I2]] to i16
+; SLM-NEXT:    [[X3:%.*]] = zext i8 [[I3]] to i16
+; SLM-NEXT:    [[X4:%.*]] = zext i8 [[I4]] to i16
+; SLM-NEXT:    [[X5:%.*]] = zext i8 [[I5]] to i16
+; SLM-NEXT:    [[X6:%.*]] = zext i8 [[I6]] to i16
+; SLM-NEXT:    [[X7:%.*]] = zext i8 [[I7]] to i16
+; SLM-NEXT:    [[X8:%.*]] = zext i8 [[I8]] to i16
+; SLM-NEXT:    [[X9:%.*]] = zext i8 [[I9]] to i16
+; SLM-NEXT:    [[X10:%.*]] = zext i8 [[I10]] to i16
+; SLM-NEXT:    [[X11:%.*]] = zext i8 [[I11]] to i16
+; SLM-NEXT:    [[X12:%.*]] = zext i8 [[I12]] to i16
+; SLM-NEXT:    [[X13:%.*]] = zext i8 [[I13]] to i16
+; SLM-NEXT:    [[X14:%.*]] = zext i8 [[I14]] to i16
+; SLM-NEXT:    [[X15:%.*]] = zext i8 [[I15]] to i16
+; SLM-NEXT:    [[V0:%.*]] = insertelement <16 x i16> undef, i16 [[X0]], i32 0
+; SLM-NEXT:    [[V1:%.*]] = insertelement <16 x i16> [[V0]], i16 [[X1]], i32 1
+; SLM-NEXT:    [[V2:%.*]] = insertelement <16 x i16> [[V1]], i16 [[X2]], i32 2
+; SLM-NEXT:    [[V3:%.*]] = insertelement <16 x i16> [[V2]], i16 [[X3]], i32 3
+; SLM-NEXT:    [[V4:%.*]] = insertelement <16 x i16> [[V3]], i16 [[X4]], i32 4
+; SLM-NEXT:    [[V5:%.*]] = insertelement <16 x i16> [[V4]], i16 [[X5]], i32 5
+; SLM-NEXT:    [[V6:%.*]] = insertelement <16 x i16> [[V5]], i16 [[X6]], i32 6
+; SLM-NEXT:    [[V7:%.*]] = insertelement <16 x i16> [[V6]], i16 [[X7]], i32 7
+; SLM-NEXT:    [[V8:%.*]] = insertelement <16 x i16> [[V7]], i16 [[X8]], i32 8
+; SLM-NEXT:    [[V9:%.*]] = insertelement <16 x i16> [[V8]], i16 [[X9]], i32 9
+; SLM-NEXT:    [[V10:%.*]] = insertelement <16 x i16> [[V9]], i16 [[X10]], i32 10
+; SLM-NEXT:    [[V11:%.*]] = insertelement <16 x i16> [[V10]], i16 [[X11]], i32 11
+; SLM-NEXT:    [[V12:%.*]] = insertelement <16 x i16> [[V11]], i16 [[X12]], i32 12
+; SLM-NEXT:    [[V13:%.*]] = insertelement <16 x i16> [[V12]], i16 [[X13]], i32 13
+; SLM-NEXT:    [[V14:%.*]] = insertelement <16 x i16> [[V13]], i16 [[X14]], i32 14
+; SLM-NEXT:    [[V15:%.*]] = insertelement <16 x i16> [[V14]], i16 [[X15]], i32 15
+; SLM-NEXT:    ret <16 x i16> [[V15]]
+;
+; AVX-LABEL: @loadext_16i8_to_16i16(
+; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; AVX-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; AVX-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; AVX-NEXT:    [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
+; AVX-NEXT:    [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
+; AVX-NEXT:    [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
+; AVX-NEXT:    [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
+; AVX-NEXT:    [[P8:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 8
+; AVX-NEXT:    [[P9:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 9
+; AVX-NEXT:    [[P10:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 10
+; AVX-NEXT:    [[P11:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 11
+; AVX-NEXT:    [[P12:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 12
+; AVX-NEXT:    [[P13:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 13
+; AVX-NEXT:    [[P14:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 14
+; AVX-NEXT:    [[P15:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 15
+; AVX-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <16 x i8>*
+; AVX-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[TMP1]], align 1
+; AVX-NEXT:    [[TMP3:%.*]] = zext <16 x i8> [[TMP2]] to <16 x i16>
+; AVX-NEXT:    [[TMP4:%.*]] = extractelement <16 x i16> [[TMP3]], i32 0
+; AVX-NEXT:    [[V0:%.*]] = insertelement <16 x i16> undef, i16 [[TMP4]], i32 0
+; AVX-NEXT:    [[TMP5:%.*]] = extractelement <16 x i16> [[TMP3]], i32 1
+; AVX-NEXT:    [[V1:%.*]] = insertelement <16 x i16> [[V0]], i16 [[TMP5]], i32 1
+; AVX-NEXT:    [[TMP6:%.*]] = extractelement <16 x i16> [[TMP3]], i32 2
+; AVX-NEXT:    [[V2:%.*]] = insertelement <16 x i16> [[V1]], i16 [[TMP6]], i32 2
+; AVX-NEXT:    [[TMP7:%.*]] = extractelement <16 x i16> [[TMP3]], i32 3
+; AVX-NEXT:    [[V3:%.*]] = insertelement <16 x i16> [[V2]], i16 [[TMP7]], i32 3
+; AVX-NEXT:    [[TMP8:%.*]] = extractelement <16 x i16> [[TMP3]], i32 4
+; AVX-NEXT:    [[V4:%.*]] = insertelement <16 x i16> [[V3]], i16 [[TMP8]], i32 4
+; AVX-NEXT:    [[TMP9:%.*]] = extractelement <16 x i16> [[TMP3]], i32 5
+; AVX-NEXT:    [[V5:%.*]] = insertelement <16 x i16> [[V4]], i16 [[TMP9]], i32 5
+; AVX-NEXT:    [[TMP10:%.*]] = extractelement <16 x i16> [[TMP3]], i32 6
+; AVX-NEXT:    [[V6:%.*]] = insertelement <16 x i16> [[V5]], i16 [[TMP10]], i32 6
+; AVX-NEXT:    [[TMP11:%.*]] = extractelement <16 x i16> [[TMP3]], i32 7
+; AVX-NEXT:    [[V7:%.*]] = insertelement <16 x i16> [[V6]], i16 [[TMP11]], i32 7
+; AVX-NEXT:    [[TMP12:%.*]] = extractelement <16 x i16> [[TMP3]], i32 8
+; AVX-NEXT:    [[V8:%.*]] = insertelement <16 x i16> [[V7]], i16 [[TMP12]], i32 8
+; AVX-NEXT:    [[TMP13:%.*]] = extractelement <16 x i16> [[TMP3]], i32 9
+; AVX-NEXT:    [[V9:%.*]] = insertelement <16 x i16> [[V8]], i16 [[TMP13]], i32 9
+; AVX-NEXT:    [[TMP14:%.*]] = extractelement <16 x i16> [[TMP3]], i32 10
+; AVX-NEXT:    [[V10:%.*]] = insertelement <16 x i16> [[V9]], i16 [[TMP14]], i32 10
+; AVX-NEXT:    [[TMP15:%.*]] = extractelement <16 x i16> [[TMP3]], i32 11
+; AVX-NEXT:    [[V11:%.*]] = insertelement <16 x i16> [[V10]], i16 [[TMP15]], i32 11
+; AVX-NEXT:    [[TMP16:%.*]] = extractelement <16 x i16> [[TMP3]], i32 12
+; AVX-NEXT:    [[V12:%.*]] = insertelement <16 x i16> [[V11]], i16 [[TMP16]], i32 12
+; AVX-NEXT:    [[TMP17:%.*]] = extractelement <16 x i16> [[TMP3]], i32 13
+; AVX-NEXT:    [[V13:%.*]] = insertelement <16 x i16> [[V12]], i16 [[TMP17]], i32 13
+; AVX-NEXT:    [[TMP18:%.*]] = extractelement <16 x i16> [[TMP3]], i32 14
+; AVX-NEXT:    [[V14:%.*]] = insertelement <16 x i16> [[V13]], i16 [[TMP18]], i32 14
+; AVX-NEXT:    [[TMP19:%.*]] = extractelement <16 x i16> [[TMP3]], i32 15
+; AVX-NEXT:    [[V15:%.*]] = insertelement <16 x i16> [[V14]], i16 [[TMP19]], i32 15
+; AVX-NEXT:    ret <16 x i16> [[V15]]
 ;
   %p1  = getelementptr inbounds i8, i8* %p0, i64 1
   %p2  = getelementptr inbounds i8, i8* %p0, i64 2
@@ -449,26 +701,15 @@ define <16 x i16> @loadext_16i8_to_16i16(i8* %p0) {
 ;
 
 define <2 x i64> @loadext_2i16_to_2i64(i16* %p0) {
-; SSE2-LABEL: @loadext_2i16_to_2i64(
-; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
-; SSE2-NEXT:    [[I0:%.*]] = load i16, i16* [[P0]], align 1
-; SSE2-NEXT:    [[I1:%.*]] = load i16, i16* [[P1]], align 1
-; SSE2-NEXT:    [[X0:%.*]] = zext i16 [[I0]] to i64
-; SSE2-NEXT:    [[X1:%.*]] = zext i16 [[I1]] to i64
-; SSE2-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0
-; SSE2-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1
-; SSE2-NEXT:    ret <2 x i64> [[V1]]
-;
-; SLM-LABEL: @loadext_2i16_to_2i64(
-; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
-; SLM-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <2 x i16>*
-; SLM-NEXT:    [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[TMP1]], align 1
-; SLM-NEXT:    [[TMP3:%.*]] = zext <2 x i16> [[TMP2]] to <2 x i64>
-; SLM-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
-; SLM-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
-; SLM-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
-; SLM-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
-; SLM-NEXT:    ret <2 x i64> [[V1]]
+; SSE-LABEL: @loadext_2i16_to_2i64(
+; SSE-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; SSE-NEXT:    [[I0:%.*]] = load i16, i16* [[P0]], align 1
+; SSE-NEXT:    [[I1:%.*]] = load i16, i16* [[P1]], align 1
+; SSE-NEXT:    [[X0:%.*]] = zext i16 [[I0]] to i64
+; SSE-NEXT:    [[X1:%.*]] = zext i16 [[I1]] to i64
+; SSE-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0
+; SSE-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1
+; SSE-NEXT:    ret <2 x i64> [[V1]]
 ;
 ; AVX-LABEL: @loadext_2i16_to_2i64(
 ; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
@@ -492,22 +733,57 @@ define <2 x i64> @loadext_2i16_to_2i64(i16* %p0) {
 }
 
 define <4 x i32> @loadext_4i16_to_4i32(i16* %p0) {
-; CHECK-LABEL: @loadext_4i16_to_4i32(
-; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
-; CHECK-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
-; CHECK-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>*
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1
-; CHECK-NEXT:    [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32>
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
-; CHECK-NEXT:    [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
-; CHECK-NEXT:    [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
-; CHECK-NEXT:    [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
-; CHECK-NEXT:    [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3
-; CHECK-NEXT:    ret <4 x i32> [[V3]]
+; SSE2-LABEL: @loadext_4i16_to_4i32(
+; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; SSE2-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
+; SSE2-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
+; SSE2-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>*
+; SSE2-NEXT:    [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1
+; SSE2-NEXT:    [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32>
+; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
+; SSE2-NEXT:    [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
+; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
+; SSE2-NEXT:    [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1
+; SSE2-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
+; SSE2-NEXT:    [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2
+; SSE2-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
+; SSE2-NEXT:    [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3
+; SSE2-NEXT:    ret <4 x i32> [[V3]]
+;
+; SLM-LABEL: @loadext_4i16_to_4i32(
+; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; SLM-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
+; SLM-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
+; SLM-NEXT:    [[I0:%.*]] = load i16, i16* [[P0]], align 1
+; SLM-NEXT:    [[I1:%.*]] = load i16, i16* [[P1]], align 1
+; SLM-NEXT:    [[I2:%.*]] = load i16, i16* [[P2]], align 1
+; SLM-NEXT:    [[I3:%.*]] = load i16, i16* [[P3]], align 1
+; SLM-NEXT:    [[X0:%.*]] = zext i16 [[I0]] to i32
+; SLM-NEXT:    [[X1:%.*]] = zext i16 [[I1]] to i32
+; SLM-NEXT:    [[X2:%.*]] = zext i16 [[I2]] to i32
+; SLM-NEXT:    [[X3:%.*]] = zext i16 [[I3]] to i32
+; SLM-NEXT:    [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[X0]], i32 0
+; SLM-NEXT:    [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[X1]], i32 1
+; SLM-NEXT:    [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[X2]], i32 2
+; SLM-NEXT:    [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[X3]], i32 3
+; SLM-NEXT:    ret <4 x i32> [[V3]]
+;
+; AVX-LABEL: @loadext_4i16_to_4i32(
+; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; AVX-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
+; AVX-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
+; AVX-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>*
+; AVX-NEXT:    [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1
+; AVX-NEXT:    [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32>
+; AVX-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
+; AVX-NEXT:    [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
+; AVX-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
+; AVX-NEXT:    [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1
+; AVX-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
+; AVX-NEXT:    [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2
+; AVX-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
+; AVX-NEXT:    [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3
+; AVX-NEXT:    ret <4 x i32> [[V3]]
 ;
   %p1 = getelementptr inbounds i16, i16* %p0, i64 1
   %p2 = getelementptr inbounds i16, i16* %p0, i64 2
@@ -528,40 +804,23 @@ define <4 x i32> @loadext_4i16_to_4i32(i16* %p0) {
 }
 
 define <4 x i64> @loadext_4i16_to_4i64(i16* %p0) {
-; SSE2-LABEL: @loadext_4i16_to_4i64(
-; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
-; SSE2-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
-; SSE2-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
-; SSE2-NEXT:    [[I0:%.*]] = load i16, i16* [[P0]], align 1
-; SSE2-NEXT:    [[I1:%.*]] = load i16, i16* [[P1]], align 1
-; SSE2-NEXT:    [[I2:%.*]] = load i16, i16* [[P2]], align 1
-; SSE2-NEXT:    [[I3:%.*]] = load i16, i16* [[P3]], align 1
-; SSE2-NEXT:    [[X0:%.*]] = zext i16 [[I0]] to i64
-; SSE2-NEXT:    [[X1:%.*]] = zext i16 [[I1]] to i64
-; SSE2-NEXT:    [[X2:%.*]] = zext i16 [[I2]] to i64
-; SSE2-NEXT:    [[X3:%.*]] = zext i16 [[I3]] to i64
-; SSE2-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0
-; SSE2-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1
-; SSE2-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
-; SSE2-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
-; SSE2-NEXT:    ret <4 x i64> [[V3]]
-;
-; SLM-LABEL: @loadext_4i16_to_4i64(
-; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
-; SLM-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
-; SLM-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
-; SLM-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>*
-; SLM-NEXT:    [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1
-; SLM-NEXT:    [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i64>
-; SLM-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
-; SLM-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
-; SLM-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
-; SLM-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
-; SLM-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
-; SLM-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
-; SLM-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
-; SLM-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
-; SLM-NEXT:    ret <4 x i64> [[V3]]
+; SSE-LABEL: @loadext_4i16_to_4i64(
+; SSE-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; SSE-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
+; SSE-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
+; SSE-NEXT:    [[I0:%.*]] = load i16, i16* [[P0]], align 1
+; SSE-NEXT:    [[I1:%.*]] = load i16, i16* [[P1]], align 1
+; SSE-NEXT:    [[I2:%.*]] = load i16, i16* [[P2]], align 1
+; SSE-NEXT:    [[I3:%.*]] = load i16, i16* [[P3]], align 1
+; SSE-NEXT:    [[X0:%.*]] = zext i16 [[I0]] to i64
+; SSE-NEXT:    [[X1:%.*]] = zext i16 [[I1]] to i64
+; SSE-NEXT:    [[X2:%.*]] = zext i16 [[I2]] to i64
+; SSE-NEXT:    [[X3:%.*]] = zext i16 [[I3]] to i64
+; SSE-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0
+; SSE-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1
+; SSE-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
+; SSE-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
+; SSE-NEXT:    ret <4 x i64> [[V3]]
 ;
 ; AVX1-LABEL: @loadext_4i16_to_4i64(
 ; AVX1-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
@@ -635,34 +894,97 @@ define <4 x i64> @loadext_4i16_to_4i64(i16* %p0) {
 }
 
 define <8 x i32> @loadext_8i16_to_8i32(i16* %p0) {
-; CHECK-LABEL: @loadext_8i16_to_8i32(
-; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
-; CHECK-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
-; CHECK-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
-; CHECK-NEXT:    [[P4:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 4
-; CHECK-NEXT:    [[P5:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 5
-; CHECK-NEXT:    [[P6:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 6
-; CHECK-NEXT:    [[P7:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 7
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <8 x i16>*
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 1
-; CHECK-NEXT:    [[TMP3:%.*]] = zext <8 x i16> [[TMP2]] to <8 x i32>
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0
-; CHECK-NEXT:    [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1
-; CHECK-NEXT:    [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2
-; CHECK-NEXT:    [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3
-; CHECK-NEXT:    [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4
-; CHECK-NEXT:    [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5
-; CHECK-NEXT:    [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6
-; CHECK-NEXT:    [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7
-; CHECK-NEXT:    [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7
-; CHECK-NEXT:    ret <8 x i32> [[V7]]
+; SSE2-LABEL: @loadext_8i16_to_8i32(
+; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; SSE2-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
+; SSE2-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
+; SSE2-NEXT:    [[P4:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 4
+; SSE2-NEXT:    [[P5:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 5
+; SSE2-NEXT:    [[P6:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 6
+; SSE2-NEXT:    [[P7:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 7
+; SSE2-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <8 x i16>*
+; SSE2-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 1
+; SSE2-NEXT:    [[TMP3:%.*]] = zext <8 x i16> [[TMP2]] to <8 x i32>
+; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0
+; SSE2-NEXT:    [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0
+; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1
+; SSE2-NEXT:    [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1
+; SSE2-NEXT:    [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2
+; SSE2-NEXT:    [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2
+; SSE2-NEXT:    [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3
+; SSE2-NEXT:    [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3
+; SSE2-NEXT:    [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4
+; SSE2-NEXT:    [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4
+; SSE2-NEXT:    [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5
+; SSE2-NEXT:    [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5
+; SSE2-NEXT:    [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6
+; SSE2-NEXT:    [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6
+; SSE2-NEXT:    [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7
+; SSE2-NEXT:    [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7
+; SSE2-NEXT:    ret <8 x i32> [[V7]]
+;
+; SLM-LABEL: @loadext_8i16_to_8i32(
+; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; SLM-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
+; SLM-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
+; SLM-NEXT:    [[P4:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 4
+; SLM-NEXT:    [[P5:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 5
+; SLM-NEXT:    [[P6:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 6
+; SLM-NEXT:    [[P7:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 7
+; SLM-NEXT:    [[I0:%.*]] = load i16, i16* [[P0]], align 1
+; SLM-NEXT:    [[I1:%.*]] = load i16, i16* [[P1]], align 1
+; SLM-NEXT:    [[I2:%.*]] = load i16, i16* [[P2]], align 1
+; SLM-NEXT:    [[I3:%.*]] = load i16, i16* [[P3]], align 1
+; SLM-NEXT:    [[I4:%.*]] = load i16, i16* [[P4]], align 1
+; SLM-NEXT:    [[I5:%.*]] = load i16, i16* [[P5]], align 1
+; SLM-NEXT:    [[I6:%.*]] = load i16, i16* [[P6]], align 1
+; SLM-NEXT:    [[I7:%.*]] = load i16, i16* [[P7]], align 1
+; SLM-NEXT:    [[X0:%.*]] = zext i16 [[I0]] to i32
+; SLM-NEXT:    [[X1:%.*]] = zext i16 [[I1]] to i32
+; SLM-NEXT:    [[X2:%.*]] = zext i16 [[I2]] to i32
+; SLM-NEXT:    [[X3:%.*]] = zext i16 [[I3]] to i32
+; SLM-NEXT:    [[X4:%.*]] = zext i16 [[I4]] to i32
+; SLM-NEXT:    [[X5:%.*]] = zext i16 [[I5]] to i32
+; SLM-NEXT:    [[X6:%.*]] = zext i16 [[I6]] to i32
+; SLM-NEXT:    [[X7:%.*]] = zext i16 [[I7]] to i32
+; SLM-NEXT:    [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[X0]], i32 0
+; SLM-NEXT:    [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[X1]], i32 1
+; SLM-NEXT:    [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[X2]], i32 2
+; SLM-NEXT:    [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[X3]], i32 3
+; SLM-NEXT:    [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[X4]], i32 4
+; SLM-NEXT:    [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[X5]], i32 5
+; SLM-NEXT:    [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[X6]], i32 6
+; SLM-NEXT:    [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[X7]], i32 7
+; SLM-NEXT:    ret <8 x i32> [[V7]]
+;
+; AVX-LABEL: @loadext_8i16_to_8i32(
+; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; AVX-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
+; AVX-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
+; AVX-NEXT:    [[P4:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 4
+; AVX-NEXT:    [[P5:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 5
+; AVX-NEXT:    [[P6:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 6
+; AVX-NEXT:    [[P7:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 7
+; AVX-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <8 x i16>*
+; AVX-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 1
+; AVX-NEXT:    [[TMP3:%.*]] = zext <8 x i16> [[TMP2]] to <8 x i32>
+; AVX-NEXT:    [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0
+; AVX-NEXT:    [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0
+; AVX-NEXT:    [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1
+; AVX-NEXT:    [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1
+; AVX-NEXT:    [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2
+; AVX-NEXT:    [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2
+; AVX-NEXT:    [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3
+; AVX-NEXT:    [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3
+; AVX-NEXT:    [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4
+; AVX-NEXT:    [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4
+; AVX-NEXT:    [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5
+; AVX-NEXT:    [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5
+; AVX-NEXT:    [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6
+; AVX-NEXT:    [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6
+; AVX-NEXT:    [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7
+; AVX-NEXT:    [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7
+; AVX-NEXT:    ret <8 x i32> [[V7]]
 ;
   %p1 = getelementptr inbounds i16, i16* %p0, i64 1
   %p2 = getelementptr inbounds i16, i16* %p0, i64 2
@@ -703,26 +1025,15 @@ define <8 x i32> @loadext_8i16_to_8i32(i16* %p0) {
 ;
 
 define <2 x i64> @loadext_2i32_to_2i64(i32* %p0) {
-; SSE2-LABEL: @loadext_2i32_to_2i64(
-; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
-; SSE2-NEXT:    [[I0:%.*]] = load i32, i32* [[P0]], align 1
-; SSE2-NEXT:    [[I1:%.*]] = load i32, i32* [[P1]], align 1
-; SSE2-NEXT:    [[X0:%.*]] = zext i32 [[I0]] to i64
-; SSE2-NEXT:    [[X1:%.*]] = zext i32 [[I1]] to i64
-; SSE2-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0
-; SSE2-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1
-; SSE2-NEXT:    ret <2 x i64> [[V1]]
-;
-; SLM-LABEL: @loadext_2i32_to_2i64(
-; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
-; SLM-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P0]] to <2 x i32>*
-; SLM-NEXT:    [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 1
-; SLM-NEXT:    [[TMP3:%.*]] = zext <2 x i32> [[TMP2]] to <2 x i64>
-; SLM-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
-; SLM-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
-; SLM-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
-; SLM-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
-; SLM-NEXT:    ret <2 x i64> [[V1]]
+; SSE-LABEL: @loadext_2i32_to_2i64(
+; SSE-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
+; SSE-NEXT:    [[I0:%.*]] = load i32, i32* [[P0]], align 1
+; SSE-NEXT:    [[I1:%.*]] = load i32, i32* [[P1]], align 1
+; SSE-NEXT:    [[X0:%.*]] = zext i32 [[I0]] to i64
+; SSE-NEXT:    [[X1:%.*]] = zext i32 [[I1]] to i64
+; SSE-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0
+; SSE-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1
+; SSE-NEXT:    ret <2 x i64> [[V1]]
 ;
 ; AVX-LABEL: @loadext_2i32_to_2i64(
 ; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
@@ -746,40 +1057,23 @@ define <2 x i64> @loadext_2i32_to_2i64(i32* %p0) {
 }
 
 define <4 x i64> @loadext_4i32_to_4i64(i32* %p0) {
-; SSE2-LABEL: @loadext_4i32_to_4i64(
-; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
-; SSE2-NEXT:    [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2
-; SSE2-NEXT:    [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3
-; SSE2-NEXT:    [[I0:%.*]] = load i32, i32* [[P0]], align 1
-; SSE2-NEXT:    [[I1:%.*]] = load i32, i32* [[P1]], align 1
-; SSE2-NEXT:    [[I2:%.*]] = load i32, i32* [[P2]], align 1
-; SSE2-NEXT:    [[I3:%.*]] = load i32, i32* [[P3]], align 1
-; SSE2-NEXT:    [[X0:%.*]] = zext i32 [[I0]] to i64
-; SSE2-NEXT:    [[X1:%.*]] = zext i32 [[I1]] to i64
-; SSE2-NEXT:    [[X2:%.*]] = zext i32 [[I2]] to i64
-; SSE2-NEXT:    [[X3:%.*]] = zext i32 [[I3]] to i64
-; SSE2-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0
-; SSE2-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1
-; SSE2-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
-; SSE2-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
-; SSE2-NEXT:    ret <4 x i64> [[V3]]
-;
-; SLM-LABEL: @loadext_4i32_to_4i64(
-; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
-; SLM-NEXT:    [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2
-; SLM-NEXT:    [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3
-; SLM-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P0]] to <4 x i32>*
-; SLM-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1
-; SLM-NEXT:    [[TMP3:%.*]] = zext <4 x i32> [[TMP2]] to <4 x i64>
-; SLM-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
-; SLM-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
-; SLM-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
-; SLM-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
-; SLM-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
-; SLM-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
-; SLM-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
-; SLM-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
-; SLM-NEXT:    ret <4 x i64> [[V3]]
+; SSE-LABEL: @loadext_4i32_to_4i64(
+; SSE-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
+; SSE-NEXT:    [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2
+; SSE-NEXT:    [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3
+; SSE-NEXT:    [[I0:%.*]] = load i32, i32* [[P0]], align 1
+; SSE-NEXT:    [[I1:%.*]] = load i32, i32* [[P1]], align 1
+; SSE-NEXT:    [[I2:%.*]] = load i32, i32* [[P2]], align 1
+; SSE-NEXT:    [[I3:%.*]] = load i32, i32* [[P3]], align 1
+; SSE-NEXT:    [[X0:%.*]] = zext i32 [[I0]] to i64
+; SSE-NEXT:    [[X1:%.*]] = zext i32 [[I1]] to i64
+; SSE-NEXT:    [[X2:%.*]] = zext i32 [[I2]] to i64
+; SSE-NEXT:    [[X3:%.*]] = zext i32 [[I3]] to i64
+; SSE-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0
+; SSE-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1
+; SSE-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
+; SSE-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
+; SSE-NEXT:    ret <4 x i64> [[V3]]
 ;
 ; AVX1-LABEL: @loadext_4i32_to_4i64(
 ; AVX1-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1


        


More information about the llvm-commits mailing list