[llvm] 872a950 - [CostModel] Treat 'widen subvector' patterns as zero cost

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Mon Aug 2 03:43:26 PDT 2021


Author: Simon Pilgrim
Date: 2021-08-02T11:43:10+01:00
New Revision: 872a950033d3cacd8c489f8be68d7c25a11a8746

URL: https://github.com/llvm/llvm-project/commit/872a950033d3cacd8c489f8be68d7c25a11a8746
DIFF: https://github.com/llvm/llvm-project/commit/872a950033d3cacd8c489f8be68d7c25a11a8746.diff

LOG: [CostModel] Treat 'widen subvector' patterns as zero cost

As discussed on D107228, widening a subvector by inserting the whole subvector into the bottom a larger undef vector should always be cheap enough that we can treat it as zero cost.

NOTE: If this proves to cause issues we have the option of introducing a "SK_WidenSubvector" shuffle kind enum that targets could override the zero cost, but that doesn't seem necessary atm.

Differential Revision: https://reviews.llvm.org/D107228

Added: 
    

Modified: 
    llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
    llvm/test/Analysis/CostModel/AMDGPU/shufflevector.ll
    llvm/test/Analysis/CostModel/X86/shuffle-insert_subvector.ll

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 13836e723583a..449eb5513399a 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -1077,6 +1077,10 @@ class TargetTransformInfoImplCRTPBase : public TargetTransformInfoImplBase {
       int NumSubElts, SubIndex;
 
       if (Shuffle->changesLength()) {
+        // Treat a 'subvector widening' as a free shuffle.
+        if (Shuffle->increasesLength() && Shuffle->isIdentityWithPadding())
+          return 0;
+
         if (Shuffle->isExtractSubvectorMask(SubIndex))
           return TargetTTI->getShuffleCost(TTI::SK_ExtractSubvector, VecSrcTy,
                                            Shuffle->getShuffleMask(), SubIndex,

diff  --git a/llvm/test/Analysis/CostModel/AMDGPU/shufflevector.ll b/llvm/test/Analysis/CostModel/AMDGPU/shufflevector.ll
index a276051ee0d15..99467caa16ab5 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/shufflevector.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/shufflevector.ll
@@ -53,8 +53,8 @@ define amdgpu_kernel void @shufflevector_02_v2i16(<2 x i16> addrspace(1)* %out,
 }
 
 ; GCN-LABEL: 'shufflevector_xxx'
-; TPT: estimated cost of -1 for {{.*}} shufflevector <2 x i8> %vec, <2 x i8> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; CS: estimated cost of 1 for {{.*}} shufflevector <2 x i8> %vec, <2 x i8> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; TPT: estimated cost of 0 for {{.*}} shufflevector <2 x i8> %vec, <2 x i8> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; CS: estimated cost of 0 for {{.*}} shufflevector <2 x i8> %vec, <2 x i8> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
 ; Should not assert
 define amdgpu_kernel void @shufflevector_xxx(<4 x i8> addrspace(1)* %out, <2 x i8> addrspace(1)* %vaddr) {
   %vec = load <2 x i8>, <2 x i8> addrspace(1)* %vaddr

diff  --git a/llvm/test/Analysis/CostModel/X86/shuffle-insert_subvector.ll b/llvm/test/Analysis/CostModel/X86/shuffle-insert_subvector.ll
index 280cb4c7ee8f0..45131fb65439f 100644
--- a/llvm/test/Analysis/CostModel/X86/shuffle-insert_subvector.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-insert_subvector.ll
@@ -18,9 +18,9 @@
 
 define void @test_vXf64(<2 x double> %src128, <4 x double> %src256, <8 x double> %src512) {
 ; SSE-LABEL: 'test_vXf64'
-; SSE-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src128_256 = shufflevector <2 x double> %src128, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; SSE-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src128_512 = shufflevector <2 x double> %src128, <2 x double> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src256_512 = shufflevector <4 x double> %src256, <4 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_256 = shufflevector <2 x double> %src128, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <2 x double> %src128, <2 x double> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <4 x double> %src256, <4 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_01 = shufflevector <4 x double> %src256, <4 x double> %src128_256, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <4 x double> %src256, <4 x double> %src128_256, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_01 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -32,9 +32,9 @@ define void @test_vXf64(<2 x double> %src128, <4 x double> %src256, <8 x double>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX-LABEL: 'test_vXf64'
-; AVX-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src128_256 = shufflevector <2 x double> %src128, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; AVX-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src128_512 = shufflevector <2 x double> %src128, <2 x double> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src256_512 = shufflevector <4 x double> %src256, <4 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_256 = shufflevector <2 x double> %src128, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <2 x double> %src128, <2 x double> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <4 x double> %src256, <4 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <4 x double> %src256, <4 x double> %src128_256, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <4 x double> %src256, <4 x double> %src128_256, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_01 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -46,9 +46,9 @@ define void @test_vXf64(<2 x double> %src128, <4 x double> %src256, <8 x double>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXf64'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src128_256 = shufflevector <2 x double> %src128, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src128_512 = shufflevector <2 x double> %src128, <2 x double> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src256_512 = shufflevector <4 x double> %src256, <4 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_256 = shufflevector <2 x double> %src128, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <2 x double> %src128, <2 x double> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <4 x double> %src256, <4 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <4 x double> %src256, <4 x double> %src128_256, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <4 x double> %src256, <4 x double> %src128_256, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_01 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -76,9 +76,9 @@ define void @test_vXf64(<2 x double> %src128, <4 x double> %src256, <8 x double>
 
 define void @test_vXi64(<2 x i64> %src128, <4 x i64> %src256, <8 x i64> %src512) {
 ; SSE-LABEL: 'test_vXi64'
-; SSE-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src128_256 = shufflevector <2 x i64> %src128, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; SSE-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src128_512 = shufflevector <2 x i64> %src128, <2 x i64> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src256_512 = shufflevector <4 x i64> %src256, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_256 = shufflevector <2 x i64> %src128, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <2 x i64> %src128, <2 x i64> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <4 x i64> %src256, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_01 = shufflevector <4 x i64> %src256, <4 x i64> %src128_256, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <4 x i64> %src256, <4 x i64> %src128_256, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_01 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -90,9 +90,9 @@ define void @test_vXi64(<2 x i64> %src128, <4 x i64> %src256, <8 x i64> %src512)
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX-LABEL: 'test_vXi64'
-; AVX-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src128_256 = shufflevector <2 x i64> %src128, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; AVX-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src128_512 = shufflevector <2 x i64> %src128, <2 x i64> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src256_512 = shufflevector <4 x i64> %src256, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_256 = shufflevector <2 x i64> %src128, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <2 x i64> %src128, <2 x i64> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <4 x i64> %src256, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <4 x i64> %src256, <4 x i64> %src128_256, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <4 x i64> %src256, <4 x i64> %src128_256, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_01 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -104,9 +104,9 @@ define void @test_vXi64(<2 x i64> %src128, <4 x i64> %src256, <8 x i64> %src512)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXi64'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src128_256 = shufflevector <2 x i64> %src128, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src128_512 = shufflevector <2 x i64> %src128, <2 x i64> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src256_512 = shufflevector <4 x i64> %src256, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_256 = shufflevector <2 x i64> %src128, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <2 x i64> %src128, <2 x i64> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <4 x i64> %src256, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <4 x i64> %src256, <4 x i64> %src128_256, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <4 x i64> %src256, <4 x i64> %src128_256, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_01 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -134,12 +134,12 @@ define void @test_vXi64(<2 x i64> %src128, <4 x i64> %src256, <8 x i64> %src512)
 
 define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %src256, <16 x float> %src512) {
 ; SSE2-LABEL: 'test_vXf32'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src64_128 = shufflevector <2 x float> %src64, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src64_256 = shufflevector <2 x float> %src64, <2 x float> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src64_512 = shufflevector <2 x float> %src64, <2 x float> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src128_256 = shufflevector <4 x float> %src128, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src128_512 = shufflevector <4 x float> %src128, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src256_512 = shufflevector <8 x float> %src256, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_128 = shufflevector <2 x float> %src64, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_256 = shufflevector <2 x float> %src64, <2 x float> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_512 = shufflevector <2 x float> %src64, <2 x float> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_256 = shufflevector <4 x float> %src128, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <4 x float> %src128, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <8 x float> %src256, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_01 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_23 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256_01 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -165,12 +165,12 @@ define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %sr
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'test_vXf32'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src64_128 = shufflevector <2 x float> %src64, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src64_256 = shufflevector <2 x float> %src64, <2 x float> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src64_512 = shufflevector <2 x float> %src64, <2 x float> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src128_256 = shufflevector <4 x float> %src128, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src128_512 = shufflevector <4 x float> %src128, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src256_512 = shufflevector <8 x float> %src256, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_128 = shufflevector <2 x float> %src64, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_256 = shufflevector <2 x float> %src64, <2 x float> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_512 = shufflevector <2 x float> %src64, <2 x float> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_256 = shufflevector <4 x float> %src128, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <4 x float> %src128, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <8 x float> %src256, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_01 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_23 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256_01 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -196,12 +196,12 @@ define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %sr
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SSE42-LABEL: 'test_vXf32'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src64_128 = shufflevector <2 x float> %src64, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src64_256 = shufflevector <2 x float> %src64, <2 x float> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src64_512 = shufflevector <2 x float> %src64, <2 x float> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src128_256 = shufflevector <4 x float> %src128, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src128_512 = shufflevector <4 x float> %src128, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src256_512 = shufflevector <8 x float> %src256, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_128 = shufflevector <2 x float> %src64, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_256 = shufflevector <2 x float> %src64, <2 x float> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_512 = shufflevector <2 x float> %src64, <2 x float> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_256 = shufflevector <4 x float> %src128, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <4 x float> %src128, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <8 x float> %src256, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_01 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_23 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_01 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -227,12 +227,12 @@ define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %sr
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_vXf32'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src64_128 = shufflevector <2 x float> %src64, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src64_256 = shufflevector <2 x float> %src64, <2 x float> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src64_512 = shufflevector <2 x float> %src64, <2 x float> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src128_256 = shufflevector <4 x float> %src128, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src128_512 = shufflevector <4 x float> %src128, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src256_512 = shufflevector <8 x float> %src256, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_128 = shufflevector <2 x float> %src64, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_256 = shufflevector <2 x float> %src64, <2 x float> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_512 = shufflevector <2 x float> %src64, <2 x float> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_256 = shufflevector <4 x float> %src128, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <4 x float> %src128, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <8 x float> %src256, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_01 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_23 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -258,12 +258,12 @@ define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %sr
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_vXf32'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src64_128 = shufflevector <2 x float> %src64, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src64_256 = shufflevector <2 x float> %src64, <2 x float> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src64_512 = shufflevector <2 x float> %src64, <2 x float> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src128_256 = shufflevector <4 x float> %src128, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src128_512 = shufflevector <4 x float> %src128, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src256_512 = shufflevector <8 x float> %src256, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_128 = shufflevector <2 x float> %src64, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_256 = shufflevector <2 x float> %src64, <2 x float> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_512 = shufflevector <2 x float> %src64, <2 x float> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_256 = shufflevector <4 x float> %src128, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <4 x float> %src128, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <8 x float> %src256, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_01 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_23 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -289,12 +289,12 @@ define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %sr
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXf32'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src64_128 = shufflevector <2 x float> %src64, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src64_256 = shufflevector <2 x float> %src64, <2 x float> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src64_512 = shufflevector <2 x float> %src64, <2 x float> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src128_256 = shufflevector <4 x float> %src128, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src128_512 = shufflevector <4 x float> %src128, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src256_512 = shufflevector <8 x float> %src256, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_128 = shufflevector <2 x float> %src64, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_256 = shufflevector <2 x float> %src64, <2 x float> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_512 = shufflevector <2 x float> %src64, <2 x float> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_256 = shufflevector <4 x float> %src128, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <4 x float> %src128, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <8 x float> %src256, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_01 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -355,12 +355,12 @@ define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %sr
 
 define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256, <16 x i32> %src512) {
 ; SSE2-LABEL: 'test_vXi32'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src64_128 = shufflevector <2 x i32> %src64, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src64_256 = shufflevector <2 x i32> %src64, <2 x i32> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src64_512 = shufflevector <2 x i32> %src64, <2 x i32> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src128_256 = shufflevector <4 x i32> %src128, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src128_512 = shufflevector <4 x i32> %src128, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src256_512 = shufflevector <8 x i32> %src256, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_128 = shufflevector <2 x i32> %src64, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_256 = shufflevector <2 x i32> %src64, <2 x i32> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_512 = shufflevector <2 x i32> %src64, <2 x i32> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_256 = shufflevector <4 x i32> %src128, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <4 x i32> %src128, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <8 x i32> %src256, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_01 = shufflevector <4 x i32> %src128, <4 x i32> %src64_128, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_23 = shufflevector <4 x i32> %src128, <4 x i32> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256_01 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -386,12 +386,12 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'test_vXi32'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src64_128 = shufflevector <2 x i32> %src64, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src64_256 = shufflevector <2 x i32> %src64, <2 x i32> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src64_512 = shufflevector <2 x i32> %src64, <2 x i32> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src128_256 = shufflevector <4 x i32> %src128, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src128_512 = shufflevector <4 x i32> %src128, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src256_512 = shufflevector <8 x i32> %src256, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_128 = shufflevector <2 x i32> %src64, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_256 = shufflevector <2 x i32> %src64, <2 x i32> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_512 = shufflevector <2 x i32> %src64, <2 x i32> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_256 = shufflevector <4 x i32> %src128, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <4 x i32> %src128, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <8 x i32> %src256, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_01 = shufflevector <4 x i32> %src128, <4 x i32> %src64_128, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_23 = shufflevector <4 x i32> %src128, <4 x i32> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256_01 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -417,12 +417,12 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SSE42-LABEL: 'test_vXi32'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src64_128 = shufflevector <2 x i32> %src64, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src64_256 = shufflevector <2 x i32> %src64, <2 x i32> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src64_512 = shufflevector <2 x i32> %src64, <2 x i32> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src128_256 = shufflevector <4 x i32> %src128, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src128_512 = shufflevector <4 x i32> %src128, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src256_512 = shufflevector <8 x i32> %src256, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_128 = shufflevector <2 x i32> %src64, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_256 = shufflevector <2 x i32> %src64, <2 x i32> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_512 = shufflevector <2 x i32> %src64, <2 x i32> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_256 = shufflevector <4 x i32> %src128, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <4 x i32> %src128, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <8 x i32> %src256, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_01 = shufflevector <4 x i32> %src128, <4 x i32> %src64_128, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_23 = shufflevector <4 x i32> %src128, <4 x i32> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_01 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -448,12 +448,12 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_vXi32'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src64_128 = shufflevector <2 x i32> %src64, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src64_256 = shufflevector <2 x i32> %src64, <2 x i32> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src64_512 = shufflevector <2 x i32> %src64, <2 x i32> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src128_256 = shufflevector <4 x i32> %src128, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src128_512 = shufflevector <4 x i32> %src128, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src256_512 = shufflevector <8 x i32> %src256, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_128 = shufflevector <2 x i32> %src64, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_256 = shufflevector <2 x i32> %src64, <2 x i32> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_512 = shufflevector <2 x i32> %src64, <2 x i32> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_256 = shufflevector <4 x i32> %src128, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <4 x i32> %src128, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <8 x i32> %src256, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_01 = shufflevector <4 x i32> %src128, <4 x i32> %src64_128, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_23 = shufflevector <4 x i32> %src128, <4 x i32> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -479,12 +479,12 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_vXi32'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src64_128 = shufflevector <2 x i32> %src64, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src64_256 = shufflevector <2 x i32> %src64, <2 x i32> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src64_512 = shufflevector <2 x i32> %src64, <2 x i32> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src128_256 = shufflevector <4 x i32> %src128, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src128_512 = shufflevector <4 x i32> %src128, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src256_512 = shufflevector <8 x i32> %src256, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_128 = shufflevector <2 x i32> %src64, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_256 = shufflevector <2 x i32> %src64, <2 x i32> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_512 = shufflevector <2 x i32> %src64, <2 x i32> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_256 = shufflevector <4 x i32> %src128, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <4 x i32> %src128, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <8 x i32> %src256, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_01 = shufflevector <4 x i32> %src128, <4 x i32> %src64_128, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_23 = shufflevector <4 x i32> %src128, <4 x i32> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -510,12 +510,12 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXi32'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src64_128 = shufflevector <2 x i32> %src64, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src64_256 = shufflevector <2 x i32> %src64, <2 x i32> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src64_512 = shufflevector <2 x i32> %src64, <2 x i32> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src128_256 = shufflevector <4 x i32> %src128, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src128_512 = shufflevector <4 x i32> %src128, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src256_512 = shufflevector <8 x i32> %src256, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_128 = shufflevector <2 x i32> %src64, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_256 = shufflevector <2 x i32> %src64, <2 x i32> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_512 = shufflevector <2 x i32> %src64, <2 x i32> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_256 = shufflevector <4 x i32> %src128, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <4 x i32> %src128, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <8 x i32> %src256, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_01 = shufflevector <4 x i32> %src128, <4 x i32> %src64_128, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <4 x i32> %src128, <4 x i32> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -576,16 +576,16 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 
 define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <16x i16> %src256, <32 x i16> %src512) {
 ; SSE2-LABEL: 'test_vXi16'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src32_64 = shufflevector <2 x i16> %src32, <2 x i16> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src32_128 = shufflevector <2 x i16> %src32, <2 x i16> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src32_256 = shufflevector <2 x i16> %src32, <2 x i16> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src32_512 = shufflevector <2 x i16> %src32, <2 x i16> undef, <32 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src64_128 = shufflevector <4 x i16> %src64, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src64_256 = shufflevector <4 x i16> %src64, <4 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src64_512 = shufflevector <4 x i16> %src64, <4 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src128_256 = shufflevector <8 x i16> %src128, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src128_512 = shufflevector <8 x i16> %src128, <8 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src256_512 = shufflevector <16 x i16> %src256, <16 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src32_64 = shufflevector <2 x i16> %src32, <2 x i16> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src32_128 = shufflevector <2 x i16> %src32, <2 x i16> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src32_256 = shufflevector <2 x i16> %src32, <2 x i16> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src32_512 = shufflevector <2 x i16> %src32, <2 x i16> undef, <32 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_128 = shufflevector <4 x i16> %src64, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_256 = shufflevector <4 x i16> %src64, <4 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_512 = shufflevector <4 x i16> %src64, <4 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_256 = shufflevector <8 x i16> %src128, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <8 x i16> %src128, <8 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <16 x i16> %src256, <16 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64_01 = shufflevector <4 x i16> %src64, <4 x i16> %src32_64, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64_23 = shufflevector <4 x i16> %src64, <4 x i16> %src32_64, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128_01 = shufflevector <8 x i16> %src128, <8 x i16> %src32_128, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -611,16 +611,16 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'test_vXi16'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src32_64 = shufflevector <2 x i16> %src32, <2 x i16> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src32_128 = shufflevector <2 x i16> %src32, <2 x i16> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src32_256 = shufflevector <2 x i16> %src32, <2 x i16> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src32_512 = shufflevector <2 x i16> %src32, <2 x i16> undef, <32 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src64_128 = shufflevector <4 x i16> %src64, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src64_256 = shufflevector <4 x i16> %src64, <4 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src64_512 = shufflevector <4 x i16> %src64, <4 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src128_256 = shufflevector <8 x i16> %src128, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src128_512 = shufflevector <8 x i16> %src128, <8 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src256_512 = shufflevector <16 x i16> %src256, <16 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src32_64 = shufflevector <2 x i16> %src32, <2 x i16> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src32_128 = shufflevector <2 x i16> %src32, <2 x i16> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src32_256 = shufflevector <2 x i16> %src32, <2 x i16> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src32_512 = shufflevector <2 x i16> %src32, <2 x i16> undef, <32 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_128 = shufflevector <4 x i16> %src64, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_256 = shufflevector <4 x i16> %src64, <4 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_512 = shufflevector <4 x i16> %src64, <4 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_256 = shufflevector <8 x i16> %src128, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <8 x i16> %src128, <8 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <16 x i16> %src256, <16 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64_01 = shufflevector <4 x i16> %src64, <4 x i16> %src32_64, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64_23 = shufflevector <4 x i16> %src64, <4 x i16> %src32_64, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128_01 = shufflevector <8 x i16> %src128, <8 x i16> %src32_128, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -646,16 +646,16 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SSE42-LABEL: 'test_vXi16'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src32_64 = shufflevector <2 x i16> %src32, <2 x i16> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src32_128 = shufflevector <2 x i16> %src32, <2 x i16> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src32_256 = shufflevector <2 x i16> %src32, <2 x i16> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src32_512 = shufflevector <2 x i16> %src32, <2 x i16> undef, <32 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src64_128 = shufflevector <4 x i16> %src64, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src64_256 = shufflevector <4 x i16> %src64, <4 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src64_512 = shufflevector <4 x i16> %src64, <4 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src128_256 = shufflevector <8 x i16> %src128, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src128_512 = shufflevector <8 x i16> %src128, <8 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src256_512 = shufflevector <16 x i16> %src256, <16 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src32_64 = shufflevector <2 x i16> %src32, <2 x i16> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src32_128 = shufflevector <2 x i16> %src32, <2 x i16> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src32_256 = shufflevector <2 x i16> %src32, <2 x i16> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src32_512 = shufflevector <2 x i16> %src32, <2 x i16> undef, <32 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_128 = shufflevector <4 x i16> %src64, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_256 = shufflevector <4 x i16> %src64, <4 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_512 = shufflevector <4 x i16> %src64, <4 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_256 = shufflevector <8 x i16> %src128, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <8 x i16> %src128, <8 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <16 x i16> %src256, <16 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64_01 = shufflevector <4 x i16> %src64, <4 x i16> %src32_64, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64_23 = shufflevector <4 x i16> %src64, <4 x i16> %src32_64, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_01 = shufflevector <8 x i16> %src128, <8 x i16> %src32_128, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -681,16 +681,16 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_vXi16'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src32_64 = shufflevector <2 x i16> %src32, <2 x i16> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src32_128 = shufflevector <2 x i16> %src32, <2 x i16> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src32_256 = shufflevector <2 x i16> %src32, <2 x i16> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src32_512 = shufflevector <2 x i16> %src32, <2 x i16> undef, <32 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src64_128 = shufflevector <4 x i16> %src64, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src64_256 = shufflevector <4 x i16> %src64, <4 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src64_512 = shufflevector <4 x i16> %src64, <4 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src128_256 = shufflevector <8 x i16> %src128, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src128_512 = shufflevector <8 x i16> %src128, <8 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src256_512 = shufflevector <16 x i16> %src256, <16 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src32_64 = shufflevector <2 x i16> %src32, <2 x i16> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src32_128 = shufflevector <2 x i16> %src32, <2 x i16> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src32_256 = shufflevector <2 x i16> %src32, <2 x i16> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src32_512 = shufflevector <2 x i16> %src32, <2 x i16> undef, <32 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_128 = shufflevector <4 x i16> %src64, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_256 = shufflevector <4 x i16> %src64, <4 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_512 = shufflevector <4 x i16> %src64, <4 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_256 = shufflevector <8 x i16> %src128, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <8 x i16> %src128, <8 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <16 x i16> %src256, <16 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64_01 = shufflevector <4 x i16> %src64, <4 x i16> %src32_64, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64_23 = shufflevector <4 x i16> %src64, <4 x i16> %src32_64, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_01 = shufflevector <8 x i16> %src128, <8 x i16> %src32_128, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -716,16 +716,16 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_vXi16'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src32_64 = shufflevector <2 x i16> %src32, <2 x i16> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src32_128 = shufflevector <2 x i16> %src32, <2 x i16> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src32_256 = shufflevector <2 x i16> %src32, <2 x i16> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src32_512 = shufflevector <2 x i16> %src32, <2 x i16> undef, <32 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src64_128 = shufflevector <4 x i16> %src64, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src64_256 = shufflevector <4 x i16> %src64, <4 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src64_512 = shufflevector <4 x i16> %src64, <4 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src128_256 = shufflevector <8 x i16> %src128, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src128_512 = shufflevector <8 x i16> %src128, <8 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src256_512 = shufflevector <16 x i16> %src256, <16 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src32_64 = shufflevector <2 x i16> %src32, <2 x i16> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src32_128 = shufflevector <2 x i16> %src32, <2 x i16> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src32_256 = shufflevector <2 x i16> %src32, <2 x i16> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src32_512 = shufflevector <2 x i16> %src32, <2 x i16> undef, <32 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_128 = shufflevector <4 x i16> %src64, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_256 = shufflevector <4 x i16> %src64, <4 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_512 = shufflevector <4 x i16> %src64, <4 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_256 = shufflevector <8 x i16> %src128, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <8 x i16> %src128, <8 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <16 x i16> %src256, <16 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64_01 = shufflevector <4 x i16> %src64, <4 x i16> %src32_64, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64_23 = shufflevector <4 x i16> %src64, <4 x i16> %src32_64, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_01 = shufflevector <8 x i16> %src128, <8 x i16> %src32_128, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -751,16 +751,16 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX512F-LABEL: 'test_vXi16'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src32_64 = shufflevector <2 x i16> %src32, <2 x i16> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src32_128 = shufflevector <2 x i16> %src32, <2 x i16> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src32_256 = shufflevector <2 x i16> %src32, <2 x i16> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src32_512 = shufflevector <2 x i16> %src32, <2 x i16> undef, <32 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src64_128 = shufflevector <4 x i16> %src64, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src64_256 = shufflevector <4 x i16> %src64, <4 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src64_512 = shufflevector <4 x i16> %src64, <4 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src128_256 = shufflevector <8 x i16> %src128, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src128_512 = shufflevector <8 x i16> %src128, <8 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src256_512 = shufflevector <16 x i16> %src256, <16 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src32_64 = shufflevector <2 x i16> %src32, <2 x i16> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src32_128 = shufflevector <2 x i16> %src32, <2 x i16> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src32_256 = shufflevector <2 x i16> %src32, <2 x i16> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src32_512 = shufflevector <2 x i16> %src32, <2 x i16> undef, <32 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_128 = shufflevector <4 x i16> %src64, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_256 = shufflevector <4 x i16> %src64, <4 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_512 = shufflevector <4 x i16> %src64, <4 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_256 = shufflevector <8 x i16> %src128, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <8 x i16> %src128, <8 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <16 x i16> %src256, <16 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64_01 = shufflevector <4 x i16> %src64, <4 x i16> %src32_64, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64_23 = shufflevector <4 x i16> %src64, <4 x i16> %src32_64, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_01 = shufflevector <8 x i16> %src128, <8 x i16> %src32_128, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -786,16 +786,16 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX512BW-LABEL: 'test_vXi16'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src32_64 = shufflevector <2 x i16> %src32, <2 x i16> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src32_128 = shufflevector <2 x i16> %src32, <2 x i16> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src32_256 = shufflevector <2 x i16> %src32, <2 x i16> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src32_512 = shufflevector <2 x i16> %src32, <2 x i16> undef, <32 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src64_128 = shufflevector <4 x i16> %src64, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src64_256 = shufflevector <4 x i16> %src64, <4 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src64_512 = shufflevector <4 x i16> %src64, <4 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src128_256 = shufflevector <8 x i16> %src128, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src128_512 = shufflevector <8 x i16> %src128, <8 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src256_512 = shufflevector <16 x i16> %src256, <16 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src32_64 = shufflevector <2 x i16> %src32, <2 x i16> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src32_128 = shufflevector <2 x i16> %src32, <2 x i16> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src32_256 = shufflevector <2 x i16> %src32, <2 x i16> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src32_512 = shufflevector <2 x i16> %src32, <2 x i16> undef, <32 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_128 = shufflevector <4 x i16> %src64, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_256 = shufflevector <4 x i16> %src64, <4 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_512 = shufflevector <4 x i16> %src64, <4 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_256 = shufflevector <8 x i16> %src128, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <8 x i16> %src128, <8 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <16 x i16> %src256, <16 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64_01 = shufflevector <4 x i16> %src64, <4 x i16> %src32_64, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64_23 = shufflevector <4 x i16> %src64, <4 x i16> %src32_64, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_01 = shufflevector <8 x i16> %src128, <8 x i16> %src32_128, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -821,16 +821,16 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX512VMBI-LABEL: 'test_vXi16'
-; AVX512VMBI-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src32_64 = shufflevector <2 x i16> %src32, <2 x i16> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; AVX512VMBI-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src32_128 = shufflevector <2 x i16> %src32, <2 x i16> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX512VMBI-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src32_256 = shufflevector <2 x i16> %src32, <2 x i16> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX512VMBI-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src32_512 = shufflevector <2 x i16> %src32, <2 x i16> undef, <32 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX512VMBI-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src64_128 = shufflevector <4 x i16> %src64, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX512VMBI-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src64_256 = shufflevector <4 x i16> %src64, <4 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX512VMBI-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src64_512 = shufflevector <4 x i16> %src64, <4 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX512VMBI-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src128_256 = shufflevector <8 x i16> %src128, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX512VMBI-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src128_512 = shufflevector <8 x i16> %src128, <8 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX512VMBI-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src256_512 = shufflevector <16 x i16> %src256, <16 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512VMBI-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src32_64 = shufflevector <2 x i16> %src32, <2 x i16> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; AVX512VMBI-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src32_128 = shufflevector <2 x i16> %src32, <2 x i16> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512VMBI-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src32_256 = shufflevector <2 x i16> %src32, <2 x i16> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512VMBI-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src32_512 = shufflevector <2 x i16> %src32, <2 x i16> undef, <32 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512VMBI-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_128 = shufflevector <4 x i16> %src64, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512VMBI-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_256 = shufflevector <4 x i16> %src64, <4 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512VMBI-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_512 = shufflevector <4 x i16> %src64, <4 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512VMBI-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_256 = shufflevector <8 x i16> %src128, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512VMBI-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <8 x i16> %src128, <8 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512VMBI-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <16 x i16> %src256, <16 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; AVX512VMBI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64_01 = shufflevector <4 x i16> %src64, <4 x i16> %src32_64, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; AVX512VMBI-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64_23 = shufflevector <4 x i16> %src64, <4 x i16> %src32_64, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; AVX512VMBI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_01 = shufflevector <8 x i16> %src128, <8 x i16> %src32_128, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -896,21 +896,21 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 
 define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16x i8> %src128, <32 x i8> %src256, <64 x i8> %src512) {
 ; SSE2-LABEL: 'test_vXi8'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src16_32 = shufflevector <2 x i8> %src16, <2 x i8> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src16_64 = shufflevector <2 x i8> %src16, <2 x i8> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src16_128 = shufflevector <2 x i8> %src16, <2 x i8> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src16_256 = shufflevector <2 x i8> %src16, <2 x i8> undef, <32 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src16_512 = shufflevector <2 x i8> %src16, <2 x i8> undef, <64 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src32_64 = shufflevector <4 x i8> %src32, <4 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src32_128 = shufflevector <4 x i8> %src32, <4 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src32_256 = shufflevector <4 x i8> %src32, <4 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src32_512 = shufflevector <4 x i8> %src32, <4 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src64_128 = shufflevector <8 x i8> %src64, <8 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src64_256 = shufflevector <8 x i8> %src64, <8 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src64_512 = shufflevector <8 x i8> %src64, <8 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src128_256 = shufflevector <16 x i8> %src128, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src128_512 = shufflevector <16 x i8> %src128, <16 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src256_512 = shufflevector <32 x i8> %src256, <32 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src16_32 = shufflevector <2 x i8> %src16, <2 x i8> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src16_64 = shufflevector <2 x i8> %src16, <2 x i8> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src16_128 = shufflevector <2 x i8> %src16, <2 x i8> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src16_256 = shufflevector <2 x i8> %src16, <2 x i8> undef, <32 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src16_512 = shufflevector <2 x i8> %src16, <2 x i8> undef, <64 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src32_64 = shufflevector <4 x i8> %src32, <4 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src32_128 = shufflevector <4 x i8> %src32, <4 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src32_256 = shufflevector <4 x i8> %src32, <4 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src32_512 = shufflevector <4 x i8> %src32, <4 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_128 = shufflevector <8 x i8> %src64, <8 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_256 = shufflevector <8 x i8> %src64, <8 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_512 = shufflevector <8 x i8> %src64, <8 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_256 = shufflevector <16 x i8> %src128, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <16 x i8> %src128, <16 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <32 x i8> %src256, <32 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32_01 = shufflevector <4 x i8> %src32, <4 x i8> %src16_32, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32_23 = shufflevector <4 x i8> %src32, <4 x i8> %src16_32, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64_01 = shufflevector <8 x i8> %src64, <8 x i8> %src32_64, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -936,21 +936,21 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16x i
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'test_vXi8'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src16_32 = shufflevector <2 x i8> %src16, <2 x i8> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src16_64 = shufflevector <2 x i8> %src16, <2 x i8> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src16_128 = shufflevector <2 x i8> %src16, <2 x i8> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src16_256 = shufflevector <2 x i8> %src16, <2 x i8> undef, <32 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src16_512 = shufflevector <2 x i8> %src16, <2 x i8> undef, <64 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src32_64 = shufflevector <4 x i8> %src32, <4 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src32_128 = shufflevector <4 x i8> %src32, <4 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src32_256 = shufflevector <4 x i8> %src32, <4 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src32_512 = shufflevector <4 x i8> %src32, <4 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src64_128 = shufflevector <8 x i8> %src64, <8 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src64_256 = shufflevector <8 x i8> %src64, <8 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src64_512 = shufflevector <8 x i8> %src64, <8 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src128_256 = shufflevector <16 x i8> %src128, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src128_512 = shufflevector <16 x i8> %src128, <16 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src256_512 = shufflevector <32 x i8> %src256, <32 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src16_32 = shufflevector <2 x i8> %src16, <2 x i8> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src16_64 = shufflevector <2 x i8> %src16, <2 x i8> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src16_128 = shufflevector <2 x i8> %src16, <2 x i8> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src16_256 = shufflevector <2 x i8> %src16, <2 x i8> undef, <32 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src16_512 = shufflevector <2 x i8> %src16, <2 x i8> undef, <64 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src32_64 = shufflevector <4 x i8> %src32, <4 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src32_128 = shufflevector <4 x i8> %src32, <4 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src32_256 = shufflevector <4 x i8> %src32, <4 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src32_512 = shufflevector <4 x i8> %src32, <4 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_128 = shufflevector <8 x i8> %src64, <8 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_256 = shufflevector <8 x i8> %src64, <8 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_512 = shufflevector <8 x i8> %src64, <8 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_256 = shufflevector <16 x i8> %src128, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <16 x i8> %src128, <16 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <32 x i8> %src256, <32 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32_01 = shufflevector <4 x i8> %src32, <4 x i8> %src16_32, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32_23 = shufflevector <4 x i8> %src32, <4 x i8> %src16_32, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64_01 = shufflevector <8 x i8> %src64, <8 x i8> %src32_64, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -976,21 +976,21 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16x i
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SSE42-LABEL: 'test_vXi8'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src16_32 = shufflevector <2 x i8> %src16, <2 x i8> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src16_64 = shufflevector <2 x i8> %src16, <2 x i8> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src16_128 = shufflevector <2 x i8> %src16, <2 x i8> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src16_256 = shufflevector <2 x i8> %src16, <2 x i8> undef, <32 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src16_512 = shufflevector <2 x i8> %src16, <2 x i8> undef, <64 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src32_64 = shufflevector <4 x i8> %src32, <4 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src32_128 = shufflevector <4 x i8> %src32, <4 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src32_256 = shufflevector <4 x i8> %src32, <4 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src32_512 = shufflevector <4 x i8> %src32, <4 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src64_128 = shufflevector <8 x i8> %src64, <8 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src64_256 = shufflevector <8 x i8> %src64, <8 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src64_512 = shufflevector <8 x i8> %src64, <8 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src128_256 = shufflevector <16 x i8> %src128, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src128_512 = shufflevector <16 x i8> %src128, <16 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src256_512 = shufflevector <32 x i8> %src256, <32 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src16_32 = shufflevector <2 x i8> %src16, <2 x i8> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src16_64 = shufflevector <2 x i8> %src16, <2 x i8> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src16_128 = shufflevector <2 x i8> %src16, <2 x i8> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src16_256 = shufflevector <2 x i8> %src16, <2 x i8> undef, <32 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src16_512 = shufflevector <2 x i8> %src16, <2 x i8> undef, <64 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src32_64 = shufflevector <4 x i8> %src32, <4 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src32_128 = shufflevector <4 x i8> %src32, <4 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src32_256 = shufflevector <4 x i8> %src32, <4 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src32_512 = shufflevector <4 x i8> %src32, <4 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_128 = shufflevector <8 x i8> %src64, <8 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_256 = shufflevector <8 x i8> %src64, <8 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_512 = shufflevector <8 x i8> %src64, <8 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_256 = shufflevector <16 x i8> %src128, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <16 x i8> %src128, <16 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <32 x i8> %src256, <32 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32_01 = shufflevector <4 x i8> %src32, <4 x i8> %src16_32, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32_23 = shufflevector <4 x i8> %src32, <4 x i8> %src16_32, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64_01 = shufflevector <8 x i8> %src64, <8 x i8> %src32_64, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -1016,21 +1016,21 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16x i
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX-LABEL: 'test_vXi8'
-; AVX-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src16_32 = shufflevector <2 x i8> %src16, <2 x i8> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; AVX-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src16_64 = shufflevector <2 x i8> %src16, <2 x i8> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src16_128 = shufflevector <2 x i8> %src16, <2 x i8> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src16_256 = shufflevector <2 x i8> %src16, <2 x i8> undef, <32 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src16_512 = shufflevector <2 x i8> %src16, <2 x i8> undef, <64 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src32_64 = shufflevector <4 x i8> %src32, <4 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src32_128 = shufflevector <4 x i8> %src32, <4 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src32_256 = shufflevector <4 x i8> %src32, <4 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src32_512 = shufflevector <4 x i8> %src32, <4 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src64_128 = shufflevector <8 x i8> %src64, <8 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src64_256 = shufflevector <8 x i8> %src64, <8 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src64_512 = shufflevector <8 x i8> %src64, <8 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src128_256 = shufflevector <16 x i8> %src128, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src128_512 = shufflevector <16 x i8> %src128, <16 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src256_512 = shufflevector <32 x i8> %src256, <32 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src16_32 = shufflevector <2 x i8> %src16, <2 x i8> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src16_64 = shufflevector <2 x i8> %src16, <2 x i8> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src16_128 = shufflevector <2 x i8> %src16, <2 x i8> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src16_256 = shufflevector <2 x i8> %src16, <2 x i8> undef, <32 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src16_512 = shufflevector <2 x i8> %src16, <2 x i8> undef, <64 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src32_64 = shufflevector <4 x i8> %src32, <4 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src32_128 = shufflevector <4 x i8> %src32, <4 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src32_256 = shufflevector <4 x i8> %src32, <4 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src32_512 = shufflevector <4 x i8> %src32, <4 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_128 = shufflevector <8 x i8> %src64, <8 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_256 = shufflevector <8 x i8> %src64, <8 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_512 = shufflevector <8 x i8> %src64, <8 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_256 = shufflevector <16 x i8> %src128, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <16 x i8> %src128, <16 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <32 x i8> %src256, <32 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32_01 = shufflevector <4 x i8> %src32, <4 x i8> %src16_32, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32_23 = shufflevector <4 x i8> %src32, <4 x i8> %src16_32, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64_01 = shufflevector <8 x i8> %src64, <8 x i8> %src32_64, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -1056,21 +1056,21 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16x i
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX512F-LABEL: 'test_vXi8'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src16_32 = shufflevector <2 x i8> %src16, <2 x i8> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src16_64 = shufflevector <2 x i8> %src16, <2 x i8> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src16_128 = shufflevector <2 x i8> %src16, <2 x i8> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src16_256 = shufflevector <2 x i8> %src16, <2 x i8> undef, <32 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src16_512 = shufflevector <2 x i8> %src16, <2 x i8> undef, <64 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src32_64 = shufflevector <4 x i8> %src32, <4 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src32_128 = shufflevector <4 x i8> %src32, <4 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src32_256 = shufflevector <4 x i8> %src32, <4 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src32_512 = shufflevector <4 x i8> %src32, <4 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src64_128 = shufflevector <8 x i8> %src64, <8 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src64_256 = shufflevector <8 x i8> %src64, <8 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src64_512 = shufflevector <8 x i8> %src64, <8 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src128_256 = shufflevector <16 x i8> %src128, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src128_512 = shufflevector <16 x i8> %src128, <16 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src256_512 = shufflevector <32 x i8> %src256, <32 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src16_32 = shufflevector <2 x i8> %src16, <2 x i8> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src16_64 = shufflevector <2 x i8> %src16, <2 x i8> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src16_128 = shufflevector <2 x i8> %src16, <2 x i8> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src16_256 = shufflevector <2 x i8> %src16, <2 x i8> undef, <32 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src16_512 = shufflevector <2 x i8> %src16, <2 x i8> undef, <64 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src32_64 = shufflevector <4 x i8> %src32, <4 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src32_128 = shufflevector <4 x i8> %src32, <4 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src32_256 = shufflevector <4 x i8> %src32, <4 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src32_512 = shufflevector <4 x i8> %src32, <4 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_128 = shufflevector <8 x i8> %src64, <8 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_256 = shufflevector <8 x i8> %src64, <8 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_512 = shufflevector <8 x i8> %src64, <8 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_256 = shufflevector <16 x i8> %src128, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <16 x i8> %src128, <16 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <32 x i8> %src256, <32 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32_01 = shufflevector <4 x i8> %src32, <4 x i8> %src16_32, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32_23 = shufflevector <4 x i8> %src32, <4 x i8> %src16_32, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64_01 = shufflevector <8 x i8> %src64, <8 x i8> %src32_64, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -1096,21 +1096,21 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16x i
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX512BW-LABEL: 'test_vXi8'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src16_32 = shufflevector <2 x i8> %src16, <2 x i8> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src16_64 = shufflevector <2 x i8> %src16, <2 x i8> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src16_128 = shufflevector <2 x i8> %src16, <2 x i8> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src16_256 = shufflevector <2 x i8> %src16, <2 x i8> undef, <32 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src16_512 = shufflevector <2 x i8> %src16, <2 x i8> undef, <64 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src32_64 = shufflevector <4 x i8> %src32, <4 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src32_128 = shufflevector <4 x i8> %src32, <4 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src32_256 = shufflevector <4 x i8> %src32, <4 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src32_512 = shufflevector <4 x i8> %src32, <4 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src64_128 = shufflevector <8 x i8> %src64, <8 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src64_256 = shufflevector <8 x i8> %src64, <8 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src64_512 = shufflevector <8 x i8> %src64, <8 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src128_256 = shufflevector <16 x i8> %src128, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src128_512 = shufflevector <16 x i8> %src128, <16 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src256_512 = shufflevector <32 x i8> %src256, <32 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src16_32 = shufflevector <2 x i8> %src16, <2 x i8> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src16_64 = shufflevector <2 x i8> %src16, <2 x i8> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src16_128 = shufflevector <2 x i8> %src16, <2 x i8> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src16_256 = shufflevector <2 x i8> %src16, <2 x i8> undef, <32 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src16_512 = shufflevector <2 x i8> %src16, <2 x i8> undef, <64 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src32_64 = shufflevector <4 x i8> %src32, <4 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src32_128 = shufflevector <4 x i8> %src32, <4 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src32_256 = shufflevector <4 x i8> %src32, <4 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src32_512 = shufflevector <4 x i8> %src32, <4 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_128 = shufflevector <8 x i8> %src64, <8 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_256 = shufflevector <8 x i8> %src64, <8 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_512 = shufflevector <8 x i8> %src64, <8 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_256 = shufflevector <16 x i8> %src128, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <16 x i8> %src128, <16 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <32 x i8> %src256, <32 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32_01 = shufflevector <4 x i8> %src32, <4 x i8> %src16_32, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32_23 = shufflevector <4 x i8> %src32, <4 x i8> %src16_32, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64_01 = shufflevector <8 x i8> %src64, <8 x i8> %src32_64, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -1136,21 +1136,21 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16x i
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX512VMBI-LABEL: 'test_vXi8'
-; AVX512VMBI-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src16_32 = shufflevector <2 x i8> %src16, <2 x i8> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; AVX512VMBI-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src16_64 = shufflevector <2 x i8> %src16, <2 x i8> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX512VMBI-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src16_128 = shufflevector <2 x i8> %src16, <2 x i8> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX512VMBI-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src16_256 = shufflevector <2 x i8> %src16, <2 x i8> undef, <32 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX512VMBI-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src16_512 = shufflevector <2 x i8> %src16, <2 x i8> undef, <64 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX512VMBI-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src32_64 = shufflevector <4 x i8> %src32, <4 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX512VMBI-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src32_128 = shufflevector <4 x i8> %src32, <4 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX512VMBI-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src32_256 = shufflevector <4 x i8> %src32, <4 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX512VMBI-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src32_512 = shufflevector <4 x i8> %src32, <4 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX512VMBI-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src64_128 = shufflevector <8 x i8> %src64, <8 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX512VMBI-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src64_256 = shufflevector <8 x i8> %src64, <8 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX512VMBI-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src64_512 = shufflevector <8 x i8> %src64, <8 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX512VMBI-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src128_256 = shufflevector <16 x i8> %src128, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX512VMBI-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src128_512 = shufflevector <16 x i8> %src128, <16 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX512VMBI-NEXT:  Cost Model: Found an estimated cost of -1 for instruction: %src256_512 = shufflevector <32 x i8> %src256, <32 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512VMBI-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src16_32 = shufflevector <2 x i8> %src16, <2 x i8> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; AVX512VMBI-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src16_64 = shufflevector <2 x i8> %src16, <2 x i8> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512VMBI-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src16_128 = shufflevector <2 x i8> %src16, <2 x i8> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512VMBI-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src16_256 = shufflevector <2 x i8> %src16, <2 x i8> undef, <32 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512VMBI-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src16_512 = shufflevector <2 x i8> %src16, <2 x i8> undef, <64 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512VMBI-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src32_64 = shufflevector <4 x i8> %src32, <4 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512VMBI-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src32_128 = shufflevector <4 x i8> %src32, <4 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512VMBI-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src32_256 = shufflevector <4 x i8> %src32, <4 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512VMBI-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src32_512 = shufflevector <4 x i8> %src32, <4 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512VMBI-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_128 = shufflevector <8 x i8> %src64, <8 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512VMBI-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_256 = shufflevector <8 x i8> %src64, <8 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512VMBI-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_512 = shufflevector <8 x i8> %src64, <8 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512VMBI-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_256 = shufflevector <16 x i8> %src128, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512VMBI-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <16 x i8> %src128, <16 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512VMBI-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <32 x i8> %src256, <32 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; AVX512VMBI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32_01 = shufflevector <4 x i8> %src32, <4 x i8> %src16_32, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; AVX512VMBI-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32_23 = shufflevector <4 x i8> %src32, <4 x i8> %src16_32, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; AVX512VMBI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64_01 = shufflevector <8 x i8> %src64, <8 x i8> %src32_64, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>


        


More information about the llvm-commits mailing list